diff --git a/.github/workflows/accelsim.yml b/.github/workflows/accelsim.yml
index 39ae46df0..08a65be11 100644
--- a/.github/workflows/accelsim.yml
+++ b/.github/workflows/accelsim.yml
@@ -15,7 +15,8 @@ on:
 
 # By default regress against accel-sim's dev branch
 env:
-  ACCELSIM_BRANCH: dev
+  ACCELSIM_REPO: https://github.com/purdue-aalp/accel-sim-framework-public.git
+  ACCELSIM_BRANCH: dev-uvm
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
diff --git a/README.md b/README.md
index 84cfcd02c..90dd928e8 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,18 @@ Complex Dynamics in Many-Core Accelerator Architectures, In Proceedings of the
 IEEE International Symposium on Performance Analysis of Systems and Software
 (ISPASS), pp. 164-174, White Plains, NY, March 28-30, 2010.
 
+If you use prefetchers and page eviction policies, please cite:
+
+Debashis Ganguly, Ziyu Zhang, Jun Yang, and Rami Melhem, Interplay between hardware prefetcher and page eviction policy in CPU-GPU unified virtual memory, In Proceedings of the 46th International Symposium on Computer Architecture (ISCA '19), New York, NY, USA, 2019.
+
+If you use access counter-based delayed migration, LFU eviction, cold vs hot data structure classification, and page migration and pinning, please cite:
+
+Debashis Ganguly, Ziyu Zhang, Jun Yang, and Rami Melhem, Adaptive Page Migration for Irregular Data-intensive Applications under GPU Memory Oversubscription, In Proceedings of the 34th IEEE International Parallel & Distributed Processing Symposium (IPDPS 2020), New Orleans, Louisiana, USA, 2020.
+
+If you use the adaptive runtime to detect patterns in CPU-GPU interconnect traffic, or the policy engine to choose and dynamically employ memory management policies, please cite:
+
+Debashis Ganguly, Rami Melhem, and Jun Yang, An Adaptive Framework for Oversubscription Management in CPU-GPU Unified Memory, In 2021 Design, Automation & Test in Europe Conference & Exhibition (DATE 2021).
+
 This file contains instructions on installing, building and running GPGPU-Sim.
 Detailed documentation on what GPGPU-Sim models, how to configure it, and a
 guide to the source code can be found here: <http://gpgpu-sim.org/manual/>.
diff --git a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
index 47637286e..bc01821db 100644
--- a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
+++ b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config
@@ -56,8 +56,8 @@
 
 
 # In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
 -gpgpu_cache:dl1  N:32:128:4,L:L:m:N:H,S:64:8,8
 -gpgpu_shmem_size 49152
diff --git a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
index 9cb328f5d..ef47ddfd9 100644
--- a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
+++ b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config
@@ -99,8 +99,8 @@
 # Greedy then oldest scheduler
 -gpgpu_scheduler gto
 
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
 # The defulat is to disable the L1 cache, unless cache modifieres are used
 -gpgpu_cache:dl1  S:4:128:32,L:L:s:N:L,A:256:8,16:0,32
diff --git a/configs/tested-cfgs/SM6_GTX1080Ti/config_fermi_islip.icnt b/configs/tested-cfgs/SM6_GTX1080Ti/config_fermi_islip.icnt
new file mode 100644
index 000000000..2a69ddd4c
--- /dev/null
+++ b/configs/tested-cfgs/SM6_GTX1080Ti/config_fermi_islip.icnt
@@ -0,0 +1,70 @@
+//21*1 fly with 32 flits per packet under gpgpusim injection mode
+use_map = 0;
+flit_size = 32; 
+
+// currently we do not use this, see subnets below
+network_count = 2;
+
+// Topology
+topology = fly;
+k = 50;
+n = 1;
+
+// Routing
+
+routing_function = dest_tag;
+
+// Flow control
+
+num_vcs     = 1;
+vc_buf_size = 8;
+
+wait_for_tail_credit = 0;
+
+// Router architecture
+
+vc_allocator = islip; //separable_input_first;
+sw_allocator = islip; //separable_input_first;
+alloc_iters  = 1;
+
+credit_delay   = 0;
+routing_delay  = 0;
+vc_alloc_delay = 1;
+sw_alloc_delay = 1;
+
+input_speedup     = 2;
+output_speedup    = 1;
+internal_speedup  = 1.0;
+
+// Traffic, GPGPU-Sim does not use this
+
+traffic                = uniform;
+packet_size ={{1,2,3,4},{10,20}};
+packet_size_rate={{1,1,1,1},{2,1}};
+
+// Simulation - Don't change
+
+sim_type       = gpgpusim;
+//sim_type = latency;
+injection_rate = 0.1;
+
+subnets = 2;
+
+// Always use read and write no matter following line
+//use_read_write = 1;
+
+
+read_request_subnet = 0;
+read_reply_subnet = 1;
+write_request_subnet = 0;
+write_reply_subnet = 1;
+
+read_request_begin_vc = 0;
+read_request_end_vc = 0;
+write_request_begin_vc = 0;
+write_request_end_vc = 0;
+read_reply_begin_vc = 0;
+read_reply_end_vc = 0;
+write_reply_begin_vc = 0;
+write_reply_end_vc = 0;
+
diff --git a/configs/tested-cfgs/SM6_GTX1080Ti/gpgpusim.config b/configs/tested-cfgs/SM6_GTX1080Ti/gpgpusim.config
new file mode 100644
index 000000000..03854ff27
--- /dev/null
+++ b/configs/tested-cfgs/SM6_GTX1080Ti/gpgpusim.config
@@ -0,0 +1,289 @@
+# This config models the Volta
+# For more info about volta architecture:
+# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf
+# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# 
+# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf
+# https://en.wikipedia.org/wiki/Volta_(microarchitecture)
+# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf
+# https://devblogs.nvidia.com/inside-volta/
+# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf
+
+# functional simulator specification
+-gpgpu_ptx_instruction_classification 0
+-gpgpu_ptx_sim_mode 0
+-gpgpu_ptx_force_max_capability 70 
+
+# Device Limits
+-gpgpu_stack_size_limit 1024
+-gpgpu_heap_size_limit 8388608
+-gpgpu_runtime_sync_depth_limit 2
+-gpgpu_runtime_pending_launch_count_limit 2048
+-gpgpu_kernel_launch_latency 5000
+-gpgpu_TB_launch_latency 0
+
+# Compute Capability
+-gpgpu_compute_capability_major 7
+-gpgpu_compute_capability_minor 0
+
+# PTX execution-driven
+-gpgpu_ptx_convert_to_ptxplus 0
+-gpgpu_ptx_save_converted_ptxplus 0
+
+# high level architecture configuration
+-gpgpu_n_clusters 80
+-gpgpu_n_cores_per_cluster 1
+-gpgpu_n_mem 32
+-gpgpu_n_sub_partition_per_mchannel 2 
+
+# volta clock domains
+#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
+-gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0
+# boost mode
+# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0
+
+# shader core pipeline config
+-gpgpu_shader_registers 65536
+-gpgpu_registers_per_block 65536
+-gpgpu_occupancy_sm_number 70
+
+# This implies a maximum of 64 warps/SM
+-gpgpu_shader_core_pipeline 2048:32 
+-gpgpu_shader_cta 32
+-gpgpu_simd_model 1
+
+# Pipeline widths and number of FUs
+# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
+## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units
+## we need to scale the number of pipeline registers to be equal to the number of SP units
+-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
+-gpgpu_num_sp_units 4
+-gpgpu_num_sfu_units 4
+-gpgpu_num_dp_units 4
+-gpgpu_num_int_units 4
+-gpgpu_tensor_core_avail 1
+-gpgpu_num_tensor_core_units 4
+
+# Instruction latencies and initiation intervals
+# "ADD,MAX,MUL,MAD,DIV"
+# All Div operations are executed on SFU unit
+-ptx_opcode_latency_int 4,13,4,5,145,21
+-ptx_opcode_initiation_int 2,2,2,2,8,4
+-ptx_opcode_latency_fp 4,13,4,5,39
+-ptx_opcode_initiation_fp 2,2,2,2,4
+-ptx_opcode_latency_dp 8,19,8,8,330
+-ptx_opcode_initiation_dp 4,4,4,4,130
+-ptx_opcode_latency_sfu 100
+-ptx_opcode_initiation_sfu 8
+-ptx_opcode_latency_tesnor 64
+-ptx_opcode_initiation_tensor 64
+
+# Volta has sub core model, in which each scheduler has its own register file and EUs
+# i.e. schedulers are isolated
+-gpgpu_sub_core_model 1
+# disable specialized operand collectors and use generic operand collectors instead
+-gpgpu_enable_specialized_operand_collector 0
+-gpgpu_operand_collector_num_units_gen 8
+-gpgpu_operand_collector_num_in_ports_gen 8
+-gpgpu_operand_collector_num_out_ports_gen 8
+# volta has 8 banks, 4 schedulers, two banks per scheduler
+# we increase #banks to 16 to mitigate the effect of the Register File Cache (RFC), which we do not implement in the current version
+-gpgpu_num_reg_banks 16
+-gpgpu_reg_file_port_throughput 2
+
+# shared memory bankconflict detection 
+-gpgpu_shmem_num_banks 32
+-gpgpu_shmem_limited_broadcast 0
+-gpgpu_shmem_warp_parts 1
+-gpgpu_coalesce_arch 60
+
+# Volta has four schedulers per core
+-gpgpu_num_sched_per_core 4
+# Greedy then oldest scheduler
+-gpgpu_scheduler gto
+## In Volta, a warp scheduler can issue 1 inst per cycle
+-gpgpu_max_insn_issue_per_warp 1
+-gpgpu_dual_issue_diff_exec_units 1
+
+## L1/shared memory configuration
+# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
+# Default config is 32KB DL1 and 96KB shared memory
+# In Volta, we assign the remaining shared memory to L1 cache 
+# if the assigned shd mem = 0, then L1 cache = 128KB
+# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
+# disable this mode in case of multi kernels/apps execution
+-gpgpu_adaptive_cache_config 1
+# Volta unified cache has four banks
+-gpgpu_l1_banks 4
+-gpgpu_cache:dl1  S:1:128:256,L:L:s:N:L,A:256:8,16:0,32
+-gpgpu_shmem_size 98304
+-gpgpu_shmem_sizeDefault 98304
+-gpgpu_shmem_per_block 65536
+-gpgpu_gmem_skip_L1D 0
+-gpgpu_n_cluster_ejection_buffer_size 32
+-gpgpu_l1_latency 20
+-gpgpu_smem_latency 20
+-gpgpu_flush_l1_cache 1
+
+# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache
+-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32
+-gpgpu_cache:dl2_texture_only 0
+-gpgpu_dram_partition_queues 64:64:64:64
+-gpgpu_perf_sim_memcpy 1
+-gpgpu_memory_partition_indexing 2
+
+# 128 KB Inst.
+-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
+-gpgpu_inst_fetch_throughput 4
+# 128 KB Tex
+# Note: TEX is deprecated in Volta; it is used for legacy apps only. Use the L1D cache instead, with the .nc modifier or the __ldg method
+-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
+# 64 KB Const
+-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
+-gpgpu_perfect_inst_const_cache 1
+
+# interconnection
+#-network_mode 1 
+#-inter_config_file config_volta_islip.icnt
+# use built-in local xbar
+-network_mode 2
+-icnt_in_buffer_limit 512
+-icnt_out_buffer_limit 512
+-icnt_subnets 2
+-icnt_flit_size 40
+-icnt_arbiter_algo 1
+
+# memory partition latency config 
+-gpgpu_l2_rop_latency 160
+-dram_latency 100
+
+# dram model config
+-gpgpu_dram_scheduler 1
+-gpgpu_frfcfs_dram_sched_queue_size 64
+-gpgpu_dram_return_queue_size 192
+
+# for HBM, three stacks, 24 channels, each (128 bits) 16 bytes width
+-gpgpu_n_mem_per_ctrlr 1
+-gpgpu_dram_buswidth 16
+-gpgpu_dram_burst_length 2
+-dram_data_command_freq_ratio 2  # HBM is DDR
+-gpgpu_mem_address_mask 1
+-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS
+
+# HBM timings are adopted from the Hynix JESD235 standard and the NVIDIA HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf)
+# Timing for 1 GHZ
+# tRRDl and tWTR are missing, need to be added
+#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47:
+#                        CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4"
+
+# Timing for 850 MHZ, V100 HBM runs at 850 MHZ
+-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40:
+                        CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3"
+
+# HBM has dual bus interface, in which it can issue two col and row commands at a time
+-dram_dual_bus_interface 1
+# select lower bits for bnkgrp to increase bnkgrp parallelism
+-dram_bnk_indexing_policy 0
+-dram_bnkgrp_indexing_policy 1
+
+#-dram_seperate_write_queue_enable 1
+#-dram_write_queue_size 64:56:32
+
+# stat collection
+-gpgpu_memlatency_stat 14 
+-gpgpu_runtime_stat 500
+-enable_ptx_file_line_stats 1
+-visualizer_enabled 0
+
+# power model configs, disable it until we create a real energy model for Volta
+-power_simulation_enabled 0
+
+# tracing functionality
+#-trace_enabled 1
+#-trace_components WARP_SCHEDULER,SCOREBOARD,LIVENESS
+#-trace_sampling_core 0
+
+### items for functional and timing simulation of UVM ###
+
+# gddr size should be less than or equal to 1GB, in the unit of MB/GB
+-gddr_size 1GB
+
+# size of gddr page, only 4KB and 2MB available
+-page_size 4KB
+
+# number of tlb entries per SM
+-tlb_size 4096
+
+# average page table walk latency (in core cycle)
+# for 4K page, set to 100 and for 2M page, set to 66
+-page_table_walk_latency 100
+
+# page eviction policy
+# 0 - lru 2MB (default)
+# 1 - lru tree-based neighborhood
+# 2 - lru sequential locality 64K
+# 3 - random 4KB
+# 4 - LFU 2MB
+# 5 - lru 4KB
+-eviction_policy 0
+
+# invalidate clean pages directly instead of writing back
+-invalidate_clean 0
+
+# reserve a percentage (e.g. 10 or 20) of accessed pages from eviction in the hope that they will be accessed in the next iteration
+-reserve_accessed_page_percent 0
+
+# percentage of free page buffer to trigger the page eviction (e.g. 5 or 10)
+-percentage_of_free_page_buffer 0
+
+# pcie bandwidth per direction
+-pcie_bandwidth 16.0GB/s
+
+# enable/disable GMMU statistics profiling for UVM
+-sim_prof_enable 1
+
+# disable deadlock check for UVM
+-gpgpu_deadlock_detect 0
+
+# latency to process kernel launch (150us or 222150 core cycles)
+#-gpgpu_kernel_launch_latency 222150
+
+# hardware prefetcher
+# 0 - disabled
+# 1 - tree-based neighborhood (default)
+# 2 - sequential locality 64K
+# 3 - random 4 K
+-hardware_prefetch 1
+
+# hardware prefetcher under over-subscription
+# 0 - disable upon eviction (default)
+# 1 - tree-based neighborhood
+# 2 - sequential locality 64K
+# 3 - random 4 K
+-hwprefetch_oversub 1
+
+# latency in core cycle to handle page fault (45us)
+# encompasses the overhead of stalling threads, deciding memory address, page table walk, maintaining page flags, transfer chunks and orders
+-page_fault_latency 66645
+
+# enable accurate simulation of stalling warps and serializing accesses for page fault handling (default 0)
+-enable_accurate_simulation 0
+
+# Enable direct CPU-memory access from GPU
+# 0 - disable
+# 1 - adaptive
+# 2 - always
+# 3 - after oversubscription
+-enable_dma 0
+
+# Access counter threshold for migrating the page from cpu to gpu
+-migrate_threshold 8
+
+# Oversubscription Multiplicative Penalty Factor for Adaptive DMA
+-multiply_dma_penalty 2
+
+# enabling access pattern detection, policy engine, and adaptive memory management
+-enable_smart_runtime 0 
+
+# enable skipping cycles when all warps are stalled waiting for a page fault to come back
+-skip_cycles_enable 1
diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
index 882630e76..7d3e2d47e 100644
--- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
+++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config
@@ -123,8 +123,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
 # The defulat is to disable the L1 cache, unless cache modifieres are used
 -gpgpu_l1_banks 2
diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
index 8cc3ed6bf..6ff4b6c08 100644
--- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
+++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config
@@ -83,8 +83,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache 
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
 -gpgpu_adaptive_cache_config 1
diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
index 290c08d6c..08ac75277 100644
--- a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
+++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config
@@ -128,8 +128,8 @@
 -gpgpu_num_reg_banks 16
 -gpgpu_reg_file_port_throughput 2
 
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 -gpgpu_adaptive_cache_config 0
 -gpgpu_l1_banks 4
 -gpgpu_cache:dl1  S:1:128:512,L:L:s:N:L,A:256:8,16:0,32
diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config
index 4887be8cf..26ce0eb58 100644
--- a/configs/tested-cfgs/SM7_GV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_GV100/gpgpusim.config
@@ -137,8 +137,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
 # if the assigned shd mem = 0, then L1 cache = 128KB
diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
index a14ae7567..b3384afcb 100644
--- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config
@@ -137,8 +137,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
 # if the assigned shd mem = 0, then L1 cache = 128KB
diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
index b48f37006..c37aaf053 100644
--- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
+++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config
@@ -107,8 +107,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # Defualt config is 32KB DL1 and 96KB shared memory
 # In Volta, we assign the remaining shared memory to L1 cache 
 # if the assigned shd mem = 0, then L1 cache = 128KB
diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
index c884541bb..d26b1a621 100644
--- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
+++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config
@@ -83,8 +83,8 @@
 -gpgpu_dual_issue_diff_exec_units 1
 
 ## L1/shared memory configuration
-# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>,<data_port_width>
-# ** Optional parameter - Required when mshr_type==Texture Fifo, set to 0 if not used
+# <sector?>:<nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
+# ** Optional parameter - Required when mshr_type==Texture Fifo
 # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache 
 # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x 
 -gpgpu_adaptive_cache_config 1
diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h
index 3e34641ed..d382d66c0 100644
--- a/libcuda/cuda_api_object.h
+++ b/libcuda/cuda_api_object.h
@@ -222,7 +222,8 @@ class cuda_runtime_api {
                                               gpgpu_ptx_sim_arg_list_t args,
                                               struct dim3 gridDim,
                                               struct dim3 blockDim,
-                                              struct CUctx_st *context);
+                                              struct CUctx_st *context,
+                                              const gpgpu_sim_config &gpu_config);
   int load_static_globals(symbol_table *symtab, unsigned min_gaddr,
                           unsigned max_gaddr, gpgpu_t *gpu);
   int load_constants(symbol_table *symtab, addr_t min_gaddr, gpgpu_t *gpu);
diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc
index 8de12a335..f707c2365 100644
--- a/libcuda/cuda_runtime_api.cc
+++ b/libcuda/cuda_runtime_api.cc
@@ -114,6 +114,7 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <set>
 #ifdef OPENGL_SUPPORT
 #define GL_GLEXT_PROTOTYPES
 #ifdef __APPLE__
@@ -954,8 +955,35 @@ cudaError_t cudaSetupArgumentInternal(const void *arg, size_t size,
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
   }
+
+  CUctx_st *context = GPGPUSim_Context(ctx);
+
   gpgpusim_ptx_assert(!ctx->api->g_cuda_launch_stack.empty(),
                       "empty launch stack");
+  
+  uint64_t hostPtr = *(uint64_t *)arg;
+
+  struct allocation_info *allocation =
+      context->get_device()->get_gpgpu()->gpu_get_managed_allocation(hostPtr);
+
+  if (allocation != NULL) { // verify whether a pointer to malloc managed memory
+    // during the kernel launch copy all the data from cpu to gpu
+    // pages are valid or invalid are tested later
+    uint64_t devPtr = allocation->gpu_mem_addr;
+
+    if (!allocation->copied) {
+      context->get_device()->get_gpgpu()->memcpy_to_gpu(
+          (size_t)devPtr, (void *)hostPtr, allocation->allocation_size);
+
+      allocation->copied = true;
+    }
+
+    // override the pointer argument to refer to gpu side allocation rather than
+    // cpu side memory gpgpu-sim only understands pointer reference from
+    // m_dev_malloc
+    *(uint64_t *)arg = devPtr;
+  }
+
   kernel_config &config = ctx->api->g_cuda_launch_stack.back();
   config.set_arg(arg, size, offset);
   printf(
@@ -1003,7 +1031,7 @@ cudaError_t cudaLaunchInternal(const char *hostFun,
          stream ? stream->get_uid() : 0);
   kernel_info_t *grid = ctx->api->gpgpu_cuda_ptx_sim_init_grid(
       hostFun, config.get_args(), config.grid_dim(), config.block_dim(),
-      context);
+      context, *(ctx->the_gpgpusim->g_the_gpu_config));
   // do dynamic PDOM analysis for performance simulation scenario
   std::string kname = grid->name();
   function_info *kernel_func_info = grid->entry();
@@ -1491,9 +1519,8 @@ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsInternal(
     dim3 gridDim(context->get_device()->get_gpgpu()->max_cta_per_core() *
                  context->get_device()->get_gpgpu()->get_config().num_shader());
     dim3 blockDim(blockSize);
-    // because this fuction is only checking for resource requirements, we do
-    // not care which stream this kernel runs at, just picked -1
-    kernel_info_t result(gridDim, blockDim, entry, -1);
+    kernel_info_t result(gridDim, blockDim, entry, -1,
+        *(ctx->the_gpgpusim->g_the_gpu_config));
     // if(entry == NULL){
     //	*numBlocks = 1;
     //	return g_last_cudaError = cudaErrorUnknown;
@@ -2370,8 +2397,88 @@ cudaDeviceSynchronizeInternal(gpgpu_context *gpgpu_ctx = NULL) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
   }
+  CUctx_st *context = GPGPUSim_Context(ctx);
   // Blocks until the device has completed all preceding requested tasks
+  
   ctx->synchronize();
+
+  const std::map<uint64_t, struct allocation_info *> &managedAllocations =
+      context->get_device()->get_gpgpu()->gpu_get_managed_allocations();
+
+  std::set<mem_addr_t> evicted_page_list;
+
+  // at this point kernel execution is over
+  // loop over all managed allocations
+  // copy the data back from gpu to cpu
+  for (std::map<uint64_t, struct allocation_info *>::const_iterator iter =
+           managedAllocations.begin();
+       iter != managedAllocations.end(); iter++) {
+
+    if (iter->second->copied) {
+
+      uint64_t hostPtr = iter->first;
+      uint64_t devPtr = iter->second->gpu_mem_addr;
+      size_t size = iter->second->allocation_size;
+
+      iter->second->copied = false;
+
+      while (size != 0) {
+        mem_addr_t page_num = context->get_device()
+                                  ->get_gpgpu()
+                                  ->get_global_memory()
+                                  ->get_page_num(devPtr);
+
+        size_t size_in_this_page = context->get_device()
+                                       ->get_gpgpu()
+                                       ->get_global_memory()
+                                       ->get_data_size(devPtr);
+
+        if (context->get_device()
+                ->get_gpgpu()
+                ->get_global_memory()
+                ->is_page_dirty(page_num)) {
+          context->get_device()->get_gpgpu()->memcpy_from_gpu(
+              (void *)hostPtr, (size_t)devPtr,
+              size > size_in_this_page ? size_in_this_page : size);
+
+          evicted_page_list.insert(page_num);
+        }
+
+        if (size <= size_in_this_page) {
+          size = 0;
+        } else {
+          size -= size_in_this_page;
+        }
+
+        devPtr += size_in_this_page;
+        hostPtr += size_in_this_page;
+      }
+    }
+  }
+
+  for (std::set<mem_addr_t>::const_iterator iter = evicted_page_list.begin();
+       iter != evicted_page_list.end(); iter++) {
+    context->get_device()->get_gpgpu()->get_global_memory()->invalidate_page(
+        *iter);
+    context->get_device()->get_gpgpu()->get_global_memory()->clear_page_access(
+        *iter);
+    context->get_device()->get_gpgpu()->get_global_memory()->clear_page_dirty(
+        *iter);
+    context->get_device()->get_gpgpu()->get_global_memory()->free_pages(1);
+    context->get_device()->get_gpgpu()->getGmmu()->tlb_flush(*iter);
+  }
+
+  context->get_device()->get_gpgpu()->get_global_memory()->reset();
+  context->get_device()->get_gpgpu()->getGmmu()->valid_pages_clear();
+  context->get_device()->get_gpgpu()->getGmmu()->reset_large_page_info();
+
+  unsigned transfer_size =
+      context->get_device()->get_gpgpu()->get_global_memory()->get_page_size() *
+      evicted_page_list.size();
+  if (transfer_size != 0)
+    context->get_device()->get_gpgpu()->getGmmu()->calculate_devicesync_time(
+        transfer_size);
+
   return g_last_cudaError = cudaSuccess;
 }
 
@@ -2468,6 +2575,94 @@ cudaError_t cudaPeekAtLastError(void) { return g_last_cudaError; }
 
 __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) {
   return cudaMallocInternal(devPtr, size);
+}                                        
+// UVM emulation of cudaMallocManaged; *devPtr gets the host buffer, flags is unused.
+cudaError_t cudaMallocManagedInternal(   
+    void **devPtr, size_t size, unsigned  int flags = cudaMemAttachGlobal,
+    gpgpu_context *gpgpu_ctx = NULL) {   
+  gpgpu_context *ctx;                    
+  if (gpgpu_ctx) {                       
+    ctx = gpgpu_ctx;                     
+  } else {                               
+    ctx = GPGPU_Context();               
+  }                                      
+  if (g_debug_execution >= 3) {          
+    announce_call(__my_func__);          
+  }                                      
+  CUctx_st *context = GPGPUSim_Context(ctx);
+  // a zero-byte request is rejected before anything is allocated
+  if (size == 0) {                       
+    return g_last_cudaError = cudaErrorInvalidValue;
+  }                                      
+  size_t num_large_pages = (size_t)(size / MAX_PREFETCH_SIZE);
+  // bytes left over beyond the whole MAX_PREFETCH_SIZE chunks
+  size_t remainder = size - (num_large_pages * MAX_PREFETCH_SIZE);
+  // leftover rounded up: MIN_PREFETCH_SIZE doubled until it is >= remainder
+  size_t corrected_remainder;
+
+  if (remainder == 0)
+    corrected_remainder = 0;
+  else {
+    for (corrected_remainder = MIN_PREFETCH_SIZE;
+         corrected_remainder < remainder; corrected_remainder *= 2)
+      ;
+  }
+  // from here on, size is the padded allocation size
+  size = (num_large_pages * MAX_PREFETCH_SIZE) + corrected_remainder;
+
+  // create a host-side buffer so that cpu side initialization code doesn't
+  // get SIGSEGV; NOTE(review): the malloc result is not checked for NULL
+  void *cpuMemPtr = (void *)malloc(size);
+
+  // device-side counterpart, obtained from gpu_mallocmanaged
+  void *gpuMemPtr = context->get_device()->get_gpgpu()->gpu_mallocmanaged(size);
+
+  // maintain a map keyed by cpu memory pointer
+  // with the gpu memory pointer and the allocation size as value
+  context->get_device()->get_gpgpu()->gpu_insert_managed_allocation(
+      (uint64_t)cpuMemPtr, (uint64_t)gpuMemPtr, size);
+
+  // at the beginning, copy the host buffer over so the gpu allocation gets
+  // backing storage; the data is not initialized on the CPU yet at this
+  // point, so the actual data must be copied again on kernel launch
+  context->get_device()->get_gpgpu()->memcpy_to_gpu((size_t)gpuMemPtr,
+                                                    (void *)cpuMemPtr, size);
+
+  context->get_device()->get_gpgpu()->set_pages_managed((size_t)gpuMemPtr,
+                                                        size);
+  // return cpu memory pointer to the user code
+  // such that cpu side code can access the memory
+  *devPtr = cpuMemPtr;
+  // reinterpret the device pointer's bits as a mem_addr_t
+  mem_addr_t tempGPUPtr = *((mem_addr_t *)(&gpuMemPtr));
+  // one initialize_large_page per MAX_PREFETCH_SIZE chunk, then the smaller tail
+  for (size_t cur_size = 0; cur_size < size;) {
+    if ((size - cur_size) < MAX_PREFETCH_SIZE) {
+      context->get_device()->get_gpgpu()->getGmmu()->initialize_large_page(
+          tempGPUPtr, size - cur_size);
+      break;
+    } else {
+      context->get_device()->get_gpgpu()->getGmmu()->initialize_large_page(
+          tempGPUPtr, MAX_PREFETCH_SIZE);
+      cur_size += MAX_PREFETCH_SIZE;
+      tempGPUPtr += MAX_PREFETCH_SIZE;
+    }
+  }
+
+  if (g_debug_execution >= 3)
+    printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n",
+           size, (unsigned long long)*devPtr);
+  // NOTE(review): gpuMemPtr is null-checked only here, after its uses above
+  if (gpuMemPtr) {
+    return g_last_cudaError = cudaSuccess;
+  } else {
+    return g_last_cudaError = cudaErrorMemoryAllocation;
+  }
+}
+
+__host__ cudaError_t CUDARTAPI cudaMallocManaged(
+        void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal) {
+    return cudaMallocManagedInternal(devPtr, size, flags);  // gpgpu_ctx is left at its NULL default
 }
 
 __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) {
@@ -2614,6 +2809,91 @@ __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total) {
  *                                                                              *
  *******************************************************************************/
 
+// Simulator-side implementation of cudaMemPrefetchAsync for managed memory.
+// [devPtr, devPtr + count) is a host-side range that must lie inside a single
+// allocation recorded in the managed-allocation map. The page-aligned GPU range
+// covering it is registered with the stream manager as a prefetch and queued as
+// a stream operation. dstDevice == cudaCpuDeviceId is a no-op; any device other
+// than the active one aborts. A NULL gpgpu_ctx selects GPGPU_Context().
+__host__ cudaError_t CUDARTAPI cudaMemPrefetchAsyncInternal(const void *devPtr,
+                                                    size_t count, int dstDevice,
+                                                    cudaStream_t stream = 0,
+                                                    gpgpu_context *gpgpu_ctx = NULL) {
+  gpgpu_context *ctx = gpgpu_ctx ? gpgpu_ctx : GPGPU_Context();
+  if (g_debug_execution >= 3) {
+    announce_call(__my_func__);
+  }
+  // CUDA requires cudaDevAttrConcurrentManagedAccess to be non-zero for a GPU
+  // destination and for the device associated with the stream, and requires the
+  // range to be managed memory (cudaMallocManaged or __managed__ variables).
+  // Only the managed-range requirement is checked below.
+
+  struct CUstream_st *s = (struct CUstream_st *)stream;
+
+  if (dstDevice == cudaCpuDeviceId) {
+    // not a priority thing as cudaDeviceSynchronize does the same job
+  } else if (dstDevice == ctx->api->g_active_device) {
+    CUctx_st *context = GPGPUSim_Context(ctx);
+
+    const std::map<uint64_t, struct allocation_info *> &managedAllocations =
+        context->get_device()->get_gpgpu()->gpu_get_managed_allocations();
+    uint64_t gpuPtr = 0;
+    uint64_t allocationPtr = 0;
+
+    for (std::map<uint64_t, struct allocation_info *>::const_iterator iter =
+             managedAllocations.begin();
+         iter != managedAllocations.end(); ++iter) {
+      // find the allocation for the host pointer received as argument
+      // remember: we have emulated behavior of UVM by having both CPU and GPU
+      // copies of same data
+      if ((uint64_t)devPtr >= iter->first &&
+          (uint64_t)devPtr + count <=
+              iter->first + iter->second->allocation_size) {
+        allocationPtr = iter->second->gpu_mem_addr;
+        // gpuPtr is the GPU address at the same offset from the allocation
+        // start as the host pointer
+        gpuPtr = iter->second->gpu_mem_addr + ((uint64_t)devPtr - iter->first);
+        break;
+      }
+    }
+
+    // gpuPtr stays 0 when no managed allocation contains the requested range
+    assert(gpuPtr != 0);
+
+    size_t page_size = context->get_device()
+                           ->get_gpgpu()
+                           ->get_global_memory()
+                           ->get_page_size();
+
+    // round the first byte down to the start of its page ...
+    uint64_t start_addr = (gpuPtr / page_size) * page_size;
+    // ... and round one past the last byte up to the next page boundary
+    uint64_t end_addr =
+        (gpuPtr + count - 1) / page_size * page_size + page_size;
+
+    assert(start_addr != end_addr);
+    assert((end_addr - start_addr) % page_size == 0);
+
+    ctx->the_gpgpusim->g_stream_manager->register_prefetch(
+        (size_t)start_addr, (size_t)allocationPtr,
+        (size_t)(end_addr - start_addr),
+        s == NULL ? ctx->the_gpgpusim->g_stream_manager->get_stream_zero() : s);
+
+    ctx->the_gpgpusim->g_stream_manager->push(stream_operation(
+        (size_t)start_addr, (size_t)(end_addr - start_addr), s));
+  } else {
+    printf("GPGPU-Sim PTX: cudaMemPrefetchAsync: unsupported dstDevice %d\n",
+           dstDevice);
+    abort();
+  }
+  return g_last_cudaError = cudaSuccess;
+}
+
+__host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(
+    const void *devPtr, size_t count, int dstDevice, cudaStream_t stream = 0) {
+  return cudaMemPrefetchAsyncInternal(devPtr, count, dstDevice, stream);  // gpgpu_ctx keeps its NULL default
+}
+
 __host__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src,
                                                size_t count,
                                                enum cudaMemcpyKind kind,
@@ -2993,40 +3273,6 @@ __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) {
   return cudaStreamSynchronizeInternal(stream);
 }
 
-__host__ cudaError_t CUDARTAPI cudaStreamSynchronizeSST(cudaStream_t stream) {
-  // For SST, perform a one-time check
-  gpgpu_context *ctx = GPGPU_Context();
-  if (g_debug_execution >= 3) {
-    announce_call(__my_func__);
-  }
-
-  // default stream: all is done
-  // other streams: no more ops
-  g_last_cudaError = cudaSuccess;
-  if (stream == NULL) {
-    // For default stream, sync is equivalent to cudaThreadSync
-    bool thread_synced = ctx->synchronize_check();
-    if (thread_synced) {
-      // We are already done, so no need to poll for sync done
-      return cudaSuccess;
-    } else {
-      // Otherwise we mark we should wait for default strem to sync
-      ctx->the_gpgpusim->g_stream_manager->get_stream_zero()->set_request_synchronize();
-      return cudaErrorNotReady;
-    }
-  } else {
-    // For other stream, check if it is already sync'ed
-    bool stream_synced = stream->synchronize_check();
-    if (stream_synced) {
-      return cudaSuccess;
-    } else {
-      stream->set_request_synchronize();
-      return cudaErrorNotReady;
-    }
-  }
-  return g_last_cudaError = cudaSuccess;
-}
-
 __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
@@ -4385,7 +4631,8 @@ int cuda_runtime_api::load_constants(symbol_table *symtab, addr_t min_gaddr,
 
 kernel_info_t *cuda_runtime_api::gpgpu_cuda_ptx_sim_init_grid(
     const char *hostFun, gpgpu_ptx_sim_arg_list_t args, struct dim3 gridDim,
-    struct dim3 blockDim, CUctx_st *context) {
+    struct dim3 blockDim, CUctx_st *context,
+    const gpgpu_sim_config &gpu_config) {
   if (g_debug_execution >= 3) {
     announce_call(__my_func__);
   }
@@ -4397,7 +4644,7 @@ kernel_info_t *cuda_runtime_api::gpgpu_cuda_ptx_sim_init_grid(
   */
   kernel_info_t *result =
       new kernel_info_t(gridDim, blockDim, entry, gpu->getNameArrayMapping(),
-                        gpu->getNameInfoMapping());
+                        gpu->getNameInfoMapping(), gpu_config);
   if (entry == NULL) {
     printf(
         "GPGPU-Sim PTX: ERROR launching kernel -- no PTX implementation found "
diff --git a/short-tests-accelsim.sh b/short-tests-accelsim.sh
index 5cb4d2cc6..b404d4e31 100755
--- a/short-tests-accelsim.sh
+++ b/short-tests-accelsim.sh
@@ -8,6 +8,11 @@ if [ ! -n "$ACCELSIM_BRANCH" ]; then
 	exit 1;
 fi
 
+if [ -z "$ACCELSIM_REPO" ]; then  # repository URL that is cloned further below
+	echo "ERROR ** set the ACCELSIM_REPO env variable"
+	exit 1
+fi
+
 if [ ! -n "$GPUAPPS_ROOT" ]; then
 	echo "ERROR ** GPUAPPS_ROOT to a location where the apps have been compiled";
 	exit 1;
@@ -19,10 +24,12 @@ export PATH=$CUDA_INSTALL_PATH/bin:$PATH
 source ./setup_environment
 make -j
 
-git clone https://github.com/accel-sim/accel-sim-framework.git
+git clone "$ACCELSIM_REPO"
+basename=$(basename "$ACCELSIM_REPO")
+filename=${basename%.git}  # clone dir = last URL component minus a ".git" suffix
 
 # Build accel-sim
-cd accel-sim-framework
+cd "$filename" || exit 1  # do not run the checkout/build in the wrong directory
 git checkout $ACCELSIM_BRANCH
 source ./gpu-simulator/setup_environment.sh
 make -j -C ./gpu-simulator
diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc
index 8743cc7a7..287fd7b54 100644
--- a/src/abstract_hardware_model.cc
+++ b/src/abstract_hardware_model.cc
@@ -178,15 +178,59 @@ void gpgpu_functional_sim_config::ptx_set_tex_cache_linesize(
   m_texcache_linesize = linesize;
 }
 
+// Convert the -gddr_size ("<n>MB"/"<n>GB") and -page_size ("4KB"/"2MB") option
+// strings to bytes; gddr_size is rounded up to a 4096-byte multiple.
+void gpgpu_functional_sim_config::convert_byte_string() {
+  // double (not float) so large byte counts are not rounded before the cast
+  const bool is_mb = strstr(gddr_size_string, "MB") != NULL;
+  const bool is_gb = !is_mb && strstr(gddr_size_string, "GB") != NULL;
+  const double unit = is_mb ? 1024.0 * 1024.0 : 1024.0 * 1024.0 * 1024.0;
+  const double bytes = atof(gddr_size_string) * unit;
+  if ((!is_mb && !is_gb) || bytes < 0.0) {  // unknown unit or negative size
+    printf("-gddr_size must be in MB/GB\n");
+    exit(1);
+  }
+  gddr_size = ((unsigned long long)bytes - 1) / 4096 * 4096 + 4096;
+  // the only available page size is 4k/2mb
+  if (std::string(page_size_string) == "4KB") {
+    page_size = 4096;
+  } else if (std::string(page_size_string) == "2MB") {
+    page_size = 2097152;
+  } else {
+    printf("-page_size only support 4KB and 2MB\n");
+    exit(1);
+  }
+}
+
 gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx)
     : m_function_model_config(config) {
   gpgpu_ctx = ctx;
-  m_global_mem = new memory_space_impl<8192>("global", 64 * 1024);
+  if (config.page_size == 4096) {
+    m_global_mem =
+        new memory_space_impl<4096>("global", 64 * 1024, config.gddr_size);
+    m_tex_mem = new memory_space_impl<4096>("tex", 64 * 1024);
+    m_surf_mem = new memory_space_impl<4096>("surf", 64 * 1024);
+  } else {
+    m_global_mem = new memory_space_impl<2 * 1024 * 1024>("global", 64 * 1024,
+                                                          config.gddr_size);
+    m_tex_mem = new memory_space_impl<2 * 1024 * 1024>("tex", 64 * 1024);
+    m_surf_mem = new memory_space_impl<2 * 1024 * 1024>("surf", 64 * 1024);
+  }
 
-  m_tex_mem = new memory_space_impl<8192>("tex", 64 * 1024);
-  m_surf_mem = new memory_space_impl<8192>("surf", 64 * 1024);
+  // make sure the memory addresses that would be used for m_dev_malloc_managed
+  // don't go across the 32 bit addressing limit
+  assert(((unsigned long long)GLOBAL_HEAP_START + config.gddr_size * 2) <=
+         MEM_SPACE_LIMIT);
 
   m_dev_malloc = GLOBAL_HEAP_START;
+
+  // The managed heap is kept separate from the unmanaged one because the two
+  // behave differently: managed allocations can be evicted on memory overflow
+  // whereas unmanaged ones are pinned, and only managed pages may suffer latency
+  // for page table walk/access and PCI-E. Because of this, managed and
+  // unmanaged allocations can not come from the same page.
+  m_dev_malloc_managed = (GLOBAL_HEAP_START + GLOBAL_MEM_SIZE_MAX);
+
   checkpoint_option = m_function_model_config.get_checkpoint_option();
   checkpoint_kernel = m_function_model_config.get_checkpoint_kernel();
   checkpoint_CTA = m_function_model_config.get_checkpoint_CTA();
@@ -769,7 +813,40 @@ kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim,
   m_num_cores_running = 0;
   m_uid = (entry->gpgpu_ctx->kernel_info_m_next_uid)++;
   m_streamID = streamID;
-  m_param_mem = new memory_space_impl<8192>("param", 64 * 1024);
+  m_param_mem = new memory_space_impl<4096>("param", 64 * 1024);
+
+  // Jin: parent and child kernel management for CDP
+  m_parent_kernel = NULL;
+
+  // Jin: launch latency management
+  m_launch_latency = entry->gpgpu_ctx->device_runtime->g_kernel_launch_latency;
+
+  m_kernel_TB_latency =
+      entry->gpgpu_ctx->device_runtime->g_kernel_launch_latency +
+      num_blocks() * entry->gpgpu_ctx->device_runtime->g_TB_launch_latency;
+
+  cache_config_set = false;
+}
+
+kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim,
+                             class function_info *entry,
+                             unsigned long long streamID,
+                             const gpgpu_sim_config &gpu_config) {
+  m_kernel_entry = entry;
+  m_grid_dim = gridDim;
+  m_block_dim = blockDim;
+  m_next_cta.x = 0;
+  m_next_cta.y = 0;
+  m_next_cta.z = 0;
+  m_next_tid = m_next_cta;
+  m_num_cores_running = 0;
+  m_uid = (entry->gpgpu_ctx->kernel_info_m_next_uid)++;
+  m_streamID = streamID;
+  if (gpu_config.page_size == 4096) {
+    m_param_mem = new memory_space_impl<4096>("param", 64 * 1024);
+  } else {
+    m_param_mem = new memory_space_impl<2 * 1024 * 1024>("param", 64 * 1024);
+  }
 
   // Jin: parent and child kernel management for CDP
   m_parent_kernel = NULL;
@@ -790,7 +867,8 @@ kernels should use the texture bindings seen at the time of launch and textures
 kernel_info_t::kernel_info_t(
     dim3 gridDim, dim3 blockDim, class function_info *entry,
     std::map<std::string, const struct cudaArray *> nameToCudaArray,
-    std::map<std::string, const struct textureInfo *> nameToTextureInfo) {
+    std::map<std::string, const struct textureInfo *> nameToTextureInfo,
+    const gpgpu_sim_config &gpu_config) {
   m_kernel_entry = entry;
   m_grid_dim = gridDim;
   m_block_dim = blockDim;
@@ -800,8 +878,13 @@ kernel_info_t::kernel_info_t(
   m_next_tid = m_next_cta;
   m_num_cores_running = 0;
   m_uid = (entry->gpgpu_ctx->kernel_info_m_next_uid)++;
-  m_param_mem = new memory_space_impl<8192>("param", 64 * 1024);
-
+  //m_param_mem = new memory_space_impl<8192>("param", 64 * 1024);
+  if (gpu_config.page_size == 4096) {
+    m_param_mem = new memory_space_impl<4096>("param", 64 * 1024);
+  } else {
+    m_param_mem = new memory_space_impl<2 * 1024 * 1024>("param", 64 * 1024);
+  }
+  
   // Jin: parent and child kernel management for CDP
   m_parent_kernel = NULL;
 
diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h
index cddf523e7..ac3b97785 100644
--- a/src/abstract_hardware_model.h
+++ b/src/abstract_hardware_model.h
@@ -33,6 +33,7 @@
 #define ABSTRACT_HARDWARE_MODEL_INCLUDED
 
 // Forward declarations
+class gpgpu_sim_config;
 class gpgpu_sim;
 class kernel_info_t;
 class gpgpu_context;
@@ -194,6 +195,7 @@ enum _memory_op_t { no_memory_op = 0, memory_load, memory_store };
 #include <deque>
 #include <list>
 #include <map>
+#include <stdint.h>
 #include <vector>
 
 #if !defined(__VECTOR_TYPES_H__)
@@ -235,10 +237,14 @@ class kernel_info_t {
   //   }
   kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry,
                 unsigned long long streamID);
+  kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry,
+                unsigned long long streamID,
+                const gpgpu_sim_config &gpu_config);
   kernel_info_t(
       dim3 gridDim, dim3 blockDim, class function_info *entry,
       std::map<std::string, const struct cudaArray *> nameToCudaArray,
-      std::map<std::string, const struct textureInfo *> nameToTextureInfo);
+      std::map<std::string, const struct textureInfo *> nameToTextureInfo,
+      const gpgpu_sim_config &gpu_config);
   ~kernel_info_t();
 
   void inc_running() { m_num_cores_running++; }
@@ -248,7 +254,9 @@ class kernel_info_t {
   }
   bool running() const { return m_num_cores_running > 0; }
   bool done() const { return no_more_ctas_to_run() && !running(); }
-  class function_info *entry() { return m_kernel_entry; }
+  class function_info *entry() {
+    return m_kernel_entry;  // mutable accessor; the const overload is declared next
+  }
   const class function_info *entry() const { return m_kernel_entry; }
 
   size_t num_blocks() const {
@@ -479,7 +487,10 @@ class simt_stack {
 // Let's just upgrade to C++11 so we can use constexpr here...
 // start allocating from this address (lower values used for allocating globals
 // in .ptx file)
-const unsigned long long GLOBAL_HEAP_START = 0xC0000000;
+const unsigned long long MEM_SPACE_LIMIT = 0x100000000;  // end of the 32-bit address space
+const unsigned long long GLOBAL_HEAP_START = 0x80000000;  // start of the unmanaged heap
+// fix the max addressable global mem size as 1GB (ULL: no int overflow) instead of dynamically deciding
+const unsigned long long GLOBAL_MEM_SIZE_MAX = (1024ULL * 1024 * 1024);
 // Volta max shmem size is 96kB
 const unsigned long long SHARED_MEM_SIZE_MAX = 96 * (1 << 10);
 // Volta max local mem is 16kB
@@ -559,6 +570,8 @@ class gpgpu_functional_sim_config {
   int get_checkpoint_CTA_t() const { return checkpoint_CTA_t; }
   int get_checkpoint_insn_Y() const { return checkpoint_insn_Y; }
 
+  void convert_byte_string();
+
  private:
   // PTX options
   int m_ptx_convert_to_ptxplus;
@@ -578,11 +591,55 @@ class gpgpu_functional_sim_config {
   int g_ptx_inst_debug_thread_uid;
 
   unsigned m_texcache_linesize;
+
+  protected:
+  unsigned long long gddr_size;
+  int page_size;
+
+  unsigned long long page_fault_latency;
+  bool enable_accurate_simulation;
+
+  char *gddr_size_string;
+  char *page_size_string;
+
+  friend class gpgpu_t;
+  friend class kernel_info_t;
+  template <unsigned BSIZE> friend class memory_space_impl;
+  friend void calculate_sim_prof(FILE *fout, gpgpu_sim *gpu);
+};
+
+struct allocation_info {  // one managed allocation; stored in gpgpu_t::managedAllocations keyed by host address
+  uint64_t gpu_mem_addr;   // start address of the GPU-side copy
+  size_t allocation_size;  // size of the allocation in bytes
+  bool copied;  // false on insertion; gpu_writeback skips entries where it is false
 };
 
+#define MAX_PREFETCH_SIZE (2 * 1024 * 1024)  // 2 MiB
+#define MIN_PREFETCH_SIZE (64 * 1024)  // 64 KiB
+
 class gpgpu_t {
  public:
   gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx);
+
+  // Declare a constructor for gmmu_t type
+  // gpgpu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config,
+  //         class gpgpu_new_stats *new_stats);
+  struct allocation_info *gpu_get_managed_allocation(uint64_t cpuMemAddr);
+  const std::map<uint64_t, struct allocation_info *> &
+  gpu_get_managed_allocations();
+  void gpu_insert_managed_allocation(uint64_t cpuMemAddr, uint64_t gpuMemAddr,
+                                     size_t size);
+
+  // set the allocated pages as managed
+  void set_pages_managed(size_t addr, size_t size);
+
+  // write content of dirty page back to CPU on eviction
+  void gpu_writeback(uint64_t gpuMemAddr);
+
+  // allocator for managed memory; ensures that unmanaged & managed
+  // allocations do not fall into the same page
+  void *gpu_mallocmanaged(size_t size);
+
   // backward pointer
   class gpgpu_context *gpgpu_ctx;
   int checkpoint_option;
@@ -671,8 +728,15 @@ class gpgpu_t {
   class memory_space *m_global_mem;
   class memory_space *m_tex_mem;
   class memory_space *m_surf_mem;
+  
+  unsigned long long
+      m_dev_malloc; // variable to store a known heap pointer for unmanaged
+                    // allocation (cudaMalloc, cudaMallocArray)
+  unsigned long long m_dev_malloc_managed; // variable to store a known heap
+                                           // pointer for any managed allocation
+
+  std::map<uint64_t, struct allocation_info *> managedAllocations;
 
-  unsigned long long m_dev_malloc;
   //  These maps contain the current texture mappings for the GPU at any given
   //  time.
   std::map<std::string, std::set<const struct textureReference *> >
@@ -821,6 +885,32 @@ class mem_access_t {
     m_req_size = size;
     m_write = wr;
   }
+  mem_access_t(unsigned int uid, mem_access_type type, new_addr_type address, unsigned size,
+               bool wr, const active_mask_t &active_mask,
+               const mem_access_byte_mask_t &byte_mask,
+               const mem_access_sector_mask_t &sector_mask, gpgpu_context *ctx)
+      : m_warp_mask(active_mask),
+        m_byte_mask(byte_mask),
+        m_sector_mask(sector_mask) {  // constructor taking an explicit access uid
+    init(ctx);  // NOTE(review): init() is defined elsewhere; presumably sets defaults, confirm
+    m_uid = uid;  // assigned after init(ctx), so the caller-supplied uid is the one kept
+    m_type = type;
+    m_addr = address;
+    m_req_size = size;
+    m_write = wr;
+  }
+  // Copy constructor: copies the uid, address, write flag, request size, type
+  // and the three masks. NOTE(review): confirm the class has no other member
+  // (e.g. state set up by init(ctx)) that a copy should carry over as well.
+  mem_access_t(const mem_access_t &ma)
+      : m_uid(ma.m_uid), m_addr(ma.m_addr), m_write(ma.m_write),
+        m_req_size(ma.m_req_size), m_type(ma.m_type),
+        m_warp_mask(ma.m_warp_mask), m_byte_mask(ma.m_byte_mask),
+        m_sector_mask(ma.m_sector_mask) {
+    // every copied field is handled by the initializer list above
+  }
+
+  unsigned get_uid() const { return m_uid; }  // identifier stored in m_uid
 
   new_addr_type get_addr() const { return m_addr; }
   void set_addr(new_addr_type addr) { m_addr = addr; }
@@ -1125,9 +1215,9 @@ class warp_inst_t : public inst_t {
       printf("Printing mem access generated\n");
       std::list<mem_access_t>::iterator it;
       for (it = m_accessq.begin(); it != m_accessq.end(); ++it) {
-        printf("MEM_TXN_GEN:%s:%llx, Size:%d \n",
+        printf("MEM_TXN_GEN:%s:%llx, uid:%d, Size:%d \n",
                mem_access_type_str(it->get_type()), it->get_addr(),
-               it->get_size());
+               it->get_uid(), it->get_size());
       }
     }
   }
@@ -1185,6 +1275,7 @@ class warp_inst_t : public inst_t {
     return m_warp_issued_mask.count();
   }  // for instruction counting
   bool empty() const { return m_empty; }
+  unsigned get_warp_id() const { return m_warp_id; }  // unchecked: unlike warp_id(), no !m_empty assert
   unsigned warp_id() const {
     assert(!m_empty);
     return m_warp_id;
@@ -1212,6 +1303,14 @@ class warp_inst_t : public inst_t {
 
   bool accessq_empty() const { return m_accessq.empty(); }
   unsigned accessq_count() const { return m_accessq.size(); }
+
+  // for queue, always push back and pop front
+  mem_access_t &accessq_front() { return m_accessq.front(); }  // first queued access, by mutable reference
+  void accessq_pop_front() { m_accessq.pop_front(); }  // drops the first queued access
+  void accessq_push_back(const mem_access_t &mem_access) {
+    m_accessq.push_back(mem_access);  // the list stores its own copy
+  }
+  
   const mem_access_t &accessq_back() { return m_accessq.back(); }
   void accessq_pop_back() { m_accessq.pop_back(); }
 
diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc
index 69d1eb74f..0cc31611b 100644
--- a/src/cuda-sim/cuda-sim.cc
+++ b/src/cuda-sim/cuda-sim.cc
@@ -445,6 +445,88 @@ addr_t generic_to_local(unsigned smid, unsigned hwtid, addr_t addr) {
 
 addr_t generic_to_global(addr_t addr) { return addr; }
 
+// Returns the record inserted for host address cpuMemAddr (exact match on the
+// key used at insertion), or NULL when there is none.
+struct allocation_info *
+gpgpu_t::gpu_get_managed_allocation(uint64_t cpuMemAddr) {
+  std::map<uint64_t, struct allocation_info *>::const_iterator found =
+      managedAllocations.find(cpuMemAddr);  // one lookup instead of find + []
+  return found == managedAllocations.end() ? NULL : found->second;
+}
+
+const std::map<uint64_t, struct allocation_info *> &
+gpgpu_t::gpu_get_managed_allocations() {
+  return managedAllocations;  // read-only view, keyed by host (CPU) address
+}
+
+// Records a managed allocation: host address cpuMemAddr -> {gpuMemAddr, size},
+// with copied = false. A repeated cpuMemAddr refreshes the existing record
+// (std::map::insert would keep the stale one and leak the new node).
+void gpgpu_t::gpu_insert_managed_allocation(uint64_t cpuMemAddr,
+                                            uint64_t gpuMemAddr, size_t size) {
+  struct allocation_info *&a_i = managedAllocations[cpuMemAddr];
+  if (a_i == NULL)  // first time this host address is seen
+    a_i = (struct allocation_info *)malloc(sizeof(struct allocation_info));
+  a_i->gpu_mem_addr = gpuMemAddr;
+  a_i->allocation_size = size;
+  a_i->copied = false;
+}
+
+void gpgpu_t::gpu_writeback(uint64_t gpuMemAddr) {
+  size_t page_size = get_global_memory()->get_page_size();
+
+  mem_addr_t page_num = get_global_memory()->get_page_num(gpuMemAddr);
+
+  // Write back, for every managed allocation marked copied, the bytes that
+  // live on the page containing gpuMemAddr. Several allocations can share one
+  // GPU page (the GPU can coalesce small allocations into one page, the CPU
+  // gives each its own), so the scan does not stop at the first match.
+  for (std::map<uint64_t, struct allocation_info *>::const_iterator iter =
+           managedAllocations.begin();
+       iter != managedAllocations.end(); iter++) {
+
+    if (iter->second->copied) {
+
+      uint64_t devPtr = iter->second->gpu_mem_addr;
+
+      // does the allocation overlap the page we are trying to evict?
+      // NOTE(review): upper bound uses devPtr + size, not size - 1 — confirm
+      if (page_num >= get_global_memory()->get_page_num(devPtr) &&
+          page_num <= get_global_memory()->get_page_num(
+                          devPtr + iter->second->allocation_size)) {
+
+        // the allocation on GPU side starts from the evicted page
+        if (page_num == get_global_memory()->get_page_num(devPtr)) {
+          size_t size_on_page =
+              get_global_memory()->get_page_size() - (devPtr - gpuMemAddr);
+          // NOTE(review): assumes gpuMemAddr <= devPtr (page base?) — confirm
+          // the allocation can be much smaller than the distance from its
+          // start address to the end of the page; if so, copy only the
+          // allocation size, otherwise copy everything from the allocation
+          // start to the end of the page
+          memcpy_from_gpu((void *)iter->first, (size_t)devPtr,
+                          size_on_page > iter->second->allocation_size
+                              ? iter->second->allocation_size
+                              : size_on_page);
+        } else { // trailing (or middle) part of the allocation is in the
+                 // evicted page
+          size_t size_remaining =
+              iter->second->allocation_size - (gpuMemAddr - devPtr);
+
+          // the remaining size can exceed a page (the evicted page is in the
+          // middle of a multi-page allocation): then write back exactly one
+          // page worth of data. If the evicted page is the trailing page of
+          // the allocation and holds less than a page, copy only the
+          // remaining size. The host destination is offset like the GPU side.
+          memcpy_from_gpu(
+              (void *)(iter->first + (gpuMemAddr - devPtr)), (size_t)gpuMemAddr,
+              size_remaining > page_size ? page_size : size_remaining);
+        }
+      }
+    }
+  }
+}
+
 void *gpgpu_t::gpu_malloc(size_t size) {
   unsigned long long result = m_dev_malloc;
   if (g_debug_execution >= 3) {
@@ -454,12 +536,39 @@ void *gpgpu_t::gpu_malloc(size_t size) {
         size, m_dev_malloc);
     fflush(stdout);
   }
+
+  // make sure there is still memory space for allocation
+  if (!m_global_mem->alloc_page_by_byte(size)) {
+    return NULL;
+  }
+
   m_dev_malloc += size;
   if (size % 256)
     m_dev_malloc += (256 - size % 256);  // align to 256 byte boundaries
   return (void *)result;
 }
 
+// Bump allocator for managed memory: returns the current m_dev_malloc_managed
+// and advances it by size rounded up to a multiple of 256 bytes. Managed
+// allocations use their own address range, separate from m_dev_malloc.
+void *gpgpu_t::gpu_mallocmanaged(size_t size) {
+  const unsigned long long base = m_dev_malloc_managed;
+  if (g_debug_execution >= 3) {
+    printf("GPGPU-Sim PTX: allocating %zu bytes on GPU starting at address "
+           "0x%Lx\n",
+           size, base);
+    fflush(stdout);
+  }
+
+  // the unpadded end of the allocation must stay within the 32 bit limit
+  assert(base + size <= MEM_SPACE_LIMIT);
+
+  // round the size up to a multiple of 256 before advancing the heap pointer
+  const size_t tail = size % 256;
+  m_dev_malloc_managed = base + size + (tail ? 256 - tail : 0);
+  return (void *)base;
+}
+
 void *gpgpu_t::gpu_mallocarray(size_t size) {
   unsigned long long result = m_dev_malloc;
   if (g_debug_execution >= 3) {
@@ -469,6 +578,10 @@ void *gpgpu_t::gpu_mallocarray(size_t size) {
         size, m_dev_malloc);
     fflush(stdout);
   }
+  // make sure there is still memory space for allocation
+  if (!m_global_mem->alloc_page_by_byte(size)) {
+    return NULL;
+  }
   m_dev_malloc += size;
   if (size % 256)
     m_dev_malloc += (256 - size % 256);  // align to 256 byte boundaries
@@ -496,6 +609,10 @@ void gpgpu_t::memcpy_to_gpu(size_t dst_start_addr, const void *src,
   }
 }
 
+void gpgpu_t::set_pages_managed(size_t addr, size_t count) {
+  m_global_mem->set_pages_managed(addr, count);  // delegate to the global memory space
+}
+
 void gpgpu_t::memcpy_from_gpu(void *dst, size_t src_start_addr, size_t count) {
   if (g_debug_execution >= 3) {
     printf("GPGPU-Sim PTX: copying %zu bytes from GPU[0x%Lx] to CPU[0x%Lx] ...",
@@ -2230,10 +2347,11 @@ size_t get_kernel_code_size(class function_info *entry) {
 
 kernel_info_t *cuda_sim::gpgpu_opencl_ptx_sim_init_grid(
     class function_info *entry, gpgpu_ptx_sim_arg_list_t args,
-    struct dim3 gridDim, struct dim3 blockDim, gpgpu_t *gpu) {
+    struct dim3 gridDim, struct dim3 blockDim, gpgpu_t *gpu,
+    const gpgpu_sim_config &config) {
   kernel_info_t *result =
       new kernel_info_t(gridDim, blockDim, entry, gpu->getNameArrayMapping(),
-                        gpu->getNameInfoMapping());
+                        gpu->getNameInfoMapping(), config);
   unsigned argcount = args.size();
   unsigned argn = 1;
   for (gpgpu_ptx_sim_arg_list_t::iterator a = args.begin(); a != args.end();
diff --git a/src/cuda-sim/cuda-sim.h b/src/cuda-sim/cuda-sim.h
index b1caf0c64..0b7bb5635 100644
--- a/src/cuda-sim/cuda-sim.h
+++ b/src/cuda-sim/cuda-sim.h
@@ -186,7 +186,8 @@ class cuda_sim {
                                                 gpgpu_ptx_sim_arg_list_t args,
                                                 struct dim3 gridDim,
                                                 struct dim3 blockDim,
-                                                gpgpu_t *gpu);
+                                                gpgpu_t *gpu,
+                                                const gpgpu_sim_config &config);
   void gpgpu_ptx_sim_register_global_variable(void *hostVar,
                                               const char *deviceName,
                                               size_t size);
diff --git a/src/cuda-sim/cuda_device_runtime.cc b/src/cuda-sim/cuda_device_runtime.cc
index 8ed90bcc2..9669411ef 100644
--- a/src/cuda-sim/cuda_device_runtime.cc
+++ b/src/cuda-sim/cuda_device_runtime.cc
@@ -177,9 +177,12 @@ void cuda_device_runtime::gpgpusim_cuda_launchDeviceV2(
 
       // create child kernel_info_t and index it with parameter_buffer address
       gpgpu_t *gpu = thread->get_gpu();
+      const gpgpu_sim_config &g_the_gpu_config =  // bind, do not copy the config
+          *(gpu->gpgpu_ctx->the_gpgpusim->g_the_gpu_config);
       device_grid = new kernel_info_t(
           config.grid_dim, config.block_dim, device_kernel_entry,
-          gpu->getNameArrayMapping(), gpu->getNameInfoMapping());
+          gpu->getNameArrayMapping(), gpu->getNameInfoMapping(), 
+          g_the_gpu_config);
       device_grid->launch_cycle = gpu->gpu_sim_cycle + gpu->gpu_tot_sim_cycle;
       kernel_info_t &parent_grid = thread->get_kernel();
       DEV_RUNTIME_REPORT(
diff --git a/src/cuda-sim/memory.cc b/src/cuda-sim/memory.cc
index 036badaf1..c824ec1cb 100644
--- a/src/cuda-sim/memory.cc
+++ b/src/cuda-sim/memory.cc
@@ -33,7 +33,9 @@
 
 template <unsigned BSIZE>
 memory_space_impl<BSIZE>::memory_space_impl(std::string name,
-                                            unsigned hash_size) {
+                                            unsigned hash_size,
+                                            unsigned long long gddr_size)
+: num_gddr_pages(gddr_size / BSIZE) {
   m_name = name;
   MEM_MAP_RESIZE(hash_size);
 
@@ -45,6 +47,9 @@ memory_space_impl<BSIZE>::memory_space_impl(std::string name,
     }
   }
   assert(m_log2_block_size != (unsigned)-1);
+
+  // initialize the number of free pages based on size of GDDR5 and page size
+  num_free_pages = num_gddr_pages;
 }
 
 template <unsigned BSIZE>
@@ -102,6 +107,27 @@ void memory_space_impl<BSIZE>::write(mem_addr_t addr, size_t length,
   }
 }
 
+template <unsigned BSIZE> void memory_space_impl<BSIZE>::reset() {
+  num_free_pages = num_gddr_pages;  // every GDDR page counts as free again
+}
+
+template <unsigned BSIZE>
+bool memory_space_impl<BSIZE>::is_page_managed(mem_addr_t addr, size_t length) {
+  // only the page holding the last byte of [addr, addr + length) is consulted
+  return m_data[get_page_num(addr + length - 1)].is_managed();
+}
+
+// Marks every page overlapping [addr, addr + length) as managed, from the
+// page of the first byte up to and including the page of the last byte.
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::set_pages_managed(mem_addr_t addr,
+                                                 size_t length) {
+  const mem_addr_t last_page = get_page_num(addr + length - 1);
+  for (mem_addr_t page = get_page_num(addr); page <= last_page; ++page) {
+    m_data[page].set_managed();
+  }
+}
+
 template <unsigned BSIZE>
 void memory_space_impl<BSIZE>::read_single_block(mem_addr_t blk_idx,
                                                  mem_addr_t addr, size_t length,
@@ -179,9 +205,157 @@ void memory_space_impl<BSIZE>::set_watch(addr_t addr, unsigned watchpoint) {
   m_watchpoints[watchpoint] = addr;
 }
 
+// get page number from a virtual address
+template <unsigned BSIZE>
+mem_addr_t memory_space_impl<BSIZE>::get_page_num(mem_addr_t addr) {
+  return addr >> m_log2_block_size;
+}
+
+// check whether the valid flag of corresponding physical page is set or not
+template <unsigned BSIZE>
+bool memory_space_impl<BSIZE>::is_valid(mem_addr_t pg_index) {
+  // asserts whether the physical page is allocated.
+  // should never happen as they are allocated while memcpy.
+  assert(m_data.find(pg_index) != m_data.end());
+  return m_data[pg_index].is_valid();
+}
+
+// set the valid flag of corresponding physical page
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::validate_page(mem_addr_t pg_index) {
+  assert(m_data.find(pg_index) != m_data.end());
+  m_data[pg_index].validate_page();
+}
+
+// clear the valid flag of corresponding physical page
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::invalidate_page(mem_addr_t pg_index) {
+  assert(m_data.find(pg_index) != m_data.end());
+  m_data[pg_index].invalidate_page();
+}
+
+// An access of `length` bytes starting at `addr` may straddle a page boundary,
+// so every page it touches is checked. Returns the numbers of the touched
+// pages whose valid flag is not set, in ascending order.
+template <unsigned BSIZE>
+std::list<mem_addr_t>
+memory_space_impl<BSIZE>::get_faulty_pages(mem_addr_t addr, size_t length) {
+  const mem_addr_t first_page = get_page_num(addr);
+  const mem_addr_t last_page = get_page_num(addr + length - 1);
+
+  std::list<mem_addr_t> faulty;
+  for (mem_addr_t page = first_page; page <= last_page; page++) {
+    // is_valid() asserts that the page already has an m_data entry
+    if (is_valid(page)) {
+      continue;
+    }
+    faulty.push_back(page);
+  }
+
+  return faulty;
+}
+
+// Reserve enough whole pages to hold `size` bytes; false if too few are free.
+template <unsigned BSIZE>
+bool memory_space_impl<BSIZE>::alloc_page_by_byte(size_t size) {
+  // round up to whole pages; a zero-byte request needs none (and must not
+  // underflow `size - 1`)
+  const size_t page_num = (size == 0) ? 0 : (size - 1) / BSIZE + 1;
+  if (num_free_pages < page_num) return false;
+  num_free_pages -= page_num;
+  return true;
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::alloc_pages(size_t num) {
+  assert(num_free_pages >= num);
+  num_free_pages -= num;
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::free_pages(size_t num) {
+  num_free_pages += num;
+}
+
+template <unsigned BSIZE> size_t memory_space_impl<BSIZE>::get_free_pages() {
+  return num_free_pages;
+}
+
+// if the already allocated pages and about to allocate pages(in read stage
+// queue) reaches the buffer size in gddr, then it should start eviction
+// procedure
+template <unsigned BSIZE>
+bool memory_space_impl<BSIZE>::should_evict_page(
+    size_t read_stage_queue_size, size_t write_stage_queue_size,
+    float eviction_buffer_percentage) {
+  return ((float)(write_stage_queue_size + num_gddr_pages)) <
+         ((((float)num_gddr_pages) * eviction_buffer_percentage / 100) +
+          ((float)(num_gddr_pages - num_free_pages + read_stage_queue_size)));
+}
+
+template <unsigned BSIZE>
+float memory_space_impl<BSIZE>::get_projected_occupancy(
+    size_t read_stage_queue_size, size_t write_stage_queue_size,
+    float eviction_buffer_percentage) {
+  return ((((float)num_gddr_pages) * eviction_buffer_percentage / 100) +
+          ((float)(num_gddr_pages - num_free_pages + read_stage_queue_size))) /
+         ((float)(write_stage_queue_size + num_gddr_pages));
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::set_page_dirty(mem_addr_t pg_index) {
+  m_data[pg_index].set_dirty();
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::clear_page_dirty(mem_addr_t pg_index) {
+  m_data[pg_index].clear_dirty();
+}
+
+template <unsigned BSIZE>
+bool memory_space_impl<BSIZE>::is_page_dirty(mem_addr_t pg_index) {
+  return m_data[pg_index].is_dirty();
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::set_page_access(mem_addr_t pg_index) {
+  m_data[pg_index].set_access();  // raise the page's access flag
+}
+
+template <unsigned BSIZE>
+bool memory_space_impl<BSIZE>::is_page_access(mem_addr_t pg_index) {
+  return m_data[pg_index].is_access();
+}
+
+template <unsigned BSIZE>
+void memory_space_impl<BSIZE>::clear_page_access(mem_addr_t pg_index) {
+  m_data[pg_index].clear_access();  // drop the page's access flag
+}
+
+// number of bytes from `addr` up to the end of the page that contains it:
+// round `addr` down to its page's base address, measure how far `addr` lies
+// past that base, and take what is left of the page
+template <unsigned BSIZE>
+size_t memory_space_impl<BSIZE>::get_data_size(mem_addr_t addr) {
+  const mem_addr_t base = (addr >> m_log2_block_size) << m_log2_block_size;
+  const mem_addr_t offset_in_page = addr - base;
+  return BSIZE - offset_in_page;
+}
+
+template <unsigned BSIZE> size_t memory_space_impl<BSIZE>::get_page_size() {
+  return BSIZE;
+}
+
+template <unsigned BSIZE>
+mem_addr_t memory_space_impl<BSIZE>::get_mem_addr(mem_addr_t pg_index) {
+  return pg_index << m_log2_block_size;
+}
+
 template class memory_space_impl<32>;
 template class memory_space_impl<64>;
-template class memory_space_impl<8192>;
+//template class memory_space_impl<8192>;
+template class memory_space_impl<4096>;
+template class memory_space_impl<1024 * 1024 * 2>;
 template class memory_space_impl<16 * 1024>;
 
 void g_print_memory_space(memory_space *mem, const char *format = "%08x",
diff --git a/src/cuda-sim/memory.h b/src/cuda-sim/memory.h
index 5850aa1d6..e4e0cd390 100644
--- a/src/cuda-sim/memory.h
+++ b/src/cuda-sim/memory.h
@@ -40,6 +40,7 @@
 #endif
 
 #include <assert.h>
+#include <list>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -48,6 +49,14 @@
 
 typedef address_type mem_addr_t;
 
+typedef struct _eviction_t {
+  mem_addr_t addr;
+  size_t size;
+  unsigned long long cycle;
+  uint32_t access_counter;
+  uint8_t RW;
+} eviction_t;
+
 #define MEM_BLOCK_SIZE (4 * 1024)
 
 template <unsigned BSIZE>
@@ -56,8 +65,30 @@ class mem_storage {
   mem_storage(const mem_storage &another) {
     m_data = (unsigned char *)calloc(1, BSIZE);
     memcpy(m_data, another.m_data, BSIZE);
+
+    // only the bytes are duplicated; the copy starts as an unmanaged page
+    managed = false;
+
+    // page flags are reset to their defaults rather than taken from `another`
+    valid = false;
+    dirty = false;
+    access = false;
+
+    counter = 0;
+  }
+  mem_storage() { 
+    m_data = (unsigned char *)calloc(1, BSIZE); 
+
+    // initialize page as unmanaged
+    managed = false;
+
+    // initialize page flags to default value
+    valid = false;
+    dirty = false;
+    access = false;
+
+    counter = 0;
   }
-  mem_storage() { m_data = (unsigned char *)calloc(1, BSIZE); }
   ~mem_storage() { free(m_data); }
 
   void write(unsigned offset, size_t length, const unsigned char *data) {
@@ -83,9 +114,43 @@ class mem_storage {
     fflush(fout);
   }
 
+  // set the flag of managed into true in order to distinguish it from the
+  // unmanaged allocation
+  void set_managed() { managed = true; }
+  bool is_managed() { return managed; }
+
+  // methods to query and modify page table flags
+  bool is_valid() { return valid; }
+  void validate_page() { valid = true; }
+  void invalidate_page() { valid = false; }
+
+  void set_dirty() { dirty = true; }
+  void clear_dirty() { dirty = false; }
+  bool is_dirty() { return dirty; }
+
+  void set_access() { access = true; }
+  void clear_access() { access = false; }
+  bool is_access() { return access; }
+
  private:
   unsigned m_nbytes;
   unsigned char *m_data;
+
+  // flag to differentiate a managed allocation from a traditional unmanaged
+  // one. by default it is false, denoting cudaMalloc and cudaMallocArray; a
+  // managed allocation sets it to true. it is checked at the generation of
+  // mem_fetch to determine which path to take; managed may take the longer
+  // latency path
+  bool managed;
+
+  // flags for page table
+  bool valid;
+
+  bool dirty;
+
+  bool access;
+
+  unsigned counter;
 };
 
 class ptx_thread_info;
@@ -101,12 +166,53 @@ class memory_space {
   virtual void read(mem_addr_t addr, size_t length, void *data) const = 0;
   virtual void print(const char *format, FILE *fout) const = 0;
   virtual void set_watch(addr_t addr, unsigned watchpoint) = 0;
+
+  // method to find out whether or not to follow the managed time simulation
+  virtual bool is_page_managed(mem_addr_t addr, size_t length) = 0;
+  // method to set the pages as managed allocation
+  virtual void set_pages_managed(mem_addr_t addr, size_t length) = 0;
+
+  // method to allocate page(s) from free pages and change the count of free
+  // pages
+  virtual bool alloc_page_by_byte(size_t size) = 0;
+  virtual void alloc_pages(size_t num) = 0;
+  virtual void free_pages(size_t num) = 0;
+  virtual size_t get_free_pages() = 0;
+
+  virtual void set_page_dirty(mem_addr_t pg_index) = 0;
+  virtual bool is_page_dirty(mem_addr_t pg_index) = 0;
+  virtual void clear_page_dirty(mem_addr_t pg_index) = 0;
+
+  virtual void set_page_access(mem_addr_t pg_index) = 0;
+  virtual bool is_page_access(mem_addr_t pg_index) = 0;
+  virtual void clear_page_access(mem_addr_t pg_index) = 0;
+
+  // methods to query page table
+  virtual void validate_page(mem_addr_t pg_index) = 0;
+  virtual void invalidate_page(mem_addr_t pg_index) = 0;
+  virtual std::list<mem_addr_t> get_faulty_pages(mem_addr_t addr,
+                                                 size_t length) = 0;
+  virtual mem_addr_t get_page_num(mem_addr_t addr) = 0;
+
+  virtual size_t get_data_size(mem_addr_t addr) = 0;
+  virtual size_t get_page_size() = 0;
+  virtual mem_addr_t get_mem_addr(mem_addr_t pg_index) = 0;
+  virtual bool is_valid(mem_addr_t pg_index) = 0;
+  virtual bool should_evict_page(size_t read_stage_queue_size,
+                                 size_t write_stage_queue_size,
+                                 float eviction_buffer_percentage) = 0;
+  virtual float get_projected_occupancy(size_t read_stage_queue_size,
+                                        size_t write_stage_queue_size,
+                                        float eviction_buffer_percentage) = 0;
+
+  virtual void reset() = 0;
 };
 
 template <unsigned BSIZE>
 class memory_space_impl : public memory_space {
  public:
-  memory_space_impl(std::string name, unsigned hash_size);
+  memory_space_impl(std::string name, unsigned hash_size,
+                    unsigned long long gddr_size = 0);
 
   virtual void write(mem_addr_t addr, size_t length, const void *data,
                      ptx_thread_info *thd, const ptx_instruction *pI);
@@ -117,13 +223,68 @@ class memory_space_impl : public memory_space {
 
   virtual void set_watch(addr_t addr, unsigned watchpoint);
 
+  // method to find out whether or not to follow the managed time simulation
+  virtual bool is_page_managed(mem_addr_t addr, size_t length);
+  // method to set the pages as managed allocation
+  virtual void set_pages_managed(mem_addr_t addr, size_t length);
+
+  // methods to query page table
+  virtual void validate_page(mem_addr_t pg_index);
+  virtual void invalidate_page(mem_addr_t pg_index);
+  virtual std::list<mem_addr_t> get_faulty_pages(mem_addr_t addr,
+                                                 size_t length);
+  virtual mem_addr_t get_page_num(mem_addr_t addr);
+
+  // methods to implement gddr size constraint
+  virtual bool alloc_page_by_byte(size_t size);
+  virtual void alloc_pages(size_t num);
+  virtual void free_pages(size_t num);
+  virtual size_t get_free_pages();
+
+  virtual void set_page_dirty(mem_addr_t pg_index);
+  virtual bool is_page_dirty(mem_addr_t pg_index);
+  virtual void clear_page_dirty(mem_addr_t pg_index);
+
+  virtual void set_page_access(mem_addr_t pg_index);
+  virtual bool is_page_access(mem_addr_t pg_index);
+  virtual void clear_page_access(mem_addr_t pg_index);
+
+  virtual size_t get_data_size(mem_addr_t addr);
+  virtual size_t get_page_size();
+  virtual mem_addr_t get_mem_addr(mem_addr_t pg_index);
+
+  virtual bool is_valid(mem_addr_t pg_index);
+  virtual bool should_evict_page(size_t read_stage_queue_size,
+                                 size_t write_stage_queue_size,
+                                 float eviction_buffer_percentage);
+  virtual float get_projected_occupancy(size_t read_stage_queue_size,
+                                        size_t write_stage_queue_size,
+                                        float eviction_buffer_percentage);
+
+  virtual void reset();
+
  private:
   void read_single_block(mem_addr_t blk_idx, mem_addr_t addr, size_t length,
                          void *data) const;
   std::string m_name;
   unsigned m_log2_block_size;
+
+  // map_t m_data closely resembles to a page table
+  // the dictionary is keyed by the virtual address
+  // mem_storage acts as the physical page
   typedef mem_map<mem_addr_t, mem_storage<BSIZE> > map_t;
   map_t m_data;
+
+  // number of BSIZE-byte pages of global memory that are still free
+  // it starts at the GDDR5 size specified in config divided by BSIZE
+  // it is used to enforce size restriction on both managed and unmanaged malloc
+  // it should be decremented on every allocation either managed or unmanaged
+  // i.e., gpu_malloc, gpu_mallocmanaged, gpu_mallocarray
+  size_t num_free_pages;
+
+  // the size of gddr in number of pages
+  const size_t num_gddr_pages;
+  
   std::map<unsigned, mem_addr_t> m_watchpoints;
 };
 
diff --git a/src/gpgpu-sim/gpu-misc.h b/src/gpgpu-sim/gpu-misc.h
index 117eb7875..da41414d0 100644
--- a/src/gpgpu-sim/gpu-misc.h
+++ b/src/gpgpu-sim/gpu-misc.h
@@ -38,5 +38,7 @@ unsigned int LOGB2(unsigned int v);
 
 #define gs_min2(a, b) (((a) < (b)) ? (a) : (b))
 #define min3(x, y, z) (((x) < (y) && (x) < (z)) ? (x) : (gs_min2((y), (z))))
+#define min4(w, x, y, z)                                                       \
+  ((gs_min2(w, x) < gs_min2(y, z)) ? gs_min2(w, x) : gs_min2(y, z))
 
 #endif
diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc
index 3f84d42fc..cf2c07d38 100644
--- a/src/gpgpu-sim/gpu-sim.cc
+++ b/src/gpgpu-sim/gpu-sim.cc
@@ -85,6 +85,169 @@ class gpgpu_sim_wrapper {};
 
 bool g_interactive_debugger_enabled = false;
 
+bool sim_prof_enable = false;
+
+std::list<shd_warp_t *> all_warps;
+std::list<shd_warp_t *> fail_warps;
+bool skip_cycles = false;
+bool skip_cycles_enable = false;
+int skipped_cycles = 0;
+int gpu_tot_skipped_cycle = 0;
+
+std::map<unsigned long long, std::list<event_stats *>> sim_prof;
+
+// print every recorded profiler event to `fout`; each event is handed the
+// clock frequency after it has been divided by 1000
+void print_sim_prof(FILE *fout, float freq) {
+  freq /= 1000;
+  for (auto &time_and_events : sim_prof) {
+    std::list<event_stats *> &events = time_and_events.second;
+    for (event_stats *event : events) {
+      event->print(fout, freq);
+    }
+  }
+}
+
+unsigned long long kernel_time = 0;
+unsigned long long memory_copy_time_h2d = 0;
+unsigned long long memory_copy_time_d2h = 0;
+unsigned long long prefetch_time = 0;
+unsigned long long devicesync_time = 0;
+unsigned long long writeback_time = 0;
+unsigned long long dma_time = 0;
+
+//unsigned long long gpu_sim_cycle = 0;
+//unsigned long long gpu_tot_sim_cycle = 0;
+
+void calculate_sim_prof(FILE *fout, gpgpu_sim *gpu) {
+  float freq = gpu->shader_clock() / 1000.0;
+  for (std::map<unsigned long long, std::list<event_stats *>>::iterator iter =
+           sim_prof.begin();
+       iter != sim_prof.end(); iter++) {
+    for (std::list<event_stats *>::iterator iter2 = iter->second.begin();
+         iter2 != iter->second.end(); iter2++) {
+      (*iter2)->calculate();
+    }
+  }
+
+  unsigned long long page_fault_time = 0;
+  if (!gpu->get_config().enable_accurate_simulation) {
+    page_fault_time = gpu->m_new_stats->mf_page_fault_outstanding *
+                      gpu->get_config().page_fault_latency;
+  }
+
+  fprintf(fout, "Tot_prefetch_time: %llu(cycle), %f(us)\n", prefetch_time,
+          ((float)prefetch_time) / freq);
+  fprintf(fout, "Tot_kernel_exec_time: %llu(cycle), %f(us)\n", kernel_time,
+          ((float)kernel_time) / freq);
+
+  if (!gpu->get_config().enable_accurate_simulation) {
+    fprintf(fout, "Tot_kernel_exec_time_and_fault_time: %llu(cycle), %f(us)\n",
+            kernel_time + page_fault_time,
+            ((float)(kernel_time + page_fault_time)) / freq);
+  }
+
+  fprintf(fout, "Tot_memcpy_h2d_time: %llu(cycle), %f(us)\n",
+          memory_copy_time_h2d, ((float)memory_copy_time_h2d) / freq);
+  fprintf(fout, "Tot_memcpy_d2h_time: %llu(cycle), %f(us)\n",
+          memory_copy_time_d2h, ((float)memory_copy_time_d2h) / freq);
+  fprintf(fout, "Tot_memcpy_time: %llu(cycle), %f(us)\n",
+          memory_copy_time_h2d + memory_copy_time_d2h,
+          ((float)(memory_copy_time_h2d + memory_copy_time_d2h)) / freq);
+  fprintf(fout, "Tot_devicesync_time: %llu(cycle), %f(us)\n", devicesync_time,
+          ((float)devicesync_time) / freq);
+  fprintf(fout, "Tot_writeback_time: %llu(cycle), %f(us)\n", writeback_time,
+          ((float)writeback_time) / freq);
+  fprintf(fout, "Tot_dma_time: %llu(cycle), %f(us)\n", dma_time,
+          ((float)dma_time) / freq);
+  fprintf(fout, "Tot_memcpy_d2h_sync_wb_time: %llu(cycle), %f(us)\n",
+          writeback_time + devicesync_time + memory_copy_time_d2h,
+          ((float)(writeback_time + devicesync_time + memory_copy_time_d2h) /
+           freq));
+}
+
+// record `end_time` on the first kernel_launching event, in sim_prof
+// iteration order, whose kernel_id equals `kernel_id`; nothing is changed if
+// no such event has been recorded
+void update_sim_prof_kernel(unsigned kernel_id, unsigned long long end_time) {
+  for (auto &time_and_events : sim_prof) {
+    for (event_stats *event : time_and_events.second) {
+      if (event->type != kernel_launching) continue;
+      // kernel_id is read only from events tagged kernel_launching
+      if (((kernel_stats *)event)->kernel_id != kernel_id) continue;
+      event->end_time = end_time;
+      return;
+    }
+  }
+}
+
+void update_sim_prof_prefetch(mem_addr_t start_addr, size_t size,
+                              unsigned long long end_time) {
+  for (std::map<unsigned long long, std::list<event_stats *>>::reverse_iterator
+           iter = sim_prof.rbegin();
+       iter != sim_prof.rend(); iter++) {
+    for (std::list<event_stats *>::iterator iter2 = iter->second.begin();
+         iter2 != iter->second.end(); iter2++) {
+      if ((*iter2)->type == prefetch &&
+          ((memory_stats *)(*iter2))->start_addr == start_addr &&
+          ((memory_stats *)(*iter2))->size == size) {
+        (*iter2)->end_time = end_time;
+        return;
+      }
+    }
+  }
+}
+
+void update_sim_prof_prefetch_break_down(unsigned long long end_time) {
+  for (std::map<unsigned long long, std::list<event_stats *>>::reverse_iterator
+           iter = sim_prof.rbegin();
+       iter != sim_prof.rend(); iter++) {
+    for (std::list<event_stats *>::reverse_iterator iter2 =
+             iter->second.rbegin();
+         iter2 != iter->second.rend(); iter2++) {
+      if ((*iter2)->type == prefetch_breakdown) {
+        (*iter2)->end_time = end_time;
+        return;
+      }
+    }
+  }
+}
+
+void print_UVM_stats(gpgpu_new_stats *new_stats, gpgpu_sim *gpu, FILE *fout) {
+  // NOTE: `fout` is not used; output goes to stdout and to access.txt
+  new_stats->print(stdout);
+  /*
+      FILE* f1 = fopen("Pcie_trace.txt", "w");
+
+      g_the_gpu->m_new_stats->print_pcie(f1);
+
+      fclose(f1);
+
+      FILE* f2 = fopen("Access_pattern_detail.txt", "w");
+
+      g_the_gpu->m_new_stats->print_access_pattern_detail(f2);
+
+      fclose(f2);
+
+      FILE* f3 = fopen("Access_pattern.txt", "w");
+
+      g_the_gpu->m_new_stats->print_access_pattern(f3);
+
+      fclose(f3);
+
+  */
+  if (FILE *f4 = fopen("access.txt", "w")) {
+    new_stats->print_time_and_access(f4);
+    fclose(f4);
+  } else {
+    perror("access.txt");  // report the failure instead of using a NULL FILE
+  }
+  if (sim_prof_enable) {
+    print_sim_prof(stdout, gpu->shader_clock());
+    calculate_sim_prof(stdout, gpu);
+  }
+}
+
 tr1_hash_map<new_addr_type, unsigned> address_random_interleaving;
 
 /* Clock Domains */
@@ -93,6 +256,7 @@ tr1_hash_map<new_addr_type, unsigned> address_random_interleaving;
 #define L2 0x02
 #define DRAM 0x04
 #define ICNT 0x08
+#define GMMU 0x10
 
 #define MEM_LATENCY_STAT_IMPL
 
@@ -654,6 +818,8 @@ void shader_core_config::reg_options(class OptionParser *opp) {
   option_parser_register(opp, "-gpgpu_reg_file_port_throughput", OPT_INT32,
                          &reg_file_port_throughput,
                          "the number ports of the register file", "1");
+  option_parser_register(opp, "-tlb_size", OPT_INT32, &tlb_size,
+                         "Number of tlb entries per SM.", "4096");                       
 
   for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) {
     std::stringstream ss;
@@ -777,8 +943,104 @@ void gpgpu_sim_config::reg_options(option_parser_t opp) {
                          &(gpgpu_ctx->device_runtime->g_TB_launch_latency),
                          "thread block launch latency in cycles. Default: 0",
                          "0");
+  option_parser_register(
+      opp, "-gddr_size", OPT_CSTR, &gddr_size_string,
+      "Size of GDDR in MB/GB.(GLOBAL_HEAP_START, GLOBAL_HEAP_START + "
+      "gddr_size) would be used for unmanged memory, (GLOBAL_HEAP_START + "
+      "gddr_size, GLOBAL_HEAP_START + gddr_size*2) would be used for managed "
+      "memory. ",
+      "1GB");
+
+  option_parser_register(
+      opp, "-page_table_walk_latency", OPT_INT64, &page_table_walk_latency,
+      "Average page table walk latency (in core cycle).", "100");
+
+  option_parser_register(opp, "-eviction_policy", OPT_INT32, &eviction_policy,
+                         "Select page eviction policy", "0");
+
+  option_parser_register(opp, "-invalidate_clean", OPT_BOOL, &invalidate_clean,
+                         "Should directly invalidate clean pages", "0");
+
+  option_parser_register(
+      opp, "-reserve_accessed_page_percent", OPT_FLOAT,
+      &reserve_accessed_page_percent,
+      "Percentage of accessed pages reserved from eviction in hope that they "
+      "will be accessed in next iteration.",
+      "0.0");
+
+  option_parser_register(
+      opp, "-percentage_of_free_page_buffer", OPT_FLOAT,
+      &free_page_buffer_percentage,
+      "Percentage of free page buffer to trigger the page eviction.", "0.0");
+
+  option_parser_register(opp, "-page_size", OPT_CSTR, &page_size_string,
+                         "GDDR page size, only 4KB/2MB avaliable.", "4KB");
+
+  option_parser_register(opp, "-pcie_bandwidth", OPT_CSTR,
+                         &pcie_bandwidth_string,
+                         "PCI-e bandwidth per direction, in GB/s.", "16.0GB/s");
+
+  option_parser_register(opp, "-enable_dma", OPT_INT32, &enable_dma,
+                         "Enable direct access to CPU memory", "0");
+
+  option_parser_register(
+      opp, "-multiply_dma_penalty", OPT_INT32, &multiply_dma_penalty,
+      "Oversubscription Multiplicative Penalty Factor for Adaptive DMA", "0");
+
+  option_parser_register(
+      opp, "-migrate_threshold", OPT_INT32, &migrate_threshold,
+      "Access counter threshold for migrating the page from cpu to gpu", "10");
+
+  option_parser_register(opp, "-sim_prof_enable", OPT_BOOL, &sim_prof_enable,
+                         "Enable gpgpu-sim profiler", "0");
+
+  option_parser_register(opp, "-hardware_prefetch", OPT_INT32,
+                         &hardware_prefetch,
+                         "Select gpgpu-sim hardware prefetcher", "1");
+
+  option_parser_register(
+      opp, "-hwprefetch_oversub", OPT_INT32, &hwprefetch_oversub,
+      "Select gpgpu-sim hardware prefetcher under over-subscription", "0");
+
+  option_parser_register(opp, "-page_fault_latency", OPT_INT64,
+                         &page_fault_latency,
+                         "Average fault latency (in core cycle).", "66645");
+
+  option_parser_register(opp, "-enable_accurate_simulation", OPT_BOOL,
+                         &enable_accurate_simulation,
+                         "Enable page fault functional simulation.", "0");
+
+  option_parser_register(opp, "-enable_smart_runtime", OPT_BOOL,
+                         &enable_smart_runtime,
+                         "Enable access pattern detection, policy engine, and "
+                         "adaptive memory management.",
+                         "0");
+  option_parser_register(
+      opp, "-skip_cycles_enable", OPT_BOOL, &skip_cycles_enable,
+      "Enable skip cycles if all warps stall and wait for page fualt", "0");
 }
 
+// Runs the base-class conversion, then parses -pcie_bandwidth and picks
+// curve_a/curve_b; only 16.0GB/s, 32.0GB/s and 64.0GB/s are accepted.
+void gpgpu_sim_config::convert_byte_string() {
+  gpgpu_functional_sim_config::convert_byte_string();
+  if (!strstr(pcie_bandwidth_string, "GB/s")) {
+    printf("-pcie_bandwidth should be in GB/s\n");
+    exit(1);
+  }
+  pcie_bandwidth = strtof(pcie_bandwidth_string, NULL);
+  if (pcie_bandwidth == 16.0) {
+    curve_a = 12.0;
+  } else if (pcie_bandwidth == 32.0) {
+    curve_a = 24.0;
+  } else if (pcie_bandwidth == 64.0) {
+    curve_a = 48.0;
+  } else {
+    printf("-pcie_bandwidth should be 16.0GB/s, 32.0GB/s or 64.0GB/s\n");
+    exit(1);
+  }
+  curve_b = 0.07292;
+}
 /////////////////////////////////////////////////////////////////////////////
 
 void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) {
@@ -958,7 +1220,8 @@ void exec_gpgpu_sim::createSIMTCluster() {
   for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++)
     m_cluster[i] =
         new exec_simt_core_cluster(this, i, m_shader_config, m_memory_config,
-                                   m_shader_stats, m_memory_stats);
+                                   m_shader_stats, m_memory_stats,
+                                   m_new_stats);
 }
 
 // SST get its own simt_cluster
@@ -994,6 +1257,9 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx)
       new power_stat_t(m_shader_config, average_pipeline_duty_cycle, active_sms,
                        m_shader_stats, m_memory_config, m_memory_stats);
 
+  printf("Create m_new_stats.\n");
+  m_new_stats = new gpgpu_new_stats(m_config);
+
   gpu_sim_insn = 0;
   gpu_tot_sim_insn = 0;
   gpu_tot_issued_cta = 0;
@@ -1001,6 +1267,8 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx)
   m_total_cta_launched = 0;
   gpu_deadlock = false;
 
+  m_gmmu = new gmmu_t(this, config, m_new_stats);
+
   gpu_stall_dramfull = 0;
   gpu_stall_icnt2sh = 0;
   partiton_reqs_in_parallel = 0;
@@ -1140,6 +1408,7 @@ void gpgpu_sim::reinit_clock_domains(void) {
   dram_time = 0;
   icnt_time = 0;
   l2_time = 0;
+  gmmu_time = 0;
 }
 
 bool gpgpu_sim::active() {
@@ -1190,6 +1459,7 @@ bool sst_gpgpu_sim::active() {
 void gpgpu_sim::init() {
   // run a CUDA grid on the GPU microarchitecture simulator
   gpu_sim_cycle = 0;
+  skipped_cycles = 0;
   gpu_sim_insn = 0;
   last_gpu_sim_insn = 0;
   m_total_cta_launched = 0;
@@ -1237,6 +1507,7 @@ void gpgpu_sim::init() {
 void gpgpu_sim::update_stats() {
   m_memory_stats->memlatstat_lat_pw();
   gpu_tot_sim_cycle += gpu_sim_cycle;
+  gpu_tot_skipped_cycle += skipped_cycles;
   gpu_tot_sim_insn += gpu_sim_insn;
   gpu_tot_issued_cta += m_total_cta_launched;
   partiton_reqs_in_parallel_total += partiton_reqs_in_parallel;
@@ -1246,6 +1517,7 @@ void gpgpu_sim::update_stats() {
   gpu_tot_occupancy += gpu_occupancy;
 
   gpu_sim_cycle = 0;
+  skipped_cycles = 0;
   partiton_reqs_in_parallel = 0;
   partiton_replys_in_parallel = 0;
   partiton_reqs_in_parallel_util = 0;
@@ -1636,6 +1908,7 @@ void gpgpu_sim::gpu_print_stat(unsigned long long streamID) {
   printf("icnt_total_pkts_simt_to_mem=%ld\n", total_simt_to_mem);
 
   time_vector_print();
+  print_UVM_stats(m_new_stats, this, stdout);
   fflush(stdout);
 
   clear_executed_kernel_info();
@@ -1933,7 +2206,16 @@ void dram_t::dram_log(int task) {
 
 // Find next clock domain and increment its time
 int gpgpu_sim::next_clock_domain(void) {
-  double smallest = min3(core_time, icnt_time, dram_time);
+  // to get the cycles spent for any cuda stream operation before and after the
+  // kernel is launched monotonically increase the total simulation cycle
+  if (!active()) {
+    int mask = 0x00;
+    mask |= GMMU;
+    gpu_tot_sim_cycle++;
+    return mask;
+  }
+
+  double smallest = min4(core_time, icnt_time, dram_time, gmmu_time);
   int mask = 0x00;
   if (l2_time <= smallest) {
     smallest = l2_time;
@@ -1952,6 +2234,10 @@ int gpgpu_sim::next_clock_domain(void) {
     mask |= CORE;
     core_time += m_config.core_period;
   }
+  if (gmmu_time <= smallest) {
+    mask |= GMMU;
+    gmmu_time += m_config.core_period;
+  }
   return mask;
 }
 
@@ -1970,163 +2256,2763 @@ void gpgpu_sim::issue_block2core() {
 unsigned long long g_single_step =
     0;  // set this in gdb to single step the pipeline
 
-void gpgpu_sim::cycle() {
-  int clock_mask = next_clock_domain();
+gpgpu_new_stats::gpgpu_new_stats(const gpgpu_sim_config &config)
+    : m_config(config) {
+  tlb_hit = new unsigned long long[m_config.num_cluster()];
+  tlb_miss = new unsigned long long[m_config.num_cluster()];
+  tlb_val = new unsigned long long[m_config.num_cluster()];
+  tlb_evict = new unsigned long long[m_config.num_cluster()];
+  tlb_page_evict = new unsigned long long[m_config.num_cluster()];
 
-  if (clock_mask & CORE) {
-    // shader core loading (pop from ICNT into core) follows CORE clock
-    for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++)
-      m_cluster[i]->icnt_cycle();
+  mf_page_hit = new unsigned long long[m_config.num_cluster()];
+  mf_page_miss = new unsigned long long[m_config.num_cluster()];
+
+  mf_page_fault_outstanding = 0;
+  mf_page_fault_pending = 0;
+
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    tlb_hit[i] = 0;
+    tlb_miss[i] = 0;
+    tlb_val[i] = 0;
+    tlb_evict[i] = 0;
+    tlb_page_evict[i] = 0;
+    mf_page_hit[i] = 0;
+    mf_page_miss[i] = 0;
   }
-  unsigned partiton_replys_in_parallel_per_cycle = 0;
-  if (clock_mask & ICNT) {
-    // pop from memory controller to interconnect
-    for (unsigned i = 0; i < m_memory_config->m_n_mem_sub_partition; i++) {
-      mem_fetch *mf = m_memory_sub_partition[i]->top();
-      if (mf) {
-        unsigned response_size =
-            mf->get_is_write() ? mf->get_ctrl_size() : mf->size();
-        if (::icnt_has_buffer(m_shader_config->mem2device(i), response_size)) {
-          // if (!mf->get_is_write())
-          mf->set_return_timestamp(gpu_sim_cycle + gpu_tot_sim_cycle);
-          mf->set_status(IN_ICNT_TO_SHADER, gpu_sim_cycle + gpu_tot_sim_cycle);
-          ::icnt_push(m_shader_config->mem2device(i), mf->get_tpc(), mf,
-                      response_size);
-          m_memory_sub_partition[i]->pop();
-          partiton_replys_in_parallel_per_cycle++;
-        } else {
-          gpu_stall_icnt2sh++;
-        }
-      } else {
-        m_memory_sub_partition[i]->pop();
-      }
+
+  pf_page_hit = 0;
+  pf_page_miss = 0;
+
+  page_evict_not_dirty = 0;
+  page_evict_dirty = 0;
+
+  num_dma = 0;
+  dma_page_transfer_read = 0;
+  dma_page_transfer_write = 0;
+
+  tlb_thrashing =
+      new std::map<mem_addr_t, std::vector<bool>>[m_config.num_cluster()*m_config.num_core_per_cluster()];
+
+  ma_latency =
+      new std::map<unsigned,
+                   std::pair<bool, unsigned long long>>[m_config.num_cluster()*m_config.num_core_per_cluster()];
+
+  page_access_times =
+      new std::map<mem_addr_t, unsigned>[m_config.num_cluster()*m_config.num_core_per_cluster()];
+}
+
+// Dump every recorded PCI-e utilization sample to fout, one "<first> <second>"
+// pair per line: the read-lane list first, then the write-lane list.
+void gpgpu_new_stats::print_pcie(FILE *fout) const {
+  // Read direction.
+  fprintf(fout, "Read lanes:\n");
+  for (const auto &sample : pcie_read_utilization) {
+    fprintf(fout, "%llu %f\n", sample.first, sample.second);
+  }
+  // Write direction.
+  fprintf(fout, "Write lanes:\n");
+  for (const auto &sample : pcie_write_utilization) {
+    fprintf(fout, "%llu %f\n", sample.first, sample.second);
+  }
+}
+
+void gpgpu_new_stats::print_access_pattern_detail(FILE *fout) const {
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    fprintf(fout, "Shader %u\n", i);
+    for (std::map<mem_addr_t, unsigned>::const_iterator iter =
+             page_access_times[i].begin();
+         iter != page_access_times[i].end(); iter++) {
+      fprintf(fout, "%u %u\n", iter->first, iter->second);
     }
   }
-  partiton_replys_in_parallel += partiton_replys_in_parallel_per_cycle;
+}
 
-  if (clock_mask & DRAM) {
-    for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) {
-      if (m_memory_config->simple_dram_model)
-        m_memory_partition_unit[i]->simple_dram_model_cycle();
-      else
-        m_memory_partition_unit[i]
-            ->dram_cycle();  // Issue the dram command (scheduler + delay model)
-      // Update performance counters for DRAM
-      m_memory_partition_unit[i]->set_dram_power_stats(
-          m_power_stats->pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_act[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i],
-          m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]);
+void gpgpu_new_stats::print_access_pattern(FILE *fout) const {
+  std::map<mem_addr_t, unsigned> tot_access;
+  fprintf(fout, "Total page access pttern:\n");
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    for (std::map<mem_addr_t, unsigned>::const_iterator iter =
+             page_access_times[i].begin();
+         iter != page_access_times[i].end(); iter++) {
+      tot_access[iter->first] += iter->second;
     }
   }
+  for (std::map<mem_addr_t, unsigned>::const_iterator iter = tot_access.begin();
+       iter != tot_access.end(); iter++) {
+    fprintf(fout, "%u %u\n", iter->first, iter->second);
+  }
+}
 
-  // L2 operations follow L2 clock domain
-  unsigned partiton_reqs_in_parallel_per_cycle = 0;
-  if (clock_mask & L2) {
-    m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].clear();
-    for (unsigned i = 0; i < m_memory_config->m_n_mem_sub_partition; i++) {
-      // move memory request from interconnect into memory partition (if not
-      // backed up) Note:This needs to be called in DRAM clock domain if there
-      // is no L2 cache in the system In the worst case, we may need to push
-      // SECTOR_CHUNCK_SIZE requests, so ensure you have enough buffer for them
-      if (m_memory_sub_partition[i]->full(SECTOR_CHUNCK_SIZE)) {
-        gpu_stall_dramfull++;
-      } else {
-        mem_fetch *mf = (mem_fetch *)icnt_pop(m_shader_config->mem2device(i));
-        m_memory_sub_partition[i]->push(mf, gpu_sim_cycle + gpu_tot_sim_cycle);
-        if (mf) partiton_reqs_in_parallel_per_cycle++;
-      }
-      m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle);
-      if (m_config.g_power_simulation_enabled) {
-        m_memory_sub_partition[i]->accumulate_L2cache_stats(
-            m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]);
+void gpgpu_new_stats::print_time_and_access(FILE *fout) const {
+  for (std::list<access_info>::const_iterator iter =
+           time_and_page_access.begin();
+       iter != time_and_page_access.end(); iter++) {
+    fprintf(fout, "%u 0x%x %u %llu %d %u %u\n", iter->page_no, iter->mem_addr,
+            iter->size, iter->cycle, iter->is_read, iter->sm_id, iter->warp_id);
+  }
+
+  for (std::map<unsigned long long, std::list<event_stats *>>::iterator iter =
+           sim_prof.begin();
+       iter != sim_prof.end(); iter++) {
+    for (std::list<event_stats *>::iterator iter2 = iter->second.begin();
+         iter2 != iter->second.end(); iter2++) {
+      if ((*iter2)->type == kernel_launching) {
+        fprintf(fout, "K: %llu %llu\n", (*iter2)->start_time,
+                (*iter2)->end_time);
       }
     }
   }
-  partiton_reqs_in_parallel += partiton_reqs_in_parallel_per_cycle;
-  if (partiton_reqs_in_parallel_per_cycle > 0) {
-    partiton_reqs_in_parallel_util += partiton_reqs_in_parallel_per_cycle;
-    gpu_sim_cycle_parition_util++;
+}
+
+void gpgpu_new_stats::print(FILE *fout) const {
+  fprintf(fout, "========================================UVM "
+                "statistics==============================\n");
+
+  fprintf(fout, "========================================TLB "
+                "statistics(access)==============================\n");
+  unsigned long long tot_tlb_hit = 0;
+  unsigned long long tot_tlb_miss = 0;
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    fprintf(fout,
+            "Shader%u: Tlb_access: %llu Tlb_hit: %llu Tlb_miss: %llu "
+            "Tlb_hit_rate: %f\n",
+            i, tlb_hit[i] + tlb_miss[i], tlb_hit[i], tlb_miss[i],
+            ((float)tlb_hit[i]) / ((float)(tlb_hit[i] + tlb_miss[i])));
+    tot_tlb_hit += tlb_hit[i];
+    tot_tlb_miss += tlb_miss[i];
   }
 
-  if (clock_mask & ICNT) {
-    icnt_transfer();
+  fprintf(fout,
+          "Tlb_tot_access: %llu Tlb_tot_hit: %llu, Tlb_tot_miss: %llu, "
+          "Tlb_tot_hit_rate: %f\n",
+          tot_tlb_hit + tot_tlb_miss, tot_tlb_hit, tot_tlb_miss,
+          ((float)tot_tlb_hit) / ((float)(tot_tlb_hit + tot_tlb_miss)));
+
+  fprintf(fout, "========================================TLB "
+                "statistics(validate)==============================\n");
+  unsigned long long tot_tlb_val = 0;
+  unsigned long long tot_tlb_inval_te = 0;
+  unsigned long long tot_tlb_inval_pe = 0;
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    fprintf(fout,
+            "Shader%u: Tlb_validate: %llu Tlb_invalidate: %llu Tlb_evict: %llu "
+            "Tlb_page_evict: %llu\n",
+            i, tlb_val[i], tlb_evict[i] + tlb_page_evict[i], tlb_evict[i],
+            tlb_page_evict[i]);
+    tot_tlb_val += tlb_val[i];
+    tot_tlb_inval_te += tlb_evict[i];
+    tot_tlb_inval_pe += tlb_page_evict[i];
   }
 
-  if (clock_mask & CORE) {
-    // L1 cache + shader core pipeline stages
-    m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].clear();
-    for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
-      if (m_cluster[i]->get_not_completed() || get_more_cta_left()) {
-        m_cluster[i]->core_cycle();
-        *active_sms += m_cluster[i]->get_n_active_sms();
-      }
-      // Update core icnt/cache stats for AccelWattch
-      if (m_config.g_power_simulation_enabled) {
-        m_cluster[i]->get_icnt_stats(
-            m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i],
-            m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]);
-        m_cluster[i]->get_cache_stats(
-            m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]);
+  fprintf(fout,
+          "Tlb_tot_valiate: %llu Tlb_invalidate: %llu, Tlb_tot_evict: %llu, "
+          "Tlb_tot_evict page: %llu\n",
+          tot_tlb_val, tot_tlb_inval_te + tot_tlb_inval_pe, tot_tlb_inval_te,
+          tot_tlb_inval_pe);
+
+  fprintf(fout, "========================================TLB "
+                "statistics(thrashing)==============================\n");
+  std::map<mem_addr_t, unsigned> tlb_thrash[m_config.num_cluster()];
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    for (std::map<mem_addr_t, std::vector<bool>>::const_iterator iter =
+             tlb_thrashing[i].begin();
+         iter != tlb_thrashing[i].end(); iter++) {
+      for (unsigned j = 0; j != iter->second.size(); j++) {
+        if (j + 2 >= iter->second.size())
+          break;
+        if (iter->second[j] == true && iter->second[j + 1] == false &&
+            iter->second[j + 2] == true)
+          tlb_thrash[i][iter->first]++;
       }
-      m_cluster[i]->get_current_occupancy(
-          gpu_occupancy.aggregate_warp_slot_filled,
-          gpu_occupancy.aggregate_theoretical_warp_slots);
-    }
-    float temp = 0;
-    for (unsigned i = 0; i < m_shader_config->num_shader(); i++) {
-      temp += m_shader_stats->m_pipeline_duty_cycle[i];
     }
-    temp = temp / m_shader_config->num_shader();
-    *average_pipeline_duty_cycle = ((*average_pipeline_duty_cycle) + temp);
-    // cout<<"Average pipeline duty cycle:
-    // "<<*average_pipeline_duty_cycle<<endl;
+  }
 
-    if (g_single_step &&
-        ((gpu_sim_cycle + gpu_tot_sim_cycle) >= g_single_step)) {
-      raise(SIGTRAP);  // Debug breakpoint
+  unsigned tot_tlb_thrash = 0;
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    unsigned s_thrash = 0;
+    fprintf(fout, "Shader%u: ", i);
+    for (std::map<mem_addr_t, unsigned>::iterator iter = tlb_thrash[i].begin();
+         iter != tlb_thrash[i].end(); iter++) {
+      fprintf(fout, "Page: %u Trashed: %u | ", iter->first, iter->second);
+      s_thrash += iter->second;
     }
-    gpu_sim_cycle++;
+    fprintf(fout, "Total %u\n", s_thrash);
+    tot_tlb_thrash += s_thrash;
+  }
+  fprintf(fout, "Tlb_tot_thrash: %u\n", tot_tlb_thrash);
 
-    if (g_interactive_debugger_enabled) gpgpu_debug();
+  fprintf(fout, "========================================Page fault "
+                "statistics==============================\n");
 
-      // McPAT main cycle (interface with McPAT)
-#ifdef GPGPUSIM_POWER_MODEL
-    if (m_config.g_power_simulation_enabled) {
-      if (m_config.g_power_simulation_mode == 0) {
-        mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper,
-                    m_power_stats, m_config.gpu_stat_sample_freq,
-                    gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn,
-                    gpu_sim_insn, m_config.g_dvfs_enabled);
-      }
-    }
-#endif
+  unsigned long long tot_page_hit = 0;
+  unsigned long long tot_page_miss = 0;
+  for (unsigned i = 0; i < m_config.num_cluster(); i++) {
+    fprintf(
+        fout,
+        "Shader%u: Page_table_access:%llu Page_hit: %llu Page_miss: %llu "
+        "Page_hit_rate: %f\n",
+        i, mf_page_hit[i] + mf_page_miss[i], mf_page_hit[i], mf_page_miss[i],
+        ((float)mf_page_hit[i]) / ((float)(mf_page_hit[i] + mf_page_miss[i])));
+    tot_page_hit += mf_page_hit[i];
+    tot_page_miss += mf_page_miss[i];
+  }
 
-    issue_block2core();
-    decrement_kernel_latency();
+  fprintf(fout,
+          "Page_table_tot_access: %llu Page_tot_hit: %llu, Page_tot_miss %llu, "
+          "Page_tot_hit_rate: %f Page_tot_fault: %llu Page_tot_pending: %llu\n",
+          tot_page_hit + tot_page_miss, tot_page_hit, tot_page_miss,
+          ((float)tot_page_hit) / ((float)(tot_page_hit + tot_page_miss)),
+          mf_page_fault_outstanding, mf_page_fault_pending);
+
+  float avg_mf_latency = 0;
+  unsigned long long tot_mf_fault = 0;
+  for (std::map<mem_addr_t, std::list<unsigned long long>>::const_iterator
+           iter = mf_page_fault_latency.begin();
+       iter != mf_page_fault_latency.end(); iter++) {
+    for (std::list<unsigned long long>::const_iterator iter2 =
+             iter->second.begin();
+         iter2 != iter->second.end(); iter2++) {
+      avg_mf_latency =
+          ((float)tot_mf_fault) / ((float)(tot_mf_fault + 1)) * avg_mf_latency +
+          ((float)(*iter2)) / ((float)(tot_mf_fault + 1));
+      tot_mf_fault++;
+    }
+  }
+  fprintf(fout, "Total_memory_access_page_fault: %llu, Average_latency: %f, "
+                "skipped cycles: %llu, Total_skipped_cycles: %llu \n",
+                tot_mf_fault, avg_mf_latency, skipped_cycles, gpu_tot_skipped_cycle);
+
+  fprintf(fout, "========================================Page thrashing "
+                "statistics==============================\n");
+
+  unsigned long long tot_validate = 0;
+  for (std::map<mem_addr_t, std::vector<bool>>::const_iterator iter =
+           page_thrashing.begin();
+       iter != page_thrashing.end(); iter++) {
+    for (std::vector<bool>::const_iterator iter2 = iter->second.begin();
+         iter2 != iter->second.end(); iter2++) {
+      if (*iter2 == true)
+        tot_validate++;
+    }
+  }
 
-    // Depending on configuration, invalidate the caches once all of threads are
-    // completed.
-    int all_threads_complete = 1;
-    if (m_config.gpgpu_flush_l1_cache) {
-      for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
-        if (m_cluster[i]->get_not_completed() == 0)
-          m_cluster[i]->cache_invalidate();
-        else
-          all_threads_complete = 0;
-      }
+  fprintf(
+      fout,
+      "Page_validate: %llu Page_evict_dirty: %llu Page_evict_not_dirty: %llu\n",
+      tot_validate, page_evict_dirty, page_evict_not_dirty);
+
+  std::map<mem_addr_t, unsigned> page_thrash;
+  for (std::map<mem_addr_t, std::vector<bool>>::const_iterator iter =
+           page_thrashing.begin();
+       iter != page_thrashing.end(); iter++) {
+    for (unsigned j = 0; j != iter->second.size(); j++) {
+      if (j + 2 >= iter->second.size())
+        break;
+      if (iter->second[j] == true && iter->second[j + 1] == false &&
+          iter->second[j + 2] == true)
+        page_thrash[iter->first]++;
     }
+  }
 
-    if (m_config.gpgpu_flush_l2_cache) {
-      if (!m_config.gpgpu_flush_l1_cache) {
-        for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
-          if (m_cluster[i]->get_not_completed() != 0) {
-            all_threads_complete = 0;
+  unsigned tot_page_thrash = 0;
+  for (std::map<mem_addr_t, unsigned>::iterator iter = page_thrash.begin();
+       iter != page_thrash.end(); iter++) {
+    fprintf(fout, "Page: %u Thrashed: %u\n", iter->first, iter->second);
+    tot_page_thrash += iter->second;
+  }
+  fprintf(fout, "Page_tot_thrash: %u\n", tot_page_thrash);
+
+  fprintf(fout, "========================================Memory access "
+                "statistics==============================\n");
+
+  /*
+     unsigned long long* ma_num = new unsigned long
+     long[m_config.num_cluster()]; float* avg_ma_latency = new
+     float[m_config.num_cluster()];
+
+     unsigned long long tot_ma_num = 0;
+     float tot_avg_ma_latency = 0;
+
+     for(unsigned i = 0; i < m_config.num_cluster(); i++) {
+         ma_num[i] = 0;
+         avg_ma_latency[i] = 0;
+         for(std::map<unsigned, std::pair<bool, unsigned long long>
+     >::const_iterator iter = ma_latency[i].begin(); iter !=
+     ma_latency[i].end(); iter++) { assert(iter->second.first);
+             avg_ma_latency[i] = ((float)ma_num[i]) / ((float)(ma_num[i]+1)) *
+     avg_ma_latency[i] + ((float)(iter->second.second)) /
+     ((float)(ma_num[i]+1)); ma_num[i]++;
+         }
+         fprintf(fout, "Shader%u: Memory_access: %u, Avg_memory_access_latency:
+     %llu\n", i, ma_latency[i].size(), ((unsigned long long)
+     (avg_ma_latency[i])));
+     }
+
+     for(unsigned i = 0; i < m_config.num_cluster(); i++) {
+         tot_avg_ma_latency = ((float)tot_ma_num) /
+     ((float)(tot_ma_num+ma_num[i])) * tot_avg_ma_latency + avg_ma_latency[i] /
+     ((float)(tot_ma_num+ma_num[i])) * ((float)ma_num[i]); tot_ma_num +=
+     ma_num[i];
+     }
+     fprintf(fout,"Tot_memory_access: %u, Tot_avg_memory_access_latency:
+     %llu\n", tot_ma_num, ((unsigned long long)tot_avg_ma_latency));
+
+     delete[] ma_num;
+     delete[] avg_ma_latency;
+  */
+  fprintf(fout, "========================================Prefetch "
+                "statistics==============================\n");
+
+  fprintf(fout,
+          "Tot_page_hit: %llu, Tot_page_miss: %llu, Tot_page_fault: %lu\n",
+          pf_page_hit, pf_page_miss, pf_fault_latency.size());
+
+  float avg_pf_latency = 0;
+  float avg_pref_size = 0;
+  float avg_pref_latency = 0;
+
+  unsigned long long tot_pf_fault = 0;
+  unsigned long long tot_pref_fault = 0;
+  for (std::map<mem_addr_t, std::list<unsigned long long>>::const_iterator
+           iter = pf_page_fault_latency.begin();
+       iter != pf_page_fault_latency.end(); iter++) {
+    for (std::list<unsigned long long>::const_iterator iter2 =
+             iter->second.begin();
+         iter2 != iter->second.end(); iter2++) {
+      avg_pf_latency =
+          ((float)tot_pf_fault) / ((float)(tot_pf_fault + 1)) * avg_pf_latency +
+          ((float)(*iter2)) / ((float)(tot_pf_fault + 1));
+      tot_pf_fault++;
+    }
+  }
+
+  for (std::vector<std::pair<unsigned long, unsigned long long>>::const_iterator
+           iter = pf_fault_latency.begin();
+       iter != pf_fault_latency.end(); iter++) {
+    avg_pref_size += iter->first;
+    avg_pref_latency = ((float)tot_pref_fault) / ((float)(tot_pref_fault + 1)) *
+                           avg_pref_latency +
+                       ((float)(iter->second)) / ((float)(tot_pref_fault + 1));
+    tot_pref_fault++;
+  }
+
+  avg_pref_size /= ((float)pf_fault_latency.size());
+  fprintf(
+      fout,
+      "Avg_page_latency: %f, Avg_prefetch_size: %f, Avg_prefetch_latency: %f\n",
+      avg_pf_latency, avg_pref_size, avg_pref_latency);
+
+  fprintf(fout, "========================================Rdma "
+                "statistics==============================\n");
+  fprintf(fout, "dma_read: %llu\n", num_dma);
+  fprintf(fout, "dma_migration_read %llu\n", dma_page_transfer_read);
+  fprintf(fout, "dma_migration_write %llu\n", dma_page_transfer_write);
+
+  fprintf(fout, "========================================PCI-e "
+                "statistics==============================\n");
+  float avg_r = 0;
+  unsigned long long r_0 = 0;
+  unsigned long long r_25 = 0;
+  unsigned long long r_50 = 0;
+  unsigned long long r_75 = 0;
+  unsigned long long r_tot = 0;
+  float avg_w = 0;
+  unsigned long long w_0 = 0;
+  unsigned long long w_25 = 0;
+  unsigned long long w_50 = 0;
+  unsigned long long w_75 = 0;
+  unsigned long long w_tot = 0;
+  for (std::list<std::pair<unsigned long long, float>>::const_iterator iter =
+           pcie_read_utilization.begin();
+       iter != pcie_read_utilization.end(); iter++) {
+    if (iter->second <= 0.25) {
+      r_0++;
+    } else if (iter->second <= 0.5) {
+      r_25++;
+    } else if (iter->second <= 0.75) {
+      r_50++;
+    } else {
+      r_75++;
+    }
+    avg_r = (avg_r * ((float)r_tot) + iter->second) / ((float)(r_tot + 1));
+    r_tot++;
+  }
+  for (std::list<std::pair<unsigned long long, float>>::const_iterator iter =
+           pcie_write_utilization.begin();
+       iter != pcie_write_utilization.end(); iter++) {
+    if (iter->second <= 0.25) {
+      w_0++;
+    } else if (iter->second <= 0.5) {
+      w_25++;
+    } else if (iter->second <= 0.75) {
+      w_50++;
+    } else {
+      w_75++;
+    }
+    avg_w = (avg_w * ((float)w_tot) + iter->second) / ((float)(w_tot + 1));
+    w_tot++;
+  }
+
+  fprintf(fout, "Pcie_read_utilization: %f\n", avg_r);
+  fprintf(fout, "[0-25]: %f, [26-50]: %f, [51-75]: %f, [76-100]: %f\n",
+          ((float)r_0) / ((float)r_tot), ((float)r_25) / ((float)r_tot),
+          ((float)r_50) / ((float)r_tot), ((float)r_75) / ((float)r_tot));
+  fprintf(fout, "Pcie_write_utilization: %f\n", avg_w);
+  fprintf(fout, "[0-25]: %f, [26-50]: %f, [51-75]: %f, [76-100]: %f\n",
+          ((float)w_0) / ((float)w_tot), ((float)w_25) / ((float)w_tot),
+          ((float)w_50) / ((float)w_tot), ((float)w_75) / ((float)w_tot));
+}
+
+gpgpu_new_stats::~gpgpu_new_stats() {  // releases every array the constructor obtained with new[]
+  delete[] tlb_hit;  // counter arrays sized by m_config.num_cluster()
+  delete[] tlb_miss;
+  delete[] tlb_val;
+  delete[] tlb_evict;
+  delete[] tlb_page_evict;
+  delete[] mf_page_hit;
+  delete[] mf_page_miss;
+  delete[] page_access_times;  // arrays of maps, num_cluster() * num_core_per_cluster() entries each
+  delete[] tlb_thrashing;
+  delete[] ma_latency;
+}
+
+gmmu_t::gmmu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config,
+               class gpgpu_new_stats *new_stats)
+    : m_gpu(gpu), m_config(config), m_new_stats(new_stats) {  // decodes config knobs into enums, resets transfer state
+  m_shader_config = &m_config.m_shader_config;
+  // DMA mode: enable_dma 0..3 selects the enum below; anything else aborts.
+  if (m_config.enable_dma == 0) {
+    dma_mode = dma_type::DISABLED;
+  } else if (m_config.enable_dma == 1) {
+    dma_mode = dma_type::ADAPTIVE;
+  } else if (m_config.enable_dma == 2) {
+    dma_mode = dma_type::ALWAYS;
+  } else if (m_config.enable_dma == 3) {
+    dma_mode = dma_type::OVERSUB;
+  } else {
+    printf("Unknown DMA mode\n");
+    exit(1);
+  }
+  // Eviction policy: eviction_policy 0..5; anything else aborts.
+  if (m_config.eviction_policy == 0) {
+    evict_policy = eviction_policy::LRU;
+  } else if (m_config.eviction_policy == 1) {
+    evict_policy = eviction_policy::TBN;
+  } else if (m_config.eviction_policy == 2) {
+    evict_policy = eviction_policy::SEQUENTIAL_LOCAL;
+  } else if (m_config.eviction_policy == 3) {
+    evict_policy = eviction_policy::RANDOM;
+  } else if (m_config.eviction_policy == 4) {
+    evict_policy = eviction_policy::LFU;
+  } else if (m_config.eviction_policy == 5) {
+    evict_policy = eviction_policy::LRU4K;
+  } else {
+    printf("Unknown eviction policy");
+    exit(1);
+  }
+  // Hardware prefetcher: hardware_prefetch 0..3; anything else aborts.
+  if (m_config.hardware_prefetch == 0) {
+    prefetcher = hwardware_prefetcher::DISBALED;
+  } else if (m_config.hardware_prefetch == 1) {
+    prefetcher = hwardware_prefetcher::TBN;
+  } else if (m_config.hardware_prefetch == 2) {
+    prefetcher = hwardware_prefetcher::SEQUENTIAL_LOCAL;
+  } else if (m_config.hardware_prefetch == 3) {
+    prefetcher = hwardware_prefetcher::RANDOM;
+  } else {
+    printf("Unknown hardware prefeching policy");
+    exit(1);
+  }
+  // Prefetcher under over-subscription: hwprefetch_oversub 0..3, else abort.
+  if (m_config.hwprefetch_oversub == 0) {
+    oversub_prefetcher = hwardware_prefetcher_oversub::DISBALED;
+  } else if (m_config.hwprefetch_oversub == 1) {
+    oversub_prefetcher = hwardware_prefetcher_oversub::TBN;
+  } else if (m_config.hwprefetch_oversub == 2) {
+    oversub_prefetcher = hwardware_prefetcher_oversub::SEQUENTIAL_LOCAL;
+  } else if (m_config.hwprefetch_oversub == 3) {
+    oversub_prefetcher = hwardware_prefetcher_oversub::RANDOM;
+  } else {
+    printf("Unknown hardware prefeching policy under over-subscription");
+    exit(1);
+  }
+  // Record the bit index of page_size; the assert rejects a second set bit.
+  m_log2_page_size = -1;  // sentinel: keeps this value if page_size has no bit set
+  for (unsigned n = 0, mask = 1; mask != 0; mask <<= 1, n++) {  // visits every bit of an unsigned
+    if (m_config.page_size & mask) {
+      assert(m_log2_page_size == (unsigned)-1);  // still the sentinel, i.e. first set bit seen
+      m_log2_page_size = n;
+    }
+  }
+  pcie_read_latency_queue = NULL;
+  pcie_write_latency_queue = NULL;
+
+  total_allocation_size = 0;
+
+  over_sub = false;
+
+  //gpu_sim_cycle = m_gpu->gpu_sim_cycle;
+  //gpu_tot_sim_cycle = m_gpu->gpu_tot_sim_cycle;
+}
+
+// Return the list of faulting pages for a request. Under UVA all pages are
+// assumed to be allocated in advance, so the list is always empty.
+std::list<mem_addr_t> gmmu_t::get_faulty_pages(mem_addr_t addr, size_t length) {
+  // addr and length are accepted for the interface but not consulted.
+  return std::list<mem_addr_t>();
+}
+
+// Transfer-time estimate for data_size bytes, scaled by m_config.core_freq.
+// Speed is (2/pi) * curve_a * atan(curve_b * size_in_KB), halved from 2MB up.
+unsigned long long gmmu_t::calculate_transfer_time(size_t data_size) {
+  // Zero bytes -> zero speed -> 0/0 below, and NaN-to-integer is undefined.
+  if (data_size == 0) return 0;
+  float speed = 2.0 * m_config.curve_a / M_PI *
+                atan(m_config.curve_b * ((float)(data_size) / (float)(1024)));
+  if (data_size >= 2 * 1024 * 1024) speed /= 2;
+  return (unsigned long long)((float)(data_size)*m_config.core_freq / speed /
+                              (1024.0 * 1024.0 * 1024.0));
+}
+
+// Advance m_gpu->gpu_tot_sim_cycle by the modelled cost of moving data_size
+// bytes. Rounds move MIN_PREFETCH_SIZE x {1, 1, 2, 4, 8, 16} bytes, cyclically;
+// each round is charged as a 4KB transfer followed by the rest of the round,
+// and every charged transfer is logged to sim_prof when sim_prof_enable is set.
+void gmmu_t::calculate_devicesync_time(size_t data_size) {
+  unsigned cur_turn = 0;
+  unsigned cur_size = 0;
+  float speed;
+  while (data_size != 0) {
+    unsigned long long cur_cycle = m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle;
+    unsigned long long cur_time = 0;
+
+    if (cur_turn == 0) {
+      cur_size = MIN_PREFETCH_SIZE;
+    } else {
+      cur_size = MIN_PREFETCH_SIZE * pow(2, cur_turn - 1);
+    }
+
+    if (data_size < 4096) {
+      speed = 2.0 * m_config.curve_a / M_PI *
+              atan(m_config.curve_b * ((float)(data_size) / (float)(1024)));
+      cur_time = (unsigned long long)((float)(data_size)*m_config.core_freq /
+                                      speed / (1024.0 * 1024.0 * 1024.0));
+
+      if (sim_prof_enable) {
+        event_stats *d_sync = new memory_stats(
+            device_sync, cur_cycle, cur_cycle + cur_time, 0, data_size, 0);
+        sim_prof[cur_cycle].push_back(d_sync);
+      }
+
+      m_gpu->gpu_tot_sim_cycle += cur_time;
+
+      return;
+    } else {
+      cur_size -= 4096;
+      data_size -= 4096;
+      speed = 2.0 * m_config.curve_a / M_PI *
+              atan(m_config.curve_b * ((float)(4096) / (float)(1024)));
+      cur_time = (unsigned long long)((float)(4096) * m_config.core_freq /
+                                      speed / (1024.0 * 1024.0 * 1024.0));
+
+      if (sim_prof_enable) {
+        event_stats *d_sync = new memory_stats(
+            device_sync, cur_cycle, cur_cycle + cur_time, 0, 4096, 0);
+        sim_prof[cur_cycle].push_back(d_sync);
+      }
+
+      m_gpu->gpu_tot_sim_cycle += cur_time;
+    }
+    // Stop if the 4KB piece was the last: a 0-byte transfer would divide 0 by 0.
+    if (data_size == 0) return;
+    if (data_size < cur_size) {
+      speed = 2.0 * m_config.curve_a / M_PI *
+              atan(m_config.curve_b * ((float)(data_size) / (float)(1024)));
+      cur_time = (unsigned long long)((float)(data_size)*m_config.core_freq /
+                                      speed / (1024.0 * 1024.0 * 1024.0));
+
+      if (sim_prof_enable) {
+        event_stats *d_sync = new memory_stats(
+            device_sync, cur_cycle, cur_cycle + cur_time, 0, data_size, 0);
+        sim_prof[cur_cycle].push_back(d_sync);
+      }
+
+      m_gpu->gpu_tot_sim_cycle += cur_time;
+
+      return;
+    } else if (cur_size != 0) {  // an empty remainder has nothing to charge
+      data_size -= cur_size;
+      speed = 2.0 * m_config.curve_a / M_PI *
+              atan(m_config.curve_b * ((float)(cur_size) / (float)(1024)));
+      cur_time = (unsigned long long)((float)(cur_size)*m_config.core_freq /
+                                      speed / (1024.0 * 1024.0 * 1024.0));
+
+      if (sim_prof_enable) {
+        event_stats *d_sync = new memory_stats(
+            device_sync, cur_cycle, cur_cycle + cur_time, 0, cur_size, 0);
+        sim_prof[cur_cycle].push_back(d_sync);
+      }
+
+      m_gpu->gpu_tot_sim_cycle += cur_time;
+    }
+
+    cur_turn++;
+    if (cur_turn == 6) {
+      cur_turn = 0;
+    }
+  }
+}
+
+bool gmmu_t::pcie_transfers_completed() {  // true iff both stage queues are empty and both latency slots are NULL
+  const bool writes_idle = pcie_write_stage_queue.empty() && !pcie_write_latency_queue;
+  return writes_idle && pcie_read_stage_queue.empty() && !pcie_read_latency_queue;
+}
+
+// Append cb_tlb to callback_tlb_flush, the list that tlb_flush() walks.
+void gmmu_t::register_tlbflush_callback(std::function<void(mem_addr_t)> cb_tlb) {
+  callback_tlb_flush.insert(callback_tlb_flush.end(), cb_tlb);
+}
+
+// Invoke every callback stored in callback_tlb_flush, front to back, passing
+// page_num to each one.
+void gmmu_t::tlb_flush(mem_addr_t page_num) {
+  for (const auto &notify : callback_tlb_flush) {
+    notify(page_num);
+  }
+}
+
+void gmmu_t::check_write_stage_queue(mem_addr_t page_num, bool refresh) {
+  // page_num is about to be accessed but was selected for eviction earlier:
+  // drop the first staged write that lists it. `refresh` is not read here.
+  for (std::list<pcie_latency_t *>::iterator iter =
+           pcie_write_stage_queue.begin();
+       iter != pcie_write_stage_queue.end(); iter++) {
+    if (std::find((*iter)->page_list.begin(), (*iter)->page_list.end(),
+                  page_num) != (*iter)->page_list.end()) {
+      // restore every page of that staged write, not only page_num
+      for (std::list<mem_addr_t>::iterator pg_iter = (*iter)->page_list.begin();
+           pg_iter != (*iter)->page_list.end(); pg_iter++) {
+        m_gpu->get_global_memory()->set_page_access(*pg_iter);
+
+        m_gpu->get_global_memory()->set_page_dirty(*pg_iter);
+
+        // reclaim valid size in the large page tree for the basic block
+        // containing this page (final argument true; eviction passes false)
+        mem_addr_t page_addr =
+            m_gpu->get_global_memory()->get_mem_addr(*pg_iter);
+        struct lp_tree_node *root = get_lp_node(page_addr);
+        update_basic_block(root, page_addr, m_config.page_size, true);
+        // refresh the page's position in the valid page list
+        refresh_valid_pages(page_addr);
+      }
+      // NOTE(review): only the list node is removed; the pcie_latency_t is not deleted here -- confirm ownership
+      pcie_write_stage_queue.erase(iter);
+      break;  // iter is invalid after erase; at most one entry is cancelled per call
+    }
+  }
+}
+
+// Decide whether the block [addr, addr + size) may be picked for eviction:
+// all of its pages must be valid, addr must not lie inside an entry already
+// present in pcie_write_stage_queue, and reserve_pages_check() must accept
+// every page. Pages are visited in steps of m_config.page_size.
+bool gmmu_t::is_block_evictable(mem_addr_t addr, size_t size) {
+  // '<' (not '!=') so a size that is not page-aligned cannot loop forever.
+  for (mem_addr_t start = addr; start < addr + size;
+       start += m_config.page_size) {
+    if (!m_gpu->get_global_memory()->is_valid(
+            m_gpu->get_global_memory()->get_page_num(start))) {
+      return false;
+    }
+  }
+  // Only the block's first address is tested against each queued entry.
+  for (std::list<pcie_latency_t *>::iterator iter =
+           pcie_write_stage_queue.begin();
+       iter != pcie_write_stage_queue.end(); iter++) {
+    if ((addr >= (*iter)->start_addr) &&
+        (addr < (*iter)->start_addr + (*iter)->size)) {
+      return false;
+    }
+  }
+  for (mem_addr_t start = addr; start < addr + size;
+       start += m_config.page_size) {
+    if (!reserve_pages_check(start)) return false;
+  }
+  return true;
+}
+
+// Picks the next victim chunk(s) according to evict_policy and stages them in
+// pcie_write_stage_queue.
+//
+// The valid page list is sorted first, and the leading
+// reserve_accessed_page_percent percent of it is skipped. From there the first
+// evictable candidate is taken:
+//   - LRU4K / RANDOM:          one page of m_config.page_size
+//   - LRU / LFU / 2MB pages:   the whole large page tree of the candidate
+//   - SEQUENTIAL_LOCAL:        the basic block containing the candidate
+//   - TBN:                     that basic block plus whatever
+//                              traverse_and_remove_lp_tree() adds, grouped
+//                              into contiguous runs
+// Each selected chunk becomes a pcie_latency_t whose type is INVALIDATE when a
+// valid-page entry covering its start address has RW == 1, and
+// PCIE_WRITE_BACK otherwise. The chunk's pages are removed from valid_pages.
+void gmmu_t::page_eviction_procedure() {
+  sort_valid_pages();
+
+  // (start address, size) of every chunk selected in this call
+  std::list<std::pair<mem_addr_t, size_t>> evicted_pages;
+
+  int eviction_start =
+      (int)(valid_pages.size() * m_config.reserve_accessed_page_percent / 100);
+
+  if (evict_policy == eviction_policy::LRU4K) {
+    std::list<eviction_t *>::iterator iter = valid_pages.begin();
+    std::advance(iter, eviction_start);
+
+    while (iter != valid_pages.end() &&
+           !is_block_evictable((*iter)->addr, (*iter)->size)) {
+      iter++;
+    }
+
+    if (iter != valid_pages.end()) {
+      mem_addr_t page_addr = (*iter)->addr;
+      struct lp_tree_node *root = get_lp_node(page_addr);
+      update_basic_block(root, page_addr, m_config.page_size, false);
+
+      evicted_pages.push_back(std::make_pair(page_addr, m_config.page_size));
+    }
+  } else if (evict_policy == eviction_policy::LRU ||
+             evict_policy == eviction_policy::LFU ||
+             m_config.page_size == MAX_PREFETCH_SIZE) {
+    // in lru, only evict the least recently used pages at the front of the
+    // accessed pages queue; in lfu, only evict the page accessed the least
+    // number of times from the front of the accessed pages queue
+    std::list<eviction_t *>::iterator iter = valid_pages.begin();
+    std::advance(iter, eviction_start);
+
+    while (iter != valid_pages.end() &&
+           !is_block_evictable((*iter)->addr, (*iter)->size)) {
+      iter++;
+    }
+
+    if (iter != valid_pages.end()) {
+      mem_addr_t page_addr = (*iter)->addr;
+      struct lp_tree_node *root = get_lp_node(page_addr);
+      evict_whole_tree(root);
+
+      evicted_pages.push_back(std::make_pair(root->addr, root->size));
+    }
+  } else if (evict_policy == eviction_policy::RANDOM) {
+    // in random eviction, select a random page past the reserved prefix
+    int random_span =
+        (int)(valid_pages.size() *
+              (1 - m_config.reserve_accessed_page_percent / 100));
+
+    // rand() % 0 is undefined, so fall back to offset 0 when there is nothing
+    // to choose from (for instance when valid_pages is empty)
+    int random_offset = (random_span > 0) ? (rand() % random_span) : 0;
+
+    // advancing a list iterator beyond end() is undefined, so clamp the start
+    size_t scan_start = (size_t)(eviction_start + random_offset);
+    if (scan_start > valid_pages.size()) {
+      scan_start = valid_pages.size();
+    }
+
+    std::list<eviction_t *>::iterator iter = valid_pages.begin();
+    std::advance(iter, scan_start);
+
+    while (iter != valid_pages.end() &&
+           !is_block_evictable((*iter)->addr, (*iter)->size)) {
+      iter++;
+    }
+
+    if (iter != valid_pages.end()) {
+      mem_addr_t page_addr = (*iter)->addr;
+      struct lp_tree_node *root = get_lp_node(page_addr);
+      update_basic_block(root, page_addr, m_config.page_size, false);
+
+      evicted_pages.push_back(std::make_pair(page_addr, m_config.page_size));
+    }
+  } else if (evict_policy == eviction_policy::SEQUENTIAL_LOCAL) {
+    // evict the basic block (MIN_PREFETCH_SIZE bytes) that the first evictable
+    // candidate belongs to
+    std::list<eviction_t *>::iterator iter = valid_pages.begin();
+    std::advance(iter, eviction_start);
+
+    struct lp_tree_node *root = NULL;
+    mem_addr_t page_addr = 0;
+    mem_addr_t bb_addr = 0;
+
+    for (; iter != valid_pages.end(); iter++) {
+      page_addr = (*iter)->addr;
+
+      root = get_lp_node(page_addr);
+
+      bb_addr = get_basic_block(root, page_addr);
+
+      if (is_block_evictable(bb_addr, MIN_PREFETCH_SIZE)) {
+        update_basic_block(root, page_addr, MIN_PREFETCH_SIZE, false);
+        break;
+      }
+    }
+
+    if (iter != valid_pages.end()) {
+      evicted_pages.push_back(std::make_pair(bb_addr, MIN_PREFETCH_SIZE));
+    }
+  } else if (evict_policy == eviction_policy::TBN) {
+    // evict one or more basic blocks of the large page that the first
+    // evictable candidate belongs to
+    std::list<eviction_t *>::iterator iter = valid_pages.begin();
+    std::advance(iter, eviction_start);
+
+    // find all basic blocks which are not staged/scheduled for write back or
+    // not invalid or not in ld/st unit
+    std::set<mem_addr_t> all_basic_blocks;
+
+    struct lp_tree_node *root = NULL;
+    mem_addr_t page_addr = 0;
+    mem_addr_t bb_addr = 0;
+
+    for (; iter != valid_pages.end(); iter++) {
+      page_addr = (*iter)->addr;
+
+      root = get_lp_node(page_addr);
+
+      bb_addr = get_basic_block(root, page_addr);
+
+      if (is_block_evictable(bb_addr, MIN_PREFETCH_SIZE)) {
+        update_basic_block(root, page_addr, MIN_PREFETCH_SIZE, false);
+        break;
+      }
+    }
+
+    if (iter != valid_pages.end()) {
+      all_basic_blocks.insert(bb_addr);
+      traverse_and_remove_lp_tree(root, all_basic_blocks);
+    }
+
+    // group all contiguous basic blocks if possible
+    std::set<mem_addr_t>::iterator bb = all_basic_blocks.begin();
+
+    while (bb != all_basic_blocks.end()) {
+      std::set<mem_addr_t>::iterator next_bb = bb;
+      size_t cur_num = 0;
+
+      // extend the run while the next block starts right after the previous
+      do {
+        next_bb++;
+        cur_num++;
+      } while (next_bb != all_basic_blocks.end() &&
+               ((*next_bb) == ((*bb) + cur_num * MIN_PREFETCH_SIZE)));
+
+      evicted_pages.push_back(
+          std::make_pair((*bb), (cur_num * MIN_PREFETCH_SIZE)));
+
+      bb = next_bb;
+    }
+  }
+
+  // always stage the chunk, whether or not it contains dirty pages
+  for (std::list<std::pair<mem_addr_t, size_t>>::iterator iter =
+           evicted_pages.begin();
+       iter != evicted_pages.end(); iter++) {
+    pcie_latency_t *p_t = new pcie_latency_t();
+
+    p_t->start_addr = iter->first;
+    p_t->size = iter->second;
+
+    latency_type ltype = latency_type::PCIE_WRITE_BACK;
+
+    // a covering valid-page entry with RW == 1 turns the transfer into an
+    // invalidation
+    for (std::list<eviction_t *>::iterator it = valid_pages.begin();
+         it != valid_pages.end(); it++) {
+      if ((*it)->addr <= iter->first &&
+          iter->first < (*it)->addr + (*it)->size) {
+        if ((*it)->RW == 1) {
+          ltype = latency_type::INVALIDATE;
+          break;
+        }
+      }
+    }
+
+    p_t->type = ltype;
+
+    mem_addr_t page_num =
+        m_gpu->get_global_memory()->get_page_num(iter->first);
+
+    if (m_config.page_size == MAX_PREFETCH_SIZE) {
+      // with 2MB pages the chunk is recorded as a single page
+      p_t->page_list.push_back(page_num);
+
+      valid_pages_erase(page_num);
+    } else {
+      for (int i = 0; i < (int)(iter->second / m_config.page_size); i++) {
+        p_t->page_list.push_back(page_num + i);
+
+        valid_pages_erase(page_num + i);
+      }
+    }
+
+    pcie_write_stage_queue.push_back(p_t);
+  }
+}
+
+// Removes from valid_pages the first entry whose [addr, addr + size) range
+// contains the address of page_num. Does nothing when no entry covers it.
+// NOTE(review): the eviction_t object itself is not deleted here -- confirm
+// whether something else still owns it.
+void gmmu_t::valid_pages_erase(mem_addr_t page_num) {
+  mem_addr_t target = m_gpu->get_global_memory()->get_mem_addr(page_num);
+
+  std::list<eviction_t *>::iterator covering = std::find_if(
+      valid_pages.begin(), valid_pages.end(),
+      [target](const eviction_t *entry) {
+        return entry->addr <= target && target < entry->addr + entry->size;
+      });
+
+  if (covering != valid_pages.end()) {
+    valid_pages.erase(covering);
+  }
+}
+
+// Empties the valid page list. Only the stored pointers are dropped; the
+// eviction_t objects they point to are not deleted by this call.
+void gmmu_t::valid_pages_clear() { valid_pages.clear(); }
+
+// Stamps the valid-page entry covering page_addr with the current cycle
+// (gpu_tot_sim_cycle + gpu_sim_cycle). Only the first covering entry is
+// touched.
+//
+// The page is treated as already tracked even when no entry covers it (see the
+// UVA remark below), so the code that would append a new entry never runs.
+void gmmu_t::refresh_valid_pages(mem_addr_t page_addr) {
+  // For UVA, assume all pages are valid
+  bool tracked = true;
+
+  for (eviction_t *entry : valid_pages) {
+    const bool covers =
+        entry->addr <= page_addr && page_addr < entry->addr + entry->size;
+    if (!covers) {
+      continue;
+    }
+
+    entry->cycle = m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle;
+    tracked = true;
+    break;
+  }
+
+  if (tracked) {
+    return;
+  }
+
+  // start tracking the page at the granularity of the current policy
+  eviction_t *fresh = new eviction_t();
+  fresh->addr = get_eviction_base_addr(page_addr);
+  fresh->size = get_eviction_granularity(page_addr);
+  fresh->cycle = m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle;
+  valid_pages.push_back(fresh);
+}
+
+// Orders valid_pages so that the best eviction candidates come first.
+//
+// Step 1: every entry whose addr equals the addr of a large page root copies
+//         that root's access_counter and RW.
+// Step 2: the list is sorted according to evict_policy:
+//   - LFU: by access_counter, then RW, then cycle (all ascending)
+//   - TBN / SEQUENTIAL_LOCAL: entries are grouped by the large page they
+//     belong to; groups are ordered by their most recent cycle (ascending)
+//     and entries inside a group by cycle (ascending)
+//   - anything else: by cycle (ascending)
+//
+// All sorts are std::list::sort, which is stable, so entries that compare
+// equal keep their previous relative order.
+void gmmu_t::sort_valid_pages() {
+  for (eviction_t *entry : valid_pages) {
+    for (struct lp_tree_node *large_page : large_page_info) {
+      if (entry->addr == large_page->addr) {
+        entry->access_counter = large_page->access_counter;
+        entry->RW = large_page->RW;
+        break;
+      }
+    }
+  }
+
+  const auto oldest_first = [](const eviction_t *i, const eviction_t *j) {
+    return i->cycle < j->cycle;
+  };
+
+  if (evict_policy == eviction_policy::LFU) {
+    valid_pages.sort([](const eviction_t *i, const eviction_t *j) {
+      return (i->access_counter < j->access_counter) ||
+             ((i->access_counter == j->access_counter) && (i->RW < j->RW)) ||
+             ((i->access_counter == j->access_counter) && (i->RW == j->RW) &&
+              (i->cycle < j->cycle));
+    });
+  } else if (evict_policy == eviction_policy::TBN ||
+             evict_policy == eviction_policy::SEQUENTIAL_LOCAL) {
+    // group the entries by the start address of their large page
+    std::map<mem_addr_t, std::list<eviction_t *>> by_large_page;
+
+    for (eviction_t *entry : valid_pages) {
+      struct lp_tree_node *root = m_gpu->getGmmu()->get_lp_node(entry->addr);
+      by_large_page[root->addr].push_back(entry);
+    }
+
+    // Sort each group once, oldest first, and move it (no copy) into a list
+    // that can be reordered. Groups enter the list in ascending address order.
+    typedef std::pair<mem_addr_t, std::list<eviction_t *>> group_t;
+    std::list<group_t> groups;
+
+    for (auto &slot : by_large_page) {
+      slot.second.sort(oldest_first);
+      groups.push_back(group_t(slot.first, std::list<eviction_t *>()));
+      groups.back().second.swap(slot.second);
+    }
+
+    // Order the groups by their most recent access, which is the last element
+    // of each ascending group. The comparator takes references so that the
+    // embedded lists are not copied on every comparison.
+    groups.sort([](const group_t &i, const group_t &j) {
+      return i.second.back()->cycle < j.second.back()->cycle;
+    });
+
+    std::list<eviction_t *> new_valid_pages;
+
+    for (group_t &group : groups) {
+      new_valid_pages.splice(new_valid_pages.end(), group.second);
+    }
+
+    valid_pages.swap(new_valid_pages);
+  } else {
+    valid_pages.sort(oldest_first);
+  }
+}
+
+// Returns the cycle at which a transfer of num_pages pages is ready: the
+// current cycle (gpu_tot_sim_cycle + gpu_sim_cycle) plus a transfer time.
+//
+// The transfer speed follows an arctangent curve of the transfer size in KB,
+// shaped by m_config.curve_a and m_config.curve_b. The byte count is scaled by
+// m_config.core_freq, divided by that speed and by 2^30 (presumably a speed in
+// GB/s -- confirm against the config documentation).
+unsigned long long gmmu_t::get_ready_cycle(unsigned num_pages) {
+  const auto transfer_kb = (float)(num_pages * m_config.page_size) / 1024.0;
+
+  float speed =
+      2.0 * m_config.curve_a / M_PI * atan(m_config.curve_b * transfer_kb);
+
+  const auto transfer_cycles = (float)(m_config.page_size * num_pages) *
+                               m_config.core_freq / speed /
+                               (1024.0 * 1024.0 * 1024.0);
+
+  return m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle +
+         (unsigned long long)transfer_cycles;
+}
+
+// Returns the cycle at which a DMA access is ready: the current cycle
+// (gpu_tot_sim_cycle + gpu_sim_cycle) plus a fixed 200 cycles.
+//
+// The size argument does not influence the result. A speed value used to be
+// derived from it but was never consumed, so that dead computation is gone.
+unsigned long long gmmu_t::get_ready_cycle_dma(unsigned size) {
+  (void)size;  // kept for interface compatibility
+  return m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle + 200;
+}
+
+// Returns the speed obtained from the arctangent curve (m_config.curve_a,
+// m_config.curve_b) for a transfer of num_pages pages, divided by
+// m_config.pcie_bandwidth.
+float gmmu_t::get_pcie_utilization(unsigned num_pages) {
+  const auto transfer_kb = (float)(num_pages * m_config.page_size) / 1024.0;
+
+  const auto speed =
+      2.0 * m_config.curve_a / M_PI * atan(m_config.curve_b * transfer_kb);
+
+  return speed / m_config.pcie_bandwidth;
+}
+
+// Marks as active the first registered prefetch request that has the given
+// start address, size and stream uid. Nothing happens when no request
+// matches. The matching request is expected to still be at its start
+// (cur_addr == start address); this is asserted.
+void gmmu_t::activate_prefetch(mem_addr_t m_device_addr, size_t m_cnt,
+                               struct CUstream_st *m_stream) {
+  for (auto &request : prefetch_req_buffer) {
+    const bool same_range =
+        request.start_addr == m_device_addr && request.size == m_cnt;
+    if (!same_range) {
+      continue;
+    }
+
+    if (request.m_stream->get_uid() != m_stream->get_uid()) {
+      continue;
+    }
+
+    assert(request.cur_addr == m_device_addr);
+    request.active = true;
+    return;
+  }
+}
+
+// Appends a new, inactive prefetch request to prefetch_req_buffer. The request
+// covers m_cnt bytes starting at m_device_addr, remembers the allocation
+// pointer and the stream, and starts with cur_addr at the start address.
+void gmmu_t::register_prefetch(mem_addr_t m_device_addr,
+                               mem_addr_t m_device_allocation_ptr, size_t m_cnt,
+                               struct CUstream_st *m_stream) {
+  struct prefetch_req request;
+
+  // where the request belongs
+  request.m_stream = m_stream;
+  request.allocation_addr = m_device_allocation_ptr;
+
+  // what it covers
+  request.start_addr = m_device_addr;
+  request.size = m_cnt;
+
+  // progress state: nothing done yet, not activated yet
+  request.cur_addr = m_device_addr;
+  request.active = false;
+
+  prefetch_req_buffer.push_back(request);
+}
+
+// Recursively builds a binary tree describing [addr, addr + size).
+//
+// Every node starts with valid_size, access_counter and RW set to 0. A node of
+// MIN_PREFETCH_SIZE bytes is a leaf; any other node gets two children covering
+// its lower and upper half. The recursion only ends when the halving reaches
+// exactly MIN_PREFETCH_SIZE.
+struct lp_tree_node *gmmu_t::build_lp_tree(mem_addr_t addr, size_t size) {
+  struct lp_tree_node *fresh = new lp_tree_node();
+
+  fresh->addr = addr;
+  fresh->size = size;
+  fresh->valid_size = 0;
+  fresh->access_counter = 0;
+  fresh->RW = 0;
+
+  const bool is_leaf = (size == MIN_PREFETCH_SIZE);
+  const size_t half = size / 2;
+
+  fresh->left = is_leaf ? nullptr : build_lp_tree(addr, half);
+  fresh->right = is_leaf ? nullptr : build_lp_tree(addr + half, half);
+
+  return fresh;
+}
+
+// Builds the tree for the large page [start_addr, start_addr + size), appends
+// its root to large_page_info and adds size to total_allocation_size.
+void gmmu_t::initialize_large_page(mem_addr_t start_addr, size_t size) {
+  large_page_info.push_back(build_lp_tree(start_addr, size));
+  total_allocation_size += size;
+}
+
+// Returns the root of the first large page tree whose range contains addr, or
+// NULL when no registered large page covers it.
+struct lp_tree_node *gmmu_t::get_lp_node(mem_addr_t addr) {
+  for (struct lp_tree_node *large_page : large_page_info) {
+    const bool below = addr < large_page->addr;
+    const bool beyond = addr >= large_page->addr + large_page->size;
+    if (!below && !beyond) {
+      return large_page;
+    }
+  }
+  return nullptr;
+}
+
+// Walks down from node to the MIN_PREFETCH_SIZE leaf on the path selected by
+// addr and returns that leaf's start address. At every level the left child
+// is taken when its range contains addr, the right child otherwise.
+mem_addr_t gmmu_t::get_basic_block(struct lp_tree_node *node, mem_addr_t addr) {
+  struct lp_tree_node *cursor = node;
+
+  while (cursor->size != MIN_PREFETCH_SIZE) {
+    struct lp_tree_node *lower = cursor->left;
+    const bool in_lower =
+        lower->addr <= addr && addr < lower->addr + lower->size;
+    cursor = in_lower ? lower : cursor->right;
+  }
+
+  return cursor->addr;
+}
+
+// Sets valid_size to 0 on node and on every node below it. A NULL node is
+// ignored.
+void gmmu_t::evict_whole_tree(struct lp_tree_node *node) {
+  if (node == NULL) {
+    return;
+  }
+
+  node->valid_size = 0;
+  evict_whole_tree(node->left);
+  evict_whole_tree(node->right);
+}
+
+// Adjusts valid_size on every node from `node` down to the MIN_PREFETCH_SIZE
+// leaf selected by addr, and returns the start address of that leaf.
+//
+// prefetch == true:  size is added to each node that is not already full
+//                    (valid_size != size of the node)
+// prefetch == false: size is subtracted from each node that is not already
+//                    empty (valid_size != 0)
+mem_addr_t gmmu_t::update_basic_block(struct lp_tree_node *node,
+                                      mem_addr_t addr, size_t size,
+                                      bool prefetch) {
+  const auto adjust = [size, prefetch](struct lp_tree_node *visited) {
+    if (prefetch) {
+      if (visited->valid_size != visited->size) {
+        visited->valid_size += size;
+      }
+    } else if (visited->valid_size != 0) {
+      visited->valid_size -= size;
+    }
+  };
+
+  struct lp_tree_node *cursor = node;
+
+  for (;;) {
+    adjust(cursor);
+
+    if (cursor->size == MIN_PREFETCH_SIZE) {
+      break;
+    }
+
+    struct lp_tree_node *lower = cursor->left;
+    const bool in_lower =
+        lower->addr <= addr && addr < lower->addr + lower->size;
+    cursor = in_lower ? lower : cursor->right;
+  }
+
+  return cursor->addr;
+}
+
+// Marks every leaf under node whose valid_size is 0 as completely valid and
+// records its address in scheduled_basic_blocks. On the way back up, each
+// interior node's valid_size becomes the sum of its two children.
+void gmmu_t::fill_lp_tree(struct lp_tree_node *node,
+                          std::set<mem_addr_t> &scheduled_basic_blocks) {
+  const bool is_leaf = (node->size == MIN_PREFETCH_SIZE);
+
+  if (!is_leaf) {
+    fill_lp_tree(node->left, scheduled_basic_blocks);
+    fill_lp_tree(node->right, scheduled_basic_blocks);
+    node->valid_size = node->left->valid_size + node->right->valid_size;
+    return;
+  }
+
+  if (node->valid_size != 0) {
+    return;
+  }
+
+  node->valid_size = MIN_PREFETCH_SIZE;
+  scheduled_basic_blocks.insert(node->addr);
+}
+
+// Empties every leaf under node that is completely valid and evictable, and
+// records its address in scheduled_basic_blocks. On the way back up, each
+// interior node's valid_size becomes the sum of its two children.
+void gmmu_t::remove_lp_tree(struct lp_tree_node *node,
+                            std::set<mem_addr_t> &scheduled_basic_blocks) {
+  const bool is_leaf = (node->size == MIN_PREFETCH_SIZE);
+
+  if (!is_leaf) {
+    remove_lp_tree(node->left, scheduled_basic_blocks);
+    remove_lp_tree(node->right, scheduled_basic_blocks);
+    node->valid_size = node->left->valid_size + node->right->valid_size;
+    return;
+  }
+
+  // the evictability check is only made for leaves that are fully valid
+  if (node->valid_size != MIN_PREFETCH_SIZE) {
+    return;
+  }
+  if (!is_block_evictable(node->addr, MIN_PREFETCH_SIZE)) {
+    return;
+  }
+
+  node->valid_size = 0;
+  scheduled_basic_blocks.insert(node->addr);
+}
+
+// Post-order walk over the interior nodes under node. After both children
+// have been handled, the node's valid_size is recomputed as their sum; when
+// the node is more than half valid but not completely valid, the rest of its
+// subtree is filled and the newly filled leaves are added to
+// scheduled_basic_blocks. Leaves are left untouched.
+void gmmu_t::traverse_and_fill_lp_tree(
+    struct lp_tree_node *node, std::set<mem_addr_t> &scheduled_basic_blocks) {
+  if (node->size == MIN_PREFETCH_SIZE) {
+    return;
+  }
+
+  traverse_and_fill_lp_tree(node->left, scheduled_basic_blocks);
+  traverse_and_fill_lp_tree(node->right, scheduled_basic_blocks);
+  node->valid_size = node->left->valid_size + node->right->valid_size;
+
+  const bool partially_valid = node->valid_size != node->size;
+  const bool mostly_valid = node->valid_size > node->size / 2;
+
+  if (partially_valid && mostly_valid) {
+    fill_lp_tree(node, scheduled_basic_blocks);
+  }
+}
+
+// Post-order walk over the interior nodes under node. After both children
+// have been handled, the node's valid_size is recomputed as their sum; when
+// the node is less than half valid but not empty, its evictable leaves are
+// emptied and added to scheduled_basic_blocks. Leaves are left untouched.
+void gmmu_t::traverse_and_remove_lp_tree(
+    struct lp_tree_node *node, std::set<mem_addr_t> &scheduled_basic_blocks) {
+  if (node->size == MIN_PREFETCH_SIZE) {
+    return;
+  }
+
+  traverse_and_remove_lp_tree(node->left, scheduled_basic_blocks);
+  traverse_and_remove_lp_tree(node->right, scheduled_basic_blocks);
+  node->valid_size = node->left->valid_size + node->right->valid_size;
+
+  const bool holds_data = node->valid_size != 0;
+  const bool mostly_empty = node->valid_size < node->size / 2;
+
+  if (holds_data && mostly_empty) {
+    remove_lp_tree(node, scheduled_basic_blocks);
+  }
+}
+
+// Records ma_uid as a holder of the page containing addr. A uid that is
+// already recorded for that page is not added a second time.
+void gmmu_t::reserve_pages_insert(mem_addr_t addr, unsigned ma_uid) {
+  mem_addr_t page_num = m_gpu->get_global_memory()->get_page_num(addr);
+
+  fflush(stdout);
+
+  // operator[] creates an empty holder list the first time a page is reserved
+  std::list<unsigned> &holders = reserve_pages[page_num];
+
+  const bool already_held =
+      std::find(holders.begin(), holders.end(), ma_uid) != holders.end();
+
+  if (!already_held) {
+    holders.push_back(ma_uid);
+  }
+}
+
+// Removes ma_uid from the holders of the page containing addr and drops the
+// page from reserve_pages once its last holder is gone. Both the page and the
+// uid are expected to be present; this is asserted.
+void gmmu_t::reserve_pages_remove(mem_addr_t addr, unsigned ma_uid) {
+  mem_addr_t page_num = m_gpu->get_global_memory()->get_page_num(addr);
+
+  auto reserved = reserve_pages.find(page_num);
+  assert(reserved != reserve_pages.end());
+
+  std::list<unsigned> &holders = reserved->second;
+
+  std::list<unsigned>::iterator holder =
+      std::find(holders.begin(), holders.end(), ma_uid);
+  assert(holder != holders.end());
+
+  holders.erase(holder);
+
+  if (holders.empty()) {
+    reserve_pages.erase(reserved);
+  }
+}
+
+// Returns true when the page containing addr has no entry in reserve_pages.
+bool gmmu_t::reserve_pages_check(mem_addr_t addr) {
+  mem_addr_t page_num = m_gpu->get_global_memory()->get_page_num(addr);
+  const bool is_reserved = reserve_pages.count(page_num) != 0;
+  return !is_reserved;
+}
+
+// Switches the active hardware prefetcher to the one configured for
+// oversubscription. Any oversub_prefetcher value other than the four handled
+// below leaves prefetcher unchanged.
+void gmmu_t::update_hardware_prefetcher_oversubscribed() {
+  switch (oversub_prefetcher) {
+    case hwardware_prefetcher_oversub::DISBALED:
+      prefetcher = hwardware_prefetcher::DISBALED;
+      break;
+    case hwardware_prefetcher_oversub::TBN:
+      prefetcher = hwardware_prefetcher::TBN;
+      break;
+    case hwardware_prefetcher_oversub::SEQUENTIAL_LOCAL:
+      prefetcher = hwardware_prefetcher::SEQUENTIAL_LOCAL;
+      break;
+    case hwardware_prefetcher_oversub::RANDOM:
+      prefetcher = hwardware_prefetcher::RANDOM;
+      break;
+    default:
+      break;
+  }
+}
+
+// Records the start or finish time of a kernel in kernel_info.
+//
+// finish == false: inserts (time, 0) for kernel_id; an entry that already
+//                  exists is left as it is.
+// finish == true:  stores time as the end time of an existing entry; an
+//                  unknown kernel_id is ignored.
+void gmmu_t::log_kernel_info(unsigned kernel_id, unsigned long long time,
+                             bool finish) {
+  if (finish) {
+    auto record = kernel_info.find(kernel_id);
+    if (record != kernel_info.end()) {
+      record->second.second = time;
+    }
+    return;
+  }
+
+  kernel_info.insert(std::make_pair(kernel_id, std::make_pair(time, 0)));
+}
+
+// Classifies the access pattern of every managed allocation from the recorded
+// block accesses and, depending on the patterns found, switches dma_mode
+// and/or evict_policy.
+//
+// Steps:
+//   1. name every managed allocation ds1, ds2, ... and start it as UNDECIDED
+//   2. for every kernel in kernel_info, collect per allocation the block
+//      addresses from block_access_list that fall into the kernel's time
+//      window (an end time of 0 is treated as "no upper bound")
+//   3. per kernel and allocation, derive LINEAR / RANDOM (are the accesses in
+//      sorted order?) with or without REUSE (repeated blocks inside the kernel
+//      or blocks already seen in an earlier kernel) and fold that into the
+//      pattern accumulated for the allocation
+//   4. if any allocation is RANDOM, RANDOM_REUSE, MIXED or MIXED_REUSE, set
+//      dma_mode to OVERSUB and evict_policy to TBN; otherwise, if any is
+//      LINEAR_REUSE, set evict_policy to TBN
+void gmmu_t::update_memory_management_policy() {
+  std::map<std::string, ds_pattern> accessPatterns;
+
+  // running index used to build the ds<i> names
+  int i = 1;
+  std::map<std::pair<mem_addr_t, size_t>, std::string> dataStructures;
+  std::map<std::string, std::list<mem_addr_t>> dsUniqueBlocks;
+
+  // get the managed allocations
+  const std::map<uint64_t, struct allocation_info *> &managedAllocations =
+      m_gpu->gpu_get_managed_allocations();
+
+  // loop over managed allocations to create three maps
+  // 1. data structures - key: pair of start addr and size; value: ds_i
+  // 2. access pattern: key: ds_i; value: UNDECIDED pattern
+  // 3. unique accessed blocks for reuse: key: ds_i; value: empty list of block
+  // start address
+  for (std::map<uint64_t, struct allocation_info *>::const_iterator iter =
+           managedAllocations.begin();
+       iter != managedAllocations.end(); iter++) {
+    dataStructures.insert(
+        std::make_pair(std::make_pair(iter->second->gpu_mem_addr,
+                                      iter->second->allocation_size),
+                       std::string("ds" + std::to_string(i))));
+
+    accessPatterns.insert(std::make_pair(std::string("ds" + std::to_string(i)),
+                                         ds_pattern::UNDECIDED));
+    dsUniqueBlocks.insert(std::make_pair(std::string("ds" + std::to_string(i)),
+                                         std::list<mem_addr_t>()));
+    i++;
+  }
+
+  // create three level hierarchy for kernel-wise then data-structure wise block
+  // address first level: name of kernel (k_i); second level: ds_i; third level:
+  // block addresses ordered by access time
+  std::map<unsigned, std::map<std::string, std::list<mem_addr_t>>>
+      kernel_pattern;
+
+  for (std::map<unsigned,
+                std::pair<unsigned long long, unsigned long long>>::iterator
+           k_iter = kernel_info.begin();
+       k_iter != kernel_info.end(); k_iter++) {
+
+    // time window of this kernel as stored in kernel_info
+    unsigned long long start = k_iter->second.first;
+    unsigned long long end = k_iter->second.second;
+
+    std::map<std::string, std::list<mem_addr_t>> dsAccess;
+
+    for (std::list<std::pair<unsigned long long, mem_addr_t>>::iterator
+             acc_iter = block_access_list.begin();
+         acc_iter != block_access_list.end(); acc_iter++) {
+
+      unsigned long long access_cycle = acc_iter->first;
+      mem_addr_t block_addr = acc_iter->second;
+
+      // end == 0 means no end time has been recorded for the kernel
+      if (access_cycle >= start && ((end == 0) || (access_cycle <= end))) {
+
+        // attribute the access to every allocation whose range contains it
+        for (std::map<std::pair<mem_addr_t, size_t>, std::string>::iterator
+                 ds_iter = dataStructures.begin();
+             ds_iter != dataStructures.end(); ds_iter++) {
+
+          if (block_addr >= ds_iter->first.first &&
+              block_addr < ds_iter->first.first + ds_iter->first.second) {
+            dsAccess[ds_iter->second].push_back(block_addr);
+          }
+        }
+      }
+    }
+
+    kernel_pattern.insert(std::make_pair(k_iter->first, dsAccess));
+  }
+
+  // determine pattern per data structure
+  // first loop on kernel level then on data structures accessed in that kernel
+  for (std::map<unsigned,
+                std::map<std::string, std::list<mem_addr_t>>>::iterator k_iter =
+           kernel_pattern.begin();
+       k_iter != kernel_pattern.end(); k_iter++) {
+
+    for (std::map<std::string, std::list<mem_addr_t>>::iterator da_iter =
+             k_iter->second.begin();
+         da_iter != k_iter->second.end(); da_iter++) {
+
+      // get the sorted list of block addresses belonging to the current
+      // data-structure in current kernel
+      std::list<mem_addr_t> curBlocks = std::list<mem_addr_t>(da_iter->second);
+      curBlocks.sort();
+      curBlocks.unique();
+
+      // check for data reuse
+      bool reuse = false;
+
+      // first within this kernel
+      // if the number of unique blocks accessed and total number of blocks
+      // accessed are not same then there is repetition
+      if (curBlocks.size() != da_iter->second.size()) {
+        reuse = true;
+      }
+
+      // second check if the current accessed blocks are already seen or not
+      std::map<std::string, std::list<mem_addr_t>>::iterator ub_it =
+          dsUniqueBlocks.find(da_iter->first);
+
+      // check for intersection between unique blocks accessed in current kernel
+      // and the previous kernels is null set or not
+      // (only the number of common elements is used below)
+      std::list<int> intersection;
+      std::set_intersection(curBlocks.begin(), curBlocks.end(),
+                            ub_it->second.begin(), ub_it->second.end(),
+                            std::back_inserter(intersection));
+
+      if (intersection.size() != 0) {
+        reuse = true;
+      }
+
+      // add the current blocks to the seen set per data structure
+      // (merge moves the elements out of curBlocks; the list is kept sorted
+      // and free of duplicates for the next set_intersection)
+      ub_it->second.merge(curBlocks);
+      ub_it->second.sort();
+      ub_it->second.unique();
+
+      // now update the pattern
+      std::map<std::string, ds_pattern>::iterator dsp_it =
+          accessPatterns.find(da_iter->first);
+      ds_pattern curPattern;
+
+      // check for linearity or randomness in current kernel
+      // (linear means the accesses, in recorded order, are non-decreasing)
+      if (std::is_sorted(da_iter->second.begin(), da_iter->second.end())) {
+        if (reuse) {
+          curPattern = ds_pattern::LINEAR_REUSE;
+        } else {
+          curPattern = ds_pattern::LINEAR;
+        }
+      } else {
+        if (reuse) {
+          curPattern = ds_pattern::RANDOM_REUSE;
+        } else {
+          curPattern = ds_pattern::RANDOM;
+        }
+      }
+
+      // determine the pattern
+      // (combine the pattern accumulated so far with the one of this kernel;
+      // combinations not listed keep the accumulated pattern)
+      if (dsp_it->second == ds_pattern::UNDECIDED) {
+        dsp_it->second = curPattern;
+      } else if (dsp_it->second == ds_pattern::LINEAR) {
+        if (curPattern == ds_pattern::LINEAR_REUSE) {
+          dsp_it->second = ds_pattern::LINEAR_REUSE;
+        } else if (curPattern == ds_pattern::RANDOM) {
+          dsp_it->second = ds_pattern::MIXED;
+        } else if (curPattern == ds_pattern::RANDOM_REUSE) {
+          dsp_it->second = ds_pattern::MIXED_REUSE;
+        }
+      } else if (dsp_it->second == ds_pattern::LINEAR_REUSE) {
+        if (curPattern == ds_pattern::RANDOM ||
+            curPattern == ds_pattern::RANDOM_REUSE) {
+          dsp_it->second = ds_pattern::MIXED_REUSE;
+        }
+      } else if (dsp_it->second == ds_pattern::RANDOM) {
+        if (curPattern == ds_pattern::RANDOM_REUSE) {
+          dsp_it->second = ds_pattern::RANDOM_REUSE;
+        } else if (curPattern == ds_pattern::LINEAR) {
+          dsp_it->second = ds_pattern::MIXED;
+        } else if (curPattern == ds_pattern::LINEAR_REUSE) {
+          dsp_it->second = ds_pattern::MIXED_REUSE;
+        }
+      } else if (dsp_it->second == ds_pattern::RANDOM_REUSE) {
+        if (curPattern == ds_pattern::LINEAR ||
+            curPattern == ds_pattern::LINEAR_REUSE) {
+          dsp_it->second = ds_pattern::MIXED_REUSE;
+        }
+      }
+    }
+  }
+
+  // which patterns occur in at least one allocation
+  // (is_linear is set but does not take part in the decision below)
+  bool is_random = false, is_random_reuse = false, is_linear = false,
+       is_linear_reuse = false, is_mixed = false, is_mixed_reuse = false;
+
+  for (std::map<std::string, ds_pattern>::iterator ap_iter =
+           accessPatterns.begin();
+       ap_iter != accessPatterns.end(); ap_iter++) {
+    if (ap_iter->second == ds_pattern::RANDOM) {
+      is_random = true;
+    } else if (ap_iter->second == ds_pattern::RANDOM_REUSE) {
+      is_random_reuse = true;
+    } else if (ap_iter->second == ds_pattern::LINEAR) {
+      is_linear = true;
+    } else if (ap_iter->second == ds_pattern::LINEAR_REUSE) {
+      is_linear_reuse = true;
+    } else if (ap_iter->second == ds_pattern::MIXED) {
+      is_mixed = true;
+    } else if (ap_iter->second == ds_pattern::MIXED_REUSE) {
+      is_mixed_reuse = true;
+    }
+  }
+
+  // pick the policies; with only LINEAR or UNDECIDED allocations nothing
+  // changes
+  if (is_random || is_random_reuse || is_mixed || is_mixed_reuse) {
+    dma_mode = dma_type::OVERSUB;
+    evict_policy = eviction_policy::TBN;
+  } else if (is_linear_reuse) {
+    evict_policy = eviction_policy::TBN;
+  }
+}
+
+// Clears valid_size, access_counter and RW on node and on every node below
+// it.
+void gmmu_t::reset_lp_tree_node(struct lp_tree_node *node) {
+  node->RW = 0;
+  node->access_counter = 0;
+  node->valid_size = 0;
+
+  const bool is_leaf = (node->size == MIN_PREFETCH_SIZE);
+  if (is_leaf) {
+    return;
+  }
+
+  reset_lp_tree_node(node->left);
+  reset_lp_tree_node(node->right);
+}
+
+// Resets the bookkeeping of every registered large page tree and clears the
+// over_sub flag.
+void gmmu_t::reset_large_page_info() {
+  for (struct lp_tree_node *large_page : large_page_info) {
+    reset_lp_tree_node(large_page);
+  }
+
+  over_sub = false;
+}
+
+// Returns the start address of the eviction unit that page_addr belongs to
+// under the current evict_policy:
+//   - TBN / SEQUENTIAL_LOCAL: start of the basic block containing page_addr
+//   - LRU4K / RANDOM:         page_addr itself
+//   - any other policy:       start of the large page containing page_addr
+mem_addr_t gmmu_t::get_eviction_base_addr(mem_addr_t page_addr) {
+  struct lp_tree_node *large_page = m_gpu->getGmmu()->get_lp_node(page_addr);
+
+  const bool block_granular =
+      evict_policy == eviction_policy::TBN ||
+      evict_policy == eviction_policy::SEQUENTIAL_LOCAL;
+  if (block_granular) {
+    return m_gpu->getGmmu()->get_basic_block(large_page, page_addr);
+  }
+
+  const bool page_granular = evict_policy == eviction_policy::LRU4K ||
+                             evict_policy == eviction_policy::RANDOM;
+  if (page_granular) {
+    return page_addr;
+  }
+
+  return large_page->addr;
+}
+
+// Returns the size of the eviction unit that page_addr belongs to under the
+// current evict_policy:
+//   - TBN / SEQUENTIAL_LOCAL: MIN_PREFETCH_SIZE
+//   - LRU4K / RANDOM:         m_config.page_size
+//   - any other policy:       size of the large page containing page_addr
+size_t gmmu_t::get_eviction_granularity(mem_addr_t page_addr) {
+  struct lp_tree_node *large_page = m_gpu->getGmmu()->get_lp_node(page_addr);
+
+  const bool block_granular =
+      evict_policy == eviction_policy::TBN ||
+      evict_policy == eviction_policy::SEQUENTIAL_LOCAL;
+  if (block_granular) {
+    return MIN_PREFETCH_SIZE;
+  }
+
+  const bool page_granular = evict_policy == eviction_policy::LRU4K ||
+                             evict_policy == eviction_policy::RANDOM;
+  if (page_granular) {
+    return m_config.page_size;
+  }
+
+  return large_page->size;
+}
+
+// ORs type into the RW field of every node on the path from the root of the
+// large page containing addr down to the MIN_PREFETCH_SIZE leaf selected by
+// addr (both ends included).
+void gmmu_t::update_access_type(mem_addr_t addr, int type) {
+  struct lp_tree_node *cursor = m_gpu->getGmmu()->get_lp_node(addr);
+
+  for (;;) {
+    cursor->RW |= type;
+
+    if (cursor->size == MIN_PREFETCH_SIZE) {
+      break;
+    }
+
+    struct lp_tree_node *lower = cursor->left;
+    const bool in_lower =
+        lower->addr <= addr && addr < lower->addr + lower->size;
+    cursor = in_lower ? lower : cursor->right;
+  }
+}
+
+// Returns the low 27 bits of access_counter of the MIN_PREFETCH_SIZE leaf
+// that is reached from node by following addr.
+int gmmu_t::get_bb_access_counter(struct lp_tree_node *node, mem_addr_t addr) {
+  struct lp_tree_node *cursor = node;
+
+  while (cursor->size != MIN_PREFETCH_SIZE) {
+    struct lp_tree_node *lower = cursor->left;
+    const bool in_lower =
+        lower->addr <= addr && addr < lower->addr + lower->size;
+    cursor = in_lower ? lower : cursor->right;
+  }
+
+  return cursor->access_counter & ((1 << 27) - 1);
+}
+
+// Returns the round-trip field of the MIN_PREFETCH_SIZE leaf that is reached
+// from node by following addr. The field is the 6 bits of access_counter that
+// start at bit 27.
+//
+// NOTE(review): if access_counter is only 32 bits wide, just 5 of those bits
+// exist -- confirm the declared width of lp_tree_node::access_counter.
+int gmmu_t::get_bb_round_trip(struct lp_tree_node *node, mem_addr_t addr) {
+  while (node->size != MIN_PREFETCH_SIZE) {
+    if (node->left->addr <= addr &&
+        addr < node->left->addr + node->left->size) {
+      node = node->left;
+    } else {
+      node = node->right;
+    }
+  }
+
+  // Shift first, then mask. Building the mask in place (63 << 27) does not fit
+  // a 32-bit int, and overflowing a signed left shift is undefined behaviour.
+  return (int)((node->access_counter >> 27) & ((1 << 6) - 1));
+}
+
+// Counts one access to addr: increments access_counter on every node from the
+// root of the large page containing addr down to the MIN_PREFETCH_SIZE leaf.
+//
+// On a leaf only the low 27 bits hold the access count (the bits above hold
+// the round-trip field). When those 27 bits are saturated, all counters are
+// halved through reset_bb_access_counter() before the leaf is incremented, so
+// the count cannot carry into the round-trip field.
+void gmmu_t::inc_bb_access_counter(mem_addr_t addr) {
+  struct lp_tree_node *node = m_gpu->getGmmu()->get_lp_node(addr);
+
+  while (node->size != MIN_PREFETCH_SIZE) {
+    node->access_counter++;
+
+    if (node->left->addr <= addr &&
+        addr < node->left->addr + node->left->size) {
+      node = node->left;
+    } else {
+      node = node->right;
+    }
+  }
+
+  // Compare only the count bits. Comparing the whole word misses saturation
+  // as soon as the round-trip field is non-zero.
+  const unsigned counter_mask = (1u << 27) - 1;
+
+  if ((node->access_counter & counter_mask) == counter_mask) {
+    reset_bb_access_counter();
+  }
+
+  node->access_counter++;
+}
+
+// Increments the round-trip field (6 bits starting at bit 27 of
+// access_counter) of every MIN_PREFETCH_SIZE leaf under node. The low 27 bits
+// of each leaf are preserved.
+//
+// When a leaf's field is saturated, reset_bb_round_trip() is called before the
+// increment. The packing is done in unsigned 64-bit arithmetic because
+// shifting these values left by 27 in a 32-bit int overflows.
+//
+// NOTE(review): if access_counter is only 32 bits wide the 6-bit field does
+// not fit above bit 27 and the top bit is lost on assignment -- confirm the
+// declared width of lp_tree_node::access_counter.
+void gmmu_t::inc_bb_round_trip(struct lp_tree_node *node) {
+  if (node->size != MIN_PREFETCH_SIZE) {
+    inc_bb_round_trip(node->left);
+    inc_bb_round_trip(node->right);
+  } else {
+    const unsigned long long counter_mask = (1ULL << 27) - 1;
+    const unsigned long long round_trip_max = (1ULL << 6) - 1;
+
+    if (((node->access_counter >> 27) & round_trip_max) == round_trip_max) {
+      reset_bb_round_trip();
+    }
+
+    // read the field again: the reset above may have changed this leaf
+    unsigned long long round_trip =
+        (node->access_counter >> 27) & round_trip_max;
+    round_trip++;
+
+    node->access_counter =
+        (round_trip << 27) | (node->access_counter & counter_mask);
+  }
+}
+
+// Halves the access counts in the subtree rooted at node.
+//
+// Leaf (MIN_PREFETCH_SIZE): only the low 27 bits are halved; the round-trip
+// field (6 bits starting at bit 27) is kept as it is.
+// Interior node: both children are processed first, then the node's whole
+// access_counter is shifted right by one.
+//
+// The leaf is repacked in unsigned 64-bit arithmetic because building the
+// field mask or shifting the field left by 27 in a 32-bit int overflows.
+void gmmu_t::traverse_and_reset_access_counter(struct lp_tree_node *node) {
+  if (node->size == MIN_PREFETCH_SIZE) {
+    const unsigned long long round_trip =
+        (node->access_counter >> 27) & ((1ULL << 6) - 1);
+    const unsigned long long halved_count =
+        (node->access_counter & ((1ULL << 27) - 1)) >> 1;
+
+    node->access_counter = (round_trip << 27) | halved_count;
+  } else {
+    traverse_and_reset_access_counter(node->left);
+    traverse_and_reset_access_counter(node->right);
+    node->access_counter = node->access_counter >> 1;
+  }
+}
+
+// Runs traverse_and_reset_access_counter() on every registered large page
+// tree.
+void gmmu_t::reset_bb_access_counter() {
+  for (struct lp_tree_node *large_page : large_page_info) {
+    traverse_and_reset_access_counter(large_page);
+  }
+}
+
+// Halves the round-trip field (6 bits starting at bit 27 of access_counter)
+// of every MIN_PREFETCH_SIZE leaf in the subtree rooted at node. The low 27
+// bits of each leaf are preserved and interior nodes are not modified.
+//
+// Interior nodes recurse into this same function. Recursing into
+// traverse_and_reset_access_counter() instead would halve the access counts
+// of the leaves and leave their round-trip fields untouched.
+void gmmu_t::traverse_and_reset_round_trip(struct lp_tree_node *node) {
+  if (node->size == MIN_PREFETCH_SIZE) {
+    // shift before masking: the mask built in place (63 << 27) overflows a
+    // 32-bit int
+    const unsigned long long halved_round_trip =
+        ((node->access_counter >> 27) & ((1ULL << 6) - 1)) >> 1;
+    const unsigned long long count =
+        node->access_counter & ((1ULL << 27) - 1);
+
+    node->access_counter = (halved_round_trip << 27) | count;
+  } else {
+    traverse_and_reset_round_trip(node->left);
+    traverse_and_reset_round_trip(node->right);
+  }
+}
+
+// Applies traverse_and_reset_round_trip() to the root of every tree held in
+// large_page_info.
+void gmmu_t::reset_bb_round_trip() {
+  for (struct lp_tree_node *lp_root : large_page_info) {
+    traverse_and_reset_round_trip(lp_root);
+  }
+}
+
+// Decides whether a faulting access should migrate its page to the device
+// (true) or be served remotely without migration (false).
+//
+// @param addr      address of the faulting access
+// @param is_write  true when the access is a write
+// @return true to migrate the page, false to leave it in place
+//
+// Policy per dma_mode:
+//   DISABLED  always migrate.
+//   ALWAYS    writes migrate; reads migrate once the block's access counter
+//             has reached m_config.migrate_threshold.
+//   OVERSUB   same as ALWAYS while over_sub is set, otherwise always migrate.
+//   ADAPTIVE  writes migrate; reads migrate once the access counter reaches a
+//             derived threshold: under over-subscription it is
+//             migrate_threshold * multiply_dma_penalty * (round trips + 1),
+//             otherwise 1 + migrate_threshold * projected occupancy.
+// Any other mode falls back to "migrate", so the function never runs off its
+// end without returning a value.
+bool gmmu_t::should_cause_page_migration(mem_addr_t addr, bool is_write) {
+  if (dma_mode == dma_type::DISABLED) {
+    return true;
+  }
+
+  // OVERSUB only throttles migration once memory is over-subscribed
+  if (dma_mode == dma_type::OVERSUB && !over_sub) {
+    return true;
+  }
+
+  // in every remaining mode a write always migrates the page
+  if (is_write) {
+    return true;
+  }
+
+  if (dma_mode == dma_type::ALWAYS || dma_mode == dma_type::OVERSUB) {
+    struct lp_tree_node *root = m_gpu->getGmmu()->get_lp_node(addr);
+
+    return !(get_bb_access_counter(root, addr) < m_config.migrate_threshold);
+  }
+
+  if (dma_mode == dma_type::ADAPTIVE) {
+    struct lp_tree_node *root = m_gpu->getGmmu()->get_lp_node(addr);
+
+    int derived_threshold;
+
+    if (over_sub) {
+      // every recorded round trip of the block scales the static threshold up
+      derived_threshold = m_config.migrate_threshold *
+                          m_config.multiply_dma_penalty *
+                          (get_bb_round_trip(root, addr) + 1);
+    } else {
+      // pages waiting in the staging queues feed the occupancy projection
+      size_t num_read_stage_queue = 0;
+
+      for (std::list<pcie_latency_t *>::iterator iter =
+               pcie_read_stage_queue.begin();
+           iter != pcie_read_stage_queue.end(); ++iter) {
+        num_read_stage_queue += (*iter)->page_list.size();
+      }
+
+      size_t num_write_stage_queue = 0;
+
+      for (std::list<pcie_latency_t *>::iterator iter =
+               pcie_write_stage_queue.begin();
+           iter != pcie_write_stage_queue.end(); ++iter) {
+        num_write_stage_queue += (*iter)->page_list.size();
+      }
+
+      derived_threshold =
+          (int)(1.0 + m_config.migrate_threshold *
+                          m_gpu->get_global_memory()->get_projected_occupancy(
+                              num_read_stage_queue, num_write_stage_queue,
+                              m_config.free_page_buffer_percentage));
+    }
+
+    return !(get_bb_access_counter(root, addr) < derived_threshold);
+  }
+
+  // unknown mode: behave as if DMA were disabled
+  return true;
+}
+
+// Advances the GMMU by one simulation cycle.
+//
+// One call performs, in this order:
+//   1. counts the pages held in the PCI-e read/write staging queues (plus the
+//      in-flight write-back) and runs the eviction procedure when
+//      should_evict_page() requests it;
+//   2. retires the in-flight write-back once its ready cycle has passed;
+//   3. starts the next staged write-back when the write lane is free,
+//      invalidating and freeing its pages;
+//   4. retires the in-flight read-side transfer (page transfer, far-fault
+//      handling or DMA) once its ready cycle has passed;
+//   5. starts the next read-side transfer, preferring DMA entries;
+//   6. drains the page table walk queue, classifying each memory fetch as a
+//      hit, a pending fault, a DMA access or a new page fault;
+//   7. hands the new page faults to the hardware prefetcher;
+//   8. pulls at most one memory fetch per SIMT cluster into the page table
+//      walk queue;
+//   9. services the active prefetch request at the head of
+//      prefetch_req_buffer;
+//  10. recomputes skip_cycles from the sets of all / failing warps.
+//
+// Note that step 9 returns early when a prefetch request completes, in which
+// case step 10 is not executed in that cycle.
+void gmmu_t::cycle() {
+  int simt_cluster_id = 0;
+
+  size_t num_read_stage_queue = 0;
+
+  for (std::list<pcie_latency_t *>::iterator iter =
+           pcie_read_stage_queue.begin();
+       iter != pcie_read_stage_queue.end(); iter++) {
+    num_read_stage_queue += (*iter)->page_list.size();
+  }
+
+  size_t num_write_stage_queue = 0;
+
+  for (std::list<pcie_latency_t *>::iterator iter =
+           pcie_write_stage_queue.begin();
+       iter != pcie_write_stage_queue.end(); iter++) {
+    num_write_stage_queue += (*iter)->page_list.size();
+  }
+
+  // pages of the write-back currently on the lane count as staged writes too
+  num_write_stage_queue += pcie_write_latency_queue != NULL
+                               ? pcie_write_latency_queue->page_list.size()
+                               : 0;
+
+  if (m_gpu->get_global_memory()->should_evict_page(
+          num_read_stage_queue, num_write_stage_queue,
+          m_config.free_page_buffer_percentage)) {
+
+    if (m_config.enable_smart_runtime) {
+      update_memory_management_policy();
+    }
+
+    page_eviction_procedure();
+  }
+
+  // check whether current transfer in the pcie write latency queue is finished
+  if (pcie_write_latency_queue != NULL &&
+      (m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) >=
+          pcie_write_latency_queue->ready_cycle) {
+
+    for (std::list<mem_addr_t>::iterator iter =
+             pcie_write_latency_queue->page_list.begin();
+         iter != pcie_write_latency_queue->page_list.end(); iter++) {
+      m_gpu->gpu_writeback(m_gpu->get_global_memory()->get_mem_addr(*iter));
+    }
+
+    if (sim_prof_enable) {
+      // close the profiling record that was opened when this write-back was
+      // scheduled; it is matched by the address of the first page
+      for (std::list<event_stats *>::iterator iter = writeback_stats.begin();
+           iter != writeback_stats.end(); iter++) {
+        if (((memory_stats *)(*iter))->start_addr ==
+            m_gpu->get_global_memory()->get_mem_addr(
+                pcie_write_latency_queue->page_list.front())) {
+          event_stats *wb = *iter;
+          wb->end_time = m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle;
+          sim_prof[wb->start_time].push_back(wb);
+          writeback_stats.erase(iter);
+          break;
+        }
+      }
+    }
+
+    // NOTE(review): the pcie_latency_t is dropped here without being deleted;
+    // confirm whether another owner releases it.
+    pcie_write_latency_queue = NULL;
+  }
+
+  // schedule a write back transfer if there is a write back request in staging
+  // queue and a free lane
+  if (!pcie_write_stage_queue.empty() && pcie_write_latency_queue == NULL) {
+    pcie_write_latency_queue = pcie_write_stage_queue.front();
+    pcie_write_latency_queue->ready_cycle =
+        get_ready_cycle(pcie_write_latency_queue->page_list.size());
+
+    // record the lane utilization for every cycle of the transfer
+    for (unsigned long long write_period = m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle;
+         write_period != pcie_write_latency_queue->ready_cycle; write_period++)
+      m_new_stats->pcie_write_utilization.push_back(std::make_pair(
+          write_period,
+          get_pcie_utilization(pcie_write_latency_queue->page_list.size())));
+
+    for (std::list<mem_addr_t>::iterator iter =
+             pcie_write_latency_queue->page_list.begin();
+         iter != pcie_write_latency_queue->page_list.end(); iter++) {
+      m_new_stats->page_thrashing[*iter].push_back(false);
+
+      if (m_gpu->get_global_memory()->is_page_dirty(*iter)) {
+        m_new_stats->page_evict_dirty++;
+      } else {
+        m_new_stats->page_evict_not_dirty++;
+      }
+
+      m_gpu->get_global_memory()->invalidate_page(*iter);
+      m_gpu->get_global_memory()->clear_page_dirty(*iter);
+      m_gpu->get_global_memory()->clear_page_access(*iter);
+
+      m_gpu->get_global_memory()->free_pages(1);
+
+      tlb_flush(*iter);
+    }
+
+    // the evicted pages count as one more round trip for their blocks
+    struct lp_tree_node *root =
+        m_gpu->getGmmu()->get_lp_node(m_gpu->get_global_memory()->get_mem_addr(
+            pcie_write_latency_queue->page_list.front()));
+    inc_bb_round_trip(root);
+
+    if (sim_prof_enable) {
+      if (pcie_write_latency_queue->type == latency_type::INVALIDATE &&
+          m_config.invalidate_clean) {
+        event_stats *inv = new memory_stats(
+            invalidate, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle,
+            m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle,
+            m_gpu->get_global_memory()->get_mem_addr(
+                pcie_write_latency_queue->page_list.front()),
+            pcie_write_latency_queue->page_list.size() * m_config.page_size, 0);
+        sim_prof[inv->start_time].push_back(inv);
+      } else {
+        event_stats *wb = new memory_stats(
+            write_back, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle,
+            m_gpu->get_global_memory()->get_mem_addr(
+                pcie_write_latency_queue->page_list.front()),
+            pcie_write_latency_queue->page_list.size() * m_config.page_size, 0);
+        writeback_stats.push_back(wb);
+      }
+    }
+
+    pcie_write_stage_queue.pop_front();
+
+    // an invalidation of clean pages does not occupy the write lane
+    if (pcie_write_latency_queue->type == latency_type::INVALIDATE &&
+        m_config.invalidate_clean) {
+      pcie_write_latency_queue = NULL;
+    }
+  }
+
+  list<mem_addr_t> page_finsihed_for_mf;
+
+  // check whether the current transfer in the pcie latency queue is finished
+  if (pcie_read_latency_queue != NULL &&
+      (m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) >=
+          pcie_read_latency_queue->ready_cycle) {
+
+    if (pcie_read_latency_queue->type == latency_type::PCIE_READ) {
+
+      for (std::list<mem_addr_t>::iterator iter =
+               pcie_read_latency_queue->page_list.begin();
+           iter != pcie_read_latency_queue->page_list.end(); iter++) {
+        // validate the page in page table
+        m_gpu->get_global_memory()->validate_page(*iter);
+
+        // add to the valid pages list
+        refresh_valid_pages(m_gpu->get_global_memory()->get_mem_addr(*iter));
+
+        m_new_stats->page_thrashing[*iter].push_back(true);
+
+        // check if the transferred page is part of a prefetch request
+        if (!prefetch_req_buffer.empty()) {
+
+          prefetch_req &pre_q = prefetch_req_buffer.front();
+
+          std::list<mem_addr_t>::iterator iter2 =
+              find(pre_q.pending_prefetch.begin(), pre_q.pending_prefetch.end(),
+                   *iter);
+
+          if (iter2 != pre_q.pending_prefetch.end()) {
+
+            // pending prefetch holds the list of 4KB pages of a big chunk of
+            // transfer (max up to 2MB); remove it from the list as the PCI-e
+            // has transferred the page
+            pre_q.pending_prefetch.erase(iter2);
+
+            // if this page is part of the current prefetch request
+            // add all the dependent memory requests to the
+            // outgoing_replayable_nacks; these should be replayed only when
+            // the current block of memory transfer is finished
+            pre_q.outgoing_replayable_nacks[*iter].merge(req_info[*iter]);
+
+            // erase the page from the MSHR map
+            req_info.erase(req_info.find(*iter));
+
+            skip_cycles = false;
+
+            // turn the stored start cycle into the elapsed latency
+            m_new_stats->pf_page_fault_latency[*iter].back() =
+                m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle -
+                m_new_stats->pf_page_fault_latency[*iter].back();
+          }
+        }
+
+        // this page request is created by core on page fault and not part of a
+        // prefetch
+        if (req_info.find(*iter) != req_info.end()) {
+
+          page_finsihed_for_mf.push_back(*iter);
+
+          // for all memory fetches that were waiting for this page, should be
+          // replayed back for cache access
+          for (std::list<mem_fetch *>::iterator iter2 = req_info[*iter].begin();
+               iter2 != req_info[*iter].end(); iter2++) {
+            mem_fetch *mf = *iter2;
+
+            simt_cluster_id = mf->get_sid() / m_config.num_core_per_cluster();
+
+            // push the memory fetch into the gmmu to cu queue
+            (m_gpu->getSIMTCluster(simt_cluster_id))->push_gmmu_cu_queue(mf);
+          }
+
+          // erase the page from the MSHR map
+          req_info.erase(req_info.find(*iter));
+
+          skip_cycles = false;
+
+          // elapsed latency of the memory-fetch fault: subtract the value
+          // already stored in the same mf_page_fault_latency entry
+          // (presumably the fault start cycle -- its producer is not in this
+          // function). Reading pf_page_fault_latency here would mix in the
+          // prefetch bookkeeping and call back() on a possibly empty entry.
+          m_new_stats->mf_page_fault_latency[*iter].back() =
+              m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle -
+              m_new_stats->mf_page_fault_latency[*iter].back();
+        }
+      }
+    } else if (pcie_read_latency_queue->type ==
+               latency_type::PAGE_FAULT) { // processed far-fault is returned to
+                                           // upward queue
+
+      if (sim_prof_enable) {
+        for (std::list<event_stats *>::iterator iter = fault_stats.begin();
+             iter != fault_stats.end(); iter++) {
+          if (((page_fault_stats *)(*iter))->transfering_pages.front() ==
+              pcie_read_latency_queue->page_list.front()) {
+            event_stats *mf_fault = *iter;
+            mf_fault->end_time = m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle;
+            sim_prof[mf_fault->start_time].push_back(mf_fault);
+            fault_stats.erase(iter);
+            break;
+          }
+        }
+      }
+    } else if (pcie_read_latency_queue->type ==
+               latency_type::DMA) { // processed DMA request is returned to
+                                    // upward queue
+      mem_fetch *mf = pcie_read_latency_queue->mf;
+
+      simt_cluster_id = mf->get_sid() / m_config.num_core_per_cluster();
+
+      // push the memory fetch into the gmmu to cu queue
+      (m_gpu->getSIMTCluster(simt_cluster_id))->push_gmmu_cu_queue(mf);
+    }
+    // NOTE(review): as with the write lane, the finished pcie_latency_t is
+    // not deleted here; confirm ownership.
+    pcie_read_latency_queue = NULL;
+  }
+
+  // schedule a transfer if there is a pending item in staging queue and
+  // nothing is being served at the read latency queue and we have available
+  // free pages
+
+  if (!pcie_read_stage_queue.empty() && pcie_read_latency_queue == NULL &&
+      m_gpu->get_global_memory()->get_free_pages() >=
+          pcie_read_stage_queue.front()->page_list.size()) {
+
+    // look for the first DMA entry anywhere in the staging queue
+    std::list<pcie_latency_t *>::const_iterator iter =
+        pcie_read_stage_queue.begin();
+    for (; iter != pcie_read_stage_queue.end(); iter++) {
+      if ((*iter)->type == latency_type::DMA) {
+        break;
+      }
+    }
+
+    // prioritize dma before page migration
+    if (iter == pcie_read_stage_queue.end()) {
+      pcie_read_latency_queue = pcie_read_stage_queue.front();
+    } else {
+      pcie_read_latency_queue = *iter;
+    }
+
+    if (pcie_read_latency_queue->type == latency_type::PCIE_READ) {
+      pcie_read_latency_queue->ready_cycle =
+          get_ready_cycle(pcie_read_latency_queue->page_list.size());
+      if (sim_prof_enable) {
+        event_stats *cp_h2d =
+            new memory_stats(memcpy_h2d, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,
+                             pcie_read_latency_queue->ready_cycle,
+                             pcie_read_latency_queue->start_addr,
+                             pcie_read_latency_queue->size, 0);
+        sim_prof[m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle].push_back(cp_h2d);
+      }
+
+      // record the lane utilization for every cycle of the transfer
+      for (unsigned long long read_period = m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle;
+           read_period != pcie_read_latency_queue->ready_cycle; read_period++)
+        m_new_stats->pcie_read_utilization.push_back(std::make_pair(
+            read_period,
+            get_pcie_utilization(pcie_read_latency_queue->page_list.size())));
+
+      m_gpu->get_global_memory()->alloc_pages(
+          pcie_read_latency_queue->page_list.size());
+    } else if (pcie_read_latency_queue->type ==
+               latency_type::PAGE_FAULT) { // schedule far-fault for transfer
+
+      pcie_read_latency_queue->ready_cycle =
+          m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle +
+          m_config.page_fault_latency *
+              pcie_read_latency_queue->page_list.size();
+
+      if (sim_prof_enable) {
+        event_stats *mf_fault = new page_fault_stats(
+            m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle,
+            pcie_read_latency_queue->page_list,
+            pcie_read_latency_queue->page_list.size() * m_config.page_size);
+        fault_stats.push_back(mf_fault);
+      }
+    } else if (pcie_read_latency_queue->type ==
+               latency_type::DMA) { // schedule DMA request for transfer
+      pcie_read_latency_queue->ready_cycle =
+          get_ready_cycle_dma(pcie_read_latency_queue->mf->get_access_size());
+      if (sim_prof_enable) {
+        event_stats *ma_dma =
+            new memory_stats(dma, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,
+                             pcie_read_latency_queue->ready_cycle,
+                             pcie_read_latency_queue->mf->get_addr(),
+                             pcie_read_latency_queue->mf->get_access_size(), 0);
+        sim_prof[m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle].push_back(ma_dma);
+      }
+    }
+
+    // remove the scheduled transfer from read stage queue
+    if (iter == pcie_read_stage_queue.end()) {
+      pcie_read_stage_queue.pop_front();
+    } else {
+      pcie_read_stage_queue.erase(iter);
+    }
+  }
+
+  std::map<mem_addr_t, std::list<mem_fetch *>> page_fault_this_turn;
+
+  // check the page_table_walk_delay_queue
+  while (!page_table_walk_queue.empty() &&
+         ((m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) >=
+          page_table_walk_queue.front().ready_cycle)) {
+
+    mem_fetch *mf = page_table_walk_queue.front().mf;
+
+    // list<mem_addr_t> page_list = m_gpu->get_global_memory()->get_faulty_pages(
+        // mf->get_addr(), mf->get_access_size());
+
+    list<mem_addr_t> page_list = get_faulty_pages(
+        mf->get_addr(), mf->get_access_size());
+
+    simt_cluster_id = mf->get_sid() / m_config.num_core_per_cluster();
+    // if there is no page fault, directly return to the upward queue of cluster
+    if (page_list.empty()) {
+      mem_addr_t page_num = m_gpu->get_global_memory()->get_page_num(
+          mf->get_mem_access().get_addr());
+      check_write_stage_queue(page_num, false);
+
+      (m_gpu->getSIMTCluster(simt_cluster_id))->push_gmmu_cu_queue(mf);
+
+      m_new_stats->mf_page_hit[simt_cluster_id]++;
+    } else {
+      assert(page_list.size() == 1);
+
+      m_new_stats->mf_page_miss[simt_cluster_id]++;
+
+      // the page request is already there in MSHR either as a page fault or as
+      // part of scheduled prefetch request
+      if (req_info.find(*(page_list.begin())) != req_info.end()) {
+        m_new_stats->mf_page_fault_pending++;
+        req_info[*(page_list.begin())].push_back(mf);
+      } else {
+
+        // if the memory fetch is part of any requests in the prefetch command
+        // buffer then add it to the incoming replayable_nacks
+        std::list<prefetch_req>::iterator iter;
+
+        for (iter = prefetch_req_buffer.begin();
+             iter != prefetch_req_buffer.end(); iter++) {
+
+          if (iter->start_addr <= mf->get_addr() &&
+              mf->get_addr() < iter->start_addr + iter->size) {
+
+            m_new_stats->mf_page_fault_pending++;
+
+            iter->incoming_replayable_nacks[page_list.front()].push_back(mf);
+            break;
+          }
+        }
+
+        // if the memory fetch is not part of any request in the prefetch
+        // command buffer
+        if (iter == prefetch_req_buffer.end()) {
+
+          // serve the access over DMA when the migration policy declines to
+          // migrate the page (DMA enabled, read access, access counter below
+          // the threshold)
+          if (!should_cause_page_migration(mf->get_mem_access().get_addr(),
+                                           mf->get_mem_access().get_type() ==
+                                               GLOBAL_ACC_W)) {
+
+            m_new_stats->num_dma++;
+            pcie_latency_t *p_t = new pcie_latency_t();
+
+            mf->set_dma();
+
+            p_t->mf = mf;
+            p_t->type = latency_type::DMA;
+
+            pcie_read_stage_queue.push_back(p_t);
+          } else {
+            if (dma_mode != dma_type::DISABLED &&
+                mf->get_mem_access().get_type() == GLOBAL_ACC_W) {
+              m_new_stats->dma_page_transfer_write++;
+            } else if (dma_mode != dma_type::DISABLED &&
+                       mf->get_mem_access().get_type() == GLOBAL_ACC_R) {
+              m_new_stats->dma_page_transfer_read++;
+            }
+
+            page_fault_this_turn[page_list.front()].push_back(mf);
+          }
+        }
+      }
+    }
+
+    page_table_walk_queue.pop_front();
+  }
+
+  // call hardware prefetcher based on the current page faults
+  do_hardware_prefetch(page_fault_this_turn);
+
+  // fetch from cluster's cu to gmmu queue and push it into the page table walk
+  // delay queue
+  for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
+
+    if (!(m_gpu->getSIMTCluster(i))->empty_cu_gmmu_queue()) {
+
+      mem_fetch *mf = (m_gpu->getSIMTCluster(i))->front_cu_gmmu_queue();
+
+      struct page_table_walk_latency_t pt_t;
+      pt_t.mf = mf;
+      pt_t.ready_cycle =
+          m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_config.page_table_walk_latency;
+
+      page_table_walk_queue.push_back(pt_t);
+
+      (m_gpu->getSIMTCluster(i))->pop_cu_gmmu_queue();
+    }
+  }
+
+  // check if there is an active outstanding prefetch request
+  if (!prefetch_req_buffer.empty() && prefetch_req_buffer.front().active) {
+
+    prefetch_req &pre_q = prefetch_req_buffer.front();
+
+    // schedule for page transfers from the active prefetch request when there
+    // is no pending transfer for the same; can be the very first time or a
+    // scheduled big chunk of pages (2MB) is finished just now
+    if (pre_q.pending_prefetch.empty()) {
+
+      // case when the last schedule finished, it is not the first time
+      if (pre_q.cur_addr > pre_q.start_addr) {
+
+        if (sim_prof_enable) {
+          update_sim_prof_prefetch_break_down(m_gpu->gpu_sim_cycle +
+                                              m_gpu->gpu_tot_sim_cycle);
+        }
+
+        m_new_stats->pf_fault_latency.back().second =
+            m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle -
+            m_new_stats->pf_fault_latency.back().second;
+
+        // all the memory fetches created by core on page fault were aggregated
+        // earlier; now they are replayed back together to the core
+        for (map<mem_addr_t, std::list<mem_fetch *>>::iterator iter =
+                 pre_q.outgoing_replayable_nacks.begin();
+             iter != pre_q.outgoing_replayable_nacks.end(); iter++) {
+
+          for (std::list<mem_fetch *>::iterator iter2 = iter->second.begin();
+               iter2 != iter->second.end(); iter2++) {
+
+            mem_fetch *mf = *iter2;
+
+            simt_cluster_id = mf->get_sid() / m_config.num_core_per_cluster();
+            // push them to the upward queue to replay them back to the
+            // corresponding core in bulk
+            (m_gpu->getSIMTCluster(simt_cluster_id))->push_gmmu_cu_queue(mf);
+          }
+        }
+        pre_q.outgoing_replayable_nacks.clear();
+      }
+
+      // all the memory fetches have been replayed and
+      // the prefetch request is completed entirely
+      // now signal the stream that the operation is finished so that it can
+      // schedule something else
+      if (pre_q.cur_addr == pre_q.start_addr + pre_q.size) {
+
+        pre_q.m_stream->record_next_done();
+
+        if (sim_prof_enable) {
+          update_sim_prof_prefetch(pre_q.start_addr, pre_q.size,
+                                   m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle);
+        }
+
+        prefetch_req_buffer.pop_front();
+        return;
+      }
+
+      mem_addr_t start_addr = 0;
+
+      pcie_latency_t *p_t = new pcie_latency_t();
+
+      // break the loop if
+      //  Case 1: reach the end of this prefetch
+      //  Case 2: it reaches the 2MB line from starting of the allocation
+      //  Case 3: it encounters a valid page in between
+      do {
+        // get the page number for the current updated address
+        mem_addr_t page_num =
+            m_gpu->get_global_memory()->get_page_num(pre_q.cur_addr);
+
+        // update the current address by page size as we break a big chunk (2MB)
+        // in the granularity of the smallest unit of page
+        pre_q.cur_addr += m_config.page_size;
+
+        // check for Case 3, i.e., we encounter a valid page
+        if (m_gpu->get_global_memory()->is_valid(page_num)) {
+
+          m_new_stats->pf_page_hit++;
+
+          // check if this page is currently written back
+          check_write_stage_queue(page_num, false);
+
+          // break out of loop only when we have already scheduled some pages
+          // for transfer if not we will continue skipping valid pages if any
+          // until we find some invalid pages to transfer
+          if (!pre_q.pending_prefetch.empty()) {
+            break;
+          }
+        } else {
+
+          m_new_stats->pf_page_miss++;
+
+          // remember this page as pending under the prefetch request
+          pre_q.pending_prefetch.push_back(page_num);
+
+          if (start_addr == 0) {
+            start_addr = m_gpu->get_global_memory()->get_mem_addr(page_num);
+            // NOTE(review): cur_addr has already been advanced past this page
+            // above, so this records the address of the following page --
+            // confirm whether start_addr was intended instead.
+            p_t->start_addr = pre_q.cur_addr;
+          }
+
+          // just create a placeholder in MSHR for the memory fetches created by
+          // core on page fault later in the time so that they go to outgoing
+          // replayable nacks, rather than incoming
+          req_info[page_num];
+
+          // incoming nacks hold the list of page faults for the transfer which
+          // has not been scheduled yet so instead of pushing them to MSHR and
+          // then again getting back to the outgoing list directly switch
+          // between the incoming and outgoing list of replayable nacks
+          if (pre_q.incoming_replayable_nacks.find(page_num) !=
+              pre_q.incoming_replayable_nacks.end()) {
+            pre_q.outgoing_replayable_nacks[page_num].merge(
+                pre_q.incoming_replayable_nacks[page_num]);
+            pre_q.incoming_replayable_nacks.erase(page_num);
+          }
+
+          // schedule this page as it is not valid to the read stage queue
+          p_t->page_list.push_back(page_num);
+          m_new_stats->pf_page_fault_latency[page_num].push_back(
+              m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle);
+        }
+
+      } while (
+          pre_q.cur_addr !=
+              (pre_q.start_addr +
+               pre_q.size) && // check for Case 1, i.e., we reached the end of
+                              // prefetch request
+          ((unsigned long long)(pre_q.cur_addr - pre_q.allocation_addr)) %
+              ((unsigned long long)
+                   MAX_PREFETCH_SIZE)); // Case 2: allowing maximum transfer
+                                        // size as huge page size of 2MB
+
+      if (!p_t->page_list.empty()) {
+        p_t->size = p_t->page_list.size() * m_config.page_size;
+        p_t->type = latency_type::PCIE_READ;
+        pcie_read_stage_queue.push_back(p_t);
+      } else {
+        // every page of this chunk was already valid: nothing was queued, so
+        // release the unused descriptor instead of leaking it
+        delete p_t;
+      }
+
+      m_new_stats->pf_fault_latency.push_back(
+          std::make_pair(pre_q.pending_prefetch.size() * m_config.page_size,
+                         m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle));
+
+      if (sim_prof_enable && !pre_q.pending_prefetch.empty()) {
+        event_stats *cp_pref_bd = new memory_stats(
+            prefetch_breakdown, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, start_addr,
+            pre_q.pending_prefetch.size() * m_config.page_size,
+            pre_q.m_stream->get_uid());
+        sim_prof[m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle].push_back(cp_pref_bd);
+      }
+    }
+  }
+  
+  // when every known warp is failing, cycles may be skipped only if each of
+  // those warps has a memory fetch parked in the MSHR map (req_info)
+  if (!skip_cycles && !all_warps.empty() && all_warps.size() == fail_warps.size()) {
+    std::set<int> temp_set;
+    for (std::list<shd_warp_t *>::iterator iter = fail_warps.begin(); iter != fail_warps.end(); iter++) {
+      temp_set.insert((*iter)->get_warp_id());
+    }
+
+    // cross off every failing warp that owns a pending memory fetch
+    for (std::map<mem_addr_t, std::list<mem_fetch *>>::iterator iter=req_info.begin();
+          iter != req_info.end(); ++iter) {
+      for(std::list<mem_fetch *>::iterator iter2=iter->second.begin(); iter2!=iter->second.end(); ++iter2) {
+        if (temp_set.find((*iter2)->get_inst().warp_id()) != temp_set.end())
+          temp_set.erase(temp_set.find((*iter2)->get_inst().warp_id()));
+      }
+    }
+
+    if (temp_set.empty()) {
+      skip_cycles = true;
+    } else {
+      skip_cycles = false;
+    }
+    fflush(stdout);
+  }
+}
+
+void gmmu_t::do_hardware_prefetch(
+    std::map<mem_addr_t, std::list<mem_fetch *>> &page_fault_this_turn) {
+  // now decide on transfers as a group of page faults and prefetches
+  if (!page_fault_this_turn.empty()) {
+    unsigned long long num_pages_read_stage_queue = 0;
+
+    for (std::list<pcie_latency_t *>::iterator iter =
+             pcie_read_stage_queue.begin();
+         iter != pcie_read_stage_queue.end(); iter++) {
+      num_pages_read_stage_queue += (*iter)->page_list.size();
+    }
+
+    std::list<std::list<mem_addr_t>> all_transfer_all_page;
+    std::list<std::list<mem_addr_t>> all_transfer_faulty_pages;
+    std::map<mem_addr_t, std::list<mem_fetch *>> temp_req_info;
+
+    // create a tree structure large page -> basic blocks -> faulty pages
+    std::map<mem_addr_t, std::map<mem_addr_t, std::list<mem_addr_t>>>
+        block_tree;
+
+    if (prefetcher == hwardware_prefetcher::DISBALED ||
+        prefetcher == hwardware_prefetcher::RANDOM) {
+      for (std::map<mem_addr_t, std::list<mem_fetch *>>::iterator it =
+               page_fault_this_turn.begin();
+           it != page_fault_this_turn.end(); it++) {
+        std::list<mem_addr_t> temp_pages;
+        temp_pages.push_back(it->first);
+
+        mem_addr_t page_addr =
+            m_gpu->get_global_memory()->get_mem_addr(it->first);
+        struct lp_tree_node *root = get_lp_node(page_addr);
+        update_basic_block(root, page_addr, m_config.page_size, true);
+
+        all_transfer_all_page.push_back(temp_pages);
+        all_transfer_faulty_pages.push_back(temp_pages);
+
+        temp_req_info[it->first];
+
+        if (prefetcher == hwardware_prefetcher::RANDOM) {
+          struct lp_tree_node *root =
+              get_lp_node(m_gpu->get_global_memory()->get_mem_addr(it->first));
+
+          size_t random_size =
+              (rand() % (root->size / m_config.page_size)) * m_config.page_size;
+
+          if (random_size > root->size) {
+            random_size -= root->size;
+          }
+
+          mem_addr_t prefetch_addr = root->addr + random_size;
+
+          mem_addr_t prefetch_page_num =
+              m_gpu->get_global_memory()->get_page_num(prefetch_addr);
+
+          if (!m_gpu->get_global_memory()->is_valid(prefetch_page_num) &&
+              page_fault_this_turn.find(prefetch_addr) ==
+                  page_fault_this_turn.end() &&
+              temp_req_info.find(prefetch_page_num) == temp_req_info.end() &&
+              req_info.find(prefetch_page_num) == req_info.end()) {
+
+            mem_addr_t page_addr =
+                m_gpu->get_global_memory()->get_mem_addr(prefetch_page_num);
+            struct lp_tree_node *root = get_lp_node(page_addr);
+            update_basic_block(root, page_addr, m_config.page_size, true);
+
+            all_transfer_all_page.back().push_back(prefetch_page_num);
+
+            temp_req_info[prefetch_page_num];
+          }
+        }
+      }
+    } else {
+      std::map<mem_addr_t, std::set<mem_addr_t>> lp_pf_groups;
+
+      for (std::map<mem_addr_t, std::list<mem_fetch *>>::iterator it =
+               page_fault_this_turn.begin();
+           it != page_fault_this_turn.end(); it++) {
+        mem_addr_t page_addr =
+            m_gpu->get_global_memory()->get_mem_addr(it->first);
+
+        struct lp_tree_node *root = get_lp_node(page_addr);
+
+        lp_pf_groups[root->addr].insert(page_addr);
+      }
+
+      for (std::map<mem_addr_t, std::set<mem_addr_t>>::iterator lp_pf_iter =
+               lp_pf_groups.begin();
+           lp_pf_iter != lp_pf_groups.end(); lp_pf_iter++) {
+        std::set<mem_addr_t> schedulable_basic_blocks;
+
+        // list of all invalid pages and pages with fault from all basic blocks
+        // to satisfy current transfer size
+        std::list<mem_addr_t> cur_transfer_all_pages;
+        std::list<mem_addr_t> cur_transfer_faulty_pages;
+
+        for (std::set<mem_addr_t>::iterator pf_iter =
+                 lp_pf_iter->second.begin();
+             pf_iter != lp_pf_iter->second.end(); pf_iter++) {
+          mem_addr_t page_addr = *pf_iter;
+
+          struct lp_tree_node *root = get_lp_node(page_addr);
+
+          mem_addr_t bb_addr =
+              update_basic_block(root, page_addr, MIN_PREFETCH_SIZE, true);
+
+          schedulable_basic_blocks.insert(bb_addr);
+
+          cur_transfer_faulty_pages.push_back(
+              m_gpu->get_global_memory()->get_page_num(page_addr));
+        }
+
+        if (prefetcher == hwardware_prefetcher::TBN) {
+          struct lp_tree_node *root = get_lp_node(lp_pf_iter->first);
+          traverse_and_fill_lp_tree(root, schedulable_basic_blocks);
+        }
+
+        for (std::set<mem_addr_t>::iterator bb =
+                 schedulable_basic_blocks.begin();
+             bb != schedulable_basic_blocks.end(); bb++) {
+
+          block_access_list.push_back(
+              std::make_pair(m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, *bb));
+
+          // all the invalid pages in the current 64 K basic block of transfer
+          std::list<mem_addr_t> all_block_pages =
+              m_gpu->get_global_memory()->get_faulty_pages(*bb,
+                                                           MIN_PREFETCH_SIZE);
+
+          for (std::list<mem_addr_t>::iterator pg_iter =
+                   all_block_pages.begin();
+               pg_iter != all_block_pages.end(); pg_iter++) {
+            if (temp_req_info.find(*pg_iter) == temp_req_info.end()) {
+              // mark entry into mshr for all pages in the current basic block
+              temp_req_info[*pg_iter];
+              cur_transfer_all_pages.push_back(*pg_iter);
+            }
+          }
+        }
+
+        all_transfer_all_page.push_back(cur_transfer_all_pages);
+        all_transfer_faulty_pages.push_back(cur_transfer_faulty_pages);
+      }
+    }
+
+    for (std::map<mem_addr_t, std::list<mem_fetch *>>::iterator iter =
+             temp_req_info.begin();
+         iter != temp_req_info.end(); iter++) {
+      req_info[iter->first];
+      req_info[iter->first].merge(iter->second);
+    }
+
+    std::list<std::list<mem_addr_t>>::iterator all_pg_iter =
+        all_transfer_all_page.begin();
+    std::list<std::list<mem_addr_t>>::iterator all_pf_iter =
+        all_transfer_faulty_pages.begin();
+
+    for (; all_pg_iter != all_transfer_all_page.end();
+         all_pg_iter++, all_pf_iter++) {
+      // now we found all the basic blocks for the current transfer size
+      // we now decide on the splits based on page faults
+      std::list<mem_addr_t>::iterator pf_iter = all_pf_iter->begin();
+      std::list<mem_addr_t>::iterator pg_iter = all_pg_iter->begin();
+
+      std::list<mem_addr_t>::iterator prev_pg_iter;
+
+      while (pg_iter != all_pg_iter->end()) {
+
+        // if there is a gap between current and last page
+        // it can be if two basic blocks selected for current transfer size
+        // is separated by other basic blocks
+        // then we send this basic block (or remaining of so) for transfer
+        if (pg_iter != all_pg_iter->begin()) {
+          prev_pg_iter = pg_iter;
+          --prev_pg_iter;
+
+          if ((*pg_iter) != ((*prev_pg_iter) + 1)) {
+
+            // add the current split for transfer
+            pcie_latency_t *p_t = new pcie_latency_t();
+            p_t->start_addr =
+                m_gpu->get_global_memory()->get_mem_addr(all_pg_iter->front());
+            p_t->page_list =
+                std::list<mem_addr_t>(all_pg_iter->begin(), pg_iter);
+            p_t->size = p_t->page_list.size() * m_config.page_size;
+            p_t->type = latency_type::PCIE_READ;
+
+            pcie_read_stage_queue.push_back(p_t);
+
+            // remove the scheduled pages from all pages and move the pointer
+            pg_iter = all_pg_iter->erase(all_pg_iter->begin(), pg_iter);
+          }
+        }
+
+        // we found a page on which a page fault request is pending
+        // now we split upto this and create a memory transfer
+        if ((pf_iter != all_pf_iter->end()) && ((*pf_iter) == (*pg_iter))) {
+
+          if (m_config.enable_accurate_simulation) {
+            pcie_latency_t *f_t = new pcie_latency_t();
+            f_t->page_list.push_back(*pf_iter);
+            f_t->type = latency_type::PAGE_FAULT;
+            pcie_read_stage_queue.push_back(f_t);
+          }
+
+          // add the current split for transfer
+          pcie_latency_t *p_t = new pcie_latency_t();
+          p_t->start_addr =
+              m_gpu->get_global_memory()->get_mem_addr(all_pg_iter->front());
+          p_t->page_list =
+              std::list<mem_addr_t>(all_pg_iter->begin(), ++pg_iter);
+          p_t->size = p_t->page_list.size() * m_config.page_size;
+          p_t->type = latency_type::PCIE_READ;
+
+          pcie_read_stage_queue.push_back(p_t);
+
+          // remove the scheduled pages from all pages and move the pointer
+          pg_iter = all_pg_iter->erase(all_pg_iter->begin(), pg_iter);
+          pf_iter++;
+        } else {
+          pg_iter++;
+        }
+      }
+
+      // prefetch the remaining from the 64K basic block
+      if (!all_pg_iter->empty()) {
+        pcie_latency_t *p_t = new pcie_latency_t();
+        p_t->start_addr =
+            m_gpu->get_global_memory()->get_mem_addr(all_pg_iter->front());
+        p_t->page_list = *all_pg_iter;
+        p_t->size = p_t->page_list.size() * m_config.page_size;
+        p_t->type = latency_type::PCIE_READ;
+
+        pcie_read_stage_queue.push_back(p_t);
+      }
+    }
+
+    // adding statistics for prefetch
+    for (std::map<mem_addr_t, std::list<mem_fetch *>>::iterator iter2 =
+             page_fault_this_turn.begin();
+         iter2 != page_fault_this_turn.end(); iter2++) {
+      assert(req_info[iter2->first].size() == 0);
+
+      // add the pending prefetches to the MSHR entry
+      req_info[iter2->first] = iter2->second;
+
+      m_new_stats->mf_page_fault_outstanding++;
+      m_new_stats->mf_page_fault_pending += req_info[iter2->first].size() - 1;
+
+      m_new_stats->mf_page_fault_latency[iter2->first].push_back(
+          m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle);
+    }
+
+    if (!over_sub && m_gpu->get_global_memory()->should_evict_page(
+                         num_pages_read_stage_queue + temp_req_info.size(), 0,
+                         m_config.free_page_buffer_percentage)) {
+
+      if (m_config.enable_smart_runtime) {
+        update_memory_management_policy();
+      } else {
+        update_hardware_prefetcher_oversubscribed();
+      }
+
+      over_sub = true;
+    }
+  }
+}
+
+void gpgpu_sim::cycle() {
+  int clock_mask = next_clock_domain();
+
+  // the gmmu has the same clock as the core
+  if (clock_mask & GMMU) {
+    m_gmmu->cycle();
+  }
+
+  // skip cycles because all warps stall to wait for mem_fetch come back from gmmu
+  if (skip_cycles_enable && skip_cycles) {
+    if (clock_mask & CORE) {
+      skipped_cycles++;
+      gpu_sim_cycle++;
+    }
+    return;
+  }
+
+  if (clock_mask & CORE) {
+    // shader core loading (pop from ICNT into core) follows CORE clock
+    for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++)
+      m_cluster[i]->icnt_cycle();
+  }
+  unsigned partiton_replys_in_parallel_per_cycle = 0;
+  if (clock_mask & ICNT) {
+    // pop from memory controller to interconnect
+    for (unsigned i = 0; i < m_memory_config->m_n_mem_sub_partition; i++) {
+      mem_fetch *mf = m_memory_sub_partition[i]->top();
+      if (mf) {
+        unsigned response_size =
+            mf->get_is_write() ? mf->get_ctrl_size() : mf->size();
+        if (::icnt_has_buffer(m_shader_config->mem2device(i), response_size)) {
+          // if (!mf->get_is_write())
+          mf->set_return_timestamp(gpu_sim_cycle + gpu_tot_sim_cycle);
+          mf->set_status(IN_ICNT_TO_SHADER, gpu_sim_cycle + gpu_tot_sim_cycle);
+          ::icnt_push(m_shader_config->mem2device(i), mf->get_tpc(), mf,
+                      response_size);
+          m_memory_sub_partition[i]->pop();
+          partiton_replys_in_parallel_per_cycle++;
+        } else {
+          gpu_stall_icnt2sh++;
+        }
+      } else {
+        m_memory_sub_partition[i]->pop();
+      }
+    }
+  }
+  partiton_replys_in_parallel += partiton_replys_in_parallel_per_cycle;
+
+  if (clock_mask & DRAM) {
+    for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) {
+      if (m_memory_config->simple_dram_model)
+        m_memory_partition_unit[i]->simple_dram_model_cycle();
+      else
+        m_memory_partition_unit[i]
+            ->dram_cycle();  // Issue the dram command (scheduler + delay model)
+      // Update performance counters for DRAM
+      m_memory_partition_unit[i]->set_dram_power_stats(
+          m_power_stats->pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_act[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i],
+          m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]);
+    }
+  }
+
+  // L2 operations follow L2 clock domain
+  unsigned partiton_reqs_in_parallel_per_cycle = 0;
+  if (clock_mask & L2) {
+    m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].clear();
+    for (unsigned i = 0; i < m_memory_config->m_n_mem_sub_partition; i++) {
+      // move memory request from interconnect into memory partition (if not
+      // backed up) Note:This needs to be called in DRAM clock domain if there
+      // is no L2 cache in the system In the worst case, we may need to push
+      // SECTOR_CHUNCK_SIZE requests, so ensure you have enough buffer for them
+      if (m_memory_sub_partition[i]->full(SECTOR_CHUNCK_SIZE)) {
+        gpu_stall_dramfull++;
+      } else {
+        mem_fetch *mf = (mem_fetch *)icnt_pop(m_shader_config->mem2device(i));
+        //if (mf) {
+        //  printf("MEM_FETCH DEBUG: gpgpu_sim::cycle :: mf info %p\n", mf);
+        //  mf->print(stdout);
+        //}
+        m_memory_sub_partition[i]->push(mf, gpu_sim_cycle + gpu_tot_sim_cycle);
+        if (mf) partiton_reqs_in_parallel_per_cycle++;
+      }
+      m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle);
+      if (m_config.g_power_simulation_enabled) {
+        m_memory_sub_partition[i]->accumulate_L2cache_stats(
+            m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]);
+      }
+    }
+  }
+  partiton_reqs_in_parallel += partiton_reqs_in_parallel_per_cycle;
+  if (partiton_reqs_in_parallel_per_cycle > 0) {
+    partiton_reqs_in_parallel_util += partiton_reqs_in_parallel_per_cycle;
+    gpu_sim_cycle_parition_util++;
+  }
+
+  if (clock_mask & ICNT) {
+    icnt_transfer();
+  }
+
+  if (clock_mask & CORE) {
+    // clear warp info collected so far
+    all_warps.clear();
+    fail_warps.clear();
+    // L1 cache + shader core pipeline stages
+    m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].clear();
+    for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
+      if (m_cluster[i]->get_not_completed() || get_more_cta_left()) {
+        m_cluster[i]->core_cycle();
+        *active_sms += m_cluster[i]->get_n_active_sms();
+      }
+      // Update core icnt/cache stats for AccelWattch
+      if (m_config.g_power_simulation_enabled) {
+        m_cluster[i]->get_icnt_stats(
+            m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i],
+            m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]);
+        m_cluster[i]->get_cache_stats(
+            m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]);
+      }
+      m_cluster[i]->get_current_occupancy(
+          gpu_occupancy.aggregate_warp_slot_filled,
+          gpu_occupancy.aggregate_theoretical_warp_slots);
+    }
+    float temp = 0;
+    for (unsigned i = 0; i < m_shader_config->num_shader(); i++) {
+      temp += m_shader_stats->m_pipeline_duty_cycle[i];
+    }
+    temp = temp / m_shader_config->num_shader();
+    *average_pipeline_duty_cycle = ((*average_pipeline_duty_cycle) + temp);
+    // cout<<"Average pipeline duty cycle:
+    // "<<*average_pipeline_duty_cycle<<endl;
+
+    if (g_single_step &&
+        ((gpu_sim_cycle + gpu_tot_sim_cycle) >= g_single_step)) {
+      raise(SIGTRAP);  // Debug breakpoint
+    }
+    gpu_sim_cycle++;
+
+    if (g_interactive_debugger_enabled) gpgpu_debug();
+
+      // McPAT main cycle (interface with McPAT)
+#ifdef GPGPUSIM_POWER_MODEL
+    if (m_config.g_power_simulation_enabled) {
+      if (m_config.g_power_simulation_mode == 0) {
+        mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper,
+                    m_power_stats, m_config.gpu_stat_sample_freq,
+                    gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn,
+                    gpu_sim_insn, m_config.g_dvfs_enabled);
+      }
+    }
+#endif
+
+    issue_block2core();
+    decrement_kernel_latency();
+
+    // Depending on configuration, invalidate the caches once all of threads are
+    // completed.
+    int all_threads_complete = 1;
+    if (m_config.gpgpu_flush_l1_cache) {
+      for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
+        if (m_cluster[i]->get_not_completed() == 0)
+          m_cluster[i]->cache_invalidate();
+        else
+          all_threads_complete = 0;
+      }
+    }
+
+    if (m_config.gpgpu_flush_l2_cache) {
+      if (!m_config.gpgpu_flush_l1_cache) {
+        for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {
+          if (m_cluster[i]->get_not_completed() != 0) {
+            all_threads_complete = 0;
             break;
           }
         }
@@ -2295,7 +5181,7 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() {
 
 const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; }
 
-simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; }
+//simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; }
 
 void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
   if (m_shader_config->n_simt_clusters != sst_numcores) {
@@ -2403,3 +5289,6 @@ void sst_gpgpu_sim::SST_cycle() {
   gpgpu_ctx->device_runtime->launch_one_device_kernel();
 #endif
 }
+simt_core_cluster *gpgpu_sim::getSIMTCluster(int index) { return m_cluster[index]; }  // index is not range-checked
+
+gmmu_t *gpgpu_sim::getGmmu() { return this->m_gmmu; }  // accessor for the m_gmmu member
diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h
index 68bdca72e..271e2da5a 100644
--- a/src/gpgpu-sim/gpu-sim.h
+++ b/src/gpgpu-sim/gpu-sim.h
@@ -37,7 +37,10 @@
 #include <fstream>
 #include <iostream>
 #include <list>
+#include <fstream>
+#include <functional> 
 #include "../abstract_hardware_model.h"
+#include "../cuda-sim/memory.h"
 #include "../option_parser.h"
 #include "../trace.h"
 #include "addrdec.h"
@@ -446,6 +449,7 @@ class gpgpu_sim_config : public power_config,
   unsigned get_core_freq() const { return core_freq; }
   unsigned num_shader() const { return m_shader_config.num_shader(); }
   unsigned num_cluster() const { return m_shader_config.n_simt_clusters; }
+  unsigned num_core_per_cluster() const { return m_shader_config.n_simt_cores_per_cluster; }
   unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; }
 
   /**
@@ -466,6 +470,7 @@ class gpgpu_sim_config : public power_config,
   }
 
   bool flush_l1() const { return gpgpu_flush_l1_cache; }
+  void convert_byte_string();
 
  private:
   void init_clock_domains(void);
@@ -507,6 +512,24 @@ class gpgpu_sim_config : public power_config,
   // statistics collection
   int gpu_stat_sample_freq;
   int gpu_runtime_stat_flag;
+  unsigned long long page_table_walk_latency;
+
+  int eviction_policy;
+  bool invalidate_clean;
+  float reserve_accessed_page_percent;
+  float free_page_buffer_percentage;
+
+  char *pcie_bandwidth_string;
+  float pcie_bandwidth;
+
+  float curve_a;
+  float curve_b;
+
+  int enable_dma;
+  int multiply_dma_penalty;
+  unsigned migrate_threshold;
+
+  bool enable_smart_runtime;
 
   // Device Limits
   size_t stack_size_limit;
@@ -521,6 +544,532 @@ class gpgpu_sim_config : public power_config,
 
   friend class gpgpu_sim;
   friend class sst_gpgpu_sim;
+
+  public:
+  int hardware_prefetch;
+  int hwprefetch_oversub;
+
+  friend class gmmu_t;
+};
+
+extern unsigned long long kernel_time;  // summed by kernel_stats::calculate()
+extern unsigned long long memory_copy_time_h2d;  // summed for memcpy_h2d events
+extern unsigned long long memory_copy_time_d2h;  // summed for memcpy_d2h events
+extern unsigned long long prefetch_time;  // summed for prefetch_breakdown events
+extern unsigned long long devicesync_time;  // summed for device_sync events
+extern unsigned long long writeback_time;  // summed for write_back events
+extern unsigned long long dma_time;  // summed for dma events
+
+enum stats_type {  // tag stored in event_stats::type
+  prefetch = 0,
+  prefetch_breakdown,  // the prefetch flavor that is added to prefetch_time
+  memcpy_h2d,
+  memcpy_d2h,
+  memcpy_d2d,
+  kernel_launching,  // set by kernel_stats
+  page_fault,  // set by page_fault_stats
+  device_sync,
+  write_back,
+  invalidate,
+  dma
+};
+
+class event_stats {  // abstract base of every timed profiling record
+public:
+  event_stats(enum stats_type t, unsigned long long s_time,
+              unsigned long long e_time)
+      : type(t), start_time(s_time), end_time(e_time) {}
+  event_stats(enum stats_type t, unsigned long long s_time)
+      : type(t), start_time(s_time), end_time(0) {}  // end_time starts at 0
+  virtual ~event_stats() {}  // records are held as event_stats*; keep delete-through-base well defined
+  enum stats_type type;
+  unsigned long long start_time;
+  unsigned long long end_time;
+  virtual void print(FILE *fout, float freq) = 0;  // write one report line to fout
+  virtual void calculate() = 0;  // fold this record into the global totals
+};
+
+// Profiled memory event (copy, prefetch, sync, write-back, invalidate, DMA).
+class memory_stats : public event_stats {
+public:
+  memory_stats(enum stats_type t, unsigned long long s_time, mem_addr_t s_addr,
+               size_t sz, unsigned s_id)
+      : event_stats(t, s_time), start_addr(s_addr), size(sz), stream_id(s_id) {}
+  memory_stats(enum stats_type t, unsigned long long s_time,
+               unsigned long long e_time, mem_addr_t s_addr, size_t sz,
+               unsigned s_id)
+      : event_stats(t, s_time, e_time), start_addr(s_addr), size(sz),
+        stream_id(s_id) {}
+  mem_addr_t start_addr;
+  size_t size;
+  unsigned stream_id;
+
+  // Prints one line to fout: start/end time, address (hex), size, stream id,
+  // a type tag (if the type has one) and (end_time - start_time) / freq.
+  virtual void print(FILE *fout, float freq) {
+    // Widen the address and use %zu so every argument matches its conversion
+    // regardless of how wide mem_addr_t and size_t are on the target.
+    fprintf(fout, "F: %8llu----T: %8llu \t St: %llx Sz: %zu \t Sm: %u \t ",
+            start_time, end_time, static_cast<unsigned long long>(start_addr),
+            size, stream_id);
+    const char *tag = nullptr;
+    switch (type) {
+      case memcpy_h2d: tag = "T: memcpy_h2d"; break;
+      case memcpy_d2h: tag = "T: memcpy_d2h"; break;
+      case memcpy_d2d: tag = "T: memcpy_d2d"; break;
+      case prefetch: tag = "T: prefetch"; break;
+      case prefetch_breakdown: tag = "T: prefetch_breakdown"; break;
+      case device_sync: tag = "T: device_sync"; break;
+      case write_back: tag = "T: write_back"; break;
+      case invalidate: tag = "T: invalidate"; break;
+      case dma: tag = "T: dma"; break;
+      default: break;  // kernel_launching / page_fault: no tag, as before
+    }
+    if (tag) fprintf(fout, "%s", tag);
+    fprintf(fout, "(%f)\n", ((float)(end_time - start_time)) / freq);
+  }
+  // Adds the elapsed time to the global accumulator matching the type; types
+  // with no accumulator (prefetch, memcpy_d2d, invalidate, ...) add nothing.
+  virtual void calculate() {
+    const unsigned long long elapsed = end_time - start_time;
+    switch (type) {
+      case memcpy_h2d: memory_copy_time_h2d += elapsed; break;
+      case memcpy_d2h: memory_copy_time_d2h += elapsed; break;
+      case prefetch_breakdown: prefetch_time += elapsed; break;
+      case device_sync: devicesync_time += elapsed; break;
+      case write_back: writeback_time += elapsed; break;
+      case dma: dma_time += elapsed; break;
+      default: break;
+    }
+  }
+};
+
+// Profiling record tagged kernel_launching; carries a kernel id and stream id.
+class kernel_stats : public event_stats {
+public:
+  kernel_stats(unsigned long long s_time, unsigned s_id, unsigned k_id)
+      : event_stats(kernel_launching, s_time), stream_id(s_id), kernel_id(k_id) {}
+  unsigned stream_id;
+  unsigned kernel_id;
+  // Emits start/end time, kernel id, stream id, then elapsed / freq.
+  virtual void print(FILE *fout, float freq) {
+    const unsigned long long elapsed = end_time - start_time;
+    fprintf(fout,
+            "F: %8llu----T: %8llu \t \t \t Kl: %u \t Sm: %u \t T: kernel_launch",
+            start_time, end_time, kernel_id, stream_id);
+    fprintf(fout, "(%f)\n", static_cast<float>(elapsed) / freq);
+  }
+  virtual void calculate() { kernel_time += (end_time - start_time); }  // feeds kernel_time
+};
+
+// Profiling record tagged page_fault; keeps the page list it was built from.
+class page_fault_stats : public event_stats {
+public:
+  page_fault_stats(unsigned long long s_time, const std::list<mem_addr_t> &pgs,
+                   unsigned sz)
+      : event_stats(page_fault, s_time), pages(pgs), transfering_pages(pgs),
+        size(sz) {}
+  std::list<mem_addr_t> pages;
+  std::list<mem_addr_t> transfering_pages;  // starts out as a copy of pages
+  size_t size;
+  virtual void print(FILE *fout, float freq) {
+    // %zu is the size_t conversion; %lu only matches where size_t is long.
+    fprintf(fout, "F: %8llu----T: %8llu \t Sz: %zu \t T: page_fault",
+            start_time, end_time, size);
+    fprintf(fout, "(%f)\n", ((float)(end_time - start_time)) / freq);
+  }
+  virtual void calculate() {}  // page faults add to no global accumulator
+};
+
+// To skip idle cycles caused by page faults, warp info is collected from all
+// cores; gpgpu_sim::cycle() clears both lists on a CORE tick before stepping.
+extern std::list<shd_warp_t *> all_warps;
+extern std::list<shd_warp_t *> fail_warps;
+extern bool skip_cycles;  // with skip_cycles_enable also set, cycle() returns early
+extern bool skip_cycles_enable;
+// Recorded profiling events; the key is presumably a timestamp — TODO confirm.
+extern std::map<unsigned long long, std::list<event_stats *>> sim_prof;
+
+extern bool sim_prof_enable;
+
+// Profiling helpers defined outside this header; they presumably act on sim_prof.
+void print_sim_prof(FILE *fout, float freq);
+
+void calculate_sim_prof(FILE *fout, gpgpu_sim *gpu);
+
+void update_sim_prof_kernel(unsigned kernel_id, unsigned long long end_time);
+
+void update_sim_prof_prefetch(mem_addr_t start_addr, size_t size,
+                              unsigned long long end_time);
+
+void update_sim_prof_prefetch_break_down(unsigned long long end_time);
+
+void print_UVM_stats(gpgpu_new_stats *new_stats, gpgpu_sim *gpu, FILE *fout);
+
+class access_info {  // element type of gpgpu_new_stats::time_and_page_access
+public:
+  mem_addr_t page_no;
+  mem_addr_t mem_addr;
+  size_t size;
+  unsigned long long cycle;
+  bool is_read;  // initialized from the constructor's `rw` argument
+  unsigned sm_id;
+  unsigned warp_id;
+  // Plain member-wise constructor; arguments map one-to-one onto the fields.
+  access_info(mem_addr_t p_n, mem_addr_t addr, size_t s, unsigned long long c,
+              bool rw, unsigned s_id, unsigned w_id)
+      : page_no(p_n), mem_addr(addr), size(s), cycle(c), is_read(rw),
+        sm_id(s_id), warp_id(w_id) {}
+};
+
+class gpgpu_new_stats {
+public:
+  gpgpu_new_stats(const gpgpu_sim_config &config);
+  ~gpgpu_new_stats();
+  void print(FILE *fout) const;
+  void print_pcie(FILE *fout) const;
+  void print_access_pattern_detail(FILE *fout) const;
+  void print_access_pattern(FILE *fout) const;
+  void print_time_and_access(FILE *fout) const;
+
+  // per-shader counters over all global memory accesses
+  // (pointer members are presumably arrays indexed by shader id — TODO confirm)
+  // tlb hit
+  unsigned long long *tlb_hit;
+  // tlb miss
+  unsigned long long *tlb_miss;
+
+  // tlb validate
+  unsigned long long *tlb_val;
+  // tlb eviction
+  unsigned long long *tlb_evict;
+  // tlb invalidated by page eviction
+  unsigned long long *tlb_page_evict;
+
+  // on tlb miss, page hit
+  unsigned long long *mf_page_hit;
+  // on tlb miss, page miss
+  unsigned long long *mf_page_miss;
+
+  // on tlb miss and page miss: the first request, which creates the fault
+  unsigned long long mf_page_fault_outstanding;
+  // on tlb miss and page miss: the following requests appended to the mshr
+  unsigned long long mf_page_fault_pending;
+
+  unsigned long long page_evict_dirty;
+
+  unsigned long long page_evict_not_dirty;
+
+  // prefetch page hit
+  unsigned long long pf_page_hit;
+  // prefetch page miss
+  unsigned long long pf_page_miss;
+  // prefetch fault page size, large page and latency
+  std::vector<std::pair<unsigned long, unsigned long long>> pf_fault_latency;
+
+  // for each page, how many times it has been accessed by each shader
+  std::map<mem_addr_t, unsigned> *page_access_times;
+
+  // for each timestamp, which page is being accessed
+  std::list<access_info> time_and_page_access;
+
+  // read lanes utilization
+  std::list<std::pair<unsigned long long, float>> pcie_read_utilization;
+  // write lanes utilization
+  std::list<std::pair<unsigned long long, float>> pcie_write_utilization;
+
+  // page and its pattern
+  std::map<mem_addr_t, std::vector<bool>> page_thrashing;
+  // tlb and its pattern
+  std::map<mem_addr_t, std::vector<bool>> *tlb_thrashing;
+
+  // for each shader, the memory access latency
+  std::map<unsigned, std::pair<bool, unsigned long long>> *ma_latency;
+
+  // for a mf that faults (not pending on a prefetch): cycle stamps per page
+  std::map<mem_addr_t, std::list<unsigned long long>> mf_page_fault_latency;
+
+  // for prefetch, the latency of each small page
+  std::map<mem_addr_t, std::list<unsigned long long>> pf_page_fault_latency;
+
+  const gpgpu_sim_config &m_config;
+
+  unsigned long long num_dma;
+  unsigned long long dma_page_transfer_read;
+  unsigned long long dma_page_transfer_write;
+};
+
+// this class simulates the on-chip gmmu unit
+
+class gmmu_t {
+public:
+  gmmu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config,
+         class gpgpu_new_stats *new_stats);
+  unsigned long long calculate_transfer_time(size_t data_size);
+  void calculate_devicesync_time(size_t data_size);
+  void cycle();  // stepped by gpgpu_sim::cycle() on GMMU clock ticks
+  void register_tlbflush_callback(std::function<void(mem_addr_t)> cb_tlb);
+  void tlb_flush(mem_addr_t page_num);
+  void page_eviction_procedure();
+  bool is_block_evictable(mem_addr_t bb_addr, size_t size);
+
+  // add a new accessed page or refresh the position of the page in the LRU page
+  // list being called on detecting tlb hit or when memory fetch comes back from
+  // the upward (gmmu to cu) queue
+  void refresh_valid_pages(mem_addr_t page_addr);
+  void sort_valid_pages();
+
+  // check whether the page to be accessed is already in pci-e write stage queue
+  // being called on tlb hit or on tlb miss but no page fault
+  void check_write_stage_queue(mem_addr_t page_num, bool refresh);
+
+  void valid_pages_erase(mem_addr_t pagenum);
+  void valid_pages_clear();
+
+  void register_prefetch(mem_addr_t m_device_addr,
+                         mem_addr_t m_device_allocation_ptr, size_t m_cnt,
+                         struct CUstream_st *m_stream);
+  void activate_prefetch(mem_addr_t m_device_addr, size_t m_cnt,
+                         struct CUstream_st *m_stream);
+
+  struct lp_tree_node *build_lp_tree(mem_addr_t addr, size_t size);
+  void reset_large_page_info(struct lp_tree_node *node);
+  void reset_lp_tree_node(struct lp_tree_node *node);
+  struct lp_tree_node *get_lp_node(mem_addr_t addr);
+  void evict_whole_tree(struct lp_tree_node *root);
+  mem_addr_t update_basic_block(struct lp_tree_node *root, mem_addr_t addr,
+                                size_t size, bool prefetch);
+  mem_addr_t get_basic_block(struct lp_tree_node *root, mem_addr_t addr);
+
+  void fill_lp_tree(struct lp_tree_node *node,
+                    std::set<mem_addr_t> &scheduled_basic_blocks);
+  void remove_lp_tree(struct lp_tree_node *node,
+                      std::set<mem_addr_t> &scheduled_basic_blocks);
+  void traverse_and_fill_lp_tree(struct lp_tree_node *node,
+                                 std::set<mem_addr_t> &scheduled_basic_blocks);
+  void
+  traverse_and_remove_lp_tree(struct lp_tree_node *node,
+                              std::set<mem_addr_t> &scheduled_basic_blocks);
+
+  bool pcie_transfers_completed();
+
+  void initialize_large_page(mem_addr_t start_addr, size_t size);
+
+  unsigned long long get_ready_cycle(unsigned num_pages);
+  unsigned long long get_ready_cycle_dma(unsigned size);
+
+  float get_pcie_utilization(unsigned num_pages);
+
+  void do_hardware_prefetch(
+      std::map<mem_addr_t, std::list<mem_fetch *>> &page_fault_this_turn);
+
+  void reserve_pages_insert(mem_addr_t addr, unsigned mem_access_uid);
+  void reserve_pages_remove(mem_addr_t addr, unsigned mem_access_uid);
+  bool reserve_pages_check(mem_addr_t addr);
+
+  // std::unordered_map<mem_addr_t, page_table_entry_t> page_table;
+
+  // mem_addr_t get_page_number(mem_addr_t addr) {
+  //   return addr >> m_log2_page_size;
+  // }
+  // void page_table_insert(mem_addr_t page_num, mem_addr_t device_addr,
+  //                       mem_addr_t allocation_ptr, size_t size);
+  // void page_table_erase(mem_addr_t page_num);
+  // void page_table_clear();
+  std::list<mem_addr_t> get_faulty_pages(mem_addr_t addr, size_t length);
+
+  std::map<mem_addr_t, std::list<unsigned>> reserve_pages;
+  // called when over-subscription is first detected and smart runtime is off
+  void update_hardware_prefetcher_oversubscribed();
+
+  // update paging, pinning, and eviction decision based on memory access
+  // pattern under oversubscription
+  void update_memory_management_policy();
+  void log_kernel_info(unsigned kernel_id, unsigned long long time,
+                       bool finish);
+
+  void reset_large_page_info();
+
+  mem_addr_t get_eviction_base_addr(mem_addr_t page_addr);
+  size_t get_eviction_granularity(mem_addr_t page_addr);
+
+  int get_bb_access_counter(struct lp_tree_node *node, mem_addr_t addr);
+  int get_bb_round_trip(struct lp_tree_node *node, mem_addr_t addr);
+  void inc_bb_access_counter(mem_addr_t addr);
+  void inc_bb_round_trip(struct lp_tree_node *root);
+  void traverse_and_reset_access_counter(struct lp_tree_node *root);
+  void reset_bb_access_counter();
+  void traverse_and_reset_round_trip(struct lp_tree_node *root);
+  void reset_bb_round_trip();
+  void update_access_type(mem_addr_t addr, int type);
+
+  bool should_cause_page_migration(mem_addr_t addr, bool is_write);
+
+private:
+  unsigned m_log2_page_size;
+  // data structure for page_table_entry
+  struct page_table_entry_t {
+    // mem_addr_t page_num;
+    // mem_addr_t device_addr;
+    // mem_addr_t allocation_ptr;
+    size_t size;
+    bool valid;
+    bool accessed;
+    bool dirty;
+    // unsigned long long last_access_cycle;
+    // unsigned long long last_access_time;
+  };
+
+  // data structure to wrap memory fetch and page table walk delay
+  struct page_table_walk_latency_t {
+    mem_fetch *mf;
+    unsigned long long ready_cycle;
+  };
+
+  // page table walk delay queue
+  std::list<page_table_walk_latency_t> page_table_walk_queue;
+
+  enum class latency_type {
+    PCIE_READ,
+    PCIE_WRITE_BACK,
+    INVALIDATE,
+    PAGE_FAULT,
+    DMA
+  };
+
+  // data structure to wrap a memory page and delay to transfer over PCI-E
+  struct pcie_latency_t {
+    mem_addr_t start_addr;
+    unsigned long long size;  // set to page_list.size() * page size for reads
+    std::list<mem_addr_t> page_list;
+    unsigned long long ready_cycle;
+
+    mem_fetch *mf;
+    latency_type type;
+  };
+
+  // staging queue to hold the PCI-E requests waiting for scheduling
+  std::list<pcie_latency_t *> pcie_read_stage_queue;
+  std::list<pcie_latency_t *> pcie_write_stage_queue;
+
+  // read queue for fetching the page from host side
+  // the request may be global memory's read (load)/ write (store)
+  pcie_latency_t *pcie_read_latency_queue;
+
+  // write back queue for page eviction requests over PCI-E
+  pcie_latency_t *pcie_write_latency_queue;
+
+  // loosely represent MSHRs to hold all memory fetches
+  // corresponding to a PCI-E read requests, i.e., a common page number
+  // to replay the memory fetch back upon completion
+  std::map<mem_addr_t, std::list<mem_fetch *>> req_info;
+
+  // need the gpu to do address translation, validate page
+  class gpgpu_sim *m_gpu;
+
+  // config file
+  const gpgpu_sim_config &m_config;
+  const struct shader_core_config *m_shader_config;
+
+  // callback functions to invalidate the tlb in ldst unit
+  std::list<std::function<void(mem_addr_t)>> callback_tlb_flush;
+
+  // list of valid pages (valid = 1, accessed = 1/0, dirty = 1/0) ordered as LRU
+  std::list<eviction_t *> valid_pages;
+
+  // page eviction policy
+  enum class eviction_policy { LRU, TBN, SEQUENTIAL_LOCAL, RANDOM, LFU, LRU4K };
+
+  // types of hardware prefetcher (identifier spelling is relied on elsewhere)
+  enum class hwardware_prefetcher { DISBALED, TBN, SEQUENTIAL_LOCAL, RANDOM };
+
+  // types of hardware prefetcher under over-subscription
+  enum class hwardware_prefetcher_oversub {
+    DISBALED,
+    TBN,
+    SEQUENTIAL_LOCAL,
+    RANDOM
+  };
+
+  // type of DMA
+  enum class dma_type { DISABLED, ADAPTIVE, ALWAYS, OVERSUB };
+
+  // type of memory access pattern per data structure
+  enum class ds_pattern {
+    UNDECIDED,
+    RANDOM,
+    LINEAR,
+    MIXED,
+    RANDOM_REUSE,
+    LINEAR_REUSE,
+    MIXED_REUSE
+  };
+
+  // list of scheduled basic blocks by their timestamps
+  std::list<std::pair<unsigned long long, mem_addr_t>> block_access_list;
+
+  // list of launch and finish cycle of kernels keyed by id
+  std::map<unsigned, std::pair<unsigned long long, unsigned long long>>
+      kernel_info;
+
+  eviction_policy evict_policy;
+  hwardware_prefetcher prefetcher;
+  hwardware_prefetcher_oversub oversub_prefetcher;
+
+  dma_type dma_mode;
+
+  struct prefetch_req {
+    // starting address (rolled up and down for page alignment) for the prefetch
+    mem_addr_t start_addr;
+
+    // current address from the start up to which PCI-e has already processed
+    mem_addr_t cur_addr;
+
+    // starting address of the current variable allocation
+    mem_addr_t allocation_addr;
+
+    // total size (rolled up and down for page alignment) for the prefetch
+    size_t size;
+
+    // stream associated to the prefetch
+    CUstream_st *m_stream;
+
+    // memory fetches, which are created upon page fault and are depending on
+    // current prefetch, aggregated before the prefetch is actually scheduled
+    std::map<mem_addr_t, std::list<mem_fetch *>> incoming_replayable_nacks;
+
+    // memory fetches that are finished PCI-e transfer are aggregated to be
+    // replayed together upon completion of the prefetch
+    std::map<mem_addr_t, std::list<mem_fetch *>> outgoing_replayable_nacks;
+
+    // list of pages (max upto 2MB) from the current prefetch request which are
+    // being served by PCI-e
+    std::list<mem_addr_t> pending_prefetch;
+
+    // stream manager upon reaching to this entry of the queue sets it to active
+    bool active;
+  };
+
+  std::list<prefetch_req> prefetch_req_buffer;
+
+  std::list<event_stats *> fault_stats;
+  std::list<event_stats *> writeback_stats;
+
+  std::list<struct lp_tree_node *> large_page_info;
+  size_t total_allocation_size;
+
+  bool over_sub;  // latched true once should_evict_page() first returns true
+
+  class gpgpu_new_stats *m_new_stats;
+};
+
+struct lp_tree_node {  // tree node handed out by gmmu_t::get_lp_node()
+  mem_addr_t addr;  // used as the key when grouping faulting pages per node
+  size_t size;
+  size_t valid_size;
+  struct lp_tree_node *left;  // child links
+  struct lp_tree_node *right;
+  uint32_t access_counter;
+  uint8_t RW;  // NOTE(review): presumably a read/write access tag — confirm
 };
 
 struct occupancy_stats {
@@ -654,7 +1203,9 @@ class gpgpu_sim : public gpgpu_t {
    * Returning the cluster of of the shader core, used by the functional
    * simulation so far
    */
-  simt_core_cluster *getSIMTCluster();
+  simt_core_cluster *getSIMTCluster(int index);
+
+  gmmu_t *getGmmu();
 
   void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
                       const ptx_instruction *pI);
@@ -687,6 +1238,7 @@ class gpgpu_sim : public gpgpu_t {
 
  protected:
   ///// data /////
+  class gmmu_t *m_gmmu;
   class simt_core_cluster **m_cluster;
   class memory_partition_unit **m_memory_partition_unit;
   class memory_sub_partition **m_memory_sub_partition;
@@ -709,6 +1261,7 @@ class gpgpu_sim : public gpgpu_t {
   double icnt_time;
   double dram_time;
   double l2_time;
+  double gmmu_time;
 
   // debug
   bool gpu_deadlock;
@@ -744,6 +1297,9 @@ class gpgpu_sim : public gpgpu_t {
                                       // stat printout
   virtual void createSIMTCluster() = 0;
 
+public:
+  class gpgpu_new_stats *m_new_stats;
+
  public:
   unsigned long long gpu_sim_insn;
   unsigned long long gpu_tot_sim_insn;
diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc
index 809c92081..210110794 100644
--- a/src/gpgpu-sim/mem_fetch.cc
+++ b/src/gpgpu-sim/mem_fetch.cc
@@ -47,8 +47,11 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst,
   if (inst) {
     m_inst = *inst;
     assert(wid == m_inst.warp_id());
+    //printf("MEM_FETCH DEBUG: mem_fetch.cc :: mf %p is formed and m_inst info, mem_access.m_uid=%d\n", this, m_access.get_uid());
+    //m_inst.print_insn(stdout);
   }
   m_streamID = streamID;
+  m_split = false;
   m_data_size = access.get_size();
   m_ctrl_size = ctrl_size;
   m_sid = sid;
@@ -70,12 +73,18 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst,
   m_status_change = cycle;
   m_mem_config = config;
   icnt_flit_size = config->icnt_flit_size;
+
+  m_dma = false;
+
   original_mf = m_original_mf;
   original_wr_mf = m_original_wr_mf;
   if (m_original_mf) {
     m_raw_addr.chip = m_original_mf->get_tlx_addr().chip;
     m_raw_addr.sub_partition = m_original_mf->get_tlx_addr().sub_partition;
   }
+  // MEM_FETCH DEBUG
+  //printf("MEM_FETCH DEBUG: mem_fetch.cc :: mf %p is formed, mem_access.m_uid=%d\n", this, m_access.get_uid());
+  //print(stdout);
 }
 
 mem_fetch::~mem_fetch() { m_status = MEM_FETCH_DELETED; }
diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h
index 770421822..a85d4ec08 100644
--- a/src/gpgpu-sim/mem_fetch.h
+++ b/src/gpgpu-sim/mem_fetch.h
@@ -101,6 +101,7 @@ class mem_fetch {
   enum mf_type get_type() const { return m_type; }
   bool isatomic() const;
 
+  mem_access_t get_mem_access() const { return m_access; }  // copy of m_access
   void set_return_timestamp(unsigned t) { m_timestamp2 = t; }
   void set_icnt_receive_time(unsigned t) { m_icnt_receive_time = t; }
   unsigned get_timestamp() const { return m_timestamp; }
@@ -120,17 +121,28 @@ class mem_fetch {
   }
 
   address_type get_pc() const { return m_inst.empty() ? -1 : m_inst.pc; }
-  const warp_inst_t &get_inst() { return m_inst; }
+
+  //Changed to non-const type
+  warp_inst_t &get_inst() { return m_inst; }
   enum mem_fetch_status get_status() const { return m_status; }
 
   const memory_config *get_mem_config() { return m_mem_config; }
 
   unsigned get_num_flits(bool simt_to_mem);
 
+  bool is_dma() const { return m_dma; }  // true once set_dma() has been called
+  void set_dma() { m_dma = true; }
+
   mem_fetch *get_original_mf() { return original_mf; }
   mem_fetch *get_original_wr_mf() { return original_wr_mf; }
 
+  void set_split() { m_split = true; }
+  bool is_split() const { return m_split; }  // true once set_split() was called
+
  private:
+  // Is this mf being split?
+  bool m_split;
+  
   // request source information
   unsigned m_request_uid;
   unsigned m_sid;
@@ -172,6 +184,8 @@ class mem_fetch {
   const memory_config *m_mem_config;
   unsigned icnt_flit_size;
 
+  bool m_dma;
+
   mem_fetch
       *original_mf;  // this pointer is set up when a request is divided into
                      // sector requests at L2 cache (if the req size > L2 sector
diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc
index 1ea011e45..1ef795b3d 100644
--- a/src/gpgpu-sim/shader.cc
+++ b/src/gpgpu-sim/shader.cc
@@ -33,6 +33,7 @@
 #include "shader.h"
 #include <float.h>
 #include <limits.h>
+#include <list>
 #include <string.h>
 #include "../../libcuda/gpgpu_context.h"
 #include "../cuda-sim/cuda-sim.h"
@@ -448,9 +449,10 @@ void shader_core_ctx::create_exec_pipeline() {
     }
   }
 
-  m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this,
+  m_ldst_unit = new ldst_unit(m_gpu, m_icnt, m_mem_fetch_allocator, this,
                               &m_operand_collector, m_scoreboard, m_config,
-                              m_memory_config, m_stats, m_sid, m_tpc, m_gpu);
+                              m_memory_config, m_stats, m_new_stats, m_sid, 
+                              m_tpc);
   m_fu.push_back(m_ldst_unit);
   m_dispatch_port.push_back(ID_OC_MEM);
   m_issue_port.push_back(OC_EX_MEM);
@@ -503,6 +505,47 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu,
   m_occupied_cta_to_hwtid.clear();
 }
 
+shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu,
+                                 class simt_core_cluster *cluster,
+                                 unsigned shader_id, unsigned tpc_id,
+                                 const shader_core_config *config,
+                                 const memory_config *mem_config,
+                                 shader_core_stats *stats,
+                                 class gpgpu_new_stats *new_stats)
+    : core_t(gpu, NULL, config->warp_size, config->n_thread_per_shader),
+      m_barriers(this, config->max_warps_per_shader, config->max_cta_per_core,
+                 config->max_barriers_per_cta, config->warp_size),
+      m_active_warps(0),
+      m_dynamic_warp_id(0) {
+  // position of this core: shader id, TPC id and owning cluster
+  m_sid = shader_id;
+  m_tpc = tpc_id;
+  m_cluster = cluster;
+
+  // configuration and statistics objects handed in by the caller
+  m_config = config;
+  m_memory_config = mem_config;
+  m_stats = stats;
+  m_new_stats = new_stats;
+
+  Issue_Prio = 0;
+  m_last_inst_gpu_sim_cycle = 0;
+  m_last_inst_gpu_tot_sim_cycle = 0;
+
+  // scaling coefficients are fetched only when power simulation is enabled
+  if (get_gpu()->get_config().g_power_simulation_enabled) {
+    scaling_coeffs = get_gpu()->get_scaling_coeffs();
+  }
+
+  // Jin: occupancy bookkeeping for concurrent kernels on a SM starts empty
+  m_occupied_n_threads = 0;
+  m_occupied_shmem = 0;
+  m_occupied_regs = 0;
+  m_occupied_ctas = 0;
+  m_occupied_hwtid.reset();
+  m_occupied_cta_to_hwtid.clear();
+}
+
 void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread,
                              bool reset_not_completed) {
   if (reset_not_completed) {
@@ -948,7 +991,8 @@ void shader_core_ctx::fetch() {
         // reclaimed
         if (m_warp[warp_id]->hardware_done() &&
             !m_scoreboard->pendingWrites(warp_id) &&
-            !m_warp[warp_id]->done_exit()) {
+            !m_warp[warp_id]->done_exit() &&
+            m_gpu->getGmmu()->pcie_transfers_completed()) {
           bool did_exit = false;
           for (unsigned t = 0; t < m_config->warp_size; t++) {
             unsigned tid = warp_id * m_config->warp_size + t;
@@ -1044,6 +1088,13 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set,
   m_warp[warp_id]->ibuffer_free();
   assert(next_inst->valid());
   **pipe_reg = *next_inst;  // static instruction information
+
+  if (g_debug_execution >= 6 ) {
+    printf("MEM_FETCH DEBUG: shader_core_ctx::issue_warp - inst info\n");
+    (*pipe_reg)->print_insn(stdout);
+    printf("\n");
+  }
+
   (*pipe_reg)->issue(
       active_mask, warp_id, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,
       m_warp[warp_id]->get_dynamic_warp_id(), sch_id,
@@ -1273,6 +1324,9 @@ void scheduler_unit::cycle() {
     if ((*iter) == NULL || (*iter)->done_exit()) {
       continue;
     }
+    if (!(*iter)->waiting()) {
+      all_warps.push_back(*iter);
+    }
     SCHED_DPRINTF("Testing (warp_id %u, dynamic_warp_id %u)\n",
                   (*iter)->get_warp_id(), (*iter)->get_dynamic_warp_id());
     unsigned warp_id = (*iter)->get_warp_id();
@@ -1507,9 +1561,11 @@ void scheduler_unit::cycle() {
 
             }  // end of else
           } else {
+            fail_warps.push_back((*iter));
             SCHED_DPRINTF(
                 "Warp (warp_id %u, dynamic_warp_id %u) fails scoreboard\n",
                 (*iter)->get_warp_id(), (*iter)->get_dynamic_warp_id());
+            //m_scoreboard->printContents();
           }
         }
       } else if (valid) {
@@ -1994,6 +2050,88 @@ bool ldst_unit::shared_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail,
   return !stall;
 }
 
+// Handles the L1 outcome for the request at the head of m_gmmu_cu_queue:
+// HIT retires the access here, MISS/HIT_RESERVED only dequeue it, and
+// RESERVATION_FAIL leaves it queued and reports COAL_STALL to the caller.
+mem_stage_stall_type ldst_unit::process_managed_cache_access(
+    cache_t *cache, new_addr_type address, std::list<cache_event> &events,
+    mem_fetch *mf, enum cache_request_status status) {
+  mem_stage_stall_type result = NO_RC_FAIL;
+  bool write_sent = was_write_sent(events);
+  bool read_sent = was_read_sent(events);
+  if (write_sent) m_core->inc_store_req(mf->get_inst().warp_id());
+  if (status == HIT) {
+    assert(!read_sent);
+    m_core->dec_managed_access_req(mf->get_wid());
+    m_gmmu_cu_queue.pop_front();
+    if (mf->get_inst().is_load()) {
+      // cover every output slot (MAX_OUTPUT_VALUES), as process_cache_access()
+      for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
+        if (mf->get_inst().out[r] > 0)
+          m_pending_writes[mf->get_inst().warp_id()][mf->get_inst().out[r]]--;
+
+      bool pending_requests = false;
+      const warp_inst_t &pipe_reg = mf->get_inst();
+      unsigned warp_id = mf->get_wid();
+      for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) {
+        unsigned reg_id = pipe_reg.out[r];
+        if (reg_id > 0) {
+          if (m_pending_writes[warp_id].find(reg_id) !=
+              m_pending_writes[warp_id].end()) {
+            if (m_pending_writes[warp_id][reg_id] > 0) {
+              pending_requests = true;
+              break;
+            } else {
+              // this instruction is done already
+              m_pending_writes[warp_id].erase(reg_id);
+            }
+          }
+        }
+      }
+      if (!pending_requests) {
+        m_core->warp_inst_complete(pipe_reg);
+        m_scoreboard->releaseRegisters(&pipe_reg);
+      }
+    }
+    // if (mf->get_mem_access().get_type() == GLOBAL_ACC_R
+        // && m_core->get_gpu()->get_global_memory()->is_page_managed(
+        //     mf->get_mem_access().get_addr(), mf->get_mem_access().get_size())
+    // ) {
+    //   if (!mf->is_split()) {
+    //     m_core->get_gpu()->getGmmu()->reserve_pages_remove(mf->get_mem_access().get_addr(),
+    //                                          mf->get_mem_access().get_uid());
+    //   }
+    // }
+
+    if (!write_sent) {
+      if (mf->get_mem_access().get_type() == GLOBAL_ACC_R ||
+          mf->get_mem_access().get_type() == GLOBAL_ACC_W) {
+        assert(m_new_stats->ma_latency[m_sid].find(
+                   mf->get_mem_access().get_uid()) !=
+               m_new_stats->ma_latency[m_sid].end());
+        m_new_stats->ma_latency[m_sid][mf->get_mem_access().get_uid()].first =
+            true;
+        m_new_stats->ma_latency[m_sid][mf->get_mem_access().get_uid()].second =
+            m_core->get_gpu()->gpu_tot_sim_cycle + m_core->get_gpu()->gpu_sim_cycle -
+            m_new_stats->ma_latency[m_sid][mf->get_mem_access().get_uid()]
+                .second;
+      }
+      delete mf;
+    }
+  } else if (status == RESERVATION_FAIL) {
+    result = COAL_STALL;
+    assert(!read_sent);
+    assert(!write_sent);
+  } else {
+    assert(status == MISS || status == HIT_RESERVED);
+    // inst.clear_active( access.get_warp_mask() ); // threads in mf writeback
+    // when mf returns
+    m_core->dec_managed_access_req(mf->get_wid());
+    m_gmmu_cu_queue.pop_front();
+  }
+  return result;
+}
+
 mem_stage_stall_type ldst_unit::process_cache_access(
     cache_t *cache, new_addr_type address, warp_inst_t &inst,
     std::list<cache_event> &events, mem_fetch *mf,
@@ -2011,7 +2149,7 @@ mem_stage_stall_type ldst_unit::process_cache_access(
   }
   if (status == HIT) {
     assert(!read_sent);
-    inst.accessq_pop_back();
+    inst.accessq_pop_front(); //Yechen: pop_back before;
     if (inst.is_load()) {
       for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
         if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--;
@@ -2024,7 +2162,33 @@ mem_stage_stall_type ldst_unit::process_cache_access(
         }
       }
     }
-    if (!write_sent) delete mf;
+
+    // if (mf->get_mem_access().get_type() == GLOBAL_ACC_R 
+        // && m_core->get_gpu()->get_global_memory()->is_page_managed(
+        //     mf->get_mem_access().get_addr(), mf->get_mem_access().get_size())
+    // ) {
+    //   if (!mf->is_split()) {
+    //     m_core->get_gpu()->getGmmu()->reserve_pages_remove(mf->get_mem_access().get_addr(),
+    //                                          mf->get_mem_access().get_uid());
+    //   }
+    // }
+
+    if (!write_sent) {
+      if (mf->get_mem_access().get_type() == GLOBAL_ACC_R ||
+          mf->get_mem_access().get_type() == GLOBAL_ACC_W) {
+        assert(m_new_stats->ma_latency[m_sid].find(
+                   mf->get_mem_access().get_uid()) !=
+               m_new_stats->ma_latency[m_sid].end());
+        m_new_stats->ma_latency[m_sid][mf->get_mem_access().get_uid()].first =
+            true;
+        m_new_stats->ma_latency[m_sid][mf->get_mem_access().get_uid()].second =
+            m_core->get_gpu()->gpu_tot_sim_cycle + m_core->get_gpu()->gpu_sim_cycle -
+            m_new_stats->ma_latency[m_sid][mf->get_mem_access().get_uid()]
+                .second;
+      }
+      delete mf;
+    }
+
   } else if (status == RESERVATION_FAIL) {
     result = BK_CONF;
     assert(!read_sent);
@@ -2034,12 +2198,36 @@ mem_stage_stall_type ldst_unit::process_cache_access(
     assert(status == MISS || status == HIT_RESERVED);
     // inst.clear_active( access.get_warp_mask() ); // threads in mf writeback
     // when mf returns
-    inst.accessq_pop_back();
+    // if (mf->get_mem_access().get_type() == GLOBAL_ACC_R 
+      // && m_core->get_gpu()->get_global_memory()->is_page_managed(
+      //       mf->get_mem_access().get_addr(), mf->get_mem_access().get_size())
+    // ) {
+    //   if (!mf->is_split()) {
+    //     m_core->get_gpu()->getGmmu()->reserve_pages_remove(mf->get_mem_access().get_addr(),
+    //                                          mf->get_mem_access().get_uid());
+    //   }
+    // }
+
+    inst.accessq_pop_front();
   }
   if (!inst.accessq_empty() && result == NO_RC_FAIL) result = COAL_STALL;
   return result;
 }
 
+mem_stage_stall_type
+ldst_unit::process_managed_memory_access_queue(cache_t *cache) {
+  if (!cache->data_port_free()) return DATA_PORT_STALL;  // port busy this cycle
+  // look up the request at the head of m_gmmu_cu_queue at the current cycle
+  mem_fetch *head = m_gmmu_cu_queue.front();
+  const auto now =
+      m_core->get_gpu()->gpu_sim_cycle + m_core->get_gpu()->gpu_tot_sim_cycle;
+  std::list<cache_event> cache_events;
+  const enum cache_request_status outcome =
+      cache->access(head->get_addr(), head, now, cache_events);
+  return process_managed_cache_access(cache, head->get_addr(), cache_events,
+                                      head, outcome);
+}
+
 mem_stage_stall_type ldst_unit::process_memory_access_queue(cache_t *cache,
                                                             warp_inst_t &inst) {
   mem_stage_stall_type result = NO_RC_FAIL;
@@ -2049,7 +2237,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue(cache_t *cache,
 
   // const mem_access_t &access = inst.accessq_back();
   mem_fetch *mf = m_mf_allocator->alloc(
-      inst, inst.accessq_back(),
+      inst, inst.accessq_front(),
       m_core->get_gpu()->gpu_sim_cycle + m_core->get_gpu()->gpu_tot_sim_cycle);
   std::list<cache_event> events;
   enum cache_request_status status = cache->access(
@@ -2071,7 +2259,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache(
       if (inst.accessq_empty()) return result;
 
       mem_fetch *mf =
-          m_mf_allocator->alloc(inst, inst.accessq_back(),
+          m_mf_allocator->alloc(inst, inst.accessq_front(),
                                 m_core->get_gpu()->gpu_sim_cycle +
                                     m_core->get_gpu()->gpu_tot_sim_cycle);
       unsigned bank_id = m_config->m_L1D_config.set_bank(mf->get_addr());
@@ -2090,8 +2278,13 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache(
           for (unsigned i = 0; i < inc_ack; ++i)
             m_core->inc_store_req(inst.warp_id());
         }
-
-        inst.accessq_pop_back();
+        mem_stage_access_type type;
+        mem_addr_t page_no =
+          m_core->get_gpu()->get_global_memory()->get_page_num(inst.accessq_front().get_addr());
+        unsigned front = inst.accessq_front().get_uid();
+        tlb_cycle(inst, result, type, page_no);
+        if (!inst.accessq_empty() && inst.accessq_front().get_uid() == front)
+          inst.accessq_pop_front();
       } else {
         result = BK_CONF;
         m_stats->gpgpu_n_l1cache_bkconflict++;
@@ -2105,7 +2298,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache(
     return result;
   } else {
     mem_fetch *mf =
-        m_mf_allocator->alloc(inst, inst.accessq_back(),
+        m_mf_allocator->alloc(inst, inst.accessq_front(),
                               m_core->get_gpu()->gpu_sim_cycle +
                                   m_core->get_gpu()->gpu_tot_sim_cycle);
     std::list<cache_event> events;
@@ -2257,73 +2450,379 @@ bool ldst_unit::texture_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail,
   return inst.accessq_empty();  // done if empty.
 }
 
-bool ldst_unit::memory_cycle(warp_inst_t &inst,
+bool ldst_unit::is_in_tlb(mem_addr_t page_num) {  // linear search of the TLB
+  return tlb.end() != std::find(tlb.begin(), tlb.end(), page_num);
+}
+
+bool ldst_unit::remove_tlb_entry(mem_addr_t page_num) {
+  // nothing to drop (and false is returned) when the page is not in the TLB
+  if (!is_in_tlb(page_num)) {
+    return false;
+  }
+  tlb.remove(page_num);
+  return true;
+}
+
+// Makes page_num the newest TLB entry: a page already present is moved to the
+// back; a new page is appended after evicting the front (oldest) entry when
+// the TLB has reached m_core_config->tlb_size entries.
+void ldst_unit::refresh_tlb(mem_addr_t page_num) {
+  if (!is_in_tlb(page_num)) {
+    m_new_stats->tlb_val[m_sid]++;
+    m_new_stats->tlb_thrashing[m_sid][page_num].push_back(true);
+    // never evict from an empty list (tlb_size == 0 would call front() on it)
+    if (!tlb.empty() && tlb.size() >= m_core_config->tlb_size) {
+      mem_addr_t oldest = tlb.front();
+      m_new_stats->tlb_evict[m_sid]++;
+      m_new_stats->tlb_thrashing[m_sid][oldest].push_back(false);
+      tlb.pop_front();
+    }
+  } else {
+    remove_tlb_entry(page_num);
+  }
+  tlb.push_back(page_num);
+}
+
+bool ldst_unit::tlb_cycle(warp_inst_t &inst,
+                          mem_stage_stall_type &stall_reason,
+                          mem_stage_access_type &access_type,
+                          mem_addr_t page_no) {
+  // TLB step for the front access of inst; the managed-page filter is disabled:
+  // if (!m_core->get_gpu()->get_global_memory()->is_page_managed(
+  // inst.accessq_front().get_addr(), inst.accessq_front().get_size())) {
+  //   return true;
+  // }
+
+  // only global reads/writes take the TLB path; anything else returns true
+  if (inst.accessq_front().get_type() != GLOBAL_ACC_R &&
+  inst.accessq_front().get_type() != GLOBAL_ACC_W) {
+    return true;
+  }
+
+  // m_core->get_gpu()->getGmmu()->update_access_type(
+  //   inst.accessq_front().get_addr(),
+  //   inst.accessq_front().get_type() == GLOBAL_ACC_W ? 2 : 1);
+  // m_core->get_gpu()->getGmmu()->inc_bb_access_counter(inst.accessq_front().get_addr());
+  // m_core->get_gpu()->getGmmu()->reserve_pages_insert(inst.accessq_front().get_addr(),
+  //                                       inst.accessq_front().get_uid());
+
+  // TLB hit: update the page/TLB bookkeeping and return true
+  if (is_in_tlb(page_no)) {
+    // on tlb hit, check whether the page is in pci-e write stage queue
+    // if so, then evict another page instead
+    m_core->get_gpu()->getGmmu()->check_write_stage_queue(
+        m_core->get_gpu()->get_global_memory()->get_page_num(
+            inst.accessq_front().get_addr()),
+        true);
+
+    // on tlb hit, refresh the LRU page list
+    m_core->get_gpu()->get_global_memory()->set_page_access(page_no);
+
+    // on write (store) set the dirty flag
+    if (inst.accessq_front().get_type() == GLOBAL_ACC_W) {
+      m_core->get_gpu()->get_global_memory()->set_page_dirty(page_no);
+    }
+    // move page_no to the back (newest end) of the TLB list
+    refresh_tlb(page_no);
+
+    m_core->get_gpu()->getGmmu()->refresh_valid_pages(inst.accessq_front().get_addr());
+
+    return true;
+  } else {
+    mem_fetch *mf = m_mf_allocator->alloc(inst, inst.accessq_front(),
+                                m_core->get_gpu()->gpu_sim_cycle +
+                                  m_core->get_gpu()->gpu_tot_sim_cycle);
+
+    // send it over downward queues (CU to GMMU) to suffer for far fetch latency
+    m_cu_gmmu_queue.push_back(mf);
+
+    inst.accessq_pop_front();
+
+    m_core->inc_managed_access_req(mf->get_wid());
+
+    if (!inst.accessq_empty()) {
+      stall_reason = COAL_STALL;
+      access_type =
+          inst.accessq_front().get_type() == GLOBAL_ACC_W ? G_MEM_ST : G_MEM_LD;
+    }
+
+    // return false if access queue is not empty and we have already processed
+    // one memory access in the current load/store unit cycle
+    return inst.accessq_empty();
+  }
+}
+
+// Records first-sighting statistics (keyed by access uid) for every queued
+// global access of inst, then runs tlb_cycle() on the front access.
+// Returns tlb_cycle()'s result, or true when there is nothing to process.
+bool ldst_unit::access_cycle(warp_inst_t &inst,
                              mem_stage_stall_type &stall_reason,
                              mem_stage_access_type &access_type) {
-  if (inst.empty() || ((inst.space.get_type() != global_space) &&
-                       (inst.space.get_type() != local_space) &&
-                       (inst.space.get_type() != param_space_local)))
+  if (inst.empty() || inst.accessq_empty() || inst.active_count() == 0) {
     return true;
-  if (inst.active_count() == 0) return true;
-  if (inst.accessq_empty()) return true;
+  }
+  // page of the front access; the loop below rotates the queue one full turn,
+  // so this is still the front entry when tlb_cycle() runs
+  mem_addr_t page_no =
+      m_core->get_gpu()->get_global_memory()->get_page_num(inst.accessq_front().get_addr());
+  for (unsigned i = 0; i < inst.accessq_count(); i++) {
+    if ((inst.accessq_front().get_type() == GLOBAL_ACC_R ||
+        inst.accessq_front().get_type() == GLOBAL_ACC_W) &&
+        m_new_stats->ma_latency[m_sid].find(inst.accessq_front().get_uid()) ==
+            m_new_stats->ma_latency[m_sid].end()) {
+      // statistics belong to this access's own page, which can differ from
+      // page_no when the queued accesses span several pages
+      const mem_addr_t access_page =
+          m_core->get_gpu()->get_global_memory()->get_page_num(inst.accessq_front().get_addr());
+      if (inst.accessq_front().get_type() == GLOBAL_ACC_W && g_debug_execution >= 3) {
+        printf("MEM_FETCH DEBUG :: ldst_unit::access_cycle :: m_sid=%d, uid=%d\n", m_sid, inst.accessq_front().get_uid());
+        inst.print_m_accessq();
+      }
+      m_new_stats->ma_latency[m_sid][inst.accessq_front().get_uid()] =
+          std::make_pair(false, m_core->get_gpu()->gpu_sim_cycle + m_core->get_gpu()->gpu_tot_sim_cycle);
+      m_new_stats->page_access_times[m_sid][access_page]++;
+      m_new_stats->time_and_page_access.push_back(access_info(
+          access_page, inst.accessq_front().get_addr(),
+          inst.accessq_front().get_size(), m_core->get_gpu()->gpu_tot_sim_cycle + m_core->get_gpu()->gpu_sim_cycle,
+          inst.accessq_front().get_type() == GLOBAL_ACC_R, m_sid,
+          inst.warp_id()));
+      // if (m_core->get_gpu()->get_global_memory()->is_page_managed(
+              // inst.accessq_front().get_addr(), inst.accessq_front().get_size())) {
+        if (is_in_tlb(access_page)) {
+          m_new_stats->tlb_hit[m_sid]++;
+        } else {
+          m_new_stats->tlb_miss[m_sid]++;
+        }
+      // }
+    }
+    inst.accessq_push_back(inst.accessq_front());
+    inst.accessq_pop_front();
+  }
+  // process for far fetch only when it is a managed page
+  // if (!m_core->get_gpu()->get_global_memory()->is_page_managed(
+  //         inst.accessq_front().get_addr(), inst.accessq_front().get_size())) {
+  //   return true;
+  // }
+
+  // // far fetch is valid only for managed page in global memory
+  // if (inst.accessq_front().get_type() != GLOBAL_ACC_R &&
+  //     inst.accessq_front().get_type() != GLOBAL_ACC_W) {
+  //   return true;
+  // }
+
+  return tlb_cycle(inst, stall_reason, access_type, page_no);
+  // m_core->get_gpu()->getGmmu()->update_access_type(
+  //     inst.accessq_front().get_addr(),
+  //     inst.accessq_front().get_type() == GLOBAL_ACC_W ? 2 : 1);
+  // m_core->get_gpu()->getGmmu()->inc_bb_access_counter(inst.accessq_front().get_addr());
+  // m_core->get_gpu()->getGmmu()->reserve_pages_insert(inst.accessq_front().get_addr(),
+  //                                        inst.accessq_front().get_uid());
+
+  // // check if the page corresponding to memory access is there in TLB or not
+  // if (is_in_tlb(page_no)) {
+  //   // on tlb hit, check whether the page is in pci-e write stage queue
+  //   // if so, then evict another page instead
+  //   m_core->get_gpu()->getGmmu()->check_write_stage_queue(
+  //       m_core->get_gpu()->get_global_memory()->get_page_num(
+  //           inst.accessq_front().get_addr()),
+  //       true);
+
+  //   // on tlb hit, refresh the LRU page list
+  //   m_core->get_gpu()->get_global_memory()->set_page_access(page_no);
+
+  //   // on write (store) set the dirty flag
+  //   if (inst.accessq_front().get_type() == GLOBAL_ACC_W) {
+  //     m_core->get_gpu()->get_global_memory()->set_page_dirty(page_no);
+  //   }
+
+  //   refresh_tlb(page_no);
+
+  //   m_core->get_gpu()->getGmmu()->refresh_valid_pages(inst.accessq_front().get_addr());
+
+  //   return true;
+  // } else {
+  //   mem_fetch *mf = m_mf_allocator->alloc(inst, inst.accessq_front(),
+  //                               m_core->get_gpu()->gpu_sim_cycle +
+  //                                 m_core->get_gpu()->gpu_tot_sim_cycle);
+
+  //   // send it over downward queues (CU to GMMU) to suffer for far fetch latency
+  //   m_cu_gmmu_queue.push_back(mf);
+
+  //   inst.accessq_pop_front();
+
+  //   m_core->inc_managed_access_req(mf->get_wid());
+
+  //   if (!inst.accessq_empty()) {
+  //     stall_reason = COAL_STALL;
+  //     access_type =
+  //         inst.accessq_front().get_type() == GLOBAL_ACC_W ? G_MEM_ST : G_MEM_LD;
+  //   }
+
+  //   // return false if access queue is not empty and we have already processed
+  //   // one memory access in the current load/store unit cycle
+  //   return inst.accessq_empty();
+  // }
+}
+
+// One memory-stage step. While inst has queued accesses the front one goes to
+// the interconnect (L1 bypass) or to L1D; otherwise the head of m_gmmu_cu_queue
+// is replayed and, on success, its page is marked accessed/dirty and cached in
+// the TLB. Returns inst.accessq_empty() on the first path, true on the second.
+bool ldst_unit::memory_cycle(warp_inst_t &inst,
+                             mem_stage_stall_type &stall_reason,
+                             mem_stage_access_type &access_type) {
+  // NOTE(review): these guards only apply while m_gmmu_cu_queue is empty
+  if (m_gmmu_cu_queue.empty()) {
+    if (inst.empty() || inst.accessq_empty() ||
+        ((inst.space.get_type() != global_space) &&
+         (inst.space.get_type() != local_space) &&
+         (inst.space.get_type() != param_space_local)))
+      return true;
+    if (inst.active_count() == 0)
+      return true;
+  }
 
   mem_stage_stall_type stall_cond = NO_RC_FAIL;
-  const mem_access_t &access = inst.accessq_back();
-
-  bool bypassL1D = false;
-  if (CACHE_GLOBAL == inst.cache_op || (m_L1D == NULL)) {
-    bypassL1D = true;
-  } else if (inst.space.is_global()) {  // global memory access
-    // skip L1 cache if the option is enabled
-    if (m_core->get_config()->gmem_skip_L1D && (CACHE_L1 != inst.cache_op))
+
+  if (!inst.accessq_empty()) {
+    const mem_access_t &access = inst.accessq_front();
+
+    bool bypassL1D = false;
+    if (CACHE_GLOBAL == inst.cache_op || (m_L1D == NULL)) {
       bypassL1D = true;
-  }
-  if (bypassL1D) {
-    // bypass L1 cache
-    unsigned control_size =
-        inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
-    unsigned size = access.get_size() + control_size;
-    // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
-    if (m_memory_config->SST_mode &&
-        (static_cast<sst_memory_interface *>(m_icnt)->full(
-            size, inst.is_store() || inst.isatomic(), access.get_type()))) {
-      // SST need mf type here
-      // Cast it to sst_memory_interface pointer first as this full() method
-      // is not a virtual method in parent class
-      stall_cond = ICNT_RC_FAIL;
-    } else if (!m_memory_config->SST_mode &&
-               (m_icnt->full(size, inst.is_store() || inst.isatomic()))) {
-      stall_cond = ICNT_RC_FAIL;
+    } else if (inst.space.is_global()) {  // global memory access
+      // skip L1 cache if the option is enabled
+      if (m_core->get_config()->gmem_skip_L1D && (CACHE_L1 != inst.cache_op))
+        bypassL1D = true;
+    }
+
+    if (bypassL1D) {
+      // bypass L1 cache
+      unsigned control_size =
+          inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
+      unsigned size = access.get_size() + control_size;
+      // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size);
+      if (m_icnt->full(size, inst.is_store() || inst.isatomic())) {
+        stall_cond = ICNT_RC_FAIL;
+      } else {
+        mem_fetch *mf =
+            m_mf_allocator->alloc(inst, access,
+                                  m_core->get_gpu()->gpu_sim_cycle +
+                                      m_core->get_gpu()->gpu_tot_sim_cycle);
+        m_icnt->push(mf);
+
+        // if (access.get_type() == GLOBAL_ACC_R
+          //  && m_core->get_gpu()->get_global_memory()->is_page_managed(access.get_addr(),
+          //                                               access.get_size())
+        // ) {
+        //   if (!mf->is_split()){
+        //     m_core->get_gpu()->getGmmu()->reserve_pages_remove(access.get_addr(),
+        //                                          access.get_uid());
+        //   }
+        // }
+
+        inst.accessq_pop_front();
+        // inst.clear_active( access.get_warp_mask() );
+        if (g_debug_execution >= 6) {
+          printf("MEM_FETCH DEBUG: ldst_unit::memory_cycle - inst info, %d, %u, %s\n",
+                  inst.op, inst.warp_id(), inst.empty()? "empty" : "not empty");
+          inst.print_insn(stdout);
+          printf("\n");
+          fflush(stdout);
+        }
+        if (inst.is_load()) {
+          for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
+            if (inst.out[r] > 0)
+              assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
+        } else if (inst.is_store()) {
+          m_core->inc_store_req(inst.warp_id());
+        }
+      }
     } else {
-      mem_fetch *mf =
-          m_mf_allocator->alloc(inst, access,
-                                m_core->get_gpu()->gpu_sim_cycle +
-                                    m_core->get_gpu()->gpu_tot_sim_cycle);
-      m_icnt->push(mf);
-      inst.accessq_pop_back();
-      // inst.clear_active( access.get_warp_mask() );
-      if (inst.is_load()) {
-        for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
-          if (inst.out[r] > 0)
-            assert(m_pending_writes[inst.warp_id()][inst.out[r]] > 0);
-      } else if (inst.is_store())
-        m_core->inc_store_req(inst.warp_id());
+      assert(CACHE_UNDEFINED != inst.cache_op);
+      stall_cond = process_memory_access_queue_l1cache(m_L1D, inst);
+    }
+    if (!inst.accessq_empty() && stall_cond == NO_RC_FAIL)
+      stall_cond = COAL_STALL;
+    if (stall_cond != NO_RC_FAIL) {
+      stall_reason = stall_cond;
+      bool iswrite = inst.is_store();
+      if (inst.space.is_local())
+        access_type = (iswrite) ? L_MEM_ST : L_MEM_LD;
+      else
+        access_type = (iswrite) ? G_MEM_ST : G_MEM_LD;
     }
+    return inst.accessq_empty();
   } else {
-    assert(CACHE_UNDEFINED != inst.cache_op);
-    stall_cond = process_memory_access_queue_l1cache(m_L1D, inst);
-  }
-  if (!inst.accessq_empty() && stall_cond == NO_RC_FAIL)
-    stall_cond = COAL_STALL;
-  if (stall_cond != NO_RC_FAIL) {
-    stall_reason = stall_cond;
-    bool iswrite = inst.is_store();
-    if (inst.space.is_local())
-      access_type = (iswrite) ? L_MEM_ST : L_MEM_LD;
-    else
-      access_type = (iswrite) ? G_MEM_ST : G_MEM_LD;
+    mem_fetch *mf = m_gmmu_cu_queue.front();
+    // the L1 HIT path may delete mf, so copy what is still needed afterwards
+    const auto mf_addr = mf->get_addr();
+    const bool mf_is_write = mf->get_mem_access().get_type() == GLOBAL_ACC_W;
+    bool bypassL1D = false;
+    if (CACHE_GLOBAL == mf->get_inst().cache_op || (m_L1D == NULL)) {
+      bypassL1D = true;
+    } else if (mf->get_inst().space.is_global()) { // global memory access
+      // skip L1 cache if the option is enabled
+      if (m_core->get_config()->gmem_skip_L1D)
+        bypassL1D = true;
+    }
+    if (bypassL1D) {
+      // bypass L1 cache
+      unsigned control_size =
+          mf->get_inst().is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE;
+      unsigned size = mf->get_mem_access().get_size() + control_size;
+      if (m_icnt->full(size, mf->get_inst().is_store() ||
+                                 mf->get_inst().isatomic())) {
+        stall_cond = ICNT_RC_FAIL;
+      } else {
+        m_icnt->push(mf);
+        // if (mf->get_mem_access().get_type() == GLOBAL_ACC_R
+            // && m_core->get_gpu()->get_global_memory()->is_page_managed(
+            //     mf->get_mem_access().get_addr(),
+            //     mf->get_mem_access().get_size())
+        // ) {
+        //   if (!mf->is_split()) {
+        //     m_core->get_gpu()->getGmmu()->reserve_pages_remove(
+        //         mf->get_mem_access().get_addr(), mf->get_mem_access().get_uid());
+        //   }
+        // }
+        m_core->dec_managed_access_req(mf->get_wid());
+        m_gmmu_cu_queue.pop_front();
+        if (mf->get_inst().is_load()) {
+          for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++)
+            if (mf->get_inst().out[r] > 0)
+              assert(m_pending_writes[mf->get_inst().warp_id()]
+                                     [mf->get_inst().out[r]] > 0);
+        } else if (mf->get_inst().is_store())
+          m_core->inc_store_req(mf->get_inst().warp_id());
+      }
+    } else {
+      assert(CACHE_UNDEFINED != mf->get_inst().cache_op);
+      stall_cond = process_managed_memory_access_queue(m_L1D);
+    }
+    if (stall_cond == NO_RC_FAIL) {
+      // the page is coming out of upward queue and so ready to be accessed,
+      // refresh the LRU page list
+      mem_addr_t page_num =
+          m_core->get_gpu()->get_global_memory()->get_page_num(mf_addr);
+      m_core->get_gpu()->get_global_memory()->set_page_access(page_num);
+
+      // on write (store) set the dirty flag
+      if (mf_is_write) {
+        m_core->get_gpu()->get_global_memory()->set_page_dirty(page_num);
+      }
+
+      m_core->get_gpu()->getGmmu()->refresh_valid_pages(mf_addr);
+
+      refresh_tlb(page_num);
+    } else {
+      // inst's access queue is empty on this path: classify by the request
+      mem_stage_access_type type = mf_is_write ? G_MEM_ST : G_MEM_LD;
+      m_stats->gpgpu_n_stall_shd_mem++;
+      m_stats->gpu_stall_shd_mem_breakdown[type][stall_cond]++;
+    }
+
+    return true;
   }
-  return inst.accessq_empty();
 }
 
 bool ldst_unit::response_buffer_full() const {
@@ -2334,9 +2833,18 @@ void ldst_unit::fill(mem_fetch *mf) {
   mf->set_status(
       IN_SHADER_LDST_RESPONSE_FIFO,
       m_core->get_gpu()->gpu_sim_cycle + m_core->get_gpu()->gpu_tot_sim_cycle);
+  if (g_debug_execution >= 6) {
+    printf("MEM_FETCH DEBUG: ldst_unit::fill - mf info %p\n", mf);
+    mf->print(stdout);
+  }
   m_response_fifo.push_back(mf);
 }
 
+void ldst_unit::fill_mem_access(mem_fetch *mf) {  // takes mf from the cluster
+  mf->set_status(MEM_FETCH_INITIALIZED, m_core->get_gpu()->gpu_sim_cycle + m_core->get_gpu()->gpu_tot_sim_cycle);
+  m_gmmu_cu_queue.push_back(mf);  // queued at the tail of the GMMU-to-CU queue
+}
+
 void ldst_unit::flush() {
   // Flush L1D cache
   m_L1D->flush();
@@ -2584,12 +3092,14 @@ void pipelined_simd_unit::issue(register_set &source_reg) {
     }
 */
 
-void ldst_unit::init(mem_fetch_interface *icnt,
+void ldst_unit::init(gpgpu_sim *gpu, mem_fetch_interface *icnt,
                      shader_core_mem_fetch_allocator *mf_allocator,
                      shader_core_ctx *core, opndcoll_rfu_t *operand_collector,
                      Scoreboard *scoreboard, const shader_core_config *config,
                      const memory_config *mem_config, shader_core_stats *stats,
-                     unsigned sid, unsigned tpc) {
+                     class gpgpu_new_stats *new_stats, unsigned sid, 
+                     unsigned tpc) {
+  m_core_config = config;
   m_memory_config = mem_config;
   m_icnt = icnt;
   m_mf_allocator = mf_allocator;
@@ -2597,8 +3107,12 @@ void ldst_unit::init(mem_fetch_interface *icnt,
   m_operand_collector = operand_collector;
   m_scoreboard = scoreboard;
   m_stats = stats;
+
+  m_new_stats = new_stats;
+  
   m_sid = sid;
   m_tpc = tpc;
+
 #define STRSIZE 1024
   char L1T_name[STRSIZE];
   char L1C_name[STRSIZE];
@@ -2618,20 +3132,24 @@ void ldst_unit::init(mem_fetch_interface *icnt,
   m_next_global = NULL;
   m_last_inst_gpu_sim_cycle = 0;
   m_last_inst_gpu_tot_sim_cycle = 0;
+
+  gpu->getGmmu()->register_tlbflush_callback(
+      [this](mem_addr_t addr) { return invalidate_tlb(addr); });
 }
 
-ldst_unit::ldst_unit(mem_fetch_interface *icnt,
+ldst_unit::ldst_unit(gpgpu_sim *gpu, mem_fetch_interface *icnt,
                      shader_core_mem_fetch_allocator *mf_allocator,
                      shader_core_ctx *core, opndcoll_rfu_t *operand_collector,
                      Scoreboard *scoreboard, const shader_core_config *config,
                      const memory_config *mem_config, shader_core_stats *stats,
-                     unsigned sid, unsigned tpc, gpgpu_sim *gpu)
+                     class gpgpu_new_stats *new_stats, unsigned sid, 
+                     unsigned tpc)
     : pipelined_simd_unit(NULL, config, config->smem_latency, core, 0),
       m_next_wb(config),
       m_gpu(gpu) {
   assert(config->smem_latency > 1);
-  init(icnt, mf_allocator, core, operand_collector, scoreboard, config,
-       mem_config, stats, sid, tpc);
+  init(gpu, icnt, mf_allocator, core, operand_collector, scoreboard, config,
+       mem_config, stats, new_stats, sid, tpc);
   if (!m_config->m_L1D_config.disabled()) {
     char L1D_name[STRSIZE];
     snprintf(L1D_name, STRSIZE, "L1D_%03d", m_sid);
@@ -2649,17 +3167,25 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt,
   m_name = "MEM ";
 }
 
-ldst_unit::ldst_unit(mem_fetch_interface *icnt,
+ldst_unit::ldst_unit(gpgpu_sim *gpu, mem_fetch_interface *icnt,
                      shader_core_mem_fetch_allocator *mf_allocator,
                      shader_core_ctx *core, opndcoll_rfu_t *operand_collector,
                      Scoreboard *scoreboard, const shader_core_config *config,
                      const memory_config *mem_config, shader_core_stats *stats,
-                     unsigned sid, unsigned tpc, l1_cache *new_l1d_cache)
+                     class gpgpu_new_stats *new_stats, unsigned sid, 
+                     unsigned tpc, l1_cache *new_l1d_cache)
     : pipelined_simd_unit(NULL, config, 3, core, 0),
       m_L1D(new_l1d_cache),
       m_next_wb(config) {
-  init(icnt, mf_allocator, core, operand_collector, scoreboard, config,
-       mem_config, stats, sid, tpc);
+  init(gpu, icnt, mf_allocator, core, operand_collector, scoreboard, config,
+       mem_config, stats, new_stats, sid, tpc);
+}
+
+void ldst_unit::invalidate_tlb(mem_addr_t page_num) {
+  // update the eviction stats only when remove_tlb_entry() returns true
+  if (!remove_tlb_entry(page_num)) return;
+  ++m_new_stats->tlb_page_evict[m_sid];
+  m_new_stats->tlb_thrashing[m_sid][page_num].push_back(false);
 }
 
 void ldst_unit::issue(register_set &reg_set) {
@@ -2786,6 +3312,24 @@ void ldst_unit::writeback() {
         if (m_L1D && m_L1D->access_ready()) {
           mem_fetch *mf = m_L1D->next_access();
           m_next_wb = mf->get_inst();
+          // if (m_core->get_gpu()->get_global_memory()->is_page_managed(
+          //         mf->get_mem_access().get_addr(),
+          //         mf->get_mem_access().get_size())) {
+            // if (!mf->is_split()) {
+            //   m_core->get_gpu()->getGmmu()->reserve_pages_remove(
+            //       mf->get_mem_access().get_addr(), mf->get_mem_access().get_uid());
+            // }
+          // }
+          // flag this access's latency record (first = true) and overwrite its
+          // stored cycle value with the cycles elapsed since then (now - stored)
+          const auto ma_uid = mf->get_mem_access().get_uid();
+          assert(m_new_stats->ma_latency[m_sid].find(ma_uid) !=
+                 m_new_stats->ma_latency[m_sid].end());
+          auto &ma_entry = m_new_stats->ma_latency[m_sid][ma_uid];
+          ma_entry.first = true;
+          ma_entry.second = m_core->get_gpu()->gpu_tot_sim_cycle +
+                            m_core->get_gpu()->gpu_sim_cycle - ma_entry.second;
+        
           delete mf;
           serviced_client = next_client;
         }
@@ -2832,6 +3376,8 @@ inst->space.get_type() != shared_space) { unsigned warp_id = inst->warp_id();
 }
 */
 void ldst_unit::cycle() {
+  if (g_debug_execution >= 6)
+    print(stdout);
   writeback();
 
   for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++)
@@ -2863,6 +3409,37 @@ void ldst_unit::cycle() {
         // Perfect mem
         m_core->store_ack(mf);
         m_response_fifo.pop_front();
+
+        if (g_debug_execution >= 3) {  // dump mf and the uids tracked for this SM
+          printf("MEM_FETCH DEBUG: ldst_unit::cycle :: mf info %p\n", mf);
+          mf->print(stdout);
+          printf("MEM_FETCH DEBUG: ldst_unit::cycle :: Need to find uid=%u\n", mf->get_mem_access().get_uid());
+          printf("m_sid(%u) : ", m_sid);
+          for (const auto &kv : m_new_stats->ma_latency[m_sid]) {
+            printf("%u ", kv.first);  // keys are unsigned, so print with %u
+          }
+          printf("\n");
+          fflush(stdout);
+        }
+        // flag this access's latency record (first = true) and overwrite its
+        // stored cycle value with the cycles elapsed since then (now - stored)
+        const auto ma_uid = mf->get_mem_access().get_uid();
+        assert(m_new_stats->ma_latency[m_sid].find(ma_uid) !=
+               m_new_stats->ma_latency[m_sid].end());
+        auto &ma_entry = m_new_stats->ma_latency[m_sid][ma_uid];
+        ma_entry.first = true;
+        ma_entry.second = m_core->get_gpu()->gpu_tot_sim_cycle +
+                          m_core->get_gpu()->gpu_sim_cycle - ma_entry.second;
+
+        // if (m_core->get_gpu()->get_global_memory()->is_page_managed(
+        //         mf->get_mem_access().get_addr(),
+        //         mf->get_mem_access().get_size())) {
+          // if (!mf->is_split()) {
+          //   m_core->get_gpu()->getGmmu()->reserve_pages_remove(
+          //       mf->get_mem_access().get_addr(), mf->get_mem_access().get_uid());
+          // }
+        // }
+
         delete mf;
       } else {
         assert(!mf->get_is_write());  // L1 cache is write evict, allocate line
@@ -2905,12 +3482,21 @@ void ldst_unit::cycle() {
   warp_inst_t &pipe_reg = *m_dispatch_reg;
   enum mem_stage_stall_type rc_fail = NO_RC_FAIL;
   mem_stage_access_type type;
+
   bool done = true;
-  done &= shared_cycle(pipe_reg, rc_fail, type);
-  done &= constant_cycle(pipe_reg, rc_fail, type);
-  done &= texture_cycle(pipe_reg, rc_fail, type);
-  done &= memory_cycle(pipe_reg, rc_fail, type);
-  m_mem_rc = rc_fail;
+
+  // process the instruction's memory access queue for TLB, Page Table, and
+  // PCI-E
+  done = access_cycle(pipe_reg, rc_fail, type);
+
+  // if we have already processed one memory access from instruction's access
+  // queue in the current cycle do not process further
+  if (done) {
+    done &= shared_cycle(pipe_reg, rc_fail, type);
+    done &= constant_cycle(pipe_reg, rc_fail, type);
+    done &= texture_cycle(pipe_reg, rc_fail, type);
+    done &= memory_cycle(pipe_reg, rc_fail, type);
+  }
 
   if (!done) {  // log stall types and return
     assert(rc_fail != NO_RC_FAIL);
@@ -4032,12 +4618,19 @@ void shader_core_ctx::accept_ldst_unit_response(mem_fetch *mf) {
   m_ldst_unit->fill(mf);
 }
 
+void shader_core_ctx::accept_access_response(mem_fetch *mf) {
+  m_ldst_unit->fill_mem_access(mf);  // hand mf to the load/store unit's queue
+}
+
 void shader_core_ctx::store_ack(class mem_fetch *mf) {
   assert(mf->get_type() == WRITE_ACK ||
          ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) &&
           mf->get_is_write()));
-  unsigned warp_id = mf->get_wid();
-  m_warp[warp_id]->dec_store_req();
+  // Only a non-split mf decrements its warp's outstanding-store counter
+  if (!mf->is_split()) {
+    unsigned warp_id = mf->get_wid();
+    m_warp[warp_id]->dec_store_req();
+  }
 }
 
 void shader_core_ctx::print_cache_stats(FILE *fp, unsigned &dl1_accesses,
@@ -4079,7 +4672,8 @@ bool shd_warp_t::functional_done() const {
 }
 
 bool shd_warp_t::hardware_done() const {
-  return functional_done() && stores_done() && !inst_in_pipeline();
+  return functional_done() && stores_done() && managed_access_done() && 
+         !inst_in_pipeline();
 }
 
 bool shd_warp_t::waiting() {
@@ -4454,7 +5048,8 @@ void exec_simt_core_cluster::create_shader_core_ctx() {
   for (unsigned i = 0; i < m_config->n_simt_cores_per_cluster; i++) {
     unsigned sid = m_config->cid_to_sid(i, m_cluster_id);
     m_core[i] = new exec_shader_core_ctx(m_gpu, this, sid, m_cluster_id,
-                                         m_config, m_mem_config, m_stats);
+                                         m_config, m_mem_config, m_stats, 
+                                         m_new_stats);
     m_core_sim_order.push_back(i);
   }
 }
@@ -4474,6 +5069,25 @@ simt_core_cluster::simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
   m_mem_config = mem_config;
 }
 
+simt_core_cluster::simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
+                                     const shader_core_config *config,
+                                     const memory_config *mem_config,
+                                     shader_core_stats *stats,
+                                     class memory_stats_t *mstats,
+                                     class gpgpu_new_stats *new_stats) {
+  m_gpu = gpu;
+  m_cluster_id = cluster_id;
+  // configuration handles
+  m_config = config;
+  m_mem_config = mem_config;
+  // statistics objects
+  m_stats = stats;
+  m_memory_stats = mstats;
+  m_new_stats = new_stats;
+  // start from the last core index: this causes first launch to use hw cta 0
+  m_cta_issue_next_core = config->n_simt_cores_per_cluster - 1;
+}
+
 void simt_core_cluster::core_cycle() {
   for (std::list<unsigned>::iterator it = m_core_sim_order.begin();
        it != m_core_sim_order.end(); ++it) {
@@ -4619,9 +5233,14 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) {
   unsigned destination = mf->get_sub_partition_id();
   mf->set_status(IN_ICNT_TO_MEM,
                  m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle);
-  if (!mf->get_is_write() && !mf->isatomic())
+  if (!mf->get_is_write() && !mf->isatomic()) {
+    if (g_debug_execution >= 6) {
+      printf("MEM_FETCH DEBUG :: simt_core_cluster::icnt_inject_request_packet :: mf info %p\n", mf);
+      mf->print(stdout);
+    }
     ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf,
                 mf->get_ctrl_size());
+  }
   else
     ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf,
                 mf->size());
@@ -4710,6 +5329,24 @@ void sst_simt_core_cluster::icnt_inject_request_packet_to_SST(
 }
 
 void simt_core_cluster::icnt_cycle() {
+  // pop from upward queue (GMMU to CU) of cluster and push it to the one in
+  // core (SM/CU)
+  if (!m_gmmu_cu_queue.empty()) {
+    mem_fetch *mf = m_gmmu_cu_queue.front();
+    unsigned cid = m_config->sid_to_cid(mf->get_sid());
+    m_gmmu_cu_queue.pop_front();
+    m_core[cid]->accept_access_response(mf);
+  }
+
+  // pop it from the downward queue (CU to GMMU) of the core (SM/CU) and append
+  // it to the cluster (TPC) queue; that queue is read from the front: keep FIFO
+  for (unsigned i = 0; i < m_config->n_simt_cores_per_cluster; i++) {
+    if (!m_core[i]->empty_cu_gmmu_queue()) {
+      mem_fetch *mf = m_core[i]->front_cu_gmmu_queue();
+      m_cu_gmmu_queue.push_back(mf);
+      m_core[i]->pop_cu_gmmu_queue();
+    }
+  }
   if (!m_response_fifo.empty()) {
     mem_fetch *mf = m_response_fifo.front();
     unsigned cid = m_config->sid_to_cid(mf->get_sid());
@@ -4724,6 +5361,10 @@ void simt_core_cluster::icnt_cycle() {
       if (!m_core[cid]->ldst_unit_response_buffer_full()) {
         m_response_fifo.pop_front();
         m_memory_stats->memlatstat_read_done(mf);
+        if (g_debug_execution >= 6) {
+          printf("MEM_FETCH DEBUG: simt_core_cluster::icnt_cycle - mf info %p\n", mf);
+          mf->print(stdout);
+        }
         m_core[cid]->accept_ldst_unit_response(mf);
       }
     }
diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h
index b90ce2fdb..6d513585c 100644
--- a/src/gpgpu-sim/shader.h
+++ b/src/gpgpu-sim/shader.h
@@ -46,9 +46,10 @@
 #include <utility>
 #include <vector>
 
-// #include "../cuda-sim/ptx.tab.h"
+//#include "../cuda-sim/ptx.tab.h"
 
 #include "../abstract_hardware_model.h"
+#include "../cuda-sim/memory.h"
 #include "delayqueue.h"
 #include "dram.h"
 #include "gpu-cache.h"
@@ -105,11 +106,13 @@ class shd_warp_t {
   shd_warp_t(class shader_core_ctx *shader, unsigned warp_size)
       : m_shader(shader), m_warp_size(warp_size) {
     m_stores_outstanding = 0;
+    m_managed_access_outstanding = 0;
     m_inst_in_pipeline = 0;
     reset();
   }
   void reset() {
     assert(m_stores_outstanding == 0);
+    assert(m_managed_access_outstanding == 0);
     assert(m_inst_in_pipeline == 0);
     m_imiss_pending = false;
     m_warp_id = (unsigned)-1;
@@ -250,6 +253,13 @@ class shd_warp_t {
     m_stores_outstanding--;
   }
 
+  bool managed_access_done() const { return m_managed_access_outstanding == 0; }
+  void inc_managed_access_req() { m_managed_access_outstanding++; }
+  void dec_managed_access_req() {
+    assert(m_managed_access_outstanding > 0);  // unsigned counter must not wrap
+    m_managed_access_outstanding--;
+  }
+
   unsigned num_inst_in_buffer() const {
     unsigned count = 0;
     for (unsigned i = 0; i < IBUFFER_SIZE; i++) {
@@ -314,6 +324,7 @@ class shd_warp_t {
 
   unsigned m_stores_outstanding;  // number of store requests sent but not yet
                                   // acknowledged
+  unsigned m_managed_access_outstanding;
   unsigned m_inst_in_pipeline;
 
   // Jin: cdp support
@@ -1343,12 +1354,12 @@ class cache_t;
 
 class ldst_unit : public pipelined_simd_unit {
  public:
-  ldst_unit(mem_fetch_interface *icnt,
+  ldst_unit(class gpgpu_sim *gpu, mem_fetch_interface *icnt,
             shader_core_mem_fetch_allocator *mf_allocator,
             shader_core_ctx *core, opndcoll_rfu_t *operand_collector,
             Scoreboard *scoreboard, const shader_core_config *config,
             const memory_config *mem_config, class shader_core_stats *stats,
-            unsigned sid, unsigned tpc, gpgpu_sim *gpu);
+            class gpgpu_new_stats *new_stats, unsigned sid, unsigned tpc);
 
   // Add a structure to record the LDGSTS instructions,
   // similar to m_pending_writes, but since LDGSTS does not have a output
@@ -1366,6 +1377,11 @@ class ldst_unit : public pipelined_simd_unit {
   virtual void cycle();
 
   void fill(mem_fetch *mf);
+
+  // function to fill the gmmu to cu queue
+  // from the cluster to load/store unit
+  void fill_mem_access(mem_fetch *mf);
+
   void flush();
   void invalidate();
   void writeback();
@@ -1406,21 +1422,37 @@ class ldst_unit : public pipelined_simd_unit {
   void get_L1C_sub_stats(struct cache_sub_stats &css) const;
   void get_L1T_sub_stats(struct cache_sub_stats &css) const;
 
+  // methods to be called by the clusters
+  // to access the downward queues (CU to GMMU)
+  bool empty_cu_gmmu_queue() { return m_cu_gmmu_queue.empty(); }
+  mem_fetch *front_cu_gmmu_queue() { return m_cu_gmmu_queue.front(); }
+  void pop_cu_gmmu_queue() { m_cu_gmmu_queue.pop_front(); }
+
+  void invalidate_tlb(mem_addr_t addr);
+
  protected:
-  ldst_unit(mem_fetch_interface *icnt,
+  ldst_unit(class gpgpu_sim *gpu, mem_fetch_interface *icnt,
             shader_core_mem_fetch_allocator *mf_allocator,
             shader_core_ctx *core, opndcoll_rfu_t *operand_collector,
             Scoreboard *scoreboard, const shader_core_config *config,
             const memory_config *mem_config, shader_core_stats *stats,
-            unsigned sid, unsigned tpc, l1_cache *new_l1d_cache);
-  void init(mem_fetch_interface *icnt,
+            class gpgpu_new_stats *new_stats, unsigned sid, unsigned tpc,
+            l1_cache *new_l1d_cache);
+  void init(class gpgpu_sim *gpu, mem_fetch_interface *icnt,
             shader_core_mem_fetch_allocator *mf_allocator,
             shader_core_ctx *core, opndcoll_rfu_t *operand_collector,
             Scoreboard *scoreboard, const shader_core_config *config,
             const memory_config *mem_config, shader_core_stats *stats,
-            unsigned sid, unsigned tpc);
+            class gpgpu_new_stats *new_stats, unsigned sid, unsigned tpc);
 
  protected:
+  // handles global read (load) / write (store) accesses:
+  // checks the TLB for a hit or a miss
+  bool tlb_cycle(warp_inst_t &inst, mem_stage_stall_type &stall_reason,
+                 mem_stage_access_type &access_type, mem_addr_t page_no);
+  bool access_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail,
+                    mem_stage_access_type &fail_type);
+
   bool shared_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail,
                     mem_stage_access_type &fail_type);
   bool constant_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail,
@@ -1440,6 +1472,14 @@ class ldst_unit : public pipelined_simd_unit {
                                                            warp_inst_t &inst);
   gpgpu_sim *m_gpu;
 
+  virtual mem_stage_stall_type
+  process_managed_cache_access(cache_t *cache, new_addr_type address,
+                               std::list<cache_event> &events, mem_fetch *mf,
+                               enum cache_request_status status);
+  mem_stage_stall_type process_managed_memory_access_queue(cache_t *cache);
+
+  const shader_core_config *m_core_config;
+
   const memory_config *m_memory_config;
   class mem_fetch_interface *m_icnt;
   shader_core_mem_fetch_allocator *m_mf_allocator;
@@ -1466,11 +1506,23 @@ class ldst_unit : public pipelined_simd_unit {
   enum mem_stage_stall_type m_mem_rc;
 
   shader_core_stats *m_stats;
+  class gpgpu_new_stats *m_new_stats;
 
   // for debugging
   unsigned long long m_last_inst_gpu_sim_cycle;
   unsigned long long m_last_inst_gpu_tot_sim_cycle;
 
+  // two queues that interface with texture processor cluster
+  std::list<mem_fetch *> m_gmmu_cu_queue;
+  std::list<mem_fetch *> m_cu_gmmu_queue;
+
+  // set of virtual addresses present in TLB
+  std::list<mem_addr_t> tlb;
+
+  bool remove_tlb_entry(mem_addr_t page_num);
+  bool is_in_tlb(mem_addr_t page_num);
+  void refresh_tlb(mem_addr_t page_num);
+
   std::vector<std::deque<mem_fetch *>> l1_latency_queue;
   void L1_latency_queue_cycle();
 };
@@ -1707,6 +1759,9 @@ class shader_core_config : public core_config {
   // Jin: concurrent kernel on sm
   bool gpgpu_concurrent_kernel_sm;
 
+  int tlb_size;
+  friend class ldst_unit;
+
   bool perfect_inst_const_cache;
   unsigned inst_fetch_throughput;
   unsigned reg_file_port_throughput;
@@ -2048,7 +2103,6 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator {
         inst.warp_id(), m_core_id, m_cluster_id, m_memory_config, cycle);
     return mf;
   }
-
  private:
   unsigned m_core_id;
   unsigned m_cluster_id;
@@ -2063,6 +2117,12 @@ class shader_core_ctx : public core_t {
                   const shader_core_config *config,
                   const memory_config *mem_config, shader_core_stats *stats);
 
+  shader_core_ctx(class gpgpu_sim *gpu, class simt_core_cluster *cluster,
+                  unsigned shader_id, unsigned tpc_id,
+                  const shader_core_config *config,
+                  const memory_config *mem_config, shader_core_stats *stats,
+                  class gpgpu_new_stats *new_stats);
+
   // used by simt_core_cluster:
   // modifiers
   void cycle();
@@ -2074,6 +2134,18 @@ class shader_core_ctx : public core_t {
   void cache_invalidate();
   void accept_fetch_response(mem_fetch *mf);
   void accept_ldst_unit_response(class mem_fetch *mf);
+
+  // method to fill the upward queue (GMMU to CU) in load/store unit
+  void accept_access_response(mem_fetch *mf);
+
+  // interface between core (CU/SM) and cluster
+  // to access the downward queues (CU to GMMU)
+  bool empty_cu_gmmu_queue() { return m_ldst_unit->empty_cu_gmmu_queue(); }
+  mem_fetch *front_cu_gmmu_queue() {
+    return m_ldst_unit->front_cu_gmmu_queue();
+  }
+  void pop_cu_gmmu_queue() { m_ldst_unit->pop_cu_gmmu_queue(); }
+
   void broadcast_barrier_reduction(unsigned cta_id, unsigned bar_id,
                                    warp_set_t warps);
   void set_kernel(kernel_info_t *k) {
@@ -2116,6 +2188,14 @@ class shader_core_ctx : public core_t {
   void mem_instruction_stats(const warp_inst_t &inst);
   void decrement_atomic_count(unsigned wid, unsigned n);
   void inc_store_req(unsigned warp_id) { m_warp[warp_id]->inc_store_req(); }
+
+  void inc_managed_access_req(unsigned warp_id) {
+    m_warp[warp_id]->inc_managed_access_req();
+  }
+  void dec_managed_access_req(unsigned warp_id) {
+    m_warp[warp_id]->dec_managed_access_req();
+  }
+
   void dec_inst_in_pipeline(unsigned warp_id) {
     m_warp[warp_id]->dec_inst_in_pipeline();
   }  // also used in writeback()
@@ -2504,6 +2584,7 @@ class shader_core_ctx : public core_t {
 
   // statistics
   shader_core_stats *m_stats;
+  class gpgpu_new_stats *m_new_stats;
 
   // CTA scheduling / hardware thread allocation
   unsigned m_n_active_cta;  // number of Cooperative Thread Arrays (blocks)
@@ -2582,9 +2663,10 @@ class exec_shader_core_ctx : public shader_core_ctx {
                        unsigned shader_id, unsigned tpc_id,
                        const shader_core_config *config,
                        const memory_config *mem_config,
-                       shader_core_stats *stats)
+                       shader_core_stats *stats, 
+                       class gpgpu_new_stats *new_stats)
       : shader_core_ctx(gpu, cluster, shader_id, tpc_id, config, mem_config,
-                        stats) {
+                        stats, new_stats) {
     create_front_pipeline();
     create_shd_warp();
     create_schedulers();
@@ -2610,6 +2692,12 @@ class exec_shader_core_ctx : public shader_core_ctx {
 
 class simt_core_cluster {
  public:
+  simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
+                    const shader_core_config *config,
+                    const memory_config *mem_config, shader_core_stats *stats,
+                    memory_stats_t *mstats,
+                    class gpgpu_new_stats *new_stats);
+
   simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
                     const shader_core_config *config,
                     const memory_config *mem_config, shader_core_stats *stats,
@@ -2634,6 +2722,16 @@ class simt_core_cluster {
     m_response_fifo.push_back(mf);
   }
 
+  // interface to be called by gmmu
+  // to access the downward queues (CU to GMMU) in the cluster by GMMU
+  bool empty_cu_gmmu_queue() { return m_cu_gmmu_queue.empty(); }
+  mem_fetch *front_cu_gmmu_queue() { return m_cu_gmmu_queue.front(); }
+  void pop_cu_gmmu_queue() { m_cu_gmmu_queue.pop_front(); }
+
+  // method to fill the upward queue (GMMU to CU) by GMMU upon completion of
+  // PCI-E transfer
+  void push_gmmu_cu_queue(mem_fetch *mf) { m_gmmu_cu_queue.push_back(mf); }
+
   void get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc,
                                unsigned *rpc) const;
   unsigned max_cta(const kernel_info_t &kernel);
@@ -2667,13 +2765,30 @@ class simt_core_cluster {
   shader_core_ctx **m_core;
   const memory_config *m_mem_config;
 
+  class gpgpu_new_stats *m_new_stats;
+
   unsigned m_cta_issue_next_core;
   std::list<unsigned> m_core_sim_order;
   std::list<mem_fetch *> m_response_fifo;
+
+  // queues that pass memory accesses between core and GMMU
+  // as cluster interfaces between CU and GMMU
+  std::list<mem_fetch *> m_gmmu_cu_queue;
+  std::list<mem_fetch *> m_cu_gmmu_queue;
 };
 
 class exec_simt_core_cluster : public simt_core_cluster {
  public:
+  exec_simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
+                         const shader_core_config *config,
+                         const memory_config *mem_config,
+                         class shader_core_stats *stats,
+                         class memory_stats_t *mstats,
+                         class gpgpu_new_stats *new_stats)
+      : simt_core_cluster(gpu, cluster_id, config, mem_config, stats, mstats, new_stats) {
+    create_shader_core_ctx();  // build the shader cores owned by this cluster
+  }
+
   exec_simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id,
                          const shader_core_config *config,
                          const memory_config *mem_config,
diff --git a/src/gpgpu-sim/workspace.code-workspace b/src/gpgpu-sim/workspace.code-workspace
new file mode 100644
index 000000000..b50ca6918
--- /dev/null
+++ b/src/gpgpu-sim/workspace.code-workspace
@@ -0,0 +1,11 @@
+{
+	"folders": [
+		{
+			"path": "../.."
+		},
+		{
+			"path": "../../../gpgpu-sim_UVMSmart"
+		}
+	],
+	"settings": {}
+}
\ No newline at end of file
diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc
index be492295f..d2f3f910f 100644
--- a/src/gpgpusim_entrypoint.cc
+++ b/src/gpgpusim_entrypoint.cc
@@ -145,6 +145,7 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) {
         sim_cycles = true;
         ctx->the_gpgpusim->g_the_gpu->deadlock_check();
       } else {
+        ctx->the_gpgpusim->g_the_gpu->cycle();
         if (ctx->the_gpgpusim->g_the_gpu->cycle_insn_cta_max_hit()) {
           ctx->the_gpgpusim->g_stream_manager->stop_all_running_kernels();
           ctx->the_gpgpusim->g_sim_done = true;
@@ -295,6 +296,7 @@ void gpgpu_context::synchronize() {
 
 bool gpgpu_context::synchronize_check() {
   // printf("GPGPU-Sim: synchronize checking for inactive GPU simulation\n");
+  requested_synchronize = true;
   the_gpgpusim->g_stream_manager->print(stdout);
   fflush(stdout);
   //    sem_wait(&g_sim_signal_finish);
@@ -336,8 +338,9 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() {
   the_gpgpusim->g_the_gpu_config = new gpgpu_sim_config(this);
   the_gpgpusim->g_the_gpu_config->reg_options(
       opp);  // register GPU microrachitecture options
-
+  the_gpgpusim->g_the_gpu_config->convert_byte_string();
   option_parser_cmdline(opp, sg_argc, sg_argv);  // parse configuration options
+
   fprintf(stdout, "GPGPU-Sim: Configuration options:\n\n");
   option_parser_print(opp, stdout);
   // Set the Numeric locale to a standard locale where a decimal point is a
diff --git a/src/gpuwattch/Alpha21364.xml b/src/gpuwattch/Alpha21364.xml
new file mode 100644
index 000000000..c40c4f50b
--- /dev/null
+++ b/src/gpuwattch/Alpha21364.xml
@@ -0,0 +1,456 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="1"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="1"/>
+		<param name="number_of_L2s" value="1"/> <!-- This number is the count of L2 clusters; each cluster can contain multiple banks/ports -->
+		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="90"/><!-- nm -->
+		<param name="target_core_clockrate" value="1200"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="0"/><!-- 0 no use; 1 use when appropriate -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller
+			default value is machine_bits, if not set -->
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!--This page size(B) is completely different from the page size in the Main memory section. This page size is the size of
+			virtual memory from the OS/Arch perspective; the page size in the Main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="1200"/>
+			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+			<param name="opt_local" value="1"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="7"/>
+			<param name="x86" value="0"/>
+			<param name="micro_opcode_width" value="8"/>
+			<param name="machine_type" value="0"/>
+			<!-- inorder/OoO; 1 inorder; 0 OOO-->
+			<param name="number_hardware_threads" value="1"/>
+			<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processors,
+			it may only be more than one in SMT processors. BTB ports always equal fetch ports since
+			branch information of consecutive branch instructions in the same fetch group can be read out from BTB at once.-->
+			<param name="fetch_width" value="4"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="4"/>
+			<!-- decode_width determines the number of ports of the
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="4"/>
+			<param name="peak_issue_width" value="6"/>
+			<!-- issue_width determines the number of ports of Issue window and other logic
+			as in the complexity effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="4"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="2"/>
+			<param name="prediction_width" value="1"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
+			These parameters are reserved for future use.-->
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="7,7"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="4"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="0"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="1"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="32"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="20"/>
+			<param name="fp_instruction_window_size" value="15"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>		
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="72"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="1"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha).
+			They will always try to execute out-of-order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="2"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="400000"/>
+			<stat name="int_instructions" value="200000"/>
+			<stat name="fp_instructions" value="100000"/>
+			<stat name="branch_instructions" value="100000"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="0"/>
+			<stat name="store_instructions" value="50000"/>
+			<stat name="committed_instructions" value="400000"/>
+			<stat name="committed_int_instructions" value="200000"/>
+			<stat name="committed_fp_instructions" value="100000"/>
+			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+			<!-- the following cycle stats are used for heterogeneous cores only,
+				please ignore them if the cores are homogeneous -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats, both RS and Phy based OoOs have ROB
+			performance simulator should capture the difference on accesses,
+			otherwise, McPAT has to guess based on number of committed instructions. -->
+			<stat name="ROB_reads" value="400000"/>
+			<stat name="ROB_writes" value="400000"/>
+			<!-- RAT accesses -->
+			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+			<stat name="fp_rename_reads" value="200000"/>
+			<stat name="fp_rename_writes" value="100000"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="400000"/>
+			<stat name="inst_window_writes" value="400000"/>
+			<stat name="inst_window_wakeup_accesses" value="800000"/>
+			<stat name="fp_inst_window_reads" value="200000"/>
+			<stat name="fp_inst_window_writes" value="200000"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="600000"/>
+			<stat name="float_regfile_reads" value="100000"/>
+			<stat name="int_regfile_writes" value="300000"/>
+			<stat name="float_regfile_writes" value="50000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="300000"/>			
+			<stat name="fpu_accesses" value="100000"/>
+			<stat name="mul_accesses" value="200000"/>
+			<stat name="cdb_alu_accesses" value="300000"/>
+			<stat name="cdb_mul_accesses" value="200000"/>
+			<stat name="cdb_fpu_accesses" value="100000"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses,
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to reason about the implicit accesses
+			based on param and stats of last level cache
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="1"/>			
+			<stat name="LSU_duty_cycle" value="1"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="1"/>
+			<stat name="ALU_duty_cycle" value="1"/>
+			<stat name="MUL_duty_cycle" value="0.3"/>
+			<stat name="FPU_duty_cycle" value="1"/>
+			<stat name="ALU_cdb_duty_cycle" value="1"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
+			<stat name="FPU_cdb_duty_cycle" value="1"/>
+			<param name="number_of_BPT" value="2"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="128"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there are no write requests to itlb although writes happen to itlb after a miss,
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to icache although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="65536,16,2,1,1,2,16,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+				<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="128"/><!--dual threads-->
+				<stat name="total_accesses" value="400000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="65536,16,2,1,1,3,16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<param name="number_of_BTB" value="2"/>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="6144,4,2,1, 1,3"/> <!--48Kbits -->
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+				<stat name="write_accesses" value="0"/>
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				Performance simulator needs to cast them into reads or writes
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="512,4,0,1,1, 1"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="16, 16, 16, 16"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1200"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				Performance simulator needs to cast them into reads or writes
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>	
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="1835008,16, 8, 16, 32, 32, 12, 1"/> 
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1200"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="1.0"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<param name="clockrate" value="850"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="11824"/>
+				<stat name="write_accesses" value="11276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+				<stat name="duty_cycle" value="1.0"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="1200"/>
+			<param name="type" value="1"/>
+			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+				at each time only one node can send req -->
+			<param name="horizontal_nodes" value="1"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="1"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="8"/>
+			<param name="output_ports" value="7"/>
+			<!-- For bus the I/O ports should be 1 -->
+			<param name="virtual_channel_per_port" value="2"/>
+			<param name="input_buffer_entries_per_vc" value="128"/>
+			<param name="flit_bits" value="40"/>
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+				chip_coverage <=1 -->
+			<param name="link_routing_over_percentage" value="1.0"/>
+			<!-- Links can route over other components or occupy whole area.
+				by default, 50% of the NoC global links routes over other 
+				components -->
+			<stat name="total_accesses" value="100000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="1"/>
+		</component>		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="180"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- current version of McPAT uses published values for base parameters of memory controller
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="800"/><!--MHz-->
+			<param name="peak_transfer_rate" value="1600"/><!--MB/S-->
+			<param name="block_size" value="16"/><!--B-->
+			<param name="number_mcs" value="2"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="2"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="32"/>
+			<param name="addressbus_width" value="32"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="6666"/>
+			<stat name="memory_reads" value="3333"/>
+			<stat name="memory_writes" value="3333"/>
+			<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Finer-grained tracking can easily be added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculates
+			the average power per nic or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculates
+			the average power per pcie controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculates
+			the average power per fc or per channel. This is sufficient for most applications -->
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/Niagara1.xml b/src/gpuwattch/Niagara1.xml
new file mode 100644
index 000000000..ae748e246
--- /dev/null
+++ b/src/gpuwattch/Niagara1.xml
@@ -0,0 +1,442 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="8"/>
+		<param name="number_of_L1Directories" value="4"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="4"/> <!-- Number of L2 clusters; within each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="90"/><!-- nm -->
+		<param name="target_core_clockrate" value="1200"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!-- The page size above (virtual_memory_page_size, in bytes) is completely different from the page size in the main memory section. It is the size of a
+			virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank. -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="1200"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- Address width determines the tag_width in the cache, LSQ, and cache controller buffers;
+			the default value is machine_bits if not set. -->
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="4"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
+			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of the cache lines of the L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of the issue window and other logic,
+			as in the complexity-effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of the register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- The current version of McPAT does not distinguish between int and floating point pipelines.
+			These parameters are reserved for future use. -->
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="6,6"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="0.125"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="16"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="16"/>
+			<param name="fp_instruction_window_size" value="16"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="80"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="8"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+			they will always try to execute out of order, though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+			as well as the ports of the Dcache, which is connected to the LSU -->
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats; they define the simulation period and require total, idle, and busy cycles for a sanity check -->
+			<!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="800000"/>
+			<stat name="int_instructions" value="600000"/>
+			<stat name="fp_instructions" value="20000"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="100000"/>
+			<stat name="store_instructions" value="100000"/>
+			<stat name="committed_instructions" value="800000"/>
+			<stat name="committed_int_instructions" value="600000"/>
+			<stat name="committed_fp_instructions" value="20000"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only;
+				please ignore them for homogeneous cores -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS-based and physical-register-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="1600000"/>
+			<stat name="float_regfile_reads" value="40000"/>
+			<stat name="int_regfile_writes" value="800000"/>
+			<stat name="float_regfile_writes" value="20000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns) -->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="800000"/>			
+			<stat name="fpu_accesses" value="10000"/>
+			<stat name="mul_accesses" value="100000"/>
+			<stat name="cdb_alu_accesses" value="1000000"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!-- Currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last level cache.
+			The same rule applies to all cache access stats too! -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="0.4"/>
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="800000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy: 0 no write or write-through with non-write-allocate; 1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1200"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1200"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1200"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="1200"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="2"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="8"/>
+			<param name="output_ports" value="5"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="136"/>
+			<param name="input_buffer_entries_per_vc" value="2"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NoCs are present, each NoC covers part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.6"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+			improvements to the MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="4"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Further breakdown can easily be added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
+				 The lower bound on the clock rate of a 10Gb MAC is 150MHz. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+			the average power per NIC or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
+				 The lower bound on the clock rate of the per-lane PCIe logic is 120MHz. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!-- Per-controller sustainable peak rate, MB/s -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+			the average power per flash controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/Niagara1_sharing.xml b/src/gpuwattch/Niagara1_sharing.xml
new file mode 100644
index 000000000..93531aebd
--- /dev/null
+++ b/src/gpuwattch/Niagara1_sharing.xml
@@ -0,0 +1,400 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="64"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="22"/><!-- nm -->
+		<param name="target_core_clockrate" value="3500"/><!--MHz -->
+		<param name="temperature" value="360"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!--The virtual_memory_page_size above (B) is completely different from the page size in the main memory section. This page size is the size of 
+			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="3500"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in Cache, LSQ and buffers in the cache controller; 
+			default value is machine_bits, if not set --> 
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="4"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-threaded processors;
+			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since 
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.--> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of Issue window and other logic 
+			as in the complexity effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines. 
+			These parameters are reserved for future use.--> 
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="6,6"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="0.125"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="16"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="16"/>
+			<param name="fp_instruction_window_size" value="16"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="80"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="8"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha).
+			They will always try to execute out of order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="800000"/>
+			<stat name="int_instructions" value="600000"/>
+			<stat name="fp_instructions" value="20000"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="100000"/>
+			<stat name="store_instructions" value="100000"/>
+			<stat name="committed_instructions" value="800000"/>
+			<stat name="committed_int_instructions" value="600000"/>
+			<stat name="committed_fp_instructions" value="20000"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only, 
+				please ignore them if the cores are homogeneous -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats; both RS-based and physical-register-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="1600000"/>
+			<stat name="float_regfile_reads" value="40000"/>
+			<stat name="int_regfile_writes" value="800000"/>
+			<stat name="float_regfile_writes" value="20000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="800000"/>			
+			<stat name="fpu_accesses" value="10000"/>
+			<stat name="mul_accesses" value="100000"/>
+			<stat name="cdb_alu_accesses" value="1000000"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  Currently the performance simulator should 
+			make sure all the numbers are final numbers, 
+			including the explicit read/write accesses 
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last-level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="0.4"/>
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="800000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to icache although writes happen to it after a miss, 
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- Although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- Although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="3500"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="8"/>
+			<param name="vertical_nodes" value="8"/>
+			<param name="has_global_link" value="1"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="5"/>
+			<param name="output_ports" value="5"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="256"/>
+			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.1"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for base parameters of the memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="llc_line_length" value="64"/><!--B-->
+			<param name="number_mcs" value="4"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+			the average power per MC or per channel. This is sufficient for most applications. 
+			Further trackdown can be easily added in later versions. -->  			
+		</component>
+<!--**********************************************************************-->
+	</component>
+</component>
diff --git a/src/gpuwattch/Niagara1_sharing_DC.xml b/src/gpuwattch/Niagara1_sharing_DC.xml
new file mode 100644
index 000000000..574ec8157
--- /dev/null
+++ b/src/gpuwattch/Niagara1_sharing_DC.xml
@@ -0,0 +1,442 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="64"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="8"/>
+		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherece hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="22"/><!-- nm -->
+		<param name="target_core_clockrate" value="3500"/><!--MHz -->
+		<param name="temperature" value="360"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!-- The virtual_memory_page_size above (B) is completely different from the page size in the main memory section. It is the size of
+			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="3500"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in the cache, LSQ, and buffers in the cache controller;
+			the default value is machine_bits, if not set -->
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="4"/>
+			<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
+			it only may be more than one in SMT processors. BTB ports always equals to fetch ports since 
+			branch information in consective branch instructions in the same fetch group can be read out from BTB once.--> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determins the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determins the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determins the number of ports of Issue window and other logic 
+			as in the complexity effective proccessors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determins the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines 
+			Theses parameters are reserved for future use.--> 
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="6,6"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="0.125"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="16"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="16"/>
+			<param name="fp_instruction_window_size" value="16"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="80"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="8"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha);
+			they will always try to execute out of order, though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats; these define the simulation period. Total, idle, and busy cycles are required for a sanity check -->
+			<!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="800000"/>
+			<stat name="int_instructions" value="600000"/>
+			<stat name="fp_instructions" value="20000"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="100000"/>
+			<stat name="store_instructions" value="100000"/>
+			<stat name="committed_instructions" value="800000"/>
+			<stat name="committed_int_instructions" value="600000"/>
+			<stat name="committed_fp_instructions" value="20000"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only;
+				please ignore them if the cores are homogeneous -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS-based and physical-register-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="1600000"/>
+			<stat name="float_regfile_reads" value="40000"/>
+			<stat name="int_regfile_writes" value="800000"/>
+			<stat name="float_regfile_writes" value="20000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of Windowes switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="800000"/>			
+			<stat name="fpu_accesses" value="10000"/>
+			<stat name="mul_accesses" value="100000"/>
+			<stat name="cdb_alu_accesses" value="1000000"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!-- Currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last-level cache.
+			The same rule applies to all cache access stats too! -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="0.4"/>
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="800000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,9,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="3500"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="8"/>
+			<param name="vertical_nodes" value="8"/>
+			<param name="has_global_link" value="1"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="5"/>
+			<param name="output_ports" value="5"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="256"/>
+			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is propotional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.1"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+			improvements to the MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="0"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Finer-grained tracking can easily be added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns.
+				 The lower bound of the clock rate of a 10Gb MAC is 150MHz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate 
+			the average power per nic or per channel. This is sufficent for most application. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns.
+				 The lower bound of the clock rate of the PCIe per-lane logic is 120MHz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth  -->
+			<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate 
+			the average power per pcie controller or per channel. This is sufficent for most application. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable reak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth  -->
+			<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate 
+			the average power per fc or per channel. This is sufficent for most application -->  			
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/Niagara1_sharing_SBT.xml b/src/gpuwattch/Niagara1_sharing_SBT.xml
new file mode 100644
index 000000000..32eeca382
--- /dev/null
+++ b/src/gpuwattch/Niagara1_sharing_SBT.xml
@@ -0,0 +1,455 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="64"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="64"/> <!-- This number means how many L2 clusters there are; in each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherece hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="22"/><!-- nm -->
+		<param name="target_core_clockrate" value="3500"/><!--MHz -->
+		<param name="temperature" value="360"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: agressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of 
+			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="3500"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determins the tag_width in Cache, LSQ and buffers in cache controller 
+			default value is machine_bits, if not set --> 
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="4"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in single-thread processors;
+			it may only be more than one in SMT processors. BTB ports always equal fetch ports since 
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.--> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determins the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determins the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determins the number of ports of Issue window and other logic 
+			as in the complexity effective proccessors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determins the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines 
+			Theses parameters are reserved for future use.--> 
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="6,6"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="0.125"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="16"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="16"/>
+			<param name="fp_instruction_window_size" value="16"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="80"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="8"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (OoO, Alpha).
+			They will always try to execute out-of-order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instrucions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="800000"/>
+			<stat name="int_instructions" value="600000"/>
+			<stat name="fp_instructions" value="20000"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="100000"/>
+			<stat name="store_instructions" value="100000"/>
+			<stat name="committed_instructions" value="800000"/>
+			<stat name="committed_int_instructions" value="600000"/>
+			<stat name="committed_fp_instructions" value="20000"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneouse cores only, 
+				please ignore them if homogeneouse cores -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats, both RS and Phy based OoOs have ROB
+			performance simulator should capture the difference on accesses,
+			otherwise, McPAT has to guess based on number of commited instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="1600000"/>
+			<stat name="float_regfile_reads" value="40000"/>
+			<stat name="int_regfile_writes" value="800000"/>
+			<stat name="float_regfile_writes" value="20000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of Windowes switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="800000"/>			
+			<stat name="fpu_accesses" value="10000"/>
+			<stat name="mul_accesses" value="100000"/>
+			<stat name="cdb_alu_accesses" value="1000000"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should 
+			make sure all the numbers are final numbers, 
+			including the explicit read/write accesses, 
+			and the implicite accesses such as replacements and etc.
+			Future versions of McPAT may be able to reason the implicite access
+			based on param and stats of last level cache
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="0.4"/>
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="800000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss, 
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- altough there are multiple access types, 
+				Performance simulator needs to cast them into reads or writes
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
+				<param name="Dir_config" value="8388608,9,0,1,100, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,8"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- altough there are multiple access types, 
+				Performance simulator needs to cast them into reads or writes
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+			    <param name="merged_dir" value="1"/><!--if static bank tag is used as the directory -->
+				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+				<stat name="coherent_read_accesses" value="400000"/>
+				<stat name="coherent_write_accesses" value="0"/>
+				<stat name="coherent_read_misses" value="400000"/>
+				<stat name="coherent_write_misses" value="0"/>
+			    <stat name="dir_duty_cycle" value="0.5"/>
+			
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>
+				<param name="Merged_dir" value="1"/><!--if static bank tag is used as the directory -->
+				<stat name="coherent_read_accesses" value="400000"/>
+				<stat name="coherent_write_accesses" value="0"/>
+				<stat name="coherent_read_misses" value="400000"/>
+				<stat name="coherent_write_misses" value="0"/>
+			    <stat name="dir_duty_cycle" value="0.5"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="3500"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="8"/>
+			<param name="vertical_nodes" value="8"/>
+			<param name="has_global_link" value="1"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="5"/>
+			<param name="output_ports" value="5"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="256"/>
+			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.1"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="0"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual mc, instead, it takes the total accesses and calculate 
+			the average power per MC or per channel. This is sufficent for most application. 
+			Further trackdown can be easily added in later versions. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual nic, instead, it takes the total accesses and calculate 
+			the average power per nic or per channel. This is sufficent for most application. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth  -->
+			<!-- McPAT does not track individual pcie controllers, instead, it takes the total accesses and calculate 
+			the average power per pcie controller or per channel. This is sufficent for most application. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achived load to total achivable bandwidth  -->
+			<!-- McPAT does not track individual flash controller, instead, it takes the total accesses and calculate 
+			the average power per fc or per channel. This is sufficent for most application -->  			
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/Niagara1_sharing_ST.xml b/src/gpuwattch/Niagara1_sharing_ST.xml
new file mode 100644
index 000000000..3f0573fe9
--- /dev/null
+++ b/src/gpuwattch/Niagara1_sharing_ST.xml
@@ -0,0 +1,443 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="64"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="1"/>
+		<param name="number_of_L2s" value="64"/> <!-- Number of L2 clusters; each cluster can contain multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="22"/><!-- nm -->
+		<param name="target_core_clockrate" value="3500"/><!--MHz -->
+		<param name="temperature" value="360"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of 
+			a virtual memory page from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="3500"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller; 
+			the default value is machine_bits, if not set --> 
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="4"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+			it may only be more than one in SMT processors. BTB ports always equal the fetch ports since 
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.--> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of Issue window and other logic 
+			as in the complexity effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines. 
+			These parameters are reserved for future use.--> 
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="6,6"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="0.125"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="16"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="16"/>
+			<param name="fp_instruction_window_size" value="16"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="80"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="8"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+			they will always try to execute out-of-order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats, define simulation periods; require total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="800000"/>
+			<stat name="int_instructions" value="600000"/>
+			<stat name="fp_instructions" value="20000"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="100000"/>
+			<stat name="store_instructions" value="100000"/>
+			<stat name="committed_instructions" value="800000"/>
+			<stat name="committed_int_instructions" value="600000"/>
+			<stat name="committed_fp_instructions" value="20000"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+			<!-- the following cycle stats are used for heterogeneous cores only, 
+				please ignore them if cores are homogeneous -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS-based and physical-register-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="1600000"/>
+			<stat name="float_regfile_reads" value="40000"/>
+			<stat name="int_regfile_writes" value="800000"/>
+			<stat name="float_regfile_writes" value="20000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="800000"/>			
+			<stat name="fpu_accesses" value="10000"/>
+			<stat name="mul_accesses" value="100000"/>
+			<stat name="cdb_alu_accesses" value="1000000"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should 
+			make sure all the numbers are final numbers, 
+			including the explicit read/write accesses, 
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="0.4"/>
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="800000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss; 
+				such a write is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy;//0 no write or write-through with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="8192,16,4,1,1,3,16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache, 2 static-cache bank -->	
+				<param name="Dir_config" value="8388608,9,0,1,100, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3500"/>
+				<param name="ports" value="0,0,8"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="1048576,64,16,1, 4,23, 64, 1"/>
+			    <param name="Merged_dir" value="1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="Merged_dir" value="1"/>
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="3500"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="8"/>
+			<param name="vertical_nodes" value="8"/>
+			<param name="has_global_link" value="1"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="5"/>
+			<param name="output_ports" value="5"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="256"/>
+			<param name="input_buffer_entries_per_vc" value="4"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NoCs are present, each NoC covers part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.1"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- current version of McPAT uses published values for base parameters of memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="0"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+			the average power per MC or per channel. This is sufficient for most applications. 
+			Further breakdown can be easily added in later versions. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates 
+			the average power per NIC or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates 
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates 
+			the average power per flash controller or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		</component>
+</component>
\ No newline at end of file
diff --git a/src/gpuwattch/Niagara2.xml b/src/gpuwattch/Niagara2.xml
new file mode 100644
index 000000000..c7e311ff8
--- /dev/null
+++ b/src/gpuwattch/Niagara2.xml
@@ -0,0 +1,438 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="8"/>
+		<param name="number_of_L1Directories" value="8"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="8"/> <!-- Number of L2 clusters; each cluster can have multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!-- cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="65"/><!-- nm -->
+		<param name="target_core_clockrate" value="1400"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!-- 0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!-- The virtual_memory_page_size (B) above is completely different from the page size in the main memory section: it is the page size of 
+			virtual memory from the OS/architecture perspective, whereas the page size in the main memory section is the actual physical line in a DRAM bank. -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="1400"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- The address width determines the tag_width in the cache, the LSQ, and the buffers in the cache controller; 
+			the default value is machine_bits if not set. --> 
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="4"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since 
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. --> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of the issue window and other logic, 
+			as in the complexity-effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- The current version of McPAT does not distinguish between integer and floating-point pipelines. 
+			These parameters are reserved for future use. --> 
+			<param name="pipelines_per_core" value="2,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="8,8"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="2"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="0"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="1"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="32"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="16"/>
+			<param name="fp_instruction_window_size" value="16"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="80"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="80"/>
+			<param name="phy_Regs_FRF_size" value="80"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="8"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+			they will always try to execute out of order, though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="64"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="64"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the number of ports of the load and store buffers,
+			as well as the ports of the Dcache that is connected to the LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="32"/>						
+			<!-- general stats, defining the simulation period; total, idle, and busy cycles are required for a sanity check -->
+			<!-- please note: if the target architecture is x86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="1600000"/>
+			<stat name="int_instructions" value="1200000"/>
+			<stat name="fp_instructions" value="40000"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="200000"/>
+			<stat name="store_instructions" value="200000"/>
+			<stat name="committed_instructions" value="1600000"/>
+			<stat name="committed_int_instructions" value="1200000"/>
+			<stat name="committed_fp_instructions" value="40000"/>
+			<stat name="pipeline_duty_cycle" value="0.5"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only; 
+				please ignore them for homogeneous cores -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS-based and physical-register-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="3200000"/>
+			<stat name="float_regfile_reads" value="80000"/>
+			<stat name="int_regfile_writes" value="1600000"/>
+			<stat name="float_regfile_writes" value="40000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns) -->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="1600000"/>			
+			<stat name="fpu_accesses" value="10000"/>
+			<stat name="mul_accesses" value="100000"/>
+			<stat name="cdb_alu_accesses" value="1200000"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!-- Currently, the performance simulator should 
+			make sure all the numbers are final numbers, 
+			including the explicit read/write accesses 
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			from the params and stats of the last-level cache.
+			The same rule applies to all cache access stats too! -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.5"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="0.5"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0"/>
+			<stat name="FPU_duty_cycle" value="0.6"/>
+			<!--FPU also handles Mul/div -->
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.6"/>	
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="64"/>
+				<stat name="total_accesses" value="800000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss; 
+				such a write is actually a replacement -->
+				<param name="icache_config" value="16384,32,8,1,1,7,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy: 0 no write, or write-through with no-write-allocate; 1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="128"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="8192,16,4,1, 1,3, 16,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1024,2,0,1,1,1, 8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- Although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes;
+				e.g., invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- Although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes;
+				e.g., invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>			    
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="524228,64,16,1, 8,23, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="400000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="1"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64, 1"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.35"/>				
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="1400"/>
+			<param name="horizontal_nodes" value="2"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="9"/>
+			<param name="output_ports" value="8"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="136"/>
+			<param name="input_buffer_entries_per_vc" value="16"/><!-- VCs within the same port share input buffers whose size is proportional to the number of VCs -->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="160000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+		    <stat name="duty_cycle" value="0.1"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+			<!-- number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip -->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+			improvements to the MC model will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="400"/><!--MHz-->
+			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--(B) the block size of last level cache, which is the unit for one memory burst transfer -->
+			<param name="number_mcs" value="4"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="66666"/>
+			<stat name="memory_reads" value="33333"/>
+			<stat name="memory_writes" value="33333"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+			the average power per MC or per channel. This is sufficient for most applications. 
+			Further breakdown can easily be added in later versions. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="2"/> <!-- unlike PCIe and memory controllers, each Ethernet controller has only one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates 
+			the average power per NIC or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="1"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates 
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!-- Per-controller sustainable peak rate, MB/s -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates 
+			the average power per flash controller or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/Penryn.xml b/src/gpuwattch/Penryn.xml
new file mode 100644
index 000000000..fe9715b77
--- /dev/null
+++ b/src/gpuwattch/Penryn.xml
@@ -0,0 +1,456 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="2"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+		<param name="Private_L2" value="0"/><!--1 Private, 0 shared/coherent -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="45"/><!-- nm -->
+		<param name="target_core_clockrate" value="3700"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when appropriate -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+			default value is machine_bits, if not set -->
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
+			virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="3700"/>
+			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+			<param name="opt_local" value="1"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="16"/>
+			<param name="x86" value="1"/>
+			<param name="micro_opcode_width" value="8"/>
+			<param name="machine_type" value="0"/>
+			<!-- inorder/OoO; 1 inorder; 0 OOO-->
+			<param name="number_hardware_threads" value="1"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-threaded processor;
+			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports since
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB once.-->
+			<param name="fetch_width" value="4"/>
+			<!-- fetch_width determins the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="4"/>
+			<!-- decode_width determins the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="4"/>
+			<param name="peak_issue_width" value="6"/><!--As shown in Wiki figure which has max 5 ports, store data/address is modeled 
+														  as a single port.-->
+			<!-- issue_width determins the number of ports of Issue window and other logic 
+			as in the complexity effective proccessors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="4"/>
+			<!-- commit_width determins the number of ports of register files -->
+			<param name="fp_issue_width" value="2"/>
+			<param name="prediction_width" value="1"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
+			These parameters are reserved for future use.-->
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="14,14"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="6"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="2"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="32"/><!--Inst. + micro-op -->
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="1"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="32"/>
+			<param name="fp_instruction_window_size" value="32"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="96"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->			
+			<param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="256"/>
+			<param name="phy_Regs_FRF_size" value="256"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha).
+			They will always try to execute out of order, though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="96"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="48"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="2"/>	
+			<!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="64"/>						
+			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="400000"/>
+			<stat name="int_instructions" value="200000"/>
+			<stat name="fp_instructions" value="100000"/>
+			<stat name="branch_instructions" value="100000"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="0"/>
+			<stat name="store_instructions" value="50000"/>
+			<stat name="committed_instructions" value="400000"/>
+			<stat name="committed_int_instructions" value="200000"/>
+			<stat name="committed_fp_instructions" value="100000"/>
+			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only,
+				please ignore them if using homogeneous cores -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats, both RS and Phy based OoOs have ROB;
+			the performance simulator should capture the difference in accesses,
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="400000"/>
+			<stat name="ROB_writes" value="400000"/>
+			<!-- RAT accesses -->
+			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+			<stat name="fp_rename_reads" value="200000"/>
+			<stat name="fp_rename_writes" value="100000"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="400000"/>
+			<stat name="inst_window_writes" value="400000"/>
+			<stat name="inst_window_wakeup_accesses" value="800000"/>
+			<stat name="fp_inst_window_reads" value="200000"/>
+			<stat name="fp_inst_window_writes" value="200000"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="600000"/>
+			<stat name="float_regfile_reads" value="100000"/>
+			<stat name="int_regfile_writes" value="300000"/>
+			<stat name="float_regfile_writes" value="50000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="300000"/>			
+			<stat name="fpu_accesses" value="100000"/>
+			<stat name="mul_accesses" value="200000"/>
+			<stat name="cdb_alu_accesses" value="300000"/>
+			<stat name="cdb_mul_accesses" value="200000"/>
+			<stat name="cdb_fpu_accesses" value="100000"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses,
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="1"/>			
+			<stat name="LSU_duty_cycle" value="0.5"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.5"/>
+			<stat name="ALU_duty_cycle" value="1"/>
+			<stat name="MUL_duty_cycle" value="0.3"/>
+			<stat name="FPU_duty_cycle" value="0.3"/>
+			<stat name="ALU_cdb_duty_cycle" value="1"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.3"/>
+			<param name="number_of_BPT" value="2"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="128"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="32768,32,8,1,4,4,32,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="256"/><!--dual threads-->
+				<stat name="total_accesses" value="400000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="32768,32,8,1, 4,6, 32,1 "/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<param name="number_of_BTB" value="2"/>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+				<stat name="write_accesses" value="0"/>
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>	
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="6291456,64, 16, 8, 8, 23, 32, 1"/> 
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="3700"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="1.0"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<param name="clockrate" value="850"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="11824"/>
+				<stat name="write_accesses" value="11276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+				<stat name="duty_cycle" value="1.0"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="3400"/>
+			<param name="type" value="0"/>
+			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+				at each time only one node can send req -->
+			<param name="horizontal_nodes" value="1"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="1"/>
+			<param name="output_ports" value="1"/>
+			<!-- For bus the I/O ports should be 1 -->
+			<param name="flit_bits" value="256"/>
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+				chip_coverage <=1 -->
+			<param name="link_routing_over_percentage" value="0.5"/>
+			<!-- Links can route over other components or occupy whole area.
+				by default, 50% of the NoC global links routes over other 
+				components -->
+			<stat name="total_accesses" value="100000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="1"/>
+		</component>		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- current version of McPAT uses published values for base parameters of the memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="0"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Further breakdown can be easily added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+			the average power per NIC or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+			the average power per flash controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
+
diff --git a/src/gpuwattch/README b/src/gpuwattch/README
new file mode 100644
index 000000000..4887b1037
--- /dev/null
+++ b/src/gpuwattch/README
@@ -0,0 +1,226 @@
+ __  __      ____   _  _____   ____       _         
+|  \/  | ___|  _ \ / \|_   _| | __ )  ___| |_  __ _ 
+| |\/| |/ __| |_) / _ \ | |   |  _ \ / _ \ __|/ _` |
+| |  | | (__|  __/ ___ \| |   | |_) |  __/ |_| (_| |
+|_|  |_|\___|_| /_/   \_\_|   |____/ \___|\__|\__,_|
+
+McPAT: Multicore Power, Area, and Timing
+Current version 0.8Beta 
+===============================
+
+McPAT is an architectural modeling tool for chip multiprocessors (CMPs).
+The main focus of McPAT is accurate power and area
+modeling, and a target clock rate is used as a design constraint. 
+McPAT performs automatic extensive search to find optimal designs 
+that satisfy the target clock frequency.  
+
+For complete documentation of McPAT, please refer to the McPAT 1.0
+technical report and the following paper,
+"McPAT: An Integrated Power, Area, and Timing Modeling
+ Framework for Multicore and Manycore Architectures", 
+that appears in MICRO 2009. Please cite the paper, if you use
+McPAT in your work. The bibtex entry is provided below for your convenience.
+
+ @inproceedings{mcpat:micro,
+ author = {Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay B. Brockman and Dean M. Tullsen and Norman P. Jouppi},
+ title =  "{McPAT: An Integrated Power, Area, and Timing Modeling Framework for Multicore and Manycore Architectures}",
+ booktitle = {MICRO 42: Proceedings of the 42nd Annual IEEE/ACM International Symposium on Microarchitecture},
+ year = {2009},
+ pages = {469--480},
+ }
+
+Current McPAT is in its beta release. 
+List of features of beta release
+===============================
+The following is the list of features supported by the tool.
+
+* Power, area, and timing models for CMPs with:
+      Inorder cores both single and multithreaded
+      OOO cores both single and multithreaded
+      Shared/coherent caches with directory hardware:
+      	including directory cache, shadowed tag directory
+      	and static bank mapped tag directory
+      Network-on-Chip
+      On-chip memory controllers
+    
+* Internal models are based on real modern processors:
+  Inorder models are based on Sun Niagara family
+  OOO models are based on Intel P6 for reservation 
+  station based OOO cores, and on Intel Netburst and 
+  Alpha 21264 for physical register file based OOO cores.     
+
+* Leakage power modeling considers both sub-threshold leakage 
+  and gate leakage power. The impact of operating temperature 
+  on both kinds of leakage power is considered. Longer channel devices 
+  that can reduce leakage significantly with modest performance 
+  penalty are also modeled.
+  
+* McPAT supports automatic extensive search to find optimal designs 
+  that satisfy the target clock frequency. The timing constraints 
+  include both throughput and latency.
+
+* Interconnect model with different delay, power, and area 
+  properties, as well as both the aggressive and conservative 
+  interconnect projections on wire technologies. 
+
+* All process specific values used by the McPAT are obtained
+  from ITRS and currently, the McPAT supports 90nm, 65nm, 45nm, 
+  32nm, and 22nm technology nodes. At 32nm and 22nm nodes, SOI 
+  and DG devices are used. After 45nm, Hi-K metal gates are used.
+
+How to use the tool?
+====================
+
+McPAT takes input parameters from an XML-based interface,
+then it computes area and peak power of the target processor.
+Please note that the peak power is the absolute worst case power, 
+which could be even higher than TDP. 
+
+1. Steps to run McPAT:
+   -> define the target processor using inorder.xml or OOO.xml 
+   -> run the "mcpat" binary:
+      ./mcpat -infile <*.xml>  -print_level < level of detailed output>
+      ./mcpat -h (or mcpat --help) will show the quick help message.
+
+   Rather than being hardwired to certain simulators, McPAT 
+   uses an XML-based interface to enable easy integration
+   with various performance simulators. Our collaborator, 
+   Richard Strong, at University of California, San Diego, 
+   designed an experimental parser for the M5 simulator, aiming for 
+   streamlining the integration of McPAT and M5. Please check the M5 
+   repository for the latest version of the parser.
+   
+2. Optimize:
+   McPAT will try its best to satisfy the target clock rate. 
+   When it cannot find a valid solution, it gives out warnings, 
+   while still giving a solution that is closest to the timing 
+   constraints and calculates power based on it. The optimization 
+   will lead to larger power/area numbers for higher target clock
+   rates. McPAT also provides the option "-opt_for_clk" to turn on 
+   ("-opt_for_clk 1") and off this strict optimization for the 
+   timing constraint. When it is off, McPAT always optimizes 
+   components for ED^2P without worrying about meeting the 
+   target clock frequency. By turning it off, the computation time 
+   can be reduced, which suits situations where the target clock rate
+   is conservative.
+  
+3. The output:
+   McPAT outputs results in a hierarchical manner. Increasing 
+   the "-print_level" will show detailed results inside each 
+   component. For each component, major parts are shown, and associated 
+   pipeline registers/control logic are added up in total area/power of each 
+   component. In general, McPAT does not model the area/overhead of the pad 
+   frame used in a processor die.
+   
+4. How to use the XML interface for McPAT 
+   4.1 Set up the parameters
+   		Parameters of target designs need to be set in the *.xml file for 
+   		entries tagged as "param". McPAT has very detailed parameter settings. 
+   		Please remove the structure parameter from the file if you want 
+   		to use the default values. Otherwise, the parameters in the xml file 
+   		will override the default values. 
+   
+   4.2 Pass the statistics
+   		There are two options to get the correct stats: a) the performance 
+   		simulator can capture all the stats in detail and pass them to McPAT;
+   		b) the performance simulator can capture only partial stats and pass 
+   		them to McPAT, while McPAT can reason about the complete stats using 
+        the partial information and the configuration. Therefore, there is 
+        some overlap in the stats. 
+   
+   4.3 Interface XML file structures (PLEASE READ!)
+   			The XML is hierarchical from processor level to micro-architecture 
+   		level. McPAT supports both heterogeneous and homogeneous manycore processors. 
+   		
+   			1). For heterogeneous processor setup, each component (core, NoC, cache, 
+   		etc.) must have its own instantiations (core0, core1, ..., coreN). 
+   		Each instantiation will have different parameters as well as its stats.
+   		Thus, the XML file must have multiple "instantiations" of each type of 
+   		heterogeneous component and the corresponding hetero flags must be set 
+   		in the XML file. Then the stats in the XML should be the stats of "a" instantiation 
+   		(e.g. "a" cores). The reported runtime dynamic is of a single instantiation 
+   		(e.g. "a" cores). Since the stats for each (e.g. "a" cores) may be different,
+   		we will see a whole list of (e.g. "a" cores) with different dynamic power,
+   		and total power is just a sum of them.  
+   		
+   			2). For homogeneous processors, the same method for heterogeneous can 
+   		also be used by treating all homogeneous instantiations as heterogeneous. 
+   		However, a preferred approach is to use a single representative for all 
+   		the same components (e.g. core0 to represent all cores) and set the 
+   		processor to have homogeneous components (e.g. 
+   		<param name="homogeneous_cores" value="1"/>). Thus, the XML file only has one instantiation to represent 
+   		all others with the same architectural parameters. The corresponding homo 
+   		flags must be set in the XML file.  Then, the stats in the XML should be 
+   		the aggregated stats of the sum of all instantiations (e.g. aggregated stats 
+   		of all cores). In the final results, McPAT will only report a single 
+   		instantiation of each type of component, and the reported runtime dynamic power
+   		is the sum of all instantiations of the same type. This approach can run fast 
+   		and use much less memory.        
+
+5. Guide for integrating McPAT into performance simulators and bypassing the XML interface
+   		The detailed work flow of McPAT has two phases: the initialization phase and
+   the computation phase. Specifically, in order to start the initialization phase a 
+   user specifies static configurations, including parameters at all three levels, 
+   namely, architectural, circuit, and technology levels. During the initialization 
+   phase, McPAT will generate the internal chip representation using the configurations 
+   set by the user. 
+   		The computation phase of McPAT is called by McPAT or the performance simulator 
+   during simulation to generate runtime power numbers. Before calling McPAT to 
+   compute runtime power numbers, the performance simulator needs to pass the 
+   statistics, namely, the activity factors of each individual component to McPAT 
+   via the XML interface. 
+   		The initialization phase is very time-consuming, since it will repeat many 
+   times until valid configurations are found or the possible configurations are 
+   exhausted. To reduce the overhead, a user can let the simulator call McPAT 
+   directly for the computation phase and call the initialization phase only once at the 
+   beginning of simulation. In this case, the XML interface file is bypassed; 
+   please refer to processor.cc to see how the two phases are called.
+   
+6. Sample input files:
+   This package provides sample XML files for validating target processors. Please find the 
+   enclosed Niagara1.xml (for the Sun Niagara1 processor), Niagara2.xml (for the Sun Niagara2 
+   processor), Alpha21364.xml (for the Alpha21364 processor), and Xeon.xml (for the Intel 
+   Xeon Tulsa processor). 
+   
+   Special instructions for using Xeon.xml:
+   McPAT uses ITRS device types including HP, LSTP, and LOP. Although most 
+   designs follow ITRS projections, there are designs with special technologies. 
+   For example, the 65nm Xeon Tulsa processor uses 1.25 V rather than 1.1V 
+   for the core voltage domain, which results in the changes in threshold voltage,
+   leakage current density, saturation current, etc., besides the different 
+   supply voltage. We use MASTAR to match the special technology as used in Xeon 
+   core domain. Therefore, in order to generate accurate results of Xeon 
+   Tulsa cores, users need to run "make TAR=mcpatXeonCore" and use the generated 
+   special executable. The L3 cache and buses must be computed using standard 
+   ITRS technology.    
+    
+
+====================
+McPAT is in its beginning stage. We are still improving 
+the tool and refining the code. Please come back to its website 
+for newer versions. If you have any comments, 
+questions, or suggestions, please write to us.
+
+Version history and roadmap
+
+McPAT Alpha:      released Sep. 2009 Experimental release
+McPAT Beta (0.6): released Nov. 2009 New code base and technology base
+McPAT Beta (0.7): released May. 2010 Added various new models, 
+                  including long channel devices, buses model; together
+                  with bug fixes and extensive code optimization to reduce 
+                  memory usage.  
+McPAT Beta (0.8): released Aug. 2010 Added various new models, 
+                  including on-chip 10Gb ethernet units, PCIe, and flash controllers.
+Next major release:     
+McPAT 1.0:        including advanced power-saving states
+
+Future releases may include the modeling of embedded low-power 
+processors as well as vector processors and GPGPUs.             
+                  
+
+Sheng Li             
+sheng.li@hp.com 
+
+
+
+
diff --git a/src/gpuwattch/XML_Parse.cc b/src/gpuwattch/XML_Parse.cc
new file mode 100644
index 000000000..d7ff4499a
--- /dev/null
+++ b/src/gpuwattch/XML_Parse.cc
@@ -0,0 +1,4586 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ * Jingwen Leng, University of Texas, Austin
+ * Syed Gilani, University of Wisconsin–Madison
+ * Tayler Hetherington, University of British Columbia
+ * Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#include "XML_Parse.h"
+#include "xmlParser.h"
+#include <stdio.h>
+#include <string>
+
+using namespace std;
+
+const char *perf_count_label[] = { // Counter labels as text; every entry except the last ends in ','.
+    "TOT_INST,",    "FP_INT,",  "IC_H,",     "IC_M,",        "DC_RH,",
+    "DC_RM,",       "DC_WH,",   "DC_WM,",    "TC_H,",        "TC_M,",
+    "CC_H,",        "CC_M,",    "SHRD_ACC,", "REG_RD,",      "REG_WR,",
+    "NON_REG_OPs,", "SP_ACC,",  "SFU_ACC,",  "FPU_ACC,",     "MEM_RD,",
+    "MEM_WR,",      "MEM_PRE,", "L2_RH,",    "L2_RM,",       "L2_WH,",
+    "L2_WM,",       "NOC_A,",   "PIPE_A,",   "IDLE_CORE_N,", "CONST_DYNAMICN"}; // Same names/order as the scaling_coefficients params read in ParseXML::parse; presumably indexed by that enum - TODO confirm in XML_Parse.h.
+
+void ParseXML::parse(char *filepath) {
+  unsigned int i, j, k, m, n;
+  unsigned int NumofCom_4;
+  unsigned int itmp;
+  // Initialize all structures
+  ParseXML::initialize();
+  string strtmp;
+  char chtmp[60];
+  char chtmp1[60];
+  chtmp1[0] = '\0';
+  // this open and parse the XML file:
+  XMLNode xMainNode = XMLNode::openFileHelper(
+      filepath, "component"); // the 'component' in the first layer
+
+  XMLNode xNode2 = xMainNode.getChildNode(
+      "component"); // the 'component' in the second layer
+  // get all params in the second layer
+  itmp = xNode2.nChildNode("param");
+  for (i = 0; i < itmp; i++) {
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "GPU_Architecture") == 0) {
+      sys.GPU_Architecture =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_cores") == 0) {
+      sys.number_of_cores =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "architecture") == 0) {
+      sys.architecture =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_L1Directories") == 0) {
+      sys.number_of_L1Directories =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_L2Directories") == 0) {
+      sys.number_of_L2Directories =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_L2s") == 0) {
+      sys.number_of_L2s =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "Private_L2") == 0) {
+      sys.Private_L2 =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_L3s") == 0) {
+      sys.number_of_L3s =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_NoCs") == 0) {
+      sys.number_of_NoCs =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_of_dir_levels") == 0) {
+      sys.number_of_dir_levels =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "domain_size") == 0) {
+      sys.domain_size =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "first_level_dir") == 0) {
+      sys.first_level_dir =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_cores") == 0) {
+      sys.homogeneous_cores =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "core_tech_node") == 0) {
+      sys.core_tech_node =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "target_core_clockrate") == 0) {
+      sys.target_core_clockrate =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "target_chip_area") == 0) {
+      sys.target_chip_area =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "temperature") == 0) {
+      sys.temperature =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "number_cache_levels") == 0) {
+      sys.number_cache_levels =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "L1_property") == 0) {
+      sys.L1_property =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "L2_property") == 0) {
+      sys.L2_property =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_L2s") == 0) {
+      sys.homogeneous_L2s =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_L1Directories") == 0) {
+      sys.homogeneous_L1Directories =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_L2Directories") == 0) {
+      sys.homogeneous_L2Directories =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "L3_property") == 0) {
+      sys.L3_property =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_L3s") == 0) {
+      sys.homogeneous_L3s =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_ccs") == 0) {
+      sys.homogeneous_ccs =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "homogeneous_NoCs") == 0) {
+      sys.homogeneous_NoCs =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "Max_area_deviation") == 0) {
+      sys.Max_area_deviation =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "Max_power_deviation") == 0) {
+      sys.Max_power_deviation =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "device_type") == 0) {
+      sys.device_type =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "longer_channel_device") == 0) {
+      sys.longer_channel_device =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "opt_dynamic_power") == 0) {
+      sys.opt_dynamic_power =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "opt_lakage_power") == 0) {
+      sys.opt_lakage_power =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "opt_clockrate") == 0) {
+      sys.opt_clockrate =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "opt_area") == 0) {
+      sys.opt_area =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "Embedded") == 0) {
+      sys.Embedded =
+          (bool)atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "interconnect_projection_type") == 0) {
+      sys.interconnect_projection_type =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value")) == 0 ? 0
+                                                                           : 1;
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "machine_bits") == 0) {
+      sys.machine_bits =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "virtual_address_width") == 0) {
+      sys.virtual_address_width =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "physical_address_width") == 0) {
+      sys.physical_address_width =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "virtual_memory_page_size") == 0) {
+      sys.virtual_memory_page_size =
+          atoi(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "idle_core_power") == 0) {
+      sys.idle_core_power =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "TOT_INST") == 0) {
+      sys.scaling_coefficients[TOT_INST] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "FP_INT") == 0) {
+      sys.scaling_coefficients[FP_INT] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "IC_H") ==
+        0) {
+      sys.scaling_coefficients[IC_H] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "IC_M") ==
+        0) {
+      sys.scaling_coefficients[IC_M] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "DC_RH") ==
+        0) {
+      sys.scaling_coefficients[DC_RH] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "DC_RM") ==
+        0) {
+      sys.scaling_coefficients[DC_RM] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "DC_WH") ==
+        0) {
+      sys.scaling_coefficients[DC_WH] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "DC_WM") ==
+        0) {
+      sys.scaling_coefficients[DC_WM] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "TC_H") ==
+        0) {
+      sys.scaling_coefficients[TC_H] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "TC_M") ==
+        0) {
+      sys.scaling_coefficients[TC_M] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "CC_H") ==
+        0) {
+      sys.scaling_coefficients[CC_H] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "CC_M") ==
+        0) {
+      sys.scaling_coefficients[CC_M] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "SHRD_ACC") == 0) {
+      sys.scaling_coefficients[SHRD_ACC] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "REG_RD") == 0) {
+      sys.scaling_coefficients[REG_RD] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "REG_WR") == 0) {
+      sys.scaling_coefficients[REG_WR] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "NON_REG_OPs") == 0) {
+      sys.scaling_coefficients[NON_REG_OPs] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "SP_ACC") == 0) {
+      sys.scaling_coefficients[SP_ACC] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "SFU_ACC") == 0) {
+      sys.scaling_coefficients[SFU_ACC] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "FPU_ACC") == 0) {
+      sys.scaling_coefficients[FPU_ACC] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "MEM_RD") == 0) {
+      sys.scaling_coefficients[MEM_RD] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "MEM_WR") == 0) {
+      sys.scaling_coefficients[MEM_WR] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "MEM_PRE") == 0) {
+      sys.scaling_coefficients[MEM_PRE] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "L2_RH") ==
+        0) {
+      sys.scaling_coefficients[L2_RH] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "L2_RM") ==
+        0) {
+      sys.scaling_coefficients[L2_RM] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "L2_WH") ==
+        0) {
+      sys.scaling_coefficients[L2_WH] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "L2_WM") ==
+        0) {
+      sys.scaling_coefficients[L2_WM] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "NOC_A") ==
+        0) {
+      sys.scaling_coefficients[NOC_A] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "PIPE_A") == 0) {
+      sys.scaling_coefficients[PIPE_A] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "IDLE_CORE_N") == 0) {
+      sys.scaling_coefficients[IDLE_CORE_N] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"),
+               "CONST_DYNAMICN") == 0) {
+      sys.scaling_coefficients[CONST_DYNAMICN] =
+          atof(xNode2.getChildNode("param", i).getAttribute("value"));
+      continue;
+    }
+
+    /*
+                    if
+       (strcmp(xNode2.getChildNode("param",i).getAttribute("name"),"scaling_coefficients")==0)
+                    {
+                            strtmp.assign(xNode2.getChildNode("param",i).getAttribute("value"));
+                            m=0;
+                            for(n=0; n<strtmp.length(); n++)
+                            {
+                                    if (strtmp[n]!=',')
+                                    {
+                                            sprintf(chtmp,"%c",strtmp[n]);
+                                            strcat(chtmp1,chtmp);
+                                    }
+                                    else{
+                                            sys.scaling_coefficients[m]=atof(chtmp1);
+                                            m++;
+                                            chtmp1[0]='\0';
+                                    }
+                            }
+                            sys.scaling_coefficients[m]=atof(chtmp1);
+                            m++;
+                            chtmp1[0]='\0';
+                            continue;
+                    }
+    */
+  }
+
+  //	if (sys.Private_L2 && sys.number_of_cores!=sys.number_of_L2s)
+  //	{
+  //		cout<<"Private L2: Number of L2s must equal to Number of
+  // Cores"<<endl; 		exit(0);
+  //	}
+
+  itmp = xNode2.nChildNode("stat");
+  for (i = 0; i < itmp; i++) {
+    if (strcmp(xNode2.getChildNode("stat", i).getAttribute("name"),
+               "total_cycles") == 0) {
+      sys.total_cycles =
+          atof(xNode2.getChildNode("stat", i).getAttribute("value"));
+      continue;
+    }
+    if (strcmp(xNode2.getChildNode("stat", i).getAttribute("name"),
+               "num_idle_cores") == 0) {
+      sys.num_idle_cores =
+          atoi(xNode2.getChildNode("stat", i).getAttribute("value"));
+      continue;
+    }
+  }
+
+  // get the number of components within the second layer
+  unsigned int NumofCom_3 = xNode2.nChildNode("component");
+  XMLNode xNode3, xNode4; // define the third-layer(system.core0) and
+                          // fourth-layer(system.core0.predictor) xnodes
+
+  unsigned int OrderofComponents_3layer = 0;
+  if (NumofCom_3 > OrderofComponents_3layer) {
+    //___________________________get all
+    // system.core0-n________________________________________________
+    if (sys.homogeneous_cores == 1)
+      OrderofComponents_3layer = 0;
+    else
+      OrderofComponents_3layer = sys.number_of_cores - 1;
+    for (i = 0; i <= OrderofComponents_3layer; i++) {
+      xNode3 = xNode2.getChildNode("component", i);
+      if (xNode3.isEmpty() == 1) {
+        printf("The value of homogeneous_cores or number_of_cores is not "
+               "correct!");
+        exit(0);
+      } else {
+        if (strstr(xNode3.getAttribute("name"), "core") != NULL) {
+          { // For cpu0-cpui
+            // Get all params with system.core?
+            itmp = xNode3.nChildNode("param");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "clock_rate") == 0) {
+                sys.core[i].clock_rate =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "opt_local") == 0) {
+                sys.core[i].opt_local = (bool)atoi(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "x86") == 0) {
+                sys.core[i].x86 = (bool)atoi(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "machine_bits") == 0) {
+                sys.core[i].machine_bits =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "virtual_address_width") == 0) {
+                sys.core[i].virtual_address_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "physical_address_width") == 0) {
+                sys.core[i].physical_address_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "instruction_length") == 0) {
+                sys.core[i].instruction_length =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "opcode_width") == 0) {
+                sys.core[i].opcode_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "micro_opcode_width") == 0) {
+                sys.core[i].micro_opcode_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "machine_type") == 0) {
+                sys.core[i].machine_type =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "internal_datapath_width") == 0) {
+                sys.core[i].internal_datapath_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "number_hardware_threads") == 0) {
+                sys.core[i].number_hardware_threads =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "fetch_width") == 0) {
+                sys.core[i].fetch_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "number_instruction_fetch_ports") == 0) {
+                sys.core[i].number_instruction_fetch_ports =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "decode_width") == 0) {
+                sys.core[i].decode_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "issue_width") == 0) {
+                sys.core[i].issue_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "peak_issue_width") == 0) {
+                sys.core[i].peak_issue_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "commit_width") == 0) {
+                sys.core[i].commit_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "fp_issue_width") == 0) {
+                sys.core[i].fp_issue_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "prediction_width") == 0) {
+                sys.core[i].prediction_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "pipelines_per_core") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.core[i].pipelines_per_core[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.core[i].pipelines_per_core[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "pipeline_depth") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.core[i].pipeline_depth[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.core[i].pipeline_depth[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "FPU") == 0) {
+                strcpy(sys.core[i].FPU,
+                       xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "divider_multiplier") == 0) {
+                strcpy(sys.core[i].divider_multiplier,
+                       xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "ALU_per_core") == 0) {
+                sys.core[i].ALU_per_core =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "FPU_per_core") == 0) {
+                sys.core[i].FPU_per_core =
+                    atof(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "MUL_per_core") == 0) {
+                sys.core[i].MUL_per_core =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "instruction_buffer_size") == 0) {
+                sys.core[i].instruction_buffer_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "decoded_stream_buffer_size") == 0) {
+                sys.core[i].decoded_stream_buffer_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "instruction_window_scheme") == 0) {
+                sys.core[i].instruction_window_scheme =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "instruction_window_size") == 0) {
+                sys.core[i].instruction_window_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "fp_instruction_window_size") == 0) {
+                sys.core[i].fp_instruction_window_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "ROB_size") == 0) {
+                sys.core[i].ROB_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "rf_banks") == 0) {
+                sys.core[i].rf_banks =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "simd_width") == 0) {
+                sys.core[i].simd_width =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "collector_units") == 0) {
+                sys.core[i].collector_units =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "core_clock_ratio") == 0) {
+                sys.core[i].core_clock_ratio =
+                    atof(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "warp_size") == 0) {
+                sys.core[i].warp_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "archi_Regs_IRF_size") == 0) {
+                sys.core[i].archi_Regs_IRF_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "archi_Regs_FRF_size") == 0) {
+                sys.core[i].archi_Regs_FRF_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "phy_Regs_IRF_size") == 0) {
+                sys.core[i].phy_Regs_IRF_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "phy_Regs_FRF_size") == 0) {
+                sys.core[i].phy_Regs_FRF_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "rename_scheme") == 0) {
+                sys.core[i].rename_scheme =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "register_windows_size") == 0) {
+                sys.core[i].register_windows_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "LSU_order") == 0) {
+                strcpy(sys.core[i].LSU_order,
+                       xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "store_buffer_size") == 0) {
+                sys.core[i].store_buffer_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "load_buffer_size") == 0) {
+                sys.core[i].load_buffer_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "memory_ports") == 0) {
+                sys.core[i].memory_ports =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "Dcache_dual_pump") == 0) {
+                strcpy(sys.core[i].Dcache_dual_pump,
+                       xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "RAS_size") == 0) {
+                sys.core[i].RAS_size =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+            }
+            // Get all stats with system.core?
+            itmp = xNode3.nChildNode("stat");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_instructions") == 0) {
+                sys.core[i].total_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "int_instructions") == 0) {
+                sys.core[i].int_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fp_instructions") == 0) {
+                sys.core[i].fp_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "branch_instructions") == 0) {
+                sys.core[i].branch_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "branch_mispredictions") == 0) {
+                sys.core[i].branch_mispredictions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "committed_instructions") == 0) {
+                sys.core[i].committed_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "committed_int_instructions") == 0) {
+                sys.core[i].committed_int_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "committed_fp_instructions") == 0) {
+                sys.core[i].committed_fp_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "load_instructions") == 0) {
+                sys.core[i].load_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "store_instructions") == 0) {
+                sys.core[i].store_instructions =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_cycles") == 0) {
+                sys.core[i].total_cycles =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "idle_cycles") == 0) {
+                sys.core[i].idle_cycles =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "busy_cycles") == 0) {
+                sys.core[i].busy_cycles =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "instruction_buffer_reads") == 0) {
+                sys.core[i].instruction_buffer_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "instruction_buffer_write") == 0) {
+                sys.core[i].instruction_buffer_write =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "ROB_reads") == 0) {
+                sys.core[i].ROB_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "ROB_writes") == 0) {
+                sys.core[i].ROB_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "rename_reads") == 0) {
+                sys.core[i].rename_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "rename_writes") == 0) {
+                sys.core[i].rename_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fp_rename_reads") == 0) {
+                sys.core[i].fp_rename_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fp_rename_writes") == 0) {
+                sys.core[i].fp_rename_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "inst_window_reads") == 0) {
+                sys.core[i].inst_window_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "inst_window_writes") == 0) {
+                sys.core[i].inst_window_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "inst_window_wakeup_accesses") == 0) {
+                sys.core[i].inst_window_wakeup_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "inst_window_selections") == 0) {
+                sys.core[i].inst_window_selections =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fp_inst_window_reads") == 0) {
+                sys.core[i].fp_inst_window_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fp_inst_window_writes") == 0) {
+                sys.core[i].fp_inst_window_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fp_inst_window_wakeup_accesses") == 0) {
+                sys.core[i].fp_inst_window_wakeup_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "archi_int_regfile_reads") == 0) {
+                sys.core[i].archi_int_regfile_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "archi_float_regfile_reads") == 0) {
+                sys.core[i].archi_float_regfile_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "phy_int_regfile_reads") == 0) {
+                sys.core[i].phy_int_regfile_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "phy_float_regfile_reads") == 0) {
+                sys.core[i].phy_float_regfile_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "phy_int_regfile_writes") == 0) { // store in the same-named field, as the *_regfile_reads handlers do
+                sys.core[i].phy_int_regfile_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "phy_float_regfile_writes") == 0) { // was cross-wired to archi_float_regfile_writes
+                sys.core[i].phy_float_regfile_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "archi_int_regfile_writes") == 0) { // was cross-wired to phy_int_regfile_writes
+                sys.core[i].archi_int_regfile_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "archi_float_regfile_writes") == 0) { // was cross-wired to phy_float_regfile_writes
+                sys.core[i].archi_float_regfile_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "int_regfile_reads") == 0) {
+                sys.core[i].int_regfile_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "float_regfile_reads") == 0) {
+                sys.core[i].float_regfile_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "int_regfile_writes") == 0) {
+                sys.core[i].int_regfile_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "float_regfile_writes") == 0) {
+                sys.core[i].float_regfile_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "non_rf_operands") == 0) {
+                sys.core[i].non_rf_operands =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "windowed_reg_accesses") == 0) {
+                sys.core[i].windowed_reg_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "windowed_reg_transports") == 0) {
+                sys.core[i].windowed_reg_transports =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "function_calls") == 0) {
+                sys.core[i].function_calls =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "context_switches") == 0) {
+                sys.core[i].context_switches =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "ialu_accesses") == 0) {
+                sys.core[i].ialu_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fpu_accesses") == 0) {
+                sys.core[i].fpu_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "mul_accesses") == 0) {
+                sys.core[i].mul_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "cdb_alu_accesses") == 0) {
+                sys.core[i].cdb_alu_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "cdb_mul_accesses") == 0) {
+                sys.core[i].cdb_mul_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "cdb_fpu_accesses") == 0) {
+                sys.core[i].cdb_fpu_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "load_buffer_reads") == 0) {
+                sys.core[i].load_buffer_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "load_buffer_writes") == 0) {
+                sys.core[i].load_buffer_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "load_buffer_cams") == 0) {
+                sys.core[i].load_buffer_cams =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "store_buffer_reads") == 0) {
+                sys.core[i].store_buffer_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "store_buffer_writes") == 0) {
+                sys.core[i].store_buffer_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "store_buffer_cams") == 0) {
+                sys.core[i].store_buffer_cams =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "store_buffer_forwards") == 0) {
+                sys.core[i].store_buffer_forwards =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "main_memory_access") == 0) {
+                sys.core[i].main_memory_access =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "main_memory_read") == 0) {
+                sys.core[i].main_memory_read =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "main_memory_write") == 0) { // parse with atof like main_memory_access/main_memory_read
+                sys.core[i].main_memory_write = // atoi would drop a fractional/exponent part and cannot represent values beyond int
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "pipeline_duty_cycle") == 0) {
+                sys.core[i].pipeline_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "IFU_duty_cycle") == 0) {
+                sys.core[i].IFU_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "BR_duty_cycle") == 0) {
+                sys.core[i].BR_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "LSU_duty_cycle") == 0) {
+                sys.core[i].LSU_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "MemManU_I_duty_cycle") == 0) {
+                sys.core[i].MemManU_I_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "MemManU_D_duty_cycle") == 0) {
+                sys.core[i].MemManU_D_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "ALU_duty_cycle") == 0) {
+                sys.core[i].ALU_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "MUL_duty_cycle") == 0) {
+                sys.core[i].MUL_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "FPU_duty_cycle") == 0) {
+                sys.core[i].FPU_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "ALU_cdb_duty_cycle") == 0) {
+                sys.core[i].ALU_cdb_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "MUL_cdb_duty_cycle") == 0) {
+                sys.core[i].MUL_cdb_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "FPU_cdb_duty_cycle") == 0) { // parse with atof like every other *_duty_cycle stat
+                sys.core[i].FPU_cdb_duty_cycle = // atoi would truncate a fractional value such as 0.3 to 0
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+            }
+          }
+
+          NumofCom_4 =
+              xNode3.nChildNode("component"); // get the number of components
+                                              // within the third layer
+          for (j = 0; j < NumofCom_4; j++) {
+            xNode4 = xNode3.getChildNode("component", j);
+            if (strcmp(xNode4.getAttribute("name"), "PBT") == 0) { // find PBT
+              itmp = xNode4.nChildNode("param");
+              for (k = 0; k < itmp; k++) { // get all items of param in
+                                           // system.core0.predictor--PBT
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "prediction_width") == 0) {
+                  sys.core[i].predictor.prediction_width = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "prediction_scheme") == 0) {
+                  strcpy(sys.core[i].predictor.prediction_scheme,
+                         xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "predictor_size") == 0) {
+                  sys.core[i].predictor.predictor_size = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "predictor_entries") == 0) {
+                  sys.core[i].predictor.predictor_entries = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "local_predictor_size") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0;
+                  for (n = 0; n < strtmp.length(); n++) {
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp);
+                    } else {
+                      sys.core[i].predictor.local_predictor_size[m] =
+                          atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0';
+                    }
+                  }
+                  sys.core[i].predictor.local_predictor_size[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "local_predictor_entries") == 0) {
+                  sys.core[i].predictor.local_predictor_entries = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "global_predictor_entries") == 0) {
+                  sys.core[i].predictor.global_predictor_entries = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "global_predictor_bits") == 0) {
+                  sys.core[i].predictor.global_predictor_bits = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "chooser_predictor_entries") == 0) {
+                  sys.core[i].predictor.chooser_predictor_entries = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "chooser_predictor_bits") == 0) {
+                  sys.core[i].predictor.chooser_predictor_bits = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  continue;
+                }
+              }
+              itmp = xNode4.nChildNode("stat");
+              for (k = 0; k < itmp; k++) { // get all items of stat in
+                                           // system.core0.predictor--PBT
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "predictor_accesses") == 0)
+                  sys.core[i].predictor.predictor_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+              }
+            }
+            if (strcmp(xNode4.getAttribute("name"), "itlb") ==
+                0) { // find system.core0.itlb
+              itmp = xNode4.nChildNode("param");
+              for (k = 0; k < itmp;
+                   k++) { // get all items of param in system.core0.itlb--itlb
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "number_entries") == 0)
+                  sys.core[i].itlb.number_entries = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+              }
+              itmp = xNode4.nChildNode("stat");
+              for (k = 0; k < itmp; k++) { // get all items of stat in itlb
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].itlb.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].itlb.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].itlb.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].itlb.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              }
+            }
+            if (strcmp(xNode4.getAttribute("name"), "icache") ==
+                0) { // find system.core0.icache
+              itmp = xNode4.nChildNode("param");
+              for (k = 0; k < itmp; k++) { // get all items of param in
+                                           // system.core0.icache--icache
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "icache_config") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0;
+                  for (n = 0; n < strtmp.length(); n++) {
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp);
+                    } else {
+                      sys.core[i].icache.icache_config[m] = atof(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0';
+                    }
+                  }
+                  sys.core[i].icache.icache_config[m] = atof(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "buffer_sizes") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0;
+                  for (n = 0; n < strtmp.length(); n++) {
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp);
+                    } else {
+                      sys.core[i].icache.buffer_sizes[m] = atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0';
+                    }
+                  }
+                  sys.core[i].icache.buffer_sizes[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              itmp = xNode4.nChildNode("stat");
+              for (k = 0; k < itmp; k++) {
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].icache.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].icache.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].icache.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "replacements") == 0) {
+                  sys.core[i].icache.replacements = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].icache.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].icache.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].icache.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "miss_buffer_access") == 0) {
+                  sys.core[i].icache.miss_buffer_access = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "fill_buffer_accesses") == 0) {
+                  sys.core[i].icache.fill_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_accesses") == 0) {
+                  sys.core[i].icache.prefetch_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_writes") == 0) {
+                  sys.core[i].icache.prefetch_buffer_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_reads") == 0) {
+                  sys.core[i].icache.prefetch_buffer_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_hits") == 0) {
+                  sys.core[i].icache.prefetch_buffer_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].icache.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              }
+            }
+            if (strcmp(xNode4.getAttribute("name"), "dtlb") ==
+                0) { // find system.core0.dtlb
+              itmp = xNode4.nChildNode("param");
+              for (k = 0; k < itmp;
+                   k++) { // get all items of param in system.core0.dtlb--dtlb
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "number_entries") == 0)
+                  sys.core[i].dtlb.number_entries = atoi(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+              }
+              itmp = xNode4.nChildNode("stat");
+              for (k = 0; k < itmp; k++) { // get all items of stat in dtlb
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].dtlb.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].dtlb.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_accesses") == 0) {
+                  sys.core[i].dtlb.write_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].dtlb.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_hits") == 0) {
+                  sys.core[i].dtlb.write_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].dtlb.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_misses") == 0) {
+                  sys.core[i].dtlb.write_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].dtlb.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].dtlb.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].dtlb.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              }
+            }
+
+            // Added by Jingwen: load param/stat children of the "ccache" node
+            if (strcmp(xNode4.getAttribute("name"), "ccache") ==
+                0) { // matched the ccache node (e.g. system.core0.ccache)
+              itmp = xNode4.nChildNode("param"); // count of "param" children
+              for (k = 0; k < itmp; k++) { // visit every "param" child of
+                                           // the ccache node
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "ccache_config") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0; // index of the next dcache_config slot to fill
+                  for (n = 0; n < strtmp.length(); n++) { // split on ','
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp); // NOTE(review): no length check
+                    } else {
+                      sys.core[i].ccache.dcache_config[m] = atof(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0'; // empty the accumulator
+                    }
+                  }
+                  sys.core[i].ccache.dcache_config[m] = atof(chtmp1); // tail
+                  m++;
+                  chtmp1[0] = '\0'; // leave accumulator empty for next use
+                  continue; // this param is handled; go to the next one
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "buffer_sizes") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0; // index of the next buffer_sizes slot to fill
+                  for (n = 0; n < strtmp.length(); n++) { // split on ','
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp); // NOTE(review): no length check
+                    } else {
+                      sys.core[i].ccache.buffer_sizes[m] = atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0'; // empty the accumulator
+                    }
+                  }
+                  sys.core[i].ccache.buffer_sizes[m] = atoi(chtmp1); // tail
+                  m++;
+                  chtmp1[0] = '\0'; // leave accumulator empty for next use
+                } // last test in this loop body, so no continue is needed
+              }
+              itmp = xNode4.nChildNode("stat"); // count of "stat" children
+              for (k = 0; k < itmp; k++) { // copy each known stat into ccache
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].ccache.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue; // name matched; skip the remaining comparisons
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].ccache.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_accesses") == 0) {
+                  sys.core[i].ccache.write_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].ccache.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].ccache.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].ccache.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_hits") == 0) {
+                  sys.core[i].ccache.write_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].ccache.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_misses") == 0) {
+                  sys.core[i].ccache.write_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "replacements") == 0) {
+                  sys.core[i].ccache.replacements = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_backs") == 0) {
+                  sys.core[i].ccache.write_backs = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "miss_buffer_access") == 0) {
+                  sys.core[i].ccache.miss_buffer_access = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "fill_buffer_accesses") == 0) {
+                  sys.core[i].ccache.fill_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_accesses") == 0) {
+                  sys.core[i].ccache.prefetch_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_writes") == 0) {
+                  sys.core[i].ccache.prefetch_buffer_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_reads") == 0) {
+                  sys.core[i].ccache.prefetch_buffer_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_hits") == 0) {
+                  sys.core[i].ccache.prefetch_buffer_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_writes") == 0) {
+                  sys.core[i].ccache.wbb_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_reads") == 0) {
+                  sys.core[i].ccache.wbb_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].ccache.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              } // stats with any other name are left untouched
+            }
+
+            // tcache: load param/stat children of the "tcache" node
+            if (strcmp(xNode4.getAttribute("name"), "tcache") ==
+                0) { // matched the tcache node (e.g. system.core0.tcache)
+              itmp = xNode4.nChildNode("param"); // count of "param" children
+              for (k = 0; k < itmp; k++) { // visit every "param" child of
+                                           // the tcache node
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "tcache_config") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0; // index of the next dcache_config slot to fill
+                  for (n = 0; n < strtmp.length(); n++) { // split on ','
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp); // NOTE(review): no length check
+                    } else {
+                      sys.core[i].tcache.dcache_config[m] = atof(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0'; // empty the accumulator
+                    }
+                  }
+                  sys.core[i].tcache.dcache_config[m] = atof(chtmp1); // tail
+                  m++;
+                  chtmp1[0] = '\0'; // leave accumulator empty for next use
+                  continue; // this param is handled; go to the next one
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "buffer_sizes") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0; // index of the next buffer_sizes slot to fill
+                  for (n = 0; n < strtmp.length(); n++) { // split on ','
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp); // NOTE(review): no length check
+                    } else {
+                      sys.core[i].tcache.buffer_sizes[m] = atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0'; // empty the accumulator
+                    }
+                  }
+                  sys.core[i].tcache.buffer_sizes[m] = atoi(chtmp1); // tail
+                  m++;
+                  chtmp1[0] = '\0'; // leave accumulator empty for next use
+                } // last test in this loop body, so no continue is needed
+              }
+              itmp = xNode4.nChildNode("stat"); // count of "stat" children
+              for (k = 0; k < itmp; k++) { // copy each known stat into tcache
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].tcache.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue; // name matched; skip the remaining comparisons
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].tcache.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_accesses") == 0) {
+                  sys.core[i].tcache.write_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].tcache.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].tcache.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].tcache.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_hits") == 0) {
+                  sys.core[i].tcache.write_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].tcache.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_misses") == 0) {
+                  sys.core[i].tcache.write_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "replacements") == 0) {
+                  sys.core[i].tcache.replacements = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_backs") == 0) {
+                  sys.core[i].tcache.write_backs = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "miss_buffer_access") == 0) {
+                  sys.core[i].tcache.miss_buffer_access = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "fill_buffer_accesses") == 0) {
+                  sys.core[i].tcache.fill_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_accesses") == 0) {
+                  sys.core[i].tcache.prefetch_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_writes") == 0) {
+                  sys.core[i].tcache.prefetch_buffer_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_reads") == 0) {
+                  sys.core[i].tcache.prefetch_buffer_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_hits") == 0) {
+                  sys.core[i].tcache.prefetch_buffer_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_writes") == 0) {
+                  sys.core[i].tcache.wbb_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_reads") == 0) {
+                  sys.core[i].tcache.wbb_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].tcache.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              } // stats with any other name are left untouched
+            }
+
+            if (strcmp(xNode4.getAttribute("name"), "sharedmemory") ==
+                0) { // matched the node named "sharedmemory"
+              itmp = xNode4.nChildNode("param"); // count of "param" children
+              for (k = 0; k < itmp;
+                   k++) { // visit every "param" child of the
+                          // sharedmemory node (system.core0.sharedmemory)
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "sharedmemory_config") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0; // index of the next dcache_config slot to fill
+                  for (n = 0; n < strtmp.length(); n++) { // split on ','
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp); // NOTE(review): no length check
+                    } else {
+                      sys.core[i].sharedmemory.dcache_config[m] = atof(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0'; // empty the accumulator
+                    }
+                  }
+                  sys.core[i].sharedmemory.dcache_config[m] = atof(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0'; // leave accumulator empty for next use
+                  continue; // this param is handled; go to the next one
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "buffer_sizes") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0; // index of the next buffer_sizes slot to fill
+                  for (n = 0; n < strtmp.length(); n++) { // split on ','
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp); // NOTE(review): no length check
+                    } else {
+                      sys.core[i].sharedmemory.buffer_sizes[m] = atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0'; // empty the accumulator
+                    }
+                  }
+                  sys.core[i].sharedmemory.buffer_sizes[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0'; // leave accumulator empty for next use
+                } // last test in this loop body, so no continue is needed
+              }
+              itmp = xNode4.nChildNode("stat"); // count of "stat" children
+              for (k = 0; k < itmp;
+                   k++) { // copy each known stat into sharedmemory
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].sharedmemory.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue; // name matched; skip the remaining comparisons
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].sharedmemory.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_accesses") == 0) {
+                  sys.core[i].sharedmemory.write_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].sharedmemory.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].sharedmemory.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].sharedmemory.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_hits") == 0) {
+                  sys.core[i].sharedmemory.write_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].sharedmemory.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_misses") == 0) {
+                  sys.core[i].sharedmemory.write_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "replacements") == 0) {
+                  sys.core[i].sharedmemory.replacements = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_backs") == 0) {
+                  sys.core[i].sharedmemory.write_backs = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "miss_buffer_access") == 0) {
+                  sys.core[i].sharedmemory.miss_buffer_access = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "fill_buffer_accesses") == 0) {
+                  sys.core[i].sharedmemory.fill_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_accesses") == 0) {
+                  sys.core[i].sharedmemory.prefetch_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_writes") == 0) {
+                  sys.core[i].sharedmemory.prefetch_buffer_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_reads") == 0) {
+                  sys.core[i].sharedmemory.prefetch_buffer_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_hits") == 0) {
+                  sys.core[i].sharedmemory.prefetch_buffer_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_writes") == 0) {
+                  sys.core[i].sharedmemory.wbb_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_reads") == 0) {
+                  sys.core[i].sharedmemory.wbb_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].sharedmemory.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              } // stats with any other name are left untouched
+            }
+
+            if (strcmp(xNode4.getAttribute("name"), "dcache") ==
+                0) { // find system.core0.dcache
+              itmp = xNode4.nChildNode("param");
+              for (k = 0; k < itmp; k++) { // get all items of param in
+                                           // system.core0.dcache--dcache
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "dcache_config") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0;
+                  for (n = 0; n < strtmp.length(); n++) {
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp);
+                    } else {
+                      sys.core[i].dcache.dcache_config[m] = atof(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0';
+                    }
+                  }
+                  sys.core[i].dcache.dcache_config[m] = atof(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "buffer_sizes") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0;
+                  for (n = 0; n < strtmp.length(); n++) {
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp);
+                    } else {
+                      sys.core[i].dcache.buffer_sizes[m] = atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0';
+                    }
+                  }
+                  sys.core[i].dcache.buffer_sizes[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              itmp = xNode4.nChildNode("stat");
+              for (k = 0; k < itmp; k++) { // get all items of stat in dcache
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].dcache.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].dcache.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_accesses") == 0) {
+                  sys.core[i].dcache.write_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].dcache.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].dcache.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].dcache.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_hits") == 0) {
+                  sys.core[i].dcache.write_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].dcache.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_misses") == 0) {
+                  sys.core[i].dcache.write_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "replacements") == 0) {
+                  sys.core[i].dcache.replacements = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_backs") == 0) {
+                  sys.core[i].dcache.write_backs = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "miss_buffer_access") == 0) {
+                  sys.core[i].dcache.miss_buffer_access = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "fill_buffer_accesses") == 0) {
+                  sys.core[i].dcache.fill_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_accesses") == 0) {
+                  sys.core[i].dcache.prefetch_buffer_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_writes") == 0) {
+                  sys.core[i].dcache.prefetch_buffer_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_reads") == 0) {
+                  sys.core[i].dcache.prefetch_buffer_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "prefetch_buffer_hits") == 0) {
+                  sys.core[i].dcache.prefetch_buffer_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_writes") == 0) {
+                  sys.core[i].dcache.wbb_writes = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "wbb_reads") == 0) {
+                  sys.core[i].dcache.wbb_reads = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "conflicts") == 0) {
+                  sys.core[i].dcache.conflicts = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              }
+            }
+
+            if (strcmp(xNode4.getAttribute("name"), "BTB") ==
+                0) { // find system.core0.BTB
+              itmp = xNode4.nChildNode("param");
+              for (k = 0; k < itmp;
+                   k++) { // get all items of param in system.core0.BTB--BTB
+                if (strcmp(xNode4.getChildNode("param", k).getAttribute("name"),
+                           "BTB_config") == 0) {
+                  strtmp.assign(
+                      xNode4.getChildNode("param", k).getAttribute("value"));
+                  m = 0;
+                  for (n = 0; n < strtmp.length(); n++) {
+                    if (strtmp[n] != ',') {
+                      sprintf(chtmp, "%c", strtmp[n]);
+                      strcat(chtmp1, chtmp);
+                    } else {
+                      sys.core[i].BTB.BTB_config[m] = atoi(chtmp1);
+                      m++;
+                      chtmp1[0] = '\0';
+                    }
+                  }
+                  sys.core[i].BTB.BTB_config[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              itmp = xNode4.nChildNode("stat");
+              for (k = 0; k < itmp; k++) { // get all items of stat in BTB
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_accesses") == 0) {
+                  sys.core[i].BTB.total_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_accesses") == 0) {
+                  sys.core[i].BTB.read_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_accesses") == 0) {
+                  sys.core[i].BTB.write_accesses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_hits") == 0) {
+                  sys.core[i].BTB.total_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "total_misses") == 0) {
+                  sys.core[i].BTB.total_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_hits") == 0) {
+                  sys.core[i].BTB.read_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_hits") == 0) {
+                  sys.core[i].BTB.write_hits = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "read_misses") == 0) {
+                  sys.core[i].BTB.read_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "write_misses") == 0) {
+                  sys.core[i].BTB.write_misses = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+                if (strcmp(xNode4.getChildNode("stat", k).getAttribute("name"),
+                           "replacements") == 0) {
+                  sys.core[i].BTB.replacements = atof(
+                      xNode4.getChildNode("stat", k).getAttribute("value"));
+                  continue;
+                }
+              }
+            }
+          }
+        } else {
+          printf("The value of homogeneous_cores or number_of_cores is not "
+                 "correct!");
+          exit(0);
+        }
+      }
+    }
+
+    //__________________________________________Get
+    // system.L1Directory0-n____________________________________________
+    int w, tmpOrderofComponents_3layer;
+    w = OrderofComponents_3layer + 1;
+    tmpOrderofComponents_3layer = OrderofComponents_3layer;
+    if (sys.homogeneous_L1Directories == 1)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    else
+      OrderofComponents_3layer =
+          OrderofComponents_3layer + sys.number_of_L1Directories;
+
+    for (i = 0; i < (OrderofComponents_3layer - tmpOrderofComponents_3layer);
+         i++) {
+      xNode3 = xNode2.getChildNode("component", w);
+      if (xNode3.isEmpty() == 1) {
+        printf("The value of homogeneous_L1Directories or "
+               "number_of_L1Directories is not correct!");
+        exit(0);
+      } else {
+        if (strstr(xNode3.getAttribute("id"), "L1Directory") != NULL) {
+          itmp = xNode3.nChildNode("param");
+          for (k = 0; k < itmp;
+               k++) { // get all items of param in system.L1Directory
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "Dir_config") == 0) {
+              strtmp.assign(
+                  xNode3.getChildNode("param", k).getAttribute("value"));
+              m = 0;
+              for (n = 0; n < strtmp.length(); n++) {
+                if (strtmp[n] != ',') {
+                  sprintf(chtmp, "%c", strtmp[n]);
+                  strcat(chtmp1, chtmp);
+                } else {
+                  sys.L1Directory[i].Dir_config[m] = atof(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              sys.L1Directory[i].Dir_config[m] = atof(chtmp1);
+              m++;
+              chtmp1[0] = '\0';
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "buffer_sizes") == 0) {
+              strtmp.assign(
+                  xNode3.getChildNode("param", k).getAttribute("value"));
+              m = 0;
+              for (n = 0; n < strtmp.length(); n++) {
+                if (strtmp[n] != ',') {
+                  sprintf(chtmp, "%c", strtmp[n]);
+                  strcat(chtmp1, chtmp);
+                } else {
+                  sys.L1Directory[i].buffer_sizes[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              sys.L1Directory[i].buffer_sizes[m] = atoi(chtmp1);
+              m++;
+              chtmp1[0] = '\0';
+              continue;
+            }
+
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "clockrate") == 0) {
+              sys.L1Directory[i].clockrate =
+                  atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "ports") == 0) {
+              strtmp.assign(
+                  xNode3.getChildNode("param", k).getAttribute("value"));
+              m = 0;
+              for (n = 0; n < strtmp.length(); n++) {
+                if (strtmp[n] != ',') {
+                  sprintf(chtmp, "%c", strtmp[n]);
+                  strcat(chtmp1, chtmp);
+                } else {
+                  sys.L1Directory[i].ports[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              sys.L1Directory[i].ports[m] = atoi(chtmp1);
+              m++;
+              chtmp1[0] = '\0';
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "device_type") == 0) {
+              sys.L1Directory[i].device_type =
+                  atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "Directory_type") == 0) {
+              sys.L1Directory[i].Directory_type =
+                  atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "3D_stack") == 0) {
+              strcpy(sys.L1Directory[i].threeD_stack,
+                     xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+          }
+          itmp = xNode3.nChildNode("stat");
+          for (k = 0; k < itmp;
+               k++) { // get all items of stat in system.L1Directory
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "total_accesses") == 0) {
+              sys.L1Directory[i].total_accesses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "read_accesses") == 0) {
+              sys.L1Directory[i].read_accesses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "write_accesses") == 0) {
+              sys.L1Directory[i].write_accesses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "read_misses") == 0) {
+              sys.L1Directory[i].read_misses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "write_misses") == 0) {
+              sys.L1Directory[i].write_misses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "conflicts") == 0) {
+              sys.L1Directory[i].conflicts =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "duty_cycle") == 0) {
+              sys.L1Directory[i].duty_cycle =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+          }
+          w = w + 1;
+        } else {
+          printf("The value of homogeneous_L1Directories or "
+                 "number_of_L1Directories is not correct!");
+          exit(0);
+        }
+      }
+    }
+
+    //__________________________________________Get
+    // system.L2Directory0-n____________________________________________
+    w = OrderofComponents_3layer + 1;
+    tmpOrderofComponents_3layer = OrderofComponents_3layer;
+    if (sys.homogeneous_L2Directories == 1)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    else
+      OrderofComponents_3layer =
+          OrderofComponents_3layer + sys.number_of_L2Directories;
+
+    for (i = 0; i < (OrderofComponents_3layer - tmpOrderofComponents_3layer);
+         i++) {
+      xNode3 = xNode2.getChildNode("component", w);
+      if (xNode3.isEmpty() == 1) {
+        printf("The value of homogeneous_L2Directories or "
+               "number_of_L2Directories is not correct!");
+        exit(0);
+      } else {
+        if (strstr(xNode3.getAttribute("id"), "L2Directory") != NULL) {
+          itmp = xNode3.nChildNode("param");
+          for (k = 0; k < itmp;
+               k++) { // get all items of param in system.L2Directory
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "Dir_config") == 0) {
+              strtmp.assign(
+                  xNode3.getChildNode("param", k).getAttribute("value"));
+              m = 0;
+              for (n = 0; n < strtmp.length(); n++) {
+                if (strtmp[n] != ',') {
+                  sprintf(chtmp, "%c", strtmp[n]);
+                  strcat(chtmp1, chtmp);
+                } else {
+                  sys.L2Directory[i].Dir_config[m] = atof(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              sys.L2Directory[i].Dir_config[m] = atof(chtmp1);
+              m++;
+              chtmp1[0] = '\0';
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "buffer_sizes") == 0) {
+              strtmp.assign(
+                  xNode3.getChildNode("param", k).getAttribute("value"));
+              m = 0;
+              for (n = 0; n < strtmp.length(); n++) {
+                if (strtmp[n] != ',') {
+                  sprintf(chtmp, "%c", strtmp[n]);
+                  strcat(chtmp1, chtmp);
+                } else {
+                  sys.L2Directory[i].buffer_sizes[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              sys.L2Directory[i].buffer_sizes[m] = atoi(chtmp1);
+              m++;
+              chtmp1[0] = '\0';
+              continue;
+            }
+
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "clockrate") == 0) {
+              sys.L2Directory[i].clockrate =
+                  atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "Directory_type") == 0) {
+              sys.L2Directory[i].Directory_type =
+                  atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "ports") == 0) {
+              strtmp.assign(
+                  xNode3.getChildNode("param", k).getAttribute("value"));
+              m = 0;
+              for (n = 0; n < strtmp.length(); n++) {
+                if (strtmp[n] != ',') {
+                  sprintf(chtmp, "%c", strtmp[n]);
+                  strcat(chtmp1, chtmp);
+                } else {
+                  sys.L2Directory[i].ports[m] = atoi(chtmp1);
+                  m++;
+                  chtmp1[0] = '\0';
+                }
+              }
+              sys.L2Directory[i].ports[m] = atoi(chtmp1);
+              m++;
+              chtmp1[0] = '\0';
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "device_type") == 0) {
+              sys.L2Directory[i].device_type =
+                  atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                       "3D_stack") == 0) {
+              strcpy(sys.L2Directory[i].threeD_stack,
+                     xNode3.getChildNode("param", k).getAttribute("value"));
+              continue;
+            }
+          }
+          itmp = xNode3.nChildNode("stat");
+          for (k = 0; k < itmp;
+               k++) { // get all items of stat in system.L2Directory
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "total_accesses") == 0) {
+              sys.L2Directory[i].total_accesses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "read_accesses") == 0) {
+              sys.L2Directory[i].read_accesses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "write_accesses") == 0) {
+              sys.L2Directory[i].write_accesses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "read_misses") == 0) {
+              sys.L2Directory[i].read_misses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "write_misses") == 0) {
+              sys.L2Directory[i].write_misses =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "conflicts") == 0) {
+              sys.L2Directory[i].conflicts =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+            if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                       "duty_cycle") == 0) {
+              sys.L2Directory[i].duty_cycle =
+                  atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              continue;
+            }
+          }
+          w = w + 1;
+        } else {
+          printf("The value of homogeneous_L2Directories or "
+                 "number_of_L2Directories is not correct!");
+          exit(0);
+        }
+      }
+    }
+
+    //__________________________________________Get
+    // system.L2[0..n]____________________________________________
+    w = OrderofComponents_3layer + 1;
+    tmpOrderofComponents_3layer = OrderofComponents_3layer;
+    if (sys.homogeneous_L2s == 1)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    else
+      OrderofComponents_3layer = OrderofComponents_3layer + sys.number_of_L2s;
+
+    for (i = 0; i < (OrderofComponents_3layer - tmpOrderofComponents_3layer);
+         i++) {
+      xNode3 = xNode2.getChildNode("component", w);
+      if (xNode3.isEmpty() == 1) {
+        printf("The value of homogeneous_L2s or number_of_L2s is not correct!");
+        exit(0);
+      } else {
+        if (strstr(xNode3.getAttribute("name"), "L2") != NULL) {
+          { // For L20-L2i
+            // Get all params with system.L2?
+            itmp = xNode3.nChildNode("param");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "L2_config") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.L2[i].L2_config[m] = atof(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.L2[i].L2_config[m] = atof(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "clockrate") == 0) {
+                sys.L2[i].clockrate =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "merged_dir") == 0) {
+                sys.L2[i].merged_dir = (bool)atoi(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "ports") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.L2[i].ports[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.L2[i].ports[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "device_type") == 0) {
+                sys.L2[i].device_type =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "threeD_stack") == 0) {
+                strcpy(sys.L2[i].threeD_stack,
+                       (xNode3.getChildNode("param", k).getAttribute("value")));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "buffer_sizes") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.L2[i].buffer_sizes[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.L2[i].buffer_sizes[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+            }
+            // Get all stats with system.L2?
+            itmp = xNode3.nChildNode("stat");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_accesses") == 0) {
+                sys.L2[i].total_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "read_accesses") == 0) {
+                sys.L2[i].read_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_accesses") == 0) {
+                sys.L2[i].write_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_hits") == 0) {
+                sys.L2[i].total_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_misses") == 0) {
+                sys.L2[i].total_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "read_hits") == 0) {
+                sys.L2[i].read_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_hits") == 0) {
+                sys.L2[i].write_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "read_misses") == 0) {
+                sys.L2[i].read_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_misses") == 0) {
+                sys.L2[i].write_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "replacements") == 0) {
+                sys.L2[i].replacements =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_backs") == 0) {
+                sys.L2[i].write_backs =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "miss_buffer_accesses") == 0) {
+                sys.L2[i].miss_buffer_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fill_buffer_accesses") == 0) {
+                sys.L2[i].fill_buffer_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_accesses") == 0) {
+                sys.L2[i].prefetch_buffer_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_writes") == 0) {
+                sys.L2[i].prefetch_buffer_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_reads") == 0) {
+                sys.L2[i].prefetch_buffer_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_hits") == 0) {
+                sys.L2[i].prefetch_buffer_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "wbb_writes") == 0) {
+                sys.L2[i].wbb_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "wbb_reads") == 0) {
+                sys.L2[i].wbb_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "conflicts") == 0) {
+                sys.L2[i].conflicts =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "duty_cycle") == 0) {
+                sys.L2[i].duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_read_accesses") == 0) {
+                sys.L2[i].homenode_read_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_write_accesses") == 0) {
+                sys.L2[i].homenode_write_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue; // read-side twin is handled just above
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_read_hits") == 0) {
+                sys.L2[i].homenode_read_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_write_hits") == 0) {
+                sys.L2[i].homenode_write_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_read_misses") == 0) {
+                sys.L2[i].homenode_read_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_write_misses") == 0) {
+                sys.L2[i].homenode_write_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "dir_duty_cycle") == 0) {
+                sys.L2[i].dir_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+            }
+          }
+          w = w + 1;
+        } else {
+          printf(
+              "The value of homogeneous_L2s or number_of_L2s is not correct!");
+          exit(0);
+        }
+      }
+    }
+    //__________________________________________Get
+    // system.L3[0..n]____________________________________________
+    w = OrderofComponents_3layer + 1;
+    tmpOrderofComponents_3layer = OrderofComponents_3layer;
+    if (sys.homogeneous_L3s == 1)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    else
+      OrderofComponents_3layer = OrderofComponents_3layer + sys.number_of_L3s;
+
+    for (i = 0; i < (OrderofComponents_3layer - tmpOrderofComponents_3layer);
+         i++) {
+      xNode3 = xNode2.getChildNode("component", w);
+      if (xNode3.isEmpty() == 1) {
+        printf("The value of homogeneous_L3s or number_of_L3s is not correct!");
+        exit(0);
+      } else {
+        if (strstr(xNode3.getAttribute("name"), "L3") != NULL) {
+          { // For L30-L3i
+            // Get all params with system.L3?
+            itmp = xNode3.nChildNode("param");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "L3_config") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.L3[i].L3_config[m] = atof(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.L3[i].L3_config[m] = atof(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "clockrate") == 0) {
+                sys.L3[i].clockrate =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "merged_dir") == 0) {
+                sys.L3[i].merged_dir = (bool)atoi(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "ports") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.L3[i].ports[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.L3[i].ports[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "device_type") == 0) {
+                sys.L3[i].device_type =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "threeD_stack") == 0) {
+                strcpy(sys.L3[i].threeD_stack,
+                       (xNode3.getChildNode("param", k).getAttribute("value")));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "buffer_sizes") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.L3[i].buffer_sizes[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.L3[i].buffer_sizes[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+            }
+            // Get all stats with system.L3?
+            itmp = xNode3.nChildNode("stat");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_accesses") == 0) {
+                sys.L3[i].total_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "read_accesses") == 0) {
+                sys.L3[i].read_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_accesses") == 0) {
+                sys.L3[i].write_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_hits") == 0) {
+                sys.L3[i].total_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_misses") == 0) {
+                sys.L3[i].total_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "read_hits") == 0) {
+                sys.L3[i].read_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_hits") == 0) {
+                sys.L3[i].write_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "read_misses") == 0) {
+                sys.L3[i].read_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_misses") == 0) {
+                sys.L3[i].write_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "replacements") == 0) {
+                sys.L3[i].replacements =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "write_backs") == 0) {
+                sys.L3[i].write_backs =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "miss_buffer_accesses") == 0) {
+                sys.L3[i].miss_buffer_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "fill_buffer_accesses") == 0) {
+                sys.L3[i].fill_buffer_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_accesses") == 0) {
+                sys.L3[i].prefetch_buffer_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_writes") == 0) {
+                sys.L3[i].prefetch_buffer_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_reads") == 0) {
+                sys.L3[i].prefetch_buffer_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "prefetch_buffer_hits") == 0) {
+                sys.L3[i].prefetch_buffer_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "wbb_writes") == 0) {
+                sys.L3[i].wbb_writes =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "wbb_reads") == 0) {
+                sys.L3[i].wbb_reads =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "conflicts") == 0) {
+                sys.L3[i].conflicts =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "duty_cycle") == 0) {
+                sys.L3[i].duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_read_accesses") == 0) {
+                sys.L3[i].homenode_read_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_write_accesses") == 0) {
+                sys.L3[i].homenode_write_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue; // read-side twin is handled just above
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_read_hits") == 0) {
+                sys.L3[i].homenode_read_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_write_hits") == 0) {
+                sys.L3[i].homenode_write_hits =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_read_misses") == 0) {
+                sys.L3[i].homenode_read_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "homenode_write_misses") == 0) {
+                sys.L3[i].homenode_write_misses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "dir_duty_cycle") == 0) {
+                sys.L3[i].dir_duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+                continue;
+              }
+            }
+          }
+          w = w + 1;
+        } else {
+          printf(
+              "The value of homogeneous_L3s or number_of_L3s is not correct!");
+          exit(0);
+        }
+      }
+    }
+    //__________________________________________Get
+    // system.NoC[0..n]____________________________________________
+    w = OrderofComponents_3layer + 1;
+    tmpOrderofComponents_3layer = OrderofComponents_3layer;
+    if (sys.homogeneous_NoCs == 1)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    else
+      OrderofComponents_3layer = OrderofComponents_3layer + sys.number_of_NoCs;
+
+    for (i = 0; i < (OrderofComponents_3layer - tmpOrderofComponents_3layer);
+         i++) {
+      xNode3 = xNode2.getChildNode("component", w);
+      if (xNode3.isEmpty() == 1) {
+        printf(
+            "The value of homogeneous_NoCs or number_of_NoCs is not correct!");
+        exit(0);
+      } else {
+        if (strstr(xNode3.getAttribute("name"), "noc") != NULL) {
+          { // For NoC0-NoCi
+            // Get all params with system.NoC?
+            itmp = xNode3.nChildNode("param");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "clockrate") == 0) {
+                sys.NoC[i].clockrate =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "type") == 0) {
+                sys.NoC[i].type = (bool)atoi(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "topology") == 0) {
+                strcpy(sys.NoC[i].topology,
+                       (xNode3.getChildNode("param", k).getAttribute("value")));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "horizontal_nodes") == 0) {
+                sys.NoC[i].horizontal_nodes =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "vertical_nodes") == 0) {
+                sys.NoC[i].vertical_nodes =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "has_global_link") == 0) {
+                sys.NoC[i].has_global_link = (bool)atoi(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "link_throughput") == 0) {
+                sys.NoC[i].link_throughput =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "link_latency") == 0) {
+                sys.NoC[i].link_latency =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "input_ports") == 0) {
+                sys.NoC[i].input_ports =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "output_ports") == 0) {
+                sys.NoC[i].output_ports =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "virtual_channel_per_port") == 0) {
+                sys.NoC[i].virtual_channel_per_port =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "flit_bits") == 0) {
+                sys.NoC[i].flit_bits =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "input_buffer_entries_per_vc") == 0) {
+                sys.NoC[i].input_buffer_entries_per_vc =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "dual_pump") == 0) {
+                sys.NoC[i].dual_pump =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "chip_coverage") == 0) {
+                sys.NoC[i].chip_coverage =
+                    atof(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "link_routing_over_percentage") == 0) {
+                sys.NoC[i].route_over_perc =
+                    atof(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "ports_of_input_buffer") == 0) {
+                strtmp.assign(
+                    xNode3.getChildNode("param", k).getAttribute("value"));
+                m = 0;
+                for (n = 0; n < strtmp.length(); n++) {
+                  if (strtmp[n] != ',') {
+                    sprintf(chtmp, "%c", strtmp[n]);
+                    strcat(chtmp1, chtmp);
+                  } else {
+                    sys.NoC[i].ports_of_input_buffer[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                sys.NoC[i].ports_of_input_buffer[m] = atoi(chtmp1);
+                m++;
+                chtmp1[0] = '\0';
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "number_of_crossbars") == 0) {
+                sys.NoC[i].number_of_crossbars =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "crossbar_type") == 0) {
+                strcpy(sys.NoC[i].crossbar_type,
+                       (xNode3.getChildNode("param", k).getAttribute("value")));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "crosspoint_type") == 0) {
+                strcpy(sys.NoC[i].crosspoint_type,
+                       (xNode3.getChildNode("param", k).getAttribute("value")));
+                continue;
+              }
+              if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                         "arbiter_type") == 0) {
+                sys.NoC[i].arbiter_type =
+                    atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+                continue;
+              }
+            }
+            NumofCom_4 =
+                xNode3.nChildNode("component"); // get the number of components
+                                                // within the third layer
+            for (j = 0; j < NumofCom_4; j++) {
+              xNode4 = xNode3.getChildNode("component", j);
+              if (strcmp(xNode4.getAttribute("name"), "xbar0") ==
+                  0) { // find xbar0
+                itmp = xNode4.nChildNode("param");
+                for (k = 0; k < itmp; k++) { // get all items of param in
+                                             // system.NoC?.xbar0--xbar0
+                  if (strcmp(
+                          xNode4.getChildNode("param", k).getAttribute("name"),
+                          "number_of_inputs_of_crossbars") == 0) {
+                    sys.NoC[i].xbar0.number_of_inputs_of_crossbars = atoi(
+                        xNode4.getChildNode("param", k).getAttribute("value"));
+                    continue;
+                  }
+                  if (strcmp(
+                          xNode4.getChildNode("param", k).getAttribute("name"),
+                          "number_of_outputs_of_crossbars") == 0) {
+                    sys.NoC[i].xbar0.number_of_outputs_of_crossbars = atoi(
+                        xNode4.getChildNode("param", k).getAttribute("value"));
+                    continue;
+                  }
+                  if (strcmp(
+                          xNode4.getChildNode("param", k).getAttribute("name"),
+                          "flit_bits") == 0) {
+                    sys.NoC[i].xbar0.flit_bits = atoi(
+                        xNode4.getChildNode("param", k).getAttribute("value"));
+                    continue;
+                  }
+                  if (strcmp(
+                          xNode4.getChildNode("param", k).getAttribute("name"),
+                          "input_buffer_entries_per_port") == 0) {
+                    sys.NoC[i].xbar0.input_buffer_entries_per_port = atoi(
+                        xNode4.getChildNode("param", k).getAttribute("value"));
+                    continue;
+                  }
+                  if (strcmp(
+                          xNode4.getChildNode("param", k).getAttribute("name"),
+                          "ports_of_input_buffer") == 0) {
+                    strtmp.assign(
+                        xNode4.getChildNode("param", k).getAttribute("value"));
+                    m = 0;
+                    for (n = 0; n < strtmp.length(); n++) {
+                      if (strtmp[n] != ',') {
+                        sprintf(chtmp, "%c", strtmp[n]);
+                        strcat(chtmp1, chtmp);
+                      } else {
+                        sys.NoC[i].xbar0.ports_of_input_buffer[m] =
+                            atoi(chtmp1);
+                        m++;
+                        chtmp1[0] = '\0';
+                      }
+                    }
+                    sys.NoC[i].xbar0.ports_of_input_buffer[m] = atoi(chtmp1);
+                    m++;
+                    chtmp1[0] = '\0';
+                  }
+                }
+                itmp = xNode4.nChildNode("stat");
+                for (k = 0; k < itmp; k++) { // xbar0 stats. NOTE(review):
+                                             // stores to sys.core[i], not NoC
+                  if (strcmp(
+                          xNode4.getChildNode("stat", k).getAttribute("name"),
+                          "predictor_accesses") == 0)
+                    sys.core[i].predictor.predictor_accesses = atof(
+                        xNode4.getChildNode("stat", k).getAttribute("value"));
+                }
+              }
+            }
+            // Get all stats with system.NoC?
+            itmp = xNode3.nChildNode("stat");
+            for (k = 0; k < itmp; k++) {
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "total_accesses") == 0)
+                sys.NoC[i].total_accesses =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+              if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                         "duty_cycle") == 0)
+                sys.NoC[i].duty_cycle =
+                    atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+            }
+          }
+          w = w + 1;
+        }
+      }
+    }
+    //__________________________________________Get
+    // system.mem____________________________________________
+    if (OrderofComponents_3layer > 0)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    xNode3 = xNode2.getChildNode("component", OrderofComponents_3layer);
+    if (xNode3.isEmpty() == 1) {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    if (strstr(xNode3.getAttribute("id"), "system.mem") != NULL) {
+
+      itmp = xNode3.nChildNode("param");
+      for (k = 0; k < itmp; k++) { // get all items of param in system.mem
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "mem_tech_node") == 0) {
+          sys.mem.mem_tech_node =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "device_clock") == 0) {
+          sys.mem.device_clock =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "peak_transfer_rate") == 0) {
+          sys.mem.peak_transfer_rate =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "capacity_per_channel") == 0) {
+          sys.mem.capacity_per_channel =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "number_ranks") == 0) {
+          sys.mem.number_ranks =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "num_banks_of_DRAM_chip") == 0) {
+          sys.mem.num_banks_of_DRAM_chip =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "Block_width_of_DRAM_chip") == 0) {
+          sys.mem.Block_width_of_DRAM_chip =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "output_width_of_DRAM_chip") == 0) {
+          sys.mem.output_width_of_DRAM_chip =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "page_size_of_DRAM_chip") == 0) {
+          sys.mem.page_size_of_DRAM_chip =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "burstlength_of_DRAM_chip") == 0) {
+          sys.mem.burstlength_of_DRAM_chip =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "internal_prefetch_of_DRAM_chip") == 0) {
+          sys.mem.internal_prefetch_of_DRAM_chip =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+      }
+      itmp = xNode3.nChildNode("stat");
+      for (k = 0; k < itmp; k++) { // get all items of stat in system.mem
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "memory_accesses") == 0) {
+          sys.mem.memory_accesses =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "memory_reads") == 0) {
+          sys.mem.memory_reads =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "memory_writes") == 0) {
+          sys.mem.memory_writes =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "dram_pre") == 0) {
+          sys.mem.dram_pre =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+      }
+    } else {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    //__________________________________________Get
+    // system.mc____________________________________________
+    if (OrderofComponents_3layer > 0)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    xNode3 = xNode2.getChildNode("component", OrderofComponents_3layer);
+    if (xNode3.isEmpty() == 1) {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    if (strstr(xNode3.getAttribute("id"), "system.mc") != NULL) {
+      itmp = xNode3.nChildNode("param");
+      for (k = 0; k < itmp; k++) { // get all items of param in system.mem
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "mc_clock") == 0) {
+          sys.mc.mc_clock =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "block_size") == 0) {
+          sys.mc.llc_line_length =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "number_mcs") == 0) {
+          sys.mc.number_mcs =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "memory_channels_per_mc") == 0) {
+          sys.mc.memory_channels_per_mc =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "req_window_size_per_channel") == 0) {
+          sys.mc.req_window_size_per_channel =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "IO_buffer_size_per_channel") == 0) {
+          sys.mc.IO_buffer_size_per_channel =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "databus_width") == 0) {
+          sys.mc.databus_width =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "addressbus_width") == 0) {
+          sys.mc.addressbus_width =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "PRT_entries") == 0) {
+          sys.mc.PRT_entries =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "peak_transfer_rate") == 0) {
+          sys.mc.peak_transfer_rate =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "number_ranks") == 0) {
+          sys.mc.number_ranks =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "LVDS") == 0) {
+          sys.mc.LVDS =
+              (bool)atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "type") == 0) {
+          sys.mc.type =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "withPHY") == 0) {
+          sys.mc.withPHY =
+              (bool)atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_cmd_coeff") == 0) {
+          sys.mc.dram_cmd_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_act_coeff") == 0) {
+          sys.mc.dram_act_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_nop_coeff") == 0) {
+          sys.mc.dram_nop_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_activity_coeff") == 0) {
+          sys.mc.dram_activity_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_pre_coeff") == 0) {
+          sys.mc.dram_pre_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_rd_coeff") == 0) {
+          sys.mc.dram_rd_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_wr_coeff") == 0) {
+          sys.mc.dram_wr_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_req_coeff") == 0) {
+          sys.mc.dram_req_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "dram_const_coeff") == 0) {
+          sys.mc.dram_const_coeff =
+              atof(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+      }
+      itmp = xNode3.nChildNode("stat");
+      for (k = 0; k < itmp;
+           k++) { // get all items of stat in system.mendirectory
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "memory_accesses") == 0) {
+          sys.mc.memory_accesses =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "memory_reads") == 0) {
+          sys.mc.memory_reads =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "memory_writes") == 0) {
+          sys.mc.memory_writes =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "dram_pre") == 0) {
+          sys.mc.dram_pre =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+      }
+    } else {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    //__________________________________________Get
+    // system.niu____________________________________________
+    if (OrderofComponents_3layer > 0)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    xNode3 = xNode2.getChildNode("component", OrderofComponents_3layer);
+    if (xNode3.isEmpty() == 1) {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    if (strstr(xNode3.getAttribute("id"), "system.niu") != NULL) {
+      itmp = xNode3.nChildNode("param");
+      for (k = 0; k < itmp; k++) { // get all items of param in system.mem
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "clockrate") == 0) {
+          sys.niu.clockrate =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "number_units") == 0) {
+          sys.niu.number_units =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "type") == 0) {
+          sys.niu.type =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+      }
+      itmp = xNode3.nChildNode("stat");
+      for (k = 0; k < itmp;
+           k++) { // get all items of stat in system.mendirectory
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "duty_cycle") == 0) {
+          sys.niu.duty_cycle =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "total_load_perc") == 0) {
+          sys.niu.total_load_perc =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+      }
+    } else {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+
+    //__________________________________________Get
+    // system.pcie____________________________________________
+    if (OrderofComponents_3layer > 0)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    xNode3 = xNode2.getChildNode("component", OrderofComponents_3layer);
+    if (xNode3.isEmpty() == 1) {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    if (strstr(xNode3.getAttribute("id"), "system.pcie") != NULL) {
+      itmp = xNode3.nChildNode("param");
+      for (k = 0; k < itmp; k++) { // get all items of param in system.mem
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "clockrate") == 0) {
+          sys.pcie.clockrate =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "number_units") == 0) {
+          sys.pcie.number_units =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "num_channels") == 0) {
+          sys.pcie.num_channels =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "type") == 0) {
+          sys.pcie.type =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "withPHY") == 0) {
+          sys.pcie.withPHY =
+              (bool)atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+      }
+      itmp = xNode3.nChildNode("stat");
+      for (k = 0; k < itmp;
+           k++) { // get all items of stat in system.mendirectory
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "duty_cycle") == 0) {
+          sys.pcie.duty_cycle =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "total_load_perc") == 0) {
+          sys.pcie.total_load_perc =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+      }
+    } else {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    //__________________________________________Get
+    // system.flashcontroller____________________________________________
+    if (OrderofComponents_3layer > 0)
+      OrderofComponents_3layer = OrderofComponents_3layer + 1;
+    xNode3 = xNode2.getChildNode("component", OrderofComponents_3layer);
+    if (xNode3.isEmpty() == 1) {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+    if (strstr(xNode3.getAttribute("id"), "system.flashc") != NULL) {
+      itmp = xNode3.nChildNode("param");
+      for (k = 0; k < itmp; k++) { // get all items of param in system.mem
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"flashc_clock")==0)
+        //{sys.flashc.mc_clock=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"block_size")==0)
+        //{sys.flashc.llc_line_length=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "number_flashcs") == 0) {
+          sys.flashc.number_mcs =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"memory_channels_per_flashc")==0)
+        //{sys.flashc.memory_channels_per_mc=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"req_window_size_per_channel")==0)
+        //{sys.flashc.req_window_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"IO_buffer_size_per_channel")==0)
+        //{sys.flashc.IO_buffer_size_per_channel=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"databus_width")==0)
+        //{sys.flashc.databus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"addressbus_width")==0)
+        //{sys.flashc.addressbus_width=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "peak_transfer_rate") == 0) {
+          sys.flashc.peak_transfer_rate =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"number_ranks")==0)
+        //{sys.flashc.number_ranks=atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("param",k).getAttribute("name"),"LVDS")==0)
+        //{sys.flashc.LVDS=(bool)atoi(xNode3.getChildNode("param",k).getAttribute("value"));continue;}
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "type") == 0) {
+          sys.flashc.type =
+              atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("param", k).getAttribute("name"),
+                   "withPHY") == 0) {
+          sys.flashc.withPHY =
+              (bool)atoi(xNode3.getChildNode("param", k).getAttribute("value"));
+          continue;
+        }
+      }
+      itmp = xNode3.nChildNode("stat");
+      for (k = 0; k < itmp;
+           k++) { // get all items of stat in system.mendirectory
+        //				if
+        //(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_accesses")==0)
+        //{sys.flashc.memory_accesses=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_reads")==0)
+        //{sys.flashc.memory_reads=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+        //				if
+        //(strcmp(xNode3.getChildNode("stat",k).getAttribute("name"),"memory_writes")==0)
+        //{sys.flashc.memory_writes=atof(xNode3.getChildNode("stat",k).getAttribute("value"));continue;}
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "duty_cycle") == 0) {
+          sys.flashc.duty_cycle =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+        if (strcmp(xNode3.getChildNode("stat", k).getAttribute("name"),
+                   "total_load_perc") == 0) {
+          sys.flashc.total_load_perc =
+              atof(xNode3.getChildNode("stat", k).getAttribute("value"));
+          continue;
+        }
+      }
+    } else {
+      printf("some value(s) of "
+             "number_of_cores/number_of_L2s/number_of_L3s/number_of_NoCs "
+             "is/are not correct!");
+      exit(0);
+    }
+  }
+}
+void ParseXML::initialize() // Initialize all
+{
+  // All number_of_* at the level of 'system' 03/21/2009
+  sys.number_of_cores = 1;
+  sys.architecture = 1; // 1 - fermi
+  sys.number_of_L1Directories = 1;
+  sys.number_of_L2Directories = 1;
+  sys.number_of_L2s = 1;
+  sys.Private_L2 = false;
+  sys.number_of_L3s = 1;
+  sys.number_of_NoCs = 1;
+  // All params at the level of 'system'
+  // strcpy(sys.homogeneous_cores,"default");
+  sys.core_tech_node = 1;
+  sys.target_core_clockrate = 1;
+  sys.target_chip_area = 1;
+  sys.temperature = 1;
+  sys.number_cache_levels = 1;
+  sys.homogeneous_cores = 1;
+  sys.homogeneous_L1Directories = 1;
+  sys.homogeneous_L2Directories = 1;
+  sys.homogeneous_L2s = 1;
+  sys.homogeneous_L3s = 1;
+  sys.homogeneous_NoCs = 1;
+  sys.homogeneous_ccs = 1;
+
+  sys.Max_area_deviation = 1;
+  sys.Max_power_deviation = 1;
+  sys.device_type = 1;
+  sys.longer_channel_device = true;
+  sys.Embedded = false;
+  sys.opt_dynamic_power = false;
+  sys.opt_lakage_power = false;
+  sys.opt_clockrate = true;
+  sys.opt_area = false;
+  sys.interconnect_projection_type = 1;
+  sys.idle_core_power = 0;
+  int i, j;
+  for (i = 0; i <= 63; i++) {
+    sys.scaling_coefficients[i] = 1;
+    sys.core[i].clock_rate = 1;
+    sys.core[i].opt_local = true;
+    sys.core[i].x86 = false;
+    sys.core[i].machine_bits = 1;
+    sys.core[i].virtual_address_width = 1;
+    sys.core[i].physical_address_width = 1;
+    sys.core[i].opcode_width = 1;
+    sys.core[i].micro_opcode_width = 1;
+    // strcpy(sys.core[i].machine_type,"default");
+    sys.core[i].internal_datapath_width = 1;
+    sys.core[i].number_hardware_threads = 1;
+    sys.core[i].fetch_width = 1;
+    sys.core[i].number_instruction_fetch_ports = 1;
+    sys.core[i].decode_width = 1;
+    sys.core[i].issue_width = 1;
+    sys.core[i].peak_issue_width = 1;
+    sys.core[i].commit_width = 1;
+    for (j = 0; j < 20; j++)
+      sys.core[i].pipelines_per_core[j] = 1;
+    for (j = 0; j < 20; j++)
+      sys.core[i].pipeline_depth[j] = 1;
+    strcpy(sys.core[i].FPU, "default");
+    strcpy(sys.core[i].divider_multiplier, "default");
+    sys.core[i].ALU_per_core = 1;
+    sys.core[i].FPU_per_core = 1.0;
+    sys.core[i].MUL_per_core = 1;
+    sys.core[i].instruction_buffer_size = 1;
+    sys.core[i].decoded_stream_buffer_size = 1;
+    // strcpy(sys.core[i].instruction_window_scheme,"default");
+    sys.core[i].instruction_window_size = 1;
+    sys.core[i].ROB_size = 1;
+    sys.core[i].archi_Regs_IRF_size = 1;
+    sys.core[i].archi_Regs_FRF_size = 1;
+    sys.core[i].phy_Regs_IRF_size = 1;
+    sys.core[i].phy_Regs_FRF_size = 1;
+    // strcpy(sys.core[i].rename_scheme,"default");
+    sys.core[i].register_windows_size = 1;
+    strcpy(sys.core[i].LSU_order, "default");
+    sys.core[i].store_buffer_size = 1;
+    sys.core[i].load_buffer_size = 1;
+    sys.core[i].memory_ports = 1;
+    strcpy(sys.core[i].Dcache_dual_pump, "default");
+    sys.core[i].RAS_size = 1;
+    // all stats at the level of system.core(0-n)
+    sys.core[i].total_instructions = 1;
+    sys.core[i].int_instructions = 1;
+    sys.core[i].fp_instructions = 1;
+    sys.core[i].branch_instructions = 1;
+    sys.core[i].branch_mispredictions = 1;
+    sys.core[i].committed_instructions = 1;
+    sys.core[i].load_instructions = 1;
+    sys.core[i].store_instructions = 1;
+    sys.core[i].total_cycles = 1;
+    sys.core[i].idle_cycles = 1;
+    sys.core[i].busy_cycles = 1;
+    sys.core[i].instruction_buffer_reads = 1;
+    sys.core[i].instruction_buffer_write = 1;
+    sys.core[i].ROB_reads = 1;
+    sys.core[i].ROB_writes = 1;
+    sys.core[i].rename_accesses = 1;
+    sys.core[i].inst_window_reads = 1;
+    sys.core[i].inst_window_writes = 1;
+    sys.core[i].inst_window_wakeup_accesses = 1;
+    sys.core[i].inst_window_selections = 1;
+    sys.core[i].archi_int_regfile_reads = 1;
+    sys.core[i].archi_float_regfile_reads = 1;
+    sys.core[i].phy_int_regfile_reads = 1;
+    sys.core[i].phy_float_regfile_reads = 1;
+    sys.core[i].windowed_reg_accesses = 1;
+    sys.core[i].windowed_reg_transports = 1;
+    sys.core[i].function_calls = 1;
+    sys.core[i].ialu_accesses = 1;
+    sys.core[i].fpu_accesses = 1;
+    sys.core[i].mul_accesses = 1;
+    sys.core[i].cdb_alu_accesses = 1;
+    sys.core[i].cdb_mul_accesses = 1;
+    sys.core[i].cdb_fpu_accesses = 1;
+    sys.core[i].load_buffer_reads = 1;
+    sys.core[i].load_buffer_writes = 1;
+    sys.core[i].load_buffer_cams = 1;
+    sys.core[i].store_buffer_reads = 1;
+    sys.core[i].store_buffer_writes = 1;
+    sys.core[i].store_buffer_cams = 1;
+    sys.core[i].store_buffer_forwards = 1;
+    sys.core[i].main_memory_access = 1;
+    sys.core[i].main_memory_read = 1;
+    sys.core[i].main_memory_write = 1;
+    sys.core[i].IFU_duty_cycle = 1;
+    sys.core[i].BR_duty_cycle = 1;
+    sys.core[i].LSU_duty_cycle = 1;
+    sys.core[i].MemManU_I_duty_cycle = 1;
+    sys.core[i].MemManU_D_duty_cycle = 1;
+    sys.core[i].ALU_duty_cycle = 1;
+    sys.core[i].MUL_duty_cycle = 1;
+    sys.core[i].FPU_duty_cycle = 1;
+    sys.core[i].ALU_cdb_duty_cycle = 1;
+    sys.core[i].MUL_cdb_duty_cycle = 1;
+    sys.core[i].FPU_cdb_duty_cycle = 1;
+    // system.core?.predictor
+    sys.core[i].predictor.prediction_width = 1;
+    strcpy(sys.core[i].predictor.prediction_scheme, "default");
+    sys.core[i].predictor.predictor_size = 1;
+    sys.core[i].predictor.predictor_entries = 1;
+    sys.core[i].predictor.local_predictor_entries = 1;
+    for (j = 0; j < 20; j++)
+      sys.core[i].predictor.local_predictor_size[j] = 1;
+    sys.core[i].predictor.global_predictor_entries = 1;
+    sys.core[i].predictor.global_predictor_bits = 1;
+    sys.core[i].predictor.chooser_predictor_entries = 1;
+    sys.core[i].predictor.chooser_predictor_bits = 1;
+    sys.core[i].predictor.predictor_accesses = 1;
+    // system.core?.itlb
+    sys.core[i].itlb.number_entries = 1;
+    sys.core[i].itlb.total_hits = 1;
+    sys.core[i].itlb.total_accesses = 1;
+    sys.core[i].itlb.total_misses = 1;
+    // system.core?.icache
+    for (j = 0; j < 20; j++)
+      sys.core[i].icache.icache_config[j] = 1;
+    // strcpy(sys.core[i].icache.buffer_sizes,"default");
+    sys.core[i].icache.total_accesses = 1;
+    sys.core[i].icache.read_accesses = 1;
+    sys.core[i].icache.read_misses = 1;
+    sys.core[i].icache.replacements = 1;
+    sys.core[i].icache.read_hits = 1;
+    sys.core[i].icache.total_hits = 1;
+    sys.core[i].icache.total_misses = 1;
+    sys.core[i].icache.miss_buffer_access = 1;
+    sys.core[i].icache.fill_buffer_accesses = 1;
+    sys.core[i].icache.prefetch_buffer_accesses = 1;
+    sys.core[i].icache.prefetch_buffer_writes = 1;
+    sys.core[i].icache.prefetch_buffer_reads = 1;
+    sys.core[i].icache.prefetch_buffer_hits = 1;
+    // system.core?.dtlb
+    sys.core[i].dtlb.number_entries = 1;
+    sys.core[i].dtlb.total_accesses = 1;
+    sys.core[i].dtlb.read_accesses = 1;
+    sys.core[i].dtlb.write_accesses = 1;
+    sys.core[i].dtlb.write_hits = 1;
+    sys.core[i].dtlb.read_hits = 1;
+    sys.core[i].dtlb.read_misses = 1;
+    sys.core[i].dtlb.write_misses = 1;
+    sys.core[i].dtlb.total_hits = 1;
+    sys.core[i].dtlb.total_misses = 1;
+    // system.core?.dcache
+    for (j = 0; j < 20; j++)
+      sys.core[i].dcache.dcache_config[j] = 1;
+    // strcpy(sys.core[i].dcache.buffer_sizes,"default");
+    sys.core[i].dcache.total_accesses = 1;
+    sys.core[i].dcache.read_accesses = 1;
+    sys.core[i].dcache.write_accesses = 1;
+    sys.core[i].dcache.total_hits = 1;
+    sys.core[i].dcache.total_misses = 1;
+    sys.core[i].dcache.read_hits = 1;
+    sys.core[i].dcache.write_hits = 1;
+    sys.core[i].dcache.read_misses = 1;
+    sys.core[i].dcache.write_misses = 1;
+    sys.core[i].dcache.replacements = 1;
+    sys.core[i].dcache.write_backs = 1;
+    sys.core[i].dcache.miss_buffer_access = 1;
+    sys.core[i].dcache.fill_buffer_accesses = 1;
+    sys.core[i].dcache.prefetch_buffer_accesses = 1;
+    sys.core[i].dcache.prefetch_buffer_writes = 1;
+    sys.core[i].dcache.prefetch_buffer_reads = 1;
+    sys.core[i].dcache.prefetch_buffer_hits = 1;
+    sys.core[i].dcache.wbb_writes = 1;
+    sys.core[i].dcache.wbb_reads = 1;
+    // system.core?.BTB
+    for (j = 0; j < 20; j++)
+      sys.core[i].BTB.BTB_config[j] = 1;
+    sys.core[i].BTB.total_accesses = 1;
+    sys.core[i].BTB.read_accesses = 1;
+    sys.core[i].BTB.write_accesses = 1;
+    sys.core[i].BTB.total_hits = 1;
+    sys.core[i].BTB.total_misses = 1;
+    sys.core[i].BTB.read_hits = 1;
+    sys.core[i].BTB.write_hits = 1;
+    sys.core[i].BTB.read_misses = 1;
+    sys.core[i].BTB.write_misses = 1;
+    sys.core[i].BTB.replacements = 1;
+  }
+
+  // system_L1directory
+  for (i = 0; i <= 63; i++) {
+    for (j = 0; j < 20; j++)
+      sys.L1Directory[i].Dir_config[j] = 1;
+    for (j = 0; j < 20; j++)
+      sys.L1Directory[i].buffer_sizes[j] = 1;
+    sys.L1Directory[i].clockrate = 1;
+    sys.L1Directory[i].ports[20] = 1;
+    sys.L1Directory[i].device_type = 1;
+    strcpy(sys.L1Directory[i].threeD_stack, "default");
+    sys.L1Directory[i].total_accesses = 1;
+    sys.L1Directory[i].read_accesses = 1;
+    sys.L1Directory[i].write_accesses = 1;
+    sys.L1Directory[i].duty_cycle = 1;
+  }
+  // system_L2directory
+  for (i = 0; i <= 63; i++) {
+    for (j = 0; j < 20; j++)
+      sys.L2Directory[i].Dir_config[j] = 1;
+    for (j = 0; j < 20; j++)
+      sys.L2Directory[i].buffer_sizes[j] = 1;
+    sys.L2Directory[i].clockrate = 1;
+    sys.L2Directory[i].ports[20] = 1;
+    sys.L2Directory[i].device_type = 1;
+    strcpy(sys.L2Directory[i].threeD_stack, "default");
+    sys.L2Directory[i].total_accesses = 1;
+    sys.L2Directory[i].read_accesses = 1;
+    sys.L2Directory[i].write_accesses = 1;
+    sys.L2Directory[i].duty_cycle = 1;
+  }
+  for (i = 0; i <= 63; i++) {
+    // system_L2
+    for (j = 0; j < 20; j++)
+      sys.L2[i].L2_config[j] = 1;
+    sys.L2[i].clockrate = 1;
+    for (j = 0; j < 20; j++)
+      sys.L2[i].ports[j] = 1;
+    sys.L2[i].device_type = 1;
+    strcpy(sys.L2[i].threeD_stack, "default");
+    for (j = 0; j < 20; j++)
+      sys.L2[i].buffer_sizes[j] = 1;
+    sys.L2[i].total_accesses = 1;
+    sys.L2[i].read_accesses = 1;
+    sys.L2[i].write_accesses = 1;
+    sys.L2[i].total_hits = 1;
+    sys.L2[i].total_misses = 1;
+    sys.L2[i].read_hits = 1;
+    sys.L2[i].write_hits = 1;
+    sys.L2[i].read_misses = 1;
+    sys.L2[i].write_misses = 1;
+    sys.L2[i].replacements = 1;
+    sys.L2[i].write_backs = 1;
+    sys.L2[i].miss_buffer_accesses = 1;
+    sys.L2[i].fill_buffer_accesses = 1;
+    sys.L2[i].prefetch_buffer_accesses = 1;
+    sys.L2[i].prefetch_buffer_writes = 1;
+    sys.L2[i].prefetch_buffer_reads = 1;
+    sys.L2[i].prefetch_buffer_hits = 1;
+    sys.L2[i].wbb_writes = 1;
+    sys.L2[i].wbb_reads = 1;
+    sys.L2[i].duty_cycle = 1;
+    sys.L2[i].merged_dir = false;
+    sys.L2[i].homenode_read_accesses = 1;
+    sys.L2[i].homenode_write_accesses = 1;
+    sys.L2[i].homenode_read_hits = 1;
+    sys.L2[i].homenode_write_hits = 1;
+    sys.L2[i].homenode_read_misses = 1;
+    sys.L2[i].homenode_write_misses = 1;
+    sys.L2[i].dir_duty_cycle = 1;
+  }
+  for (i = 0; i <= 63; i++) {
+    // system_L3
+    for (j = 0; j < 20; j++)
+      sys.L3[i].L3_config[j] = 1;
+    sys.L3[i].clockrate = 1;
+    for (j = 0; j < 20; j++)
+      sys.L3[i].ports[j] = 1;
+    sys.L3[i].device_type = 1;
+    strcpy(sys.L3[i].threeD_stack, "default");
+    for (j = 0; j < 20; j++)
+      sys.L3[i].buffer_sizes[j] = 1;
+    sys.L3[i].total_accesses = 1;
+    sys.L3[i].read_accesses = 1;
+    sys.L3[i].write_accesses = 1;
+    sys.L3[i].total_hits = 1;
+    sys.L3[i].total_misses = 1;
+    sys.L3[i].read_hits = 1;
+    sys.L3[i].write_hits = 1;
+    sys.L3[i].read_misses = 1;
+    sys.L3[i].write_misses = 1;
+    sys.L3[i].replacements = 1;
+    sys.L3[i].write_backs = 1;
+    sys.L3[i].miss_buffer_accesses = 1;
+    sys.L3[i].fill_buffer_accesses = 1;
+    sys.L3[i].prefetch_buffer_accesses = 1;
+    sys.L3[i].prefetch_buffer_writes = 1;
+    sys.L3[i].prefetch_buffer_reads = 1;
+    sys.L3[i].prefetch_buffer_hits = 1;
+    sys.L3[i].wbb_writes = 1;
+    sys.L3[i].wbb_reads = 1;
+    sys.L3[i].duty_cycle = 1;
+    sys.L3[i].merged_dir = false;
+    sys.L3[i].homenode_read_accesses = 1;
+    sys.L3[i].homenode_write_accesses = 1;
+    sys.L3[i].homenode_read_hits = 1;
+    sys.L3[i].homenode_write_hits = 1;
+    sys.L3[i].homenode_read_misses = 1;
+    sys.L3[i].homenode_write_misses = 1;
+    sys.L3[i].dir_duty_cycle = 1;
+  }
+  // system_NoC
+  for (i = 0; i <= 63; i++) {
+    sys.NoC[i].clockrate = 1;
+    sys.NoC[i].type = true;
+    sys.NoC[i].chip_coverage = 1;
+    sys.NoC[i].has_global_link = true;
+    strcpy(sys.NoC[i].topology, "default");
+    sys.NoC[i].horizontal_nodes = 1;
+    sys.NoC[i].vertical_nodes = 1;
+    sys.NoC[i].input_ports = 1;
+    sys.NoC[i].output_ports = 1;
+    sys.NoC[i].virtual_channel_per_port = 1;
+    sys.NoC[i].flit_bits = 1;
+    sys.NoC[i].input_buffer_entries_per_vc = 1;
+    sys.NoC[i].total_accesses = 1;
+    sys.NoC[i].duty_cycle = 1;
+    sys.NoC[i].route_over_perc = 0.5;
+    for (j = 0; j < 20; j++)
+      sys.NoC[i].ports_of_input_buffer[j] = 1;
+    sys.NoC[i].number_of_crossbars = 1;
+    strcpy(sys.NoC[i].crossbar_type, "default");
+    strcpy(sys.NoC[i].crosspoint_type, "default");
+    // system.NoC?.xbar0;
+    sys.NoC[i].xbar0.number_of_inputs_of_crossbars = 1;
+    sys.NoC[i].xbar0.number_of_outputs_of_crossbars = 1;
+    sys.NoC[i].xbar0.flit_bits = 1;
+    sys.NoC[i].xbar0.input_buffer_entries_per_port = 1;
+    sys.NoC[i].xbar0.ports_of_input_buffer[20] = 1;
+    sys.NoC[i].xbar0.crossbar_accesses = 1;
+  }
+  // system_mem
+  sys.mem.mem_tech_node = 1;
+  sys.mem.device_clock = 1;
+  sys.mem.capacity_per_channel = 1;
+  sys.mem.number_ranks = 1;
+  sys.mem.peak_transfer_rate = 1;
+  sys.mem.num_banks_of_DRAM_chip = 1;
+  sys.mem.Block_width_of_DRAM_chip = 1;
+  sys.mem.output_width_of_DRAM_chip = 1;
+  sys.mem.page_size_of_DRAM_chip = 1;
+  sys.mem.burstlength_of_DRAM_chip = 1;
+  sys.mem.internal_prefetch_of_DRAM_chip = 1;
+  sys.mem.memory_accesses = 1;
+  sys.mem.memory_reads = 1;
+  sys.mem.memory_writes = 1;
+
+  // system_mc
+  sys.mc.mc_clock = 1;
+  sys.mc.number_mcs = 1;
+  sys.mc.peak_transfer_rate = 1;
+  sys.mc.memory_channels_per_mc = 1;
+  sys.mc.number_ranks = 1;
+  sys.mc.req_window_size_per_channel = 1;
+  sys.mc.IO_buffer_size_per_channel = 1;
+  sys.mc.databus_width = 1;
+  sys.mc.addressbus_width = 1;
+  sys.mc.memory_accesses = 1;
+  sys.mc.memory_reads = 1;
+  sys.mc.memory_writes = 1;
+  sys.mc.LVDS = true;
+  sys.mc.type = 1;
+
+  // system_niu
+  sys.niu.clockrate = 1;
+  sys.niu.number_units = 1;
+  sys.niu.type = 1;
+  sys.niu.duty_cycle = 1;
+  sys.niu.total_load_perc = 1;
+  // system_pcie
+  sys.pcie.clockrate = 1;
+  sys.pcie.number_units = 1;
+  sys.pcie.num_channels = 1;
+  sys.pcie.type = 1;
+  sys.pcie.withPHY = false;
+  sys.pcie.duty_cycle = 1;
+  sys.pcie.total_load_perc = 1;
+  // system_flash_controller
+  sys.flashc.mc_clock = 1;
+  sys.flashc.number_mcs = 1;
+  sys.flashc.peak_transfer_rate = 1;
+  sys.flashc.memory_channels_per_mc = 1;
+  sys.flashc.number_ranks = 1;
+  sys.flashc.req_window_size_per_channel = 1;
+  sys.flashc.IO_buffer_size_per_channel = 1;
+  sys.flashc.databus_width = 1;
+  sys.flashc.addressbus_width = 1;
+  sys.flashc.memory_accesses = 1;
+  sys.flashc.memory_reads = 1;
+  sys.flashc.memory_writes = 1;
+  sys.flashc.LVDS = true;
+  sys.flashc.withPHY = false;
+  sys.flashc.type = 1;
+  sys.flashc.duty_cycle = 1;
+  sys.flashc.total_load_perc = 1;
+}
diff --git a/src/gpuwattch/XML_Parse.h b/src/gpuwattch/XML_Parse.h
new file mode 100644
index 000000000..918835410
--- /dev/null
+++ b/src/gpuwattch/XML_Parse.h
@@ -0,0 +1,691 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ * Jingwen Leng, University of Texas, Austin
+ * Syed Gilani, University of Wisconsin–Madison
+ * Tayler Hetherington, University of British Columbia
+ * Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#ifndef XML_PARSE_H_
+#define XML_PARSE_H_
+
+//#ifdef WIN32
+//#define _CRT_SECURE_NO_DEPRECATE
+//#endif
+
+#include "xmlParser.h"
+#include <iostream>
+#include <stdio.h>
+#include <string.h>
+using namespace std;
+
+/*
+void myfree(char *t); // {free(t);}
+ToXMLStringTool tx,tx2;
+*/
+// all subnodes at the level of system.core(0-n)
+// cache_policy is added into cache property arrays: 0 = no write or
+// write-through with non-write allocate; 1 = write-back with write-allocate
+//
+// tgrogers - This was a static array declared in the header...
+//           Not too sure why the authors did this, maybe they didn't understand
+//           the context of the "static" keyword outside a class declaration. As
+//           it was written, each object file had its own copy of the string
+//           list - which is okay, since the string is constant but wastes space
+//           and causes the compiler to complain about unused vars in files
+//           where this header is included but they don't use the variable.  Now
+//           this is extern'd here and the storage/definition is in the
+//           XML_Parse.cc file
+extern const char *perf_count_label[];
+
+enum perf_count_t { // counter ids 0..29; presumably index perf_count_label[] - confirm in XML_Parse.cc
+  TOT_INST = 0,
+  FP_INT,
+  IC_H,
+  IC_M,
+  DC_RH,
+  DC_RM,
+  DC_WH,
+  DC_WM,
+  TC_H,
+  TC_M,
+  CC_H,
+  CC_M,
+  SHRD_ACC,
+  REG_RD,
+  REG_WR,
+  NON_REG_OPs,
+  SP_ACC,
+  SFU_ACC,
+  FPU_ACC,
+  MEM_RD,
+  MEM_WR,
+  MEM_PRE,
+  L2_RH,
+  L2_RM,
+  L2_WH,
+  L2_WM,
+  NOC_A,
+  PIPE_A,
+  IDLE_CORE_N,
+  CONST_DYNAMICN,
+  NUM_PERFORMANCE_COUNTERS // sentinel, must stay last: equals the number of ids above (30)
+};
+
+typedef struct { // fields of the `predictor` subnode; type of system_core::predictor
+  int prediction_width;
+  char prediction_scheme[20]; // fixed-size text buffer: at most 19 chars + NUL
+  int predictor_size;
+  int predictor_entries;
+  int local_predictor_size[20];
+  int local_predictor_entries;
+  int global_predictor_entries;
+  int global_predictor_bits;
+  int chooser_predictor_entries;
+  int chooser_predictor_bits;
+  double predictor_accesses;
+} predictor_systemcore;
+typedef struct { // fields of the `itlb` subnode; type of system_core::itlb
+  int number_entries;
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  double total_hits;
+  double total_accesses;
+  double total_misses;
+  double conflicts;
+} itlb_systemcore;
+typedef struct { // fields of the `icache` subnode; type of system_core::icache
+  // params
+  double icache_config[20];
+  int buffer_sizes[20];
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double read_misses;
+  double replacements;
+  double read_hits;
+  double total_hits;
+  double total_misses;
+  double miss_buffer_access;
+  double fill_buffer_accesses;
+  double prefetch_buffer_accesses;
+  double prefetch_buffer_writes;
+  double prefetch_buffer_reads;
+  double prefetch_buffer_hits;
+  double conflicts;
+} icache_systemcore;
+typedef struct { // fields of the `dtlb` subnode; type of system_core::dtlb
+  // params
+  int number_entries;
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double write_hits;
+  double read_hits;
+  double read_misses;
+  double write_misses;
+  double total_hits;
+  double total_misses;
+  double conflicts;
+} dtlb_systemcore;
+typedef struct { // shared by system_core::dcache, ccache, tcache and sharedmemory
+  // params
+  double dcache_config[20];
+  int buffer_sizes[20];
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double total_hits;
+  double total_misses;
+  double read_hits;
+  double write_hits;
+  double read_misses;
+  double write_misses;
+  double replacements;
+  double write_backs;
+  double miss_buffer_access;
+  double fill_buffer_accesses;
+  double prefetch_buffer_accesses;
+  double prefetch_buffer_writes;
+  double prefetch_buffer_reads;
+  double prefetch_buffer_hits;
+  double wbb_writes;
+  double wbb_reads;
+  double conflicts;
+} dcache_systemcore;
+typedef struct { // fields of the `BTB` subnode; type of system_core::BTB
+  // params
+  int BTB_config[20];
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double total_hits;
+  double total_misses;
+  double read_hits;
+  double write_hits;
+  double read_misses;
+  double write_misses;
+  double replacements;
+} BTB_systemcore;
+typedef struct { // one system.core entry; element type of root_system::core[64]
+  // all params at the level of system.core(0-n)
+  int clock_rate;
+  bool opt_local;
+  bool x86;
+  int machine_bits;
+  int virtual_address_width;
+  int physical_address_width;
+  int opcode_width;
+  int micro_opcode_width;
+  int instruction_length;
+  int machine_type;
+  int internal_datapath_width;
+  int number_hardware_threads;
+  int fetch_width;
+  int number_instruction_fetch_ports;
+  int decode_width;
+  int issue_width;
+  int peak_issue_width;
+  int commit_width;
+  int pipelines_per_core[20];
+  int pipeline_depth[20];
+  char FPU[20];
+  char divider_multiplier[20];
+  int ALU_per_core;
+  double FPU_per_core;
+  int MUL_per_core;
+  int instruction_buffer_size;
+  int decoded_stream_buffer_size;
+  int instruction_window_scheme;
+  int instruction_window_size;
+  int fp_instruction_window_size;
+  int ROB_size;
+  int archi_Regs_IRF_size;
+  int archi_Regs_FRF_size;
+  int phy_Regs_IRF_size;
+  int phy_Regs_FRF_size;
+  int rename_scheme;
+  int register_windows_size;
+  char LSU_order[20];
+  int store_buffer_size;
+  int load_buffer_size;
+  int memory_ports;
+  char Dcache_dual_pump[20];
+  int RAS_size;
+  int fp_issue_width;
+  int prediction_width;
+  int number_of_BTB;
+  int number_of_BPT;
+  bool gpgpu_clock_gated_lanes;
+
+  // all stats at the level of system.core(0-n)
+  double total_instructions;
+  double int_instructions;
+  double fp_instructions;
+  double branch_instructions;
+  double branch_mispredictions;
+  double committed_instructions;
+  double committed_int_instructions;
+  double committed_fp_instructions;
+  double load_instructions;
+  double store_instructions;
+  double total_cycles;
+  double idle_cycles;
+  double busy_cycles;
+  double instruction_buffer_reads;
+  double instruction_buffer_write;
+  double ROB_reads;
+  double ROB_writes;
+  double rename_accesses;
+  double fp_rename_accesses;
+  double rename_reads;
+  double rename_writes;
+  double fp_rename_reads;
+  double fp_rename_writes;
+  double inst_window_reads;
+  double inst_window_writes;
+  double inst_window_wakeup_accesses;
+  double inst_window_selections;
+  double fp_inst_window_reads;
+  double fp_inst_window_writes;
+  double fp_inst_window_wakeup_accesses;
+  double fp_inst_window_selections;
+  double archi_int_regfile_reads;
+  double archi_float_regfile_reads;
+  double phy_int_regfile_reads;
+  double phy_float_regfile_reads;
+  double phy_int_regfile_writes;
+  double phy_float_regfile_writes;
+  double archi_int_regfile_writes;
+  double archi_float_regfile_writes;
+  double int_regfile_reads;
+  double float_regfile_reads;
+  double int_regfile_writes;
+  double float_regfile_writes;
+  double non_rf_operands;
+  double windowed_reg_accesses;
+  double windowed_reg_transports;
+  double function_calls;
+  double context_switches;
+  double ialu_accesses;
+  double fpu_accesses;
+  double mul_accesses;
+  double sp_average_active_lanes;
+  double sfu_average_active_lanes;
+  double cdb_alu_accesses;
+  double cdb_mul_accesses;
+  double cdb_fpu_accesses;
+  double load_buffer_reads;
+  double load_buffer_writes;
+  double load_buffer_cams;
+  double store_buffer_reads;
+  double store_buffer_writes;
+  double store_buffer_cams;
+  double store_buffer_forwards;
+  double main_memory_access;
+  double main_memory_read;
+  double main_memory_write;
+  double pipeline_duty_cycle;
+
+  double IFU_duty_cycle;
+  double BR_duty_cycle;
+  double LSU_duty_cycle;
+  double MemManU_I_duty_cycle;
+  double MemManU_D_duty_cycle;
+  double ALU_duty_cycle;
+  double MUL_duty_cycle;
+  double FPU_duty_cycle;
+  double ALU_cdb_duty_cycle;
+  double MUL_cdb_duty_cycle;
+  double FPU_cdb_duty_cycle;
+
+  double num_idle_cores;
+
+  int rf_banks;            // (4)
+  int simd_width;          // (8)
+  int collector_units;     // (4)
+  double core_clock_ratio; // (2.0)
+  int warp_size;           // (32)
+
+  // all subnodes at the level of system.core(0-n)
+  predictor_systemcore predictor;
+  itlb_systemcore itlb;
+  icache_systemcore icache;
+  dtlb_systemcore dtlb;
+  dcache_systemcore dcache;
+  dcache_systemcore ccache;
+  dcache_systemcore tcache;
+  dcache_systemcore sharedmemory; // added by Jingwen
+  BTB_systemcore BTB;
+
+} system_core;
+typedef struct { // element type of root_system::L1Directory[64]
+  // params
+  int Directory_type;
+  double Dir_config[20];
+  int buffer_sizes[20];
+  int clockrate;
+  int ports[20]; // valid indices 0..19; NOTE(review): XML_Parse.cc assigns L1Directory[i].ports[20] - out of bounds
+  int device_type;
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  char threeD_stack[20];
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double read_misses;
+  double write_misses;
+  double conflicts;
+  double duty_cycle;
+} system_L1Directory;
+typedef struct { // element type of root_system::L2Directory[64]; same fields as system_L1Directory
+  // params
+  int Directory_type;
+  double Dir_config[20];
+  int buffer_sizes[20];
+  int clockrate;
+  int ports[20]; // valid indices 0..19; NOTE(review): XML_Parse.cc assigns L2Directory[i].ports[20] - out of bounds
+  int device_type;
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  char threeD_stack[20];
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double read_misses;
+  double write_misses;
+  double conflicts;
+  double duty_cycle;
+} system_L2Directory;
+typedef struct { // type of root_system::L2[64] and of the single root_system::l2
+  // params
+  double L2_config[20];
+  int clockrate;
+  int ports[20];
+  int device_type;
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  char threeD_stack[20];
+  int buffer_sizes[20];
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double total_hits;
+  double total_misses;
+  double read_hits;
+  double write_hits;
+  double read_misses;
+  double write_misses;
+  double replacements;
+  double write_backs;
+  double miss_buffer_accesses;
+  double fill_buffer_accesses;
+  double prefetch_buffer_accesses;
+  double prefetch_buffer_writes;
+  double prefetch_buffer_reads;
+  double prefetch_buffer_hits;
+  double wbb_writes;
+  double wbb_reads;
+  double conflicts;
+  double duty_cycle;
+
+  bool merged_dir;
+  double homenode_read_accesses;
+  double homenode_write_accesses;
+  double homenode_read_hits;
+  double homenode_write_hits;
+  double homenode_read_misses;
+  double homenode_write_misses;
+  double dir_duty_cycle;
+} system_L2;
+typedef struct { // element type of root_system::L3[64]; mirrors system_L2 with L3_config
+  // params
+  double L3_config[20];
+  int clockrate;
+  int ports[20];
+  int device_type;
+  int cache_policy; // 0: no-write or write-through with non-write-allocate;
+                    // 1: write-back with write-allocate
+  char threeD_stack[20];
+  int buffer_sizes[20];
+  // stats
+  double total_accesses;
+  double read_accesses;
+  double write_accesses;
+  double total_hits;
+  double total_misses;
+  double read_hits;
+  double write_hits;
+  double read_misses;
+  double write_misses;
+  double replacements;
+  double write_backs;
+  double miss_buffer_accesses;
+  double fill_buffer_accesses;
+  double prefetch_buffer_accesses;
+  double prefetch_buffer_writes;
+  double prefetch_buffer_reads;
+  double prefetch_buffer_hits;
+  double wbb_writes;
+  double wbb_reads;
+  double conflicts;
+  double duty_cycle;
+
+  bool merged_dir;
+  double homenode_read_accesses;
+  double homenode_write_accesses;
+  double homenode_read_hits;
+  double homenode_write_hits;
+  double homenode_read_misses;
+  double homenode_write_misses;
+  double dir_duty_cycle;
+} system_L3;
+typedef struct { // fields of the `xbar0` subnode; type of system_NoC::xbar0
+  // params
+  int number_of_inputs_of_crossbars;
+  int number_of_outputs_of_crossbars;
+  int flit_bits;
+  int input_buffer_entries_per_port;
+  int ports_of_input_buffer[20]; // valid indices 0..19; NOTE(review): XML_Parse.cc assigns xbar0.ports_of_input_buffer[20] - out of bounds
+  // stats
+  double crossbar_accesses;
+} xbar0_systemNoC;
+typedef struct { // element type of root_system::NoC[64]
+  // params
+  int clockrate;
+  bool type;
+  bool has_global_link;
+  char topology[20]; // fixed-size text buffer: at most 19 chars + NUL
+  int horizontal_nodes;
+  int vertical_nodes;
+  int link_throughput;
+  int link_latency;
+  int input_ports;
+  int output_ports;
+  int virtual_channel_per_port;
+  int flit_bits;
+  int input_buffer_entries_per_vc;
+  int ports_of_input_buffer[20];
+  int dual_pump;
+  int number_of_crossbars;
+  char crossbar_type[20];
+  char crosspoint_type[20];
+  xbar0_systemNoC xbar0; // nested crossbar subnode
+  int arbiter_type;
+  double chip_coverage;
+  // stats
+  double total_accesses;
+  double duty_cycle;
+  double route_over_perc;
+} system_NoC;
+typedef struct { // type of the single root_system::mem member
+  // params
+  int mem_tech_node;
+  int device_clock;
+  int peak_transfer_rate;
+  int internal_prefetch_of_DRAM_chip;
+  int capacity_per_channel;
+  int number_ranks;
+  int num_banks_of_DRAM_chip;
+  int Block_width_of_DRAM_chip;
+  int output_width_of_DRAM_chip;
+  int page_size_of_DRAM_chip;
+  int burstlength_of_DRAM_chip;
+
+  // stats
+  double memory_accesses;
+  double memory_reads;
+  double memory_writes;
+  double dram_pre;
+} system_mem;
+typedef struct { // shared by root_system::mc and root_system::flashc
+  // params
+  // Common params for mc and fc
+  double peak_transfer_rate;
+  int number_mcs;
+  bool withPHY;
+  int type;
+
+  // FC params
+  // stats
+  double duty_cycle;
+  double total_load_perc;
+
+  // MC params
+  int mc_clock;
+  int llc_line_length;
+  int memory_channels_per_mc;
+  int number_ranks;
+  int req_window_size_per_channel;
+  int IO_buffer_size_per_channel;
+  int databus_width;
+  int addressbus_width;
+  int PRT_entries;
+  bool LVDS;
+
+  // empirical DRAM coefficients
+  double dram_cmd_coeff;
+  double dram_act_coeff;
+  double dram_nop_coeff;
+  double dram_activity_coeff;
+  double dram_pre_coeff;
+  double dram_rd_coeff;
+  double dram_wr_coeff;
+  double dram_req_coeff;
+  double dram_const_coeff;
+
+  // stats
+  double memory_accesses;
+  double memory_reads;
+  double memory_writes;
+
+  // dram stats
+  double dram_cmd;
+  double dram_activity;
+  double dram_nop;
+  double dram_act;
+  double dram_pre;
+  double dram_rd;
+  double dram_wr;
+  double dram_req;
+
+} system_mc;
+
+typedef struct { // type of the single root_system::niu member
+  // params
+  int clockrate;
+  int number_units;
+  int type;
+  // stats
+  double duty_cycle;
+  double total_load_perc;
+} system_niu;
+
+typedef struct { // type of the single root_system::pcie member
+  // params
+  int clockrate;
+  int number_units;
+  int num_channels;
+  int type;
+  bool withPHY;
+  // stats
+  double duty_cycle;
+  double total_load_perc;
+} system_pcie;
+
+typedef struct { // top-level `system` description; type of ParseXML::sys
+  // All number_of_* at the level of 'system' Ying 03/21/2009
+  int GPU_Architecture;
+  int number_of_cores;
+  int architecture;
+  int number_of_L1Directories;
+  int number_of_L2Directories;
+  int number_of_L2s;
+  bool Private_L2;
+  int number_of_L3s;
+  int number_of_NoCs;
+  int number_of_dir_levels;
+  int domain_size;
+  int first_level_dir;
+  // All params at the level of 'system'
+  int homogeneous_cores;
+  int homogeneous_L1Directories;
+  int homogeneous_L2Directories;
+  double core_tech_node;
+  int target_core_clockrate;
+  int target_chip_area;
+  int temperature;
+  int number_cache_levels;
+  int L1_property;
+  int L2_property;
+  int homogeneous_L2s;
+  int L3_property;
+  int homogeneous_L3s;
+  int homogeneous_NoCs;
+  int homogeneous_ccs;
+  int Max_area_deviation;
+  int Max_power_deviation;
+  int device_type;
+  bool longer_channel_device;
+  bool Embedded;
+  bool opt_dynamic_power;
+  bool opt_lakage_power; // (sic) spelling is part of the field name; renaming would break users
+  bool opt_clockrate;
+  bool opt_area;
+  int interconnect_projection_type;
+  int machine_bits;
+  int virtual_address_width;
+  int physical_address_width;
+  int virtual_memory_page_size;
+  double idle_core_power;
+  double num_idle_cores;
+  int arch;
+  double total_cycles;
+  // system.core(0-n): 3rd level; every per-component array below has 64 slots
+  double scaling_coefficients[64];
+  system_core core[64];
+  system_L1Directory L1Directory[64];
+  system_L2Directory L2Directory[64];
+  system_L2 L2[64];
+  system_L2 l2; // single extra L2 record, distinct from the L2[64] array
+  system_L3 L3[64];
+  system_NoC NoC[64];
+  system_mem mem;
+  system_mc mc;
+  system_mc flashc; // flash controller reuses the system_mc layout
+  system_niu niu;
+  system_pcie pcie;
+} root_system;
+
+class ParseXML { // owns the root_system record `sys`; methods are defined outside this header
+public:
+  void parse(char *filepath); // presumably loads the XML file at filepath into sys - confirm in XML_Parse.cc
+  void initialize(); // presumably fills sys with placeholder defaults - confirm in XML_Parse.cc
+
+public:
+  root_system sys; // public data member; callers access the fields directly
+};
+
+#endif /* XML_PARSE_H_ */
diff --git a/src/gpuwattch/Xeon.xml b/src/gpuwattch/Xeon.xml
new file mode 100644
index 000000000..534210485
--- /dev/null
+++ b/src/gpuwattch/Xeon.xml
@@ -0,0 +1,455 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="2"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="1"/> <!-- This is the number of L2 clusters; each cluster can have multiple banks/ports -->
+		<param name="Private_L2" value="1"/><!--1 Private, 0 shared/coherent -->
+		<param name="number_of_L3s" value="1"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="65"/><!-- nm -->
+		<param name="target_core_clockrate" value="3400"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="3"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0: do not use; 1: use when appropriate -->
+		<param name="machine_bits" value="64"/>
+		<param name="virtual_address_width" value="64"/>
+		<param name="physical_address_width" value="52"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+			the default value is machine_bits if not set -->
+		<stat name="total_cycles" value="100000"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="100000"/>
+			<!--The virtual_memory_page_size (B) above is completely different from the page size in the main memory section. It is the page size of
+			virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="3400"/>
+			<!-- for cores with unknown timing, set to 0 to force off the opt flag -->
+			<param name="opt_local" value="0"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="16"/>
+			<param name="x86" value="1"/>
+			<param name="micro_opcode_width" value="8"/>
+			<param name="machine_type" value="0"/>
+			<!-- inorder/OoO; 1 inorder; 0 OOO-->
+			<param name="number_hardware_threads" value="2"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. -->
+			<param name="fetch_width" value="4"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="4"/>
+			<!-- decode_width determines the number of ports of the
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="4"/>
+			<param name="peak_issue_width" value="6"/>
+			<!-- issue_width determines the number of ports of the issue window and other logic,
+			as in the complexity-effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="4"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="2"/>
+			<param name="prediction_width" value="1"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- The current version of McPAT does not distinguish int and floating point pipelines.
+			These parameters are reserved for future use. -->
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="31,31"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="6"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="2"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="32"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="16"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="64"/>
+			<param name="fp_instruction_window_size" value="64"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="128"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="16"/><!-- X86-64 has 16GPR -->			
+			<param name="archi_Regs_FRF_size" value="32"/><!-- MMX + XMM -->
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="256"/>
+			<param name="phy_Regs_FRF_size" value="256"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+			they will always try to execute out-of-order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="96"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="48"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="2"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+			as well as the ports of the Dcache, which is connected to the LSU -->
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="64"/>						
+			<!-- general stats, define simulation periods; total, idle, and busy cycles are required for a sanity check -->
+			<!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="400000"/>
+			<stat name="int_instructions" value="200000"/>
+			<stat name="fp_instructions" value="100000"/>
+			<stat name="branch_instructions" value="100000"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="0"/>
+			<stat name="store_instructions" value="50000"/>
+			<stat name="committed_instructions" value="400000"/>
+			<stat name="committed_int_instructions" value="200000"/>
+			<stat name="committed_fp_instructions" value="100000"/>
+			<stat name="pipeline_duty_cycle" value="1"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only;
+				please ignore them for homogeneous cores -->
+			<stat name="total_cycles" value="100000"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="100000"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS-based and Phy-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="400000"/>
+			<stat name="ROB_writes" value="400000"/>
+			<!-- RAT accesses -->
+			<stat name="rename_reads" value="800000"/> <!--lookup in renaming logic -->
+			<stat name="rename_writes" value="400000"/><!--update dest regs. renaming logic -->
+			<stat name="fp_rename_reads" value="200000"/>
+			<stat name="fp_rename_writes" value="100000"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="400000"/>
+			<stat name="inst_window_writes" value="400000"/>
+			<stat name="inst_window_wakeup_accesses" value="800000"/>
+			<stat name="fp_inst_window_reads" value="200000"/>
+			<stat name="fp_inst_window_writes" value="200000"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="400000"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="600000"/>
+			<stat name="float_regfile_reads" value="100000"/>
+			<stat name="int_regfile_writes" value="300000"/>
+			<stat name="float_regfile_writes" value="50000"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="5"/>
+			<stat name="context_switches" value="260343"/>
+			<!-- Number of window switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="300000"/>			
+			<stat name="fpu_accesses" value="100000"/>
+			<stat name="mul_accesses" value="200000"/>
+			<stat name="cdb_alu_accesses" value="300000"/>
+			<stat name="cdb_mul_accesses" value="200000"/>
+			<stat name="cdb_fpu_accesses" value="100000"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="1"/>			
+			<stat name="LSU_duty_cycle" value="0.5"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.5"/>
+			<stat name="ALU_duty_cycle" value="1"/>
+			<stat name="MUL_duty_cycle" value="0.3"/>
+			<stat name="FPU_duty_cycle" value="0.3"/>
+			<stat name="ALU_cdb_duty_cycle" value="1"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.3"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.3"/>
+			<param name="number_of_BPT" value="2"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="128"/>
+				<stat name="total_accesses" value="200000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="131072,32,8,1,8,3,32,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy,  -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="200000"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="128"/><!--dual threads-->
+				<stat name="total_accesses" value="400000"/>
+				<stat name="total_misses" value="4"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="16384,16,4,1, 3,3, 16,1 "/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<param name="number_of_BTB" value="2"/>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="5120,4,2,1, 1,3"/> <!--should be 4096 + 1024 -->
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<stat name="read_accesses" value="400000"/> <!--See IFU code for guideline -->
+				<stat name="write_accesses" value="0"/>
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="4096,2,0,1,100,100, 8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="3400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>	
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="1048576,32, 8, 8, 8, 23, 32, 1"/> 
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="3400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="1.0"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="16777216,64,16, 16, 16, 100,1"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+				<param name="clockrate" value="850"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="11824"/>
+				<stat name="write_accesses" value="11276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+				<stat name="duty_cycle" value="1.0"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="3400"/>
+			<param name="type" value="0"/>
+			<!--0:bus, 1:NoC , for bus no matter how many nodes sharing the bus
+				at each time only one node can send req -->
+			<param name="horizontal_nodes" value="1"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="1"/>
+			<param name="output_ports" value="1"/>
+			<!-- For bus the I/O ports should be 1 -->
+			<param name="flit_bits" value="256"/>
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. 
+				chip_coverage <=1 -->
+			<param name="link_routing_over_percentage" value="0.5"/>
+			<!-- Links can route over other components or occupy whole area.
+				by default, 50% of the NoC global links routes over other 
+				components -->
+			<stat name="total_accesses" value="100000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="1"/>
+		</component>		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="6400"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- the current version of McPAT uses published values for the base parameters of the memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="0"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Further breakdown can easily be added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+			the average power per NIC or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+			the average power per flash controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
+
diff --git a/src/gpuwattch/arch_const.h b/src/gpuwattch/arch_const.h
new file mode 100644
index 000000000..0c82248c1
--- /dev/null
+++ b/src/gpuwattch/arch_const.h
@@ -0,0 +1,262 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef ARCH_CONST_H_
+#define ARCH_CONST_H_
+
+typedef struct { // Capacity / associativity / block-size triple for one array structure
+  unsigned int capacity; // units are not stated in this header — TODO confirm
+  unsigned int assoc; // presumably 0 means fully associative, as annotated on the *assoc constants in this header — TODO confirm
+  unsigned int blocksize; // units are not stated in this header — TODO confirm
+} array_inputs;
+
+// Do Not change, unless you want to bypass the XML interface and do not care
+// about the default values. Global parameters
+const int number_of_cores = 8;
+const int number_of_L2s = 1;
+const int number_of_L3s = 1;
+const int number_of_NoCs = 1;
+
+const double archi_F_sz_nm = 90.0; // feature size; the name suggests nm
+const unsigned int dev_type = 0;
+const double CLOCKRATE = 1.2 * 1e9; // 1.2e9; presumably Hz — TODO confirm
+const double AF = 0.5;
+// const bool 			inorder			=	true;
+const bool embedded = false; // NEW
+
+const bool homogeneous_cores = true;
+const int temperature = 360; // int, not bool: a bool would silently store 360 as true (1)
+const int number_cache_levels = 3;
+const int L1_property = 0; // private 0; coherent 1, shared 2.
+const int L2_property = 2; // same 0/1/2 encoding as L1_property
+const bool homogeneous_L2s = true;
+const int L3_property = 2; // int, not bool: a bool would silently store 2 as true (1); same encoding as L1_property
+const bool homogeneous_L3s = true;
+const double Max_area_deviation = 50;
+const double Max_dynamic_deviation = 50; // New
+const int opt_dynamic_power = 1;
+const int opt_lakage_power = 0; // sic ("leakage"); name kept because other code may reference it
+const int opt_area = 0;
+const int interconnect_projection_type = 0;
+
+//******************************Core Parameters
+#if (inorder) // NOTE(review): no macro named `inorder` is visible (only a commented-out const above); an undefined identifier in #if evaluates to 0, so the #else branch would be compiled — confirm intent
+const int opcode_length = 8;       // Niagara
+const int reg_length = 5;          // Niagara
+const int instruction_length = 32; // Niagara
+const int data_width = 64;
+#else // the two branches differ only in reg_length (5 vs 7)
+const int opcode_length = 8;       // 16;//Niagara
+const int reg_length = 7;          // Niagara
+const int instruction_length = 32; // Niagara
+const int data_width = 64;
+#endif
+
+// Caches: default size / associativity / block size for each structure (units not stated here)
+// itlb
+const int itlbsize = 512;
+const int itlbassoc = 0; // 0 = fully associative
+const int itlbblocksize = 8;
+// icache
+const int icachesize = 32768;
+const int icacheassoc = 4;
+const int icacheblocksize = 32;
+// dtlb
+const int dtlbsize = 512;
+const int dtlbassoc = 0; // 0 = fully associative
+const int dtlbblocksize = 8;
+// dcache
+const int dcachesize = 32768;
+const int dcacheassoc = 4;
+const int dcacheblocksize = 32;
+const int dcache_write_buffers = 8;
+
+// cache controllers
+// IB (presumably the instruction buffer — TODO confirm)
+const int numIBEntries = 64;
+const int IBsize = 64; // equals 2*4*instruction_length/8*2 when instruction_length is 32
+const int IBassoc = 0; // In Niagara it is still fully associative
+const int IBblocksize = 4;
+
+// IFB and MIL should have the same parameters (CAM)
+const int IFBsize = 128; //
+const int IFBassoc = 0;  // In Niagara it is still fully associative
+const int IFBblocksize = 4;
+
+const int icache_write_buffers = 8;
+
+// register file RAM
+const int regfilesize = 5760;
+const int regfileassoc = 1;
+const int regfileblocksize = 18;
+// regwin  RAM
+const int regwinsize = 256;
+const int regwinassoc = 1;
+const int regwinblocksize = 8;
+
+// store buffer, lsq
+const int lsqsize = 512;
+const int lsqassoc = 0; // presumably 0 = fully associative, as annotated on the TLB constants — TODO confirm
+const int lsqblocksize = 8;
+
+// data fill queue RAM
+const int dfqsize = 1024;
+const int dfqassoc = 1;
+const int dfqblocksize = 16;
+
+// outside the cores
+// L2 cache bank
+const int l2cachesize = 262144;
+const int l2cacheassoc = 16;
+const int l2cacheblocksize = 64;
+
+// L2 directory
+const int l2dirsize = 1024;
+const int l2dirassoc = 0; // presumably 0 = fully associative, as annotated on the TLB constants — TODO confirm
+const int l2dirblocksize = 2;
+
+// crossbar
+// PCX (presumably the processor-to-cache crossbar — TODO confirm)
+const int PCX_NUMBER_INPUT_PORTS_CROSSBAR = 8;
+const int PCX_NUMBER_OUTPUT_PORTS_CROSSBAR = 9;
+const int PCX_NUMBER_SIGNALS_PER_PORT_CROSSBAR = 144;
+// PCX buffer RAM
+const int pcx_buffersize = 1024;
+const int pcx_bufferassoc = 1;
+const int pcx_bufferblocksize = 32;
+const int pcx_numbuffer = 5;
+// pcx arbiter
+const int pcx_arbsize = 128;
+const int pcx_arbassoc = 1;
+const int pcx_arbblocksize = 2;
+const int pcx_numarb = 5; // same count as pcx_numbuffer
+
+// CPX (presumably the cache-to-processor crossbar — TODO confirm)
+const int CPX_NUMBER_INPUT_PORTS_CROSSBAR = 5;
+const int CPX_NUMBER_OUTPUT_PORTS_CROSSBAR = 8;
+const int CPX_NUMBER_SIGNALS_PER_PORT_CROSSBAR = 150;
+// CPX buffer RAM
+const int cpx_buffersize = 1024;
+const int cpx_bufferassoc = 1;
+const int cpx_bufferblocksize = 32;
+const int cpx_numbuffer = 8;
+// cpx arbiter
+const int cpx_arbsize = 128;
+const int cpx_arbassoc = 1;
+const int cpx_arbblocksize = 2;
+const int cpx_numarb = 8; // same count as cpx_numbuffer
+
+const int numPhysFloatRegs = 256;
+const int numPhysIntRegs = 32;
+const int numROBEntries = 192;
+const int umRobs = 1; // NOTE(review): name looks like a typo of numRobs — check users before renaming
+
+const int BTBEntries = 4096;
+const int BTBTagSize = 16;
+const int LFSTSize = 1024;
+const int LQEntries = 32;
+const int RASSize = 16;
+const int SQEntries = 32;
+const int SSITSize = 1024;
+const int activity = 0;
+const int backComSize = 5;
+const int cachePorts = 200;
+const int choiceCtrBits = 2;
+const int choicePredictorSize = 8192;
+
+const int commitWidth = 8;
+const int decodeWidth = 8;
+const int dispatchWidth = 8;
+const int fetchWidth = 8;
+const int issueWidth = 1; // NOTE(review): every other pipeline width here is 8 — confirm this is intentional
+const int renameWidth = 8;
+// what is this forwardComSize=5??
+
+const int globalCtrBits = 2;
+const int globalHistoryBits = 13;
+const int globalPredictorSize = 8192; // equals 2^globalHistoryBits
+
+const int localCtrBits = 2;
+const int localHistoryBits = 11;
+const int localHistoryTableSize = 2048; // equals 2^localHistoryBits
+const int localPredictorSize = 2048;
+
+const double Woutdrvnandn = 30 * 0.09;   // original note: (24.0 * LSCALE); NOTE(review): LSCALE is not defined in the visible source and the literals here differ from the notes — confirm scaling
+const double Woutdrvnandp = 12.5 * 0.09; // original note: (10.0 * LSCALE)
+const double Woutdrvnorn = 7.5 * 0.09;   // original note: (6.0 * LSCALE)
+const double Woutdrvnorp = 50 * 0.09;    // original note: (40.0 * LSCALE)
+const double Woutdrivern = 60 * 0.09;    // original note: (48.0 * LSCALE)
+const double Woutdriverp = 100 * 0.09;   // original note: (80.0 * LSCALE)
+
+/*
+smtCommitPolicy=RoundRobin
+smtFetchPolicy=SingleThread
+smtIQPolicy=Partitioned
+smtIQThreshold=100
+smtLSQPolicy=Partitioned
+smtLSQThreshold=100
+smtNumFetchingThreads=1
+smtROBPolicy=Partitioned
+smtROBThreshold=100
+squashWidth=8
+*/
+
+/*
+prefetch_access=false
+prefetch_cache_check_push=true
+prefetch_data_accesses_only=false
+prefetch_degree=1
+prefetch_latency=10000
+prefetch_miss=false
+prefetch_past_page=false
+prefetch_policy=none
+prefetch_serial_squash=false
+prefetch_use_cpu_id=true
+prefetcher_size=100
+prioritizeRequests=false
+repl=Null
+
+
+split=false
+split_size=0
+subblock_size=0
+tgts_per_mshr=20
+trace_addr=0
+two_queue=false
+
+cpu_side=system.cpu0.dcache_port
+mem_side=system.tol2bus.port[2]
+*/
+
+//[system.cpu0.dtb]
+// type=AlphaDT
+
+#endif /* ARCH_CONST_H_ */
diff --git a/src/gpuwattch/array.cc b/src/gpuwattch/array.cc
new file mode 100644
index 000000000..75ed23744
--- /dev/null
+++ b/src/gpuwattch/array.cc
@@ -0,0 +1,308 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#define GLOBALVAR
+#include "array.h"
+#include "cacti/area.h"
+#include "decoder.h"
+#include "globalvar.h"
+#include "parameter.h"
+#include <assert.h>
+#include <iostream>
+#include <math.h>
+
+using namespace std;
+
+ArrayST::ArrayST(const InputParameter *configure_interface, string _name,
+                 enum Device_ty device_ty_, bool opt_local_,
+                 enum Core_type core_ty_, bool _is_default)
+    : l_ip(*configure_interface), name(_name), device_ty(device_ty_),
+      opt_local(opt_local_), core_ty(core_ty_), is_default(_is_default) {
+  // l_ip is a private copy of *configure_interface; cache_sz is floored at 64.
+  if (l_ip.cache_sz < 64)
+    l_ip.cache_sz = 64;
+  l_ip.error_checking(); // not only does the error checking but also fills in
+                         // some missing parameters
+  optimize_array(); // called from the ctor, so ArrayST's own version runs
+}
+
+void ArrayST::compute_base_power() { // evaluates the current l_ip via CACTI
+  // (disabled) l_ip.out_w               =l_ip.line_sz*8;
+  local_result = cacti_interface(&l_ip); // replaces any previous result
+}
+
+void ArrayST::optimize_array() { // fills local_result, then post-scales it
+  list<uca_org_t> candidate_solutions(0);
+  list<uca_org_t>::iterator candidate_iter, min_dynamic_energy_iter;
+  // temp_res only ever aliases local_result, to call cleanup() on it.
+  uca_org_t *temp_res = 0;
+  local_result.valid = false;
+  // Timing targets come from l_ip; both overflow flags start out set.
+  double throughput = l_ip.throughput, latency = l_ip.latency;
+  double area_efficiency_threshold = 20.0;
+  bool throughput_overflow = true, latency_overflow = true;
+  compute_base_power();
+  // A target counts as met when the result exceeds it by at most 1e-10.
+  if ((local_result.cycle_time - throughput) <= 1e-10)
+    throughput_overflow = false;
+  if ((local_result.access_time - latency) <= 1e-10)
+    latency_overflow = false;
+
+  if (opt_for_clk && opt_local) {
+    if (throughput_overflow || latency_overflow) {
+      l_ip.ed = 0;
+      // The first pass missed a target: switch l_ip to fixed weights/deviations.
+      l_ip.delay_wt = 100; // Fixed number, make sure timing can be satisfied.
+      l_ip.cycle_time_wt = 1000;
+
+      l_ip.area_wt = 10; // Fixed number; used for the exhaustive search over
+                         // individual components.
+      l_ip.dynamic_power_wt = 10; // Fixed number; used for the exhaustive
+                                  // search over individual components.
+      l_ip.leakage_power_wt = 10;
+
+      l_ip.delay_dev =
+          1000000; // Fixed number, make sure timing can be satisfied.
+      l_ip.cycle_time_dev = 100;
+
+      l_ip.area_dev = 1000000; // Fixed number; used for the exhaustive
+                               // search over individual components.
+      l_ip.dynamic_power_dev =
+          1000000; // Fixed number; used for the exhaustive search over
+                   // individual components.
+      l_ip.leakage_power_dev = 1000000;
+
+      throughput_overflow =
+          true; // Reset overflow flags before the optimization iterations
+      latency_overflow = true;
+
+      temp_res = &local_result; // Clean up the result optimized for ED^2P
+      temp_res->cleanup();
+    }
+
+    while ((throughput_overflow || latency_overflow) &&
+           l_ip.cycle_time_dev > 10) // && l_ip.delay_dev > 10
+    {
+      compute_base_power();
+
+      l_ip.cycle_time_dev -=
+          10; // This is the time_dev to be used for the next iteration
+
+      // Search order: from best area to worst area, i.e. from worst timing
+      // to best timing.
+      if ((((local_result.cycle_time - throughput) <= 1e-10) &&
+           (local_result.access_time - latency) <= 1e-10) ||
+          (local_result.data_array2->area_efficiency <
+               area_efficiency_threshold &&
+           l_ip.assoc == 0)) { // if no satisfiable solution is found, the
+                               // most aggressive one is left
+        candidate_solutions.push_back(local_result);
+        // output_data_csv(candidate_solutions.back());
+        if (((local_result.cycle_time - throughput) <= 1e-10) &&
+            ((local_result.access_time - latency) <= 1e-10))
+        // only stop when timing is really met, not because of the CAM clause
+        {
+          throughput_overflow = false;
+          latency_overflow = false;
+        }
+
+      } else {
+        // TODO: decide whether partially satisfied results should be checked
+        // too, or whether only the flags should change.
+        if ((local_result.cycle_time - throughput) <= 1e-10)
+          throughput_overflow = false;
+        if ((local_result.access_time - latency) <= 1e-10)
+          latency_overflow = false;
+
+        if (l_ip.cycle_time_dev > 10) { // if not >10 local_result is the last
+                                        // result, it cannot be cleaned up
+          temp_res = &local_result; // Only solutions not saved in the list need
+                                    // to be cleaned up
+          temp_res->cleanup();
+        }
+      }
+      //			l_ip.cycle_time_dev-=10;
+      //			l_ip.delay_dev-=10;
+    }
+
+    if (l_ip.assoc > 0) {
+      // For array structures except CAM and FA, give a warning but still
+      // provide a result with the best timing found
+      if (throughput_overflow == true)
+        cout << "Warning: " << name
+             << " array structure cannot satisfy throughput constraint."
+             << endl;
+      if (latency_overflow == true)
+        cout << "Warning: " << name
+             << " array structure cannot satisfy latency constraint." << endl;
+    }
+
+    // Disabled alternative branch for CAM and FA structures, kept for reference:
+    //  else
+    //  {
+    //    /* According to "Content-Addressable Memory (CAM) Circuits and
+    //       Architectures: A Tutorial and Survey" by Kostas Pagiamtzis
+    //       et al., CAM structures can be heavily pipelined and use
+    //       look-ahead techniques, therefore timing can be relaxed.
+    //       But McPAT does not model the advanced techniques. If
+    //       optimization continued, the area efficiency would be too
+    //       low.
+    //    */
+    //    // For CAM and FA, stop opt if area efficiency is too low
+    //    if (throughput_overflow == true)
+    //      cout << "Warning: "
+    //           << " McPAT stopped optimization on throughput for "
+    //           << name
+    //           << " array structure because its area efficiency is below "
+    //           << area_efficiency_threshold << "% " << endl;
+    //    if (latency_overflow == true)
+    //      cout << "Warning: "
+    //           << " McPAT stopped optimization on latency for "
+    //           << name
+    //           << " array structure because its area efficiency is below "
+    //           << area_efficiency_threshold << "% " << endl;
+    //  }
+
+    // Select the saved candidate with the smallest power.readOp.dynamic and
+    // copy it into local_result.
+    double min_dynamic_energy = BIGNUM;
+    if (candidate_solutions.empty() == false) {
+      local_result.valid = true;
+      for (candidate_iter = candidate_solutions.begin();
+           candidate_iter != candidate_solutions.end(); ++candidate_iter)
+
+      {
+        if (min_dynamic_energy > (candidate_iter)->power.readOp.dynamic) {
+          min_dynamic_energy = (candidate_iter)->power.readOp.dynamic;
+          min_dynamic_energy_iter = candidate_iter;
+          local_result = *(min_dynamic_energy_iter);
+          // TODO: since results are reordered, results and l_ip may mismatch;
+          // the final output spreadsheets may therefore show the mismatch.
+          // NOTE(review): a superseded former minimum never gets cleanup().
+        } else {
+          candidate_iter->cleanup();
+        }
+      }
+    }
+    candidate_solutions.clear();
+  }
+  // The scaling below runs whether or not the branch above was taken.
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(device_ty, core_ty);
+
+  double macro_layout_overhead = g_tp.macro_layout_overhead;
+  double chip_PR_overhead = g_tp.chip_layout_overhead;
+  double total_overhead = macro_layout_overhead * chip_PR_overhead;
+  local_result.area *= total_overhead;
+
+  // maintain constant power density
+  double pppm_t[4] = {total_overhead, 1, 1, total_overhead};
+  // Dynamic terms scale by sckt_co_eff, read leakage by the bank count.
+  double sckRation = g_tp.sckt_co_eff;
+  local_result.power.readOp.dynamic *= sckRation;
+  local_result.power.writeOp.dynamic *= sckRation;
+  local_result.power.searchOp.dynamic *= sckRation;
+  local_result.power.readOp.leakage *= l_ip.nbanks;
+  local_result.power.readOp.longer_channel_leakage =
+      local_result.power.readOp.leakage * long_channel_device_reduction;
+  local_result.power = local_result.power * pppm_t;
+  // The same scaling is applied to the data array ...
+  local_result.data_array2->power.readOp.dynamic *= sckRation;
+  local_result.data_array2->power.writeOp.dynamic *= sckRation;
+  local_result.data_array2->power.searchOp.dynamic *= sckRation;
+  local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
+  local_result.data_array2->power.readOp.longer_channel_leakage =
+      local_result.data_array2->power.readOp.leakage *
+      long_channel_device_reduction;
+  local_result.data_array2->power = local_result.data_array2->power * pppm_t;
+  // ... and to the tag array, only for caches that are not pure CAM/RAM/FA.
+  if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) {
+    local_result.tag_array2->power.readOp.dynamic *= sckRation;
+    local_result.tag_array2->power.writeOp.dynamic *= sckRation;
+    local_result.tag_array2->power.searchOp.dynamic *= sckRation;
+    local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
+    local_result.tag_array2->power.readOp.longer_channel_leakage =
+        local_result.tag_array2->power.readOp.leakage *
+        long_channel_device_reduction;
+    local_result.tag_array2->power = local_result.tag_array2->power * pppm_t;
+  }
+}
+
+void ArrayST::leakage_feedback(double temperature) {
+  // Update the temperature, rounded to the nearest multiple of 10 (the cast
+  // applies to round(temperature / 10.0), before the multiplication).
+  l_ip.temp = (unsigned int)round(temperature / 10.0) * 10;
+  // l_ip is otherwise already set and error-checked by the constructor.
+  // This corresponds to cacti_interface() in the initialization process.
+  // Leakage power is updated here.
+  reconfigure(&l_ip, &local_result);
+
+  // Scale the power values. This is part of ArrayST::optimize_array().
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(device_ty, core_ty);
+
+  double macro_layout_overhead = g_tp.macro_layout_overhead;
+  double chip_PR_overhead = g_tp.chip_layout_overhead;
+  double total_overhead = macro_layout_overhead * chip_PR_overhead;
+  // NOTE(review): unlike optimize_array(), local_result.area is not rescaled.
+  double pppm_t[4] = {total_overhead, 1, 1, total_overhead};
+  // NOTE(review): confirm reconfigure() resets the terms rescaled below.
+  double sckRation = g_tp.sckt_co_eff;
+  local_result.power.readOp.dynamic *= sckRation;
+  local_result.power.writeOp.dynamic *= sckRation;
+  local_result.power.searchOp.dynamic *= sckRation;
+  local_result.power.readOp.leakage *= l_ip.nbanks;
+  local_result.power.readOp.longer_channel_leakage =
+      local_result.power.readOp.leakage * long_channel_device_reduction;
+  local_result.power = local_result.power * pppm_t;
+  // The same scaling is applied to the data array ...
+  local_result.data_array2->power.readOp.dynamic *= sckRation;
+  local_result.data_array2->power.writeOp.dynamic *= sckRation;
+  local_result.data_array2->power.searchOp.dynamic *= sckRation;
+  local_result.data_array2->power.readOp.leakage *= l_ip.nbanks;
+  local_result.data_array2->power.readOp.longer_channel_leakage =
+      local_result.data_array2->power.readOp.leakage *
+      long_channel_device_reduction;
+  local_result.data_array2->power = local_result.data_array2->power * pppm_t;
+  // ... and to the tag array, only for caches that are not pure CAM/RAM/FA.
+  if (!(l_ip.pure_cam || l_ip.pure_ram || l_ip.fully_assoc) && l_ip.is_cache) {
+    local_result.tag_array2->power.readOp.dynamic *= sckRation;
+    local_result.tag_array2->power.writeOp.dynamic *= sckRation;
+    local_result.tag_array2->power.searchOp.dynamic *= sckRation;
+    local_result.tag_array2->power.readOp.leakage *= l_ip.nbanks;
+    local_result.tag_array2->power.readOp.longer_channel_leakage =
+        local_result.tag_array2->power.readOp.leakage *
+        long_channel_device_reduction;
+    local_result.tag_array2->power = local_result.tag_array2->power * pppm_t;
+  }
+}
+
+ArrayST::~ArrayST() { local_result.cleanup(); } // teardown is delegated to uca_org_t::cleanup()
diff --git a/src/gpuwattch/array.h b/src/gpuwattch/array.h
new file mode 100644
index 000000000..8b52427f1
--- /dev/null
+++ b/src/gpuwattch/array.h
@@ -0,0 +1,117 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef ARRAY_H_
+#define ARRAY_H_
+
+#include "basic_components.h"
+#include "cacti/cacti_interface.h"
+#include "cacti/component.h"
+#include "cacti/const.h"
+#include "cacti/parameter.h"
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+// One array structure: l_ip is the CACTI input, local_result the result.
+class ArrayST : public Component {
+public:
+  ArrayST() // scalar members get defined values rather than staying unset
+      : device_ty(Core_device), opt_local(true), core_ty(Inorder),
+        is_default(true) {}
+  ArrayST(const InputParameter *configure_interface, string _name,
+          enum Device_ty device_ty_, bool opt_local_ = true,
+          enum Core_type core_ty_ = Inorder, bool _is_default = true);
+
+  InputParameter l_ip;      // private copy of the caller's configuration
+  string name;              // printed in the constraint warnings
+  enum Device_ty device_ty; // selects the longer-channel leakage factor
+  bool opt_local;           // enables the local timing optimization loop
+  enum Core_type core_ty;   // also feeds the longer-channel leakage factor
+  bool is_default;
+  uca_org_t local_result;   // cleaned up by the destructor
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  // NOTE(review): optimize_array() is virtual but is invoked from the ctor.
+  virtual void optimize_array();
+  virtual void compute_base_power();
+  virtual ~ArrayST();
+  void leakage_feedback(double temperature); // temperature rounded to 10s
+};
+
+// Bundles the ArrayST models that make up an instruction cache. Every
+// non-null pointer is owned by this object and deleted in the destructor.
+// NOTE(review): copy construction/assignment are not disabled, so copying an
+// InstCache would make two objects delete the same arrays -- confirm that no
+// caller copies it.
+// NOTE(review): ~InstCache is not declared virtual here; whether deleting a
+// derived object through an InstCache* is safe depends on Component -- confirm.
+class InstCache : public Component {
+public:
+  ArrayST *caches;
+  ArrayST *missb;
+  ArrayST *ifb;
+  ArrayST *prefetchb;
+  powerDef power_t; // temp value holder for both (max) power and runtime power
+
+  // All sub-arrays start out as null.
+  InstCache() : caches(0), missb(0), ifb(0), prefetchb(0) {}
+
+  // Deletes the sub-arrays and nulls the pointers. delete on a null pointer
+  // is a no-op, so unset members need no explicit check. Each ArrayST
+  // destructor is responsible for its own local_result.cleanup().
+  ~InstCache() {
+    delete caches;
+    caches = 0;
+    delete missb;
+    missb = 0;
+    delete ifb;
+    ifb = 0;
+    delete prefetchb;
+    prefetchb = 0;
+  }
+};
+
+// InstCache plus one more owned sub-array, wbb (presumably the write-back
+// buffer -- TODO confirm).
+class DataCache : public InstCache {
+public:
+  ArrayST *wbb;
+  DataCache() : wbb(0) {}
+  ~DataCache() {
+    delete wbb; // null-safe; the base destructor frees the other arrays
+    wbb = 0;
+  }
+};
+
+#endif /* ARRAY_H_ */
diff --git a/src/gpuwattch/basic_components.cc b/src/gpuwattch/basic_components.cc
new file mode 100644
index 000000000..391303d05
--- /dev/null
+++ b/src/gpuwattch/basic_components.cc
@@ -0,0 +1,117 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "basic_components.h"
+#include <assert.h>
+#include <cmath>
+#include <iostream>
+
+// Returns the leakage reduction factor for a device category:
+//   (1 - p) + p * g_tp.peri_global.long_channel_leakage_reduction
+// where p is the fraction of longer-channel devices assumed for device_ty.
+// Only the core fraction depends on core_ty. An unrecognized device_ty is
+// fatal: a message is printed and the process exits with a failure status.
+double longer_channel_device_reduction(enum Device_ty device_ty,
+                                       enum Core_type core_ty) {
+  const double longer_channel_device_percentage_llc = 1.0;
+  const double longer_channel_device_percentage_uncore = 0.82;
+  double longer_channel_device_percentage_core;
+  if (core_ty == OOO) {
+    longer_channel_device_percentage_core =
+        0.56; // 0.54 Xeon Tulsa //0.58 Nehalem
+    // longer_channel_device_percentage_uncore = 0.76;//0.85 Nehalem
+  } else {
+    longer_channel_device_percentage_core = 0.8; // Niagara
+    // longer_channel_device_percentage_uncore = 0.9;//Niagara
+  }
+
+  double long_channel_device_reduction = 0.0;
+  if (device_ty == Core_device) {
+    long_channel_device_reduction =
+        (1 - longer_channel_device_percentage_core) +
+        longer_channel_device_percentage_core *
+            g_tp.peri_global.long_channel_leakage_reduction;
+  } else if (device_ty == Uncore_device) {
+    long_channel_device_reduction =
+        (1 - longer_channel_device_percentage_uncore) +
+        longer_channel_device_percentage_uncore *
+            g_tp.peri_global.long_channel_leakage_reduction;
+  } else if (device_ty == LLC_device) {
+    long_channel_device_reduction =
+        (1 - longer_channel_device_percentage_llc) +
+        longer_channel_device_percentage_llc *
+            g_tp.peri_global.long_channel_leakage_reduction;
+  } else {
+    // Exit with a non-zero status; exit(0) would report success to the caller.
+    cout << "unknown device category" << endl;
+    exit(1);
+  }
+
+  return long_channel_device_reduction;
+}
+
+// Adds two counter triples field by field (access, hit, miss).
+statsComponents operator+(const statsComponents &x, const statsComponents &y) {
+  statsComponents sum(x); // start from the left operand
+
+  sum.access += y.access;
+  sum.hit += y.hit;
+  sum.miss += y.miss;
+  return sum;
+}
+
+// Scales each counter by its own weight: y[0] for access, y[1] for hit and
+// y[2] for miss. y must point to at least three doubles.
+statsComponents operator*(const statsComponents &x, double const *const y) {
+  statsComponents scaled(x);
+  scaled.access *= y[0];
+  scaled.hit *= y[1];
+  scaled.miss *= y[2];
+  return scaled;
+}
+
+// Sums the read, write and search counters of two statsDef objects.
+statsDef operator+(const statsDef &x, const statsDef &y) {
+  statsDef sum(x);
+  sum.readAc = sum.readAc + y.readAc;
+  sum.writeAc = sum.writeAc + y.writeAc;
+  sum.searchAc = sum.searchAc + y.searchAc;
+  return sum;
+}
+
+// Applies the same weight array y to the read, write and search counters.
+statsDef operator*(const statsDef &x, double const *const y) {
+  statsDef scaled(x);
+  scaled.readAc = scaled.readAc * y;
+  scaled.writeAc = scaled.writeAc * y;
+  scaled.searchAc = scaled.searchAc * y;
+  return scaled;
+}
diff --git a/src/gpuwattch/basic_components.h b/src/gpuwattch/basic_components.h
new file mode 100644
index 000000000..77b5444a4
--- /dev/null
+++ b/src/gpuwattch/basic_components.h
@@ -0,0 +1,315 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef BASIC_COMPONENTS_H_
+#define BASIC_COMPONENTS_H_
+
+#include "XML_Parse.h"
+#include "cacti/parameter.h"
+#include <vector>
+
+const double cdb_overhead = 1.1; // NOTE(review): presumably a CDB overhead factor -- confirm
+
+enum FU_type { FPU, ALU, MUL }; // functional-unit kinds
+
+enum Core_type { OOO, Inorder }; // also the second argument of longer_channel_device_reduction()
+
+enum Renaming_type { RAMbased, CAMbased };
+
+enum Scheduler_type { PhysicalRegFile, ReservationStation };
+
+enum cache_level { L2, L3, L1Directory, L2Directory };
+
+enum MemoryCtrl_type {
+  MC,    // memory controller
+  FLASHC // flash controller
+};
+
+enum Dram_type { GDDR5, GDDR3 };
+
+enum Dir_type {
+  ST,  // shadowed tag
+  DC,  // directory cache
+  SBT, // static bank tag
+  NonDir
+
+};
+
+enum Cache_policy { Write_through, Write_back };
+
+enum Device_ty { Core_device, Uncore_device, LLC_device }; // category passed to longer_channel_device_reduction()
+
+// Access/hit/miss counters for one kind of operation.
+class statsComponents {
+public:
+  double access;
+  double hit;
+  double miss;
+
+  statsComponents() { reset(); }
+  statsComponents(const statsComponents &obj)
+      : access(obj.access), hit(obj.hit), miss(obj.miss) {}
+  statsComponents &operator=(const statsComponents &rhs) {
+    access = rhs.access;
+    hit = rhs.hit;
+    miss = rhs.miss;
+    return *this;
+  }
+  // Zeroes all three counters.
+  void reset() { access = hit = miss = 0; }
+
+  // Field-wise sum and per-field scaling (y[0..2]); defined out of line.
+  friend statsComponents operator+(const statsComponents &x,
+                                   const statsComponents &y);
+  friend statsComponents operator*(const statsComponents &x,
+                                   double const *const y);
+};
+
+// Groups the counters of the three operation kinds: read, write and search.
+class statsDef {
+public:
+  statsComponents readAc;
+  statsComponents writeAc;
+  statsComponents searchAc;
+
+  statsDef() {} // each member zero-initializes itself
+  void reset() {
+    searchAc.reset();
+    writeAc.reset();
+    readAc.reset();
+  }
+  friend statsDef operator+(const statsDef &x, const statsDef &y);
+  friend statsDef operator*(const statsDef &x, double const *const y);
+};
+
+double longer_channel_device_reduction(enum Device_ty device_ty = Core_device,
+                                       enum Core_type core_ty = Inorder);
+
+class CoreDynParam { // per-core parameter record; all fields are public
+public:
+  CoreDynParam(){}; // leaves every member uninitialized
+  CoreDynParam(ParseXML *XML_interface, int ithCore_); // defined elsewhere
+  // Disabled initializer list, kept for reference:
+  //    :XML(XML_interface),
+  //     ithCore(ithCore_)
+  //     core_ty(inorder),
+  //     rm_ty(CAMbased),
+  //     scheu_ty(PhysicalRegFile),
+  //     clockRate(1e9),//1GHz
+  //     arch_ireg_width(32),
+  //     arch_freg_width(32),
+  //     phy_ireg_width(128),
+  //     phy_freg_width(128),
+  //     perThreadState(8),
+  //     globalCheckpoint(32),
+  //     instructionLength(32){};
+  // ParseXML * XML;
+  bool opt_local;
+  bool x86;
+  bool Embedded;
+  enum Core_type core_ty;
+  enum Renaming_type rm_ty;
+  enum Scheduler_type scheu_ty;
+  double clockRate, executionTime;
+  int arch_ireg_width, arch_freg_width, phy_ireg_width, phy_freg_width;
+  int num_IRF_entry, num_FRF_entry, num_ifreelist_entries,
+      num_ffreelist_entries;
+  int fetchW, decodeW, issueW, peak_issueW, commitW, peak_commitW, predictionW,
+      fp_issueW, fp_decodeW;
+  int perThreadState, globalCheckpoint, instruction_length, pc_width,
+      opcode_length, micro_opcode_length;
+  int num_hthreads, pipeline_stages, fp_pipeline_stages, num_pipelines,
+      num_fp_pipelines;
+  int num_alus, num_muls;
+  double num_fpus; // note: double, unlike num_alus/num_muls
+  int int_data_width, fp_data_width, v_address_width, p_address_width;
+  double pipeline_duty_cycle, total_cycles, busy_cycles, idle_cycles;
+  bool regWindowing, multithreaded;
+  double pppm_lkg_multhread[4];
+  double IFU_duty_cycle, BR_duty_cycle, LSU_duty_cycle, MemManU_I_duty_cycle,
+      MemManU_D_duty_cycle, ALU_duty_cycle, MUL_duty_cycle, FPU_duty_cycle,
+      ALU_cdb_duty_cycle, MUL_cdb_duty_cycle, FPU_cdb_duty_cycle;
+  ~CoreDynParam(){};
+};
+
+class CacheDynParam { // per-cache parameter record; all fields are public
+public:
+  CacheDynParam(){}; // leaves every member except name uninitialized
+  CacheDynParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  enum Dir_type dir_ty;
+  double clockRate, executionTime;
+  double capacity, blockW, assoc, nbanks; // stored as double, not int
+  double throughput, latency;
+  double duty_cycle, dir_duty_cycle;
+  // double duty_cycle;
+  int missb_size, fu_size, prefetchb_size, wbb_size;
+  ~CacheDynParam(){};
+};
+
+class DRAMParam { // DRAM power-model parameter record; all fields are public
+public:
+  DRAMParam(){}; // leaves every member except name uninitialized
+  DRAMParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  double clockRate;
+  double executionTime;
+  double cmd_coeff;
+  double activity_coeff;
+  double nop_coeff;
+  double act_coeff;
+  double pre_coeff;
+  double rd_coeff;
+  double wr_coeff;
+  double req_coeff;
+  double const_coeff;
+
+  int detailed_dram_model; // 1 - use the newly added DRAM model (GDDR5 only),
+                           // 0 - use the empirical model
+  // the following are the currents specified by the DATA SHEET
+  // unit: mA
+  int idd0;
+  int idd1;
+  int idd2p;
+  int idd2n;
+  int idd3p;
+  int idd3n;
+  int idd4r;
+  int idd4w;
+  int idd5;
+  int idd6;
+  int idd7;
+
+  // datasheet_vdd is the vdd specified by the DATA SHEET; NOT the actual VDD
+  double datasheet_vdd;
+  double actual_vdd;
+
+  // the following are the timing parameters specified by the DATA SHEET
+  // unit: ns
+  int t_ccd;
+  int t_rrd;
+  int t_rcd;
+  int t_ras;
+  int t_rp;
+  int t_rc;
+  int t_cl;
+  int t_cdlr;
+  int t_wr;
+
+  // the following are the DRAM clocks
+  // unit: MHz
+  int datasheet_operating_clock; // this is specified by the DATA SHEET. This
+                                 // is NOT the actual DRAM clock
+  int actual_operating_clock;
+
+  // the following are each DRAM bank's IO info
+  int bank_width;                  // in bits
+  int dqs_signal_width;            // in bits
+  int extra_dq_write_signal_width; // in bits
+  int per_dq_read_power;           // in mW
+  int per_dq_write_power;          // in mW
+
+  ~DRAMParam(){};
+};
+
+class MCParam { // memory-controller parameter record; all fields are public
+public:
+  MCParam(){}; // leaves every member except name uninitialized
+  MCParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  double clockRate, num_mcs, peakDataTransferRate, num_channels;
+  // Disabled fields, kept for reference:
+  //	double mcPHYperGbit;
+  //	double area;
+  int llcBlockSize, dataBusWidth, addressBusWidth;
+  int opcodeW;
+  int memAccesses;
+  int memRank;
+  int type;
+  double frontend_duty_cycle, duty_cycle, perc_load;
+  double executionTime, reads, writes;
+  bool LVDS, withPHY;
+
+  ~MCParam(){};
+};
+
+class NoCParam { // network-on-chip parameter record; all fields are public
+public:
+  NoCParam(){}; // leaves every member except name uninitialized
+  NoCParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  double clockRate;
+  int flit_size;
+  int input_ports, output_ports, min_ports, global_linked_ports;
+  int virtual_channel_per_port, input_buffer_entries_per_vc;
+  int horizontal_nodes, vertical_nodes, total_nodes;
+  double executionTime, total_access, link_throughput, link_latency, duty_cycle,
+      chip_coverage, route_over_perc;
+  bool has_global_link, type; // note: type is a bool here, int in MCParam
+
+  ~NoCParam(){};
+};
+
+class ProcParam { // processor-level counts and homogeneity flags
+public:
+  ProcParam(){}; // leaves every member except name uninitialized
+  ProcParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir, numMC, numMCChannel;
+  bool homoCore, homoL2, homoL3, homoNOC, homoL1Dir, homoL2Dir;
+
+  ~ProcParam(){};
+};
+
+class NIUParam { // NIU parameter record; all fields are public
+public:
+  NIUParam(){}; // leaves every member except name uninitialized
+  NIUParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  double clockRate;
+  int num_units;
+  int type;
+  double duty_cycle, perc_load;
+  ~NIUParam(){};
+};
+
+class PCIeParam { // PCIe parameter record; all fields are public
+public:
+  PCIeParam(){}; // leaves every member except name uninitialized
+  PCIeParam(ParseXML *XML_interface, int ithCache_); // defined elsewhere
+  string name;
+  double clockRate;
+  int num_channels, num_units;
+  bool withPHY;
+  int type;
+  double duty_cycle, perc_load;
+  ~PCIeParam(){};
+};
+#endif /* BASIC_COMPONENTS_H_ */
diff --git a/src/gpuwattch/cacti/README b/src/gpuwattch/cacti/README
new file mode 100644
index 000000000..de429d2bb
--- /dev/null
+++ b/src/gpuwattch/cacti/README
@@ -0,0 +1,94 @@
+-----------------------------------------------------------
+          ____    _    ____ _____ ___    __    ____  
+         / ___|  / \  / ___|_   _|_ _|  / /_  | ___| 
+        | |     / _ \| |     | |  | |  | '_ \ |___ \ 
+        | |___ / ___ \ |___  | |  | |  | (_) | ___) |
+         \____/_/   \_\____| |_| |___|  \___(_)____/ 
+
+
+             A Tool to Model Caches/Memories
+-----------------------------------------------------------
+
+CACTI is an analytical tool that takes a set of cache/memory para-
+meters as input and calculates its access time, power, cycle 
+time, and area.
+CACTI was originally developed by Dr. Jouppi and Dr. Wilton
+in 1993 and since then it has undergone five major 
+revisions.
+
+List of features (version 1-6.5):
+===============================
+The following is the list of features supported by the tool. 
+
+* Power, delay, area, and cycle time model for 
+                  direct mapped caches
+                  set-associative caches
+                  fully associative caches
+                  Embedded DRAM memories
+                  Commodity DRAM memories
+                  
+* Support for modeling multi-ported uniform cache access (UCA)
+  and multi-banked, multi-ported non-uniform cache access (NUCA).
+
+* Leakage power calculation that also considers the operating
+  temperature of the cache.
+  
+* Router power model.
+
+* Interconnect model with different delay, power, and area 
+  properties including low-swing wire model.
+
+* An interface to perform trade-off analysis involving power, delay,
+  area, and bandwidth.
+
+* All process specific values used by the tool are obtained
+  from ITRS and currently, the tool supports 90nm, 65nm, 45nm, 
+  and 32nm technology nodes.
+
+Version 6.5 has a new c++ code base and includes numerous bug fixes.
+CACTI 5.3 and 6.0 activate an entire row of mats to read/write a single
+block of data. This technique improves reliability at the cost of  
+power. CACTI 6.5 activates only the minimum number of mats needed to retrieve
+a block, which minimizes power.
+
+How to use the tool?
+====================
+Prior versions of CACTI take input parameters such as cache
+size and technology node as a set of command line arguments. 
+To avoid a long list of command line arguments, 
+CACTI 6.5 lets users specify their cache model in a more 
+detailed manner by using a config file (cache.cfg).
+
+-> define the cache model using cache.cfg
+-> run the "cacti" binary <./cacti -infile cache.cfg>
+
+CACTI6.5 also provides a command line interface similar to earlier versions
+of CACTI. The command line interface can be used as
+
+./cacti  cache_size line_size associativity rw_ports excl_read_ports excl_write_ports 
+  single_ended_read_ports search_ports banks tech_node output_width specific_tag tag_width
+  access_mode cache main_mem obj_func_delay obj_func_dynamic_power obj_func_leakage_power
+  obj_func_cycle_time obj_func_area dev_func_delay dev_func_dynamic_power dev_func_leakage_power
+  dev_func_area dev_func_cycle_time ed_ed2_none temp wt data_arr_ram_cell_tech_flavor_in
+  data_arr_peri_global_tech_flavor_in tag_arr_ram_cell_tech_flavor_in tag_arr_peri_global_tech_flavor_in
+  interconnect_projection_type_in wire_inside_mat_type_in wire_outside_mat_type_in
+  REPEATERS_IN_HTREE_SEGMENTS_in VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in 
+  BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in PAGE_SIZE_BITS_in BURST_LENGTH_in
+  INTERNAL_PREFETCH_WIDTH_in force_wiretype wiretype force_config ndwl ndbl nspd ndcm 
+  ndsam1 ndsam2 ecc
+
+For complete documentation of the tool, please refer to the CACTI-5.3 and 6.0
+technical reports and the following paper,
+"Optimizing NUCA Organizations and Wiring Alternatives for 
+Large Caches With CACTI 6.0", that appears in MICRO 2007.
+
+We are still improving the tool and refining the code. If you
+have any comments, questions, or suggestions please write to
+us.
+
+Naveen Muralimanohar             Jung Ho Ahn        Sheng Li
+naveen.muralimanohar@hp.com      gajh@snu.ac.kr     sheng.li@hp.com
+
+
+
+
diff --git a/src/gpuwattch/cacti/Ucache.cc b/src/gpuwattch/cacti/Ucache.cc
new file mode 100644
index 000000000..e855238c2
--- /dev/null
+++ b/src/gpuwattch/cacti/Ucache.cc
@@ -0,0 +1,916 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#include <time.h>
+#include <math.h>
+
+
+#include "area.h"
+#include "bank.h"
+#include "basic_circuit.h"
+#include "component.h"
+#include "const.h"
+#include "decoder.h"
+#include "parameter.h"
+#include "Ucache.h"
+#include "subarray.h"
+#include "uca.h"
+
+#include <pthread.h>
+#include <iostream>
+#include <algorithm>
+#include <list>
+
+using namespace std;
+
+const uint32_t nthreads = NTHREADS;  // worker-thread count: iteration stride in calc_time_mt_wrapper(), array size in solve()
+
+
+// Lowers each tracked minimum to the value held in *val wherever *val is smaller.
+void min_values_t::update_min_values(const min_values_t * val) {
+  if (min_delay > val->min_delay)     min_delay   = val->min_delay;
+  if (min_dyn > val->min_dyn)         min_dyn     = val->min_dyn;
+  if (min_leakage > val->min_leakage) min_leakage = val->min_leakage;
+  if (min_area > val->min_area)       min_area    = val->min_area;
+  if (min_cyc > val->min_cyc)         min_cyc     = val->min_cyc;
+}
+
+
+
+// Lowers each tracked minimum using a UCA result (read-op power figures are used).
+void min_values_t::update_min_values(const uca_org_t & res) {
+  if (min_delay > res.access_time)             min_delay   = res.access_time;
+  if (min_dyn > res.power.readOp.dynamic)      min_dyn     = res.power.readOp.dynamic;
+  if (min_leakage > res.power.readOp.leakage)  min_leakage = res.power.readOp.leakage;
+  if (min_area > res.area)                     min_area    = res.area;
+  if (min_cyc > res.cycle_time)                min_cyc     = res.cycle_time;
+}
+
+// Lowers each tracked minimum using the nuca_pda figures of a NUCA result.
+void min_values_t::update_min_values(const nuca_org_t * res) {
+  if (min_delay > res->nuca_pda.delay)                  min_delay   = res->nuca_pda.delay;
+  if (min_dyn > res->nuca_pda.power.readOp.dynamic)     min_dyn     = res->nuca_pda.power.readOp.dynamic;
+  if (min_leakage > res->nuca_pda.power.readOp.leakage) min_leakage = res->nuca_pda.power.readOp.leakage;
+  if (min_area > res->nuca_pda.area.get_area())         min_area    = res->nuca_pda.area.get_area();
+  if (min_cyc > res->nuca_pda.cycle_time)               min_cyc     = res->nuca_pda.cycle_time;
+}
+
+// Lowers each tracked minimum using a single memory array (read-op power figures are used).
+void min_values_t::update_min_values(const mem_array * res) {
+  if (min_delay > res->access_time)             min_delay   = res->access_time;
+  if (min_dyn > res->power.readOp.dynamic)      min_dyn     = res->power.readOp.dynamic;
+  if (min_leakage > res->power.readOp.leakage)  min_leakage = res->power.readOp.leakage;
+  if (min_area > res->area)                     min_area    = res->area;
+  if (min_cyc > res->cycle_time)                min_cyc     = res->cycle_time;
+}
+
+
+
+void * calc_time_mt_wrapper(void * void_obj)  // pthread entry point; void_obj is a calc_time_mt_wrapper_struct*
+{
+  calc_time_mt_wrapper_struct * calc_obj = (calc_time_mt_wrapper_struct *) void_obj;
+  uint32_t tid                   = calc_obj->tid;
+  list<mem_array *> & data_arr   = calc_obj->data_arr;
+  list<mem_array *> & tag_arr    = calc_obj->tag_arr;
+  bool is_tag                    = calc_obj->is_tag;
+  bool pure_ram                  = calc_obj->pure_ram;
+  bool pure_cam					 = calc_obj->pure_cam;
+  bool is_main_mem               = calc_obj->is_main_mem;
+  double Nspd_min                = calc_obj->Nspd_min;
+  min_values_t * data_res        = calc_obj->data_res;
+  min_values_t * tag_res         = calc_obj->tag_res;
+
+  data_arr.clear();  // pointers are dropped, not deleted; the lists are expected to be empty here
+  data_arr.push_back(new mem_array);  // scratch entry that the next calculate_time() call fills in
+  tag_arr.clear();
+  tag_arr.push_back(new mem_array);  // scratch entry for the tag pass
+
+  uint32_t Ndwl_niter = _log2(MAXDATAN) + 1;
+  uint32_t Ndbl_niter = _log2(MAXDATAN) + 1;
+  uint32_t Ndcm_niter = _log2(MAX_COL_MUX) + 1;
+  uint32_t niter      = Ndwl_niter * Ndbl_niter * Ndcm_niter;  // size of the flattened (Ndwl, Ndbl, Ndcm) index space
+
+
+  bool is_valid_partition;  // always assigned below: one of the two calculate_time() calls runs on every iteration
+  int wt_min, wt_max;  // inclusive wire-type range swept by the wr loop
+
+  if (g_ip->force_wiretype) {
+    if (g_ip->wt == 0) {
+      wt_min = Low_swing;
+      wt_max = Low_swing;
+    }
+    else {
+      wt_min = Global;
+      wt_max = Low_swing-1;
+    }
+  }
+  else {
+    wt_min = Global;
+    wt_max = Low_swing;
+  }
+
+  for (double Nspd = Nspd_min; Nspd <= MAXDATASPD; Nspd *= 2)
+  {
+    for (int wr = wt_min; wr <= wt_max; wr++)
+    {
+      for (uint32_t iter = tid; iter < niter; iter += nthreads)  // this thread handles every nthreads-th flattened index
+      {
+        // reconstruct Ndwl, Ndbl, Ndcm (each a power of two) from the flattened index
+        unsigned int Ndwl = 1 << (iter / (Ndbl_niter * Ndcm_niter));
+        unsigned int Ndbl = 1 << ((iter / (Ndcm_niter))%Ndbl_niter);
+        unsigned int Ndcm = 1 << (iter % Ndcm_niter);
+        for(unsigned int Ndsam_lev_1 = 1; Ndsam_lev_1 <= MAX_COL_MUX; Ndsam_lev_1 *= 2)
+        {
+          for(unsigned int Ndsam_lev_2 = 1; Ndsam_lev_2 <= MAX_COL_MUX; Ndsam_lev_2 *= 2)
+          {
+            // debugging aid: a forced data-array configuration overwrites the loop variables
+            if (g_ip->force_cache_config && is_tag == false)
+            {
+              wr   = g_ip->wt;
+              Ndwl = g_ip->ndwl;
+              Ndbl = g_ip->ndbl;
+              Ndcm = g_ip->ndcm;
+              if(g_ip->nspd != 0) {
+            	  Nspd = g_ip->nspd;
+              }
+              if(g_ip->ndsam1 != 0) {
+            	  Ndsam_lev_1 = g_ip->ndsam1;
+            	  Ndsam_lev_2 = g_ip->ndsam2;
+              }
+            }
+
+            if (is_tag == true)
+            {
+              is_valid_partition = calculate_time(is_tag, pure_ram, pure_cam, Nspd, Ndwl,
+                  Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
+                  tag_arr.back(), 0, NULL, NULL,
+                  is_main_mem);
+            }
+            // If it's a fully-associative cache, the data array partition parameters are identical to that of
+            // the tag array, so compute data array partition properties also here.
+            if (is_tag == false || g_ip->fully_assoc)
+            {
+              is_valid_partition = calculate_time(is_tag/*false*/, pure_ram, pure_cam, Nspd, Ndwl,
+                  Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2,
+                  data_arr.back(), 0, NULL, NULL,
+                  is_main_mem);
+            }
+
+            if (is_valid_partition)  // keep the filled scratch entry and append a fresh one
+            {
+              if (is_tag == true)
+              {
+                tag_arr.back()->wt = (enum Wire_type) wr;
+                tag_res->update_min_values(tag_arr.back());
+                tag_arr.push_back(new mem_array);
+              }
+              if (is_tag == false || g_ip->fully_assoc)
+              {
+                data_arr.back()->wt = (enum Wire_type) wr;
+                data_res->update_min_values(data_arr.back());
+                data_arr.push_back(new mem_array);
+              }
+            }
+
+            if (g_ip->force_cache_config && is_tag == false)  // forced config: push loop variables to their limits so the loops end
+            {
+            	wr   = wt_max;
+            	iter = niter;
+            	if(g_ip->nspd != 0) {
+            		Nspd = MAXDATASPD;
+            	}
+            	if (g_ip->ndsam1 != 0) {
+            		Ndsam_lev_1 = MAX_COL_MUX+1;
+            		Ndsam_lev_2 = MAX_COL_MUX+1;
+            	}
+            }
+          }
+        }
+      }
+    }
+  }
+
+  delete data_arr.back();  // the last entry of each list is the unused scratch element
+  delete tag_arr.back();
+  data_arr.pop_back();
+  tag_arr.pop_back();
+
+  pthread_exit(NULL);  // ends the thread here; the function has no return statement
+}
+
+
+
+bool calculate_time(  // models one array partitioning; returns false when DynamicParameter marks it invalid, true otherwise
+    bool is_tag,
+    int pure_ram,
+    bool pure_cam,
+    double Nspd,
+    unsigned int Ndwl,
+    unsigned int Ndbl,
+    unsigned int Ndcm,
+    unsigned int Ndsam_lev_1,
+    unsigned int Ndsam_lev_2,
+    mem_array *ptr_array,  // receives the results when flag_results_populate is 0
+    int flag_results_populate,
+    results_mem_array *ptr_results,  // not referenced in this function
+    uca_org_t *ptr_fin_res,  // not referenced in this function
+    bool is_main_mem)
+{
+  DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem);
+
+  if (dyn_p.is_valid == false)  // invalid partitioning: ptr_array is left untouched
+  {
+    return false;
+  }
+
+  UCA * uca = new UCA(dyn_p);  // heap-allocated model; deleted just before the final return
+
+
+  if (flag_results_populate)  // NOTE: this branch is currently empty
+  { //For the final solution, populate the ptr_results data structure  -- TODO: copy only necessary variables
+  }
+  else
+  {
+	  int num_act_mats_hor_dir = uca->bank.dp.num_act_mats_hor_dir;
+	  int num_mats = uca->bank.dp.num_mats;
+	  bool is_fa = uca->bank.dp.fully_assoc;
+	  bool pure_cam = uca->bank.dp.pure_cam;  // NOTE: shadows the pure_cam parameter for the rest of this branch
+	ptr_array->Ndwl = Ndwl;
+    ptr_array->Ndbl = Ndbl;
+    ptr_array->Nspd = Nspd;
+    ptr_array->deg_bl_muxing = dyn_p.deg_bl_muxing;
+    ptr_array->Ndsam_lev_1 = Ndsam_lev_1;
+    ptr_array->Ndsam_lev_2 = Ndsam_lev_2;
+    ptr_array->access_time = uca->access_time;
+    ptr_array->cycle_time = uca->cycle_time;
+    ptr_array->multisubbank_interleave_cycle_time = uca->multisubbank_interleave_cycle_time;
+    ptr_array->area_ram_cells = uca->area_all_dataramcells;
+    ptr_array->area   = uca->area.get_area();
+    ptr_array->height = uca->area.h;
+    ptr_array->width  = uca->area.w;
+    ptr_array->mat_height = uca->bank.mat.area.h;
+    ptr_array->mat_length = uca->bank.mat.area.w;
+    ptr_array->subarray_height = uca->bank.mat.subarray.area.h;
+    ptr_array->subarray_length = uca->bank.mat.subarray.area.w;
+    ptr_array->power  = uca->power;
+    ptr_array->delay_senseamp_mux_decoder =
+      MAX(uca->delay_array_to_sa_mux_lev_1_decoder,
+          uca->delay_array_to_sa_mux_lev_2_decoder);
+    ptr_array->delay_before_subarray_output_driver         = uca->delay_before_subarray_output_driver;
+    ptr_array->delay_from_subarray_output_driver_to_output = uca->delay_from_subarray_out_drv_to_out;
+
+    ptr_array->delay_route_to_bank          = uca->htree_in_add->delay;
+    ptr_array->delay_input_htree            = uca->bank.htree_in_add->delay;
+    ptr_array->delay_row_predecode_driver_and_block = uca->bank.mat.r_predec->delay;
+    ptr_array->delay_row_decoder            = uca->bank.mat.row_dec->delay;
+    ptr_array->delay_bitlines               = uca->bank.mat.delay_bitline;
+    ptr_array->delay_matchlines               = uca->bank.mat.delay_matchchline;
+    ptr_array->delay_sense_amp              = uca->bank.mat.delay_sa;
+    ptr_array->delay_subarray_output_driver = uca->bank.mat.delay_subarray_out_drv_htree;
+    ptr_array->delay_dout_htree             = uca->bank.htree_out_data->delay;
+    ptr_array->delay_comparator             = uca->bank.mat.delay_comparator;
+
+    ptr_array->all_banks_height = uca->area.h;
+    ptr_array->all_banks_width  = uca->area.w;
+    ptr_array->area_efficiency = uca->area_all_dataramcells * 100 / (uca->area.get_area());  // percentage of total area
+
+    ptr_array->power_routing_to_bank = uca->power_routing_to_bank;
+    ptr_array->power_addr_input_htree = uca->bank.htree_in_add->power;
+    ptr_array->power_data_input_htree = uca->bank.htree_in_data->power;
+//    cout<<"power_data_input_htree"<<uca->bank.htree_in_data->power.readOp.leakage<<endl;
+    ptr_array->power_data_output_htree = uca->bank.htree_out_data->power;
+//    cout<<"power_data_output_htree"<<uca->bank.htree_out_data->power.readOp.leakage<<endl;
+    ptr_array->power_row_predecoder_drivers = uca->bank.mat.r_predec->driver_power;  // from here on: copy the mat figure, then scale its read/write/search dynamic terms by num_act_mats_hor_dir
+    ptr_array->power_row_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_row_predecoder_blocks = uca->bank.mat.r_predec->block_power;
+    ptr_array->power_row_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_row_decoders = uca->bank.mat.power_row_decoders;
+    ptr_array->power_row_decoders.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_row_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bit_mux_predecoder_drivers = uca->bank.mat.b_mux_predec->driver_power;
+    ptr_array->power_bit_mux_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bit_mux_predecoder_blocks  = uca->bank.mat.b_mux_predec->block_power;
+    ptr_array->power_bit_mux_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bit_mux_decoders = uca->bank.mat.power_bit_mux_decoders;
+    ptr_array->power_bit_mux_decoders.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bit_mux_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers = uca->bank.mat.sa_mux_lev_1_predec->driver_power;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_drivers .searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks = uca->bank.mat.sa_mux_lev_1_predec->block_power;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_1_decoders = uca->bank.mat.power_sa_mux_lev_1_decoders;
+    ptr_array->power_senseamp_mux_lev_1_decoders.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_decoders.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_1_decoders.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers = uca->bank.mat.sa_mux_lev_2_predec->driver_power;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks = uca->bank.mat.sa_mux_lev_2_predec->block_power;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_predecoder_blocks.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_senseamp_mux_lev_2_decoders = uca->bank.mat.power_sa_mux_lev_2_decoders;
+    ptr_array->power_senseamp_mux_lev_2_decoders .readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_decoders .writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_senseamp_mux_lev_2_decoders .searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_bitlines = uca->bank.mat.power_bitline;
+    ptr_array->power_bitlines.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bitlines.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_bitlines.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_sense_amps = uca->bank.mat.power_sa;
+    ptr_array->power_sense_amps.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_sense_amps.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_sense_amps.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_prechg_eq_drivers = uca->bank.mat.power_bl_precharge_eq_drv;
+    ptr_array->power_prechg_eq_drivers.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_prechg_eq_drivers.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_prechg_eq_drivers.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_output_drivers_at_subarray = uca->bank.mat.power_subarray_out_drv;
+    ptr_array->power_output_drivers_at_subarray.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_output_drivers_at_subarray.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_output_drivers_at_subarray.searchOp.dynamic *= num_act_mats_hor_dir;
+
+    ptr_array->power_comparators = uca->bank.mat.power_comparator;
+    ptr_array->power_comparators.readOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_comparators.writeOp.dynamic *= num_act_mats_hor_dir;
+    ptr_array->power_comparators.searchOp.dynamic *= num_act_mats_hor_dir;
+
+//    cout <<  "  num of mats: " << dyn_p.num_mats << endl;
+    if (is_fa || pure_cam)  // search-path figures are copied only for fully-associative or CAM arrays; their search dynamic terms scale by num_mats
+    {
+    ptr_array->power_htree_in_search = uca->bank.htree_in_search->power;
+//    cout<<"power_htree_in_search"<<uca->bank.htree_in_search->power.readOp.leakage<<endl;
+    ptr_array->power_htree_out_search = uca->bank.htree_out_search->power;
+//    cout<<"power_htree_out_search"<<uca->bank.htree_out_search->power.readOp.leakage<<endl;
+    ptr_array->power_searchline = uca->bank.mat.power_searchline;
+//    cout<<"power_searchlineh"<<uca->bank.mat.power_searchline.readOp.leakage<<endl;
+    ptr_array->power_searchline.searchOp.dynamic *= num_mats;
+    ptr_array->power_searchline_precharge = uca->bank.mat.power_searchline_precharge;
+    ptr_array->power_searchline_precharge.searchOp.dynamic *= num_mats;
+    ptr_array->power_matchlines = uca->bank.mat.power_matchline;
+    ptr_array->power_matchlines.searchOp.dynamic *= num_mats;
+    ptr_array->power_matchline_precharge = uca->bank.mat.power_matchline_precharge;
+    ptr_array->power_matchline_precharge.searchOp.dynamic *= num_mats;
+    ptr_array->power_matchline_to_wordline_drv = uca->bank.mat.power_ml_to_ram_wl_drv;
+//    cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.power_matchline.searchOp.leakage<<endl;
+    }
+
+    ptr_array->activate_energy = uca->activate_energy;
+    ptr_array->read_energy = uca->read_energy;
+    ptr_array->write_energy = uca->write_energy;
+    ptr_array->precharge_energy = uca->precharge_energy;
+    ptr_array->refresh_power = uca->refresh_power;
+    ptr_array->leak_power_subbank_closed_page = uca->leak_power_subbank_closed_page;
+    ptr_array->leak_power_subbank_open_page = uca->leak_power_subbank_open_page;
+    ptr_array->leak_power_request_and_reply_networks = uca->leak_power_request_and_reply_networks;
+
+    ptr_array->precharge_delay = uca->precharge_delay;
+
+
+//      cout<<"power_matchline.searchOp.leakage"<<uca->bank.mat.<<endl;
+//
+//    if (!(is_fa || pure_cam))
+//    {
+//     cout <<  "  num of cols: " << dyn_p.num_c_subarray << endl;
+//    }
+//    else if (is_fa)
+//    {
+//  	  cout <<  "  num of cols: " << dyn_p.tag_num_c_subarray+ dyn_p.data_num_c_subarray<< endl;
+//    } else
+//  	  cout <<  "  num of cols: " << dyn_p.tag_num_c_subarray<< endl;
+//      cout <<  uca->bank.mat.subarray.get_total_cell_area()<<endl;
+  }
+
+
+  delete uca;  // release the model allocated above
+  return true;
+}
+
+
+
+// Deviation filter for a complete cache organization.
+//
+// Each metric of `u` (access time, read dynamic power, read leakage power,
+// cycle time, area) is compared against the matching minimum in `minval`;
+// the relative overshoot, expressed in percent, must not exceed the
+// corresponding g_ip->*_dev limit. Returns true only if all five pass.
+bool check_uca_org(uca_org_t & u, min_values_t *minval) {
+  // Checks short-circuit in the order delay, dynamic, leakage, cycle, area.
+  // The delay term multiplies by 100 before dividing; the rest divide first.
+  const bool out_of_bounds =
+      (((u.access_time - minval->min_delay)*100/minval->min_delay) >
+           g_ip->delay_dev) ||
+      (((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
+           g_ip->dynamic_power_dev) ||
+      (((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
+           g_ip->leakage_power_dev) ||
+      (((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
+           g_ip->cycle_time_dev) ||
+      (((u.area - minval->min_area)/minval->min_area)*100 >
+           g_ip->area_dev);
+
+  return !out_of_bounds;
+}
+
+// Deviation filter for a single memory array organization.
+//
+// Each metric of `u` (access time, read dynamic power, read leakage power,
+// cycle time, area) is compared against the matching minimum in `minval`;
+// the relative overshoot, expressed in percent, must not exceed the
+// corresponding g_ip->*_dev limit. Returns true only if all five pass.
+bool check_mem_org(mem_array & u, const min_values_t *minval) {
+  // Checks short-circuit in the order delay, dynamic, leakage, cycle, area.
+  // The delay term multiplies by 100 before dividing; the rest divide first.
+  const bool out_of_bounds =
+      (((u.access_time - minval->min_delay)*100/minval->min_delay) >
+           g_ip->delay_dev) ||
+      (((u.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
+           g_ip->dynamic_power_dev) ||
+      (((u.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
+           g_ip->leakage_power_dev) ||
+      (((u.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
+           g_ip->cycle_time_dev) ||
+      (((u.area - minval->min_area)/minval->min_area)*100 >
+           g_ip->area_dev);
+
+  return !out_of_bounds;
+}
+
+
+
+
+// Picks the lowest-cost organization in ulist and copies it into *res.
+// Cost depends on g_ip->ed:
+//   1     : (delay / min delay) * (read dynamic power / min dynamic power)
+//   2     : as 1, with the delay ratio squared
+//   other : g_ip->*_wt weighted sum of the delay, cycle-time, dynamic, leakage
+//           and area ratios; entries rejected by check_uca_org() are erased,
+//           and so is every entry that becomes the new best once it has been
+//           copied into *res (long-standing behavior, kept as is).
+// Prints a diagnostic and terminates the process with a failure status when
+// ulist is empty or no entry yields a cost below BIGNUM.
+void find_optimal_uca(uca_org_t *res, min_values_t * minval, list<uca_org_t> & ulist)
+{
+  double min_cost = BIGNUM;
+  const float d  = g_ip->delay_wt;
+  const float c  = g_ip->cycle_time_wt;
+  const float dp = g_ip->dynamic_power_wt;
+  const float lp = g_ip->leakage_power_wt;
+  const float a  = g_ip->area_wt;
+
+  if (ulist.empty())
+  {
+    cout << "find_optial_uca1" << endl;
+    cout << "ERROR: no valid cache organizations found" << endl;
+    exit(1);  // failure status: this is an error path
+  }
+
+  // list::erase() returns the successor, so the iterator is advanced by hand
+  // only when the current entry is kept. Erasing and then stepping back and
+  // forward would skip the entry that follows an erased head, and would
+  // increment end() (undefined behavior) once the list has been emptied.
+  list<uca_org_t>::iterator niter = ulist.begin();
+  while (niter != ulist.end())
+  {
+    const bool weighted = (g_ip->ed != 1 && g_ip->ed != 2);
+    bool   costed = true;  // false: entry failed the input deviation constraints
+    double cost   = 0;
+
+    if (g_ip->ed == 1)
+    {
+      cost = (niter->access_time/minval->min_delay) *
+             (niter->power.readOp.dynamic/minval->min_dyn);
+    }
+    else if (g_ip->ed == 2)
+    {
+      cost = (niter->access_time/minval->min_delay) *
+             (niter->access_time/minval->min_delay) *
+             (niter->power.readOp.dynamic/minval->min_dyn);
+    }
+    else if (check_uca_org(*niter, minval))
+    {
+      cost = (d  * (niter->access_time/minval->min_delay) +
+              c  * (niter->cycle_time/minval->min_cyc) +
+              dp * (niter->power.readOp.dynamic/minval->min_dyn) +
+              lp * (niter->power.readOp.leakage/minval->min_leakage) +
+              a  * (niter->area/minval->min_area));
+    }
+    else
+    {
+      costed = false;
+    }
+
+    const bool improved = costed && (min_cost > cost);
+    if (improved)
+    {
+      min_cost = cost;
+      *res = *niter;
+    }
+
+    // Weighted mode prunes rejected entries and each newly found best.
+    if (weighted && (!costed || improved))
+      niter = ulist.erase(niter);
+    else
+      ++niter;
+  }
+
+  if (min_cost == BIGNUM)
+  {
+    cout << "find_optimal_uca2" << endl;
+    cout << "ERROR: no cache organizations met optimization criteria" << endl;
+    exit(1);  // failure status: this is an error path
+  }
+}
+
+
+
+// Collapses `list` to the single cheapest tag array organization.
+// Entries accepted by check_mem_org() are scored with the g_ip->*_wt weighted
+// sum of delay, dynamic power, leakage power, area and cycle time, each
+// divided by the matching minimum in `min`; rejected entries score BIGNUM and
+// never win. Every loser is deleted; on return `list` holds only the winner.
+// Exits with a failure status if `list` is empty or nothing passes the check.
+void filter_tag_arr(const min_values_t * min, list<mem_array *> & list)
+{
+  const double wt_delay   = g_ip->delay_wt;
+  const double wt_dyn     = g_ip->dynamic_power_wt;
+  const double wt_leakage = g_ip->leakage_power_wt;
+  const double wt_cyc     = g_ip->cycle_time_wt;
+  const double wt_area    = g_ip->area_wt;
+
+  if (list.empty())
+  {
+    cout << "filter_tag_arr1" << endl;
+    cout << "ERROR: no valid tag organizations found" << endl;
+    exit(1);
+  }
+
+  double      best_cost = BIGNUM;
+  mem_array * best      = NULL;
+
+  while (!list.empty())
+  {
+    mem_array * cand = list.back();
+    list.pop_back();
+    double cand_cost = BIGNUM;  // stays BIGNUM when the deviation check fails
+    if (check_mem_org(*cand, min))
+    {
+      cand_cost = wt_delay   * (cand->access_time/min->min_delay) +
+        wt_dyn     * (cand->power.readOp.dynamic/min->min_dyn) +
+        wt_leakage * (cand->power.readOp.leakage/min->min_leakage) +
+        wt_area    * (cand->area/min->min_area) +
+        wt_cyc     * (cand->cycle_time/min->min_cyc);
+    }
+    if (cand_cost < best_cost)
+    {
+      delete best;  // previous best is superseded; deleting NULL is a no-op
+      best_cost = cand_cost;
+      best      = cand;
+    }
+    else
+      delete cand;
+  }
+  if (!best)
+  {
+    cout << "filter_tag_arr2" << endl;
+    cout << "ERROR: no valid tag organizations found" << endl;
+    exit(1);  // failure status, matching the empty-list error path above
+  }
+  list.push_back(best);
+}
+
+
+
+// Prunes data array organizations that are clearly dominated: an entry is
+// deleted and removed when BOTH its access time and its read dynamic power
+// exceed the minima recorded in its own arr_min by more than 50%.
+void filter_data_arr(list<mem_array *> & curr_list)
+{
+  if (curr_list.empty())
+  {
+    cout << "filter_data_arr1" << endl;
+    cout << "ERROR: no valid data array organizations found" << endl;
+    exit(1);
+  }
+
+  // list::erase() returns the successor, so advance by hand only when the
+  // entry is kept. Erasing and then decrementing would step before begin()
+  // (undefined behavior) whenever the first entry is the one being removed.
+  for (list<mem_array *>::iterator iter = curr_list.begin(); iter != curr_list.end(); )
+  {
+    mem_array * m = *iter;
+    if (m == NULL) exit(1);
+
+    const bool too_slow   = (m->access_time - m->arr_min->min_delay)/m->arr_min->min_delay > 0.5;
+    const bool too_hungry = (m->power.readOp.dynamic - m->arr_min->min_dyn)/m->arr_min->min_dyn > 0.5;
+    if (too_slow && too_hungry) { delete m; iter = curr_list.erase(iter); }
+    else                        { ++iter; }
+  }
+}
+
+
+
+/*
+ * Performs an exhaustive search across different sub-array sizes,
+ * wire types and aspect ratios to find an optimal UCA organization.
+ * 1. For a cache, valid tag array organizations are calculated first
+ *    and collected in tag_arr.
+ * 2. The search is repeated for the data array and the valid
+ *    organizations are collected in data_arr.
+ * 3. Both lists are filtered; delay, energy, area and cycle time are
+ *    then computed for each remaining data array (paired with each
+ *    tag array for a cache) and the result is appended to sol_list.
+ * 4. The least-cost cache model is picked from sol_list.
+ */
+void solve(uca_org_t *fin_res)
+{
+  int    pure_ram = g_ip->pure_ram;
+  bool   pure_cam = g_ip->pure_cam;
+
+  init_tech_params(g_ip->F_sz_um, false);
+
+  // Candidate arrays gathered from the worker threads; sol_list starts with one blank entry.
+  list<mem_array *> tag_arr (0);
+  list<mem_array *> data_arr(0);
+  list<mem_array *>::iterator miter;
+  list<uca_org_t> sol_list(1, uca_org_t());
+
+  fin_res->tag_array.access_time = 0;
+  fin_res->tag_array.Ndwl = 0;
+  fin_res->tag_array.Ndbl = 0;
+  fin_res->tag_array.Nspd = 0;
+  fin_res->tag_array.deg_bl_muxing = 0;
+  fin_res->tag_array.Ndsam_lev_1 = 0;
+  fin_res->tag_array.Ndsam_lev_2 = 0;
+
+
+  // distribute calculate_time() execution to multiple threads
+  calc_time_mt_wrapper_struct * calc_array = new calc_time_mt_wrapper_struct[nthreads];
+  pthread_t threads[nthreads];  // NOTE(review): a variable-length array unless nthreads is a compile-time constant — confirm
+
+  for (uint32_t t = 0; t < nthreads; t++)
+  {
+    calc_array[t].tid         = t;
+    calc_array[t].pure_ram    = pure_ram;
+    calc_array[t].pure_cam    = pure_cam;
+    calc_array[t].data_res    = new min_values_t();
+    calc_array[t].tag_res     = new min_values_t();
+  }
+
+  bool     is_tag;
+
+  // If it's a cache, first calculate the area, delay and power for all tag array partitions.
+  if (!(pure_ram||pure_cam||g_ip->fully_assoc))
+  { //cache
+    is_tag              = true;
+    init_tech_params(g_ip->F_sz_um, is_tag);
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].is_tag      = is_tag;
+      calc_array[t].is_main_mem = false;
+      calc_array[t].Nspd_min    = 0.125;
+      pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t])));
+    }
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      pthread_join(threads[t], NULL);
+    }
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].data_arr.sort(mem_array::lt);
+      data_arr.merge(calc_array[t].data_arr, mem_array::lt);
+      calc_array[t].tag_arr.sort(mem_array::lt);
+      tag_arr.merge(calc_array[t].tag_arr, mem_array::lt);
+    }
+  }
+
+
+  // calculate the area, delay and power for all data array partitions (for cache or plain RAM).
+//  if (!g_ip->fully_assoc)
+// {//in the new cacti, cam, fully_associative cache are processed as single array in the data portion
+    is_tag              = false;
+    init_tech_params(g_ip->F_sz_um, is_tag);
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].is_tag      = is_tag;
+      calc_array[t].is_main_mem = g_ip->is_main_mem;
+      if (!(pure_cam||g_ip->fully_assoc))
+      {
+    	  calc_array[t].Nspd_min    = (double)(g_ip->out_w)/(double)(g_ip->block_sz*8);
+      }
+      else
+      {
+    	  calc_array[t].Nspd_min    = 1;
+      }
+
+      pthread_create(&threads[t], NULL, calc_time_mt_wrapper, (void *)(&(calc_array[t])));
+    }
+
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      pthread_join(threads[t], NULL);
+    }
+
+    data_arr.clear();  // discards the pointers only (no delete); NOTE(review): presumably empty after the tag pass — confirm
+    for (uint32_t t = 0; t < nthreads; t++)
+    {
+      calc_array[t].data_arr.sort(mem_array::lt);
+      data_arr.merge(calc_array[t].data_arr, mem_array::lt);
+    }
+//  }
+
+  // Combine the per-thread minima into overall data (d_min) and tag (t_min) minima.
+  min_values_t * d_min = new min_values_t();
+  min_values_t * t_min = new min_values_t();
+  min_values_t * cache_min = new min_values_t();
+
+  for (uint32_t t = 0; t < nthreads; t++)
+  {
+    d_min->update_min_values(calc_array[t].data_res);
+    t_min->update_min_values(calc_array[t].tag_res);
+  }
+
+  for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
+  {
+    (*miter)->arr_min = d_min;
+  }
+
+  // Filter the candidate lists; the tag list is only filtered for a cache.
+  //cout << data_arr.size() << "\t" << tag_arr.size() <<" before\n";
+  filter_data_arr(data_arr);
+  if(!(pure_ram||pure_cam||g_ip->fully_assoc))
+  {
+    filter_tag_arr(t_min, tag_arr);
+  }
+  //cout << data_arr.size() << "\t" << tag_arr.size() <<" after\n";
+
+  // Evaluate every candidate; sol_list.back() is always the entry currently being filled.
+  if (pure_ram||pure_cam||g_ip->fully_assoc)
+  {
+    for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
+    {
+      uca_org_t & curr_org  = sol_list.back();
+      curr_org.tag_array2  = NULL;
+      curr_org.data_array2 = (*miter);
+
+      curr_org.find_delay();
+      curr_org.find_energy();
+      curr_org.find_area();
+      curr_org.find_cyc();
+
+      //update min values for the entire cache
+      cache_min->update_min_values(curr_org);
+
+      sol_list.push_back(uca_org_t());
+    }
+  }
+  else
+  {
+    while (tag_arr.empty() != true)
+    {
+      mem_array * arr_temp = (tag_arr.back());
+      //delete tag_arr.back();
+      tag_arr.pop_back();  // NOTE(review): the popped tag array is not deleted anywhere in this function
+
+      for (miter = data_arr.begin(); miter != data_arr.end(); miter++)
+      {
+        uca_org_t & curr_org  = sol_list.back();
+        curr_org.tag_array2  = arr_temp;
+        curr_org.data_array2 = (*miter);
+
+        curr_org.find_delay();
+        curr_org.find_energy();
+        curr_org.find_area();
+        curr_org.find_cyc();
+
+        //update min values for the entire cache
+        cache_min->update_min_values(curr_org);
+
+        sol_list.push_back(uca_org_t());
+      }
+    }
+  }
+
+  sol_list.pop_back();  // drop the blank entry pushed after the last evaluated candidate
+
+  find_optimal_uca(fin_res, cache_min, sol_list);
+
+  sol_list.clear();
+  // Free every data array except the one fin_res->data_array2 points to.
+  for (miter = data_arr.begin(); miter != data_arr.end(); ++miter)
+  {
+    if (*miter != fin_res->data_array2)
+    {
+      delete *miter;
+    }
+  }
+  data_arr.clear();
+
+  for (uint32_t t = 0; t < nthreads; t++)
+  {
+    delete calc_array[t].data_res;
+    delete calc_array[t].tag_res;
+  }
+
+  delete [] calc_array;
+  delete cache_min;
+  delete d_min;  // NOTE(review): arr_min of every data array was set to d_min above, including any array kept by fin_res — confirm it is not read later
+  delete t_min;
+}
+
+// Rebuilds a UCA for the tag array (when fin_res has one) and for the data array
+// from their stored organization parameters, copies the resulting power into
+// fin_res and then recomputes the energy of fin_res. Prints an error and exits
+// if either parameter set is not valid.
+void update(uca_org_t *fin_res)
+{
+  if(fin_res->tag_array2)
+  {
+    init_tech_params(g_ip->F_sz_um,true);
+    DynamicParameter tag_arr_dyn_p(true, g_ip->pure_ram, g_ip->pure_cam, fin_res->tag_array2->Nspd, fin_res->tag_array2->Ndwl, fin_res->tag_array2->Ndbl, fin_res->tag_array2->Ndcm, fin_res->tag_array2->Ndsam_lev_1, fin_res->tag_array2->Ndsam_lev_2, g_ip->is_main_mem);
+    if(!tag_arr_dyn_p.is_valid)
+    {
+      cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
+      exit(1);
+    }
+    UCA * tag_arr = new UCA(tag_arr_dyn_p);
+    fin_res->tag_array2->power = tag_arr->power;
+    delete tag_arr;  // only the copied power is kept; the model used to be leaked
+  }
+  init_tech_params(g_ip->F_sz_um,false);
+  DynamicParameter data_arr_dyn_p(false, g_ip->pure_ram, g_ip->pure_cam, fin_res->data_array2->Nspd, fin_res->data_array2->Ndwl, fin_res->data_array2->Ndbl, fin_res->data_array2->Ndcm, fin_res->data_array2->Ndsam_lev_1, fin_res->data_array2->Ndsam_lev_2, g_ip->is_main_mem);
+  if(!data_arr_dyn_p.is_valid)
+  {
+    cout << "ERROR: Cannot retrieve array structure for leakage feedback" << endl;
+    exit(1);
+  }
+  UCA * data_arr = new UCA(data_arr_dyn_p);
+  fin_res->data_array2->power = data_arr->power;
+  delete data_arr;  // released for the same reason as the tag model
+
+  fin_res->find_energy();
+}
+
diff --git a/src/gpuwattch/cacti/Ucache.h b/src/gpuwattch/cacti/Ucache.h
new file mode 100644
index 000000000..cbd578fbf
--- /dev/null
+++ b/src/gpuwattch/cacti/Ucache.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __UCACHE_H__
+#define __UCACHE_H__
+
+#include <list>
+#include "area.h"
+#include "router.h"
+#include "nuca.h"
+
+
+class min_values_t  // holds minimum delay, dynamic, leakage, area and cycle values
+{
+  public:
+    double min_delay;
+    double min_dyn;
+    double min_leakage;
+    double min_area;
+    double min_cyc;
+    // Every minimum starts out at BIGNUM.
+    min_values_t() : min_delay(BIGNUM), min_dyn(BIGNUM), min_leakage(BIGNUM), min_area(BIGNUM), min_cyc(BIGNUM) { }
+    // Overloads per source type; NOTE(review): presumably each lowers the stored minima — bodies are not in this header.
+    void update_min_values(const min_values_t * val);
+    void update_min_values(const uca_org_t & res);
+    void update_min_values(const nuca_org_t * res);
+    void update_min_values(const mem_array * res);
+};
+
+
+
+struct solution  // NOTE(review): looks like a record of one tag/data array pairing and its metrics — confirm against users
+{
+  int    tag_array_index;
+  int    data_array_index;
+  list<mem_array *>::iterator tag_array_iter;   // iterator into a list of mem_array pointers
+  list<mem_array *>::iterator data_array_iter;  // iterator into a list of mem_array pointers
+  double access_time;
+  double cycle_time;
+  double area;
+  double efficiency;
+  powerDef total_power;
+};
+
+
+
+bool calculate_time(  // defined elsewhere; solve() describes it as the work distributed to the threads
+    bool is_tag,
+    int pure_ram,  // NOTE(review): int here, but bool in calc_time_mt_wrapper_struct
+    bool pure_cam,
+    double Nspd,
+    unsigned int Ndwl,
+    unsigned int Ndbl,
+    unsigned int Ndcm,
+    unsigned int Ndsam_lev_1,
+    unsigned int Ndsam_lev_2,
+    mem_array *ptr_array,
+    int flag_results_populate,
+    results_mem_array *ptr_results,
+    uca_org_t *ptr_fin_res,
+    bool is_main_mem);
+void update(uca_org_t *fin_res);  // recomputes the array power, then the energy, of an existing result
+// Searches the array organizations and stores the chosen one in fin_res.
+void solve(uca_org_t *fin_res);
+void init_tech_params(double tech, bool is_tag);  // callers pass g_ip->F_sz_um and a tag/data flag
+
+
+struct calc_time_mt_wrapper_struct  // per-thread argument block handed to calc_time_mt_wrapper()
+{
+  uint32_t tid;          // thread index, assigned by solve()
+  bool     is_tag;       // true for the tag pass, false for the data pass
+  bool     pure_ram;     // NOTE(review): solve() assigns an int to this bool — confirm no value other than 0/1 matters
+  bool     pure_cam;
+  bool     is_main_mem;
+  double   Nspd_min;     // set by solve() before each pass
+  // Minima objects; solve() allocates them before the passes and deletes them at the end.
+  min_values_t * data_res;
+  min_values_t * tag_res;
+  // Lists that solve() sorts and merges after the threads are joined.
+  list<mem_array *> data_arr;
+  list<mem_array *> tag_arr;
+};
+// Thread entry point passed to pthread_create(); receives a calc_time_mt_wrapper_struct* as void*.
+void *calc_time_mt_wrapper(void * void_obj);
+
+#endif
diff --git a/src/gpuwattch/cacti/arbiter.cc b/src/gpuwattch/cacti/arbiter.cc
new file mode 100644
index 000000000..f4f520495
--- /dev/null
+++ b/src/gpuwattch/cacti/arbiter.cc
@@ -0,0 +1,130 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "arbiter.h"
+
+// Records the request count, flit size, output length and device type, then
+// sets Vdd and the PMOS minimum width from the device type and g_tp, and the
+// transistor size constants from g_ip->F_sz_um.
+MCPAT_Arbiter::MCPAT_Arbiter(double n_req, double flit_size_, double output_len,
+                             TechnologyParameter::DeviceType *dt)
+  : R(n_req),
+    flit_size(flit_size_),
+    o_len(output_len),
+    deviceType(dt)
+{
+  // NOTE(review): TriS1 and TriS2 (declared in arbiter.h) are not assigned here.
+  const double feature = g_ip->F_sz_um;
+  Vdd        = dt->Vdd;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
+  NTn1 = 13.5*feature/2;  PTn1 = 76*feature/2;
+  NTn2 = NTn1;            PTn2 = PTn1;  // same expressions as NTn1/PTn1
+  NTi  = 12.5*feature/2;  PTi  = 25*feature/2;
+  NTtr = 10*feature/2; /*Transmission gate's nmos tr. length*/
+  PTtr = 20*feature/2; /* pmos tr. length*/
+}
+
+MCPAT_Arbiter::~MCPAT_Arbiter(){}  // empty body: nothing (including deviceType) is released here
+
+double MCPAT_Arbiter::arb_req()
+{
+  // gate_C / drain_C_ terms summed left to right; the NTn1/PTn1 gate term is scaled by (R-1).
+  return (R-1)*(2*gate_C(NTn1, 0)+gate_C(PTn1, 0)) + 2*gate_C(NTn2, 0) +
+      gate_C(PTn2, 0) + gate_C(NTi, 0) + gate_C(PTi, 0) +
+      drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def);
+}
+
+double MCPAT_Arbiter::arb_pri()
+{
+  // The switching capacitance of the flip-flop is ignored.
+  const double nor1_gate = 2*gate_C(NTn1, 0)+gate_C(PTn1, 0);
+  return 2*nor1_gate;
+}
+
+
+double MCPAT_Arbiter::arb_grant()
+{
+  // drain_C_ terms for NTn1 (doubled) and PTn1, followed by the crossbar control line term.
+  return drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2 + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def) + crossbar_ctrline();
+}
+
+double MCPAT_Arbiter::arb_int()
+{
+  // NTn1/PTn1 drain terms followed by the NTn2/PTn2 gate terms, summed left to right.
+  return drain_C_(NTn1, 0, 1, 1, g_tp.cell_h_def)*2 + drain_C_(PTn1, 1, 1, 1, g_tp.cell_h_def) +
+      2*gate_C(NTn2, 0) + gate_C(PTn2, 0);
+}
+
+void MCPAT_Arbiter::compute_power()
+{  // fills power.readOp: dynamic from the arb_* terms, leakage and gate leakage from two NOR stages and an inverter
+  power.readOp.dynamic =  (R*arb_req()*Vdd*Vdd/2 + R*arb_pri()*Vdd*Vdd/2 +
+      arb_grant()*Vdd*Vdd + arb_int()*0.5*Vdd*Vdd);
+  const double sub_nor1 = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
+  const double sub_nor2 = cmos_Isub_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
+  const double sub_inv  = cmos_Isub_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
+  const double ig_nor1  = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn1*2, min_w_pmos * PTn1*2, 2, nor);
+  const double ig_nor2  = cmos_Ig_leakage(g_tp.min_w_nmos_*NTn2*R, min_w_pmos * PTn2*R, 2, nor);
+  const double ig_inv   = cmos_Ig_leakage(g_tp.min_w_nmos_*NTi, min_w_pmos * PTi, 1, inv);
+  power.readOp.leakage = (sub_nor1 + sub_nor2 + sub_inv)*Vdd; //FIXME include priority table leakage
+  power.readOp.gate_leakage = ig_nor1*Vdd + ig_nor2*Vdd + ig_inv*Vdd;
+}
+
+// Wire cap with triple spacing for a wire of the given length.
+double MCPAT_Arbiter::Cw3(double length)
+{
+  Wire spaced_wire(g_ip->wt, length, 1, 3, 3);
+  return spaced_wire.wire_cap(length,true);
+}
+
+double MCPAT_Arbiter::crossbar_ctrline()
+{
+  // Wire term for o_len scaled by 1e-6, then the NTi/PTi drain and gate terms.
+  return Cw3(o_len * 1e-6 /* m */) +
+      drain_C_(NTi, 0, 1, 1, g_tp.cell_h_def) + drain_C_(PTi, 1, 1, 1, g_tp.cell_h_def) +
+      gate_C(NTi, 0) + gate_C(PTi, 0);
+}
+
+double MCPAT_Arbiter::transmission_buf_ctrcap()
+{
+  // gate_C of the transmission-gate nmos (NTtr) and pmos (PTtr) sizes.
+  return gate_C(NTtr, 0)+gate_C(PTtr, 0);
+}
+
+
+void MCPAT_Arbiter::print_arbiter()  // writes R, flit_size, dynamic*1e9 and leakage*1e3 to cout
+{
+  cout << "\nMCPAT_Arbiter Stats ("   << R << " input arbiter" << ")\n\n"
+       << "Flit size        : " << flit_size << " bits" << endl
+       << "Dynamic Power    : " << power.readOp.dynamic*1e9 << " (nJ)" << endl
+       << "Leakage Power    : " << power.readOp.leakage*1e3 << " (mW)" << endl;
+}
+
+
diff --git a/src/gpuwattch/cacti/arbiter.h b/src/gpuwattch/cacti/arbiter.h
new file mode 100644
index 000000000..0884a043b
--- /dev/null
+++ b/src/gpuwattch/cacti/arbiter.h
@@ -0,0 +1,77 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef __ARBITER__
+#define __ARBITER__
+
+#include <assert.h>
+#include <iostream>
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "parameter.h"
+#include "mat.h"
+#include "wire.h"
+
+class MCPAT_Arbiter : public Component
+{
+  public:
+    MCPAT_Arbiter(
+      double Req,         // stored in R
+      double flit_sz,     // stored in flit_size
+      double output_len,  // stored in o_len
+      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~MCPAT_Arbiter();
+    // The arb_*, Cw3, crossbar_ctrline and transmission_buf_ctrcap helpers return doubles; compute_power() fills power.readOp.
+    void print_arbiter();
+    double arb_req();
+    double arb_pri();
+    double arb_grant();
+    double arb_int();
+    void compute_power();
+    double Cw3(double len);
+    double crossbar_ctrline();
+    double transmission_buf_ctrcap();
+
+
+    // Size constants and parameters below are assigned by the constructor in arbiter.cc unless noted.
+  private:
+    double NTn1, PTn1, NTn2, PTn2, R, PTi, NTi;
+    double flit_size;
+    double NTtr, PTtr;
+    double o_len;
+    TechnologyParameter::DeviceType *deviceType;
+    double TriS1, TriS2;  // NOTE(review): not assigned by the constructor in arbiter.cc
+    double min_w_pmos, Vdd;
+
+};
+
+#endif
diff --git a/src/gpuwattch/cacti/area.cc b/src/gpuwattch/cacti/area.cc
new file mode 100644
index 000000000..0d8d4b722
--- /dev/null
+++ b/src/gpuwattch/cacti/area.cc
@@ -0,0 +1,46 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include "area.h"
+#include "component.h"
+#include "decoder.h"
+#include "parameter.h"
+#include "basic_circuit.h"
+#include <iostream>
+#include <math.h>
+#include <assert.h>
+
+using namespace std;
+
+
+
diff --git a/src/gpuwattch/cacti/area.h b/src/gpuwattch/cacti/area.h
new file mode 100644
index 000000000..92272f068
--- /dev/null
+++ b/src/gpuwattch/cacti/area.h
@@ -0,0 +1,71 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __AREA_H__
+#define __AREA_H__
+
+#include "cacti_interface.h"
+#include "basic_circuit.h"
+
+using namespace std;
+
+// Width/height pair with an optional explicitly stored area.
+class Area
+{
+ public:
+  double w;
+  double h;
+
+  Area() : w(0), h(0), area(0) { }
+
+  // Plain accessors for the two dimensions.
+  double get_w() const { return w; }
+  double get_h() const { return h; }
+  void set_w(double w_) { w = w_; }
+  void set_h(double h_) { h = h_; }
+
+  // The stored area is reported only while both dimensions are zero;
+  // otherwise the area is the product w*h.
+  double get_area() const
+  {
+    const bool no_dims = (w == 0 && h == 0);
+    return no_dims ? area : w*h;
+  }
+  void set_area(double a_) { area = a_; }
+
+ private:
+  double area;
+};
+
+#endif
+
diff --git a/src/gpuwattch/cacti/bank.cc b/src/gpuwattch/cacti/bank.cc
new file mode 100755
index 000000000..2e5808006
--- /dev/null
+++ b/src/gpuwattch/cacti/bank.cc
@@ -0,0 +1,198 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include "bank.h"
+#include <iostream>
+
+
+// Builds the H-trees that connect the mats of a bank. Address, data-in and
+// data-out trees are always created; search-in and search-out trees are added
+// for a fully associative array or a pure CAM. The bank's width and height are
+// copied from the data-in H-tree.
+Bank::Bank(const DynamicParameter & dyn_p):
+  dp(dyn_p), mat(dp),
+  num_addr_b_mat(dyn_p.number_addr_bits_mat),
+  num_mats_hor_dir(dyn_p.num_mats_h_dir), num_mats_ver_dir(dyn_p.num_mats_v_dir)
+{
+  // Port counts: the global input parameters are the default and are replaced
+  // by the values carried in dp when dp.use_inp_params is set.
+  int RWP  = g_ip->num_rw_ports;
+  int ERP  = g_ip->num_rd_ports;
+  int EWP  = g_ip->num_wr_ports;
+  int SCHP = g_ip->num_search_ports;
+  if (dp.use_inp_params)
+  {
+    RWP  = dp.num_rw_ports;
+    ERP  = dp.num_rd_ports;
+    EWP  = dp.num_wr_ports;
+    SCHP = dp.num_search_ports;
+  }
+
+  const bool searchable = dp.fully_assoc || dp.pure_cam;
+
+  const int total_addrbits = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
+  const int datainbits     = dp.num_di_b_bank_per_port * (RWP + EWP);
+  int       dataoutbits    = dp.num_do_b_bank_per_port * (RWP + ERP);
+
+  // Search widths are only computed for a searchable array; they stay 0
+  // otherwise, which is the value the plain trees are given.
+  int searchinbits  = 0;
+  int searchoutbits = 0;
+  if (searchable)
+  {
+    searchinbits  = dp.num_si_b_bank_per_port * SCHP;
+    searchoutbits = dp.num_so_b_bank_per_port * SCHP;
+  }
+  else if (g_ip->fast_access && dp.is_tag == false)
+  {
+    // The data-out width is multiplied by g_ip->data_assoc in this case.
+    dataoutbits *= g_ip->data_assoc;
+  }
+
+  const double mat_w = (double) mat.area.w;
+  const double mat_h = (double) mat.area.h;
+
+  // The three plain trees take identical arguments apart from the tree type.
+  htree_in_add   = new Htree2 (g_ip->wt, mat_w, mat_h,
+      total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
+      num_mats_ver_dir*2, num_mats_hor_dir*2, Add_htree);
+  htree_in_data  = new Htree2 (g_ip->wt, mat_w, mat_h,
+      total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
+      num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree);
+  htree_out_data = new Htree2 (g_ip->wt, mat_w, mat_h,
+      total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
+      num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree);
+
+  if (searchable)
+  {
+    // The search trees reuse the data tree types and pass extra trailing flags.
+    htree_in_search  = new Htree2 (g_ip->wt, mat_w, mat_h,
+        total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
+        num_mats_ver_dir*2, num_mats_hor_dir*2, Data_in_htree,true, true);
+    htree_out_search = new Htree2 (g_ip->wt, mat_w, mat_h,
+        total_addrbits, datainbits, searchinbits, dataoutbits, searchoutbits,
+        num_mats_ver_dir*2, num_mats_hor_dir*2, Data_out_htree,true);
+  }
+  // NOTE(review): htree_in_search / htree_out_search are not assigned in this
+  // constructor for a non-searchable array; confirm nothing reads them then.
+
+  area.w = htree_in_data->area.w;
+  area.h = htree_in_data->area.h;
+
+  // Address bits routed to a mat are split between row decoding (activation)
+  // and the remaining bits used for read or write.
+  num_addr_b_row_dec = _log2(mat.subarray.num_rows);
+  num_addr_b_routed_to_mat_for_act = num_addr_b_row_dec;
+  num_addr_b_routed_to_mat_for_rd_or_wr = num_addr_b_mat - num_addr_b_row_dec;
+}
+
+
+
+Bank::~Bank()
+{
+  // The address and data trees are created for every array type.
+  delete htree_in_add;
+  delete htree_out_data;
+  delete htree_in_data;
+  // The search trees are only created for a fully associative array or a pure CAM.
+  if (!(dp.fully_assoc || dp.pure_cam)) return;
+  delete htree_in_search;
+  delete htree_out_search;
+}
+
+
+
+double Bank::compute_delays(double inrisetime)
+{
+  return mat.compute_delays(inrisetime);  // forwards to the mat; the bank adds nothing of its own here
+}
+
+
+
+// Adds the contributions of the mat and of the H-trees to this bank's power.
+// Read dynamic, leakage and gate-leakage terms are accumulated for every array
+// type; search terms only for a fully associative array or a pure CAM.
+void Bank::compute_power_energy()
+{
+  mat.compute_power_energy();
+
+  const bool searchable = dp.fully_assoc || dp.pure_cam;
+
+  // Mat contribution to the read dynamic term.
+  if (searchable)
+  {
+    power.readOp.dynamic += mat.power.readOp.dynamic ;//for fa and cam num_act_mats_hor_dir is 1 for plain r/w
+  }
+  else
+  {
+    power.readOp.dynamic += mat.power.readOp.dynamic * dp.num_act_mats_hor_dir;
+  }
+  power.readOp.leakage      += mat.power.readOp.leakage * dp.num_mats;
+  power.readOp.gate_leakage += mat.power.readOp.gate_leakage * dp.num_mats;
+
+  // Only the address and data-out trees are added to the read dynamic term.
+  power.readOp.dynamic += htree_in_add->power.readOp.dynamic;
+  power.readOp.dynamic += htree_out_data->power.readOp.dynamic;
+
+  // All three plain trees contribute leakage and gate leakage.
+  power.readOp.leakage += htree_in_add->power.readOp.leakage;
+  power.readOp.leakage += htree_in_data->power.readOp.leakage;
+  power.readOp.leakage += htree_out_data->power.readOp.leakage;
+
+  power.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage;
+  power.readOp.gate_leakage += htree_in_data->power.readOp.gate_leakage;
+  power.readOp.gate_leakage += htree_out_data->power.readOp.gate_leakage;
+
+  if (!searchable)
+  {
+    return;
+  }
+
+  // Search dynamic term: mats, per-mat search circuitry, then the search trees.
+  power.searchOp.dynamic += mat.power.searchOp.dynamic * dp.num_mats;
+  power.searchOp.dynamic += mat.power_bl_precharge_eq_drv.searchOp.dynamic +
+                            mat.power_sa.searchOp.dynamic +
+                            mat.power_bitline.searchOp.dynamic +
+                            mat.power_subarray_out_drv.searchOp.dynamic+
+                            mat.ml_to_ram_wl_drv->power.readOp.dynamic;
+  power.searchOp.dynamic += htree_in_search->power.searchOp.dynamic;
+  power.searchOp.dynamic += htree_out_search->power.searchOp.dynamic;
+
+  // The search trees add to the read leakage terms as well.
+  power.readOp.leakage += htree_in_search->power.readOp.leakage;
+  power.readOp.leakage += htree_out_search->power.readOp.leakage;
+
+  power.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage;
+  power.readOp.gate_leakage += htree_out_search->power.readOp.gate_leakage;
+}
+
diff --git a/src/gpuwattch/cacti/bank.h b/src/gpuwattch/cacti/bank.h
new file mode 100755
index 000000000..47300da95
--- /dev/null
+++ b/src/gpuwattch/cacti/bank.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __BANK_H__
+#define __BANK_H__
+
+#include "component.h"
+#include "decoder.h"
+#include "mat.h"
+#include "htree2.h"
+// Bank: a Component that holds one Mat and five Htree2 pointers.
+// compute_power_energy() folds the mat's and the H-trees' power numbers into this component's `power`.
+class Bank : public Component
+{
+  public:
+    Bank(const DynamicParameter & dyn_p);
+    ~Bank();
+    double compute_delays(double inrisetime);  // return outrisetime
+    void   compute_power_energy();
+
+    const DynamicParameter & dp;  // reference member: the referenced object must outlive this Bank
+    Mat   mat;
+    Htree2 *htree_in_add;      // NOTE(review): raw pointers; presumably created in the constructor
+    Htree2 *htree_in_data;     //   and released in ~Bank() -- confirm in bank.cc
+    Htree2 *htree_out_data;
+    Htree2 *htree_in_search;   // the two *_search trees are read for searchOp power
+    Htree2 *htree_out_search;  //   in compute_power_energy()
+
+    int  num_addr_b_mat;
+    int  num_mats_hor_dir;
+    int  num_mats_ver_dir;
+
+    int  num_addr_b_row_dec;
+    int  num_addr_b_routed_to_mat_for_act;
+    int  num_addr_b_routed_to_mat_for_rd_or_wr;
+};
+
+
+
+#endif
diff --git a/src/gpuwattch/cacti/basic_circuit.cc b/src/gpuwattch/cacti/basic_circuit.cc
new file mode 100644
index 000000000..a8ea501b3
--- /dev/null
+++ b/src/gpuwattch/cacti/basic_circuit.cc
@@ -0,0 +1,789 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+
+#include "basic_circuit.h"
+#include "parameter.h"
+#include <iostream>
+#include <assert.h>
+#include <cmath>
+
+uint32_t _log2(uint64_t num)
+{
+  // Floor of the base-2 logarithm: counts how many times num can be
+  // halved before it reaches 1.  log2(0) is undefined, so a zero
+  // argument reports "log0?" on stderr and terminates the process.
+  if (num == 0)
+  {
+    std::cerr << "log0?" << std::endl;
+    exit(1);
+  }
+
+  uint32_t shifts = 0;
+  for (uint64_t rest = num; rest > 1; rest >>= 1)
+  {
+    ++shifts;
+  }
+  return shifts;
+}
+
+
+bool is_pow2(int64_t val)
+{
+  // Non-positive values are never powers of two.
+  if (val <= 0)
+  {
+    return false;
+  }
+
+  // A positive integer is a power of two exactly when it has a single
+  // bit set; clearing its lowest set bit (val & (val - 1)) then leaves
+  // zero.  This also covers val == 1 (2^0).
+  const int64_t lowest_bit_cleared = val & (val - 1);
+
+  return lowest_bit_cleared == 0;
+}
+
+
+int powers (int base, int n)
+{
+  // base raised to the n-th power by repeated multiplication;
+  // any n <= 0 yields 1 because the loop body never runs.
+  int result = 1;
+  for (int remaining = n; remaining > 0; --remaining)
+    result *= base;
+  return result;
+}
+
+/*----------------------------------------------------------------------*/
+// Base-2 logarithm of x via change of base; x must be strictly positive (asserted).
+double logtwo (double x)
+{
+  assert(x > 0);
+  return log (x) / log (2.0);
+}
+
+/*----------------------------------------------------------------------*/
+
+
+double gate_C(
+    double width,
+    double wirelength,
+    bool   _is_dram,
+    bool   _is_cell,
+    bool   _is_wl_tr)
+{
+  // Gate capacitance of a transistor of the given width.  The three flags
+  // pick which device model of the global g_tp supplies the capacitance
+  // terms (first match wins):
+  //   DRAM cell access transistor  -> g_tp.dram_acc
+  //   DRAM wordline transistor     -> g_tp.dram_wl
+  //   SRAM cell access transistor  -> g_tp.sram_cell
+  //   anything else                -> g_tp.peri_global
+  // wirelength is accepted for interface compatibility but not used.
+  const TechnologyParameter::DeviceType * dev = &g_tp.peri_global;
+  if (_is_dram)
+  {
+    if (_is_cell)       dev = &g_tp.dram_acc;
+    else if (_is_wl_tr) dev = &g_tp.dram_wl;
+  }
+  else if (_is_cell)
+  {
+    dev = &g_tp.sram_cell;
+  }
+
+  return (dev->C_g_ideal + dev->C_overlap + 3*dev->C_fringe)*width + dev->l_phy*Cpolywire;
+}
+
+
+// Gate capacitance (in Farads) of a pass transistor.
+// At present the computation is identical to gate_C().
+double gate_C_pass(
+    double width,       // gate width in um (length is Lphy_periph_global)
+    double wirelength,  // poly wire length going to gate in lambda (not used here)
+    bool   _is_dram,
+    bool   _is_cell,
+    bool   _is_wl_tr)
+{
+  // v5.0
+  //
+  // Device-model selection, first match wins:
+  //   DRAM  + cell      -> dram_acc    (DRAM cell access transistor)
+  //   DRAM  + wordline  -> dram_wl     (DRAM wordline transistor)
+  //   !DRAM + cell      -> sram_cell   (SRAM cell access transistor)
+  //   otherwise         -> peri_global
+  const bool is_dram_cell = _is_dram && _is_cell;
+  const bool is_dram_wl   = _is_dram && _is_wl_tr;
+  const bool is_sram_cell = !_is_dram && _is_cell;
+
+  const TechnologyParameter::DeviceType * model =
+      is_dram_cell ? &g_tp.dram_acc  :
+      is_dram_wl   ? &g_tp.dram_wl   :
+      is_sram_cell ? &g_tp.sram_cell :
+                     &g_tp.peri_global;
+
+  const double cap_per_width = model->C_g_ideal + model->C_overlap + 3*model->C_fringe;
+  const double poly_cap      = model->l_phy*Cpolywire;
+
+  return cap_per_width*width + poly_cap;
+}
+
+// Drain capacitance of a (possibly folded, possibly stacked) transistor: the sum of junction-area,
+// junction-sidewall, gate-to-drain (fringe + overlap) and, when folded, connecting-metal terms.
+double drain_C_(
+    double width,
+    int nchannel,   // non-zero: use the n-channel share of the cell height when folding by height
+    int stack,      // each device beyond the first adds one poly-to-poly spacing to the drain width
+    int next_arg_thresh_folding_width_or_height_cell,  // 0: fold_dimension is a width threshold; else a cell height
+    double fold_dimension,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  double w_folded_tr=0;
+  const  TechnologyParameter::DeviceType * dt;
+
+  if ((_is_dram) && (_is_cell))
+  {
+    dt = &g_tp.dram_acc;   // DRAM cell access transistor
+  }
+  else if ((_is_dram) && (_is_wl_tr))
+  {
+    dt = &g_tp.dram_wl;    // DRAM wordline transistor
+  }
+  else if ((!_is_dram) && _is_cell)
+  {
+    dt = &g_tp.sram_cell;  // SRAM cell access transistor
+  }
+  else
+  {
+    dt = &g_tp.peri_global;
+  }
+
+  double c_junc_area = dt->C_junc;
+  double c_junc_sidewall = dt->C_junc_sidewall;
+  double c_fringe    = 2*dt->C_fringe;   // fringe and overlap are both taken twice
+  double c_overlap   = 2*dt->C_overlap;
+  double drain_C_metal_connecting_folded_tr = 0;  // stays 0 unless the transistor is folded
+
+  // determine the width of the transistor after folding (if it is getting folded)
+  if (next_arg_thresh_folding_width_or_height_cell == 0)
+  { // interpret fold_dimension as the the folding width threshold
+    // i.e. the value of transistor width above which the transistor gets folded
+    w_folded_tr = fold_dimension;
+  }
+  else
+  { // interpret fold_dimension as the height of the cell that this transistor is part of.
+    double h_tr_region  = fold_dimension - 2 * g_tp.HPOWERRAIL;
+    // TODO : w_folded_tr must come from Component::compute_gate_area()
+    double ratio_p_to_n = 2.0 / (2.0 + 1.0);  // p devices get 2/3 of the usable height, n devices 1/3
+    if (nchannel)
+    {
+      w_folded_tr = (1 - ratio_p_to_n) * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
+    }
+    else
+    {
+      w_folded_tr = ratio_p_to_n * (h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS);
+    }
+  }
+  int num_folded_tr = (int) (ceil(width / w_folded_tr));  // NOTE(review): divides by w_folded_tr; confirm callers never make it 0
+
+  if (num_folded_tr < 2)
+  {
+    w_folded_tr = width;  // not folded: the single finger has the full width
+  }
+
+  double total_drain_w = (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +  // only for drain
+                         (stack - 1) * g_tp.spacing_poly_to_poly;
+  double drain_h_for_sidewall = w_folded_tr;
+  double total_drain_height_for_cap_wrt_gate = w_folded_tr + 2 * w_folded_tr * (stack - 1);
+  if (num_folded_tr > 1)
+  {
+    total_drain_w += (num_folded_tr - 2) * (g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact) +
+                     (num_folded_tr - 1) * ((stack - 1) * g_tp.spacing_poly_to_poly);
+
+    if (num_folded_tr%2 == 0)
+    {
+      drain_h_for_sidewall = 0;  // even number of fingers: the sidewall height term is dropped
+    }
+    total_drain_height_for_cap_wrt_gate *= num_folded_tr;
+    drain_C_metal_connecting_folded_tr   = g_tp.wire_local.C_per_um * total_drain_w;
+  }
+
+  double drain_C_area     = c_junc_area * total_drain_w * w_folded_tr;
+  double drain_C_sidewall = c_junc_sidewall * (drain_h_for_sidewall + 2 * total_drain_w);
+  double drain_C_wrt_gate = (c_fringe + c_overlap) * total_drain_height_for_cap_wrt_gate;
+
+  return (drain_C_area + drain_C_sidewall + drain_C_wrt_gate + drain_C_metal_connecting_folded_tr);
+}
+
+
+double tr_R_on(
+    double width,
+    int nchannel,
+    int stack,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // On-resistance of `stack` stacked transistors of the given width:
+  // stack * R_on / width, where R_on is the n-channel value when
+  // nchannel is non-zero and the p-channel value otherwise.
+  //
+  // The device model supplying R_on follows the flag priority DRAM cell
+  // access, DRAM wordline, SRAM cell access, else g_tp.peri_global.
+  const TechnologyParameter::DeviceType * device;
+  if (!_is_dram)
+  {
+    device = _is_cell ? &g_tp.sram_cell : &g_tp.peri_global;
+  }
+  else if (_is_cell)
+  {
+    device = &g_tp.dram_acc;
+  }
+  else
+  {
+    device = _is_wl_tr ? &g_tp.dram_wl : &g_tp.peri_global;
+  }
+
+  return (stack * (nchannel ? device->R_nch_on : device->R_pch_on) / width);
+}
+
+
+/* Inverse of tr_R_on() for a single (unstacked) device: given a target
+ * resistance, return the transistor width that has that on-resistance.
+ * Used in the data wordline to estimate the wordline driver size. */
+
+// returns width in um
+double R_to_w(
+    double res,
+    int   nchannel,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // Device-model selection (first match wins):
+  //   DRAM cell access, DRAM wordline, SRAM cell access, else peri_global.
+  const TechnologyParameter::DeviceType * chosen = &g_tp.peri_global;
+
+  if (_is_dram && _is_cell)
+  {
+    chosen = &g_tp.dram_acc;
+  }
+  else if (_is_dram && _is_wl_tr)
+  {
+    chosen = &g_tp.dram_wl;
+  }
+  else if (_is_cell)
+  {
+    chosen = &g_tp.sram_cell;   // reached only when _is_dram is false
+  }
+
+  // n-channel on-resistance when nchannel is non-zero, p-channel otherwise.
+  if (nchannel)
+    return (chosen->R_nch_on / res);
+  return (chosen->R_pch_on / res);
+}
+
+
+double pmos_to_nmos_sz_ratio(
+    bool _is_dram,
+    bool _is_wl_tr)
+{
+  // PMOS-to-NMOS sizing ratio, read from the n_to_p_eff_curr_drv_ratio
+  // field of a device model.  DRAM wordline transistors use their own
+  // model (g_tp.dram_wl); every other transistor, DRAM or SRAM, uses
+  // g_tp.peri_global.
+  const bool dram_wordline = _is_dram && _is_wl_tr;
+  if (dram_wordline)
+  {
+    return g_tp.dram_wl.n_to_p_eff_curr_drv_ratio;
+  }
+  return g_tp.peri_global.n_to_p_eff_curr_drv_ratio;
+}
+
+
+// "Timing Models for MOS Circuits" by Mark Horowitz, 1984
+double horowitz(
+    double inputramptime, // input rise time
+    double tf,            // time constant of gate
+    double vs1,           // threshold voltage
+    double vs2,           // threshold voltage
+    int    rise)          // whether input rises or fall
+{
+  // Step input with equal thresholds: closed form tf * |ln(vs1)|.
+  if (inputramptime == 0 && vs1 == vs2)
+  {
+    const double ln_vs1 = log(vs1);
+    return tf * (vs1 < 1 ? -ln_vs1 : ln_vs1);
+  }
+
+  // Input ramp time normalised by the gate time constant.
+  const double ramp_ratio = inputramptime / tf;
+  if (rise != RISE)
+  {
+    // not RISE: coefficient 0.4, logarithms taken of (1 - threshold)
+    const double ln_1m_vs1 = log(1.0 - vs1);
+    return tf * sqrt(ln_1m_vs1*ln_1m_vs1 + 2*ramp_ratio*0.4*(vs1)) + tf*(ln_1m_vs1 - log(1.0 - vs2));
+  }
+  // RISE: coefficient 0.5, logarithms taken of the thresholds themselves
+  const double ln_vs1 = log(vs1);
+  return tf * sqrt(ln_vs1*ln_vs1 + 2*ramp_ratio*0.5*(1.0 - vs1)) + tf*(ln_vs1 - log(vs2));
+}
+
+double cmos_Ileak(
+    double nWidth,
+    double pWidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // Leakage of an n/p transistor pair: each width is scaled by the
+  // matching off current (I_off_n, I_off_p) of the selected device
+  // model and the two contributions are added.
+  //
+  // Model selection: SRAM cell access transistor -> sram_cell,
+  // DRAM wordline transistor -> dram_wl, every other DRAM or SRAM
+  // transistor -> peri_global.
+  const TechnologyParameter::DeviceType * model = &(g_tp.peri_global);
+  if (_is_cell && !_is_dram)
+    model = &(g_tp.sram_cell);
+  else if (_is_wl_tr && _is_dram)
+    model = &(g_tp.dram_wl);
+
+  const double n_part = nWidth*model->I_off_n;
+  return n_part + pWidth*model->I_off_p;
+}
+
+
+double simplified_nmos_leakage(
+    double nwidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // NMOS leakage: nwidth times the I_off_n current of the device model
+  // chosen by the flags.
+  //   !_is_dram && _is_cell  : SRAM cell access transistor (g_tp.sram_cell)
+  //   _is_dram && _is_wl_tr  : DRAM wordline transistor    (g_tp.dram_wl)
+  //   anything else          : g_tp.peri_global
+  if (!_is_dram)
+  {
+    if (_is_cell)
+      return nwidth * g_tp.sram_cell.I_off_n;
+  }
+  else if (_is_wl_tr)
+  {
+    return nwidth * g_tp.dram_wl.I_off_n;
+  }
+  return nwidth * g_tp.peri_global.I_off_n;
+}
+// Product m*(m+1)*...*n (n! when m == 1); an empty range (m > n) yields 1, so 0! == 1.
+int factorial(int n, int m)
+{
+	int fa = 1;  // multiplicative identity: the former start value m leaked out when m > n
+	for (int i = m; i <= n; i++)
+		fa *= i;
+	return fa;
+}
+// Binomial coefficient C(n, m); 0 when m lies outside [0, n].  Built multiplicatively so that C(n, n) == 1.
+int combination(int n, int m)
+{
+  long long ret = (m < 0 || m > n) ? 0 : 1;
+  for (int i = 1; ret != 0 && i <= m; ++i) ret = ret * (n - m + i) / i;  // exact: ret == C(n-m+i, i)
+  return (int) ret;
+}
+
+double simplified_pmos_leakage(
+    double pwidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // PMOS counterpart of simplified_nmos_leakage(): pwidth times the
+  // I_off_p current of the selected device model.
+  const bool sram_cell_tr = !_is_dram && _is_cell;   // SRAM cell access transistor
+  const bool dram_wl_tr   = _is_dram && _is_wl_tr;   // DRAM wordline transistor
+
+  const TechnologyParameter::DeviceType * picked;
+  if (sram_cell_tr)
+    picked = &(g_tp.sram_cell);
+  else if (dram_wl_tr)
+    picked = &(g_tp.dram_wl);
+  else
+    picked = &(g_tp.peri_global);                    // DRAM or SRAM all other transistors
+
+  const double leak_per_width = picked->I_off_p;
+  return pwidth * leak_per_width;
+}
+
+double cmos_Ig_n(
+    double nWidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // NMOS gate current: nWidth times the I_g_on_n value of the device
+  // model selected below.
+  const TechnologyParameter::DeviceType * src;
+
+  if (_is_dram)
+  {
+    // DRAM: only the wordline transistor has its own model here
+    src = _is_wl_tr ? &(g_tp.dram_wl) : &(g_tp.peri_global);
+  }
+  else
+  {
+    // non-DRAM: the cell access transistor uses the SRAM cell model
+    src = _is_cell ? &(g_tp.sram_cell) : &(g_tp.peri_global);
+  }
+  return nWidth*src->I_g_on_n;
+}
+
+double cmos_Ig_p(
+    double pWidth,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr)
+{
+  // PMOS counterpart of cmos_Ig_n(): pWidth times I_g_on_p.
+  //
+  // Start from the model used for "all other" DRAM or SRAM transistors
+  // and override it for the two (mutually exclusive) special cases.
+  const TechnologyParameter::DeviceType * dev = &(g_tp.peri_global);
+
+  if ((!_is_dram)&&(_is_cell))
+  { //SRAM cell access transistor
+    dev = &(g_tp.sram_cell);
+  }
+  if ((_is_dram)&&(_is_wl_tr))
+  { //DRAM wordline transistor
+    dev = &(g_tp.dram_wl);
+  }
+  return pWidth*dev->I_g_on_p;
+}
+// Leakage of one fanin-input gate of type g_type, built from simplified_{n,p}mos_leakage() and averaged over the gate's input states.
+double cmos_Isub_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr,
+    enum Half_net_topology topo)   // only consulted for g_type nmos / pmos with fanin > 1
+{
+	assert (fanin>=1);
+	double nmos_leak = simplified_nmos_leakage(nWidth, _is_dram, _is_cell, _is_wl_tr);
+	double pmos_leak = simplified_pmos_leakage(pWidth, _is_dram, _is_cell, _is_wl_tr);
+    double Isub=0;
+    int    num_states;
+    int    num_off_tx;
+
+    num_states = int(pow(2.0, fanin));   // 2^fanin input combinations
+
+    switch (g_type)
+    {
+    case nmos:
+    	if (fanin==1)
+    	{
+    		Isub = nmos_leak/num_states;
+    	}
+    	else
+    	{
+    		if (topo==parallel)
+    		{
+    			Isub=nmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states
+    		}
+    		else
+    		{
+    			for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power
+    			{
+    				//Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+    				Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);  // stack factor per extra off tx, weighted by combination()
+    			}
+    			Isub /=num_states;
+    		}
+
+    	}
+    	break;
+    case pmos:   // same structure as the nmos case, with pmos_leak
+    	if (fanin==1)
+    	{
+    		Isub = pmos_leak/num_states;
+    	}
+    	else
+    	{
+    		if (topo==parallel)
+    		{
+    			Isub=pmos_leak*fanin/num_states; //only when all tx are off, leakage power is non-zero. The possibility of this state is 1/num_states
+    		}
+    		else
+    		{
+    			for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) //when num_off_tx ==0 there is no leakage power
+    			{
+    				//Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+    				Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+    			}
+    			Isub /=num_states;
+    		}
+
+    	}
+    	break;
+    case inv:
+    	Isub = (nmos_leak + pmos_leak)/2;   // mean of the n and p leakage values
+    	break;
+    case nand:
+    	Isub += fanin*pmos_leak;//the pullup network
+    	for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pulldown network
+    	{
+    		//Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+            Isub += nmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+    	}
+    	Isub /=num_states;
+    	break;
+    case nor:   // mirror image of nand: stacked pull-up sum, pull-down adds fanin*nmos_leak
+    	for (num_off_tx=1; num_off_tx<=fanin; num_off_tx++) // the pullup network
+    	{
+    		//Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*(factorial(fanin)/(factorial(fanin, num_off_tx)*factorial(num_off_tx)));
+    		Isub += pmos_leak*pow(UNI_LEAK_STACK_FACTOR,(num_off_tx-1))*combination(fanin, num_off_tx);
+    	}
+    	Isub += fanin*nmos_leak;//the pulldown network
+    	Isub /=num_states;
+    	break;
+    case tri:
+    	Isub += (nmos_leak + pmos_leak)/2;//enabled
+    	Isub += nmos_leak*UNI_LEAK_STACK_FACTOR; //disabled upper bound of leakage power
+    	Isub /=2;   // mean of the enabled and disabled values
+    	break;
+    case tg:
+    	Isub = (nmos_leak + pmos_leak)/2;   // same formula as inv
+    	break;
+    default:
+    	assert(0);   // unknown gate type
+    	break;
+	  }
+
+    return Isub;
+}
+
+// Counterpart of cmos_Isub_leakage() built from cmos_Ig_n() / cmos_Ig_p(); the sums run over the number of ON transistors.
+double cmos_Ig_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram,
+    bool _is_cell,
+    bool _is_wl_tr,
+    enum Half_net_topology topo)   // only consulted for g_type nmos / pmos with fanin > 1
+{
+	assert (fanin>=1);
+		double nmos_leak = cmos_Ig_n(nWidth, _is_dram, _is_cell, _is_wl_tr);
+		double pmos_leak = cmos_Ig_p(pWidth, _is_dram, _is_cell, _is_wl_tr);
+	    double Ig_on=0;
+	    int    num_states;
+	    int    num_on_tx;
+
+	    num_states = int(pow(2.0, fanin));   // 2^fanin input combinations
+
+	    switch (g_type)
+	    {
+	    case nmos:
+	    	if (fanin==1)
+	    	{
+	    		Ig_on = nmos_leak/num_states;
+	    	}
+	    	else
+	    	{
+	    		if (topo==parallel)
+	    		{
+	    	    	for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
+	    	        {
+	    	    		Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+	    	    	}
+	    		}   // NOTE(review): unlike the series branch, this sum is not divided by num_states -- confirm intended
+	    		else
+	    		{
+	    			Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
+	    		    //num_on_tx is the number of on tx
+	    			for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+	    			{
+	    				Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+	    			}
+	    			Ig_on /=num_states;
+	    		}
+	    	}
+	    	break;
+	    case pmos:   // same structure as the nmos case, with pmos_leak
+	    	if (fanin==1)
+	    	{
+	    		Ig_on = pmos_leak/num_states;
+	    	}
+	    	else
+	    	{
+	    		if (topo==parallel)
+    		    {
+    	    	  for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)
+    	          {
+    	    		  Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+    	    	  }
+    		    }   // NOTE(review): no division by num_states here either -- confirm intended
+    		    else
+    		    {
+    			  Ig_on += pmos_leak * fanin;//pull down network when all TXs are on.
+    		      //num_on_tx is the number of on tx
+    			  for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+    			  {
+    				  Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+    			  }
+	    		  Ig_on /=num_states;
+	    	    }
+	    	}
+	    	break;
+
+	    case inv:
+	    	Ig_on = (nmos_leak + pmos_leak)/2;   // mean of the n and p gate currents
+	    	break;
+	    case nand:
+	    	//pull up network
+	    	for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
+	        {
+	    		Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+	    	}
+
+	    	//pull down network
+	    	Ig_on += nmos_leak * fanin;//pull down network when all TXs are on.
+	    	//num_on_tx is the number of on tx
+	    	for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)//when num_on_tx=[1,n-1]
+	    	{
+	    		Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;//TODO: this is a approximation now, a precise computation will be very complicated.
+	    	}
+	    	Ig_on /=num_states;
+	    	break;
+	    case nor:
+	    	// num_on_tx is the number of on tx in pull up network
+	    	Ig_on += pmos_leak * fanin;//pull up network when all TXs are on.
+	    	for (num_on_tx=1; num_on_tx<fanin; num_on_tx++)
+	    	{
+	    		Ig_on += pmos_leak*combination(fanin, num_on_tx)*num_on_tx/2;
+
+	    	}
+	    	//pull down network
+	    	for (num_on_tx=1; num_on_tx<=fanin; num_on_tx++)//when num_on_tx=[1,n]
+	        {
+	    		Ig_on += nmos_leak*combination(fanin, num_on_tx)*num_on_tx;
+	    	}
+	    	Ig_on /=num_states;
+	    	break;
+	    case tri:
+	    	Ig_on += (2*nmos_leak + 2*pmos_leak)/2;//enabled
+	    	Ig_on += (nmos_leak + pmos_leak)/2; //disabled upper bound of leakage power
+	    	Ig_on /=2;   // mean of the enabled and disabled values
+	    	break;
+	    case tg:
+	    	Ig_on = (nmos_leak + pmos_leak)/2;   // same formula as inv
+	    	break;
+	    default:
+	    	assert(0);   // unknown gate type
+	    	break;
+		  }
+
+	    return Ig_on;
+}
+
+double shortcircuit_simple(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd)
+{
+
+	// Simplified short-circuit estimate for one gate.  The p_* names are historical:
+	// the value is actually an energy.  It is the mean of a discharge term and a
+	// charge term that differ only in the fanout ratio used (fo_p vs fo_n) and in
+	// whether beta_ratio divides or multiplies.
+	//
+	//   vt, vdd              threshold and supply voltage (used as vdd-vt and vt/vdd)
+	//   velocity_index       enters as 1/velocity_index^2
+	//   c_in, c_out          capacitances; c_out/c_in is the capacitive fanout
+	//   i_on_n, i_on_p       on-currents; their ratio i_on_p/i_on_n is beta_ratio
+	//   i_on_n_in, i_on_p_in reference on-currents for fo_n and fo_p
+	//                        (presumably those of the driving stage -- confirm)
+	//   w_nmos, w_pmos       not used by this simplified model
+	//
+	// Only the "low" approximation is evaluated; combining it with a "high"
+	// approximation through a harmonic mean was sketched earlier but is not applied.
+	const double fo_n            = i_on_n/i_on_n_in;
+	const double fo_p            = i_on_p/i_on_p_in;
+	const double fanout          = c_out/c_in;
+	const double beta_ratio      = i_on_p/i_on_n;
+	const double vt_to_vdd_ratio = vt/vdd;
+
+	// The coefficient is written 10.0/3: the former 10/3 was an integer division
+	// and silently evaluated to 3.  An earlier revision cubed (0.5 - vt_to_vdd_ratio)
+	// instead of ((vdd-vt) - vt_to_vdd_ratio).
+	const double common = 10.0/3*(pow(((vdd-vt)-vt_to_vdd_ratio),3.0)/pow(velocity_index,2.0)/pow(2.0,3*vt_to_vdd_ratio*vt_to_vdd_ratio))*c_in*vdd*vdd;
+
+	const double p_short_circuit_discharge = common*fo_p*fo_p/fanout/beta_ratio;
+	const double p_short_circuit_charge    = common*fo_n*fo_n/fanout*beta_ratio;
+
+	const double p_short_circuit = (p_short_circuit_discharge + p_short_circuit_charge)/2;
+
+  return (p_short_circuit);
+}
+
+double shortcircuit(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd)
+{
+	// Placeholder with the same parameter list as shortcircuit_simple():
+	// every argument is ignored and the result is always zero.
+  return 0.0;
+}
diff --git a/src/gpuwattch/cacti/basic_circuit.h b/src/gpuwattch/cacti/basic_circuit.h
new file mode 100644
index 000000000..bafd3d30d
--- /dev/null
+++ b/src/gpuwattch/cacti/basic_circuit.h
@@ -0,0 +1,248 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __BASIC_CIRCUIT_H__
+#define __BASIC_CIRCUIT_H__
+
+#include "const.h"
+#include "cacti_interface.h"
+
+using namespace std;
+
+#define UNI_LEAK_STACK_FACTOR 0.43  // raised to (number of off transistors - 1) in cmos_Isub_leakage()
+
+int powers (int base, int n);       // base^n by repeated multiplication (1 when n <= 0)
+bool is_pow2(int64_t val);          // true for positive powers of two, including 1
+uint32_t _log2(uint64_t num);       // floor(log2(num)); exits the process when num == 0
+int factorial(int n, int m = 1);    // product of the integers m..n, i.e. n! for the default m
+int combination(int n, int m);      // intended as the binomial coefficient "n choose m"
+
+//#define DBG
+#ifdef DBG  // debug build: PRINTDW(a) expands to "; a;"
+    #define PRINTDW(a);\
+    a;
+#else       // otherwise PRINTDW(a) expands to a lone ";" and drops its argument
+    #define PRINTDW(a);\
+
+#endif
+
+
+enum Wire_placement {      // presumably where a wire runs relative to a mat; consumers are outside this header
+    outside_mat,
+    inside_mat,
+    local_wires
+};
+
+// NOTE(review): the five values below mirror Bank's htree_in_add / htree_in_data /
+// htree_out_data / htree_in_search / htree_out_search members -- presumably one per tree; confirm.
+enum Htree_type {
+    Add_htree,
+    Data_in_htree,
+    Data_out_htree,
+    Search_in_htree,
+    Search_out_htree,
+};
+
+enum Gate_type {           // selects the formula in cmos_Isub_leakage() / cmos_Ig_leakage()
+    nmos,                  // n-only network; topo matters when fanin > 1
+    pmos,                  // p-only network; topo matters when fanin > 1
+	inv,
+    nand,
+    nor,
+    tri,                   // averaged over its enabled and disabled cases
+    tg                     // handled with the same formula as inv
+};
+
+enum Half_net_topology {   // arrangement used for Gate_type nmos / pmos
+    parallel,
+    series                 // default value in the leakage prototypes
+};
+
+double logtwo (double x);   // log base 2; the definition asserts x > 0
+// Gate capacitance; the flags select the device model.  wirelength is not used by the definition.
+double gate_C(
+    double width,
+    double wirelength,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+// Same computation as gate_C().  (_is_cell was declared as _is_sram; the definitions name it _is_cell.)
+double gate_C_pass(
+    double width,
+    double wirelength,
+    bool   _is_dram = false,
+    bool   _is_cell = false,
+    bool   _is_wl_tr = false);
+// Drain capacitance; fold_dimension is a folding-width threshold when the preceding int is 0, else a cell height.
+double drain_C_(
+    double width,
+    int nchannel,
+    int stack,
+    int next_arg_thresh_folding_width_or_height_cell,
+    double fold_dimension,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+// On-resistance of `stack` stacked devices: stack * R_on / width.
+double tr_R_on(
+    double width,
+    int nchannel,
+    int stack,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+// Width whose on-resistance equals res: R_on / res.
+double R_to_w(
+    double res,
+    int nchannel,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+// Horowitz delay approximation; `rise` is compared against RISE to pick the rising or falling form.
+double horowitz (
+    double inputramptime,
+    double tf,
+    double vs1,
+    double vs2,
+    int rise);
+// n_to_p_eff_curr_drv_ratio of the DRAM wordline model when both flags are set, else of peri_global.
+double pmos_to_nmos_sz_ratio(
+    bool _is_dram = false,
+    bool _is_wl_tr = false);
+// nwidth * I_off_n of the device model selected by the flags.
+double simplified_nmos_leakage(
+    double nwidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+// pwidth * I_off_p of the device model selected by the flags.
+double simplified_pmos_leakage(
+    double pwidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+
+// nWidth * I_off_n + pWidth * I_off_p of the selected device model.
+double cmos_Ileak(
+    double nWidth,
+    double pWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false);
+// nWidth * I_g_on_n of the selected device model.
+double cmos_Ig_n(
+    double nWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr= false);
+// pWidth * I_g_on_p of the selected device model.
+double cmos_Ig_p(
+    double pWidth,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr= false);
+
+// Leakage of a fanin-input gate of type g_type, built from simplified_{n,p}mos_leakage(); asserts fanin >= 1.
+double cmos_Isub_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false,
+    enum Half_net_topology topo = series);
+// Same structure as cmos_Isub_leakage() but built from cmos_Ig_n() / cmos_Ig_p().
+double cmos_Ig_leakage(
+    double nWidth,
+    double pWidth,
+    int    fanin,
+    enum Gate_type g_type,
+    bool _is_dram = false,
+    bool _is_cell = false,
+    bool _is_wl_tr = false,
+    enum Half_net_topology topo = series);
+// Placeholder: the definition ignores every argument and returns 0.
+double shortcircuit(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd);
+// Mean of a discharge and a charge short-circuit term (an energy); w_nmos and w_pmos are unused.
+double shortcircuit_simple(
+    double vt,
+    double velocity_index,
+    double c_in,
+    double c_out,
+    double w_nmos,
+    double w_pmos,
+    double i_on_n,
+    double i_on_p,
+    double i_on_n_in,
+    double i_on_p_in,
+    double vdd);
+// Fill the power point product mask pppv[0..3] with a, b, c, d (default 1 each); strictly speaking not a real point product.
+inline void set_pppm(
+	double * pppv,
+	double a=1,
+    double b=1,
+    double c=1,
+    double d=1
+    ){
+		const double mask[4] = {a, b, c, d};
+		for (int slot = 0; slot < 4; ++slot)
+		{
+			pppv[slot] = mask[slot];
+		}
+}
+// Store a, b, c in sppv[0..2] (default 1 each); d is accepted but never written.
+inline void set_sppm(
+	double * sppv,
+	double a=1,
+    double b=1,
+    double c=1,
+    double d=1
+    ){
+		const double vals[3] = {a, b, c};
+		for (int k = 0; k < 3; ++k)
+			sppv[k] = vals[k];
+}
+
+#endif
diff --git a/src/gpuwattch/cacti/batch_tests b/src/gpuwattch/cacti/batch_tests
new file mode 100755
index 000000000..45a03898e
--- /dev/null
+++ b/src/gpuwattch/cacti/batch_tests
@@ -0,0 +1,41 @@
+rm -rf ./out.csv  # delete any previous out.csv first (presumably the file the runs below write to -- confirm)
+./cacti 8192     64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1  # runs 1-12: 1st argument swept 8192..16777216 in powers of two, 27th argument = 0
+./cacti 16384    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 32768    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 65536    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 131072   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 262144   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 524288   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 1048576  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 0 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8192     64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1  # runs 13-24: same sweep with the 27th argument = 3
+./cacti 16384    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 32768    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 65536    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 131072   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 262144   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 524288   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 1048576  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 3 0 0 0 1 1 1 1 0 0 0 1 1
+./cacti 8192     64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1  # runs 25-36: same sweep with the 27th argument = 4 and arguments 28-30 = 1
+./cacti 16384    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 32768    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 65536    64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 131072   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 262144   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 524288   64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 1048576  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 0 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 2097152  64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1  # runs 37-40: the four largest sizes again, as runs 25-36 but with the 14th argument = 1
+./cacti 4194304  64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 8388608  64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
+./cacti 16777216 64 1 1 0 0 0 1 65 512 0 0 0 1 0 1 0 0 0 0 10 1000 1000 1000 1000 360 4 1 1 1 1 1 1 1 0 0 0 1 1
diff --git a/src/gpuwattch/cacti/cache.cfg b/src/gpuwattch/cacti/cache.cfg
new file mode 100755
index 000000000..03de34a13
--- /dev/null
+++ b/src/gpuwattch/cacti/cache.cfg
@@ -0,0 +1,175 @@
+# Cache size
+//-size (bytes) 2048
+//-size (bytes) 4096
+//-size (bytes) 32768
+//-size (bytes) 262144
+//-size (bytes) 1048576
+//-size (bytes) 2097152
+//-size (bytes) 4194304
+//-size (bytes) 8388608
+//-size (bytes) 16777216
+//-size (bytes) 33554432
+//-size (bytes) 134217728
+//-size (bytes) 67108864
+-size (bytes) 1073741824
+
+# Line size
+//-block size (bytes) 8
+-block size (bytes) 64
+
+# To model Fully Associative cache, set associativity to zero
+//-associativity 0
+//-associativity 2
+//-associativity 4
+-associativity 8
+//-associativity 16
+
+-read-write port 1
+-exclusive read port 0
+-exclusive write port 0
+-single ended read ports 0
+
+# Multiple banks connected using a bus
+-UCA bank count 1
+-technology (u) 0.022
+//-technology (u) 0.040
+//-technology (u) 0.032
+//-technology (u) 0.090
+
+# following three parameters are meaningful only for main memories
+
+-page size (bits) 8192 
+-burst length 8
+-internal prefetch width 8
+
+# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
+-Data array cell type - "itrs-hp"
+//-Data array cell type - "itrs-lstp"
+//-Data array cell type - "itrs-lop"
+
+# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
+-Data array peripheral type - "itrs-hp"
+//-Data array peripheral type - "itrs-lstp"
+//-Data array peripheral type - "itrs-lop"
+
+# following parameter can have one of five values -- (itrs-hp, itrs-lstp, itrs-lop, lp-dram, comm-dram)
+-Tag array cell type - "itrs-hp"
+//-Tag array cell type - "itrs-lstp"
+//-Tag array cell type - "itrs-lop"
+
+# following parameter can have one of three values -- (itrs-hp, itrs-lstp, itrs-lop)
+-Tag array peripheral type - "itrs-hp"
+//-Tag array peripheral type - "itrs-lstp"
+//-Tag array peripheral type - "itrs-lop"
+
+# Bus width include data bits and address bits required by the decoder
+//-output/input bus width 16
+-output/input bus width 512
+
+// Operating temperature must be between 300 and 400 K, in steps of 10
+-operating temperature (K) 360
+
+# Type of memory - cache (with a tag array) or ram (scratch ram similar to a register file) 
+# or main memory (no tag array and every access will happen at a page granularity Ref: CACTI 5.3 report)
+-cache type "cache"
+//-cache type "ram"
+//-cache type "main memory"
+
+# to model special structure like branch target buffers, directory, etc. 
+# change the tag size parameter
+# if you want cacti to calculate the tagbits, set the tag size to "default"
+-tag size (b) "default"
+//-tag size (b) 22
+
+# fast - data and tag access happen in parallel and the whole set is sent over
+#        the data array h-tree without waiting for the way-select signal
+# sequential - data array is accessed after accessing the tag array
+# normal - data array lookup and tag access happen in parallel, but the final data
+#          block is broadcast over the data array h-tree only after the signal from the tag array arrives
+//-access mode (normal, sequential, fast) - "fast"
+-access mode (normal, sequential, fast) - "normal"
+//-access mode (normal, sequential, fast) - "sequential"
+
+
+# DESIGN OBJECTIVE for UCA (or banks in NUCA)
+-design objective (weight delay, dynamic power, leakage power, cycle time, area) 0:0:0:100:0
+
+# Percentage deviation from the minimum value 
+# Ex: A deviation value of 10:1000:1000:1000:1000 will try to find an organization
+# whose delay is at most 10% worse than the minimum delay.
+# NOTE: Try reasonable values for % deviation. Inconsistent deviation
+# percentage values will not produce any valid organizations. For example,
+# 0:0:100:100:100 will try to identify an organization that has both
+# least delay and dynamic power. Since such an organization is not possible, CACTI will
+# throw an error. Refer CACTI-6 Technical report for more details
+-deviate (delay, dynamic power, leakage power, cycle time, area) 20:100000:100000:100000:100000
+
+# Objective for NUCA
+-NUCAdesign objective (weight delay, dynamic power, leakage power, cycle time, area) 100:100:0:0:100
+-NUCAdeviate (delay, dynamic power, leakage power, cycle time, area) 10:10000:10000:10000:10000
+
+# Set optimize tag to ED or ED^2 to obtain a cache configuration optimized for
+# energy-delay or energy-delay sq. product
+# Note: Optimize tag will disable weight or deviate values mentioned above
+# Set it to NONE to let weight and deviate values determine the 
+# appropriate cache configuration
+//-Optimize ED or ED^2 (ED, ED^2, NONE): "ED"
+-Optimize ED or ED^2 (ED, ED^2, NONE): "ED^2"
+//-Optimize ED or ED^2 (ED, ED^2, NONE): "NONE"
+
+-Cache model (NUCA, UCA)  - "UCA"
+//-Cache model (NUCA, UCA)  - "NUCA"
+
+# In order for CACTI to find the optimal NUCA bank value the following
+# variable should be assigned 0.
+-NUCA bank count 0
+
+# NOTE: for NUCA, the network frequency is set to a default value of
+# 5GHz in time.c. CACTI automatically
+# calculates the maximum possible frequency and downgrades this value if necessary
+
+# By default CACTI considers both full-swing and low-swing 
+# wires to find an optimal configuration. However, it is possible to 
+# restrict the search space by changing the signalling from "default" to 
+# "fullswing" or "lowswing" type.
+//-Wire signalling (fullswing, lowswing, default) - "Global_10"
+-Wire signalling (fullswing, lowswing, default) - "default"
+//-Wire signalling (fullswing, lowswing, default) - "lowswing"
+
+//-Wire inside mat - "global"
+-Wire inside mat - "semi-global"
+//-Wire outside mat - "global"
+-Wire outside mat - "semi-global"
+
+//-Interconnect projection - "conservative"
+-Interconnect projection - "aggressive"
+
+# Contention in the network (which is a function of core count and cache level) is one of
+# the critical factors used for deciding the optimal bank count value.
+# Core count can be 4, 8, or 16
+//-Core count 4
+-Core count 8
+//-Core count 16
+-Cache level (L2/L3) - "L3"
+
+-Add ECC - "true"
+
+//-Print level (DETAILED, CONCISE) - "CONCISE"
+-Print level (DETAILED, CONCISE) - "DETAILED"
+
+# for debugging
+//-Print input parameters - "true"
+-Print input parameters - "false"
+# force CACTI to model the cache with the 
+# following Ndbl, Ndwl, Nspd, Ndsam,
+# and Ndcm values
+//-Force cache config - "true"
+-Force cache config - "false"
+-Ndwl 1
+-Ndbl 1
+-Nspd 0
+-Ndcm 1
+-Ndsam1 0
+-Ndsam2 0
+
+
diff --git a/src/gpuwattch/cacti/cacti.i b/src/gpuwattch/cacti/cacti.i
new file mode 100644
index 000000000..796413872
--- /dev/null
+++ b/src/gpuwattch/cacti/cacti.i
@@ -0,0 +1,8 @@
+%module cacti
+%{
+/* Includes the header in the wrapper code */
+#include "cacti_interface.h"
+%}
+
+/* Parse the header file to generate wrappers */
+%include "cacti_interface.h"
\ No newline at end of file
diff --git a/src/gpuwattch/cacti/cacti.mk b/src/gpuwattch/cacti/cacti.mk
new file mode 100644
index 000000000..7f3c57338
--- /dev/null
+++ b/src/gpuwattch/cacti/cacti.mk
@@ -0,0 +1,65 @@
+
+OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch/cacti
+TARGET = cacti
+SHELL = /bin/sh
+.PHONY: all depend clean
+.SUFFIXES: .cc .o
+# NTHREADS is handed to the compiler as -DNTHREADS; it defaults to 8 unless set by the caller.
+ifndef NTHREADS
+  NTHREADS = 8
+endif
+
+# NOTE(review): the names look swapped (-lm is a library flag), but both variables feed the link line, so the result is the same.
+LIBS =
+INCS = -lm
+# TAG=dbg selects -O0 with -DNTHREADS=1. -gstabs+ was dropped: GCC 13 removed STABS support and rejects the flag.
+ifeq ($(TAG),dbg)
+  DBG = -Wall
+  OPT = -ggdb -g -O0 -DNTHREADS=1
+else
+  DBG =
+  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS)
+endif
+
+#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
+CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT)
+# Pick -m64 or -m32 from the host word size reported by getconf LONG_BIT.
+ifeq ($(shell getconf LONG_BIT),64)
+	CXX = g++ -m64
+	CC  = gcc -m64
+else
+	CXX = g++ -m32
+	CC  = gcc -m32
+endif
+
+
+SRCS  = area.cc bank.cc mat.cc main.cc Ucache.cc io.cc technology.cc basic_circuit.cc parameter.cc \
+		decoder.cc component.cc uca.cc subarray.cc wire.cc htree2.cc \
+		cacti_interface.cc router.cc nuca.cc crossbar.cc arbiter.cc
+
+OBJS = $(patsubst %.cc,$(OUTPUT_DIR)/%.o,$(SRCS))
+PYTHONLIB_SRCS = $(patsubst main.cc, ,$(SRCS)) $(OUTPUT_DIR)/cacti_wrap.cc
+PYTHONLIB_OBJS = $(patsubst %.cc,%.o,$(PYTHONLIB_SRCS))
+INCLUDES       = -I /usr/include/python2.4 -I /usr/lib/python2.4/config
+
+all: $(OUTPUT_DIR)/$(TARGET)
+
+$(OUTPUT_DIR)/$(TARGET) : $(OBJS)
+	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
+
+#obj_$(TAG)/%.o : %.cc
+#	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
+
+$(OUTPUT_DIR)/Makefile.makedepend: depend
+
+depend:
+	touch $(OUTPUT_DIR)/Makefile.makedepend
+	makedepend -f$(OUTPUT_DIR)/Makefile.makedepend -p$(OUTPUT_DIR)/ $(SRCS) 2> /dev/null
+
+$(OUTPUT_DIR)/%.o : %.cc
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+# Objects and the binary are written to $(OUTPUT_DIR), so clean there as well as in the current directory.
+clean:
+	-rm -f *.o _cacti.so cacti.py $(TARGET) $(OUTPUT_DIR)/*.o $(OUTPUT_DIR)/$(TARGET)
+
+include $(OUTPUT_DIR)/Makefile.makedepend
diff --git a/src/gpuwattch/cacti/cacti_interface.cc b/src/gpuwattch/cacti/cacti_interface.cc
new file mode 100644
index 000000000..99d734d2f
--- /dev/null
+++ b/src/gpuwattch/cacti/cacti_interface.cc
@@ -0,0 +1,174 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include <time.h>
+#include <math.h>
+
+
+#include "area.h"
+#include "basic_circuit.h"
+#include "component.h"
+#include "const.h"
+#include "parameter.h"
+#include "cacti_interface.h"
+#include "Ucache.h"
+
+#include <pthread.h>
+#include <iostream>
+#include <algorithm>
+
+using namespace std;
+
+// Ordering predicate: compares Nspd, Ndwl, Ndbl, deg_bl_muxing, Ndsam_lev_1 and Ndsam_lev_2 in that order; false when all are equal.
+bool mem_array::lt(const mem_array * m1, const mem_array * m2)
+{
+  if (m1->Nspd < m2->Nspd) return true;
+  else if (m1->Nspd > m2->Nspd) return false;
+  else if (m1->Ndwl < m2->Ndwl) return true;  // reached only when the preceding field compared equal
+  else if (m1->Ndwl > m2->Ndwl) return false;
+  else if (m1->Ndbl < m2->Ndbl) return true;
+  else if (m1->Ndbl > m2->Ndbl) return false;
+  else if (m1->deg_bl_muxing < m2->deg_bl_muxing) return true;
+  else if (m1->deg_bl_muxing > m2->deg_bl_muxing) return false;
+  else if (m1->Ndsam_lev_1 < m2->Ndsam_lev_1) return true;
+  else if (m1->Ndsam_lev_1 > m2->Ndsam_lev_1) return false;
+  else if (m1->Ndsam_lev_2 < m2->Ndsam_lev_2) return true;
+  else return false;  // last field greater or every field equal
+}
+
+
+// Sets access_time from the tag/data array timings, choosing the formula by the configuration flags read from g_ip.
+void uca_org_t::find_delay()
+{
+  mem_array * data_arr = data_array2;
+  mem_array * tag_arr  = tag_array2;
+
+  // check whether it is a regular cache or scratch ram
+  if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
+  {
+    access_time = data_arr->access_time;  // tag_arr is not dereferenced on this path
+  }
+  // Both tag and data lookup happen in parallel
+  // and the entire set is sent over the data array h-tree without
+  // waiting for the way-select signal --TODO add the corresponding
+  // power overhead Nav
+  else if (g_ip->fast_access == true)
+  {
+    access_time = MAX(tag_arr->access_time, data_arr->access_time);
+  }
+  // Tag is accessed first. On a hit, way-select signal along with the
+  // address is sent to read/write the appropriate block in the data
+  // array
+  else if (g_ip->is_seq_acc == true)
+  {
+    access_time = tag_arr->access_time + data_arr->access_time;
+  }
+  // Normal access: tag array access and data array access happen in parallel.
+  // But, the data array will wait for the way-select and transfer only the
+  // appropriate block over the h-tree.
+  else
+  {
+    access_time = MAX(tag_arr->access_time + data_arr->delay_senseamp_mux_decoder,
+                      data_arr->delay_before_subarray_output_driver) +
+                  data_arr->delay_from_subarray_output_driver_to_output;
+  }
+}
+
+
+
+void uca_org_t::find_energy()
+{
+  // Data-array power always counts; tag-array power is added unless the
+  // configuration is pure RAM, pure CAM or fully associative.
+  const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+  power = data_only ? data_array2->power : data_array2->power + tag_array2->power;
+}
+
+
+
+void uca_org_t::find_area()
+{
+  // Start from the data array's height and width; unless only the data array
+  // is consulted, take the taller of the two arrays and add their widths.
+  const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+  cache_ht  = data_array2->height;
+  cache_len = data_array2->width;
+  if (!data_only)
+  {
+    cache_ht  = MAX(tag_array2->height, data_array2->height);
+    cache_len = tag_array2->width + data_array2->width;
+  }
+  area = cache_ht * cache_len;
+}
+// For pure RAM / pure CAM / fully-associative configs with data-array area_efficiency below 20, shrinks cache_ht and cache_len; always recomputes area.
+void uca_org_t::adjust_area()
+{
+  double area_adjust;  // only assigned and read inside the low-efficiency branch below
+  if (g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)
+  {
+    if (data_array2->area_efficiency/100.0<0.2)
+    {
+    	//area_adjust = sqrt(area/(area*(data_array2->area_efficiency/100.0)/0.2));
+    	area_adjust = sqrt(0.2/(data_array2->area_efficiency/100.0));  // > 1 here, so both dimensions get smaller
+    	cache_ht  = cache_ht/area_adjust;
+    	cache_len = cache_len/area_adjust;
+    }
+  }
+  area = cache_ht * cache_len;
+}
+
+void uca_org_t::find_cyc()
+{
+  // Cycle time is the data array's alone for pure RAM, pure CAM and
+  // fully-associative configurations, otherwise the larger of the two arrays'.
+  const bool data_only = g_ip->pure_ram || g_ip->pure_cam || g_ip->fully_assoc;
+  if (data_only)
+  {
+    cycle_time = data_array2->cycle_time;
+    return;
+  }
+  cycle_time = MAX(tag_array2->cycle_time, data_array2->cycle_time);
+}
+
+uca_org_t :: uca_org_t()
+:tag_array2(0),
+ data_array2(0)
+{
+  // Only the two array pointers are set (to null); the scalar members (times, area, valid, ...) are not initialised here.
+}
+
+void uca_org_t :: cleanup()
+{ // Frees both arrays and nulls the pointers, so calling cleanup() again on this object is a no-op.
+	  delete data_array2;  // delete on a null pointer is well defined; no guard needed
+	  data_array2 = 0;
+	  delete tag_array2;
+	  tag_array2 = 0;  // copies of this uca_org_t made earlier still hold the old pointers
+}
diff --git a/src/gpuwattch/cacti/cacti_interface.h b/src/gpuwattch/cacti/cacti_interface.h
new file mode 100644
index 000000000..ff639dadb
--- /dev/null
+++ b/src/gpuwattch/cacti/cacti_interface.h
@@ -0,0 +1,633 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __CACTI_INTERFACE_H__
+#define __CACTI_INTERFACE_H__
+
+#include <map>
+#include <string>
+#include <vector>
+#include <list>
+#include <iostream>
+#include "const.h"
+
+using namespace std;
+
+
+class min_values_t;
+class mem_array;
+class uca_org_t;
+
+// Five scalar power terms: zero-initialised by default, copied member-wise, and combined through the friend operators declared below.
+class powerComponents
+{
+  public:
+    double dynamic;
+    double leakage;
+    double gate_leakage;
+    double short_circuit;
+    double longer_channel_leakage;
+
+    powerComponents() : dynamic(0), leakage(0), gate_leakage(0), short_circuit(0), longer_channel_leakage(0)  { }
+    powerComponents(const powerComponents & obj) { *this = obj; }  // copy construction delegates to operator=
+    powerComponents & operator=(const powerComponents & rhs)
+    {
+      dynamic = rhs.dynamic;
+      leakage = rhs.leakage;
+      gate_leakage  = rhs.gate_leakage;
+      short_circuit = rhs.short_circuit;
+      longer_channel_leakage = rhs.longer_channel_leakage;
+      return *this;
+    }
+    void reset() { dynamic = 0; leakage = 0; gate_leakage = 0; short_circuit = 0;longer_channel_leakage = 0;}
+    // Declared here, defined elsewhere; operator* takes a pointer to doubles (presumably a scale-factor array -- confirm its length).
+    friend powerComponents operator+(const powerComponents & x, const powerComponents & y);
+    friend powerComponents operator*(const powerComponents & x, double const * const y);
+};
+
+
+// Groups three powerComponents records: one each for read, write and search operations.
+class powerDef
+{
+  public:
+    powerComponents readOp;
+    powerComponents writeOp;
+    powerComponents searchOp;//Sheng: for CAM and FA
+
+    powerDef() : readOp(), writeOp(), searchOp() { }
+    void reset() { readOp.reset(); writeOp.reset(); searchOp.reset();}  // zeroes all three records
+    // Declared here, defined elsewhere.
+    friend powerDef operator+(const powerDef & x, const powerDef & y);
+    friend powerDef operator*(const powerDef & x, double const * const y);
+};
+
+enum Wire_type
+{
+    Global /* global wires with repeaters */,
+    Global_5 /* 5% delay penalty */,
+    Global_10 /* 10% delay penalty */,
+    Global_20 /* 20% delay penalty */,
+    Global_30 /* 30% delay penalty */,
+    Low_swing /* differential low power wires with high area overhead */,
+    Semi_global /* mid-level wires with repeaters*/,
+    Transmission /* transmission lines with high area overhead */,
+    Optical /* optical wires */,
+    Invalid_wtype /* last enumerator; presumably a sentinel for "no valid wire type" -- confirm */
+};
+
+
+// Public bag of CACTI input settings; parse_cfg() takes a file name and error_checking() returns false on problematic input.
+class InputParameter
+{
+  public:
+	InputParameter();
+    void parse_cfg(const string & infile);
+
+    bool error_checking();  // return false if the input parameters are problematic
+    void display_ip();
+
+    unsigned int cache_sz;  // in bytes
+    unsigned int line_sz;
+    unsigned int assoc;
+    unsigned int nbanks;
+    unsigned int out_w;// == nr_bits_out
+    bool     specific_tag;
+    unsigned int tag_w;
+    unsigned int access_mode;
+    unsigned int obj_func_dyn_energy;
+    unsigned int obj_func_dyn_power;
+    unsigned int obj_func_leak_power;
+    unsigned int obj_func_cycle_t;
+
+    double   F_sz_nm;          // feature size in nm
+    double   F_sz_um;          // feature size in um
+    unsigned int num_rw_ports;
+    unsigned int num_rd_ports;
+    unsigned int num_wr_ports;
+    unsigned int num_se_rd_ports;  // number of single ended read ports
+    unsigned int num_search_ports;  // Sheng: number of search ports for CAM
+    bool     is_main_mem;
+    bool     is_cache;
+    bool     pure_ram;
+    bool     pure_cam;
+    bool     rpters_in_htree;  // if there are repeaters in htree segment
+    unsigned int ver_htree_wires_over_array;
+    unsigned int broadcast_addr_din_over_ver_htrees;
+    unsigned int temp;  // presumably the operating temperature -- confirm against parse_cfg
+
+    unsigned int ram_cell_tech_type;
+    unsigned int peri_global_tech_type;
+    unsigned int data_arr_ram_cell_tech_type;
+    unsigned int data_arr_peri_global_tech_type;
+    unsigned int tag_arr_ram_cell_tech_type;
+    unsigned int tag_arr_peri_global_tech_type;
+
+    unsigned int burst_len;
+    unsigned int int_prefetch_w;
+    unsigned int page_sz_bits;
+
+    unsigned int ic_proj_type;      // interconnect_projection_type
+    unsigned int wire_is_mat_type;  // wire_inside_mat_type
+    unsigned int wire_os_mat_type; // wire_outside_mat_type
+    enum Wire_type wt;
+    int force_wiretype;
+    bool print_input_args;
+    unsigned int nuca_cache_sz; // TODO
+    int ndbl, ndwl, nspd, ndsam1, ndsam2, ndcm;
+    bool force_cache_config;
+
+    int cache_level;
+    int cores;
+    int nuca_bank_count;
+    int force_nuca_bank;
+
+    int delay_wt, dynamic_power_wt, leakage_power_wt,
+        cycle_time_wt, area_wt;
+    int delay_wt_nuca, dynamic_power_wt_nuca, leakage_power_wt_nuca,
+        cycle_time_wt_nuca, area_wt_nuca;
+
+    int delay_dev, dynamic_power_dev, leakage_power_dev,
+        cycle_time_dev, area_dev;
+    int delay_dev_nuca, dynamic_power_dev_nuca, leakage_power_dev_nuca,
+        cycle_time_dev_nuca, area_dev_nuca;
+    int ed; //ED or ED2 optimization
+    int nuca;
+
+    bool     fast_access;
+    unsigned int block_sz;  // bytes
+    unsigned int tag_assoc;
+    unsigned int data_assoc;
+    bool     is_seq_acc;
+    bool     fully_assoc;
+    unsigned int nsets;  // == number_of_sets
+    int print_detail;
+
+
+    bool     add_ecc_b_;
+  //parameters for design constraint
+  double throughput;
+  double latency;
+  bool pipelinable;
+  int pipeline_stages;
+  int per_stage_vector;
+  bool with_clock_grid;
+};
+
+// Flat record of per-array results (organisation, delays, power, dimensions, DRAM figures); uca_org_t holds one for the tag array and one for the data array.
+typedef struct{
+  int Ndwl;
+  int Ndbl;
+  double Nspd;
+  int deg_bl_muxing;
+  int Ndsam_lev_1;
+  int Ndsam_lev_2;
+  int number_activated_mats_horizontal_direction;
+  int number_subbanks;
+  int page_size_in_bits;
+  double delay_route_to_bank;
+  double delay_crossbar;
+  double delay_addr_din_horizontal_htree;
+  double delay_addr_din_vertical_htree;
+  double delay_row_predecode_driver_and_block;
+  double delay_row_decoder;
+  double delay_bitlines;
+  double delay_sense_amp;
+  double delay_subarray_output_driver;
+  double delay_bit_mux_predecode_driver_and_block;
+  double delay_bit_mux_decoder;
+  double delay_senseamp_mux_lev_1_predecode_driver_and_block;
+  double delay_senseamp_mux_lev_1_decoder;
+  double delay_senseamp_mux_lev_2_predecode_driver_and_block;
+  double delay_senseamp_mux_lev_2_decoder;
+  double delay_input_htree;
+  double delay_output_htree;
+  double delay_dout_vertical_htree;
+  double delay_dout_horizontal_htree;
+  double delay_comparator;
+  double access_time;
+  double cycle_time;
+  double multisubbank_interleave_cycle_time;
+  double delay_request_network;
+  double delay_inside_mat;
+  double delay_reply_network;
+  double trcd;
+  double cas_latency;
+  double precharge_delay;
+  powerDef power_routing_to_bank;
+  powerDef power_addr_input_htree;
+  powerDef power_data_input_htree;
+  powerDef power_data_output_htree;
+  powerDef power_addr_horizontal_htree;
+  powerDef power_datain_horizontal_htree;
+  powerDef power_dataout_horizontal_htree;
+  powerDef power_addr_vertical_htree;
+  powerDef power_datain_vertical_htree;
+  powerDef power_row_predecoder_drivers;
+  powerDef power_row_predecoder_blocks;
+  powerDef power_row_decoders;
+  powerDef power_bit_mux_predecoder_drivers;
+  powerDef power_bit_mux_predecoder_blocks;
+  powerDef power_bit_mux_decoders;
+  powerDef power_senseamp_mux_lev_1_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_1_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_1_decoders;
+  powerDef power_senseamp_mux_lev_2_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_2_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_2_decoders;
+  powerDef power_bitlines;
+  powerDef power_sense_amps;
+  powerDef power_prechg_eq_drivers;
+  powerDef power_output_drivers_at_subarray;
+  powerDef power_dataout_vertical_htree;
+  powerDef power_comparators;
+  powerDef power_crossbar;
+  powerDef total_power;
+  double area;
+  double all_banks_height;
+  double all_banks_width;
+  double bank_height;
+  double bank_width;
+  double subarray_memory_cell_area_height;
+  double subarray_memory_cell_area_width;
+  double mat_height;
+  double mat_width;
+  double routing_area_height_within_bank;
+  double routing_area_width_within_bank;
+  double area_efficiency;
+//  double perc_power_dyn_routing_to_bank;
+//  double perc_power_dyn_addr_horizontal_htree;
+//  double perc_power_dyn_datain_horizontal_htree;
+//  double perc_power_dyn_dataout_horizontal_htree;
+//  double perc_power_dyn_addr_vertical_htree;
+//  double perc_power_dyn_datain_vertical_htree;
+//  double perc_power_dyn_row_predecoder_drivers;
+//  double perc_power_dyn_row_predecoder_blocks;
+//  double perc_power_dyn_row_decoders;
+//  double perc_power_dyn_bit_mux_predecoder_drivers;
+//  double perc_power_dyn_bit_mux_predecoder_blocks;
+//  double perc_power_dyn_bit_mux_decoders;
+//  double perc_power_dyn_senseamp_mux_lev_1_predecoder_drivers;
+//  double perc_power_dyn_senseamp_mux_lev_1_predecoder_blocks;
+//  double perc_power_dyn_senseamp_mux_lev_1_decoders;
+//  double perc_power_dyn_senseamp_mux_lev_2_predecoder_drivers;
+//  double perc_power_dyn_senseamp_mux_lev_2_predecoder_blocks;
+//  double perc_power_dyn_senseamp_mux_lev_2_decoders;
+//  double perc_power_dyn_bitlines;
+//  double perc_power_dyn_sense_amps;
+//  double perc_power_dyn_prechg_eq_drivers;
+//  double perc_power_dyn_subarray_output_drivers;
+//  double perc_power_dyn_dataout_vertical_htree;
+//  double perc_power_dyn_comparators;
+//  double perc_power_dyn_crossbar;
+//  double perc_power_dyn_spent_outside_mats;
+//  double perc_power_leak_routing_to_bank;
+//  double perc_power_leak_addr_horizontal_htree;
+//  double perc_power_leak_datain_horizontal_htree;
+//  double perc_power_leak_dataout_horizontal_htree;
+//  double perc_power_leak_addr_vertical_htree;
+//  double perc_power_leak_datain_vertical_htree;
+//  double perc_power_leak_row_predecoder_drivers;
+//  double perc_power_leak_row_predecoder_blocks;
+//  double perc_power_leak_row_decoders;
+//  double perc_power_leak_bit_mux_predecoder_drivers;
+//  double perc_power_leak_bit_mux_predecoder_blocks;
+//  double perc_power_leak_bit_mux_decoders;
+//  double perc_power_leak_senseamp_mux_lev_1_predecoder_drivers;
+//  double perc_power_leak_senseamp_mux_lev_1_predecoder_blocks;
+//  double perc_power_leak_senseamp_mux_lev_1_decoders;
+//  double perc_power_leak_senseamp_mux_lev_2_predecoder_drivers;
+//  double perc_power_leak_senseamp_mux_lev_2_predecoder_blocks;
+//  double perc_power_leak_senseamp_mux_lev_2_decoders;
+//  double perc_power_leak_bitlines;
+//  double perc_power_leak_sense_amps;
+//  double perc_power_leak_prechg_eq_drivers;
+//  double perc_power_leak_subarray_output_drivers;
+//  double perc_power_leak_dataout_vertical_htree;
+//  double perc_power_leak_comparators;
+//  double perc_power_leak_crossbar;
+//  double perc_leak_mats;
+//  double perc_active_mats;
+  double refresh_power;
+  double dram_refresh_period;
+  double dram_array_availability;
+  double dyn_read_energy_from_closed_page;
+  double dyn_read_energy_from_open_page;
+  double leak_power_subbank_closed_page;
+  double leak_power_subbank_open_page;
+  double leak_power_request_and_reply_networks;
+  double activate_energy;
+  double read_energy;
+  double write_energy;
+  double precharge_energy;
+} results_mem_array;
+
+
+class uca_org_t  // Result record returned by value from cacti_interface() / init_interface().
+{
+  public:
+    mem_array * tag_array2;   // NOTE(review): raw pointers; the empty destructor below does not free
+    mem_array * data_array2;  // them - presumably cleanup() does, confirm against its definition.
+    double access_time;
+    double cycle_time;
+    double area;
+    double area_efficiency;
+    powerDef power;
+    double leak_power_with_sleep_transistors_in_mats;
+    double cache_ht;
+    double cache_len;
+    char file_n[100];  // fixed 100-byte buffer: anything copied in must be bounded accordingly
+    double vdd_periph_global;
+    bool valid;
+    results_mem_array tag_array;   // per-array result structs (type defined just above this class)
+    results_mem_array data_array;
+    // Everything below except the destructor is only declared here; definitions live elsewhere.
+    uca_org_t();
+    void find_delay();
+    void find_energy();
+    void find_area();
+    void find_cyc();
+    void adjust_area();//for McPAT only to adjust routing overhead
+    void cleanup();
+    ~uca_org_t(){};  // empty body: releases nothing
+};
+
+void reconfigure(InputParameter *local_interface, uca_org_t *fin_res);
+// The entry points below all return their result as a uca_org_t by value.
+uca_org_t cacti_interface(const string & infile_name);
+//McPAT's plain interface, please keep !!!
+uca_org_t cacti_interface(InputParameter * const local_interface);
+//McPAT's plain interface, please keep !!!
+uca_org_t init_interface(InputParameter * const local_interface);
+//McPAT's plain interface, please keep !!!
+uca_org_t cacti_interface(
+	    int cache_size,
+	    int line_size,
+	    int associativity,
+	    int rw_ports,
+	    int excl_read_ports,
+	    int excl_write_ports,
+	    int single_ended_read_ports,
+	    int search_ports,
+	    int banks,
+	    double tech_node,
+	    int output_width,
+	    int specific_tag,
+	    int tag_width,
+	    int access_mode,
+	    int cache,
+	    int main_mem,
+	    int obj_func_delay,
+	    int obj_func_dynamic_power,
+	    int obj_func_leakage_power,
+	    int obj_func_cycle_time, // NOTE(review): cycle_time precedes area here, unlike the dev_func_* group below and the obj_func_* group of the other long overload; all are int, so a swapped call compiles silently
+	    int obj_func_area,
+	    int dev_func_delay,
+	    int dev_func_dynamic_power,
+	    int dev_func_leakage_power,
+	    int dev_func_area,
+	    int dev_func_cycle_time,
+	    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+	    int temp,
+	    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+	    int data_arr_ram_cell_tech_flavor_in,
+	    int data_arr_peri_global_tech_flavor_in,
+	    int tag_arr_ram_cell_tech_flavor_in,
+	    int tag_arr_peri_global_tech_flavor_in,
+	    int interconnect_projection_type_in,
+	    int wire_inside_mat_type_in,
+	    int wire_outside_mat_type_in,
+	    int REPEATERS_IN_HTREE_SEGMENTS_in,
+	    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
+	    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
+	    int PAGE_SIZE_BITS_in,
+	    int BURST_LENGTH_in,
+	    int INTERNAL_PREFETCH_WIDTH_in,
+	    int force_wiretype,
+	    int wiretype,
+	    int force_config,
+	    int ndwl,
+	    int ndbl,
+	    int nspd,
+	    int ndcm,
+	    int ndsam1,
+	    int ndsam2,
+	    int ecc);
+//    int cache_size,
+//    int line_size,
+//    int associativity,
+//    int rw_ports,
+//    int excl_read_ports,
+//    int excl_write_ports,
+//    int single_ended_read_ports,
+//    int banks,
+//    double tech_node,
+//    int output_width,
+//    int specific_tag,
+//    int tag_width,
+//    int access_mode,
+//    int cache,
+//    int main_mem,
+//    int obj_func_delay,
+//    int obj_func_dynamic_power,
+//    int obj_func_leakage_power,
+//    int obj_func_area,
+//    int obj_func_cycle_time,
+//    int dev_func_delay,
+//    int dev_func_dynamic_power,
+//    int dev_func_leakage_power,
+//    int dev_func_area,
+//    int dev_func_cycle_time,
+//    int temp,
+//    int data_arr_ram_cell_tech_flavor_in,
+//    int data_arr_peri_global_tech_flavor_in,
+//    int tag_arr_ram_cell_tech_flavor_in,
+//    int tag_arr_peri_global_tech_flavor_in,
+//    int interconnect_projection_type_in,
+//    int wire_inside_mat_type_in,
+//    int wire_outside_mat_type_in,
+//    int REPEATERS_IN_HTREE_SEGMENTS_in,
+//    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
+//    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
+////    double MAXAREACONSTRAINT_PERC_in,
+////    double MAXACCTIMECONSTRAINT_PERC_in,
+////    double MAX_PERC_DIFF_IN_DELAY_FROM_BEST_DELAY_REPEATER_SOLUTION_in,
+//    int PAGE_SIZE_BITS_in,
+//    int BURST_LENGTH_in,
+//    int INTERNAL_PREFETCH_WIDTH_in);
+
+//Naveen's interface: adds page_sz/burst_length/pre_width and the NUCA arguments (is_nuca .. nuca_dev_func_cycle_time, p_input)
+uca_org_t cacti_interface(
+    int cache_size,
+    int line_size,
+    int associativity,
+    int rw_ports,
+    int excl_read_ports,
+    int excl_write_ports,
+    int single_ended_read_ports,
+    int banks,
+    double tech_node,
+    int page_sz,
+    int burst_length,
+    int pre_width,
+    int output_width,
+    int specific_tag,
+    int tag_width,
+    int access_mode, //0 normal, 1 seq, 2 fast
+    int cache, //scratch ram or cache
+    int main_mem,
+    int obj_func_delay,
+    int obj_func_dynamic_power,
+    int obj_func_leakage_power,
+    int obj_func_area, // NOTE(review): area precedes cycle_time here; the McPAT overload declared earlier orders obj_func_cycle_time before obj_func_area
+    int obj_func_cycle_time,
+    int dev_func_delay,
+    int dev_func_dynamic_power,
+    int dev_func_leakage_power,
+    int dev_func_area,
+    int dev_func_cycle_time,
+    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+    int temp,
+    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+    int data_arr_ram_cell_tech_flavor_in,
+    int data_arr_peri_global_tech_flavor_in,
+    int tag_arr_ram_cell_tech_flavor_in,
+    int tag_arr_peri_global_tech_flavor_in,
+    int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
+    int wire_inside_mat_type_in,
+    int wire_outside_mat_type_in,
+    int is_nuca, // 0 - UCA, 1 - NUCA
+    int core_count,
+    int cache_level, // 0 - L2, 1 - L3
+    int nuca_bank_count,
+    int nuca_obj_func_delay,
+    int nuca_obj_func_dynamic_power,
+    int nuca_obj_func_leakage_power,
+    int nuca_obj_func_area,
+    int nuca_obj_func_cycle_time,
+    int nuca_dev_func_delay,
+    int nuca_dev_func_dynamic_power,
+    int nuca_dev_func_leakage_power,
+    int nuca_dev_func_area,
+    int nuca_dev_func_cycle_time,
+    int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
+    int p_input);
+
+class mem_array  // Plain data record: partitioning parameters (Nd*), timing, area and a per-component power breakdown.
+{ // No constructor is declared, so built-in-typed members hold indeterminate values until the creator assigns them.
+  public:
+  int    Ndcm;
+  int    Ndwl;
+  int    Ndbl;
+  double Nspd;  // the only partitioning parameter stored as double; its neighbours are int
+  int    deg_bl_muxing;
+  int    Ndsam_lev_1;
+  int    Ndsam_lev_2;
+  double access_time;
+  double cycle_time;
+  double multisubbank_interleave_cycle_time;
+  double area_ram_cells;
+  double area;
+  powerDef power;
+  double delay_senseamp_mux_decoder;
+  double delay_before_subarray_output_driver;
+  double delay_from_subarray_output_driver_to_output;
+  double height;
+  double width;
+  // mat / subarray dimensions
+  double mat_height;
+  double mat_length;
+  double subarray_length;
+  double subarray_height;
+  // per-component delays
+  double delay_route_to_bank,
+         delay_input_htree,
+         delay_row_predecode_driver_and_block,
+         delay_row_decoder,
+         delay_bitlines,
+         delay_sense_amp,
+         delay_subarray_output_driver,
+         delay_dout_htree,
+         delay_comparator,
+         delay_matchlines;
+
+  double all_banks_height,
+         all_banks_width,
+         area_efficiency;
+  // per-component power breakdown
+  powerDef power_routing_to_bank;
+  powerDef power_addr_input_htree;
+  powerDef power_data_input_htree;
+  powerDef power_data_output_htree;
+  powerDef power_htree_in_search;
+  powerDef power_htree_out_search;
+  powerDef power_row_predecoder_drivers;
+  powerDef power_row_predecoder_blocks;
+  powerDef power_row_decoders;
+  powerDef power_bit_mux_predecoder_drivers;
+  powerDef power_bit_mux_predecoder_blocks;
+  powerDef power_bit_mux_decoders;
+  powerDef power_senseamp_mux_lev_1_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_1_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_1_decoders;
+  powerDef power_senseamp_mux_lev_2_predecoder_drivers;
+  powerDef power_senseamp_mux_lev_2_predecoder_blocks;
+  powerDef power_senseamp_mux_lev_2_decoders;
+  powerDef power_bitlines;
+  powerDef power_sense_amps;
+  powerDef power_prechg_eq_drivers;
+  powerDef power_output_drivers_at_subarray;
+  powerDef power_dataout_vertical_htree;
+  powerDef power_comparators;
+  // search-line / match-line power terms (presumably CAM-only, going by the names - confirm)
+  powerDef power_cam_bitline_precharge_eq_drv;
+  powerDef power_searchline;
+  powerDef power_searchline_precharge;
+  powerDef power_matchlines;
+  powerDef power_matchline_precharge;
+  powerDef power_matchline_to_wordline_drv;
+
+  min_values_t *arr_min;  // NOTE(review): raw pointer, no destructor declared here - ownership is not visible in this header
+  enum Wire_type wt;
+
+  // dram stats
+  double activate_energy, read_energy, write_energy, precharge_energy,
+  refresh_power, leak_power_subbank_closed_page, leak_power_subbank_open_page,
+  leak_power_request_and_reply_networks;
+
+  double precharge_delay;
+  // binary predicate over two mem_array pointers; only declared here, ordering criterion is in its definition
+  static bool lt(const mem_array * m1, const mem_array * m2);
+};
+
+
+#endif
diff --git a/src/gpuwattch/cacti/component.cc b/src/gpuwattch/cacti/component.cc
new file mode 100644
index 000000000..abe5cb90c
--- /dev/null
+++ b/src/gpuwattch/cacti/component.cc
@@ -0,0 +1,236 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+
+#include <assert.h>
+#include <iostream>
+#include <math.h>
+
+#include "bank.h"
+#include "component.h"
+#include "decoder.h"
+
+using namespace std;
+
+
+
+Component::Component()  // zero/value-initialise every member; cycle_time used to be left indeterminate
+  : area(), power(), rt_power(), delay(0), cycle_time(0)
+{
+}
+
+
+
+// No explicit cleanup: the destructor body is intentionally empty.
+Component::~Component()
+{}
+
+
+
+double Component::compute_diffusion_width(int num_stacked_in, int num_folded_tr)
+{
+  // Total diffusion width for num_folded_tr folded legs, each carrying
+  // num_stacked_in stacked poly gates.
+  const double poly_w        = g_ip->F_sz_um;
+  const double contact_pitch = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
+  const int    stack_gaps    = num_stacked_in - 1;
+  // first leg: contact spacing for both source and drain plus the gate stack between them
+  double width = 2 * contact_pitch + num_stacked_in * poly_w + stack_gaps * g_tp.spacing_poly_to_poly;
+  if (num_folded_tr <= 1)
+    return width;
+
+  const int extra_legs = num_folded_tr - 1;  // every further leg adds another gate stack
+  width += (num_folded_tr - 2) * 2 * contact_pitch + extra_legs * num_stacked_in * poly_w +
+           extra_legs * stack_gaps * g_tp.spacing_poly_to_poly;
+  return width;
+}
+
+
+
+double Component::compute_gate_area(
+    int gate_type,
+    int num_inputs,
+    double w_pmos,
+    double w_nmos,
+    double h_gate)
+{
+  // Layout area of a single INV / NOR / NAND gate whose cell height is at
+  // most h_gate. The devices are folded into legs that fit between the
+  // power rails; the wider of the two diffusion rows sets the gate width.
+  // Returns 0.0 when either width is non-positive or the p/(p+n) share is
+  // not strictly inside (0, 1); an unknown gate_type terminates the program.
+
+  if (w_pmos <= 0.0 || w_nmos <= 0.0)
+    return 0.0;
+
+  Area gate;
+
+  // height left for the transistors once both power rails are removed
+  const double h_tr_region = h_gate - 2 * g_tp.HPOWERRAIL;
+  const double p_share     = w_pmos / (w_pmos + w_nmos);
+  if (p_share >= 1 || p_share <= 0)
+    return 0.0;
+
+  // split what remains (minus the p/n diffusion gap) in proportion to the widths
+  const double usable_h      = h_tr_region - g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
+  const double w_folded_pmos = usable_h * p_share;
+  const double w_folded_nmos = usable_h * (1 - p_share);
+  assert(w_folded_pmos > 0);
+
+  // legs needed so that no single leg is wider than its folded limit
+  const int num_folded_pmos = (int) (ceil(w_pmos / w_folded_pmos));
+  const int num_folded_nmos = (int) (ceil(w_nmos / w_folded_nmos));
+
+  // diffusion row widths for the requested gate type
+  double total_ndiff_w, total_pdiff_w;
+  if (gate_type == INV)
+  {
+    total_ndiff_w = compute_diffusion_width(1, num_folded_nmos);
+    total_pdiff_w = compute_diffusion_width(1, num_folded_pmos);
+  }
+  else if (gate_type == NOR)
+  { // NMOS unstacked (num_inputs times the legs), PMOS stacked num_inputs deep
+    total_ndiff_w = compute_diffusion_width(1, num_inputs * num_folded_nmos);
+    total_pdiff_w = compute_diffusion_width(num_inputs, num_folded_pmos);
+  }
+  else if (gate_type == NAND)
+  { // the reverse of NOR: NMOS stacked, PMOS unstacked
+    total_ndiff_w = compute_diffusion_width(num_inputs, num_folded_nmos);
+    total_pdiff_w = compute_diffusion_width(1, num_inputs * num_folded_pmos);
+  }
+  else
+  {
+    cout << "Unknown gate type: " << gate_type << endl;
+    exit(1);
+  }
+
+  gate.w = MAX(total_ndiff_w, total_pdiff_w);
+
+  // If an unfolded NMOS already fits, the gate can be shorter than h_gate:
+  // use the height the devices actually need. Otherwise keep h_gate.
+  gate.h = (w_folded_nmos > w_nmos)
+             ? w_nmos + w_pmos + g_tp.MIN_GAP_BET_P_AND_N_DIFFS + 2 * g_tp.HPOWERRAIL
+             : h_gate;
+
+  return gate.get_area();
+}
+
+
+
+double Component::compute_tr_width_after_folding(
+    double input_width,
+    double threshold_folding_width)
+{
+  // Returns the width of the *cell*, not of a device: the two are orthogonal.
+  // input_width is split into ceil(input_width / threshold_folding_width)
+  // legs. A non-positive input_width yields 0.
+  if (input_width <= 0)
+    return 0;
+
+  const int    legs          = (int) (ceil(input_width / threshold_folding_width));
+  const double contact_pitch = g_tp.w_poly_contact + 2 * g_tp.spacing_poly_to_contact;
+  const double poly_w        = g_ip->F_sz_um;
+
+  // one poly stripe per leg and one contact spacing more than there are legs
+  return legs * poly_w + (legs + 1) * contact_pitch;
+}
+
+
+
+double Component::height_sense_amplifier(double pitch_sense_amp)
+{
+  // Height = PMOS rows + NMOS rows + the gap between p and n diffusion; each
+  // device width is folded against pitch_sense_amp.
+  const double same_type_gaps = 2 * g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS;
+
+  const double pmos_rows = compute_tr_width_after_folding(g_tp.w_sense_p, pitch_sense_amp) * 2 +
+                           compute_tr_width_after_folding(g_tp.w_iso, pitch_sense_amp) +
+                           same_type_gaps;
+  const double nmos_rows = compute_tr_width_after_folding(g_tp.w_sense_n, pitch_sense_amp) * 2 +
+                           compute_tr_width_after_folding(g_tp.w_sense_en, pitch_sense_amp) +
+                           same_type_gaps;
+
+  return pmos_rows + nmos_rows + g_tp.MIN_GAP_BET_P_AND_N_DIFFS;
+}
+
+
+
+int Component::logical_effort(
+    int num_gates_min,
+    double g,
+    double F,
+    double * w_n,
+    double * w_p,
+    double C_load,
+    double p_to_n_sz_ratio,
+    bool   is_dram_,
+    bool   is_wl_tr_,
+    double max_w_nmos)
+{ // Picks a stage count N >= num_gates_min for total effort F, writes widths for
+  // stages N-1..1 into w_n/w_p and returns N. Index 0 is never written here; it is
+  // read only in the max_w_nmos branch (presumably pre-sized by the caller - confirm).
+  int num_gates = (int) (log(F) / log(fopt));
+  num_gates+= (num_gates % 2) ? 1 : 0;  // odd -> next even
+  num_gates = MAX(num_gates, num_gates_min);
+  assert(num_gates <= MAX_NUMBER_GATES_STAGE);  // must hold before w_n/w_p are indexed
+
+  // recalculate the effective fanout of each stage
+  double f = pow(F, 1.0 / num_gates);
+  int    i = num_gates - 1;
+  double C_in = C_load / f;
+  w_n[i]  = (1.0 / (1.0 + p_to_n_sz_ratio)) * C_in / gate_C(1, 0, is_dram_, false, is_wl_tr_);
+  w_n[i]  = MAX(w_n[i], g_tp.min_w_nmos_);
+  w_p[i]  = p_to_n_sz_ratio * w_n[i];
+
+  if (w_n[i] > max_w_nmos)
+  { // final stage too wide: cap it at max_w_nmos and re-derive the chain from stage 0
+    double C_ld = gate_C((1 + p_to_n_sz_ratio) * max_w_nmos, 0, is_dram_, false, is_wl_tr_);
+    F = g * C_ld / gate_C(w_n[0] + w_p[0], 0, is_dram_, false, is_wl_tr_);
+    num_gates = (int) (log(F) / log(fopt)) + 1;
+    num_gates+= (num_gates % 2) ? 1 : 0;
+    num_gates = MAX(num_gates, num_gates_min);
+    assert(num_gates <= MAX_NUMBER_GATES_STAGE);  // the count changed, so check again before indexing
+    f = pow(F, 1.0 / (num_gates - 1));
+    i = num_gates - 1;
+    w_n[i]  = max_w_nmos;
+    w_p[i]  = p_to_n_sz_ratio * w_n[i];
+  }
+
+  for (i = num_gates - 2; i >= 1; i--)
+  {
+    w_n[i] = MAX(w_n[i+1] / f, g_tp.min_w_nmos_);
+    w_p[i] = p_to_n_sz_ratio * w_n[i];
+  }
+  return num_gates;
+}
+
diff --git a/src/gpuwattch/cacti/component.h b/src/gpuwattch/cacti/component.h
new file mode 100644
index 000000000..74a089dbf
--- /dev/null
+++ b/src/gpuwattch/cacti/component.h
@@ -0,0 +1,84 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __COMPONENT_H__
+#define __COMPONENT_H__
+
+#include "parameter.h"
+#include "area.h"
+
+using namespace std;
+
+class Crossbar;
+class Bank;
+
+class Component  // Holds area/power/delay results plus gate-sizing and layout helper routines.
+{
+  public:
+    Component();
+    ~Component();  // NOTE(review): non-virtual - confirm derived objects are never deleted through a Component*
+    // result storage
+    Area area;
+    powerDef power,rt_power;
+    double delay;
+    double cycle_time;
+    // layout helpers; all return a double computed from their arguments and global technology parameters
+    double compute_gate_area(
+        int gate_type,
+        int num_inputs,
+        double w_pmos,
+        double w_nmos,
+        double h_gate);
+
+    double compute_tr_width_after_folding(double input_width, double threshold_folding_width);
+    double height_sense_amplifier(double pitch_sense_amp);
+
+  protected:
+    int logical_effort(  // returns the number of stages and writes per-stage widths through w_n / w_p
+        int    num_gates_min,
+        double g,
+        double F,
+        double * w_n,
+        double * w_p,
+        double C_load,
+        double p_to_n_sz_ratio,
+        bool   is_dram_,
+        bool   is_wl_tr_,
+        double max_w_nmos);
+
+  private:
+    double compute_diffusion_width(int num_stacked_in, int num_folded_tr);  // used by compute_gate_area
+};
+
+#endif
+
diff --git a/src/gpuwattch/cacti/const.h b/src/gpuwattch/cacti/const.h
new file mode 100644
index 000000000..ada91f919
--- /dev/null
+++ b/src/gpuwattch/cacti/const.h
@@ -0,0 +1,270 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef __CONST_H__
+#define __CONST_H__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <math.h>
+
+/*  The following are things you might want to change
+ *  when compiling
+ */
+
+/*
+ * Address bits in a word, and number of output bits from the cache
+ */
+
+/*
+was: #define ADDRESS_BITS 32
+now: I'm using 42 bits as in the Power4,
+since that's bigger than the 36 bits on the Pentium 4
+and 40 bits on the Opteron
+*/
+const int ADDRESS_BITS = 42;
+
+/*dt: In addition to the tag bits, the tags also include 1 valid bit, 1 dirty bit, 2 bits for a 4-state
+  cache coherency protocol (MESI), 1 bit for MRU (change this to log(ways) for full LRU).
+  So in total we have 1 + 1 + 2 + 1 = 5 */
+const int EXTRA_TAG_BITS = 5;
+
+/* limits on the various N parameters */
+
+const unsigned int MAXDATAN     = 512;      // maximum for Ndwl and Ndbl
+const unsigned int MAXSUBARRAYS = 1048576;  // maximum subarrays for data and tag arrays
+const unsigned int MAXDATASPD   = 256;      // maximum for Nspd
+const unsigned int MAX_COL_MUX  = 256;
+
+
+
+#define ROUTER_TYPES 3
+#define WIRE_TYPES 6
+
+const double Cpolywire = 0;
+
+
+/* Threshold voltages (as a proportion of Vdd)
+   If you don't know them, set all values to 0.5 */
+#define VTHFA1         0.452
+#define VTHFA2         0.304
+#define VTHFA3         0.420
+#define VTHFA4         0.413
+#define VTHFA5         0.405
+#define VTHFA6         0.452
+#define VSINV          0.452
+#define VTHCOMPINV     0.437
+#define VTHMUXNAND     0.548  // TODO : this constant must be revisited
+#define VTHEVALINV     0.452
+#define VTHSENSEEXTDRV 0.438
+
+
+//WmuxdrvNANDn and WmuxdrvNANDp are no longer being used but it's part of the old
+//delay_comparator function which we are using exactly as it used to be, so just setting these to 0
+const double WmuxdrvNANDn = 0;
+const double WmuxdrvNANDp = 0;
+
+
+/*===================================================================*/
+/*
+ * The following are things you probably wouldn't want to change.
+ */
+
+#define BIGNUM 1e30
+#define INF 9999999  // an integer literal, not a floating-point infinity
+#define MAX(a,b) (((a)>(b))?(a):(b))  // function-like macro: the selected argument is evaluated twice, so pass side-effect-free expressions
+#define MIN(a,b) (((a)<(b))?(a):(b))  // same double-evaluation caveat as MAX
+
+/* Used to communicate with the horowitz model */
+#define RISE 1
+#define FALL 0
+#define NCH  1
+#define PCH  0
+
+
+#define EPSILON 0.5 //v4.1: This constant is being used in order to fix floating point -> integer
+//conversion problems that were occurring within CACTI. Typical problem that was occurring was
+//that with different compilers a floating point number like 3.0 would get represented as either
+//2.9999....or 3.00000001 and then the integer part of the floating point number (3.0) would
+//be computed differently depending on the compiler. What we are doing now is to replace
+//int (x) with (int) (x+EPSILON) where EPSILON is 0.5. This would fix such problems. Note that
+//this works only when x is an integer >= 0.
+/*
+ * Sheng thinks this is more a solution to solve the simple truncate problem
+ * (http://www.cs.tut.fi/~jkorpela/round.html) rather than the problem mentioned above.
+ * Unfortunately, this solution causes nasty bugs (different results when using O0 and O3).
+ * Moreover, round is not correct in CACTI since when an extra fraction of bit/line is needed,
+ * we need to provide a complete bit/line even if the fraction is just 0.01.
+ * So, in versions later than 6.5 we use (int)ceil() for double to int conversion.
+ */
+
+#define EPSILON2 0.1
+#define EPSILON3 0.6
+
+
+#define MINSUBARRAYROWS 16 //For simplicity in modeling, for the row decoding structure, we assume
+//that each row predecode block is composed of at least one 2-4 decoder. When the outputs from the
+//row predecode blocks are combined this means that there are at least 4*4=16 row decode outputs
+#define MAXSUBARRAYROWS 262144 //Each row predecode block produces a max of 2^9 outputs. So
+//the maximum number of row decode outputs will be 2^9*2^9
+#define MINSUBARRAYCOLS 2
+#define MAXSUBARRAYCOLS 262144
+
+
+#define INV 0
+#define NOR 1
+#define NAND 2
+
+
+#define NUMBER_TECH_FLAVORS 4
+
+#define NUMBER_INTERCONNECT_PROJECTION_TYPES 2 //aggressive and conservative
+//0 = Aggressive projections, 1 = Conservative projections
+#define NUMBER_WIRE_TYPES 4 //local, semi-global and global
+//1 = 'Semi-global' wire type, 2 = 'Global' wire type
+
+
+const int dram_cell_tech_flavor = 3;
+
+
+#define VBITSENSEMIN 0.08 //minimum bitline sense voltage is fixed to be 80 mV.
+
+#define fopt 4.0
+
+#define INPUT_WIRE_TO_INPUT_GATE_CAP_RATIO 0
+#define BUFFER_SEPARATION_LENGTH_MULTIPLIER 1
+#define NUMBER_MATS_PER_REDUNDANT_MAT 8
+
+#define NUMBER_STACKED_DIE_LAYERS 1
+
+// this variable can be set to carry out solution optimization for
+// a maximum area allocation.
+#define STACKED_DIE_LAYER_ALLOTED_AREA_mm2 0 //6.24 //6.21//71.5
+
+// this variable can also be employed when solution optimization
+// with maximum area allocation is carried out.
+#define MAX_PERCENT_AWAY_FROM_ALLOTED_AREA 50
+
+// this variable can also be employed when solution optimization
+// with maximum area allocation is carried out.
+#define MIN_AREA_EFFICIENCY 20
+
+// this variable can be employed when solution with a desired
+// aspect ratio is required.
+#define STACKED_DIE_LAYER_ASPECT_RATIO 1
+
+// this variable can be employed when solution with a desired
+// aspect ratio is required.
+#define MAX_PERCENT_AWAY_FROM_ASPECT_RATIO 101
+
+// this variable can be employed to carry out solution optimization
+// for a certain target random cycle time.
+#define TARGET_CYCLE_TIME_ns 1000000000
+
+#define NUMBER_PIPELINE_STAGES 4
+
+// this can be used to model the length of interconnect
+// between a bank and a crossbar
+#define LENGTH_INTERCONNECT_FROM_BANK_TO_CROSSBAR 0 //3791 // 2880//micron
+
+#define IS_CROSSBAR 0
+#define NUMBER_INPUT_PORTS_CROSSBAR 8
+#define NUMBER_OUTPUT_PORTS_CROSSBAR 8
+#define NUMBER_SIGNALS_PER_PORT_CROSSBAR 256
+
+
+#define MAT_LEAKAGE_REDUCTION_DUE_TO_SLEEP_TRANSISTORS_FACTOR 1
+#define LEAKAGE_REDUCTION_DUE_TO_LONG_CHANNEL_HP_TRANSISTORS_FACTOR 1
+
+#define PAGE_MODE 0
+
+#define MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA 60
+// We are actually not using this variable in the CACTI code. We just want to acknowledge that
+// this current should be multiplied by the DDR(n) system VDD value to compute the standby power
+// consumed during precharge.
+
+
+const double VDD_STORAGE_LOSS_FRACTION_WORST = 0.125;
+const double CU_RESISTIVITY = 0.022; //ohm-micron
+const double BULK_CU_RESISTIVITY = 0.018; //ohm-micron
+const double PERMITTIVITY_FREE_SPACE = 8.854e-18; //F/micron
+
+const static uint32_t sram_num_cells_wl_stitching_ = 16;
+const static uint32_t dram_num_cells_wl_stitching_ = 64;
+const static uint32_t comm_dram_num_cells_wl_stitching_ = 256;
+const static double num_bits_per_ecc_b_          = 8.0;
+
+const double    bit_to_byte  = 8.0;
+
+#define MAX_NUMBER_GATES_STAGE 20
+#define MAX_NUMBER_HTREE_NODES 20
+#define NAND2_LEAK_STACK_FACTOR 0.2
+#define NAND3_LEAK_STACK_FACTOR 0.2
+#define NOR2_LEAK_STACK_FACTOR 0.2
+#define INV_LEAK_STACK_FACTOR  0.5
+#define MAX_NUMBER_ARRAY_PARTITIONS 1000000
+
+// abbreviations used in this project
+// ----------------------------------
+//
+//  num  : number
+//  rw   : read/write
+//  rd   : read
+//  wr   : write
+//  se   : single-ended
+//  sz   : size
+//  F    : feature
+//  w    : width
+//  h    : height or horizontal
+//  v    : vertical or velocity
+
+
+enum ram_cell_tech_type_num  // plain (unscoped) enum: enumerators convert implicitly to int
+{
+  itrs_hp   = 0,
+  itrs_lstp = 1,
+  itrs_lop  = 2,
+  lp_dram   = 3,  // same value as dram_cell_tech_flavor above
+  comm_dram = 4   // NOTE(review): equals NUMBER_TECH_FLAVORS (4), one past the last index of an array sized by it - confirm usage
+};
+// 4-entry vectors of 0/1 weights; pppm_lkg is the element-wise sum of pppm_Isub and pppm_Ig.
+const double pppm[4]      = {1,1,1,1};
+const double pppm_lkg[4]  = {0,1,1,0};
+const double pppm_dyn[4]  = {1,0,0,0};
+const double pppm_Isub[4] = {0,1,0,0};
+const double pppm_Ig[4]   = {0,0,1,0};
+const double pppm_sc[4]   = {0,0,0,1};
+
+
+
+#endif
diff --git a/src/gpuwattch/cacti/contention.dat b/src/gpuwattch/cacti/contention.dat
new file mode 100755
index 000000000..826553e7e
--- /dev/null
+++ b/src/gpuwattch/cacti/contention.dat
@@ -0,0 +1,126 @@
+l34c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l34c64l2b: 9 11 19 29 43 62 81 102
+l34c64l4b: 6 8 12 17 24 29 39 47
+l34c64l8b: 7 8 10 14 18 22 25 30
+l34c64l16b: 7 7 9 12 14 17 20 24
+l34c64l32b: 7 7 9 12 14 17 20 24 -r
+l34c64l64b: 7 7 9 12 14 17 20 24 -r
+l34c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l34c128l2b: 4 10 19 30 44 64 82 103
+l34c128l4b: 3 6 11 17 24 31 38 47
+l34c128l8b: 3 5 9 13 17 21 25 29
+l34c128l16b: 4 5 7 10 13 16 19 22
+l34c128l32b: 4 5 7 10 13 16 19 22 -r
+l34c128l64b: 4 5 7 10 13 16 19 22 -r
+l34c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l34c256l2b: 3 10 19 30 44 63 82 103
+l34c256l4b: 3 6 11 17 24 31 38 47
+l34c256l8b: 2 5 8 12 16 20 24 29
+l34c256l16b: 2 4 7 9 12 15 18 21
+l34c256l32b: 2 4 7 9 12 15 18 21 -r
+l34c256l64b: 2 4 7 9 12 15 18 21 -r
+l38c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l38c64l2b: 57 59 77 90 137 187 219 245
+l38c64l4b: 35 40 48 56 43 61 80 101
+l38c64l8b: 18 27 41 45 52 58 58 58  -r
+l38c64l16b: 16 17 19 35 40 49 53 53 -r
+l38c64l32b: 15 15 17 19 22 25 30 30 -r
+l38c64l64b: 15 15 17 19 22 25 30 30 -r
+l38c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l38c128l2b: 38 50 78 93 139 188 220 245
+l38c128l4b: 29 37 46 56 43 61 81 102
+l38c128l8b: 16 30 39 44 50 57 57 57 -r
+l38c128l16b: 14 16 19 33 40 47 52 52 -r
+l38c128l32b: 14 15 17 20 23 27 31 31 -r
+l38c128l64b: 14 15 17 20 23 27 31 31 -r
+l38c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l38c256l2b: 35 50 78 94 139 188 220 246 
+l38c256l4b: 28 36 45 55 55 61 81 102
+l38c256l8b: 17 30 38 43 50 57 57 57 -r
+l38c256l16b: 15 17 21 32 40 47 51 51
+l38c256l32b: 15 17 19 21 24 29 33 33
+l38c256l64b: 15 17 19 21 24 29 33 33 -r
+l316c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c64l4b: 34 35 78 126 178 220 252 274
+l316c64l8b: 9 11 23 43 62 87 105 130
+l316c64l16b: 7 9 13 23 33 45 56 67
+l316c64l32b: 5 6 7 10 13 19 25 30
+l316c64l64b: 4 5 6 8 10 14 18 21
+l316c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c128l2b: 25 131 243 1000 1000 1000 1000 1000
+l316c128l4b: 8 28 79 127 179 221 253 274
+l316c128l8b: 4 9 22 43 62 88 106 131
+l316c128l16b: 4 6 11 21 32 44 55 67
+l316c128l32b: 4 6 11 12 12 18 24 29
+l316c128l64b: 2 3 5 7 9 13 17 21
+l316c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l316c256l4b: 5 28 80 128 180 221 253 274
+l316c256l8b: 3 8 22 43 63 88 107 131
+l316c256l16b: 2 5 11 21 32 44 55 67
+l316c256l32b: 2 3 5 8 12 18 24 29
+l316c256l64b: 2 3 4 6 9 13 17 21
+l24c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c64l2b: 10 12 24 41 60 86 105 122
+l24c64l4b: 5 7 13 20 29 38 47 56
+l24c64l8b: 5 6 9 14 18 24 29 35
+l24c64l16b: 4 5 7 10 12 16 19 22
+l24c64l32b: 5 5 6 8 10 12 14 17
+l24c64l64b: 5 5 6 8 10 12 14 16
+l24c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c128l4b: 3 7 13 20 29 38 47 57
+l24c128l8b: 3 5 9 13 18 23 29 35
+l24c128l16b: 3 4 6 9 12 15 19 22
+l24c128l32b: 3 4 5 7 9 11 14 16
+l24c128l64b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l24c256l4b: 2 6 13 20 29 38 47 57
+l24c256l8b: 2 4 8 13 18 23 28 35
+l24c256l16b: 2 3 6 8 11 15 18 22
+l24c256l32b: 2 3 5 6 8 11 14 16
+l24c256l64b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c64l2b: 46 52 117 157 188 225 246 261
+l28c64l4b: 19 25 39 54 96 107 120 150
+l28c64l8b: 9 12 21 30 39 47 58 79
+l28c64l16b: 8 9 11 16 25 32 37 42
+l28c64l32b: 7 8 9 11 14 19 23 28
+l28c64l64b: 7 7 8 10 12 14 18 22 
+l28c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c128l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c128l4b: 12 22 39 54 98 108 130 151
+l28c128l8b: 7 12 21 30 39 48 59 80
+l28c128l16b: 6 8 11 16 24 31 37 42
+l28c128l32b: 6 7 9 11 14 19 24 28
+l28c128l64b: 6 7 9 11 14 19 24 28
+l28c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l28c256l4b: 12 22 39 54 100 108 130 152
+l28c256l8b: 7 12 21 30 39 48 59 81
+l28c256l16b: 6 8 11 16 24 31 37 42
+l28c256l32b: 6 7 9 11 14 19 24 28
+l28c256l64b: 6 7 9 11 14 19 24 28
+l216c64l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c64l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c64l4b: 34 35 78 126 178 220 252 274
+l216c64l8b: 9 11 23 43 62 87 105 130
+l216c64l16b: 7 9 13 23 33 45 56 67
+l216c64l32b: 5 6 7 10 13 19 25 30
+l216c64l64b: 4 5 6 8 10 14 18 21
+l216c128l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c128l2b: 25 131 243 1000 1000 1000 1000 1000
+l216c128l4b: 8 28 79 127 179 221 253 274
+l216c128l8b: 4 9 22 43 62 88 106 131
+l216c128l16b: 4 6 11 21 32 44 55 67
+l216c128l32b: 4 6 11 12 12 18 24 29
+l216c128l64b: 2 3 5 7 9 13 17 21
+l216c256l1b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c256l2b: 1000 1000 1000 1000 1000 1000 1000 1000
+l216c256l4b: 5 28 80 128 180 221 253 274
+l216c256l8b: 3 8 22 43 63 88 107 131
+l216c256l16b: 2 5 11 21 32 44 55 67
+l216c256l32b: 2 3 5 8 12 18 24 29
+l216c256l64b: 2 3 4 6 9 13 17 21
diff --git a/src/gpuwattch/cacti/crossbar.cc b/src/gpuwattch/cacti/crossbar.cc
new file mode 100644
index 000000000..d7386a8e2
--- /dev/null
+++ b/src/gpuwattch/cacti/crossbar.cc
@@ -0,0 +1,161 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#include "crossbar.h"
+
+#define ASPECT_THRESHOLD .8
+#define ADJ 1
+
+// Sets up a crossbar model with n_inp_ inputs, n_out_ outputs and flit_size_-bit flits,
+// taking supply voltage and P/N drive ratio from the device type dt.
+Crossbar::Crossbar(double n_inp_, double n_out_, double flit_size_,
+                   TechnologyParameter::DeviceType *dt)
+  : n_inp(n_inp_), n_out(n_out_), flit_size(flit_size_), deviceType(dt)
+{
+  // Start from an unadjusted cross-point cell; compute_power() may raise CB_ADJ.
+  CB_ADJ = 1;
+  Vdd = deviceType->Vdd;
+  min_w_pmos = g_tp.min_w_nmos_ * deviceType->n_to_p_eff_curr_drv_ratio;
+}
+
+Crossbar::~Crossbar(){}  // empty body: no explicit cleanup is performed here
+
+double Crossbar::output_buffer()
+{
+  // Sizes the two stages of one cross-point tri-state buffer (TriS1, TriS2), stores its
+  // capacitances in the tri_*_cap members and returns input + output + control capacitance.
+  double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
+  Wire w1(g_ip->wt, l_eff);
+  // s1: repeater size of w1, scaled by l_eff*ADJ/repeater_spacing when l_eff is below one spacing, else by ADJ.
+  double s1 = w1.repeater_size * (l_eff <w1.repeater_spacing?  l_eff *ADJ/w1.repeater_spacing : ADJ);
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // the model assumes input capacitance of the wire driver = input capacitance of nand + nor = input cap of the driver transistor
+  TriS1 = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
+  TriS2 = s1; //driver transistor
+  // Clamp the first-stage size factor to at least 1.
+  if (TriS1 < 1)
+    TriS1 = 1;
+  // input_cap: gate capacitance for total widths (2*pmos + nmos) and (pmos + 2*nmos), both scaled by TriS1.
+  double input_cap = gate_C(TriS1*(2*min_w_pmos + g_tp.min_w_nmos_), 0) +
+    gate_C(TriS1*(min_w_pmos + 2*g_tp.min_w_nmos_), 0);
+  // tri_int_cap sums six terms: TriS1-scaled drain capacitances (one NCH and two PCH,
+  // then two NCH and one PCH) plus the gate capacitance of the TriS2-scaled
+  // NMOS and PMOS driver transistors.  It is kept out of the returned value and
+  // only enters the dynamic-energy sum in compute_power().
+  // NOTE(review): the "*2" NCH drain term below is sized with min_w_pmos while the other
+  // NCH term uses g_tp.min_w_nmos_ -- possibly a copy/paste slip; confirm before changing.
+  tri_int_cap = drain_C_(TriS1*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
+    gate_C(TriS2*g_tp.min_w_nmos_, 0)+
+    drain_C_(TriS1*min_w_pmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
+    drain_C_(TriS1*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    gate_C(TriS2*min_w_pmos, 0);
+  double output_cap = drain_C_(TriS2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(TriS2*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def);
+  double ctr_cap = gate_C(TriS2 *(min_w_pmos + g_tp.min_w_nmos_), 0);
+  // Publish the three returned components through the public members as well.
+  tri_inp_cap = input_cap;
+  tri_out_cap = output_cap;
+  tri_ctr_cap = ctr_cap;
+  return input_cap + output_cap + ctr_cap;
+}
+
+void Crossbar::compute_power()
+{
+  // Computes area.w/area.h, power.readOp.{dynamic,leakage,gate_leakage} and delay for the crossbar.
+  Wire winit(4, 4);  // not referenced again in this function; any effect comes from its constructor
+  double tri_cap = output_buffer();
+  assert(tri_cap > 0);
+  //area of a tristate logic
+  double g_area = compute_gate_area(INV, 1, TriS2*g_tp.min_w_nmos_, TriS2*min_w_pmos, g_tp.cell_h_def);
+  g_area *= 2; // to model area of output transistors
+  g_area += compute_gate_area (NAND, 2, TriS1*2*g_tp.min_w_nmos_, TriS1*min_w_pmos, g_tp.cell_h_def);
+  g_area += compute_gate_area (NOR, 2, TriS1*g_tp.min_w_nmos_, TriS1*2*min_w_pmos, g_tp.cell_h_def);
+  double width /*per tristate*/ = g_area/(CB_ADJ * g_tp.cell_h_def);
+  // effective no. of tristate buffers that need to be laid side by side
+  int ntri = (int)ceil(g_tp.cell_h_def/(g_tp.wire_outside_mat.pitch));
+  double wire_len = MAX(width*ntri*n_out, flit_size*g_tp.wire_outside_mat.pitch*n_out);
+  Wire w1(g_ip->wt, wire_len);
+
+  area.w = wire_len;
+  area.h = g_tp.wire_outside_mat.pitch*n_inp*flit_size * CB_ADJ;
+  Wire w2(g_ip->wt, area.h);
+
+  double aspect_ratio_cb = (area.h/area.w)*(n_out/n_inp);
+  if (aspect_ratio_cb > 1) aspect_ratio_cb = 1/aspect_ratio_cb;
+  // While the folded ratio is below ASPECT_THRESHOLD (and both port counts exceed 2), raise CB_ADJ by 0.2 and recurse until it reaches 4.
+  if (aspect_ratio_cb < ASPECT_THRESHOLD) {
+    if (n_out > 2 && n_inp > 2) {
+      CB_ADJ+=0.2;
+      //cout << "CB ADJ " << CB_ADJ << endl;
+      if (CB_ADJ < 4) {
+        this->compute_power();
+      }
+    }
+  }
+  // NOTE(review): there is no return after the recursive call above, so this (outer)
+  // invocation still runs the code below with its own w1/w2 while area.w/area.h hold the
+  // values written by the innermost call -- confirm this mix is intended.
+  power.readOp.dynamic = (w1.power.readOp.dynamic + w2.power.readOp.dynamic + (tri_inp_cap * n_out + tri_out_cap * n_inp + tri_ctr_cap + tri_int_cap) * Vdd*Vdd)*flit_size;
+  power.readOp.leakage      =  n_inp * n_out * flit_size * (
+    cmos_Isub_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
+	cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
+	cmos_Isub_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
+    w1.power.readOp.leakage + w2.power.readOp.leakage);
+  power.readOp.gate_leakage = n_inp * n_out * flit_size * (
+	  cmos_Ig_leakage(g_tp.min_w_nmos_*TriS2*2, min_w_pmos*TriS2*2, 1, inv) *Vdd+
+	  cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nand)*Vdd+
+	  cmos_Ig_leakage(g_tp.min_w_nmos_*TriS1*3, min_w_pmos*TriS1*3, 2, nor) *Vdd+
+	  w1.power.readOp.gate_leakage + w2.power.readOp.gate_leakage);
+
+  // delay calculation
+  double l_eff = n_inp*flit_size*g_tp.wire_outside_mat.pitch;
+  Wire wdriver(g_ip->wt, l_eff);
+  double res = g_tp.wire_outside_mat.R_per_um * (area.w+area.h) + tr_R_on(g_tp.min_w_nmos_*wdriver.repeater_size, NCH, 1);
+  double cap = g_tp.wire_outside_mat.C_per_um * (area.w + area.h) + n_out*tri_inp_cap + n_inp*tri_out_cap;
+  delay = horowitz(w1.signal_rise_time(), res*cap, deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+  // NOTE(review): the next line declares a function named wreset returning Wire; it constructs no object.
+  Wire wreset();
+}
+// Writes the crossbar's dimensions, flit size, power figures and delay to cout.
+void Crossbar::print_crossbar()
+{
+  cout << "\nCrossbar Stats (" << n_inp << "x" << n_out << ")\n\n"
+       << "Flit size        : " << flit_size << " bits" << endl
+       << "Width            : " << area.w << " u" << endl
+       << "Height           : " << area.h << " u" << endl
+       << "Dynamic Power    : " << power.readOp.dynamic*1e9 * MIN(n_inp, n_out) << " (nJ)" << endl
+       << "Leakage Power    : " << power.readOp.leakage*1e3 << " (mW)" << endl
+       << "Gate Leakage Power    : " << power.readOp.gate_leakage*1e3 << " (mW)" << endl
+       << "Crossbar Delay   : " << delay*1e12 << " ps\n";
+}
+
+
diff --git a/src/gpuwattch/cacti/crossbar.h b/src/gpuwattch/cacti/crossbar.h
new file mode 100644
index 000000000..47339c091
--- /dev/null
+++ b/src/gpuwattch/cacti/crossbar.h
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#ifndef __CROSSBAR__
+#define __CROSSBAR__
+
+#include <assert.h>
+#include <iostream>
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "parameter.h"
+#include "mat.h"
+#include "wire.h"
+
+class Crossbar : public Component
+{
+  public:
+    Crossbar(
+      double in,
+      double out,
+      double flit_sz,
+      TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Crossbar();
+
+    void print_crossbar();   // prints dimensions, flit size, power and delay to cout
+    double output_buffer();  // sizes one tri-state buffer, sets tri_*_cap, returns input+output+control cap
+    void compute_power();    // fills area, power.readOp.* and delay
+
+    double n_inp, n_out;     // number of crossbar inputs and outputs
+    double flit_size;        // flit width (printed in bits by print_crossbar)
+    double tri_inp_cap, tri_out_cap, tri_ctr_cap, tri_int_cap;  // written by output_buffer()
+
+  private:
+	  double CB_ADJ;
+	  /*
+	   * Adjustment factor for the height of the cross-point (tri-state buffer) cell layout.
+	   * It is tuned so that the aspect ratio of the whole crossbar gets close to one.
+	   * Adjusting it does not change the number of wires routed over the tri-state buffers,
+	   * but it does change the effective wiring pitch: as CB_ADJ grows, each tri-state
+	   * buffer becomes taller and thinner and the effective wiring pitch increases,
+	   * so the height of the crossbar (area.h) increases.
+	   */
+
+	TechnologyParameter::DeviceType *deviceType;
+    double TriS1, TriS2;     // size factors set by output_buffer(): first stage and driver stage
+    double min_w_pmos, Vdd;  // both derived from deviceType in the constructor
+
+};
+
+
+
+
+#endif
diff --git a/src/gpuwattch/cacti/decoder.cc b/src/gpuwattch/cacti/decoder.cc
new file mode 100644
index 000000000..32644d9cd
--- /dev/null
+++ b/src/gpuwattch/cacti/decoder.cc
@@ -0,0 +1,1576 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include "area.h"
+#include "decoder.h"
+#include "parameter.h"
+#include <iostream>
+#include <math.h>
+#include <assert.h>
+
+using namespace std;
+
+
+Decoder::Decoder(
+    int    _num_dec_signals,
+    bool   flag_way_select,
+    double _C_ld_dec_out,
+    double _R_wire_dec_out,
+    bool   fully_assoc_,
+    bool   is_dram_,
+    bool   is_wl_tr_,
+    const  Area & cell_)
+:exist(false),
+  C_ld_dec_out(_C_ld_dec_out),
+  R_wire_dec_out(_R_wire_dec_out),
+  num_gates(0), num_gates_min(2),
+  delay(0),
+  //power(),
+  fully_assoc(fully_assoc_), is_dram(is_dram_),
+  is_wl_tr(is_wl_tr_), cell(cell_)
+{
+
+  // Zero the per-stage width tables.  compute_widths() later sets entry 0 and hands
+  // both tables to logical_effort().
+  for (int stage = 0; stage < MAX_NUMBER_GATES_STAGE; ++stage)
+  {
+    w_dec_p[stage] = 0;
+    w_dec_n[stage] = 0;
+  }
+
+  /*
+   * _num_dec_signals is the number of decoded output signals;
+   * num_addr_bits_dec is the number of address bits the decoder
+   * receives as input.
+   */
+  const int  num_addr_bits_dec = _log2(_num_dec_signals);
+  const bool wide_decode       = (num_addr_bits_dec >= 4);
+
+  // The decoder is modelled when there are at least 4 address bits to decode,
+  // or when flag_way_select is set; otherwise exist keeps the value
+  // (false) given in the initializer list.
+  if (wide_decode || flag_way_select)
+  {
+    exist = true;
+  }
+
+  // Number of inputs of the first gate:
+  //   >= 4 address bits : 3 with way select, 2 without
+  //   <  4 address bits : 2 with way select, 0 without
+  if (wide_decode)
+  {
+    num_in_signals = flag_way_select ? 3 : 2;
+  }
+  else
+  {
+    num_in_signals = flag_way_select ? 2 : 0;
+  }
+
+  // Both cell dimensions must be positive before any sizing is attempted.
+  assert(cell.h>0);
+  assert(cell.w>0);
+
+  // The row-decoder-driver cell height is g_tp.h_dec times the memory cell height
+  // (an earlier version fixed it at 4 * cell.h).
+  area.h = g_tp.h_dec * cell.h;
+
+  // Size the gate chain first, then derive area.w and leakage from those sizes.
+  compute_widths();
+  compute_area();
+}
+
+
+
+void Decoder::compute_widths()
+{
+  // Gate-effort terms for a 2-input and a 3-input NAND; passed on to logical_effort().
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram, is_wl_tr);
+  double gnand2 = (2 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);
+  double gnand3 = (3 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);
+
+  // Nothing to size when the decoder is not modelled.
+  if (!exist)
+  {
+    return;
+  }
+
+  // The first gate is sized as a NAND2 for two inputs or a fully-associative
+  // array, and as a NAND3 otherwise.
+  const bool first_gate_is_nand2 = (num_in_signals == 2 || fully_assoc);
+  w_dec_n[0] = (first_gate_is_nand2 ? 2 : 3) * g_tp.min_w_nmos_;
+  w_dec_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+
+  // F = gate effort * (decoder output load / input capacitance of the first gate).
+  const double c_in_first_gate = gate_C(w_dec_n[0], 0, is_dram, false, is_wl_tr) +
+                                 gate_C(w_dec_p[0], 0, is_dram, false, is_wl_tr);
+  double F = first_gate_is_nand2 ? gnand2 : gnand3;
+  F *= C_ld_dec_out / c_in_first_gate;
+  // The gate-effort argument below is chosen from num_in_signals alone (fully_assoc is not consulted).
+  num_gates = logical_effort(
+      num_gates_min,
+      (num_in_signals == 2) ? gnand2 : gnand3,
+      F,
+      w_dec_n,
+      w_dec_p,
+      C_ld_dec_out,
+      p_to_n_sz_ratio,
+      is_dram,
+      is_wl_tr,
+      g_tp.max_w_nmos_dec);
+}
+
+// Computes the decoder's total gate area (stored as area.w for the fixed area.h) and its
+// Isub and gate leakage power (stored in power.readOp) from the sized gate chain.
+void Decoder::compute_area()
+{
+  double cumulative_area = 0;
+  double cumulative_curr = 0;  // cumulative Isub leakage current
+  double cumulative_curr_Ig = 0;  // cumulative gate leakage current
+
+  if (exist)
+  { // First check if this decoder exists
+    if (num_in_signals == 2)
+    {
+      cumulative_area = compute_gate_area(NAND, 2, w_dec_p[0], w_dec_n[0], area.h);
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+    }
+    else if (num_in_signals == 3)
+    {
+      cumulative_area = compute_gate_area(NAND, 3, w_dec_p[0], w_dec_n[0], area.h);
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+    }
+    // Remaining stages are inverters; all three totals accumulate (Ig was overwritten before).
+    for (int i = 1; i < num_gates; i++)
+    {
+      cumulative_area += compute_gate_area(INV, 1, w_dec_p[i], w_dec_n[i], area.h);
+      cumulative_curr += cmos_Isub_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);
+      cumulative_curr_Ig += cmos_Ig_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);
+    }
+    power.readOp.leakage = cumulative_curr * g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage = cumulative_curr_Ig * g_tp.peri_global.Vdd;
+
+    area.w = (cumulative_area / area.h);
+  }
+}
+
+// Adds the delay and dynamic read energy of every gate in the decoder chain to the delay and
+// power.readOp.dynamic members; returns last-stage delay / (1 - 0.5), or 0.0 when !exist.
+double Decoder::compute_delays(double inrisetime)
+{
+  if (exist)
+  {
+    double ret_val = 0;  // outrisetime
+    int    i;
+    double rd, tf, this_delay, c_load, c_intrinsic, Vpp;
+    double Vdd = g_tp.peri_global.Vdd;
+    // Vpp is the voltage applied to the final output load: g_tp.vpp, sram_cell.Vdd or peri_global.Vdd.
+    if ((is_wl_tr) && (is_dram))
+    {
+      Vpp = g_tp.vpp;
+    }
+    else if (is_wl_tr)
+    {
+      Vpp = g_tp.sram_cell.Vdd;
+    }
+    else
+    {
+      Vpp = g_tp.peri_global.Vdd;
+    }
+
+    // first stage: the gate with num_in_signals inputs (index 0), loaded by the gate at index 1
+    rd = tr_R_on(w_dec_n[0], NCH, num_in_signals, is_dram, false, is_wl_tr);
+    c_load = gate_C(w_dec_n[1] + w_dec_p[1], 0.0, is_dram, false, is_wl_tr);
+    c_intrinsic = drain_C_(w_dec_p[0], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) * num_in_signals +
+                  drain_C_(w_dec_n[0], NCH, num_in_signals, 1, area.h, is_dram, false, is_wl_tr);
+    tf = rd * (c_intrinsic + c_load);
+    this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+    delay += this_delay;
+    inrisetime = this_delay / (1.0 - 0.5);
+    power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
+    // intermediate stages 1 .. num_gates-2, each loaded by the next stage's gate capacitance
+    for (i = 1; i < num_gates - 1; ++i)
+    {
+      rd = tr_R_on(w_dec_n[i], NCH, 1, is_dram, false, is_wl_tr);
+      c_load = gate_C(w_dec_p[i+1] + w_dec_n[i+1], 0.0, is_dram, false, is_wl_tr);
+      c_intrinsic = drain_C_(w_dec_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
+                    drain_C_(w_dec_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
+      tf = rd * (c_intrinsic + c_load);
+      this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+      delay += this_delay;
+      inrisetime = this_delay / (1.0 - 0.5);
+      power.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
+    }
+
+    // add delay of final inverter that drives the wordline
+    i = num_gates - 1;
+    c_load = C_ld_dec_out;
+    rd = tr_R_on(w_dec_n[i], NCH, 1, is_dram, false, is_wl_tr);
+    c_intrinsic = drain_C_(w_dec_p[i], PCH, 1, 1, area.h, is_dram, false, is_wl_tr) +
+                  drain_C_(w_dec_n[i], NCH, 1, 1, area.h, is_dram, false, is_wl_tr);
+    tf = rd * (c_intrinsic + c_load) + R_wire_dec_out * c_load / 2;
+    this_delay = horowitz(inrisetime, tf, 0.5, 0.5, RISE);
+    delay  += this_delay;
+    ret_val = this_delay / (1.0 - 0.5);
+    power.readOp.dynamic += c_load * Vpp * Vpp + c_intrinsic * Vdd * Vdd;
+    // NOTE(review): delay and power.readOp.dynamic are accumulated with +=, so a second call adds to them again.
+    return ret_val;
+  }
+  else
+  {
+    return 0.0;
+  }
+}
+// Recomputes power.readOp.leakage and power.readOp.gate_leakage from the current gate widths.
+void Decoder::leakage_feedback(double temperature)
+{
+  double cumulative_curr = 0;  // cumulative Isub leakage current
+  double cumulative_curr_Ig = 0;  // cumulative gate leakage current
+  // NOTE: the temperature argument is not read anywhere in this function body.
+  if (exist)
+  { // First check if this decoder exists
+    if (num_in_signals == 2)
+    {
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 2, nand,is_dram);
+    }
+    else if (num_in_signals == 3)
+    {
+      cumulative_curr = cmos_Isub_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+      cumulative_curr_Ig = cmos_Ig_leakage(w_dec_n[0], w_dec_p[0], 3, nand, is_dram);
+    }
+    // Remaining stages are inverters; both totals accumulate (Ig was overwritten before).
+    for (int i = 1; i < num_gates; i++)
+    {
+      cumulative_curr += cmos_Isub_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);
+      cumulative_curr_Ig += cmos_Ig_leakage(w_dec_n[i], w_dec_p[i], 1, inv, is_dram);
+    }
+
+    power.readOp.leakage = cumulative_curr * g_tp.peri_global.Vdd;
+    power.readOp.gate_leakage = cumulative_curr_Ig * g_tp.peri_global.Vdd;
+  }
+}
+
+PredecBlk::PredecBlk(
+    int    num_dec_signals,
+    Decoder * dec_,
+    double C_wire_predec_blk_out,
+    double R_wire_predec_blk_out_,
+    int    num_dec_per_predec,
+    bool   is_dram,
+    bool   is_blk1)
+ :dec(dec_),
+  exist(false),
+  number_input_addr_bits(0),
+  C_ld_predec_blk_out(0),
+  R_wire_predec_blk_out(0),
+  branch_effort_nand2_gate_output(1),
+  branch_effort_nand3_gate_output(1),
+  flag_two_unique_paths(false),
+  flag_L2_gate(0),
+  number_inputs_L1_gate(0),
+  number_gates_L1_nand2_path(0),
+  number_gates_L1_nand3_path(0),
+  number_gates_L2(0),
+  min_number_gates_L1(2),
+  min_number_gates_L2(2),
+  num_L1_active_nand2_path(0),
+  num_L1_active_nand3_path(0),
+  delay_nand2_path(0),
+  delay_nand3_path(0),
+  power_nand2_path(),
+  power_nand3_path(),
+  power_L2(),
+  is_dram_(is_dram)
+{
+  // Decide whether this predecode block exists and what load it drives, then size it.
+  const int num_addr_bits_dec = _log2(num_dec_signals);
+  // Block 1 is given (bits + 1) / 2 of the address bits and block 2 the remainder.
+  const int blk1_num_input_addr_bits = (num_addr_bits_dec + 1) / 2;
+  const int blk2_num_input_addr_bits = num_addr_bits_dec - blk1_num_input_addr_bits;
+
+  // Only element 0 of each first-level width table is cleared here.
+  w_L1_nand3_p[0] = 0;
+  w_L1_nand3_n[0] = 0;
+  w_L1_nand2_p[0] = 0;
+  w_L1_nand2_n[0] = 0;
+
+  // Block 1 with no address bits to decode: exist stays false and neither
+  // compute_widths() nor compute_area() is run.
+  if (is_blk1 && num_addr_bits_dec <= 0)
+  {
+    return;
+  }
+
+  if (num_addr_bits_dec >= 4)
+  {
+    // Both predecode blocks exist and drive the decoder gates.  This block decodes
+    // its own share of the address bits; its output load is the wire capacitance plus
+    // 2^(bits handled by the other block) times the decoder gate load.
+    const int own_bits   = is_blk1 ? blk1_num_input_addr_bits : blk2_num_input_addr_bits;
+    const int other_bits = is_blk1 ? blk2_num_input_addr_bits : blk1_num_input_addr_bits;
+    const int branch_effort_predec_out = (1 << other_bits);
+    const double C_ld_dec_gate =
+      num_dec_per_predec * gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_, false, false);
+
+    exist = true;
+    number_input_addr_bits = own_bits;
+    R_wire_predec_blk_out = R_wire_predec_blk_out_;
+    C_ld_predec_blk_out = branch_effort_predec_out * C_ld_dec_gate + C_wire_predec_blk_out;
+  }
+  else if (is_blk1)
+  {
+    // Just one predecoder block is required. No decoder required.
+    // The first level of predecoding directly drives the decoder output load,
+    // so the decoder's own wire resistance and load capacitance are used.
+    exist = true;
+    number_input_addr_bits = num_addr_bits_dec;
+    R_wire_predec_blk_out = dec->R_wire_dec_out;
+    C_ld_predec_blk_out = dec->C_ld_dec_out;
+  }
+
+  // Otherwise (block 2 with fewer than 4 address bits) exist stays false.
+
+  // Both calls still run when exist is false; compute_widths() returns early then.
+  compute_widths();
+  compute_area();
+}
+
+
+
+void PredecBlk::compute_widths()
+{
+  double F, c_load_nand3_path, c_load_nand2_path;
+  double p_to_n_sz_ratio = pmos_to_nmos_sz_ratio(is_dram_);
+  double gnand2 = (2 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);
+  double gnand3 = (3 + p_to_n_sz_ratio) / (1 + p_to_n_sz_ratio);
+
+  if (exist == false) return;
+
+
+  switch (number_input_addr_bits)
+  {
+    case 1:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 2;
+      flag_L2_gate                    = 0;
+      break;
+    case 2:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 2;
+      flag_L2_gate                    = 0;
+      break;
+    case 3:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 3;
+      flag_L2_gate                    = 0;
+      break;
+    case 4:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 2;
+      flag_L2_gate                    = 2;
+      branch_effort_nand2_gate_output = 4;
+      break;
+    case 5:
+      flag_two_unique_paths           = true;
+      flag_L2_gate                    = 2;
+      branch_effort_nand2_gate_output = 8;
+      branch_effort_nand3_gate_output = 4;
+      break;
+    case 6:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 3;
+      flag_L2_gate                    = 2;
+      branch_effort_nand3_gate_output = 8;
+      break;
+    case 7:
+      flag_two_unique_paths           = true;
+      flag_L2_gate                    = 3;
+      branch_effort_nand2_gate_output = 32;
+      branch_effort_nand3_gate_output = 16;
+      break;
+    case 8:
+      flag_two_unique_paths           = true;
+      flag_L2_gate                    = 3;
+      branch_effort_nand2_gate_output = 64;
+      branch_effort_nand3_gate_output = 32;
+      break;
+    case 9:
+      flag_two_unique_paths           = false;
+      number_inputs_L1_gate           = 3;
+      flag_L2_gate                    = 3;
+      branch_effort_nand3_gate_output = 64;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+
+  // find the number of gates and sizing in second level of predecoder (if there is a second level)
+  if (flag_L2_gate)
+  {
+    if (flag_L2_gate == 2)
+    { // 2nd level is a NAND2 gate
+      w_L2_n[0] = 2 * g_tp.min_w_nmos_;
+      F = gnand2;
+    }
+    else
+    { // 2nd level is a NAND3 gate
+      w_L2_n[0] = 3 * g_tp.min_w_nmos_;
+      F = gnand3;
+    }
+    w_L2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+    F *= C_ld_predec_blk_out / (gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_));
+    number_gates_L2 = logical_effort(
+        min_number_gates_L2,
+        flag_L2_gate == 2 ? gnand2 : gnand3,
+        F,
+        w_L2_n,
+        w_L2_p,
+        C_ld_predec_blk_out,
+        p_to_n_sz_ratio,
+        is_dram_, false,
+        g_tp.max_w_nmos_);
+
+    // Now find the number of gates and widths in first level of predecoder
+    if ((flag_two_unique_paths)||(number_inputs_L1_gate == 2))
+    { // Whenever flag_two_unique_paths is true, it means first level of decoder employs
+      // both NAND2 and NAND3 gates. Or when number_inputs_L1_gate is 2, it means
+      // a NAND2 gate is used in the first level of the predecoder
+      c_load_nand2_path = branch_effort_nand2_gate_output *
+        (gate_C(w_L2_n[0], 0, is_dram_) +
+         gate_C(w_L2_p[0], 0, is_dram_));
+      w_L1_nand2_n[0] = 2 * g_tp.min_w_nmos_;
+      w_L1_nand2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand2 * c_load_nand2_path /
+        (gate_C(w_L1_nand2_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand2_p[0], 0, is_dram_));
+      number_gates_L1_nand2_path = logical_effort(
+          min_number_gates_L1,
+          gnand2,
+          F,
+          w_L1_nand2_n,
+          w_L1_nand2_p,
+          c_load_nand2_path,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+
+    //Now find widths of gates along path in which first gate is a NAND3
+    if ((flag_two_unique_paths)||(number_inputs_L1_gate == 3))
+    { // Whenever flag_two_unique_paths is TRUE, it means first level of decoder employs
+      // both NAND2 and NAND3 gates. Or when number_inputs_L1_gate is 3, it means
+      // a NAND3 gate is used in the first level of the predecoder
+      c_load_nand3_path = branch_effort_nand3_gate_output *
+        (gate_C(w_L2_n[0], 0, is_dram_) +
+         gate_C(w_L2_p[0], 0, is_dram_));
+      w_L1_nand3_n[0] = 3 * g_tp.min_w_nmos_;
+      w_L1_nand3_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand3 * c_load_nand3_path /
+        (gate_C(w_L1_nand3_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand3_p[0], 0, is_dram_));
+      number_gates_L1_nand3_path = logical_effort(
+          min_number_gates_L1,
+          gnand3,
+          F,
+          w_L1_nand3_n,
+          w_L1_nand3_p,
+          c_load_nand3_path,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+  }
+  else
+  { // find number of gates and widths in first level of predecoder block when there is no second level
+    if (number_inputs_L1_gate == 2)
+    {
+      w_L1_nand2_n[0] = 2 * g_tp.min_w_nmos_;
+      w_L1_nand2_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand2*C_ld_predec_blk_out /
+        (gate_C(w_L1_nand2_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand2_p[0], 0, is_dram_));
+      number_gates_L1_nand2_path = logical_effort(
+          min_number_gates_L1,
+          gnand2,
+          F,
+          w_L1_nand2_n,
+          w_L1_nand2_p,
+          C_ld_predec_blk_out,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+    else if (number_inputs_L1_gate == 3)
+    {
+      w_L1_nand3_n[0] = 3 * g_tp.min_w_nmos_;
+      w_L1_nand3_p[0] = p_to_n_sz_ratio * g_tp.min_w_nmos_;
+      F = gnand3*C_ld_predec_blk_out /
+        (gate_C(w_L1_nand3_n[0], 0, is_dram_) +
+         gate_C(w_L1_nand3_p[0], 0, is_dram_));
+      number_gates_L1_nand3_path = logical_effort(
+          min_number_gates_L1,
+          gnand3,
+          F,
+          w_L1_nand3_n,
+          w_L1_nand3_p,
+          C_ld_predec_blk_out,
+          p_to_n_sz_ratio,
+          is_dram_, false,
+          g_tp.max_w_nmos_);
+    }
+  }
+}
+
+// Computes the area of the predecoder block and the subthreshold/gate leakage of its first-level
+// NAND2 and NAND3 paths and its second-level gates; results go to area and power_*.readOp.
+void PredecBlk::compute_area()
+{
+  if (exist)
+  { // First check whether a predecoder block is needed
+    int num_L1_nand2 = 0;
+    int num_L1_nand3 = 0;
+    int num_L2 = 0;
+    double tot_area_L1_nand3  =0;
+    double leak_L1_nand3      =0;
+    double gate_leak_L1_nand3 =0;
+
+    double tot_area_L1_nand2  = compute_gate_area(NAND, 2, w_L1_nand2_p[0], w_L1_nand2_n[0], g_tp.cell_h_def);
+    double leak_L1_nand2      = cmos_Isub_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    double gate_leak_L1_nand2 = cmos_Ig_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    if (number_inputs_L1_gate != 3) { // NOTE(review): two-unique-path configs also seem to land here, leaving the first NAND3 gate uncounted - confirm
+      tot_area_L1_nand3 = 0;
+      leak_L1_nand3 = 0;
+      gate_leak_L1_nand3 =0;
+    }
+    else { // leading NAND3 gate; is_dram_ is passed like in every other leakage call of this function
+      tot_area_L1_nand3  = compute_gate_area(NAND, 3, w_L1_nand3_p[0], w_L1_nand3_n[0], g_tp.cell_h_def);
+      leak_L1_nand3      = cmos_Isub_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand, is_dram_);
+      gate_leak_L1_nand3 = cmos_Ig_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand, is_dram_);
+    }
+
+    switch (number_input_addr_bits) // gate counts per level and number of active first-level paths
+    {
+      case 1: //2 NAND2 gates
+        num_L1_nand2 = 2;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 2: //4 NAND2 gates
+        num_L1_nand2 = 4;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 3: //8 NAND3 gates
+        num_L1_nand3 = 8;
+        num_L2       = 0;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =1;
+        break;
+      case 4: //4 + 4 NAND2 gates
+        num_L1_nand2 = 8;
+        num_L2       = 16;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =0;
+        break;
+      case 5: //4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 8;
+        num_L2       = 32;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =1;
+        break;
+      case 6: //8 + 8 NAND3 gates
+        num_L1_nand3 = 16;
+        num_L2       = 64;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =2;
+        break;
+      case 7: //4 + 4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 8;
+        num_L1_nand3 = 8;
+        num_L2       = 128;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =1;
+        break;
+      case 8: //4 NAND2 gates, 8 + 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 16;
+        num_L2       = 256;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =2;
+        break;
+      case 9: //8 + 8 + 8 NAND3 gates
+        num_L1_nand3 = 24;
+        num_L2       = 512;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =3;
+        break;
+      default: // any other width: all gate counts stay 0, so the totals below become 0
+        break;
+    }
+
+    for (int i = 1; i < number_gates_L1_nand2_path; ++i)
+    { // stages after the leading NAND2: area uses INV, leakage is evaluated with (2, nand)
+      tot_area_L1_nand2  += compute_gate_area(INV, 1, w_L1_nand2_p[i], w_L1_nand2_n[i], g_tp.cell_h_def);
+      leak_L1_nand2      += cmos_Isub_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+      gate_leak_L1_nand2 += cmos_Ig_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+    }
+    tot_area_L1_nand2  *= num_L1_nand2; // one chain replicated num_L1_nand2 times
+    leak_L1_nand2      *= num_L1_nand2;
+    gate_leak_L1_nand2 *= num_L1_nand2;
+
+    for (int i = 1; i < number_gates_L1_nand3_path; ++i)
+    { // stages after the leading NAND3: area uses INV, leakage is evaluated with (3, nand)
+      tot_area_L1_nand3  += compute_gate_area(INV, 1, w_L1_nand3_p[i], w_L1_nand3_n[i], g_tp.cell_h_def);
+      leak_L1_nand3      += cmos_Isub_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+      gate_leak_L1_nand3 += cmos_Ig_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+    }
+    tot_area_L1_nand3  *= num_L1_nand3; // one chain replicated num_L1_nand3 times
+    leak_L1_nand3      *= num_L1_nand3;
+    gate_leak_L1_nand3 *= num_L1_nand3;
+
+    double cumulative_area_L1 = tot_area_L1_nand2 + tot_area_L1_nand3;
+    double cumulative_area_L2 = 0.0;
+    double leakage_L2         = 0.0;
+    double gate_leakage_L2    = 0.0;
+
+    if (flag_L2_gate == 2)
+    { // second level starts with a NAND2 gate
+      cumulative_area_L2 = compute_gate_area(NAND, 2, w_L2_p[0], w_L2_n[0], g_tp.cell_h_def);
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+    }
+    else if (flag_L2_gate == 3)
+    { // second level starts with a NAND3 gate
+      cumulative_area_L2 = compute_gate_area(NAND, 3, w_L2_p[0], w_L2_n[0], g_tp.cell_h_def);
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+    }
+
+    for (int i = 1; i < number_gates_L2; ++i)
+    { // remaining second-level stages: area uses INV, leakage is evaluated with (2, inv)
+      cumulative_area_L2 += compute_gate_area(INV, 1, w_L2_p[i], w_L2_n[i], g_tp.cell_h_def);
+      leakage_L2         += cmos_Isub_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+      gate_leakage_L2    += cmos_Ig_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+    }
+    cumulative_area_L2 *= num_L2; // one chain replicated num_L2 times
+    leakage_L2         *= num_L2;
+    gate_leakage_L2    *= num_L2;
+
+    power_nand2_path.readOp.leakage = leak_L1_nand2 * g_tp.peri_global.Vdd; // leakage current * Vdd
+    power_nand3_path.readOp.leakage = leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.leakage         = leakage_L2    * g_tp.peri_global.Vdd;
+    area.set_area(cumulative_area_L1 + cumulative_area_L2);
+    power_nand2_path.readOp.gate_leakage = gate_leak_L1_nand2 * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.gate_leakage = gate_leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.gate_leakage         = gate_leakage_L2    * g_tp.peri_global.Vdd;
+  }
+}
+
+// Walks the first-level NAND2/NAND3 chains and the optional second level, adding each stage's
+// horowitz() delay to delay_nand*_path and (c_intrinsic + c_load) * Vdd^2 to the readOp.dynamic terms.
+pair<double, double> PredecBlk::compute_delays(
+    pair<double, double> inrisetime)  // <nand2, nand3> input rise times
+{
+  pair<double, double> ret_val;
+  ret_val.first  = 0;  // outrisetime_nand2_path
+  ret_val.second = 0;  // outrisetime_nand3_path
+
+  double inrisetime_nand2_path = inrisetime.first;
+  double inrisetime_nand3_path = inrisetime.second;
+  int    i;
+  double rd, c_load, c_intrinsic, tf, this_delay;
+  double Vdd = g_tp.peri_global.Vdd;
+
+  // TODO: following delay calculation part can be greatly simplified.
+  // first check whether a predecoder block is required
+  if (exist)
+  {
+    //Find delay in first level of predecoder block
+    //First find delay in the NAND2 path (taken for two-path blocks or 2-input first-level gates)
+    if ((flag_two_unique_paths) || (number_inputs_L1_gate == 2))
+    {
+      //First gate is a NAND2 gate
+      rd = tr_R_on(w_L1_nand2_n[0], NCH, 2, is_dram_);
+      c_load = gate_C(w_L1_nand2_n[1] + w_L1_nand2_p[1], 0.0, is_dram_); // reads stage 1 without checking the chain length
+      c_intrinsic = 2 * drain_C_(w_L1_nand2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                        drain_C_(w_L1_nand2_n[0], NCH, 2, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_load);
+      this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+      delay_nand2_path += this_delay;
+      inrisetime_nand2_path = this_delay / (1.0 - 0.5); // fed to horowitz() for the next stage
+      power_nand2_path.readOp.dynamic += (c_load + c_intrinsic) * Vdd * Vdd;
+
+      //Add delays of all but the last inverter in the chain
+      for (i = 1; i < number_gates_L1_nand2_path - 1; ++i)
+      {
+        rd = tr_R_on(w_L1_nand2_n[i], NCH, 1, is_dram_);
+        c_load = gate_C(w_L1_nand2_n[i+1] + w_L1_nand2_p[i+1], 0.0, is_dram_);
+        c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      //Add delay of the last inverter
+      i = number_gates_L1_nand2_path - 1;
+      rd = tr_R_on(w_L1_nand2_n[i], NCH, 1, is_dram_);
+      if (flag_L2_gate)
+      { //Last first-level inverter drives branch_effort_nand2_gate_output second-level gate inputs
+        c_load = branch_effort_nand2_gate_output*(gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_));
+        c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+      else
+      { //First level directly drives decoder output load
+        c_load = C_ld_predec_blk_out;
+        c_intrinsic = drain_C_(w_L1_nand2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2; // adds the output-wire R*C/2 term
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        ret_val.first = this_delay / (1.0 - 0.5);
+        power_nand2_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+    }
+
+    if ((flag_two_unique_paths) || (number_inputs_L1_gate == 3))
+    { //NAND3 path; note that stage 1 widths are read below without checking the chain length
+      //First gate is a NAND3 gate
+      rd = tr_R_on(w_L1_nand3_n[0], NCH, 3, is_dram_);
+      c_load = gate_C(w_L1_nand3_n[1] + w_L1_nand3_p[1], 0.0, is_dram_);
+      c_intrinsic = 3 * drain_C_(w_L1_nand3_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                        drain_C_(w_L1_nand3_n[0], NCH, 3, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_load);
+      this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+      delay_nand3_path += this_delay;
+      inrisetime_nand3_path = this_delay / (1.0 - 0.5); // fed to horowitz() for the next stage
+      power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+
+      //Add delays of all but the last inverter in the chain
+      for (i = 1; i < number_gates_L1_nand3_path - 1; ++i)
+      {
+        rd = tr_R_on(w_L1_nand3_n[i], NCH, 1, is_dram_);
+        c_load = gate_C(w_L1_nand3_n[i+1] + w_L1_nand3_p[i+1], 0.0, is_dram_);
+        c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      //Add delay of the last inverter
+      i = number_gates_L1_nand3_path - 1;
+      rd = tr_R_on(w_L1_nand3_n[i], NCH, 1, is_dram_);
+      if (flag_L2_gate)
+      { //Last first-level inverter drives branch_effort_nand3_gate_output second-level gate inputs
+        c_load = branch_effort_nand3_gate_output*(gate_C(w_L2_n[0], 0, is_dram_) + gate_C(w_L2_p[0], 0, is_dram_));
+        c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+      else
+      { //First level directly drives decoder output load
+        c_load = C_ld_predec_blk_out;
+        c_intrinsic = drain_C_(w_L1_nand3_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L1_nand3_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2; // adds the output-wire R*C/2 term
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        ret_val.second = this_delay / (1.0 - 0.5);
+        power_nand3_path.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+    }
+
+    // Find delay through second level
+    if (flag_L2_gate)
+    {
+      if (flag_L2_gate == 2)
+      { // NAND2 second-level gate: only the nand2-path delay and rise time are advanced here
+        rd = tr_R_on(w_L2_n[0], NCH, 2, is_dram_);
+        c_load = gate_C(w_L2_n[1] + w_L2_p[1], 0.0, is_dram_);
+        c_intrinsic = 2 * drain_C_(w_L2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                          drain_C_(w_L2_n[0], NCH, 2, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+      else
+      { // flag_L2_gate = 3: only the nand3-path delay and rise time are advanced here
+        rd = tr_R_on(w_L2_n[0], NCH, 3, is_dram_);
+        c_load = gate_C(w_L2_n[1] + w_L2_p[1], 0.0, is_dram_);
+        c_intrinsic = 3 * drain_C_(w_L2_p[0], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                          drain_C_(w_L2_n[0], NCH, 3, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+      }
+
+      for (i = 1; i < number_gates_L2 - 1; ++i)
+      { // intermediate second-level inverters: the same tf is applied to both paths
+        rd = tr_R_on(w_L2_n[i], NCH, 1, is_dram_);
+        c_load = gate_C(w_L2_n[i+1] + w_L2_p[i+1], 0.0, is_dram_);
+        c_intrinsic = drain_C_(w_L2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                      drain_C_(w_L2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+        tf = rd * (c_intrinsic + c_load);
+        this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+        delay_nand2_path += this_delay;
+        inrisetime_nand2_path = this_delay / (1.0 - 0.5);
+        this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+        delay_nand3_path += this_delay;
+        inrisetime_nand3_path = this_delay / (1.0 - 0.5);
+        power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd; // counted once per stage, not once per path
+      }
+
+      //Add delay of final inverter that drives the wordline decoders
+      i = number_gates_L2 - 1;
+      c_load = C_ld_predec_blk_out;
+      rd = tr_R_on(w_L2_n[i], NCH, 1, is_dram_);
+      c_intrinsic = drain_C_(w_L2_p[i], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+                    drain_C_(w_L2_n[i], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+      tf = rd * (c_intrinsic + c_load) + R_wire_predec_blk_out * c_load / 2; // adds the output-wire R*C/2 term
+      this_delay = horowitz(inrisetime_nand2_path, tf, 0.5, 0.5, RISE);
+      delay_nand2_path += this_delay;
+      ret_val.first = this_delay / (1.0 - 0.5);
+      this_delay = horowitz(inrisetime_nand3_path, tf, 0.5, 0.5, RISE);
+      delay_nand3_path += this_delay;
+      ret_val.second = this_delay / (1.0 - 0.5);
+      power_L2.readOp.dynamic += (c_intrinsic + c_load) * Vdd * Vdd;
+    }
+  }
+
+  delay = (ret_val.first > ret_val.second) ? ret_val.first : ret_val.second; // larger of the two returned values, not of delay_nand*_path
+  return ret_val; // <nand2, nand3> output rise times; both stay 0 when the block does not exist
+}
+
+void PredecBlk::leakage_feedback(double temperature)
+{ // Recomputes the leakage terms only (no area); the temperature argument is not read in this body
+  if (exist)
+  { // First check whether a predecoder block is needed
+    int num_L1_nand2 = 0;
+    int num_L1_nand3 = 0;
+    int num_L2 = 0;
+    double leak_L1_nand3      =0;
+    double gate_leak_L1_nand3 =0;
+
+    double leak_L1_nand2      = cmos_Isub_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    double gate_leak_L1_nand2 = cmos_Ig_leakage(w_L1_nand2_n[0], w_L1_nand2_p[0], 2, nand, is_dram_);
+    if (number_inputs_L1_gate != 3) { // NOTE(review): two-unique-path configs also seem to land here, leaving the first NAND3 gate uncounted - confirm
+      leak_L1_nand3 = 0;
+      gate_leak_L1_nand3 =0;
+    }
+    else { // leading NAND3 gate; is_dram_ is passed like in every other leakage call of this function
+      leak_L1_nand3      = cmos_Isub_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand, is_dram_);
+      gate_leak_L1_nand3 = cmos_Ig_leakage(w_L1_nand3_n[0], w_L1_nand3_p[0], 3, nand, is_dram_);
+    }
+
+    switch (number_input_addr_bits) // gate counts per level and number of active first-level paths
+    {
+      case 1: //2 NAND2 gates
+        num_L1_nand2 = 2;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 2: //4 NAND2 gates
+        num_L1_nand2 = 4;
+        num_L2       = 0;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =0;
+        break;
+      case 3: //8 NAND3 gates
+        num_L1_nand3 = 8;
+        num_L2       = 0;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =1;
+        break;
+      case 4: //4 + 4 NAND2 gates
+        num_L1_nand2 = 8;
+        num_L2       = 16;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =0;
+        break;
+      case 5: //4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 8;
+        num_L2       = 32;
+        num_L1_active_nand2_path =1;
+        num_L1_active_nand3_path =1;
+        break;
+      case 6: //8 + 8 NAND3 gates
+        num_L1_nand3 = 16;
+        num_L2       = 64;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =2;
+        break;
+      case 7: //4 + 4 NAND2 gates, 8 NAND3 gates
+        num_L1_nand2 = 8;
+        num_L1_nand3 = 8;
+        num_L2       = 128;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =1;
+        break;
+      case 8: //4 NAND2 gates, 8 + 8 NAND3 gates
+        num_L1_nand2 = 4;
+        num_L1_nand3 = 16;
+        num_L2       = 256;
+        num_L1_active_nand2_path =2;
+        num_L1_active_nand3_path =2;
+        break;
+      case 9: //8 + 8 + 8 NAND3 gates
+        num_L1_nand3 = 24;
+        num_L2       = 512;
+        num_L1_active_nand2_path =0;
+        num_L1_active_nand3_path =3;
+        break;
+      default: // any other width: all gate counts stay 0, so the totals below become 0
+        break;
+    }
+
+    for (int i = 1; i < number_gates_L1_nand2_path; ++i)
+    { // stages after the leading NAND2 are evaluated with (2, nand)
+      leak_L1_nand2      += cmos_Isub_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+      gate_leak_L1_nand2 += cmos_Ig_leakage(w_L1_nand2_n[i], w_L1_nand2_p[i], 2, nand, is_dram_);
+    }
+    leak_L1_nand2      *= num_L1_nand2; // one chain replicated num_L1_nand2 times
+    gate_leak_L1_nand2 *= num_L1_nand2;
+
+    for (int i = 1; i < number_gates_L1_nand3_path; ++i)
+    { // stages after the leading NAND3 are evaluated with (3, nand)
+      leak_L1_nand3      += cmos_Isub_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+      gate_leak_L1_nand3 += cmos_Ig_leakage(w_L1_nand3_n[i], w_L1_nand3_p[i], 3, nand, is_dram_);
+    }
+    leak_L1_nand3      *= num_L1_nand3; // one chain replicated num_L1_nand3 times
+    gate_leak_L1_nand3 *= num_L1_nand3;
+
+    double leakage_L2         = 0.0;
+    double gate_leakage_L2    = 0.0;
+
+    if (flag_L2_gate == 2)
+    { // second level starts with a NAND2 gate
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 2, nand, is_dram_);
+    }
+    else if (flag_L2_gate == 3)
+    { // second level starts with a NAND3 gate
+      leakage_L2         = cmos_Isub_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+      gate_leakage_L2    = cmos_Ig_leakage(w_L2_n[0], w_L2_p[0], 3, nand, is_dram_);
+    }
+
+    for (int i = 1; i < number_gates_L2; ++i)
+    { // remaining second-level stages are evaluated with (2, inv)
+      leakage_L2         += cmos_Isub_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+      gate_leakage_L2    += cmos_Ig_leakage(w_L2_n[i], w_L2_p[i], 2, inv, is_dram_);
+    }
+    leakage_L2         *= num_L2; // one chain replicated num_L2 times
+    gate_leakage_L2    *= num_L2;
+
+    power_nand2_path.readOp.leakage = leak_L1_nand2 * g_tp.peri_global.Vdd; // leakage current * Vdd
+    power_nand3_path.readOp.leakage = leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.leakage         = leakage_L2    * g_tp.peri_global.Vdd;
+
+    power_nand2_path.readOp.gate_leakage = gate_leak_L1_nand2 * g_tp.peri_global.Vdd;
+    power_nand3_path.readOp.gate_leakage = gate_leak_L1_nand3 * g_tp.peri_global.Vdd;
+    power_L2.readOp.gate_leakage         = gate_leakage_L2    * g_tp.peri_global.Vdd;
+  }
+}
+
+PredecBlkDrv::PredecBlkDrv(
+    int    way_select_,
+    PredecBlk * blk_,
+    bool   is_dram)
+  // Every counter, load and delay starts at zero; min_number_gates starts at 2.
+  : flag_driver_exists(0),
+    number_gates_nand2_path(0), number_gates_nand3_path(0),
+    min_number_gates(2),
+    num_buffers_driving_1_nand2_load(0),
+    num_buffers_driving_2_nand2_load(0),
+    num_buffers_driving_4_nand2_load(0),
+    num_buffers_driving_2_nand3_load(0),
+    num_buffers_driving_8_nand3_load(0),
+    num_buffers_nand3_path(0),
+    c_load_nand2_path_out(0), c_load_nand3_path_out(0),
+    r_load_nand2_path_out(0), r_load_nand3_path_out(0),
+    delay_nand2_path(0), delay_nand3_path(0),
+    power_nand2_path(), power_nand3_path(),
+    blk(blk_), dec(blk->dec),
+    is_dram_(is_dram),
+    way_select(way_select_)
+{
+  // Clear the per-stage transistor widths of both buffer chains.
+  for (int stage = 0; stage < MAX_NUMBER_GATES_STAGE; ++stage)
+  {
+    width_nand2_path_n[stage] = width_nand2_path_p[stage] = 0;
+    width_nand3_path_n[stage] = width_nand3_path_p[stage] = 0;
+  }
+
+  // Start from the block's address width; a way-select driver overrides it below.
+  number_input_addr_bits = blk->number_input_addr_bits;
+
+  if (way_select == 0)
+  { // ordinary predecode driver: flagged as existing only when the predecoder block exists
+    if (blk->exist)
+    {
+      flag_driver_exists = 1;
+    }
+  }
+  else if (way_select > 1)
+  { // way-select driver: buffer count equals way_select, load is the gate cap of dec's stage-0 widths
+    flag_driver_exists     = 1;
+    number_input_addr_bits = way_select;
+    switch (dec->num_in_signals)
+    {
+      case 2: // load and buffers go on the NAND2 path
+        c_load_nand2_path_out = gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_);
+        num_buffers_driving_2_nand2_load = number_input_addr_bits;
+        break;
+      case 3: // load and buffers go on the NAND3 path
+        c_load_nand3_path_out = gate_C(dec->w_dec_n[0] + dec->w_dec_p[0], 0, is_dram_);
+        num_buffers_driving_2_nand3_load = number_input_addr_bits;
+        break;
+      default: // any other input count leaves both loads and buffer counts at zero
+        break;
+    }
+  }
+
+  // Size the chains first; compute_area() then reads the width arrays and stage counts.
+  compute_widths();
+  compute_area();
+}
+
+// Sizes the inverter chains of the predecoder block driver: chooses the load seen by the NAND2
+// and NAND3 paths, then calls logical_effort() to fill the width arrays and stage counts.
+void PredecBlkDrv::compute_widths()
+{
+  // The predecode block driver accepts as input the address bits from the h-tree network. For
+  // each addr bit it then generates addr and addrbar as outputs. For now ignore the effect of
+  // inversion to generate addrbar and simply treat addrbar as addr.
+  double pn_ratio = pmos_to_nmos_sz_ratio(is_dram_);
+
+  if (!flag_driver_exists)
+  { // nothing to size when there is no driver
+    return;
+  }
+
+  // Gate capacitance of the stage-0 NAND2 / NAND3 transistors of the predecoder block.
+  double c_blk_nand2_in = gate_C(blk->w_L1_nand2_n[0] + blk->w_L1_nand2_p[0], 0, is_dram_);
+  double c_blk_nand3_in = gate_C(blk->w_L1_nand3_n[0] + blk->w_L1_nand3_p[0], 0, is_dram_);
+
+  // Without way-select, buffer counts and output loads follow the block's address width.
+  if (way_select == 0)
+  {
+    switch (blk->number_input_addr_bits)
+    {
+      case 1: // 2 NAND2 gates
+        num_buffers_driving_2_nand2_load = 1;
+        c_load_nand2_path_out            = 2 * c_blk_nand2_in;
+        break;
+      case 2: // 4 NAND2 gates, one 2-4 decoder
+        num_buffers_driving_4_nand2_load = 2;
+        c_load_nand2_path_out            = 4 * c_blk_nand2_in;
+        break;
+      case 3: // 8 NAND3 gates, one 3-8 decoder
+        num_buffers_driving_8_nand3_load = 3;
+        c_load_nand3_path_out            = 8 * c_blk_nand3_in;
+        break;
+      case 4: // 4 + 4 NAND2 gates, two 2-4 decoders
+        num_buffers_driving_4_nand2_load = 4;
+        c_load_nand2_path_out            = 4 * c_blk_nand2_in;
+        break;
+      case 5: // 4 NAND2 gates and 8 NAND3 gates: one 2-4 and one 3-8 decoder
+        num_buffers_driving_4_nand2_load = 2;
+        num_buffers_driving_8_nand3_load = 3;
+        c_load_nand2_path_out            = 4 * c_blk_nand2_in;
+        c_load_nand3_path_out            = 8 * c_blk_nand3_in;
+        break;
+      case 6: // 8 + 8 NAND3 gates, two 3-8 decoders
+        num_buffers_driving_8_nand3_load = 6;
+        c_load_nand3_path_out            = 8 * c_blk_nand3_in;
+        break;
+      case 7: // 4 + 4 NAND2 gates and 8 NAND3 gates: two 2-4 and one 3-8 decoder
+        num_buffers_driving_4_nand2_load = 4;
+        num_buffers_driving_8_nand3_load = 3;
+        c_load_nand2_path_out            = 4 * c_blk_nand2_in;
+        c_load_nand3_path_out            = 8 * c_blk_nand3_in;
+        break;
+      case 8: // 4 NAND2 gates and 8 + 8 NAND3 gates: one 2-4 and two 3-8 decoders
+        num_buffers_driving_4_nand2_load = 2;
+        num_buffers_driving_8_nand3_load = 6;
+        c_load_nand2_path_out            = 4 * c_blk_nand2_in;
+        c_load_nand3_path_out            = 8 * c_blk_nand3_in;
+        break;
+      case 9: // 8 + 8 + 8 NAND3 gates, three 3-8 decoders
+        num_buffers_driving_8_nand3_load = 9;
+        c_load_nand3_path_out            = 8 * c_blk_nand3_in;
+        break;
+      default: // any other width leaves the loads and buffer counts unchanged
+        break;
+    }
+  }
+
+  // NAND2 chain: sized for two-path blocks, 2-input first-level gates, a zero
+  // number_input_addr_bits, or a way-select driver whose decoder has two input signals.
+  bool size_nand2_chain = blk->flag_two_unique_paths ||
+                          (blk->number_inputs_L1_gate == 2) ||
+                          (number_input_addr_bits == 0) ||
+                          (way_select && (dec->num_in_signals == 2));
+  if (size_nand2_chain)
+  {
+    // Stage 0 is a minimum-width inverter; logical_effort() fills in the remaining stages.
+    width_nand2_path_n[0] = g_tp.min_w_nmos_;
+    width_nand2_path_p[0] = pn_ratio * width_nand2_path_n[0];
+    double fanout = c_load_nand2_path_out /
+                    gate_C(width_nand2_path_n[0] + width_nand2_path_p[0], 0, is_dram_);
+    number_gates_nand2_path = logical_effort(
+        min_number_gates, 1, fanout,
+        width_nand2_path_n, width_nand2_path_p,
+        c_load_nand2_path_out, pn_ratio,
+        is_dram_, false, g_tp.max_w_nmos_);
+  }
+
+  // NAND3 chain: sized for two-path blocks, 3-input first-level gates, or a way-select
+  // driver whose decoder has three input signals.
+  bool size_nand3_chain = blk->flag_two_unique_paths ||
+                          (blk->number_inputs_L1_gate == 3) ||
+                          (way_select && (dec->num_in_signals == 3));
+  if (size_nand3_chain)
+  {
+    // Stage 0 is a minimum-width inverter; logical_effort() fills in the remaining stages.
+    width_nand3_path_n[0] = g_tp.min_w_nmos_;
+    width_nand3_path_p[0] = pn_ratio * width_nand3_path_n[0];
+    double fanout = c_load_nand3_path_out /
+                    gate_C(width_nand3_path_n[0] + width_nand3_path_p[0], 0, is_dram_);
+    number_gates_nand3_path = logical_effort(
+        min_number_gates, 1, fanout,
+        width_nand3_path_n, width_nand3_path_p,
+        c_load_nand3_path_out, pn_ratio,
+        is_dram_, false, g_tp.max_w_nmos_);
+  }
+}
+
+// Adds up area, subthreshold leakage and gate leakage over the inverters of each driver chain,
+// scales them by that path's buffer count, and stores the totals in area and power_*.readOp.
+void PredecBlkDrv::compute_area()
+{
+  if (!flag_driver_exists)
+  { // no predecoder block driver: area and leakage members are not written
+    return;
+  }
+
+  // Number of identical buffer chains on each path.
+  const int buffers_nand2 = num_buffers_driving_1_nand2_load +
+                            num_buffers_driving_2_nand2_load +
+                            num_buffers_driving_4_nand2_load;
+  const int buffers_nand3 = num_buffers_driving_2_nand3_load + num_buffers_driving_8_nand3_load;
+  const double vdd        = g_tp.peri_global.Vdd;
+
+  // One pass per chain; every stage is evaluated as a single-input inverter.
+  double chain_area_nand2  = 0;
+  double chain_isub_nand2  = 0;
+  double chain_igate_nand2 = 0;
+  for (int stage = 0; stage < number_gates_nand2_path; ++stage)
+  {
+    chain_area_nand2  += compute_gate_area(INV, 1, width_nand2_path_p[stage], width_nand2_path_n[stage], g_tp.cell_h_def);
+    chain_isub_nand2  += cmos_Isub_leakage(width_nand2_path_n[stage], width_nand2_path_p[stage], 1, inv,is_dram_);
+    chain_igate_nand2 += cmos_Ig_leakage(width_nand2_path_n[stage], width_nand2_path_p[stage], 1, inv,is_dram_);
+  }
+
+  double chain_area_nand3  = 0;
+  double chain_isub_nand3  = 0;
+  double chain_igate_nand3 = 0;
+  for (int stage = 0; stage < number_gates_nand3_path; ++stage)
+  {
+    chain_area_nand3  += compute_gate_area(INV, 1, width_nand3_path_p[stage], width_nand3_path_n[stage], g_tp.cell_h_def);
+    chain_isub_nand3  += cmos_Isub_leakage(width_nand3_path_n[stage], width_nand3_path_p[stage], 1, inv,is_dram_);
+    chain_igate_nand3 += cmos_Ig_leakage(width_nand3_path_n[stage], width_nand3_path_p[stage], 1, inv,is_dram_);
+  }
+
+  // Replicate each chain once per buffer, then multiply the leakage sums by Vdd.
+  power_nand2_path.readOp.leakage      = (chain_isub_nand2  * buffers_nand2) * vdd;
+  power_nand3_path.readOp.leakage      = (chain_isub_nand3  * buffers_nand3) * vdd;
+  power_nand2_path.readOp.gate_leakage = (chain_igate_nand2 * buffers_nand2) * vdd;
+  power_nand3_path.readOp.gate_leakage = (chain_igate_nand3 * buffers_nand3) * vdd;
+
+  // Total driver area is the sum over both paths.
+  area.set_area(chain_area_nand2 * buffers_nand2 + chain_area_nand3 * buffers_nand3);
+}
+
+
+
+pair<double, double> PredecBlkDrv::compute_delays(
+    double inrisetime_nand2_path,
+    double inrisetime_nand3_path)
+{
+  // Times the nand2-path and nand3-path inverter chains of this driver. Each
+  // stage adds its Horowitz delay to delay_nand{2,3}_path and its switching
+  // energy to power_nand{2,3}_path.readOp.dynamic (the members are added to,
+  // never reset here). Returns <outrise_nand2, outrise_nand3>; an entry stays
+  // 0 when flag_driver_exists is clear or that chain has no gates.
+  pair<double, double> out_rise(0.0, 0.0);
+  if (!flag_driver_exists)
+    return out_rise;
+
+  const double Vdd = g_tp.peri_global.Vdd;
+  const double ramp_div = 1.0 - 0.5;  // stage delay / ramp_div = rise time fed to the next stage
+  double r_on, c_self, c_next, c_total, tau, stage_delay;
+
+  // nand2 path: every inverter but the last drives the gate of the next one
+  for (int g = 0; g + 1 < number_gates_nand2_path; ++g)
+  {
+    r_on    = tr_R_on(width_nand2_path_n[g], NCH, 1, is_dram_);
+    c_next  = gate_C(width_nand2_path_p[g+1] + width_nand2_path_n[g+1], 0.0, is_dram_);
+    c_self  = drain_C_(width_nand2_path_p[g], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+              drain_C_(width_nand2_path_n[g], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    c_total = c_self + c_next;
+    stage_delay = horowitz(inrisetime_nand2_path, r_on * c_total, 0.5, 0.5, RISE);
+    delay_nand2_path += stage_delay;
+    inrisetime_nand2_path = stage_delay / ramp_div;
+    power_nand2_path.readOp.dynamic += c_total * 0.5 * Vdd * Vdd;
+  }
+  if (number_gates_nand2_path != 0)  // last inverter drives c_load/r_load_nand2_path_out
+  {
+    const int last = number_gates_nand2_path - 1;
+    r_on    = tr_R_on(width_nand2_path_n[last], NCH, 1, is_dram_);
+    c_self  = drain_C_(width_nand2_path_p[last], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+              drain_C_(width_nand2_path_n[last], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    c_total = c_self + c_load_nand2_path_out;
+    tau = r_on * c_total + r_load_nand2_path_out * c_load_nand2_path_out / 2;
+    stage_delay = horowitz(inrisetime_nand2_path, tau, 0.5, 0.5, RISE);
+    delay_nand2_path += stage_delay;
+    out_rise.first = stage_delay / ramp_div;
+    power_nand2_path.readOp.dynamic += c_total * 0.5 * Vdd * Vdd;
+  }
+
+  // nand3 path: identical procedure on the second chain
+  for (int g = 0; g + 1 < number_gates_nand3_path; ++g)
+  {
+    r_on    = tr_R_on(width_nand3_path_n[g], NCH, 1, is_dram_);
+    c_next  = gate_C(width_nand3_path_p[g+1] + width_nand3_path_n[g+1], 0.0, is_dram_);
+    c_self  = drain_C_(width_nand3_path_p[g], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+              drain_C_(width_nand3_path_n[g], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    c_total = c_self + c_next;
+    stage_delay = horowitz(inrisetime_nand3_path, r_on * c_total, 0.5, 0.5, RISE);
+    delay_nand3_path += stage_delay;
+    inrisetime_nand3_path = stage_delay / ramp_div;
+    power_nand3_path.readOp.dynamic += c_total * 0.5 * Vdd * Vdd;
+  }
+  if (number_gates_nand3_path != 0)  // last inverter drives c_load/r_load_nand3_path_out
+  {
+    const int last = number_gates_nand3_path - 1;
+    r_on    = tr_R_on(width_nand3_path_n[last], NCH, 1, is_dram_);
+    c_self  = drain_C_(width_nand3_path_p[last], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+              drain_C_(width_nand3_path_n[last], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    c_total = c_self + c_load_nand3_path_out;
+    tau = r_on * c_total + r_load_nand3_path_out * c_load_nand3_path_out / 2;
+    stage_delay = horowitz(inrisetime_nand3_path, tau, 0.5, 0.5, RISE);
+    delay_nand3_path += stage_delay;
+    out_rise.second = stage_delay / ramp_div;
+    power_nand3_path.readOp.dynamic += c_total * 0.5 * Vdd * Vdd;
+  }
+  return out_rise;
+}
+
+
+double PredecBlkDrv::get_rdOp_dynamic_E(int num_act_mats_hor_dir)
+{ // per-path dynamic energy times that path's buffer count, scaled by num_act_mats_hor_dir
+  const double e_nand2 = num_addr_bits_nand2_path() * power_nand2_path.readOp.dynamic;
+  return (e_nand2 + num_addr_bits_nand3_path() * power_nand3_path.readOp.dynamic) * num_act_mats_hor_dir;
+}
+
+
+
+Predec::Predec(PredecBlkDrv * drv1_, PredecBlkDrv * drv2_)
+:blk1(drv1_->blk), blk2(drv2_->blk), drv1(drv1_), drv2(drv2_)
+{
+  // Pairs each driver with the block it points to and seeds the static power
+  // totals. Subthreshold leakage first: both drivers, then both blocks.
+  driver_power.readOp.leakage  = drv1->power_nand2_path.readOp.leakage;
+  driver_power.readOp.leakage += drv1->power_nand3_path.readOp.leakage;
+  driver_power.readOp.leakage += drv2->power_nand2_path.readOp.leakage;
+  driver_power.readOp.leakage += drv2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage  = blk1->power_nand2_path.readOp.leakage;
+  block_power.readOp.leakage += blk1->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage += blk1->power_L2.readOp.leakage;
+  block_power.readOp.leakage += blk2->power_nand2_path.readOp.leakage;
+  block_power.readOp.leakage += blk2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage += blk2->power_L2.readOp.leakage;
+  power.readOp.leakage = driver_power.readOp.leakage + block_power.readOp.leakage;
+  // Gate leakage, summed over the same components in the same order.
+  driver_power.readOp.gate_leakage  = drv1->power_nand2_path.readOp.gate_leakage;
+  driver_power.readOp.gate_leakage += drv1->power_nand3_path.readOp.gate_leakage;
+  driver_power.readOp.gate_leakage += drv2->power_nand2_path.readOp.gate_leakage;
+  driver_power.readOp.gate_leakage += drv2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage  = blk1->power_nand2_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk1->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk1->power_L2.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk2->power_nand2_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk2->power_L2.readOp.gate_leakage;
+  power.readOp.gate_leakage = driver_power.readOp.gate_leakage + block_power.readOp.gate_leakage;
+}
+
+void PredecBlkDrv::leakage_feedback(double temperature)
+{
+  // Rebuilds readOp.leakage and readOp.gate_leakage of both driver paths from
+  // the current inverter widths. `temperature` is not read in this body, and
+  // nothing is touched when flag_driver_exists is clear.
+  if (!flag_driver_exists)
+    return;
+
+  const int copies_nand2 = num_addr_bits_nand2_path();  // buffers on the nand2 path
+  const int copies_nand3 = num_addr_bits_nand3_path();  // buffers on the nand3 path
+  double isub_nand2 = 0, igate_nand2 = 0;
+  double isub_nand3 = 0, igate_nand3 = 0;
+
+  // per-chain sums over every inverter of the path
+  for (int g = 0; g < number_gates_nand2_path; ++g)
+  {
+    isub_nand2  += cmos_Isub_leakage(width_nand2_path_n[g], width_nand2_path_p[g], 1, inv,is_dram_);
+    igate_nand2 += cmos_Ig_leakage(width_nand2_path_n[g], width_nand2_path_p[g], 1, inv,is_dram_);
+  }
+  for (int g = 0; g < number_gates_nand3_path; ++g)
+  {
+    isub_nand3  += cmos_Isub_leakage(width_nand3_path_n[g], width_nand3_path_p[g], 1, inv,is_dram_);
+    igate_nand3 += cmos_Ig_leakage(width_nand3_path_n[g], width_nand3_path_p[g], 1, inv,is_dram_);
+  }
+
+  // replicate one chain for every buffer on the path, then multiply by Vdd
+  isub_nand2  *= copies_nand2;
+  igate_nand2 *= copies_nand2;
+  isub_nand3  *= copies_nand3;
+  igate_nand3 *= copies_nand3;
+  power_nand2_path.readOp.leakage      = isub_nand2  * g_tp.peri_global.Vdd;
+  power_nand3_path.readOp.leakage      = isub_nand3  * g_tp.peri_global.Vdd;
+  power_nand2_path.readOp.gate_leakage = igate_nand2 * g_tp.peri_global.Vdd;
+  power_nand3_path.readOp.gate_leakage = igate_nand3 * g_tp.peri_global.Vdd;
+}
+
+double Predec::compute_delays(double inrisetime)
+{
+  // Evaluates driver -> block timing for both predecoder halves, refreshes the
+  // dynamic power totals, stores the slowest path delay in `delay` and returns
+  // the rise time of that slowest path.
+  // TODO: Jung Ho thinks that predecoder block driver locates between decoder and predecoder block.
+  pair<double, double> tmp_pair1 = blk1->compute_delays(drv1->compute_delays(inrisetime, inrisetime));
+  pair<double, double> tmp_pair2 = blk2->compute_delays(drv2->compute_delays(inrisetime, inrisetime));
+  tmp_pair1 = get_max_delay_before_decoder(tmp_pair1, tmp_pair2);
+
+  driver_power.readOp.dynamic =
+    drv1->num_addr_bits_nand2_path() * drv1->power_nand2_path.readOp.dynamic +
+    drv1->num_addr_bits_nand3_path() * drv1->power_nand3_path.readOp.dynamic +
+    drv2->num_addr_bits_nand2_path() * drv2->power_nand2_path.readOp.dynamic +
+    drv2->num_addr_bits_nand3_path() * drv2->power_nand3_path.readOp.dynamic;
+  // Each block's L1 energy is weighted by that block's own active-gate counts
+  // (blk2 used to be weighted with blk1's counts, a copy-paste slip).
+  block_power.readOp.dynamic =
+    blk1->power_nand2_path.readOp.dynamic*blk1->num_L1_active_nand2_path +
+    blk1->power_nand3_path.readOp.dynamic*blk1->num_L1_active_nand3_path +
+    blk1->power_L2.readOp.dynamic +
+    blk2->power_nand2_path.readOp.dynamic*blk2->num_L1_active_nand2_path +
+    blk2->power_nand3_path.readOp.dynamic*blk2->num_L1_active_nand3_path +
+    blk2->power_L2.readOp.dynamic;
+
+  power.readOp.dynamic = driver_power.readOp.dynamic + block_power.readOp.dynamic;
+  delay = tmp_pair1.first;
+  return  tmp_pair1.second;
+}
+
+
+void Predec::leakage_feedback(double temperature)
+{
+  // Let every child refresh its own leakage, then rebuild the aggregate totals.
+  drv1->leakage_feedback(temperature);
+  drv2->leakage_feedback(temperature);
+  blk1->leakage_feedback(temperature);
+  blk2->leakage_feedback(temperature);
+  driver_power.readOp.leakage  = drv1->power_nand2_path.readOp.leakage;
+  driver_power.readOp.leakage += drv1->power_nand3_path.readOp.leakage;
+  driver_power.readOp.leakage += drv2->power_nand2_path.readOp.leakage;
+  driver_power.readOp.leakage += drv2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage  = blk1->power_nand2_path.readOp.leakage;
+  block_power.readOp.leakage += blk1->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage += blk1->power_L2.readOp.leakage;
+  block_power.readOp.leakage += blk2->power_nand2_path.readOp.leakage;
+  block_power.readOp.leakage += blk2->power_nand3_path.readOp.leakage;
+  block_power.readOp.leakage += blk2->power_L2.readOp.leakage;
+  power.readOp.leakage = driver_power.readOp.leakage + block_power.readOp.leakage;
+  // Gate leakage mirrors the subthreshold sums above.
+  driver_power.readOp.gate_leakage  = drv1->power_nand2_path.readOp.gate_leakage;
+  driver_power.readOp.gate_leakage += drv1->power_nand3_path.readOp.gate_leakage;
+  driver_power.readOp.gate_leakage += drv2->power_nand2_path.readOp.gate_leakage;
+  driver_power.readOp.gate_leakage += drv2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage  = blk1->power_nand2_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk1->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk1->power_L2.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk2->power_nand2_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk2->power_nand3_path.readOp.gate_leakage;
+  block_power.readOp.gate_leakage += blk2->power_L2.readOp.gate_leakage;
+  power.readOp.gate_leakage = driver_power.readOp.gate_leakage + block_power.readOp.gate_leakage;
+}
+
+// Selects the slowest of the four driver+block paths in front of the decoder.
+// input_pair1 / input_pair2 carry the <nand2, nand3> rise times of the first
+// and second predecoder half.
+// returns <delay, risetime> of the selected path
+pair<double, double> Predec::get_max_delay_before_decoder(
+    pair<double, double> input_pair1,
+    pair<double, double> input_pair2)
+{
+  // Candidates in evaluation order; a later path only replaces an earlier one
+  // when it is strictly slower.
+  const double path_delay[4] = {
+    drv1->delay_nand2_path + blk1->delay_nand2_path,
+    drv1->delay_nand3_path + blk1->delay_nand3_path,
+    drv2->delay_nand2_path + blk2->delay_nand2_path,
+    drv2->delay_nand3_path + blk2->delay_nand3_path
+  };
+  const double path_rise[4] = {
+    input_pair1.first, input_pair1.second,
+    input_pair2.first, input_pair2.second
+  };
+
+  pair<double, double> slowest(path_delay[0], path_rise[0]);
+  for (int k = 1; k < 4; ++k)
+  {
+    if (slowest.first < path_delay[k])
+    {
+      slowest.first  = path_delay[k];
+      slowest.second = path_rise[k];
+    }
+  }
+  return slowest;
+}
+
+
+
+Driver::Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram)
+: number_gates(0),
+  min_number_gates(2),
+  c_gate_load(c_gate_load_), c_wire_load(c_wire_load_), r_wire_load(r_wire_load_),
+  delay(0),
+  power(),
+  is_dram_(is_dram)
+{
+  // Records the load to drive (gate capacitance plus wire R and C), zeroes
+  // both per-stage width tables and sizes the chain right away.
+  // min_number_gates is fixed at 2.
+  for (int stage = 0; stage < MAX_NUMBER_GATES_STAGE; ++stage)
+  {
+    width_n[stage] = width_p[stage] = 0;
+  }
+
+  compute_widths();
+}
+
+
+void Driver::compute_widths()
+{
+  // Sizes the inverter chain. Stage 0 is fixed to a minimum-width inverter;
+  // F is the total load divided by that inverter's input capacitance.
+  // logical_effort() is handed both width tables and its result becomes
+  // number_gates.
+  const double beta       = pmos_to_nmos_sz_ratio(is_dram_);
+  const double total_load = c_gate_load + c_wire_load;
+
+  width_n[0] = g_tp.min_w_nmos_;
+  width_p[0] = beta * g_tp.min_w_nmos_;
+  const double c_in_stage0 = gate_C(width_n[0] + width_p[0], 0, is_dram_);
+  const double F = total_load / c_in_stage0;
+
+  number_gates = logical_effort(
+      min_number_gates, 1, F,
+      width_n, width_p,
+      total_load, beta, is_dram_, false, g_tp.max_w_nmos_);
+}
+
+
+
+double Driver::compute_delay(double inrisetime)
+{
+  // Pushes a ramp with rise time `inrisetime` through the inverter chain. Each
+  // stage adds its Horowitz delay to `delay` and its dynamic, leakage and gate
+  // leakage terms to power.readOp. Returns the rise time after the last stage.
+  const double Vdd = g_tp.peri_global.Vdd;
+  const int last = number_gates - 1;
+  double r_on, c_out, c_self, tau, stage_delay = 0;
+
+  for (int g = 0; g < last; ++g)  // inner stages are loaded by the next inverter's gate
+  {
+    r_on   = tr_R_on(width_n[g], NCH, 1, is_dram_);
+    c_out  = gate_C(width_n[g+1] + width_p[g+1], 0.0, is_dram_);
+    c_self = drain_C_(width_p[g], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+             drain_C_(width_n[g], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+    stage_delay = horowitz(inrisetime, r_on * (c_self + c_out), 0.5, 0.5, RISE);
+    delay += stage_delay;
+    inrisetime = stage_delay / (1.0 - 0.5);
+    power.readOp.dynamic      += (c_self + c_out) * Vdd * Vdd;
+    power.readOp.leakage      += cmos_Isub_leakage(width_n[g], width_p[g], 1, inv, is_dram_) * Vdd;
+    power.readOp.gate_leakage += cmos_Ig_leakage(width_n[g], width_p[g], 1, inv, is_dram_) * Vdd;
+  }
+  // the last stage drives the external gate load through the wire
+  c_out  = c_gate_load + c_wire_load;
+  r_on   = tr_R_on(width_n[last], NCH, 1, is_dram_);
+  c_self = drain_C_(width_p[last], PCH, 1, 1, g_tp.cell_h_def, is_dram_) +
+           drain_C_(width_n[last], NCH, 1, 1, g_tp.cell_h_def, is_dram_);
+  tau = r_on * (c_self + c_out) + r_wire_load * (c_wire_load / 2 + c_gate_load);
+  stage_delay = horowitz(inrisetime, tau, 0.5, 0.5, RISE);
+  delay += stage_delay;
+  power.readOp.dynamic      += (c_self + c_out) * Vdd * Vdd;
+  power.readOp.leakage      += cmos_Isub_leakage(width_n[last], width_p[last], 1, inv, is_dram_) * Vdd;
+  power.readOp.gate_leakage += cmos_Ig_leakage(width_n[last], width_p[last], 1, inv, is_dram_) * Vdd;
+  return stage_delay / (1.0 - 0.5);
+}
+
diff --git a/src/gpuwattch/cacti/decoder.h b/src/gpuwattch/cacti/decoder.h
new file mode 100644
index 000000000..6366e1607
--- /dev/null
+++ b/src/gpuwattch/cacti/decoder.h
@@ -0,0 +1,246 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#ifndef __DECODER_H__
+#define __DECODER_H__
+
+#include "area.h"
+#include "component.h"
+#include "parameter.h"
+#include <vector>
+
+using namespace std;
+
+
+class Decoder : public Component  // decoder model; only its interface is declared in this header
+{
+  public:
+    Decoder(
+        int _num_dec_signals,
+        bool flag_way_select,
+        double _C_ld_dec_out,
+        double _R_wire_dec_out,
+        bool fully_assoc_,
+        bool is_dram_,
+        bool is_wl_tr_,
+        const Area & cell_);
+    // All state is public; PredecBlk and PredecBlkDrv each keep a Decoder pointer.
+    bool   exist;            // NOTE(review): presumably false when no decoder stage is needed - confirm in the constructor
+    int    num_in_signals;
+    double C_ld_dec_out;     // presumably the capacitive load on the decoder output (cf. _C_ld_dec_out) - TODO confirm
+    double R_wire_dec_out;   // presumably the wire resistance at the decoder output (cf. _R_wire_dec_out) - TODO confirm
+    int    num_gates;
+    int    num_gates_min;
+    double w_dec_n[MAX_NUMBER_GATES_STAGE];  // per-stage widths, at most MAX_NUMBER_GATES_STAGE stages;
+    double w_dec_p[MAX_NUMBER_GATES_STAGE];  // n/p presumably mean NMOS/PMOS - TODO confirm
+    double delay;
+    // powerDef power;  (left disabled; NOTE(review): a power member is presumably inherited from Component)
+    bool   fully_assoc;
+    bool   is_dram;
+    bool   is_wl_tr;
+    const  Area & cell;      // reference member: the referenced Area has to outlive this Decoder
+
+    // Model steps; their definitions are not part of this header.
+    void   compute_widths();
+    void   compute_area();
+    double compute_delays(double inrisetime);  // return outrisetime
+    // NOTE(review): same signature as the other leakage_feedback methods - confirm that temperature is actually used
+    void leakage_feedback(double temperature);
+};
+
+
+
+class PredecBlk : public Component  // predecoder block; per the member names it has L1 nand2/nand3 paths and an L2 stage
+{
+ public:
+  PredecBlk(
+      int num_dec_signals,
+      Decoder * dec,
+      double C_wire_predec_blk_out,
+      double R_wire_predec_blk_out,
+      int    num_dec_per_predec,
+      bool   is_dram_,
+      bool   is_blk1);
+  // Public state; Predec reads it through its blk1 / blk2 pointers.
+  Decoder * dec;
+  bool exist;  // NOTE(review): presumably false when this block is not needed - confirm in the constructor
+  int number_input_addr_bits;
+  double C_ld_predec_blk_out;
+  double R_wire_predec_blk_out;
+  int branch_effort_nand2_gate_output;
+  int branch_effort_nand3_gate_output;
+  bool   flag_two_unique_paths;
+  int flag_L2_gate;
+  int number_inputs_L1_gate;
+  int number_gates_L1_nand2_path;
+  int number_gates_L1_nand3_path;
+  int number_gates_L2;
+  int min_number_gates_L1;
+  int min_number_gates_L2;
+  int num_L1_active_nand2_path;  // weight applied to nand2-path dynamic energy in Predec::compute_delays
+  int num_L1_active_nand3_path;  // weight applied to nand3-path dynamic energy in Predec::compute_delays
+  double w_L1_nand2_n[MAX_NUMBER_GATES_STAGE];  // per-stage width tables, at most MAX_NUMBER_GATES_STAGE
+  double w_L1_nand2_p[MAX_NUMBER_GATES_STAGE];  // entries each; n/p presumably mean NMOS/PMOS - TODO confirm
+  double w_L1_nand3_n[MAX_NUMBER_GATES_STAGE];
+  double w_L1_nand3_p[MAX_NUMBER_GATES_STAGE];
+  double w_L2_n[MAX_NUMBER_GATES_STAGE];
+  double w_L2_p[MAX_NUMBER_GATES_STAGE];
+  double delay_nand2_path;  // added to the driver's nand2 delay in Predec::get_max_delay_before_decoder
+  double delay_nand3_path;  // added to the driver's nand3 delay in Predec::get_max_delay_before_decoder
+  powerDef power_nand2_path;  // readOp leakage, gate_leakage and dynamic of these three
+  powerDef power_nand3_path;  // members are summed into Predec::block_power
+  powerDef power_L2;
+
+  bool is_dram_;
+
+  void compute_widths();
+  void compute_area();
+
+  void leakage_feedback(double temperature);
+  // Predec passes the pair returned by the matching PredecBlkDrv::compute_delays.
+  pair<double, double> compute_delays(pair<double, double> inrisetime); // <nand2, nand3>
+  // return <outrise_nand2, outrise_nand3>
+};
+
+
+class PredecBlkDrv : public Component  // inverter chains (a nand2 path and a nand3 path) tied to one PredecBlk
+{
+ public:
+  PredecBlkDrv(
+      int   way_select,
+      PredecBlk * blk_,
+      bool  is_dram);
+  // Public state, read directly by Predec.
+  int flag_driver_exists;       // compute_delays and leakage_feedback do nothing when this is 0
+  int number_input_addr_bits;
+  int number_gates_nand2_path;  // number of inverters walked on the nand2 path
+  int number_gates_nand3_path;  // number of inverters walked on the nand3 path
+  int min_number_gates;
+  int num_buffers_driving_1_nand2_load;  // the three nand2 counts are summed by num_addr_bits_nand2_path()
+  int num_buffers_driving_2_nand2_load;
+  int num_buffers_driving_4_nand2_load;
+  int num_buffers_driving_2_nand3_load;  // the two nand3 counts are summed by num_addr_bits_nand3_path()
+  int num_buffers_driving_8_nand3_load;
+  int num_buffers_nand3_path;
+  double c_load_nand2_path_out;  // capacitance driven by the last inverter of the nand2 path
+  double c_load_nand3_path_out;  // capacitance driven by the last inverter of the nand3 path
+  double r_load_nand2_path_out;  // resistance in the last-stage time constant (r * c_load / 2 term)
+  double r_load_nand3_path_out;
+  double width_nand2_path_n[MAX_NUMBER_GATES_STAGE];  // per-stage widths; _n is used with NCH and
+  double width_nand2_path_p[MAX_NUMBER_GATES_STAGE];  // _p with PCH in compute_delays
+  double width_nand3_path_n[MAX_NUMBER_GATES_STAGE];
+  double width_nand3_path_p[MAX_NUMBER_GATES_STAGE];
+  double delay_nand2_path;  // accumulated by compute_delays
+  double delay_nand3_path;  // accumulated by compute_delays
+  powerDef power_nand2_path;  // readOp.dynamic accumulated by compute_delays; leakage fields
+  powerDef power_nand3_path;  // are overwritten by leakage_feedback
+
+  PredecBlk * blk;  // Predec's constructor takes its blk1 / blk2 from here
+  Decoder   * dec;
+  bool  is_dram_;
+  int   way_select;
+
+  void compute_widths();
+  void compute_area();
+  // Recomputes both paths' leakage and gate leakage from the stored widths.
+  void leakage_feedback(double temperature);
+
+  // Walks both inverter chains; the two arguments are the input rise times of the two paths.
+  pair<double, double> compute_delays(
+      double inrisetime_nand2_path,
+      double inrisetime_nand3_path);  // return <outrise_nand2, outrise_nand3>
+  // Total buffer count per path; used as multiplier for per-chain power.
+  inline int num_addr_bits_nand2_path()
+  {
+    return num_buffers_driving_1_nand2_load +
+           num_buffers_driving_2_nand2_load +
+           num_buffers_driving_4_nand2_load;
+  }
+  inline int num_addr_bits_nand3_path()
+  {
+    return num_buffers_driving_2_nand3_load +
+           num_buffers_driving_8_nand3_load;
+  }
+  double get_rdOp_dynamic_E(int num_act_mats_hor_dir);  // buffer-weighted dynamic energy times num_act_mats_hor_dir
+};
+
+
+
+class Predec : public Component  // combines two driver/block pairs and aggregates their delay and power
+{
+  public:
+    Predec(
+        PredecBlkDrv * drv1,
+        PredecBlkDrv * drv2);
+    // Stores the slowest path delay in `delay` and refreshes the dynamic power totals.
+    double compute_delays(double inrisetime);  // return outrisetime
+    // Calls leakage_feedback on both drivers and both blocks, then re-sums the leakage totals.
+    void leakage_feedback(double temperature);
+    PredecBlk    * blk1;  // taken from drv1->blk by the constructor
+    PredecBlk    * blk2;  // taken from drv2->blk by the constructor
+    PredecBlkDrv * drv1;
+    PredecBlkDrv * drv2;
+    // Sub-totals kept next to the combined `power` member.
+    powerDef block_power;   // sums over blk1 and blk2
+    powerDef driver_power;  // sums over drv1 and drv2
+
+  private:
+    // returns <delay, risetime>
+    pair<double, double> get_max_delay_before_decoder(
+        pair<double, double> input_pair1,
+        pair<double, double> input_pair2);
+};
+
+
+
+class Driver : public Component  // inverter chain sized for a given gate + wire load
+{
+ public:
+  Driver(double c_gate_load_, double c_wire_load_, double r_wire_load_, bool is_dram);
+  // The constructor zeroes the width tables and calls compute_widths().
+  int    number_gates;      // set from the return value of logical_effort() in compute_widths()
+  int    min_number_gates;  // initialised to 2 by the constructor
+  double width_n[MAX_NUMBER_GATES_STAGE];  // per-stage widths; used with NCH in compute_delay
+  double width_p[MAX_NUMBER_GATES_STAGE];  // per-stage widths; used with PCH in compute_delay
+  double c_gate_load;  // gate capacitance at the far end of the wire
+  double c_wire_load;  // wire capacitance
+  double r_wire_load;  // wire resistance
+  double delay;        // accumulated by compute_delay
+  powerDef power;      // readOp dynamic / leakage / gate_leakage accumulated by compute_delay
+  bool   is_dram_;
+  // compute_delay returns the rise time after the last stage.
+  void   compute_widths();
+  double compute_delay(double inrisetime);
+};
+
+
+#endif
diff --git a/src/gpuwattch/cacti/highradix.cc b/src/gpuwattch/cacti/highradix.cc
new file mode 100644
index 000000000..e06cad778
--- /dev/null
+++ b/src/gpuwattch/cacti/highradix.cc
@@ -0,0 +1,392 @@
+/*------------------------------------------------------------
+ *                              CACTI 6.5
+ *         Copyright 2008 Hewlett-Packard Development Corporation
+ *                         All Rights Reserved
+ *
+ * Permission to use, copy, and modify this software and its documentation is
+ * hereby granted only under the following terms and conditions.  Both the
+ * above copyright notice and this permission notice must appear in all copies
+ * of the software, derivative works or modified versions, and any portions
+ * thereof, and both notices must appear in supporting documentation.
+ *
+ * Users of this software agree to the terms and conditions set forth herein, and
+ * hereby grant back to Hewlett-Packard Company and its affiliated companies ("HP")
+ * a non-exclusive, unrestricted, royalty-free right and license under any changes, 
+ * enhancements or extensions  made to the core functions of the software, including 
+ * but not limited to those affording compatibility with other hardware or software
+ * environments, but excluding applications which incorporate this software.
+ * Users further agree to use their best efforts to return to HP any such changes,
+ * enhancements or extensions that they make and inform HP of noteworthy uses of
+ * this software.  Correspondence should be provided to HP at:
+ *
+ *                       Director of Intellectual Property Licensing
+ *                       Office of Strategy and Technology
+ *                       Hewlett-Packard Company
+ *                       1501 Page Mill Road
+ *                       Palo Alto, California  94304
+ *
+ * This software may be distributed (but not offered for sale or transferred
+ * for compensation) to third parties, provided such third parties agree to
+ * abide by the terms and conditions of this notice.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND HP DISCLAIMS ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS.   IN NO EVENT SHALL HP 
+ * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *------------------------------------------------------------*/
+
+#include "highradix.h"
+#include <iomanip>
+using namespace std;
+
+#define MAX_WIRE_SCALE 1
+
+HighRadix::HighRadix(
+    double SUB_SWITCH_SZ_,
+    double ROWS_,
+    double FREQUENCY_,    // GHz
+    double RADIX_,
+    double VC_COUNT_,
+    double FLIT_SZ_,
+    double AF_,           // activity factor
+    double DIE_LEN_,      // u
+    double DIE_HT_,       // u
+    double INP_BUFF_ENT_,
+    double ROW_BUFF_ENT_,
+    double COL_BUFF_ENT_,
+    TechnologyParameter::DeviceType *dt)
+  : SUB_SWITCH_SZ(SUB_SWITCH_SZ_), ROWS(ROWS_), FREQUENCY(FREQUENCY_),
+    RADIX(RADIX_), VC_COUNT(VC_COUNT_), FLIT_SZ(FLIT_SZ_), AF(AF_),
+    DIE_LEN(DIE_LEN_), DIE_HT(DIE_HT_), INP_BUFF_ENT(INP_BUFF_ENT_),
+    ROW_BUFF_ENT(ROW_BUFF_ENT_), COL_BUFF_ENT(COL_BUFF_ENT_), deviceType(dt)
+{
+  // Stores the router configuration and derives the switch geometry and the
+  // buffer capacities; no power is computed here and the area starts at 0.
+
+  // 65 nm, 45 nm and every other feature size keep a unit scale factor; only
+  // the 32 nm node halves the die area used below.
+  const double area_scale = (g_ip->F_sz_nm == 32) ? 2 : 1;
+
+  // Replace the requested die by a square of the same (scaled) area.
+  DIE_LEN = sqrt(DIE_LEN_*DIE_HT_/area_scale);
+  DIE_HT = DIE_LEN;
+
+  // (RADIX / SUB_SWITCH_SZ)^2 sub-switches arranged in ROWS rows.
+  COLUMNS = pow(RADIX/SUB_SWITCH_SZ, 2)/ROWS;
+
+  // Buffer capacities: number of entries times the flit size.
+  INP_BUFF_SZ = FLIT_SZ * INP_BUFF_ENT;
+  ROW_BUFF_SZ = FLIT_SZ * ROW_BUFF_ENT;
+  COL_BUFF_SZ = FLIT_SZ * COL_BUFF_ENT;
+
+  area.set_area(0);
+}
+
+void HighRadix::compute_power()
+{
+  // Builds and evaluates every router component (crossbars, arbiters,
+  // buffers, buses), adds their contributions to power.readOp, then fills the
+  // per-category *_tot breakdowns and the component areas.
+  num_sub = ROWS * COLUMNS;
+  //FIXME change cb power to per input
+  // Main crossbar: accepted once its delay (in ps) is below one clock period;
+  // otherwise retry with a larger scale until MAX_WIRE_SCALE is exceeded.
+  double scale = 1;
+  while (true) {
+    Wire winit(scale, scale);
+    cb = new Crossbar(SUB_SWITCH_SZ, SUB_SWITCH_SZ, FLIT_SZ);
+    cb->compute_power();
+    if (cb->delay*1e12 < (1/FREQUENCY)*(1e3))
+      break;
+    scale+=0.2;
+    if (scale > MAX_WIRE_SCALE) break;
+    cout << "scale = " << scale << endl;
+    delete cb; // rejected candidate: free it before the next pass allocates a new one
+  }
+  cb->power.readOp.dynamic /= SUB_SWITCH_SZ; // crossbar power per message
+
+  // Output crossbar (1 x SUB_SWITCH_SZ), sized the same way.
+  scale = 1;
+  while (true) {
+    Wire winit(scale, scale);
+    out_cb = new Crossbar(1, SUB_SWITCH_SZ, FLIT_SZ);
+    out_cb->compute_power();
+    if (out_cb->delay*1e12 < (1/FREQUENCY)*(1e3))
+      break;
+    scale+=0.2;
+    if (scale > MAX_WIRE_SCALE) break;
+    cout << "scale = " << scale << endl;
+    delete out_cb; // rejected candidate: free it before the next pass allocates a new one
+  }
+  Wire winit;
+  out_cb->power.readOp.dynamic /= SUB_SWITCH_SZ; // power per message
+
+  //arbiter initialization
+  vc_arb = new MCPAT_Arbiter(VC_COUNT, FLIT_SZ, cb->area.w);
+  vc_arb->compute_power();
+  c_arb = new MCPAT_Arbiter(COLUMNS, FLIT_SZ, cb->area.w);
+  c_arb->compute_power();
+  cb_arb = new MCPAT_Arbiter(RADIX/ROWS, FLIT_SZ, cb->area.w);
+  cb_arb->compute_power();
+
+  // input buffer, row/column buffer initialization
+  inp_buff =  buffer_(FLIT_SZ, INP_BUFF_SZ);
+  c_buff = buffer_(FLIT_SZ, COL_BUFF_SZ*2);
+  r_buff = buffer_(FLIT_SZ, ROW_BUFF_SZ*2);
+
+  // repeated wire initialization
+  hor_bus = new Wire(g_ip->wt, DIE_LEN);
+  // effective ht of vertical bus (connecting cb to column buffer) in each sub-switch
+  double eff_ht = (ROWS * (ROWS +1)/2) * (DIE_HT/ROWS);
+  ver_bus = new Wire(g_ip->wt, eff_ht);
+
+  // sub switch includes row buffers, column buffers, vc/crossbar/column arbitration and a 2 stage crossbar traversal
+  sub_switch_power();
+  power.readOp.dynamic += sub_sw.power.readOp.dynamic * num_sub;
+  power.readOp.leakage += sub_sw.power.readOp.leakage * num_sub;
+
+  // input buffer power
+  power.readOp.dynamic += 2 /*r&w*/ * inp_buff->power.readOp.dynamic * RADIX;
+  power.readOp.leakage += inp_buff->power.readOp.leakage * RADIX;
+
+  // buses
+  power.readOp.dynamic += hor_bus->power.readOp.dynamic * FLIT_SZ * SUB_SWITCH_SZ * ROWS;
+  power.readOp.leakage += hor_bus->power.readOp.leakage * FLIT_SZ * SUB_SWITCH_SZ * ROWS;
+  power.readOp.dynamic += ver_bus->power.readOp.dynamic * FLIT_SZ * COLUMNS * SUB_SWITCH_SZ;
+  power.readOp.leakage += ver_bus->power.readOp.leakage * FLIT_SZ * ROWS * COLUMNS * SUB_SWITCH_SZ;
+
+  // To calculate contribution of each component to the total power
+  compute_crossbar_power();
+  compute_bus_power();
+  compute_arb_power();
+  compute_buff_power();
+
+  //area 
+  sub_sw.area.set_area(sub_sw.area.get_area() + cb->area.get_area());
+  sub_sw.area.set_area(sub_sw.area.get_area() + out_cb->area.get_area());  
+  sub_sw.area.set_area(sub_sw.area.get_area() + r_buff->area.get_area() * VC_COUNT * SUB_SWITCH_SZ);
+  sub_sw.area.set_area(sub_sw.area.get_area() + c_buff->area.get_area() * VC_COUNT * SUB_SWITCH_SZ);
+
+  buff_tot.area.set_area(buff_tot.area.get_area() + inp_buff->area.get_area() * RADIX);
+  buff_tot.area.set_area(buff_tot.area.get_area() + VC_COUNT * r_buff->area.get_area() * SUB_SWITCH_SZ * num_sub);
+  buff_tot.area.set_area(buff_tot.area.get_area() + VC_COUNT * c_buff->area.get_area() * SUB_SWITCH_SZ * num_sub);
+  crossbar_tot.area.set_area(crossbar_tot.area.get_area() + cb->area.get_area() * num_sub);
+  crossbar_tot.area.set_area(crossbar_tot.area.get_area() + out_cb->area.get_area() * num_sub);
+  // wire area = horizontal + vertical buses (the vertical term must add to, not replace, the horizontal one)
+  wire_tot.area.set_area(hor_bus->area.get_area() * FLIT_SZ * SUB_SWITCH_SZ * ROWS);
+  wire_tot.area.set_area(wire_tot.area.get_area() + ver_bus->area.get_area() * FLIT_SZ * ROWS * COLUMNS);
+}
+
+void HighRadix::compute_crossbar_power()
+{
+  // Both crossbars of one sub-switch, replicated over all num_sub sub-switches.
+  crossbar_tot.power = cb->power + out_cb->power;
+  crossbar_tot.power.readOp.dynamic *= num_sub;
+  crossbar_tot.power.readOp.leakage *= num_sub;
+}
+
+void HighRadix::compute_bus_power()
+{ // wire_tot power = horizontal bus term + vertical bus term, for dynamic and leakage alike
+  wire_tot.power.readOp.dynamic = hor_bus->power.readOp.dynamic * FLIT_SZ * SUB_SWITCH_SZ * ROWS
+                                + ver_bus->power.readOp.dynamic * FLIT_SZ * COLUMNS * SUB_SWITCH_SZ;
+  wire_tot.power.readOp.leakage = hor_bus->power.readOp.leakage * FLIT_SZ * SUB_SWITCH_SZ * ROWS
+                                + ver_bus->power.readOp.leakage * FLIT_SZ * ROWS * COLUMNS * SUB_SWITCH_SZ;
+}
+
+void HighRadix::compute_arb_power()
+{
+  // Per sub-switch: crossbar arbiter, VC arbiter for the crossbar traversal,
+  // column arbiter, and the VC arbiter once more towards the output port.
+  arb_tot.power = cb_arb->power + vc_arb->power;
+  arb_tot.power = arb_tot.power + c_arb->power;
+  arb_tot.power = arb_tot.power + vc_arb->power;
+  arb_tot.power.readOp.dynamic *= num_sub;  // replicate over every sub-switch
+  arb_tot.power.readOp.leakage *= num_sub;
+}
+
+void HighRadix::compute_buff_power()
+{
+  // Dynamic energy: the factor 2 stands for one read plus one write. The
+  // input buffer term is scaled by RADIX, row/column terms by num_sub.
+  buff_tot.power.readOp.dynamic  = 2 * inp_buff->power.readOp.dynamic * RADIX;  // input buffers
+  buff_tot.power.readOp.dynamic += r_buff->power.readOp.dynamic * 2 * num_sub;  // row buffers
+  buff_tot.power.readOp.dynamic += c_buff->power.readOp.dynamic * 2 * num_sub;  // column buffers
+
+  // Leakage uses the same scaling, without the read/write factor.
+  buff_tot.power.readOp.leakage  = inp_buff->power.readOp.leakage * RADIX;
+  buff_tot.power.readOp.leakage += r_buff->power.readOp.leakage * num_sub;
+  buff_tot.power.readOp.leakage += c_buff->power.readOp.leakage * num_sub;
+
+}
+
+void HighRadix::sub_switch_power()
+{
+  // Adds the power of one sub-switch onto sub_sw.power in data-path order:
+  // row buffers, crossbar, column buffers, output crossbar, arbiters. Buffer
+  // terms are multiplied by VC_COUNT, with a factor 2 on dynamic energy for
+  // one read plus one write.
+  // stage 1: row buffers and the main crossbar
+  sub_sw.power.readOp.dynamic += r_buff->power.readOp.dynamic * 2 * VC_COUNT;
+  sub_sw.power.readOp.leakage += r_buff->power.readOp.leakage * VC_COUNT;
+  sub_sw.power = sub_sw.power + cb->power;
+
+  // stage 2: column buffers and the output crossbar
+  sub_sw.power.readOp.dynamic += 2 * c_buff->power.readOp.dynamic * VC_COUNT;
+  sub_sw.power.readOp.leakage += c_buff->power.readOp.leakage * VC_COUNT;
+  sub_sw.power = sub_sw.power + out_cb->power;
+
+  // arbitration
+  sub_sw.power = sub_sw.power + cb_arb->power;
+  sub_sw.power = sub_sw.power + vc_arb->power;  // for the crossbar traversal
+  sub_sw.power = sub_sw.power + c_arb->power;
+  sub_sw.power = sub_sw.power + vc_arb->power;  // towards the output port
+}
+  
+
+HighRadix::~HighRadix()
+{
+  // Frees every object compute_power() obtained with new (the buffers come
+  // from buffer_()). cb, hor_bus and ver_bus are released here as well; they
+  // are allocated alongside the others but were never deleted.
+  delete inp_buff; delete r_buff; delete c_buff;  // buffers
+  delete c_arb; delete cb_arb; delete vc_arb;     // arbiters
+  delete cb; delete out_cb;                       // crossbars
+  delete hor_bus; delete ver_bus;                 // repeated wires
+}
+
+Mat * HighRadix::buffer_(double block_sz, double sz)
+{
+  // Describes a single-mat, single-subarray non-DRAM array with
+  // (int)(sz/block_sz) rows, (int)block_sz columns and one read plus one
+  // write port, evaluates its delay and power, and returns the new Mat.
+  DynamicParameter dyn_p;
+  dyn_p.is_tag         = false;
+  dyn_p.is_dram        = false;
+  dyn_p.use_inp_params = 1;
+  dyn_p.V_b_sense      = deviceType->Vdd; // FIXME check power calc.
+
+  // organisation: every partitioning / muxing knob is 1
+  dyn_p.num_subarrays = 1;   dyn_p.num_mats = 1;
+  dyn_p.num_mats_h_dir = 1;  dyn_p.num_mats_v_dir = 1;
+  dyn_p.num_act_mats_hor_dir = 1;
+  dyn_p.Ndbl = 1;  dyn_p.Ndwl = 1;  dyn_p.Nspd = 1;
+  dyn_p.deg_bl_muxing = 1;
+  dyn_p.deg_senseamp_muxing_non_associativity = 1;
+  dyn_p.Ndsam_lev_1 = 1;     dyn_p.Ndsam_lev_2 = 1;
+  dyn_p.number_addr_bits_mat = 8;
+  dyn_p.number_way_select_signals_mat = 1;
+
+  // geometry and data widths
+  dyn_p.num_r_subarray = (int) (sz/block_sz);
+  // NOTE(review): ram_cell_tech_type receives the row count, as in the chained assignment this replaces; looks like a lost right-hand side - confirm the intended value
+  dyn_p.ram_cell_tech_type = dyn_p.num_r_subarray;
+  dyn_p.num_c_subarray = (int) block_sz;
+  dyn_p.num_do_b_subbank = (int)block_sz;
+  dyn_p.num_do_b_mat = (int) block_sz;  dyn_p.num_di_b_mat = (int) block_sz;
+  dyn_p.out_w = (int) block_sz;
+
+  // ports: one write, one read; the cell grows with the port count
+  dyn_p.num_wr_ports = 1;
+  dyn_p.num_rd_ports = 1;
+  dyn_p.num_rw_ports = 0;  dyn_p.num_se_rd_ports = 0;
+  dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports + 
+      dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports);
+  dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 + 
+      (dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) + 
+      dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports;
+
+  Mat *buff = new Mat(dyn_p);
+  buff->compute_delays(0);
+  buff->compute_power_energy();
+  return buff;
+}
+// Prints the readOp dynamic and leakage figures and the area.w / area.h of one component to cout.
+void HighRadix::print_buffer(Component *c)
+{
+  // Delay reporting is disabled.
+  cout << "\tDynamic Power - " << c->power.readOp.dynamic*1e9 << " nJ" << endl
+       << "\tLeakage Power - " << c->power.readOp.leakage*1e3 << " mW" << endl
+       << "\tWidth         - " << c->area.w << " u" << endl
+       << "\tLength        - " << c->area.h << " u" << endl;
+}
+// Prints the router configuration, the crossbar/arbiter/buffer sub-model reports, and the power and area totals to cout.
+// NOTE: the setprecision / fixed manipulators applied below are not restored, so they remain set on cout afterwards.
+void HighRadix::print_router()
+{
+  cout << "\n\nMCPAT_Router stats:\n";
+  cout << "\tNetwork frequency - " << FREQUENCY <<" GHz\n";
+  cout << "\tNo. of Virtual channels - " << VC_COUNT << "\n";
+  cout << "\tSub-switch size - " << (int)SUB_SWITCH_SZ << endl;
+  cout << "\tNo. of rows - " << (int)ROWS << endl;
+  cb->print_crossbar();
+  out_cb->print_crossbar();
+  vc_arb->print_arbiter();
+  c_arb->print_arbiter();
+  cb_arb->print_arbiter();
+//  hor_bus->print_wire();
+  cout << "\n\nBuffer stats:\n";
+  cout << "\nInput Buffer stats:\n";
+  print_buffer (inp_buff);
+  cout << "\nRow Buffer stats:\n";
+  print_buffer (r_buff);
+  cout << "\nColumn Buffer stats:\n";
+  print_buffer (c_buff);
+
+  // Dynamic figures below are readOp.dynamic multiplied by FREQUENCY * 1e9; percentages are relative to power.readOp.dynamic.
+  cout << "\n\n MCPAT_Router dynamic power (max) = " << power.readOp.dynamic * FREQUENCY * 1e9 << " W\n";
+  cout << " MCPAT_Router dynamic power (load - " << AF << ") = " << power.readOp.dynamic * FREQUENCY * 1e9 * AF << " W\n";
+  cout << "\n\nDetailed Stats\n";
+  cout << "--------------\n";
+  cout << "Power dissipated in buses/wires - " << setprecision(3) << 
+    wire_tot.power.readOp.dynamic * FREQUENCY * 1e9 << " W";
+  cout << "\t" <<setiosflags(ios::fixed) << setprecision(2) <<
+          (wire_tot.power.readOp.dynamic/power.readOp.dynamic)*100 << " %\n";
+  cout << "Buffer power                    - " << buff_tot.power.readOp.dynamic * 
+          FREQUENCY * 1e9 << " W";
+  cout << "\t" << 
+          (buff_tot.power.readOp.dynamic/power.readOp.dynamic)*100 << " %\n";
+  cout << "Crossbar power                  - " << crossbar_tot.power.readOp.dynamic * 
+          FREQUENCY * 1e9 << " W";
+  cout << "\t" << 
+          (crossbar_tot.power.readOp.dynamic/power.readOp.dynamic)*100 << " %\n";
+  cout << "Arbiter power                   - " << arb_tot.power.readOp.dynamic * 
+          FREQUENCY * 1e9 << " W";
+  cout << "\t" << 
+          (arb_tot.power.readOp.dynamic/power.readOp.dynamic)*100 << " %\n";
+  cout << "Sub-switch power (dynamic)      - " << sub_sw.power.readOp.dynamic * num_sub *
+          FREQUENCY * 1e9 << " W";
+  cout << "\t" << 
+          (sub_sw.power.readOp.dynamic * num_sub/power.readOp.dynamic)*100 << " %\n";
+  cout << "Input buffer power (dynamic)    - " << 2 * inp_buff->power.readOp.dynamic * 
+          RADIX * FREQUENCY * 1e9 << " W";
+  cout << "\t" << 
+          (2 * inp_buff->power.readOp.dynamic * RADIX/power.readOp.dynamic)*100 << " %\n";
+  cout << "\nLeakage power\n";
+  cout << "MCPAT_Router power                    - " << power.readOp.leakage << " W\n";
+  cout << "Bus power                       - " <<setprecision(4) <<  wire_tot.power.readOp.leakage << " W\n";
+  cout << "Buffer power                    - " << buff_tot.power.readOp.leakage << " W\n";
+  cout << "Crossbar power                  - " << crossbar_tot.power.readOp.leakage << " W\n";
+  cout << "Arbiter power                   - " << arb_tot.power.readOp.leakage << " W\n";
+  cout << "Sub-switch power                - " << sub_sw.power.readOp.leakage << " W" <<endl; // single sub-switch, not multiplied by num_sub
+
+  cout << "\n\nArea Stats\n";
+  cout << "Input buffer dimension (mm x mm)- " << inp_buff->area.get_h()*1e-3 << " x " << inp_buff->area.get_w()*1e-3 << endl; // printed as h x w; the rows below print w x h
+  cout << "Row buffer (mm x mm)            - " << r_buff->area.w*1e-3 << " x " << r_buff->area.h*1e-3 << endl;
+  cout << "Col buffer (mm x mm)            - " << c_buff->area.w*1e-3 << " x " << c_buff->area.h*1e-3 << endl;
+  cout << "Crossbar area  (mm x mm)        - " << cb->area.w*1e-3 << " x " << cb->area.h*1e-3 << endl;
+//  cout << "Wire hor area  (nm x nm)        - " << hor_bus->area.w*1e3 << " x " << hor_bus->area.h*1e3 << endl;
+//  cout << "Wire ver area  (nm x nm)        - " << ver_bus->area.w*1e3 << " x " << ver_bus->area.h*1e3 << endl;
+  cout << "Wire total                      - " << wire_tot.area.get_area()*1e-6 << " mm2\n";
+  cout << "Crossbar total                  - " << crossbar_tot.area.get_area()*1e-6 << " mm2\n";
+  cout << "Buff total                      - " << buff_tot.area.get_area()*1e-6 << " mm2\n";
+  cout << "Subswitch                       - " << sub_sw.area.get_area()*1e-6 << " mm2\n";
+  cout << "Subswitch total                 - " << sub_sw.area.get_area()*num_sub*1e-6 << " mm2\n";
+
+  cout << "Total area                      - " << (wire_tot.area.get_area() + crossbar_tot.area.get_area() + // sum of wire, crossbar and buffer totals
+                                                  buff_tot.area.get_area())*1e-6 << endl; // same 1e-6 scaling as the mm2 rows above, but no unit label is printed
+}
+
diff --git a/src/gpuwattch/cacti/highradix.h b/src/gpuwattch/cacti/highradix.h
new file mode 100644
index 000000000..7b41801ae
--- /dev/null
+++ b/src/gpuwattch/cacti/highradix.h
@@ -0,0 +1,134 @@
+/*------------------------------------------------------------
+ *                              CACTI 6.5
+ *         Copyright 2008 Hewlett-Packard Development Corporation
+ *                         All Rights Reserved
+ *
+ * Permission to use, copy, and modify this software and its documentation is
+ * hereby granted only under the following terms and conditions.  Both the
+ * above copyright notice and this permission notice must appear in all copies
+ * of the software, derivative works or modified versions, and any portions
+ * thereof, and both notices must appear in supporting documentation.
+ *
+ * Users of this software agree to the terms and conditions set forth herein, and
+ * hereby grant back to Hewlett-Packard Company and its affiliated companies ("HP")
+ * a non-exclusive, unrestricted, royalty-free right and license under any changes, 
+ * enhancements or extensions  made to the core functions of the software, including 
+ * but not limited to those affording compatibility with other hardware or software
+ * environments, but excluding applications which incorporate this software.
+ * Users further agree to use their best efforts to return to HP any such changes,
+ * enhancements or extensions that they make and inform HP of noteworthy uses of
+ * this software.  Correspondence should be provided to HP at:
+ *
+ *                       Director of Intellectual Property Licensing
+ *                       Office of Strategy and Technology
+ *                       Hewlett-Packard Company
+ *                       1501 Page Mill Road
+ *                       Palo Alto, California  94304
+ *
+ * This software may be distributed (but not offered for sale or transferred
+ * for compensation) to third parties, provided such third parties agree to
+ * abide by the terms and conditions of this notice.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND HP DISCLAIMS ALL
+ * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS.   IN NO EVENT SHALL HP 
+ * CORPORATION BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *------------------------------------------------------------*/
+
+#ifndef __HIGHRADIX__
+#define __HIGHRADIX__
+
+#include <iostream>
+#include "basic_circuit.h"
+#include "component.h"
+#include "parameter.h"
+#include "assert.h"
+#include "cacti_interface.h"
+#include "wire.h"
+#include "mat.h"
+#include "crossbar.h"
+#include "arbiter.h"
+#include "ROUTER.def"
+// Flip-flop and route-logic constants (units as noted on each line); all four are defined as 0 here.
+#define FLIP_FLOP_L 0 // W, leakage
+#define FLIP_FLOP_D 0 // J, dynamic
+#define ROUTE_LOGIC_D 0 // J
+#define ROUTE_LOGIC_L 0 // W
+// Router model derived from Component; holds its buffer, crossbar and arbiter sub-models through raw pointers.
+class HighRadix : public Component
+{
+  public:
+    HighRadix(
+    double SUB_SWITCH_SZ_ = DEF_SUB_SWITCH_SZ,
+    double ROWS_ = DEF_ROWS,
+    double FREQUENCY_ = DEF_FREQUENCY, // GHz
+    double RADIX_ = DEF_RADIX,
+    double VC_COUNT_ = DEF_VC_COUNT,
+    double FLIT_SZ_ = DEF_FLIT_SZ,
+    double AF_ = DEF_AF,// activity factor
+    double DIE_LEN_ = DEF_DIE_LEN,//u
+    double DIE_HT_ = DEF_DIE_HT,//u
+    double INP_BUFF_ENT_ = DEF_INP_BUFF_ENT, 
+    double ROW_BUFF_ENT_ = DEF_ROW_BUFF_ENT, 
+    double COL_BUFF_ENT_ = DEF_COL_BUFF_ENT,
+    TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~HighRadix();
+    // NOTE(review): the destructor deletes raw pointer members but no copy constructor/assignment is declared here,
+    // so copying a HighRadix would presumably delete the same objects twice - confirm instances are never copied.
+// Params
+    double SUB_SWITCH_SZ;
+    double ROWS;
+    double FREQUENCY;// GHz
+    double RADIX;
+    double VC_COUNT;
+    double FLIT_SZ;
+    double AF;// activity factor
+    double DIE_LEN;//u
+    double DIE_HT;//u
+    double INP_BUFF_ENT;
+    double ROW_BUFF_ENT;
+    double COL_BUFF_ENT;
+
+
+    // Prints configuration, sub-model reports and power/area totals to cout.
+    void print_router();
+
+    double INP_BUFF_SZ;
+    double COLUMNS;
+    double ROW_BUFF_SZ;
+    double COL_BUFF_SZ;
+    void compute_power();
+    void compute_arb_power();
+    void compute_crossbar_power();
+    void compute_buff_power();
+    void compute_bus_power();
+    void print_buffer(Component *r);
+    void sub_switch_power(); // accumulates buffer, crossbar and arbiter power into sub_sw.power
+    Mat * buffer_(double block_sz, double sz); // returns a Mat allocated with new
+
+    Crossbar *cb, *out_cb; // both are added into sub_sw.power by sub_switch_power()
+    MCPAT_Arbiter *cb_arb, *vc_arb, *c_arb;
+    Mat *inp_buff, *r_buff, *c_buff;
+    Component sub_sw; // one sub-switch; print_router() multiplies its figures by num_sub for totals
+    Component wire_tot, buff_tot, crossbar_tot, arb_tot;
+    Wire *hor_bus, *ver_bus; // not deleted by ~HighRadix()
+
+  private:
+    double min_w_pmos;
+    TechnologyParameter::DeviceType *deviceType;
+    double num_sub; // multiplier applied to per-sub-switch figures (presumably the sub-switch count - confirm)
+
+};
+// Component subclass that declares only a constructor and a destructor; their definitions are not in this header.
+class Waveguide : public Component
+{
+  public:
+    Waveguide(TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Waveguide();
+};
+
+#endif
diff --git a/src/gpuwattch/cacti/htree2.cc b/src/gpuwattch/cacti/htree2.cc
new file mode 100644
index 000000000..c40a88080
--- /dev/null
+++ b/src/gpuwattch/cacti/htree2.cc
@@ -0,0 +1,640 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include "htree2.h"
+#include "wire.h"
+#include <assert.h>
+#include <iostream>
+// Builds one H-tree model: stores the geometry and bit-width arguments, selects the bus width for
+// the requested tree type and runs in_htree() or out_htree() to fill in delay, power and area.
+Htree2::Htree2(
+    enum Wire_type wire_model, double mat_w, double mat_h,
+    int a_bits, int d_inbits, int search_data_in, int d_outbits, int search_data_out,
+    int bl, int wl, enum Htree_type htree_type,
+    bool uca_tree_, bool search_tree_, TechnologyParameter::DeviceType *dt)
+  : in_rise_time(0),
+    out_rise_time(0),
+    tree_type(htree_type),
+    mat_width(mat_w),
+    mat_height(mat_h),
+    add_bits(a_bits),
+    data_in_bits(d_inbits),
+    search_data_in_bits(search_data_in),
+    data_out_bits(d_outbits),
+    search_data_out_bits(search_data_out),
+    ndbl(bl),
+    ndwl(wl),
+    uca_tree(uca_tree_),
+    search_tree(search_tree_),
+    wt(wire_model),
+    deviceType(dt)
+{
+  // Both counts must be at least 2; the older code path that handled a count of 1
+  // (returning early or bumping it to 2) is disabled, so the assert is the only guard.
+  assert(ndwl >= 2 && ndbl >= 2);
+
+  max_unpipelined_link_delay = 0; //TODO
+  min_w_nmos = g_tp.min_w_nmos_;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;
+
+  // Select the initial bus width for this tree type, then build the tree in the matching direction.
+  bool fan_in = false;     // true -> out_htree(), false -> in_htree()
+  bool recognised = true;  // false only for an unknown tree type
+  switch (htree_type)
+  {
+    case Add_htree:        init_wire_bw = add_bits;             break;
+    case Data_in_htree:    init_wire_bw = data_in_bits;         break;
+    case Search_in_htree:  init_wire_bw = search_data_in_bits;  break; // in_search_tree is broadcast, out_htree is not
+    case Data_out_htree:   init_wire_bw = data_out_bits;        fan_in = true; break;
+    case Search_out_htree: init_wire_bw = search_data_out_bits; fan_in = true; break;
+    default:               recognised = false; assert(0);       break;
+  }
+  if (recognised)
+  {
+    wire_bw = init_wire_bw;
+    if (fan_in)
+      out_htree();
+    else
+      in_htree();
+  }
+
+  // Keep a copy of the figures as computed, then scale dynamic energy by the initial bus width.
+  power_bit = power;
+  power.readOp.dynamic *= init_wire_bw;
+
+  // Sanity checks on the accumulated totals.
+  assert(power.readOp.dynamic >= 0);
+  assert(power.readOp.leakage >= 0);
+}
+
+
+// NAND gate sizing calculation: sizes the NAND used at an input-tree node and adds its delay,
+// dynamic/search energy and subthreshold/gate leakage to this tree's running totals.
+void Htree2::input_nand(double s1, double s2, double l_eff)
+{
+  Wire w1(wt, l_eff);
+  const double vdd = deviceType->Vdd;
+  const double vth_frac = deviceType->Vth/deviceType->Vdd;
+  const double p_to_n = deviceType->n_to_p_eff_curr_drv_ratio;
+  // input capacitance of a repeater = input capacitance of nand; the size is clamped to at least 1.
+  double nand_sz = s1*(1 + p_to_n)/(2 + p_to_n);
+  if (nand_sz < 1) nand_sz = 1;
+  const double load_w = s2*(min_w_nmos + min_w_pmos);
+
+  double tc = 2*tr_R_on(nand_sz*min_w_nmos, NCH, 1) *
+    (drain_C_(nand_sz*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)*2 +
+     2 * gate_C(load_w, 0));
+  delay += horowitz (w1.out_rise_time, tc, vth_frac, vth_frac, RISE);
+  power.readOp.dynamic += 0.5 *
+    (2*drain_C_(p_to_n * nand_sz*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(nand_sz*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + 2*gate_C(load_w, 0)) * vdd * vdd;
+  // Same expression again, additionally multiplied by the current wire_bw.
+  power.searchOp.dynamic += 0.5 *
+    (2*drain_C_(p_to_n * nand_sz*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + drain_C_(nand_sz*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     + 2*gate_C(load_w, 0)) * vdd * vdd * wire_bw;
+  power.readOp.leakage += (wire_bw*cmos_Isub_leakage(min_w_nmos*(nand_sz*2), min_w_pmos * nand_sz * 2, 2, nand))*vdd;
+  power.readOp.gate_leakage += (wire_bw*cmos_Ig_leakage(min_w_nmos*(nand_sz*2), min_w_pmos * nand_sz * 2, 2, nand))*vdd;
+}
+
+// Adds the delay, dynamic/search energy and leakage of one output-tree node buffer to this tree's totals.
+// s1 and s2 are sizes (the callers derive them from Wire::repeater_size); l_eff is the wire length given to Wire.
+// tristate buffer model consisting of not, nand, nor, and driver transistors
+void Htree2::output_buffer(double s1, double s2, double l_eff)
+{
+  Wire w1(wt, l_eff);
+  double pton_size = deviceType->n_to_p_eff_curr_drv_ratio;
+  // input capacitance of repeater = input capacitance of nand + nor.
+  double size = s1*(1 + pton_size)/(2 + pton_size + 1 + 2*pton_size);
+  double s_eff =  //stage eff of a repeater in a wire
+    (gate_C(s2*(min_w_nmos + min_w_pmos), 0) + w1.wire_cap(l_eff*1e-6,true))/
+    gate_C(s2*(min_w_nmos + min_w_pmos), 0);
+  double tr_size = gate_C(s1*(min_w_nmos + min_w_pmos), 0) * 1/2/(s_eff*gate_C(min_w_pmos, 0));
+  size = (size < 1) ? 1 : size; // clamp the gate size to at least 1
+
+  double res_nor = 2*tr_R_on(size*min_w_pmos, PCH, 1);
+  double res_ptrans = tr_R_on(tr_size*min_w_nmos, NCH, 1);
+  double cap_nand_out = drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+                        drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)*2 +
+                        gate_C(tr_size*min_w_pmos, 0);
+  double cap_ptrans_out = 2 *(drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+                              drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)) +
+                          gate_C(s1*(min_w_nmos + min_w_pmos), 0);
+
+  double tc = res_nor * cap_nand_out + (res_nor + res_ptrans) * cap_ptrans_out;
+
+  // Each readOp.dynamic term below has a searchOp.dynamic twin that is additionally multiplied by init_wire_bw.
+  delay += horowitz (w1.out_rise_time, tc,
+      deviceType->Vth/deviceType->Vdd, deviceType->Vth/deviceType->Vdd, RISE);
+
+  //nand
+  power.readOp.dynamic += 0.5 *
+    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+       drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+     gate_C(tr_size*(min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (2*drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+       drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def) +
+     gate_C(tr_size*(min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //not
+  power.readOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     +drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //nor
+  power.readOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    (drain_C_(size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+     + 2*drain_C_(size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def)
+     +gate_C(tr_size*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  //output transistor
+  power.readOp.dynamic += 0.5 *
+    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
+     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd;
+
+    power.searchOp.dynamic += 0.5 *
+    ((drain_C_(tr_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def)
+      +drain_C_(tr_size*min_w_nmos, NCH, 1, 1, g_tp.cell_h_def))*2
+     + gate_C(s1*(min_w_nmos + min_w_pmos), 0)) *
+    deviceType->Vdd * deviceType->Vdd*init_wire_bw;
+
+  if(uca_tree) { // NOTE(review): the two branches of this if/else contain identical statements as written
+	power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+	power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+	power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+
+	power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+    //power.readOp.gate_leakage *=;
+  }
+  else {
+	power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+	power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+	power.readOp.leakage += cmos_Isub_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+
+	power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*tr_size*2, min_w_pmos*tr_size*2, 1, inv)*deviceType->Vdd*wire_bw;/*inverter + output tr*/
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nand)*deviceType->Vdd*wire_bw;//nand
+    power.readOp.gate_leakage += cmos_Ig_leakage(min_w_nmos*size*3, min_w_pmos*size*3, 2, nor)*deviceType->Vdd*wire_bw;//nor
+    //power.readOp.gate_leakage *=deviceType->Vdd*wire_bw;
+  }
+}
+
+
+
+/* Calculates the delay and power of the input H-tree.
+ * A NAND gate is used at each node to
+ * limit the signal.
+ * The area of an unbalanced H-tree (rows != columns)
+ * depends on how data is traversed.
+ * In this function, if (no. of rows < no. of columns),
+ * data first traverses the excess hor. links until the vertical
+ * and horizontal node counts are the same.
+ * If the no. of rows is bigger, data traverses
+ * a hor. link followed by a ver. link in a repeated
+ * fashion (similar to a balanced tree) until there are no
+ * hor. links left. After this it goes through the remaining vertical
+ * links.
+ */
+  void
+Htree2::in_htree()
+{
+  //temp var
+  double s1 = 0, s2 = 0, s3 = 0;
+  double l_eff = 0;
+  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
+  double len = 0, ht = 0;
+  int option = 0; // 0: hor. link only, 1: hor. + ver. link, 2: ver. link only
+
+  int h = (int) _log2(ndwl/2); // horizontal nodes
+  int v = (int) _log2(ndbl/2); // vertical nodes
+  double len_temp;
+  double ht_temp;
+  if (uca_tree)
+  {//Sheng: this computation do not consider the wires that route from edge to middle.
+    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,h))))/2;
+    len_temp = (mat_width*ndwl/2 +
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,v))))/2;
+  }
+  else
+  {
+    if (ndwl == ndbl) {
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else if (ndwl > ndbl) {
+      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
+          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else {
+       double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
+    }
+  }
+  // Record the tree footprint and reset the accumulators before walking the tree.
+  area.h   = ht_temp * 2;
+  area.w   = len_temp * 2;
+  delay = 0;
+  power.readOp.dynamic = 0;
+  power.readOp.leakage = 0;
+  power.searchOp.dynamic =0; // NOTE(review): readOp.gate_leakage is not reset here, whereas out_htree() does reset it
+  len = len_temp;
+  ht  = ht_temp/2;
+  // Walk the tree one step at a time; each step halves the remaining length and/or height.
+  while (v > 0 || h > 0)
+  {
+    if (wtemp1) delete wtemp1;
+    if (wtemp2) delete wtemp2;
+    if (wtemp3) delete wtemp3;
+
+    if (h > v)
+    {
+      //the iteration considers only one horizontal link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, len/2);  // ver
+      len_temp = len;
+      len /= 2;
+      wtemp3 = 0;
+      h--;
+      option = 0;
+    }
+    else if (v>0 && h>0)
+    {
+      //considers one horizontal link and one vertical link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, ht);  // ver
+      wtemp3 = new Wire(wt, len/2);  // next hor
+      len_temp = len;
+      ht_temp = ht;
+      len /= 2;
+      ht  /= 2;
+      v--;
+      h--;
+      option = 1;
+    }
+    else
+    {
+      // considers only one vertical link
+      assert(h == 0);
+      wtemp1 = new Wire(wt, ht); // ver
+      wtemp2 = new Wire(wt, ht/2);  // hor
+      ht_temp = ht;
+      ht /= 2;
+      wtemp3 = 0;
+      v--;
+      option = 2;
+    }
+
+    delay += wtemp1->delay;
+    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*wire_bw;
+    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
+    if ((uca_tree == false && option == 2) || search_tree==true)
+    {
+      wire_bw*=2;  // wire bandwidth doubles only for vertical branches
+    }
+
+    if (uca_tree == false)
+    {
+      if (len_temp > wtemp1->repeater_spacing)
+      {
+        s1 = wtemp1->repeater_size;
+        l_eff = wtemp1->repeater_spacing;
+      }
+      else
+      {
+        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
+        l_eff = len_temp;
+      }
+
+      if (ht_temp > wtemp2->repeater_spacing)
+      {
+        s2 = wtemp2->repeater_size;
+      }
+      else
+      {
+        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size; // NOTE(review): scales by len_temp although the test above uses ht_temp - confirm
+      }
+      // first level
+      input_nand(s1, s2, l_eff);
+    }
+
+
+    if (option != 1) // only the combined hor. + ver. step has a second link to account for
+    {
+      continue;
+    }
+
+    // second level
+    delay += wtemp2->delay;
+    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*wire_bw;
+    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+
+    if (uca_tree) // NOTE(review): both branches add wtemp2's leakage again although it was already added just above - confirm
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    }
+    else
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+      wire_bw*=2;
+
+      if (ht_temp > wtemp3->repeater_spacing)
+      {
+        s3    = wtemp3->repeater_size;
+        l_eff = wtemp3->repeater_spacing;
+      }
+      else
+      {
+        s3    = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
+        l_eff = ht_temp;
+      }
+
+      input_nand(s2, s3, l_eff);
+    }
+  }
+
+  if (wtemp1) delete wtemp1; // release the wires created by the last iteration
+  if (wtemp2) delete wtemp2;
+  if (wtemp3) delete wtemp3;
+}
+
+
+
+/* Calculates the delay and power of the output H-tree; a tristate buffer is used to handle fan-ins.
+ * The area of an unbalanced H-tree (rows != columns)
+ * depends on how data is traversed.
+ * In this function, if (no. of rows < no. of columns),
+ * data first traverses the excess hor. links until the vertical
+ * and horizontal node counts are the same.
+ * If the no. of rows is bigger, data traverses
+ * a hor. link followed by a ver. link in a repeated
+ * fashion (similar to a balanced tree) until there are no
+ * hor. links left. After this it goes through the remaining vertical
+ * links.
+ */
+void Htree2::out_htree()
+{
+  //temp var
+  double s1 = 0, s2 = 0, s3 = 0;
+  double l_eff = 0;
+  Wire *wtemp1 = 0, *wtemp2 = 0, *wtemp3 = 0;
+  double len = 0, ht = 0;
+  int option = 0; // 0: hor. link only, 1: hor. + ver. link, 2: ver. link only
+
+  int h = (int) _log2(ndwl/2);
+  int v = (int) _log2(ndbl/2);
+  double len_temp;
+  double ht_temp;
+  if (uca_tree)
+  {
+    ht_temp = (mat_height*ndbl/2 +/* since uca_tree models interbank tree, mat_height => bank height */
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,h))))/2;
+    len_temp = (mat_width*ndwl/2 +
+        ((add_bits + data_in_bits + data_out_bits + (search_data_in_bits + search_data_out_bits)) * g_tp.wire_outside_mat.pitch *
+         2 * (1-pow(0.5,v))))/2;
+  }
+  else
+    {
+    if (ndwl == ndbl) {
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits+ (search_data_in_bits + search_data_out_bits)) * (ndbl/2-1) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits)) * (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+
+    }
+    else if (ndwl > ndbl) {
+      double excess_part = (_log2(ndwl/2) - _log2(ndbl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits)) * ((ndbl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch *
+          (2*(1 - pow(0.5, h-v)) + pow(0.5, v-h) * v))/2;
+      len_temp = (mat_width*ndwl/2 +
+        ((add_bits + (search_data_in_bits + search_data_out_bits))* (ndwl/2-1) * g_tp.wire_outside_mat.pitch) +
+        ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * v))/2;
+    }
+    else {
+      double excess_part = (_log2(ndbl/2) - _log2(ndwl/2));
+      ht_temp = ((mat_height*ndbl/2) +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          ((data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * h)
+          )/2;
+      len_temp = (mat_width*ndwl/2 +
+          ((add_bits + (search_data_in_bits + search_data_out_bits))* ((ndwl/2-1) + excess_part) * g_tp.wire_outside_mat.pitch) +
+          (data_in_bits + data_out_bits) * g_tp.wire_outside_mat.pitch * (h + 2*(1-pow(0.5, v-h))))/2;
+    }
+  }
+  area.h = ht_temp * 2;
+  area.w = len_temp * 2;
+  delay = 0;
+  power.readOp.dynamic = 0;
+  power.readOp.leakage = 0;
+  power.readOp.gate_leakage = 0;
+  // NOTE(review): power.searchOp.dynamic is not reset here, whereas in_htree() does reset it
+  len = len_temp;
+  ht = ht_temp/2;
+
+  while (v > 0 || h > 0)
+  { //finds delay/power of each link in the tree
+    if (wtemp1) delete wtemp1;
+    if (wtemp2) delete wtemp2;
+    if (wtemp3) delete wtemp3;
+
+    if(h > v) {
+      //the iteration considers only one horizontal link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, len/2);  // ver
+      len_temp = len;
+      len /= 2;
+      wtemp3 = 0;
+      h--;
+      option = 0;
+    }
+    else if (v>0 && h>0) {
+      //considers one horizontal link and one vertical link
+      wtemp1 = new Wire(wt, len); // hor
+      wtemp2 = new Wire(wt, ht);  // ver
+      wtemp3 = new Wire(wt, len/2);  // next hor
+      len_temp = len;
+      ht_temp = ht;
+      len /= 2;
+      ht /= 2;
+      v--;
+      h--;
+      option = 1;
+    }
+    else {
+      // considers only one vertical link
+      assert(h == 0);
+      wtemp1 = new Wire(wt, ht); // ver
+      wtemp2 = new Wire(wt, ht/2);  // ver
+      ht_temp = ht;
+      ht /= 2;
+      wtemp3 = 0;
+      v--;
+      option = 2;
+    }
+    delay += wtemp1->delay;
+    power.readOp.dynamic += wtemp1->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp1->power.readOp.dynamic*init_wire_bw; // scaled by init_wire_bw here (in_htree() uses the running wire_bw)
+    power.readOp.leakage += wtemp1->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp1->power.readOp.gate_leakage*wire_bw;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    if ((uca_tree == false && option == 2) || search_tree==true)
+    {
+      wire_bw*=2;
+    }
+
+    if (uca_tree == false)
+    {
+      if (len_temp > wtemp1->repeater_spacing)
+      {
+        s1 = wtemp1->repeater_size;
+        l_eff = wtemp1->repeater_spacing;
+      }
+      else
+      {
+        s1 = (len_temp/wtemp1->repeater_spacing) * wtemp1->repeater_size;
+        l_eff = len_temp;
+      }
+      if (ht_temp > wtemp2->repeater_spacing)
+      {
+        s2 = wtemp2->repeater_size;
+      }
+      else
+      {
+        s2 = (len_temp/wtemp2->repeater_spacing) * wtemp2->repeater_size; // NOTE(review): scales by len_temp although the test above uses ht_temp - confirm
+      }
+      // first level
+      output_buffer(s1, s2, l_eff);
+    }
+
+
+    if (option != 1) // only the combined hor. + ver. step has a second link to account for
+    {
+      continue;
+    }
+
+    // second level
+    delay += wtemp2->delay;
+    power.readOp.dynamic += wtemp2->power.readOp.dynamic;
+    power.searchOp.dynamic += wtemp2->power.readOp.dynamic*init_wire_bw;
+    power.readOp.leakage += wtemp2->power.readOp.leakage*wire_bw;
+    power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    if (uca_tree) // NOTE(review): both branches add wtemp2's leakage again although it was already added just above - confirm
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+    }
+    else
+    {
+      power.readOp.leakage += (wtemp2->power.readOp.leakage*wire_bw);
+      power.readOp.gate_leakage += wtemp2->power.readOp.gate_leakage*wire_bw;
+      wire_bw*=2;
+
+      if (ht_temp > wtemp3->repeater_spacing)
+      {
+        s3 = wtemp3->repeater_size;
+        l_eff = wtemp3->repeater_spacing;
+      }
+      else
+      {
+        s3 = (len_temp/wtemp3->repeater_spacing) * wtemp3->repeater_size;
+        l_eff = ht_temp;
+      }
+
+      output_buffer(s2, s3, l_eff);
+    }
+    //cout<<"power.readOp.leakage"<<power.readOp.leakage<<endl;
+    //cout<<"power.readOp.gate_leakage"<<power.readOp.gate_leakage<<endl;
+    //cout<<"wtemp2->power.readOp.gate_leakage"<<wtemp2->power.readOp.gate_leakage<<endl;
+  }
+
+  if (wtemp1) delete wtemp1; // release the wires created by the last iteration
+  if (wtemp2) delete wtemp2;
+  if (wtemp3) delete wtemp3;
+}
+
diff --git a/src/gpuwattch/cacti/htree2.h b/src/gpuwattch/cacti/htree2.h
new file mode 100644
index 000000000..6700e8055
--- /dev/null
+++ b/src/gpuwattch/cacti/htree2.h
@@ -0,0 +1,97 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __HTREE2_H__
+#define __HTREE2_H__
+
+#include "basic_circuit.h"
+#include "component.h"
+#include "parameter.h"
+#include "assert.h"
+#include "subarray.h"
+#include "cacti_interface.h"
+#include "wire.h"
+
+// Htree2: H-tree component declared for the CACTI model; the non-inline methods are defined outside this header.
+// Leakage power includes the entire htree in a bank when uca_tree == false,
+// and only the part leading to one bank when uca_tree == true.
+class Htree2 : public Component
+{
+  public:
+    Htree2(enum Wire_type wire_model,
+        double mat_w, double mat_h, int add, int data_in, int search_data_in, int data_out, int search_data_out, int bl, int wl,
+        enum Htree_type h_type, bool uca_tree_ = false, bool search_tree_ = false,
+        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Htree2() {};
+    // Declared here, defined out of line; the bodies are not visible in this header.
+    void in_htree();
+    void out_htree();
+
+    // repeaters only at h-tree nodes
+    void limited_in_htree();
+    void limited_out_htree();
+    void input_nand(double s1, double s2, double l);
+    void output_buffer(double s1, double s2, double l);
+
+    double in_rise_time, out_rise_time;
+    // Inline setter: stores rt in in_rise_time (there is no matching setter for out_rise_time).
+    void set_in_rise_time(double rt)
+    {
+      in_rise_time = rt;
+    }
+
+    double max_unpipelined_link_delay;
+    powerDef power_bit;
+
+    // Internal state below — presumably filled in by the constructor from its arguments; TODO confirm.
+  private:
+    double wire_bw;
+    double init_wire_bw;  // bus width at root
+    enum Htree_type tree_type;
+    double htree_hnodes;
+    double htree_vnodes;
+    double mat_width;
+    double mat_height;
+    int add_bits, data_in_bits,search_data_in_bits,data_out_bits,  search_data_out_bits;
+    int ndbl, ndwl; // NOTE(review): look like the ctor's bl / wl arguments — confirm
+    bool uca_tree; // should have full bandwidth to access all banks in the array simultaneously
+    bool search_tree;
+
+    enum Wire_type wt;
+    double min_w_nmos;
+    double min_w_pmos;
+
+    TechnologyParameter::DeviceType *deviceType; // raw pointer; the destructor above does not free it
+
+};
+
+#endif
diff --git a/src/gpuwattch/cacti/io.cc b/src/gpuwattch/cacti/io.cc
new file mode 100644
index 000000000..c513dbf82
--- /dev/null
+++ b/src/gpuwattch/cacti/io.cc
@@ -0,0 +1,2457 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+
+#include "io.h"
+#include "area.h"
+#include "basic_circuit.h"
+#include "parameter.h"
+#include "Ucache.h"
+#include "nuca.h"
+#include "crossbar.h"
+#include "arbiter.h"
+//#include "highradix.h"
+
+using namespace std;
+
+
+/* Parses a CACTI configuration file (e.g. "cache.cfg") into this object's fields.
+ * Every line is compared against a fixed list of "-keyword" prefixes; a line that
+ * matches none of them is ignored. Prints a message and exits when the file cannot
+ * be opened or when an enumerated option carries an unrecognised value.
+ * rpters_in_htree is always set to true before returning. */
+  void
+InputParameter::parse_cfg(const string & in_file)
+{
+  FILE *fp = fopen(in_file.c_str(), "r");
+  char line[5000];
+  char jk[5000];       // receives label text that is matched but never used
+  char temp_var[5000]; // receives the quoted option value
+
+  if(!fp) {
+    cout << in_file << " is missing!\n";
+    exit(-1);
+  }
+
+  // Leading blank skips empty lines/indentation (an empty first line used to spin forever).
+  while(fscanf(fp, " %4999[^\n]", line) == 1) { // width keeps the text inside line[]
+    jk[0] = temp_var[0] = '\0'; // a failed sscanf must not reuse the previous line's tokens
+    if (!strncmp("-size", line, strlen("-size"))) {
+      sscanf(line, "-size %[(:-~)*]%u", jk, &(cache_sz));
+      continue;
+    }
+
+    if (!strncmp("-page size", line, strlen("-page size"))) {
+      sscanf(line, "-page size %[(:-~)*]%u", jk, &(page_sz_bits));
+      continue;
+    }
+
+    if (!strncmp("-burst length", line, strlen("-burst length"))) {
+      sscanf(line, "-burst %[(:-~)*]%u", jk, &(burst_len));
+      continue;
+    }
+
+    if (!strncmp("-internal prefetch width", line, strlen("-internal prefetch width"))) {
+      sscanf(line, "-internal prefetch %[(:-~)*]%u", jk, &(int_prefetch_w));
+      continue;
+    }
+
+    if (!strncmp("-block", line, strlen("-block"))) {
+      sscanf(line, "-block size (bytes) %d", &(line_sz));
+      continue;
+    }
+
+    if (!strncmp("-associativity", line, strlen("-associativity"))) {
+      sscanf(line, "-associativity %d", &(assoc));
+      continue;
+    }
+
+    if (!strncmp("-read-write", line, strlen("-read-write"))) {
+      sscanf(line, "-read-write port %d", &(num_rw_ports));
+      continue;
+    }
+
+    if (!strncmp("-exclusive read", line, strlen("-exclusive read"))) {
+      sscanf(line, "-exclusive read port %d", &(num_rd_ports));
+      continue;
+    }
+
+    if(!strncmp("-exclusive write", line, strlen("-exclusive write"))) {
+      sscanf(line, "-exclusive write port %d", &(num_wr_ports));
+      continue;
+    }
+
+    if (!strncmp("-single ended", line, strlen("-single ended"))) {
+      sscanf(line, "-single %[(:-~)*]%d", jk,
+          &(num_se_rd_ports));
+      continue;
+    }
+
+    if (!strncmp("-search", line, strlen("-search"))) {
+      sscanf(line, "-search port %d", &(num_search_ports));
+      continue;
+    }
+
+    if (!strncmp("-UCA bank", line, strlen("-UCA bank"))) {
+      sscanf(line, "-UCA bank%[((:-~)| )*]%d", jk, &(nbanks));
+      continue;
+    }
+
+    if (!strncmp("-technology", line, strlen("-technology"))) {
+      sscanf(line, "-technology (u) %lf", &(F_sz_um));
+      F_sz_nm = F_sz_um*1000;
+      continue;
+    }
+
+    if (!strncmp("-output/input", line, strlen("-output/input"))) {
+      sscanf(line, "-output/input bus %[(:-~)*]%d", jk, &(out_w));
+      continue;
+    }
+
+    if (!strncmp("-operating temperature", line, strlen("-operating temperature"))) {
+      sscanf(line, "-operating temperature %[(:-~)*]%d", jk, &(temp));
+      continue;
+    }
+
+    if (!strncmp("-cache type", line, strlen("-cache type"))) {
+      sscanf(line, "-cache type%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("cache", temp_var, sizeof("cache"))) { // sizeof counts the NUL: exact match, not prefix
+        is_cache = true;
+      }
+      else
+      {
+        is_cache = false;
+      }
+
+      if (!strncmp("main memory", temp_var, sizeof("main memory"))) {
+        is_main_mem = true;
+      }
+      else {
+        is_main_mem = false;
+      }
+
+      if (!strncmp("cam", temp_var, sizeof("cam"))) {
+        pure_cam = true;
+      }
+      else {
+        pure_cam = false;
+      }
+
+      if (!strncmp("ram", temp_var, sizeof("ram"))) {
+        pure_ram = true;
+      }
+      else {
+        if (!is_main_mem) // "main memory" is also flagged as pure_ram
+          pure_ram = false;
+        else
+          pure_ram = true;
+      }
+
+      continue;
+    }
+
+    if (!strncmp("-tag size", line, strlen("-tag size"))) {
+      sscanf(line, "-tag size%[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("default", temp_var, sizeof("default"))) {
+        specific_tag = false;
+        tag_w = 42; /* the actual value is calculated
+                     * later based on the cache size, bank count, and associativity
+                     */
+      }
+      else {
+        specific_tag = true;
+        sscanf(line, "-tag size (b) %d", &(tag_w));
+      }
+      continue;
+    }
+
+    if (!strncmp("-access mode", line, strlen("-access mode"))) {
+      sscanf(line, "-access %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("fast", temp_var, strlen("fast"))) {
+        access_mode = 2;
+      }
+      else if (!strncmp("sequential", temp_var, strlen("sequential"))) {
+        access_mode = 1;
+      }
+      else if(!strncmp("normal", temp_var, strlen("normal"))) {
+        access_mode = 0;
+      }
+      else {
+        cout << "ERROR: Invalid access mode!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Data array cell type", line, strlen("-Data array cell type"))) {
+      sscanf(line, "-Data array cell type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        data_arr_ram_cell_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        data_arr_ram_cell_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        data_arr_ram_cell_tech_type = 2;
+      }
+      else if(!strncmp("lp-dram", temp_var, strlen("lp-dram"))) {
+        data_arr_ram_cell_tech_type = 3;
+      }
+      else if(!strncmp("comm-dram", temp_var, strlen("comm-dram"))) {
+        data_arr_ram_cell_tech_type = 4;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Data array peripheral type", line, strlen("-Data array peripheral type"))) {
+      sscanf(line, "-Data array peripheral type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        data_arr_peri_global_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        data_arr_peri_global_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        data_arr_peri_global_tech_type = 2;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Tag array cell type", line, strlen("-Tag array cell type"))) {
+      sscanf(line, "-Tag array cell type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        tag_arr_ram_cell_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        tag_arr_ram_cell_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        tag_arr_ram_cell_tech_type = 2;
+      }
+      else if(!strncmp("lp-dram", temp_var, strlen("lp-dram"))) {
+        tag_arr_ram_cell_tech_type = 3;
+      }
+      else if(!strncmp("comm-dram", temp_var, strlen("comm-dram"))) {
+        tag_arr_ram_cell_tech_type = 4;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+
+    if (!strncmp("-Tag array peripheral type", line, strlen("-Tag array peripheral type"))) {
+      sscanf(line, "-Tag array peripheral type %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("itrs-hp", temp_var, strlen("itrs-hp"))) {
+        tag_arr_peri_global_tech_type = 0;
+      }
+      else if(!strncmp("itrs-lstp", temp_var, strlen("itrs-lstp"))) {
+        tag_arr_peri_global_tech_type = 1;
+      }
+      else if(!strncmp("itrs-lop", temp_var, strlen("itrs-lop"))) {
+        tag_arr_peri_global_tech_type = 2;
+      }
+      else {
+        cout << "ERROR: Invalid type!\n";
+        exit(0);
+      }
+      continue;
+    }
+    if(!strncmp("-design", line, strlen("-design"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_wt), &(dynamic_power_wt),
+          &(leakage_power_wt),
+          &(cycle_time_wt), &(area_wt));
+      continue;
+    }
+
+    if(!strncmp("-deviate", line, strlen("-deviate"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_dev), &(dynamic_power_dev),
+          &(leakage_power_dev),
+          &(cycle_time_dev), &(area_dev));
+      continue;
+    }
+
+    if(!strncmp("-Optimize", line, strlen("-Optimize"))) {
+      sscanf(line, "-Optimize  %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if(!strncmp("ED^2", temp_var, strlen("ED^2"))) { // must be tested before its prefix "ED"
+        ed = 2;
+      }
+      else if(!strncmp("ED", temp_var, strlen("ED"))) {
+        ed = 1;
+      }
+      else {
+        ed = 0;
+      }
+    }
+
+    if(!strncmp("-NUCAdesign", line, strlen("-NUCAdesign"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_wt_nuca), &(dynamic_power_wt_nuca),
+          &(leakage_power_wt_nuca),
+          &(cycle_time_wt_nuca), &(area_wt_nuca));
+      continue;
+    }
+
+    if(!strncmp("-NUCAdeviate", line, strlen("-NUCAdeviate"))) {
+      sscanf(line, "-%[((:-~)| |,)*]%d:%d:%d:%d:%d", jk,
+          &(delay_dev_nuca), &(dynamic_power_dev_nuca),
+          &(leakage_power_dev_nuca),
+          &(cycle_time_dev_nuca), &(area_dev_nuca));
+      continue;
+    }
+
+    if(!strncmp("-Cache model", line, strlen("-Cache model"))) {
+      sscanf(line, "-Cache model %[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("UCA", temp_var, strlen("UCA"))) {
+        nuca = 0;
+      }
+      else {
+        nuca = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-NUCA bank", line, strlen("-NUCA bank"))) {
+      sscanf(line, "-NUCA bank count %d", &(nuca_bank_count));
+
+      if (nuca_bank_count != 0) {
+        force_nuca_bank = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Wire inside mat", line, strlen("-Wire inside mat"))) {
+      sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("global", temp_var, strlen("global"))) {
+        wire_is_mat_type = 2;
+        continue;
+      }
+      else if (!strncmp("local", temp_var, strlen("local"))) {
+        wire_is_mat_type = 0;
+        continue;
+      }
+      else {
+        wire_is_mat_type = 1;
+        continue;
+      }
+    }
+
+    if(!strncmp("-Wire outside mat", line, strlen("-Wire outside mat"))) {
+      sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("global", temp_var, strlen("global"))) {
+        wire_os_mat_type = 2;
+      }
+      else {
+        wire_os_mat_type = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Interconnect projection", line, strlen("-Interconnect projection"))) {
+      sscanf(line, "-Interconnect projection%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("aggressive", temp_var, strlen("aggressive"))) {
+        ic_proj_type = 0;
+      }
+      else {
+        ic_proj_type = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Wire signalling", line, strlen("-Wire signalling"))) {
+      sscanf(line, "-Wire%[^\"]\"%[^\"]\"", jk, temp_var);
+
+      if (!strncmp("default", temp_var, strlen("default"))) {
+        force_wiretype = 0;
+        wt = Global;
+      }
+      else if (!(strncmp("Global_10", temp_var, strlen("Global_10")))) {
+        force_wiretype = 1;
+        wt = Global_10;
+      }
+      else if (!(strncmp("Global_20", temp_var, strlen("Global_20")))) {
+        force_wiretype = 1;
+        wt = Global_20;
+      }
+      else if (!(strncmp("Global_30", temp_var, strlen("Global_30")))) {
+        force_wiretype = 1;
+        wt = Global_30;
+      }
+      else if (!(strncmp("Global_5", temp_var, strlen("Global_5")))) {
+        force_wiretype = 1;
+        wt = Global_5;
+      }
+      else if (!(strncmp("Global", temp_var, strlen("Global")))) { // last: prefix of the Global_N names
+        force_wiretype = 1;
+        wt = Global;
+      }
+      else {
+        wt = Low_swing;
+        force_wiretype = 1;
+      }
+      continue;
+    }
+
+    if(!strncmp("-Core", line, strlen("-Core"))) {
+      sscanf(line, "-Core count %d\n", &(cores));
+      if (cores > 16) {
+        printf("No. of cores should be less than 16!\n");
+      }
+      continue;
+    }
+
+    if(!strncmp("-Cache level", line, strlen("-Cache level"))) {
+      sscanf(line, "-Cache l%[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("L2", temp_var, strlen("L2"))) {
+        cache_level = 0;
+      }
+      else {
+        cache_level = 1;
+      }
+    }
+
+    if(!strncmp("-Print level", line, strlen("-Print level"))) {
+      sscanf(line, "-Print l%[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("DETAILED", temp_var, strlen("DETAILED"))) {
+        print_detail = 1;
+      }
+      else {
+        print_detail = 0;
+      }
+    }
+    if(!strncmp("-Add ECC", line, strlen("-Add ECC"))) {
+      sscanf(line, "-Add ECC %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("true", temp_var, strlen("true"))) {
+        add_ecc_b_ = true;
+      }
+      else {
+        add_ecc_b_ = false;
+      }
+    }
+
+    if(!strncmp("-Print input parameters", line, strlen("-Print input parameters"))) {
+      sscanf(line, "-Print input %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("true", temp_var, strlen("true"))) {
+        print_input_args = true;
+      }
+      else {
+        print_input_args = false;
+      }
+    }
+
+    if(!strncmp("-Force cache config", line, strlen("-Force cache config"))) {
+      sscanf(line, "-Force cache %[^\"]\"%[^\"]\"", jk, temp_var);
+      if (!strncmp("true", temp_var, strlen("true"))) {
+        force_cache_config = true;
+      }
+      else {
+        force_cache_config = false;
+      }
+    }
+
+    if(!strncmp("-Ndbl", line, strlen("-Ndbl"))) {
+      sscanf(line, "-Ndbl %d\n", &(ndbl));
+      continue;
+    }
+    if(!strncmp("-Ndwl", line, strlen("-Ndwl"))) {
+      sscanf(line, "-Ndwl %d\n", &(ndwl));
+      continue;
+    }
+    if(!strncmp("-Nspd", line, strlen("-Nspd"))) {
+      sscanf(line, "-Nspd %d\n", &(nspd));
+      continue;
+    }
+    if(!strncmp("-Ndsam1", line, strlen("-Ndsam1"))) {
+      sscanf(line, "-Ndsam1 %d\n", &(ndsam1));
+      continue;
+    }
+    if(!strncmp("-Ndsam2", line, strlen("-Ndsam2"))) {
+      sscanf(line, "-Ndsam2 %d\n", &(ndsam2));
+      continue;
+    }
+   if(!strncmp("-Ndcm", line, strlen("-Ndcm"))) {
+      sscanf(line, "-Ndcm %d\n", &(ndcm));
+      continue;
+    }
+  }
+  rpters_in_htree = true;
+  fclose(fp);
+}
+
+// Prints this object's input parameters to stdout, one "label : value" line each.
+// Every value is read from this object; the forced-partition values appear only when force_cache_config is set.
+  void
+InputParameter::display_ip()
+{
+  cout << "Cache size                    : " << cache_sz << endl;
+  cout << "Block size                    : " << line_sz << endl;
+  cout << "Associativity                 : " << assoc << endl;
+  cout << "Read only ports               : " << num_rd_ports << endl;
+  cout << "Write only ports              : " << num_wr_ports << endl;
+  cout << "Read write ports              : " << num_rw_ports << endl;
+  cout << "Single ended read ports       : " << num_se_rd_ports << endl;
+  if (fully_assoc||pure_cam)
+  {
+    cout << "Search ports                  : " << num_search_ports << endl;
+  }
+  cout << "Cache banks (UCA)             : " << nbanks << endl;
+  cout << "Technology                    : " << F_sz_um << endl;
+  cout << "Temperature                   : " << temp << endl;
+  cout << "Tag size                      : " << tag_w << endl;
+  if (is_cache) {
+    cout << "array type                    : " << "Cache" << endl;
+  }
+  if (pure_ram) {
+    cout << "array type                    : " << "Scratch RAM" << endl;
+  }
+  if (pure_cam)
+  {
+      cout << "array type                    : " << "CAM" << endl;
+  }
+  cout << "Model as memory               : " << is_main_mem << endl;
+  cout << "Access mode                   : " << access_mode << endl;
+  cout << "Data array cell type          : " << data_arr_ram_cell_tech_type << endl;
+  cout << "Data array peripheral type    : " << data_arr_peri_global_tech_type << endl;
+  cout << "Tag array cell type           : " << tag_arr_ram_cell_tech_type << endl;
+  cout << "Tag array peripheral type     : " << tag_arr_peri_global_tech_type << endl;
+  cout << "Optimization target           : " << ed << endl;
+  cout << "Design objective (UCA wt)     : " << delay_wt << " "
+                                                << dynamic_power_wt << " " << leakage_power_wt << " " << cycle_time_wt
+                                                << " " << area_wt << endl;
+  cout << "Design objective (UCA dev)    : " << delay_dev << " "
+                                                << dynamic_power_dev << " " << leakage_power_dev << " " << cycle_time_dev
+                                                << " " << area_dev << endl;
+  if (nuca)
+    {
+    cout << "Cores                         : " << cores << endl;
+    cout << "Design objective (NUCA wt)    : " << delay_wt_nuca << " "
+                                                << dynamic_power_wt_nuca << " " << leakage_power_wt_nuca << " " << cycle_time_wt_nuca
+                                                << " " << area_wt_nuca << endl;
+    cout << "Design objective (NUCA dev)   : " << delay_dev_nuca << " "
+                                                << dynamic_power_dev_nuca << " " << leakage_power_dev_nuca << " " << cycle_time_dev_nuca
+                                       << " " << area_dev_nuca << endl;
+    }
+  cout << "Cache model                   : " << nuca << endl;
+  cout << "Nuca bank                     : " << nuca_bank_count << endl;
+  cout << "Wire inside mat               : " << wire_is_mat_type << endl;
+  cout << "Wire outside mat              : " << wire_os_mat_type << endl;
+  cout << "Interconnect projection       : " << ic_proj_type << endl;
+  cout << "Wire signalling               : " << force_wiretype << endl;
+  cout << "Print level                   : " << print_detail << endl;
+  cout << "ECC overhead                  : " << add_ecc_b_ << endl;
+  cout << "Page size                     : " << page_sz_bits << endl;
+  cout << "Burst length                  : " << burst_len << endl;
+  cout << "Internal prefetch width       : " << int_prefetch_w << endl;
+  cout << "Force cache config            : " << force_cache_config << endl;
+  if (force_cache_config) {
+    cout << "Ndwl                          : " << ndwl << endl;
+    cout << "Ndbl                          : " << ndbl << endl;
+    cout << "Nspd                          : " << nspd << endl;
+    cout << "Ndcm                          : " << ndcm << endl;
+    cout << "Ndsam1                        : " << ndsam1 << endl;
+    cout << "Ndsam2                        : " << ndsam2 << endl;
+  }
+}
+
+
+
+// Adds two powerComponents records field by field and returns the sum;
+// neither operand is modified.
+powerComponents operator+(const powerComponents & x, const powerComponents & y)
+{
+  powerComponents total;
+  total.longer_channel_leakage = x.longer_channel_leakage + y.longer_channel_leakage;
+  total.short_circuit          = x.short_circuit + y.short_circuit;
+  total.gate_leakage           = x.gate_leakage + y.gate_leakage;
+  total.leakage                = x.leakage + y.leakage;
+  total.dynamic                = x.dynamic + y.dynamic;
+  return total;
+}
+
+// Scales each field of x by its own factor taken from y: y[0] dynamic, y[1] leakage and
+// longer-channel leakage, y[2] gate leakage, y[3] short circuit (y[0..3] are read).
+powerComponents operator*(const powerComponents & x, double const * const y)
+{
+  powerComponents scaled;
+  scaled.short_circuit = x.short_circuit*y[3];
+  scaled.gate_leakage  = x.gate_leakage*y[2];
+  scaled.longer_channel_leakage = x.longer_channel_leakage*y[1]; // shares the leakage factor
+  scaled.leakage = x.leakage*y[1];
+  scaled.dynamic = x.dynamic*y[0];
+  return scaled;
+}
+
+
+// Sums two powerDef records; the read, write and search parts are added independently.
+powerDef operator+(const powerDef & x, const powerDef & y)
+{
+  powerDef combined;
+  combined.searchOp = x.searchOp + y.searchOp;
+  combined.writeOp  = x.writeOp + y.writeOp;
+  combined.readOp   = x.readOp + y.readOp;
+  return combined;
+}
+
+// Applies the same factor array y to the read, write and search parts of x.
+powerDef operator*(const powerDef & x, double const * const y)
+{
+  powerDef scaled;
+  scaled.searchOp = x.searchOp*y;
+  scaled.writeOp  = x.writeOp*y;
+  scaled.readOp   = x.readOp*y;
+  return scaled;
+}
+// Config-file entry point: parses infile_name, runs solve() and returns the uca_org_t it filled in.
+uca_org_t cacti_interface(const string & infile_name)
+{
+  // The global g_ip is replaced for the duration of the call and deleted (but not reset) before returning.
+  uca_org_t fin_res;
+  //uca_org_t result;
+  fin_res.valid = false;
+  // Load and check the inputs; a failed error_checking() ends the process with exit status 0.
+  g_ip = new InputParameter();
+  g_ip->parse_cfg(infile_name);
+  if(!g_ip->error_checking())
+	  exit(0);
+  if (g_ip->print_input_args)
+    g_ip->display_ip();
+  // Technology parameters are initialised from the parsed feature size before the Wire object is built.
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+  // Everything between the two "For HighRadix Only" markers is disabled code kept as comments.
+
+//  For HighRadix Only
+//  ////  Wire wirea(g_ip->wt, 1000);
+//  ////  wirea.print_wire();
+//  ////  cout << "Wire Area " << wirea.area.get_area() << " sq. u" << endl;
+//  //  winit.print_wire();
+//  //
+//    HighRadix *hr;
+//      hr = new HighRadix();
+//      hr->compute_power();
+//      hr->print_router();
+//    exit(0);
+//
+//    double sub_switch_sz = 2;
+//    double rows = 32;
+//    for (int i=0; i<6; i++) {
+//      sub_switch_sz = pow(2, i);
+//      rows = 64/sub_switch_sz;
+//      hr = new HighRadix(sub_switch_sz, rows, .8/* freq */, 64, 2, 64, 0.7);
+//      hr->compute_power();
+//      hr->print_router();
+//      delete hr;
+//    }
+//  //  HighRadix yarc;
+//  //  yarc.compute_power();
+//  //  yarc.print_router();
+//    winit.print_wire();
+//    exit(0);
+//  For HighRadix Only End
+  // sim_nuca() runs only when nuca == 1; solve() below runs in either case.
+  if (g_ip->nuca == 1)
+  {
+    Nuca n(&g_tp.peri_global);
+    n.sim_nuca();
+  }
+  g_ip->display_ip(); // unconditional: the inputs are printed a second time when print_input_args is set
+  solve(&fin_res);
+  // NOTE(review): presumably these print the result and append it to a CSV log — confirm in their definitions.
+  output_UCA(&fin_res);
+  output_data_csv(fin_res);
+
+  delete (g_ip);
+  return fin_res;
+}
+
+//cacti6.5's plain interface, please keep !!! (inputs come as arguments instead of a config file)
+uca_org_t cacti_interface(
+    int cache_size,
+    int line_size,
+    int associativity,
+    int rw_ports,
+    int excl_read_ports,
+    int excl_write_ports,
+    int single_ended_read_ports,
+    int banks,
+    double tech_node, // in nm
+    int page_sz,
+    int burst_length,
+    int pre_width,
+    int output_width,
+    int specific_tag,
+    int tag_width,
+    int access_mode, //0 normal, 1 seq, 2 fast
+    int cache, //scratch ram or cache
+    int main_mem,
+    int obj_func_delay,
+    int obj_func_dynamic_power,
+    int obj_func_leakage_power,
+    int obj_func_area,
+    int obj_func_cycle_time,
+    int dev_func_delay,
+    int dev_func_dynamic_power,
+    int dev_func_leakage_power,
+    int dev_func_area,
+    int dev_func_cycle_time,
+    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate; NOTE(review): parse_cfg stores ED as 1, ED^2 as 2, other as 0 — confirm
+    int temp,
+    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+    int data_arr_ram_cell_tech_flavor_in, // 0-4
+    int data_arr_peri_global_tech_flavor_in,
+    int tag_arr_ram_cell_tech_flavor_in,
+    int tag_arr_peri_global_tech_flavor_in,
+    int interconnect_projection_type_in, // 0 - aggressive, 1 - normal
+    int wire_inside_mat_type_in,
+    int wire_outside_mat_type_in,
+    int is_nuca, // 0 - UCA, 1 - NUCA
+    int core_count,
+    int cache_level, // 0 - L2, 1 - L3
+    int nuca_bank_count,
+    int nuca_obj_func_delay,
+    int nuca_obj_func_dynamic_power,
+    int nuca_obj_func_leakage_power,
+    int nuca_obj_func_area,
+    int nuca_obj_func_cycle_time,
+    int nuca_dev_func_delay,
+    int nuca_dev_func_dynamic_power,
+    int nuca_dev_func_leakage_power,
+    int nuca_dev_func_area,
+    int nuca_dev_func_cycle_time,
+    int REPEATERS_IN_HTREE_SEGMENTS_in,//TODO for now only wires with repeaters are supported
+    int p_input)
+{
+  g_ip = new InputParameter();
+  g_ip->add_ecc_b_ = true;
+  // Technology flavours, wire types and the burst/prefetch/page options are copied straight from the arguments.
+  g_ip->data_arr_ram_cell_tech_type    = data_arr_ram_cell_tech_flavor_in;
+  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
+  g_ip->tag_arr_ram_cell_tech_type     = tag_arr_ram_cell_tech_flavor_in;
+  g_ip->tag_arr_peri_global_tech_type  = tag_arr_peri_global_tech_flavor_in;
+
+  g_ip->ic_proj_type     = interconnect_projection_type_in;
+  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
+  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
+  g_ip->burst_len        = burst_length;
+  g_ip->int_prefetch_w   = pre_width;
+  g_ip->page_sz_bits     = page_sz;
+
+  g_ip->cache_sz            = cache_size;
+  g_ip->line_sz             = line_size;
+  g_ip->assoc               = associativity;
+  g_ip->nbanks              = banks;
+  g_ip->out_w               = output_width;
+  g_ip->specific_tag        = specific_tag;
+
+  // A tag_width of 0 selects the placeholder width 42 (the value parse_cfg uses for "default").
+  if (tag_width == 0) {
+    g_ip->tag_w = 42;
+  }
+  else {
+    g_ip->tag_w               = tag_width;
+  }
+
+  g_ip->access_mode         = access_mode;
+  g_ip->delay_wt = obj_func_delay;
+  g_ip->dynamic_power_wt = obj_func_dynamic_power;
+  g_ip->leakage_power_wt = obj_func_leakage_power;
+  g_ip->area_wt = obj_func_area;
+  g_ip->cycle_time_wt    = obj_func_cycle_time;
+  g_ip->delay_dev = dev_func_delay;
+  g_ip->dynamic_power_dev = dev_func_dynamic_power;
+  g_ip->leakage_power_dev = dev_func_leakage_power;
+  g_ip->area_dev = dev_func_area;
+  g_ip->cycle_time_dev    = dev_func_cycle_time;
+  g_ip->ed = ed_ed2_none;
+  // Translate the numeric wire selector into force_wiretype / wt; any value outside 0..6 exits.
+  switch(wt) {
+    case (0):
+      g_ip->force_wiretype = 0;
+      g_ip->wt = Global;
+      break;
+    case (1):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global;
+      break;
+    case (2):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_5;
+      break;
+    case (3):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_10;
+      break;
+    case (4):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_20;
+      break;
+    case (5):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Global_30;
+      break;
+    case (6):
+      g_ip->force_wiretype = 1;
+      g_ip->wt = Low_swing;
+      break;
+    default:
+      cout << "Unknown wire type!\n";
+      exit(0);
+  }
+  // NUCA objective weights and deviations, each taken from its own nuca_* argument.
+  g_ip->delay_wt_nuca = nuca_obj_func_delay;
+  g_ip->dynamic_power_wt_nuca = nuca_obj_func_dynamic_power;
+  g_ip->leakage_power_wt_nuca = nuca_obj_func_leakage_power;
+  g_ip->area_wt_nuca = nuca_obj_func_area;
+  g_ip->cycle_time_wt_nuca    = nuca_obj_func_cycle_time;
+  g_ip->delay_dev_nuca = nuca_dev_func_delay; // the NUCA deviation, not the UCA dev_func_delay
+  g_ip->dynamic_power_dev_nuca = nuca_dev_func_dynamic_power;
+  g_ip->leakage_power_dev_nuca = nuca_dev_func_leakage_power;
+  g_ip->area_dev_nuca = nuca_dev_func_area;
+  g_ip->cycle_time_dev_nuca    = nuca_dev_func_cycle_time;
+  g_ip->nuca = is_nuca;
+  g_ip->nuca_bank_count = nuca_bank_count;
+  if(nuca_bank_count > 0) {
+    g_ip->force_nuca_bank = 1;
+  }
+  g_ip->cores = core_count;
+  g_ip->cache_level = cache_level;
+
+  g_ip->temp = temp;
+
+  g_ip->F_sz_nm         = tech_node;
+  g_ip->F_sz_um         = tech_node / 1000;
+  g_ip->is_main_mem     = (main_mem != 0) ? true : false;
+  g_ip->is_cache        = (cache != 0) ? true : false;
+  g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false;
+
+  g_ip->num_rw_ports    = rw_ports;
+  g_ip->num_rd_ports    = excl_read_ports;
+  g_ip->num_wr_ports    = excl_write_ports;
+  g_ip->num_se_rd_ports = single_ended_read_ports;
+  g_ip->print_detail = 1;
+  g_ip->nuca = 0; // overrides the is_nuca value stored above, so the NUCA branch below never runs
+  // NOTE(review): the next lines overwrite the wt / force_wiretype chosen by the switch above — confirm intended.
+  g_ip->wt = Global_5;
+  g_ip->force_cache_config = false;
+  g_ip->force_wiretype = false;
+  g_ip->print_input_args = p_input;
+
+  // Same flow as the config-file overload, without its unconditional display_ip() and CSV output.
+  uca_org_t fin_res;
+  fin_res.valid = false;
+
+  if (g_ip->error_checking() == false) exit(0);
+  if (g_ip->print_input_args)
+    g_ip->display_ip();
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+
+  if (g_ip->nuca == 1)
+  {
+    Nuca n(&g_tp.peri_global);
+    n.sim_nuca();
+  }
+  solve(&fin_res);
+
+  output_UCA(&fin_res);
+
+  delete (g_ip);
+  return fin_res;
+}
+
+//McPAT's plain interface, please keep !!!
+/**
+ * Flat-argument entry point: configures the global input record (g_ip) from
+ * the arguments, validates it, runs the solver and reports the result.
+ *
+ * Steps, in order: allocate g_ip, copy the arguments into it, call
+ * error_checking() (the process exits with status 0 if that fails),
+ * initialise the technology parameters and wires, then display_ip(),
+ * solve(), output_UCA() and output_data_csv().  g_ip is deleted before
+ * returning.
+ *
+ * The `wt` argument is accepted but not read by this function; the wire type
+ * is taken from `force_wiretype` / `wiretype` instead.  `tag_width` is only
+ * used when `specific_tag` is non-zero, and the ndwl..ndsam2 arguments are
+ * only used when `force_config` is non-zero.
+ *
+ * @return the uca_org_t that was handed to solve().
+ */
+uca_org_t cacti_interface(
+    int cache_size,
+    int line_size,
+    int associativity,
+    int rw_ports,
+    int excl_read_ports,// para5
+    int excl_write_ports,
+    int single_ended_read_ports,
+    int search_ports,
+    int banks,
+    double tech_node,//para10
+    int output_width,
+    int specific_tag,
+    int tag_width,
+    int access_mode,
+    int cache,      //para15
+    int main_mem,
+    int obj_func_delay,
+    int obj_func_dynamic_power,
+    int obj_func_leakage_power,
+    int obj_func_cycle_time, //para20
+    int obj_func_area,
+    int dev_func_delay,
+    int dev_func_dynamic_power,
+    int dev_func_leakage_power,
+    int dev_func_area, //para25
+    int dev_func_cycle_time,
+    int ed_ed2_none, // 0 - ED, 1 - ED^2, 2 - use weight and deviate
+    int temp,
+    int wt, //0 - default(search across everything), 1 - global, 2 - 5% delay penalty, 3 - 10%, 4 - 20 %, 5 - 30%, 6 - low-swing
+    int data_arr_ram_cell_tech_flavor_in,//para30
+    int data_arr_peri_global_tech_flavor_in,
+    int tag_arr_ram_cell_tech_flavor_in,
+    int tag_arr_peri_global_tech_flavor_in,
+    int interconnect_projection_type_in,
+    int wire_inside_mat_type_in,//para35
+    int wire_outside_mat_type_in,
+    int REPEATERS_IN_HTREE_SEGMENTS_in,
+    int VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in,
+    int BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in,
+    int PAGE_SIZE_BITS_in,//para40
+    int BURST_LENGTH_in,
+    int INTERNAL_PREFETCH_WIDTH_in,
+    int force_wiretype,
+    int wiretype,
+    int force_config,//para45
+    int ndwl,
+    int ndbl,
+    int nspd,
+    int ndcm,
+    int ndsam1,//para50
+    int ndsam2,
+    int ecc)
+{
+  g_ip = new InputParameter();
+
+  uca_org_t fin_res;
+  fin_res.valid = false;
+
+  // ---- Capacity and organisation ----
+  g_ip->cache_sz     = cache_size;
+  g_ip->line_sz      = line_size;
+  g_ip->assoc        = associativity;
+  g_ip->nbanks       = banks;
+  g_ip->out_w        = output_width;
+  g_ip->access_mode  = access_mode;
+  g_ip->specific_tag = specific_tag;
+  // When the caller does not request a specific tag, the width is fixed at 42.
+  g_ip->tag_w        = (specific_tag == 0) ? 42 : tag_width;
+
+  // ---- Array kind: `cache` is 0 for plain RAM, 1 for cache, 2 for CAM ----
+  g_ip->is_main_mem = (main_mem != 0);
+  g_ip->is_cache    = (cache == 1);
+  g_ip->pure_ram    = (cache == 0);
+  g_ip->pure_cam    = (cache == 2);
+
+  // ---- Ports ----
+  g_ip->num_rw_ports     = rw_ports;
+  g_ip->num_rd_ports     = excl_read_ports;
+  g_ip->num_wr_ports     = excl_write_ports;
+  g_ip->num_se_rd_ports  = single_ended_read_ports;
+  g_ip->num_search_ports = search_ports;
+
+  // ---- Technology node, temperature and device flavours ----
+  g_ip->F_sz_nm = tech_node;
+  g_ip->F_sz_um = tech_node / 1000;
+  g_ip->temp    = temp;
+
+  g_ip->data_arr_ram_cell_tech_type    = data_arr_ram_cell_tech_flavor_in;
+  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
+  g_ip->tag_arr_ram_cell_tech_type     = tag_arr_ram_cell_tech_flavor_in;
+  g_ip->tag_arr_peri_global_tech_type  = tag_arr_peri_global_tech_flavor_in;
+
+  // ---- Interconnect and H-tree options ----
+  g_ip->ic_proj_type     = interconnect_projection_type_in;
+  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
+  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
+
+  g_ip->rpters_in_htree                    = (REPEATERS_IN_HTREE_SEGMENTS_in != 0);
+  g_ip->ver_htree_wires_over_array         = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in;
+  g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in;
+
+  // ---- Burst / prefetch / page geometry ----
+  g_ip->burst_len      = BURST_LENGTH_in;
+  g_ip->int_prefetch_w = INTERNAL_PREFETCH_WIDTH_in;
+  g_ip->page_sz_bits   = PAGE_SIZE_BITS_in;
+
+  // ---- Objective-function weights and allowed deviations ----
+  g_ip->delay_wt         = obj_func_delay;
+  g_ip->dynamic_power_wt = obj_func_dynamic_power;
+  g_ip->leakage_power_wt = obj_func_leakage_power;
+  g_ip->area_wt          = obj_func_area;
+  g_ip->cycle_time_wt    = obj_func_cycle_time;
+
+  g_ip->delay_dev         = dev_func_delay;
+  g_ip->dynamic_power_dev = dev_func_dynamic_power;
+  g_ip->leakage_power_dev = dev_func_leakage_power;
+  g_ip->area_dev          = dev_func_area;
+  g_ip->cycle_time_dev    = dev_func_cycle_time;
+
+  g_ip->ed = ed_ed2_none;
+
+  // This interface always prints details and never models NUCA.
+  g_ip->print_detail = 1;
+  g_ip->nuca         = 0;
+
+  // ---- Wire type ----
+  if (force_wiretype != 0)
+  {
+    g_ip->force_wiretype = true;
+    // `wiretype` is the delay penalty in percent; 0 selects low-swing wires.
+    // Any other value leaves wt as the constructor set it.
+    switch (wiretype)
+    {
+      case 0:
+        g_ip->wt = Low_swing;
+        break;
+      case 5:
+        g_ip->wt = Global_5;
+        break;
+      case 10:
+        g_ip->wt = Global_10;
+        break;
+      case 20:
+        g_ip->wt = Global_20;
+        break;
+      case 30:
+        g_ip->wt = Global_30;
+        break;
+      default:
+        break;
+    }
+  }
+  else
+  {
+    g_ip->wt = Global;
+    g_ip->force_wiretype = false;
+  }
+
+  // ---- Optional caller-forced array partitioning ----
+  if (force_config != 0)
+  {
+    g_ip->force_cache_config = true;
+    g_ip->ndbl   = ndbl;
+    g_ip->ndwl   = ndwl;
+    g_ip->nspd   = nspd;
+    g_ip->ndcm   = ndcm;
+    g_ip->ndsam1 = ndsam1;
+    g_ip->ndsam2 = ndsam2;
+  }
+  else
+  {
+    g_ip->force_cache_config = false;
+  }
+
+  g_ip->add_ecc_b_ = (ecc != 0);
+
+  // Abort the whole process when the configuration is rejected.
+  if (g_ip->error_checking() == false)
+  {
+    exit(0);
+  }
+
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+
+  g_ip->display_ip();
+  solve(&fin_res);
+  output_UCA(&fin_res);
+  output_data_csv(fin_res);
+  delete (g_ip);
+
+  return fin_res;
+}
+
+
+/**
+ * Default constructor: puts every member into a known "unset" state
+ * (numeric members 0, flags false, wire type Invalid_wtype).
+ */
+InputParameter::InputParameter()
+{
+  // ---- Basic organisation ----
+  cache_sz     = 0;      // in bytes
+  line_sz      = 0;
+  assoc        = 0;
+  nbanks       = 0;
+  out_w        = 0;      // == nr_bits_out
+  specific_tag = false;
+  tag_w        = 0;
+  access_mode  = 0;
+
+  // ---- Objective-function selectors ----
+  obj_func_dyn_energy = 0;
+  obj_func_dyn_power  = 0;
+  obj_func_leak_power = 0;
+  obj_func_cycle_t    = 0;
+
+  // ---- Technology and operating point ----
+  F_sz_nm = 0;           // feature size in nm
+  F_sz_um = 0;           // feature size in um
+  temp    = 0;
+
+  // ---- Ports ----
+  num_rw_ports     = 0;
+  num_rd_ports     = 0;
+  num_wr_ports     = 0;
+  num_se_rd_ports  = 0;  // number of single ended read ports
+  num_search_ports = 0;  // Sheng: number of search ports for CAM
+
+  // ---- Array kind ----
+  is_main_mem = false;
+  is_cache    = false;
+  pure_ram    = false;
+  pure_cam    = false;
+
+  // ---- H-tree options ----
+  rpters_in_htree                    = false;  // if there are repeaters in htree segment
+  ver_htree_wires_over_array         = 0;
+  broadcast_addr_din_over_ver_htrees = 0;
+
+  // ---- Device flavours ----
+  ram_cell_tech_type             = 0;
+  peri_global_tech_type          = 0;
+  data_arr_ram_cell_tech_type    = 0;
+  data_arr_peri_global_tech_type = 0;
+  tag_arr_ram_cell_tech_type     = 0;
+  tag_arr_peri_global_tech_type  = 0;
+
+  // ---- Burst / prefetch / page geometry ----
+  burst_len      = 0;
+  int_prefetch_w = 0;
+  page_sz_bits   = 0;
+
+  // ---- Interconnect ----
+  ic_proj_type     = 0;  // interconnect_projection_type
+  wire_is_mat_type = 0;  // wire_inside_mat_type
+  wire_os_mat_type = 0;  // wire_outside_mat_type
+  wt               = Invalid_wtype;
+  force_wiretype   = 0;
+
+  print_input_args = false;
+  nuca_cache_sz    = 0;  // TODO
+
+  // ---- Forced array partitioning ----
+  ndwl   = 0;
+  ndbl   = 0;
+  nspd   = 0;
+  ndcm   = 0;
+  ndsam1 = 0;
+  ndsam2 = 0;
+  force_cache_config = false;
+
+  // ---- NUCA / system-level settings ----
+  cache_level     = 0;
+  cores           = 0;
+  nuca_bank_count = 0;
+  force_nuca_bank = 0;
+  nuca            = 0;
+
+  // ---- Optimisation weights (UCA, then NUCA) ----
+  delay_wt         = 0;
+  dynamic_power_wt = 0;
+  leakage_power_wt = 0;
+  cycle_time_wt    = 0;
+  area_wt          = 0;
+
+  delay_wt_nuca         = 0;
+  dynamic_power_wt_nuca = 0;
+  leakage_power_wt_nuca = 0;
+  cycle_time_wt_nuca    = 0;
+  area_wt_nuca          = 0;
+
+  // ---- Allowed deviations (UCA, then NUCA) ----
+  delay_dev         = 0;
+  dynamic_power_dev = 0;
+  leakage_power_dev = 0;
+  cycle_time_dev    = 0;
+  area_dev          = 0;
+
+  delay_dev_nuca         = 0;
+  dynamic_power_dev_nuca = 0;
+  leakage_power_dev_nuca = 0;
+  cycle_time_dev_nuca    = 0;
+  area_dev_nuca          = 0;
+
+  ed = 0;                // ED or ED2 optimization
+
+  // ---- Fields assigned by error_checking() ----
+  fast_access = false;
+  block_sz    = 0;       // bytes
+  tag_assoc   = 0;
+  data_assoc  = 0;
+  is_seq_acc  = false;
+  fully_assoc = false;
+  nsets       = 0;       // == number_of_sets
+
+  print_detail = 0;
+  add_ecc_b_   = false;
+
+  // ---- Parameters for design constraint ----
+  throughput       = 0;
+  latency          = 0;
+  pipelinable      = false;
+  pipeline_stages  = 0;
+  per_stage_vector = 0;
+  with_clock_grid  = false;
+}
+/**
+ * Validates the configuration stored in this object and fills in the
+ * derived fields.
+ *
+ * Members written along the way: fast_access, fully_assoc, block_sz,
+ * tag_assoc, data_assoc, is_seq_acc, nsets and the port counts
+ * (num_rw_ports, num_rd_ports, num_wr_ports, num_se_rd_ports,
+ * num_search_ports).  When a check fails a diagnostic is written to cerr
+ * and false is returned; members assigned before the failing check keep
+ * their new values.
+ *
+ * @return true when every check passes, false otherwise.
+ */
+bool InputParameter::error_checking()
+{
+  int  A;
+  bool seq_access  = false;
+  fast_access = true;
+
+  // access_mode 0: neither sequential nor fast, 1: sequential, 2: fast.
+  // Any other value keeps the defaults set above (non-sequential, fast).
+  switch (access_mode)
+  {
+    case 0:
+      seq_access  = false;
+      fast_access = false;
+      break;
+    case 1:
+      seq_access  = true;
+      fast_access = false;
+      break;
+    case 2:
+      seq_access  = false;
+      fast_access = true;
+      break;
+  }
+
+  if(is_main_mem)
+  {
+    if(ic_proj_type == 0)
+    {
+      cerr << "DRAM model supports only conservative interconnect projection!\n\n";
+      return false;
+    }
+  }
+
+
+  uint32_t B = line_sz;
+
+  if (B < 1)
+  {
+    cerr << "Block size must >= 1" << endl;
+    return false;
+  }
+  else if (B*8 < out_w)
+  {
+    cerr << "Block size must be at least " << out_w/8 << endl;
+    return false;
+  }
+
+  if (F_sz_um <= 0)
+  {
+    cerr << "Feature size must be > 0" << endl;
+    return false;
+  }
+  else if (F_sz_um > 0.091)
+  {
+    cerr << "Feature size must be <= 90 nm" << endl;
+    return false;
+  }
+
+
+  uint32_t RWP  = num_rw_ports;
+  uint32_t ERP  = num_rd_ports;
+  uint32_t EWP  = num_wr_ports;
+  uint32_t NSER = num_se_rd_ports;
+  uint32_t SCHP = num_search_ports;
+
+//TODO: revisit this. This is an important feature. Sheng thought this should be used
+//  // If multiple banks and multiple ports are specified, then if number of ports is less than or equal to
+//  // the number of banks, we assume that the multiple ports are implemented via the multiple banks.
+//  // In such a case we assume that each bank has 1 RWP port.
+//  if ((RWP + ERP + EWP) <= nbanks && nbanks>1)
+//  {
+//    RWP  = 1;
+//    ERP  = 0;
+//    EWP  = 0;
+//    NSER = 0;
+//  }
+//  else if ((RWP < 0) || (EWP < 0) || (ERP < 0))
+//  {
+//    cerr << "Ports must >=0" << endl;
+//    return false;
+//  }
+//  else if (RWP > 2)
+//  {
+//    cerr << "Maximum of 2 read/write ports" << endl;
+//    return false;
+//  }
+//  else if ((RWP+ERP+EWP) < 1)
+  // Changed to new implementation:
+  // The number of ports specified at input is per bank
+  if ((RWP+ERP+EWP) < 1)
+  {
+    cerr << "Must have at least one port" << endl;
+    return false;
+  }
+
+  // NOTE(review): this check is relied on to reject nbanks == 0 before the
+  // division below -- confirm against is_pow2().
+  if (is_pow2(nbanks) == false)
+  {
+    cerr << "Number of subbanks should be greater than or equal to 1 and should be a power of 2" << endl;
+    return false;
+  }
+
+  // C is the capacity of a single bank in bytes.
+  int C = cache_sz/nbanks;
+  if (C < 64)
+  {
+    cerr << "Cache size must >=64" << endl;
+    return false;
+  }
+
+//TODO: revisit this
+//   if (pure_ram==true && assoc!=1)
+//    {
+//  	  cerr << "Pure RAM must have assoc as 1" << endl;
+//  	  return false;
+//    }
+
+  // Fully associative and CAM checks.  An associativity of 0 on a cache
+  // means "fully associative".
+  if (is_cache && assoc==0)
+    fully_assoc = true;
+  else
+    fully_assoc = false;
+
+  if (pure_cam==true && assoc!=0)
+  {
+    cerr << "Pure CAM must have associativity as 0" << endl;
+    return false;
+  }
+
+  if (assoc==0 && (pure_cam==false && is_cache ==false))
+  {
+    cerr << "Only CAM or Fully associative cache can have associativity as 0" << endl;
+    return false;
+  }
+
+  if ((fully_assoc==true || pure_cam==true)
+      &&  (data_arr_ram_cell_tech_type!= tag_arr_ram_cell_tech_type
+           || data_arr_peri_global_tech_type != tag_arr_peri_global_tech_type  ))
+  {
+    cerr << "CAM and fully associative cache must have same device type for both data and tag array" << endl;
+    return false;
+  }
+
+  if ((fully_assoc==true || pure_cam==true)
+      &&  (data_arr_ram_cell_tech_type== lp_dram || data_arr_ram_cell_tech_type== comm_dram))
+  {
+    cerr << "DRAM based CAM and fully associative cache are not supported" << endl;
+    return false;
+  }
+
+  if ((fully_assoc==true || pure_cam==true)
+      &&  (is_main_mem==true))
+  {
+    cerr << "CAM and fully associative cache cannot be as main memory" << endl;
+    return false;
+  }
+
+  if ((fully_assoc || pure_cam) && SCHP<1)
+  {
+    cerr << "CAM and fully associative must have at least 1 search port" << endl;
+    return false;
+  }
+
+  // A CAM / fully associative array that declares only search ports gets
+  // that many exclusive read ports.
+  if (RWP==0 && ERP==0 && SCHP>0 && ((fully_assoc || pure_cam)))
+  {
+    ERP=SCHP;
+  }
+
+//    if ((!(fully_assoc || pure_cam)) && SCHP>=1)
+//    {
+//	  cerr << "None CAM and fully associative cannot have search ports" << endl;
+//	  return false;
+//    }
+
+  // A is the effective associativity: blocks per bank when assoc == 0,
+  // otherwise the requested (power-of-two) associativity.
+  if (assoc == 0)
+  {
+    A = C/B;
+    // When a bank holds less than one block A would be 0, and the set-count
+    // divisions below would divide by zero.  Reject that configuration.
+    if (A < 1)
+    {
+      cerr << "Cache size per bank must be at least one block for CAM or fully associative cache" << endl;
+      return false;
+    }
+  }
+  else
+  {
+    if (assoc == 1)
+    {
+      A = 1;
+    }
+    else
+    {
+      A = assoc;
+      if (is_pow2(A) == false)
+      {
+        cerr << "Associativity must be a power of 2" << endl;
+        return false;
+      }
+    }
+  }
+
+  // Bytes per set.  Formed in 64 bits so that B*A cannot wrap around to 0;
+  // B >= 1 was checked above and A >= 1 is expected at this point.
+  const uint64_t set_bytes = static_cast<uint64_t>(B) * static_cast<uint64_t>(A);
+
+  if (assoc != 0 && C/set_bytes <= 1)
+  {
+    cerr << "Number of sets is too small: " << endl;
+    cerr << " Need to either increase cache size, or decrease associativity or block size" << endl;
+    cerr << " (or use fully associative cache)" << endl;
+    return false;
+  }
+
+  block_sz = B;
+
+  /*dt: testing sequential access mode*/
+  if(seq_access)
+  {
+    tag_assoc  = A;
+    data_assoc = 1;
+    is_seq_acc = true;
+  }
+  else
+  {
+    tag_assoc  = A;
+    data_assoc = A;
+    is_seq_acc = false;
+  }
+
+  if (assoc==0)
+  {
+    data_assoc = 1;
+  }
+  num_rw_ports     = RWP;
+  num_rd_ports     = ERP;
+  num_wr_ports     = EWP;
+  num_se_rd_ports  = NSER;
+  // Search ports are only kept for CAMs and fully associative caches.
+  if (!(fully_assoc || pure_cam))
+    num_search_ports = 0;
+  nsets            = static_cast<uint32_t>(C/set_bytes);
+
+  if (temp < 300 || temp > 400 || temp%10 != 0)
+  {
+    cerr << temp << " Temperature must be between 300 and 400 Kelvin and multiple of 10." << endl;
+    return false;
+  }
+
+  if (nsets < 1)
+  {
+    cerr << "Less than one set..." << endl;
+    return false;
+  }
+
+  return true;
+}
+
+
+
+/**
+ * Appends one comma-separated result row to "out.csv" in the current
+ * directory.
+ *
+ * The file is first opened for reading; if that fails, a header row is
+ * written before the data row.  Values come from the global g_ip and from
+ * fin_res (including fin_res.data_array2 and, for set-associative caches,
+ * fin_res.tag_array2).  If the file cannot be opened for appending, a
+ * message is written to cerr and nothing is emitted.
+ *
+ * Only the columns listed in column_titles are produced; per-network
+ * delays, DRAM timing/energy figures and wire parameters are not emitted.
+ */
+void output_data_csv(const uca_org_t & fin_res)
+{
+  //TODO: the csv output should remain
+  fstream file("out.csv", ios::in);
+  const bool need_header = file.fail();
+  file.close();
+
+  file.open("out.csv", ios::out|ios::app);
+  if (file.fail())
+  {
+    cerr << "File out.csv could not be opened successfully" << endl;
+    file.close();
+    return;
+  }
+
+  if (need_header)
+  {
+    // Each entry already carries its trailing ", " separator.
+    static const char * const column_titles[] = {
+      "Tech node (nm), ",
+      "Capacity (bytes), ",
+      "Number of banks, ",
+      "Associativity, ",
+      "Output width (bits), ",
+      "Access time (ns), ",
+      "Random cycle time (ns), ",
+      "Dynamic search energy (nJ), ",
+      "Dynamic read energy (nJ), ",
+      "Dynamic write energy (nJ), ",
+      "Standby leakage per bank(mW), ",
+      "Area (mm2), ",
+      "Ndwl, ",
+      "Ndbl, ",
+      "Nspd, ",
+      "Ndcm, ",
+      "Ndsam_level_1, ",
+      "Ndsam_level_2, ",
+      "Data arrary area efficiency %, ",
+      "Ntwl, ",
+      "Ntbl, ",
+      "Ntspd, ",
+      "Ntcm, ",
+      "Ntsam_level_1, ",
+      "Ntsam_level_2, ",
+      "Tag arrary area efficiency %, "
+    };
+    const unsigned title_count = sizeof(column_titles) / sizeof(column_titles[0]);
+    for (unsigned idx = 0; idx < title_count; ++idx)
+    {
+      file << column_titles[idx];
+    }
+    file << endl;
+  }
+
+  // Configuration and top-level timing.
+  file << g_ip->F_sz_nm << ", "
+       << g_ip->cache_sz << ", "
+       << g_ip->nbanks << ", "
+       << g_ip->tag_assoc << ", "
+       << g_ip->out_w << ", "
+       << fin_res.access_time*1e+9 << ", "
+       << fin_res.cycle_time*1e+9 << ", ";
+
+  // Search energy is reported only for CAMs and fully associative caches.
+  if (!g_ip->fully_assoc && !g_ip->pure_cam)
+  {
+    file << "N/A" << ", ";
+  }
+  else
+  {
+    file << fin_res.power.searchOp.dynamic*1e+9 << ", ";
+  }
+
+  file << fin_res.power.readOp.dynamic*1e+9 << ", "
+       << fin_res.power.writeOp.dynamic*1e+9 << ", "
+       << ( fin_res.power.readOp.leakage + fin_res.power.readOp.gate_leakage )*1000 << ", "
+       << fin_res.area*1e-6 << ", ";
+
+  // Data array organisation.
+  file << fin_res.data_array2->Ndwl << ", "
+       << fin_res.data_array2->Ndbl << ", "
+       << fin_res.data_array2->Nspd << ", "
+       << fin_res.data_array2->deg_bl_muxing << ", "
+       << fin_res.data_array2->Ndsam_lev_1 << ", "
+       << fin_res.data_array2->Ndsam_lev_2 << ", "
+       << fin_res.data_array2->area_efficiency << ", ";
+
+  // Tag array organisation; filled with "N/A" for fully associative caches,
+  // CAMs and plain RAMs.
+  const bool no_tag_columns = g_ip->fully_assoc || g_ip->pure_cam || g_ip->pure_ram;
+  if (no_tag_columns)
+  {
+    for (int col = 0; col < 7; ++col)
+    {
+      file << "N/A" << ", ";
+    }
+  }
+  else
+  {
+    file << fin_res.tag_array2->Ndwl << ", "
+         << fin_res.tag_array2->Ndbl << ", "
+         << fin_res.tag_array2->Nspd << ", "
+         << fin_res.tag_array2->deg_bl_muxing << ", "
+         << fin_res.tag_array2->Ndsam_lev_1 << ", "
+         << fin_res.tag_array2->Ndsam_lev_2 << ", "
+         << fin_res.tag_array2->area_efficiency << ", ";
+  }
+
+  file << endl;
+  file.close();
+}
+
+
+
+void output_UCA(uca_org_t *fr)
+{
+  //    if (NUCA)
+  if (0) {
+    cout << "\n\n Detailed Bank Stats:\n";
+    cout << "    Bank Size (bytes): %d\n" <<
+                                     (int) (g_ip->cache_sz);
+  }
+  else {
+    if (g_ip->data_arr_ram_cell_tech_type == 3) {
+      cout << "\n---------- CACTI version 6.5, Uniform Cache Access " <<
+        "Logic Process Based DRAM Model ----------\n";
+    }
+    else if (g_ip->data_arr_ram_cell_tech_type == 4) {
+      cout << "\n---------- CACTI version 6.5, Uniform" <<
+        "Cache Access Commodity DRAM Model ----------\n";
+    }
+    else {
+      cout << "\n---------- CACTI version 6.5, Uniform Cache Access "
+        "SRAM Model ----------\n";
+    }
+    cout << "\nCache Parameters:\n";
+    cout << "    Total cache size (bytes): " <<
+      (int) (g_ip->cache_sz) << endl;
+  }
+
+  cout << "    Number of banks: " << (int) g_ip->nbanks << endl;
+  if (g_ip->fully_assoc|| g_ip->pure_cam)
+    cout << "    Associativity: fully associative\n";
+  else {
+    if (g_ip->tag_assoc == 1)
+      cout << "    Associativity: direct mapped\n";
+    else
+      cout << "    Associativity: " <<
+        g_ip->tag_assoc << endl;
+  }
+
+
+  cout << "    Block size (bytes): " << g_ip->line_sz << endl;
+  cout << "    Read/write Ports: " <<
+    g_ip->num_rw_ports << endl;
+  cout << "    Read ports: " <<
+    g_ip->num_rd_ports << endl;
+  cout << "    Write ports: " <<
+    g_ip->num_wr_ports << endl;
+  if (g_ip->fully_assoc|| g_ip->pure_cam)
+	  cout << "    search ports: " <<
+	      g_ip->num_search_ports << endl;
+  cout << "    Technology size (nm): " <<
+    g_ip->F_sz_nm << endl << endl;
+
+  cout << "    Access time (ns): " << fr->access_time*1e9 << endl;
+  cout << "    Cycle time (ns):  " << fr->cycle_time*1e9 << endl;
+  if (g_ip->data_arr_ram_cell_tech_type >= 4) {
+    cout << "    Precharge Delay (ns): " << fr->data_array2->precharge_delay*1e9 << endl;
+    cout << "    Activate Energy (nJ): " << fr->data_array2->activate_energy*1e9 << endl;
+    cout << "    Read Energy (nJ): " << fr->data_array2->read_energy*1e9 << endl;
+    cout << "    Write Energy (nJ): " << fr->data_array2->write_energy*1e9 << endl;
+    cout << "    Precharge Energy (nJ): " << fr->data_array2->precharge_energy*1e9 << endl;
+    cout << "    Leakage Power Closed Page (mW): " << fr->data_array2->leak_power_subbank_closed_page*1e3 << endl;
+    cout << "    Leakage Power Open Page (mW): " << fr->data_array2->leak_power_subbank_open_page*1e3 << endl;
+    cout << "    Leakage Power I/O (mW): " << fr->data_array2->leak_power_request_and_reply_networks*1e3 << endl;
+    cout << "    Refresh power (mW): " <<
+      fr->data_array2->refresh_power*1e3 << endl;
+  }
+  else {
+	  if ((g_ip->fully_assoc|| g_ip->pure_cam))
+	  {
+		  cout << "    Total dynamic associative search energy per access (nJ): " <<
+		  fr->power.searchOp.dynamic*1e9 << endl;
+//		  cout << "    Total dynamic read energy per access (nJ): " <<
+//		  fr->power.readOp.dynamic*1e9 << endl;
+//		  cout << "    Total dynamic write energy per access (nJ): " <<
+//		  fr->power.writeOp.dynamic*1e9 << endl;
+	  }
+//	  else
+//	  {
+		  cout << "    Total dynamic read energy per access (nJ): " <<
+		  fr->power.readOp.dynamic*1e9 << endl;
+		  cout << "    Total dynamic write energy per access (nJ): " <<
+		  fr->power.writeOp.dynamic*1e9 << endl;
+//	  }
+	  cout << "    Total leakage power of a bank"
+	  " (mW): " << fr->power.readOp.leakage*1e3 << endl;
+	  cout << "    Total gate leakage power of a bank"
+	  " (mW): " << fr->power.readOp.gate_leakage*1e3 << endl;
+  }
+
+  if (g_ip->data_arr_ram_cell_tech_type ==3 || g_ip->data_arr_ram_cell_tech_type ==4)
+  {
+  }
+  cout <<  "    Cache height x width (mm): " <<
+    fr->cache_ht*1e-3 << " x " << fr->cache_len*1e-3 << endl << endl;
+
+
+  cout << "    Best Ndwl : " << fr->data_array2->Ndwl << endl;
+  cout << "    Best Ndbl : " << fr->data_array2->Ndbl << endl;
+  cout << "    Best Nspd : " << fr->data_array2->Nspd << endl;
+  cout << "    Best Ndcm : " << fr->data_array2->deg_bl_muxing << endl;
+  cout << "    Best Ndsam L1 : " << fr->data_array2->Ndsam_lev_1 << endl;
+  cout << "    Best Ndsam L2 : " << fr->data_array2->Ndsam_lev_2 << endl << endl;
+
+  if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+  {
+    cout << "    Best Ntwl : " << fr->tag_array2->Ndwl << endl;
+    cout << "    Best Ntbl : " << fr->tag_array2->Ndbl << endl;
+    cout << "    Best Ntspd : " << fr->tag_array2->Nspd << endl;
+    cout << "    Best Ntcm : " << fr->tag_array2->deg_bl_muxing << endl;
+    cout << "    Best Ntsam L1 : " << fr->tag_array2->Ndsam_lev_1 << endl;
+    cout << "    Best Ntsam L2 : " << fr->tag_array2->Ndsam_lev_2 << endl;
+  }
+
+  switch (fr->data_array2->wt) {
+    case (0):
+      cout <<  "    Data array, H-tree wire type: Delay optimized global wires\n";
+      break;
+    case (1):
+      cout <<  "    Data array, H-tree wire type: Global wires with 5\% delay penalty\n";
+      break;
+    case (2):
+      cout <<  "    Data array, H-tree wire type: Global wires with 10\% delay penalty\n";
+      break;
+    case (3):
+      cout <<  "    Data array, H-tree wire type: Global wires with 20\% delay penalty\n";
+      break;
+    case (4):
+      cout <<  "    Data array, H-tree wire type: Global wires with 30\% delay penalty\n";
+      break;
+    case (5):
+      cout <<  "    Data array, wire type: Low swing wires\n";
+      break;
+    default:
+      cout << "ERROR - Unknown wire type " << (int) fr->data_array2->wt <<endl;
+      exit(0);
+  }
+
+  if (!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) {
+    switch (fr->tag_array2->wt) {
+      case (0):
+        cout <<  "    Tag array, H-tree wire type: Delay optimized global wires\n";
+        break;
+      case (1):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 5\% delay penalty\n";
+        break;
+      case (2):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 10\% delay penalty\n";
+        break;
+      case (3):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 20\% delay penalty\n";
+        break;
+      case (4):
+        cout <<  "    Tag array, H-tree wire type: Global wires with 30\% delay penalty\n";
+        break;
+      case (5):
+        cout <<  "    Tag array, wire type: Low swing wires\n";
+        break;
+      default:
+        cout << "ERROR - Unknown wire type " << (int) fr->tag_array2->wt <<endl;
+        exit(-1);
+    }
+  }
+
+  if (g_ip->print_detail)
+  {
+    //if(g_ip->fully_assoc) return;
+
+    /* Delay stats */
+    /* data array stats */
+    cout << endl << "Time Components:" << endl << endl;
+
+    cout << "  Data side (with Output driver) (ns): " <<
+      fr->data_array2->access_time/1e-9 << endl;
+
+    cout <<  "\tH-tree input delay (ns): " <<
+      fr->data_array2->delay_route_to_bank * 1e9 +
+      fr->data_array2->delay_input_htree * 1e9 << endl;
+
+    if (!(g_ip->pure_cam || g_ip->fully_assoc))
+    {
+      cout <<  "\tDecoder + wordline delay (ns): " <<
+        fr->data_array2->delay_row_predecode_driver_and_block * 1e9 +
+        fr->data_array2->delay_row_decoder * 1e9 << endl;
+    }
+    else
+    {
+        cout <<  "\tCAM search delay (ns): " <<
+          fr->data_array2->delay_matchlines * 1e9 << endl;
+    }
+
+    cout <<  "\tBitline delay (ns): " <<
+      fr->data_array2->delay_bitlines/1e-9 << endl;
+
+    cout <<  "\tSense Amplifier delay (ns): " <<
+      fr->data_array2->delay_sense_amp * 1e9 << endl;
+
+
+    cout <<  "\tH-tree output delay (ns): " <<
+      fr->data_array2->delay_subarray_output_driver * 1e9 +
+      fr->data_array2->delay_dout_htree * 1e9 << endl;
+
+    if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+    {
+      /* tag array stats */
+      cout << endl << "  Tag side (with Output driver) (ns): " <<
+        fr->tag_array2->access_time/1e-9 << endl;
+
+      cout <<  "\tH-tree input delay (ns): " <<
+        fr->tag_array2->delay_route_to_bank * 1e9 +
+        fr->tag_array2->delay_input_htree * 1e9 << endl;
+
+      cout <<  "\tDecoder + wordline delay (ns): " <<
+        fr->tag_array2->delay_row_predecode_driver_and_block * 1e9 +
+        fr->tag_array2->delay_row_decoder * 1e9 << endl;
+
+      cout <<  "\tBitline delay (ns): " <<
+        fr->tag_array2->delay_bitlines/1e-9 << endl;
+
+      cout <<  "\tSense Amplifier delay (ns): " <<
+        fr->tag_array2->delay_sense_amp * 1e9 << endl;
+
+      cout <<  "\tComparator delay (ns): " <<
+        fr->tag_array2->delay_comparator * 1e9 << endl;
+
+      cout <<  "\tH-tree output delay (ns): " <<
+        fr->tag_array2->delay_subarray_output_driver * 1e9 +
+        fr->tag_array2->delay_dout_htree * 1e9 << endl;
+    }
+
+
+
+    /* Energy/Power stats */
+    cout << endl << endl << "Power Components:" << endl << endl;
+
+    if (!(g_ip->pure_cam || g_ip->fully_assoc))
+    {
+    	cout << "  Data array: Total dynamic read energy/access  (nJ): " <<
+    	      fr->data_array2->power.readOp.dynamic * 1e9 << endl;
+    	cout << "\tTotal leakage read/write power of a bank (mW): " <<
+    	        fr->data_array2->power.readOp.leakage * 1e3 << endl;
+
+    	cout << "\tTotal energy in H-tree (that includes both "
+    	      "address and data transfer) (nJ): " <<
+    	        (fr->data_array2->power_addr_input_htree.readOp.dynamic +
+    	         fr->data_array2->power_data_output_htree.readOp.dynamic +
+    	         fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+
+    	cout << "\tTotal leakage power in H-tree (that includes both "
+    	      "address and data network) ((mW)): " <<
+    	        (fr->data_array2->power_addr_input_htree.readOp.leakage +
+    	         fr->data_array2->power_data_output_htree.readOp.leakage +
+    	         fr->data_array2->power_routing_to_bank.readOp.leakage) * 1e3 << endl;
+
+    	cout << "\tTotal gate leakage power in H-tree (that includes both "
+    	      "address and data network) ((mW)): " <<
+    	        (fr->data_array2->power_addr_input_htree.readOp.gate_leakage +
+    	         fr->data_array2->power_data_output_htree.readOp.gate_leakage +
+    	         fr->data_array2->power_routing_to_bank.readOp.gate_leakage) * 1e3 << endl;
+
+    	cout << "\tOutput Htree inside bank Energy (nJ): " <<
+    	   fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tDecoder (nJ): " <<
+    	   fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tWordline (nJ): " <<
+    	   fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tBitline mux & associated drivers (nJ): " <<
+    	   fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+    	   fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+    	   fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+    	   fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+
+    	cout <<  "\tBitlines precharge and equalization circuit (nJ): " <<
+    	    	   fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tBitlines (nJ): " <<
+    	   fr->data_array2->power_bitlines.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tSense amplifier energy (nJ): " <<
+    	   fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+    	cout <<  "\tSub-array output driver (nJ): " <<
+    	   fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+    }
+
+        else if (g_ip->pure_cam)
+        {
+
+           	cout << "  CAM array:"<<endl;
+            	cout << "  Total dynamic associative search energy/access  (nJ): " <<
+                      fr->data_array2->power.searchOp.dynamic * 1e9 << endl;
+    	        cout << "\tTotal energy in H-tree (that includes both "
+    	            	      "match key and data transfer) (nJ): " <<
+    	              (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+    	               fr->data_array2->power_htree_out_search.searchOp.dynamic +
+    	               fr->data_array2->power_routing_to_bank.searchOp.dynamic) * 1e9 << endl;
+    	        cout << "\tKeyword input and result output Htrees inside bank Energy (nJ): " <<
+    	              (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+    	       	               fr->data_array2->power_htree_out_search.searchOp.dynamic) * 1e9 << endl;
+    	        cout <<  "\tSearchlines (nJ): " <<
+    	          	   fr->data_array2->power_searchline.searchOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_searchline_precharge.searchOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tMatchlines  (nJ): " <<
+    	               fr->data_array2->power_matchlines.searchOp.dynamic * 1e9 +
+    	        	   fr->data_array2->power_matchline_precharge.searchOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tSub-array output driver (nJ): " <<
+    	          	   fr->data_array2->power_output_drivers_at_subarray.searchOp.dynamic * 1e9 << endl;
+
+
+            	cout <<endl<< "  Total dynamic read energy/access  (nJ): " <<
+            	      fr->data_array2->power.readOp.dynamic * 1e9 << endl;
+    	        cout << "\tTotal energy in H-tree (that includes both "
+    	            	      "address and data transfer) (nJ): " <<
+    	              (fr->data_array2->power_addr_input_htree.readOp.dynamic +
+    	               fr->data_array2->power_data_output_htree.readOp.dynamic +
+    	               fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+    	        cout << "\tOutput Htree inside bank Energy (nJ): " <<
+    	          	   fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tDecoder (nJ): " <<
+    	          	   fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tWordline (nJ): " <<
+    	          	   fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tBitline mux & associated drivers (nJ): " <<
+    	          	   fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+    	           	   fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+    	         	   fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+    	           	   fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+    	           	   fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tBitlines (nJ): " <<
+    	          	   fr->data_array2->power_bitlines.readOp.dynamic * 1e9 +
+    	          	   fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9<< endl;
+    	        cout <<  "\tSense amplifier energy (nJ): " <<
+    	          	   fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+    	        cout <<  "\tSub-array output driver (nJ): " <<
+    	          	   fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+
+            	cout << endl <<"  Total leakage power of a bank (mW): " <<
+                      fr->data_array2->power.readOp.leakage * 1e3 << endl;
+        }
+        else
+        {
+        	cout << "  Fully associative array:"<<endl;
+        	cout << "  Total dynamic associative search energy/access  (nJ): " <<
+                  fr->data_array2->power.searchOp.dynamic * 1e9 << endl;
+	        cout << "\tTotal energy in H-tree (that includes both "
+	            	      "match key and data transfer) (nJ): " <<
+	              (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+	               fr->data_array2->power_htree_out_search.searchOp.dynamic +
+	               fr->data_array2->power_routing_to_bank.searchOp.dynamic) * 1e9 << endl;
+	        cout << "\tKeyword input and result output Htrees inside bank Energy (nJ): " <<
+	              (fr->data_array2->power_htree_in_search.searchOp.dynamic +
+	       	               fr->data_array2->power_htree_out_search.searchOp.dynamic) * 1e9 << endl;
+	        cout <<  "\tSearchlines (nJ): " <<
+	          	   fr->data_array2->power_searchline.searchOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_searchline_precharge.searchOp.dynamic * 1e9 << endl;
+	        cout <<  "\tMatchlines  (nJ): " <<
+	               fr->data_array2->power_matchlines.searchOp.dynamic * 1e9 +
+	        	   fr->data_array2->power_matchline_precharge.searchOp.dynamic * 1e9 << endl;
+	        cout <<  "\tData portion wordline (nJ): " <<
+	          	   fr->data_array2->power_matchline_to_wordline_drv.searchOp.dynamic * 1e9 << endl;
+	        cout <<  "\tData Bitlines (nJ): " <<
+	          	   fr->data_array2->power_bitlines.searchOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_prechg_eq_drivers.searchOp.dynamic * 1e9 << endl;
+	        cout <<  "\tSense amplifier energy (nJ): " <<
+	          	   fr->data_array2->power_sense_amps.searchOp.dynamic * 1e9 << endl;
+	        cout <<  "\tSub-array output driver (nJ): " <<
+	          	   fr->data_array2->power_output_drivers_at_subarray.searchOp.dynamic * 1e9 << endl;
+
+
+        	cout <<endl<< "  Total dynamic read energy/access  (nJ): " <<
+        	      fr->data_array2->power.readOp.dynamic * 1e9 << endl;
+	        cout << "\tTotal energy in H-tree (that includes both "
+	            	      "address and data transfer) (nJ): " <<
+	              (fr->data_array2->power_addr_input_htree.readOp.dynamic +
+	               fr->data_array2->power_data_output_htree.readOp.dynamic +
+	               fr->data_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+	        cout << "\tOutput Htree inside bank Energy (nJ): " <<
+	          	   fr->data_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+	        cout <<  "\tDecoder (nJ): " <<
+	          	   fr->data_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+	        cout <<  "\tWordline (nJ): " <<
+	          	   fr->data_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+	        cout <<  "\tBitline mux & associated drivers (nJ): " <<
+	          	   fr->data_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+	           	   fr->data_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+	        cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+	         	   fr->data_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+	           	   fr->data_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+	           	   fr->data_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+	        cout <<  "\tBitlines (nJ): " <<
+	          	   fr->data_array2->power_bitlines.readOp.dynamic * 1e9 +
+	          	   fr->data_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9<< endl;
+	        cout <<  "\tSense amplifier energy (nJ): " <<
+	          	   fr->data_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+	        cout <<  "\tSub-array output driver (nJ): " <<
+	          	   fr->data_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+
+        	cout << endl <<"  Total leakage power of a bank (mW): " <<
+                  fr->data_array2->power.readOp.leakage * 1e3 << endl;
+      }
+
+
+    if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+    {
+      cout << endl << "  Tag array:  Total dynamic read energy/access (nJ): " <<
+        fr->tag_array2->power.readOp.dynamic * 1e9 << endl;
+      cout << "\tTotal leakage read/write power of a bank (mW): " <<
+          fr->tag_array2->power.readOp.leakage * 1e3 << endl;
+      cout << "\tTotal energy in H-tree (that includes both "
+        "address and data transfer) (nJ): " <<
+          (fr->tag_array2->power_addr_input_htree.readOp.dynamic +
+           fr->tag_array2->power_data_output_htree.readOp.dynamic +
+           fr->tag_array2->power_routing_to_bank.readOp.dynamic) * 1e9 << endl;
+
+      cout << "\tTotal leakage power in H-tree (that includes both "
+  	      "address and data network) ((mW)): " <<
+  	        (fr->tag_array2->power_addr_input_htree.readOp.leakage +
+  	         fr->tag_array2->power_data_output_htree.readOp.leakage +
+  	         fr->tag_array2->power_routing_to_bank.readOp.leakage) * 1e3 << endl;
+
+  	  cout << "\tTotal gate leakage power in H-tree (that includes both "
+  	      "address and data network) ((mW)): " <<
+  	        (fr->tag_array2->power_addr_input_htree.readOp.gate_leakage +
+  	         fr->tag_array2->power_data_output_htree.readOp.gate_leakage +
+  	         fr->tag_array2->power_routing_to_bank.readOp.gate_leakage) * 1e3 << endl;
+
+      cout << "\tOutput Htree inside a bank Energy (nJ): " <<
+        fr->tag_array2->power_data_output_htree.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tDecoder (nJ): " <<
+        fr->tag_array2->power_row_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_row_predecoder_blocks.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tWordline (nJ): " <<
+        fr->tag_array2->power_row_decoders.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tBitline mux & associated drivers (nJ): " <<
+        fr->tag_array2->power_bit_mux_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_bit_mux_predecoder_blocks.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_bit_mux_decoders.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tSense amp mux & associated drivers (nJ): " <<
+        fr->tag_array2->power_senseamp_mux_lev_1_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_1_predecoder_blocks.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_1_decoders.readOp.dynamic * 1e9  +
+        fr->tag_array2->power_senseamp_mux_lev_2_predecoder_drivers.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_2_predecoder_blocks.readOp.dynamic * 1e9 +
+        fr->tag_array2->power_senseamp_mux_lev_2_decoders.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tBitlines precharge and equalization circuit (nJ): " <<
+        fr->tag_array2->power_prechg_eq_drivers.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tBitlines (nJ): " <<
+        fr->tag_array2->power_bitlines.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tSense amplifier energy (nJ): " <<
+        fr->tag_array2->power_sense_amps.readOp.dynamic * 1e9 << endl;
+      cout <<  "\tSub-array output driver (nJ): " <<
+        fr->tag_array2->power_output_drivers_at_subarray.readOp.dynamic * 1e9 << endl;
+    }
+
+    cout << endl << endl <<  "Area Components:" << endl << endl;
+    /* Data array area stats */
+    if (!(g_ip->pure_cam || g_ip->fully_assoc))
+    	cout <<  "  Data array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl;
+    else if (g_ip->pure_cam)
+    	cout <<  "  CAM array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl;
+    else
+    	cout <<  "  Fully associative cache array: Area (mm2): " << fr->data_array2->area * 1e-6 << endl;
+    cout <<  "\tHeight (mm): " <<
+      fr->data_array2->all_banks_height*1e-3 << endl;
+    cout <<  "\tWidth (mm): " <<
+      fr->data_array2->all_banks_width*1e-3 << endl;
+    if (g_ip->print_detail) {
+      cout <<  "\tArea efficiency (Memory cell area/Total area) - " <<
+        fr->data_array2->area_efficiency << " %" << endl;
+      cout << "\t\tMAT Height (mm): " <<
+        fr->data_array2->mat_height*1e-3 << endl;
+      cout << "\t\tMAT Length (mm): " <<
+        fr->data_array2->mat_length*1e-3 << endl;
+      cout << "\t\tSubarray Height (mm): " <<
+        fr->data_array2->subarray_height*1e-3 << endl;
+      cout << "\t\tSubarray Length (mm): " <<
+        fr->data_array2->subarray_length*1e-3 << endl;
+    }
+
+    /* Tag array area stats */
+    if ((!(g_ip->pure_ram|| g_ip->pure_cam || g_ip->fully_assoc)) && !g_ip->is_main_mem)
+    {
+      cout << endl << "  Tag array: Area (mm2): " << fr->tag_array2->area * 1e-6 << endl;
+      cout <<  "\tHeight (mm): " <<
+        fr->tag_array2->all_banks_height*1e-3 << endl;
+      cout <<  "\tWidth (mm): " <<
+        fr->tag_array2->all_banks_width*1e-3 << endl;
+      if (g_ip->print_detail)
+      {
+        cout <<  "\tArea efficiency (Memory cell area/Total area) - " <<
+          fr->tag_array2->area_efficiency << " %" << endl;
+      cout << "\t\tMAT Height (mm): " <<
+        fr->tag_array2->mat_height*1e-3 << endl;
+      cout << "\t\tMAT Length (mm): " <<
+        fr->tag_array2->mat_length*1e-3 << endl;
+      cout << "\t\tSubarray Height (mm): " <<
+        fr->tag_array2->subarray_height*1e-3 << endl;
+      cout << "\t\tSubarray Length (mm): " <<
+        fr->tag_array2->subarray_length*1e-3 << endl;
+      }
+    }
+    Wire wpr;
+    wpr.print_wire();
+
+    //cout << "FO4 = " << g_tp.FO4 << endl;
+  }
+}
+
+// McPAT's plain interface, please keep !!!
+// Runs the full flow on the caller's InputParameter: error check, technology and wire init, then solve().
+uca_org_t cacti_interface(InputParameter  * const local_interface)
+{
+//  g_ip = new InputParameter();
+  //g_ip->add_ecc_b_ = true;
+
+  uca_org_t fin_res;
+  fin_res.valid = false;  // marked invalid up front; solve() below is the only call given fin_res
+
+  g_ip = local_interface;  // global g_ip now aliases the caller's object (pointer assignment, no copy)
+  // The commented-out block below is dead code: a field-by-field setup of g_ip from names that are not parameters here.
+//  g_ip->data_arr_ram_cell_tech_type    = data_arr_ram_cell_tech_flavor_in;
+//  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
+//  g_ip->tag_arr_ram_cell_tech_type     = tag_arr_ram_cell_tech_flavor_in;
+//  g_ip->tag_arr_peri_global_tech_type  = tag_arr_peri_global_tech_flavor_in;
+//
+//  g_ip->ic_proj_type     = interconnect_projection_type_in;
+//  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
+//  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
+//  g_ip->burst_len        = BURST_LENGTH_in;
+//  g_ip->int_prefetch_w   = INTERNAL_PREFETCH_WIDTH_in;
+//  g_ip->page_sz_bits     = PAGE_SIZE_BITS_in;
+//
+//  g_ip->cache_sz            = cache_size;
+//  g_ip->line_sz             = line_size;
+//  g_ip->assoc               = associativity;
+//  g_ip->nbanks              = banks;
+//  g_ip->out_w               = output_width;
+//  g_ip->specific_tag        = specific_tag;
+//  if (tag_width == 0) {
+//    g_ip->tag_w = 42;
+//  }
+//  else {
+//    g_ip->tag_w               = tag_width;
+//  }
+//
+//  g_ip->access_mode         = access_mode;
+//  g_ip->delay_wt = obj_func_delay;
+//  g_ip->dynamic_power_wt = obj_func_dynamic_power;
+//  g_ip->leakage_power_wt = obj_func_leakage_power;
+//  g_ip->area_wt = obj_func_area;
+//  g_ip->cycle_time_wt    = obj_func_cycle_time;
+//  g_ip->delay_dev = dev_func_delay;
+//  g_ip->dynamic_power_dev = dev_func_dynamic_power;
+//  g_ip->leakage_power_dev = dev_func_leakage_power;
+//  g_ip->area_dev = dev_func_area;
+//  g_ip->cycle_time_dev    = dev_func_cycle_time;
+//  g_ip->temp = temp;
+//
+//  g_ip->F_sz_nm         = tech_node;
+//  g_ip->F_sz_um         = tech_node / 1000;
+//  g_ip->is_main_mem     = (main_mem != 0) ? true : false;
+//  g_ip->is_cache        = (cache ==1) ? true : false;
+//  g_ip->pure_ram        = (cache ==0) ? true : false;
+//  g_ip->pure_cam        = (cache ==2) ? true : false;
+//  g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false;
+//  g_ip->ver_htree_wires_over_array = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in;
+//  g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in;
+//
+//  g_ip->num_rw_ports    = rw_ports;
+//  g_ip->num_rd_ports    = excl_read_ports;
+//  g_ip->num_wr_ports    = excl_write_ports;
+//  g_ip->num_se_rd_ports = single_ended_read_ports;
+//  g_ip->num_search_ports = search_ports;
+//
+//  g_ip->print_detail = 1;
+//    g_ip->nuca = 0;
+//    g_ip->is_cache=true;
+//
+//  if (force_wiretype == 0)
+//  {
+//	  g_ip->wt = Global;
+//      g_ip->force_wiretype = false;
+//  }
+//  else
+//  {   g_ip->force_wiretype = true;
+//	  if (wiretype==10) {
+//		  g_ip->wt = Global_10;
+//	        }
+//	  if (wiretype==20) {
+//		  g_ip->wt = Global_20;
+//	        }
+//	  if (wiretype==30) {
+//		  g_ip->wt = Global_30;
+//	        }
+//	  if (wiretype==5) {
+//	      g_ip->wt = Global_5;
+//	        }
+//	  if (wiretype==0) {
+//		  g_ip->wt = Low_swing;
+//	  }
+//  }
+//  //g_ip->wt = Global_5;
+//  if (force_config == 0)
+//    {
+//  	  g_ip->force_cache_config = false;
+//    }
+//    else
+//    {
+//    	g_ip->force_cache_config = true;
+//    	g_ip->ndbl=ndbl;
+//    	g_ip->ndwl=ndwl;
+//    	g_ip->nspd=nspd;
+//    	g_ip->ndcm=ndcm;
+//    	g_ip->ndsam1=ndsam1;
+//    	g_ip->ndsam2=ndsam2;
+//
+//
+//    }
+//
+//  if (ecc==0){
+//	  g_ip->add_ecc_b_=false;
+//  }
+//  else
+//  {
+//	  g_ip->add_ecc_b_=true;
+//  }
+
+  // Run InputParameter's own error_checking() before the values are used below.
+  g_ip->error_checking();
+
+  // Initialize technology parameters from g_ip->F_sz_um.
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+
+  solve(&fin_res);
+
+//  g_ip->display_ip();
+//  output_UCA(&fin_res);
+//  output_data_csv(fin_res);
+
+  // g_ip aliases the caller's object (assigned above); the delete below stays disabled.
+ // delete (g_ip);
+  return fin_res;
+}
+
+// McPAT's plain interface, please keep !!!
+// Same setup steps as cacti_interface, but solve() is disabled here, so the result is returned with valid == false.
+uca_org_t init_interface(InputParameter* const local_interface)
+{
+ // g_ip = new InputParameter();
+  //g_ip->add_ecc_b_ = true;
+
+  uca_org_t fin_res;
+  fin_res.valid = false;  // never updated afterwards in this function
+
+   g_ip = local_interface;  // global g_ip now aliases the caller's object (pointer assignment, no copy)
+  // The commented-out block below is dead code: a field-by-field setup of g_ip from names that are not parameters here.
+//  g_ip->data_arr_ram_cell_tech_type    = data_arr_ram_cell_tech_flavor_in;
+//  g_ip->data_arr_peri_global_tech_type = data_arr_peri_global_tech_flavor_in;
+//  g_ip->tag_arr_ram_cell_tech_type     = tag_arr_ram_cell_tech_flavor_in;
+//  g_ip->tag_arr_peri_global_tech_type  = tag_arr_peri_global_tech_flavor_in;
+//
+//  g_ip->ic_proj_type     = interconnect_projection_type_in;
+//  g_ip->wire_is_mat_type = wire_inside_mat_type_in;
+//  g_ip->wire_os_mat_type = wire_outside_mat_type_in;
+//  g_ip->burst_len        = BURST_LENGTH_in;
+//  g_ip->int_prefetch_w   = INTERNAL_PREFETCH_WIDTH_in;
+//  g_ip->page_sz_bits     = PAGE_SIZE_BITS_in;
+//
+//  g_ip->cache_sz            = cache_size;
+//  g_ip->line_sz             = line_size;
+//  g_ip->assoc               = associativity;
+//  g_ip->nbanks              = banks;
+//  g_ip->out_w               = output_width;
+//  g_ip->specific_tag        = specific_tag;
+//  if (tag_width == 0) {
+//    g_ip->tag_w = 42;
+//  }
+//  else {
+//    g_ip->tag_w               = tag_width;
+//  }
+//
+//  g_ip->access_mode         = access_mode;
+//  g_ip->delay_wt = obj_func_delay;
+//  g_ip->dynamic_power_wt = obj_func_dynamic_power;
+//  g_ip->leakage_power_wt = obj_func_leakage_power;
+//  g_ip->area_wt = obj_func_area;
+//  g_ip->cycle_time_wt    = obj_func_cycle_time;
+//  g_ip->delay_dev = dev_func_delay;
+//  g_ip->dynamic_power_dev = dev_func_dynamic_power;
+//  g_ip->leakage_power_dev = dev_func_leakage_power;
+//  g_ip->area_dev = dev_func_area;
+//  g_ip->cycle_time_dev    = dev_func_cycle_time;
+//  g_ip->temp = temp;
+//
+//  g_ip->F_sz_nm         = tech_node;
+//  g_ip->F_sz_um         = tech_node / 1000;
+//  g_ip->is_main_mem     = (main_mem != 0) ? true : false;
+//  g_ip->is_cache        = (cache ==1) ? true : false;
+//  g_ip->pure_ram        = (cache ==0) ? true : false;
+//  g_ip->pure_cam        = (cache ==2) ? true : false;
+//  g_ip->rpters_in_htree = (REPEATERS_IN_HTREE_SEGMENTS_in != 0) ? true : false;
+//  g_ip->ver_htree_wires_over_array = VERTICAL_HTREE_WIRES_OVER_THE_ARRAY_in;
+//  g_ip->broadcast_addr_din_over_ver_htrees = BROADCAST_ADDR_DATAIN_OVER_VERTICAL_HTREES_in;
+//
+//  g_ip->num_rw_ports    = rw_ports;
+//  g_ip->num_rd_ports    = excl_read_ports;
+//  g_ip->num_wr_ports    = excl_write_ports;
+//  g_ip->num_se_rd_ports = single_ended_read_ports;
+//  g_ip->num_search_ports = search_ports;
+//
+//  g_ip->print_detail = 1;
+//  g_ip->nuca = 0;
+//
+//  if (force_wiretype == 0)
+//  {
+//	  g_ip->wt = Global;
+//      g_ip->force_wiretype = false;
+//  }
+//  else
+//  {   g_ip->force_wiretype = true;
+//	  if (wiretype==10) {
+//		  g_ip->wt = Global_10;
+//	        }
+//	  if (wiretype==20) {
+//		  g_ip->wt = Global_20;
+//	        }
+//	  if (wiretype==30) {
+//		  g_ip->wt = Global_30;
+//	        }
+//	  if (wiretype==5) {
+//	      g_ip->wt = Global_5;
+//	        }
+//	  if (wiretype==0) {
+//		  g_ip->wt = Low_swing;
+//	  }
+//  }
+//  //g_ip->wt = Global_5;
+//  if (force_config == 0)
+//    {
+//  	  g_ip->force_cache_config = false;
+//    }
+//    else
+//    {
+//    	g_ip->force_cache_config = true;
+//    	g_ip->ndbl=ndbl;
+//    	g_ip->ndwl=ndwl;
+//    	g_ip->nspd=nspd;
+//    	g_ip->ndcm=ndcm;
+//    	g_ip->ndsam1=ndsam1;
+//    	g_ip->ndsam2=ndsam2;
+//
+//
+//    }
+//
+//  if (ecc==0){
+//	  g_ip->add_ecc_b_=false;
+//  }
+//  else
+//  {
+//	  g_ip->add_ecc_b_=true;
+//  }
+
+  // Run InputParameter's own error_checking() before the values are used below.
+  g_ip->error_checking();
+  // Initialize technology parameters from g_ip->F_sz_um.
+  init_tech_params(g_ip->F_sz_um, false);
+  Wire winit; // Do not delete this line. It initializes wires.
+  //solve(&fin_res);
+  //g_ip->display_ip();
+
+  //solve(&fin_res);
+  //output_UCA(&fin_res);
+  //output_data_csv(fin_res);
+ // delete (g_ip);
+  // Both solve() calls above are disabled, so fin_res goes back with valid == false.
+  return fin_res;
+}
+
+void reconfigure(InputParameter *local_interface, uca_org_t *fin_res)
+{
+  // Repeats the cacti_interface setup sequence for local_interface, but finishes with
+  // update(fin_res) instead of solve(). Nothing is returned; fin_res is only handed to update().
+  g_ip = local_interface;  // pointer assignment: g_ip aliases the caller's object, no copy is made
+  g_ip->error_checking();
+  // Initialize technology parameters
+  init_tech_params(g_ip->F_sz_um,false);
+
+  Wire winit; // Do not delete this line. It initializes wires.
+
+  // This corresponds to solve() in the initialization process.
+  update(fin_res);
+}
diff --git a/src/gpuwattch/cacti/io.h b/src/gpuwattch/cacti/io.h
new file mode 100644
index 000000000..1bd06cf69
--- /dev/null
+++ b/src/gpuwattch/cacti/io.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#ifndef __IO_H__
+#define __IO_H__
+
+// uca_org_t is not declared in this header; presumably it comes from the includes below -- confirm.
+#include "const.h"
+#include "cacti_interface.h"
+
+// Output routines taking a uca_org_t result (by const reference / by pointer); defined outside this header.
+void output_data_csv(const uca_org_t & fin_res);
+void output_UCA(uca_org_t * fin_res);
+
+
+#endif  // __IO_H__
diff --git a/src/gpuwattch/cacti/main.cc b/src/gpuwattch/cacti/main.cc
new file mode 100644
index 000000000..58c2fe1b6
--- /dev/null
+++ b/src/gpuwattch/cacti/main.cc
@@ -0,0 +1,190 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "io.h"
+#include <iostream>
+
+using namespace std;
+// CACTI command-line entry point: either "-infile <input file name>" or 52 / 54 positional
+// arguments (argc 53 / 55). Any other invocation prints usage to stderr and exits with status 1.
+int main(int argc,char *argv[])
+{
+  // Receives whatever cacti_interface() returns; cleanup() is invoked on it before returning.
+  uca_org_t result;
+  if (argc != 53 && argc != 55)
+  {
+    bool infile_specified = false;
+    string infile_name("");
+    // Look for "-infile"; the argument right after it is taken as the input file name.
+    for (int32_t i = 0; i < argc; i++)
+    {
+      if (argv[i] == string("-infile") && i + 1 < argc)  // argv[argc] is null: never read past the last argument
+      {
+        infile_specified = true;
+        i++;
+        infile_name = argv[i];
+      }
+    }
+    // A trailing "-infile" without a file name is treated like a missing flag.
+    if (infile_specified == false)
+    {
+      cerr << " Invalid arguments -- how to use CACTI:" << endl;
+      cerr << "  1) cacti -infile <input file name>" << endl;
+      cerr << "  2) cacti arg1 ... arg52 -- please refer to the README file" << endl;
+      cerr << " No. of arguments input - " << argc << endl;
+      exit(1);
+    }
+    else
+    {
+      result = cacti_interface(infile_name);
+    }
+  }
+  else if (argc == 53)
+  {
+	  result = cacti_interface(atoi(argv[ 1]),
+			  atoi(argv[ 2]),
+			  atoi(argv[ 3]),
+			  atoi(argv[ 4]),
+			  atoi(argv[ 5]),
+			  atoi(argv[ 6]),
+			  atoi(argv[ 7]),
+			  atoi(argv[ 8]),
+			  atoi(argv[ 9]),
+			  atof(argv[10]),  // the only argument parsed as floating point in the 52-argument form
+			  atoi(argv[11]),
+			  atoi(argv[12]),
+			  atoi(argv[13]),
+			  atoi(argv[14]),
+			  atoi(argv[15]),
+			  atoi(argv[16]),
+			  atoi(argv[17]),
+			  atoi(argv[18]),
+			  atoi(argv[19]),
+			  atoi(argv[20]),
+			  atoi(argv[21]),
+			  atoi(argv[22]),
+			  atoi(argv[23]),
+			  atoi(argv[24]),
+			  atoi(argv[25]),
+			  atoi(argv[26]),
+			  atoi(argv[27]),
+			  atoi(argv[28]),
+			  atoi(argv[29]),
+			  atoi(argv[30]),
+			  atoi(argv[31]),
+			  atoi(argv[32]),
+			  atoi(argv[33]),
+			  atoi(argv[34]),
+			  atoi(argv[35]),
+			  atoi(argv[36]),
+			  atoi(argv[37]),
+			  atoi(argv[38]),
+			  atoi(argv[39]),
+			  atoi(argv[40]),
+			  atoi(argv[41]),
+			  atoi(argv[42]),
+			  atoi(argv[43]),
+			  atoi(argv[44]),
+			  atoi(argv[45]),
+			  atoi(argv[46]),
+			  atoi(argv[47]),
+			  atoi(argv[48]),
+			  atoi(argv[49]),
+			  atoi(argv[50]),
+			  atoi(argv[51]),
+			  atoi(argv[52]));
+  }
+  else
+  {
+	  result = cacti_interface(atoi(argv[ 1]),
+			  atoi(argv[ 2]),
+			  atoi(argv[ 3]),
+			  atoi(argv[ 4]),
+			  atoi(argv[ 5]),
+			  atoi(argv[ 6]),
+			  atoi(argv[ 7]),
+			  atoi(argv[ 8]),
+			  atof(argv[ 9]),  // the only argument parsed as floating point in the 54-argument form
+			  atoi(argv[10]),
+			  atoi(argv[11]),
+			  atoi(argv[12]),
+			  atoi(argv[13]),
+			  atoi(argv[14]),
+			  atoi(argv[15]),
+			  atoi(argv[16]),
+			  atoi(argv[17]),
+			  atoi(argv[18]),
+			  atoi(argv[19]),
+			  atoi(argv[20]),
+			  atoi(argv[21]),
+			  atoi(argv[22]),
+			  atoi(argv[23]),
+			  atoi(argv[24]),
+			  atoi(argv[25]),
+			  atoi(argv[26]),
+			  atoi(argv[27]),
+			  atoi(argv[28]),
+			  atoi(argv[29]),
+			  atoi(argv[30]),
+			  atoi(argv[31]),
+			  atoi(argv[32]),
+			  atoi(argv[33]),
+			  atoi(argv[34]),
+			  atoi(argv[35]),
+			  atoi(argv[36]),
+			  atoi(argv[37]),
+			  atoi(argv[38]),
+			  atoi(argv[39]),
+			  atoi(argv[40]),
+			  atoi(argv[41]),
+			  atoi(argv[42]),
+			  atoi(argv[43]),
+			  atoi(argv[44]),
+			  atoi(argv[45]),
+			  atoi(argv[46]),
+			  atoi(argv[47]),
+			  atoi(argv[48]),
+			  atoi(argv[49]),
+			  atoi(argv[50]),
+			  atoi(argv[51]),
+			  atoi(argv[52]),
+			  atoi(argv[53]),
+			  atoi(argv[54]));
+  }
+  // Release whatever the result object holds before exiting.
+  result.cleanup();
+//  delete result.data_array2;
+//  if (result.tag_array2!=NULL)
+//	  delete result.tag_array2;
+
+  return 0;
+}
+
diff --git a/src/gpuwattch/cacti/makefile b/src/gpuwattch/cacti/makefile
new file mode 100644
index 000000000..6cc36db78
--- /dev/null
+++ b/src/gpuwattch/cacti/makefile
@@ -0,0 +1,32 @@
+TAR = cacti
+# Thin wrapper: every real build step is delegated to $(TAR).mk with TAG=dbg or TAG=opt.
+.PHONY: all dbg opt depend clean clean_dbg clean_opt
+# Default goal: the optimized build.
+all: opt
+
+dbg: $(TAR).mk obj_dbg
+	@$(MAKE) TAG=dbg -C . -f $(TAR).mk
+
+opt: $(TAR).mk obj_opt
+	@$(MAKE) TAG=opt -C . -f $(TAR).mk
+# Object directories (real targets, not phony); -p tolerates an already existing directory.
+obj_dbg:
+	mkdir -p $@
+
+obj_opt:
+	mkdir -p $@
+
+depend:
+	@$(MAKE) TAG=opt -C . -f $(TAR).mk depend
+
+clean: clean_dbg clean_opt
+# Each clean_* rule has the object directory as its first prerequisite and removes it ($<) after the sub-make clean.
+clean_dbg: obj_dbg
+	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
+	rm -rf $<
+
+clean_opt: obj_opt
+	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
+	rm -rf $<
+
+
diff --git a/src/gpuwattch/cacti/mat.cc b/src/gpuwattch/cacti/mat.cc
new file mode 100755
index 000000000..dc903da73
--- /dev/null
+++ b/src/gpuwattch/cacti/mat.cc
@@ -0,0 +1,1704 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include "mat.h"
+#include <assert.h>
+// Builds one mat: creates the decoders, predecoders and drivers it owns (released in ~Mat())
+// and derives the mat dimensions area.h / area.w from the subarray and peripheral sizes.
+Mat::Mat(const DynamicParameter & dyn_p)
+ :dp(dyn_p),
+  power_subarray_out_drv(),
+  delay_fa_tag(0), delay_cam(0),
+  delay_before_decoder(0), delay_bitline(0),
+  delay_wl_reset(0), delay_bl_restore(0),
+  delay_searchline(0), delay_matchchline(0),
+  delay_cam_sl_restore(0), delay_cam_ml_reset(0),
+  delay_fa_ram_wl(0),delay_hit_miss_reset(0),
+  delay_hit_miss(0),
+  subarray(dp, dp.fully_assoc),
+  power_bitline(), per_bitline_read_energy(0),
+  deg_bl_muxing(dp.deg_bl_muxing),
+  num_act_mats_hor_dir(dyn_p.num_act_mats_hor_dir),
+  delay_writeback(0),
+  cell(subarray.cell), cam_cell(subarray.cam_cell),
+  is_dram(dyn_p.is_dram),
+  pure_cam(dyn_p.pure_cam),
+  num_mats(dp.num_mats),
+  power_sa(), delay_sa(0),
+  leak_power_sense_amps_closed_page_state(0),
+  leak_power_sense_amps_open_page_state(0),
+  delay_subarray_out_drv(0),
+  delay_comparator(0), power_comparator(),
+  num_do_b_mat(dyn_p.num_do_b_mat), num_so_b_mat(dyn_p.num_so_b_mat),
+  num_subarrays_per_mat(dp.num_subarrays/dp.num_mats),
+  num_subarrays_per_row(dp.Ndwl/dp.num_mats_h_dir)
+{
+  assert(num_subarrays_per_mat <= 4);
+  assert(num_subarrays_per_row <= 2);
+  is_fa = (dp.fully_assoc) ? true : false;
+  camFlag = (is_fa || pure_cam);//although cam_cell.w = cell.w for fa, we still differentiate them.
+  // FA/CAM: more than two subarrays per mat -> half of them per row, otherwise all in one row.
+  if (is_fa || pure_cam)
+	  num_subarrays_per_row = num_subarrays_per_mat>2?num_subarrays_per_mat/2:num_subarrays_per_mat;
+  // Port counts come from dp when use_inp_params == 1, otherwise from the global g_ip.
+  if (dp.use_inp_params == 1) {
+	  RWP  = dp.num_rw_ports;
+	  ERP  = dp.num_rd_ports;
+	  EWP  = dp.num_wr_ports;
+	  SCHP = dp.num_search_ports;
+  }
+  else {
+    RWP = g_ip->num_rw_ports;
+    ERP = g_ip->num_rd_ports;
+    EWP = g_ip->num_wr_ports;
+    SCHP = g_ip->num_search_ports;
+
+  }
+  // Column count (chosen by RAM / FA / pure-CAM) divided by the bitline mux degree.
+  double number_sa_subarray;
+
+  if (!is_fa && !pure_cam)
+  {
+	  number_sa_subarray = subarray.num_cols / deg_bl_muxing;
+  }
+  else if (is_fa && !pure_cam)
+  {
+	  number_sa_subarray =  (subarray.num_cols_fa_cam + subarray.num_cols_fa_ram) / deg_bl_muxing;
+  }
+
+  else
+  {
+	  number_sa_subarray =  (subarray.num_cols_fa_cam) / deg_bl_muxing;
+  }
+  // Decoder loads; the three mux-decoder capacitances stay 0 unless set further down.
+  int    num_dec_signals           = subarray.num_rows;
+  double C_ld_bit_mux_dec_out      = 0;
+  double C_ld_sa_mux_lev_1_dec_out = 0;
+  double C_ld_sa_mux_lev_2_dec_out = 0;
+  double R_wire_wl_drv_out;
+
+  if (!is_fa && !pure_cam)
+    {
+	    R_wire_wl_drv_out = subarray.num_cols * cell.w * g_tp.wire_local.R_per_um;
+    }
+    else if (is_fa && !pure_cam)
+    {
+    	R_wire_wl_drv_out = (subarray.num_cols_fa_cam * cam_cell.w + subarray.num_cols_fa_ram * cell.w) * g_tp.wire_local.R_per_um ;
+    }
+    else
+    {
+    	R_wire_wl_drv_out = (subarray.num_cols_fa_cam * cam_cell.w ) * g_tp.wire_local.R_per_um;
+    }
+
+  double R_wire_bit_mux_dec_out = num_subarrays_per_row * subarray.num_cols * g_tp.wire_inside_mat.R_per_um * cell.w;//TODO:revisit for FA
+  double R_wire_sa_mux_dec_out  = num_subarrays_per_row * subarray.num_cols * g_tp.wire_inside_mat.R_per_um * cell.w;
+
+  if (deg_bl_muxing > 1)
+  {
+    C_ld_bit_mux_dec_out =
+      (2 * num_subarrays_per_mat * subarray.num_cols / deg_bl_muxing)*gate_C(g_tp.w_nmos_b_mux, 0, is_dram) +  // 2 transistor per cell
+      num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w();
+  }
+
+  if (dp.Ndsam_lev_1 > 1)
+  {
+    C_ld_sa_mux_lev_1_dec_out =
+      (num_subarrays_per_mat * number_sa_subarray / dp.Ndsam_lev_1)*gate_C(g_tp.w_nmos_sa_mux, 0, is_dram) +
+      num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w();
+  }
+  if (dp.Ndsam_lev_2 > 1)
+  {
+    C_ld_sa_mux_lev_2_dec_out =
+      (num_subarrays_per_mat * number_sa_subarray / (dp.Ndsam_lev_1*dp.Ndsam_lev_2))*gate_C(g_tp.w_nmos_sa_mux, 0, is_dram) +
+      num_subarrays_per_row * subarray.num_cols*g_tp.wire_inside_mat.C_per_um*cell.get_w();
+  }
+
+  if (num_subarrays_per_row >= 2)
+  {
+    // wire heads for both right and left side of a mat, so half the resistance
+    R_wire_bit_mux_dec_out /= 2.0;
+    R_wire_sa_mux_dec_out  /= 2.0;
+  }
+
+  // Decoders: row, bitline mux and the two sense-amp mux levels (all deleted in ~Mat()).
+  row_dec = new Decoder(
+      num_dec_signals,
+      false,
+      subarray.C_wl,
+      R_wire_wl_drv_out,
+      false/*is_fa*/,
+      is_dram,
+      true,
+      camFlag? cam_cell:cell);
+//  if (is_fa && (!dp.is_tag))
+//  {
+//    row_dec->exist = true;
+//  }
+  bit_mux_dec = new Decoder(
+      deg_bl_muxing,// This number is 1 for FA or CAM
+      false,
+      C_ld_bit_mux_dec_out,
+      R_wire_bit_mux_dec_out,
+      false/*is_fa*/,
+      is_dram,
+      false,
+      camFlag? cam_cell:cell);
+  sa_mux_lev_1_dec = new Decoder(
+      dp.deg_senseamp_muxing_non_associativity, // This number is 1 for FA or CAM
+      dp.number_way_select_signals_mat ? true : false,//only sa_mux_lev_1_dec needs way select signal
+      C_ld_sa_mux_lev_1_dec_out,
+      R_wire_sa_mux_dec_out,
+      false/*is_fa*/,
+      is_dram,
+      false,
+      camFlag? cam_cell:cell);
+  sa_mux_lev_2_dec = new Decoder(
+      dp.Ndsam_lev_2, // This number is 1 for FA or CAM
+      false,
+      C_ld_sa_mux_lev_2_dec_out,
+      R_wire_sa_mux_dec_out,
+      false/*is_fa*/,
+      is_dram,
+      false,
+      camFlag? cam_cell:cell);
+  // Wire load on the row predecode block outputs.
+  double C_wire_predec_blk_out;
+  double R_wire_predec_blk_out;
+
+  if (!is_fa && !pure_cam)
+      {
+
+	  C_wire_predec_blk_out  = num_subarrays_per_row * subarray.num_rows * g_tp.wire_inside_mat.C_per_um * cell.h;
+	  R_wire_predec_blk_out  = num_subarrays_per_row * subarray.num_rows * g_tp.wire_inside_mat.R_per_um * cell.h;
+
+      }
+      else //for pre-decode block's load is same for both FA and CAM
+      {
+    	  C_wire_predec_blk_out  = subarray.num_rows * g_tp.wire_inside_mat.C_per_um * cam_cell.h;
+    	  R_wire_predec_blk_out  = subarray.num_rows * g_tp.wire_inside_mat.R_per_um * cam_cell.h;
+      }
+
+  // FA/CAM: the decode signal count grows by _log2(num_subarrays_per_mat).
+  if (is_fa||pure_cam)
+	  num_dec_signals += _log2(num_subarrays_per_mat);
+  // Predecode blocks are created in pairs that differ only in the last constructor argument.
+  PredecBlk * r_predec_blk1 = new PredecBlk(
+      num_dec_signals,
+      row_dec,
+      C_wire_predec_blk_out,
+      R_wire_predec_blk_out,
+      num_subarrays_per_mat,
+      is_dram,
+      true);
+  PredecBlk * r_predec_blk2 = new PredecBlk(
+      num_dec_signals,
+      row_dec,
+      C_wire_predec_blk_out,
+      R_wire_predec_blk_out,
+      num_subarrays_per_mat,
+      is_dram,
+      false);
+  PredecBlk * b_mux_predec_blk1 = new PredecBlk(deg_bl_muxing, bit_mux_dec, 0, 0, 1, is_dram, true);
+  PredecBlk * b_mux_predec_blk2 = new PredecBlk(deg_bl_muxing, bit_mux_dec, 0, 0, 1, is_dram, false);
+  PredecBlk * sa_mux_lev_1_predec_blk1 = new PredecBlk(dyn_p.deg_senseamp_muxing_non_associativity, sa_mux_lev_1_dec, 0, 0, 1, is_dram, true);
+  PredecBlk * sa_mux_lev_1_predec_blk2 = new PredecBlk(dyn_p.deg_senseamp_muxing_non_associativity, sa_mux_lev_1_dec, 0, 0, 1, is_dram, false);
+  PredecBlk * sa_mux_lev_2_predec_blk1 = new PredecBlk(dp.Ndsam_lev_2, sa_mux_lev_2_dec, 0, 0, 1, is_dram, true);
+  PredecBlk * sa_mux_lev_2_predec_blk2 = new PredecBlk(dp.Ndsam_lev_2, sa_mux_lev_2_dec, 0, 0, 1, is_dram, false);
+  dummy_way_sel_predec_blk1 = new PredecBlk(1, sa_mux_lev_1_dec, 0, 0, 0, is_dram, true);
+  dummy_way_sel_predec_blk2 = new PredecBlk(1, sa_mux_lev_1_dec, 0, 0, 0, is_dram, false);
+  // One PredecBlkDrv per predecode block.
+  PredecBlkDrv * r_predec_blk_drv1 = new PredecBlkDrv(0, r_predec_blk1, is_dram);
+  PredecBlkDrv * r_predec_blk_drv2 = new PredecBlkDrv(0, r_predec_blk2, is_dram);
+  PredecBlkDrv * b_mux_predec_blk_drv1 = new PredecBlkDrv(0, b_mux_predec_blk1, is_dram);
+  PredecBlkDrv * b_mux_predec_blk_drv2 = new PredecBlkDrv(0, b_mux_predec_blk2, is_dram);
+  PredecBlkDrv * sa_mux_lev_1_predec_blk_drv1 = new PredecBlkDrv(0, sa_mux_lev_1_predec_blk1, is_dram);
+  PredecBlkDrv * sa_mux_lev_1_predec_blk_drv2 = new PredecBlkDrv(0, sa_mux_lev_1_predec_blk2, is_dram);
+  PredecBlkDrv * sa_mux_lev_2_predec_blk_drv1 = new PredecBlkDrv(0, sa_mux_lev_2_predec_blk1, is_dram);
+  PredecBlkDrv * sa_mux_lev_2_predec_blk_drv2 = new PredecBlkDrv(0, sa_mux_lev_2_predec_blk2, is_dram);
+  way_sel_drv1 = new PredecBlkDrv(dyn_p.number_way_select_signals_mat, dummy_way_sel_predec_blk1, is_dram);
+  dummy_way_sel_predec_blk_drv2 = new PredecBlkDrv(1, dummy_way_sel_predec_blk2, is_dram);
+  // Drivers are paired into Predec objects; ~Mat() reaches blocks and drivers through them.
+  r_predec            = new Predec(r_predec_blk_drv1, r_predec_blk_drv2);
+  b_mux_predec        = new Predec(b_mux_predec_blk_drv1, b_mux_predec_blk_drv2);
+  sa_mux_lev_1_predec = new Predec(sa_mux_lev_1_predec_blk_drv1, sa_mux_lev_1_predec_blk_drv2);
+  sa_mux_lev_2_predec = new Predec(sa_mux_lev_2_predec_blk_drv1, sa_mux_lev_2_predec_blk_drv2);
+
+  subarray_out_wire   = new Wire(g_ip->wt, subarray.area.h);//Bug should be subarray.area.w Owen and Sheng
+  // Loads for the bitline precharge/equalization driver(s) created below.
+  double driver_c_gate_load;
+  double driver_c_wire_load;
+  double driver_r_wire_load;
+
+  if (is_fa || pure_cam)
+
+  {   //Although CAM and RAM use different bl pre-charge driver, assuming the precharge p size is the same
+	  driver_c_gate_load =  (subarray.num_cols_fa_cam )* gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+	  driver_c_wire_load =  subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.C_per_um;
+	  driver_r_wire_load =  subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.R_per_um;
+	  cam_bl_precharge_eq_drv = new Driver(
+			  driver_c_gate_load,
+			  driver_c_wire_load,
+			  driver_r_wire_load,
+			  is_dram);
+	  // bl_precharge_eq_drv is only created here when the mat is not a pure CAM.
+	  if (!pure_cam)
+	  {
+		  //This is only used for fully asso not pure CAM
+		  driver_c_gate_load =  (subarray.num_cols_fa_ram )* gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+		  driver_c_wire_load =  subarray.num_cols_fa_ram * cell.w * g_tp.wire_outside_mat.C_per_um;
+		  driver_r_wire_load =  subarray.num_cols_fa_ram * cell.w * g_tp.wire_outside_mat.R_per_um;
+		  bl_precharge_eq_drv = new Driver(
+				  driver_c_gate_load,
+				  driver_c_wire_load,
+				  driver_r_wire_load,
+				  is_dram);
+	  }
+  }
+
+  else
+  {
+	  driver_c_gate_load =  subarray.num_cols * gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+	  driver_c_wire_load =  subarray.num_cols * cell.w * g_tp.wire_outside_mat.C_per_um;
+	  driver_r_wire_load =  subarray.num_cols * cell.w * g_tp.wire_outside_mat.R_per_um;
+	  bl_precharge_eq_drv = new Driver(
+			  driver_c_gate_load,
+			  driver_c_wire_load,
+			  driver_r_wire_load,
+			  is_dram);
+  }
+  double area_row_decoder = row_dec->area.get_area() * subarray.num_rows * (RWP + ERP + EWP);
+  double w_row_decoder    = area_row_decoder / subarray.area.get_h();
+
+  double h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux =
+    compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
+
+  double h_subarray_out_drv = subarray_out_wire->area.get_area() *
+    (subarray.num_cols / (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2)) / subarray.area.get_w();
+
+
+  h_subarray_out_drv *= (RWP + ERP + SCHP);
+  // Further area terms, initialised to 0 and filled in below where applicable.
+  double h_comparators                = 0.0;
+  double w_row_predecode_output_wires = 0.0;
+  double h_bit_mux_dec_out_wires      = 0.0;
+  double h_senseamp_mux_dec_out_wires = 0.0;
+
+  if ((!is_fa)&&(dp.is_tag))
+  {
+    //tagbits = (4 * num_cols_subarray / (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2)) / num_do_b_mat;
+    h_comparators  = compute_comparators_height(dp.tagbits, dyn_p.num_do_b_mat, subarray.area.get_w());
+    h_comparators *= (RWP + ERP);
+  }
+
+    // NOTE(review): blk1_out is taken from r_predec_blk2 and blk2_out from r_predec_blk1; only the sum is used.
+    int branch_effort_predec_blk1_out = (1 << r_predec_blk2->number_input_addr_bits);
+    int branch_effort_predec_blk2_out = (1 << r_predec_blk1->number_input_addr_bits);
+    w_row_predecode_output_wires   = (branch_effort_predec_blk1_out + branch_effort_predec_blk2_out) *
+      g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP);
+
+
+  double h_non_cell_area = (num_subarrays_per_mat / num_subarrays_per_row) *
+                           (h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux +
+                            h_subarray_out_drv + h_comparators);
+
+  double w_non_cell_area = MAX(w_row_predecode_output_wires, num_subarrays_per_row * w_row_decoder);
+
+  if (deg_bl_muxing > 1)
+  {
+    h_bit_mux_dec_out_wires = deg_bl_muxing * g_tp.wire_inside_mat.pitch * (RWP + ERP);
+  }
+  if (dp.Ndsam_lev_1 > 1)
+  {
+    h_senseamp_mux_dec_out_wires =  dp.Ndsam_lev_1 * g_tp.wire_inside_mat.pitch * (RWP + ERP);
+  }
+  if (dp.Ndsam_lev_2 > 1)
+  {
+    h_senseamp_mux_dec_out_wires += dp.Ndsam_lev_2 * g_tp.wire_inside_mat.pitch * (RWP + ERP);
+  }
+  // h_addr_datain_wires is assigned only inside the branch below, which is also its only live use.
+  double h_addr_datain_wires;
+  if (!g_ip->ver_htree_wires_over_array)
+  {
+    h_addr_datain_wires = (dp.number_addr_bits_mat + dp.number_way_select_signals_mat +
+                                  (dp.num_di_b_mat + dp.num_do_b_mat)/num_subarrays_per_row) *
+                                 g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP);
+    // FA/CAM: same terms plus the num_si_b_mat/num_so_b_mat wires weighted by SCHP.
+    if (is_fa || pure_cam)
+    {
+    	h_addr_datain_wires = (dp.number_addr_bits_mat + dp.number_way_select_signals_mat +     //TODO: revisit
+    			              (dp.num_di_b_mat+ dp.num_do_b_mat )/num_subarrays_per_row) *
+    			               g_tp.wire_inside_mat.pitch * (RWP + ERP + EWP) +
+    			               (dp.num_si_b_mat + dp.num_so_b_mat )/num_subarrays_per_row * g_tp.wire_inside_mat.pitch * SCHP;
+    }
+    //h_non_cell_area = 2 * h_bit_mux_sense_amp_precharge_sa_mux +
+    //MAX(h_addr_datain_wires, 2 * h_subarray_out_drv);
+    h_non_cell_area = (h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux + h_comparators +
+                       h_subarray_out_drv) * (num_subarrays_per_mat / num_subarrays_per_row) +
+                      h_addr_datain_wires +
+                      h_bit_mux_dec_out_wires +
+                      h_senseamp_mux_dec_out_wires;
+
+  }
+
+  // double area_rectangle_center_mat = h_non_cell_area * w_non_cell_area;
+  double area_mat_center_circuitry = (r_predec_blk_drv1->area.get_area() +
+                                      b_mux_predec_blk_drv1->area.get_area() +
+                                      sa_mux_lev_1_predec_blk_drv1->area.get_area() +
+                                      sa_mux_lev_2_predec_blk_drv1->area.get_area() +
+                                      way_sel_drv1->area.get_area() +
+                                      r_predec_blk_drv2->area.get_area() +
+                                      b_mux_predec_blk_drv2->area.get_area() +
+                                      sa_mux_lev_1_predec_blk_drv2->area.get_area() +
+                                      sa_mux_lev_2_predec_blk_drv2->area.get_area() +
+                                      r_predec_blk1->area.get_area() +
+                                      b_mux_predec_blk1->area.get_area() +
+                                      sa_mux_lev_1_predec_blk1->area.get_area() +
+                                      sa_mux_lev_2_predec_blk1->area.get_area() +
+                                      r_predec_blk2->area.get_area() +
+                                      b_mux_predec_blk2->area.get_area() +
+                                      sa_mux_lev_1_predec_blk2->area.get_area() +
+                                      sa_mux_lev_2_predec_blk2->area.get_area() +
+                                      bit_mux_dec->area.get_area() +
+                                      sa_mux_lev_1_dec->area.get_area() +
+                                      sa_mux_lev_2_dec->area.get_area()) * (RWP + ERP + EWP);
+
+    // Final dimensions; area.w is then widened so that area.h * area.w also covers area_mat_center_circuitry.
+//  if (!is_fa)
+//  {
+    assert(num_subarrays_per_mat/num_subarrays_per_row>0);
+    area.h = (num_subarrays_per_mat/num_subarrays_per_row)* subarray.area.h + h_non_cell_area;
+    area.w = num_subarrays_per_row * subarray.area.get_w() + w_non_cell_area;
+    area.w = (area.h*area.w + area_mat_center_circuitry) / area.h;
+
+//    cout<<"h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux"<<h_bit_mux_sense_amp_precharge_sa_mux_write_driver_write_mux<<endl;
+//    cout<<"h_comparators"<<h_comparators<<endl;
+//    cout<<"h_subarray_out_drv"<<h_subarray_out_drv<<endl;
+//    cout<<"h_addr_datain_wires"<<h_addr_datain_wires<<endl;
+//    cout<<"h_bit_mux_dec_out_wires"<<h_bit_mux_dec_out_wires<<endl;
+//    cout<<"h_senseamp_mux_dec_out_wires"<<h_senseamp_mux_dec_out_wires<<endl;
+//    cout<<"h_non_cell_area"<<h_non_cell_area<<endl;
+//    cout<<"area.h =" << (num_subarrays_per_mat/num_subarrays_per_row)* subarray.area.h<<endl;
+//    cout<<"w_non_cell_area"<<w_non_cell_area<<endl;
+//    cout<<"area_mat_center_circuitry"<<area_mat_center_circuitry<<endl;
+
+    assert(area.h>0);
+    assert(area.w>0);
+//  }
+//  else
+//  {
+//    area.h = (num_subarrays_per_mat / num_subarrays_per_row) * subarray.area.get_h() + h_non_cell_area;
+//    area.w = num_subarrays_per_row * subarray.area.get_w() + w_non_cell_area;
+//    area.w = (area.h*area.w + area_mat_center_circuitry) / area.h;
+//    area_efficiency_mat = subarray.area.get_area() * num_subarrays_per_row * 100.0 / area.get_area();
+//  }
+  }
+
+
+// Releases the objects this mat owns; predecode blocks and drivers are reached through their Predec objects.
+Mat::~Mat()
+{
+  delete row_dec;
+  delete bit_mux_dec;
+  delete sa_mux_lev_1_dec;
+  delete sa_mux_lev_2_dec;
+  // Predecode blocks first: they are looked up through the Predec objects deleted further down.
+  delete r_predec->blk1;
+  delete r_predec->blk2;
+  delete b_mux_predec->blk1;
+  delete b_mux_predec->blk2;
+  delete sa_mux_lev_1_predec->blk1;
+  delete sa_mux_lev_1_predec->blk2;
+  delete sa_mux_lev_2_predec->blk1;
+  delete sa_mux_lev_2_predec->blk2;
+  delete dummy_way_sel_predec_blk1;
+  delete dummy_way_sel_predec_blk2;
+  // Predecode block drivers, again via the Predec objects.
+  delete r_predec->drv1;
+  delete r_predec->drv2;
+  delete b_mux_predec->drv1;
+  delete b_mux_predec->drv2;
+  delete sa_mux_lev_1_predec->drv1;
+  delete sa_mux_lev_1_predec->drv2;
+  delete sa_mux_lev_2_predec->drv1;
+  delete sa_mux_lev_2_predec->drv2;
+  delete way_sel_drv1;
+  delete dummy_way_sel_predec_blk_drv2;
+  // Only now the Predec objects themselves.
+  delete r_predec;
+  delete b_mux_predec;
+  delete sa_mux_lev_1_predec;
+  delete sa_mux_lev_2_predec;
+
+  delete subarray_out_wire;
+  if (!pure_cam)
+    delete bl_precharge_eq_drv;
+  // NOTE(review): of the five below, the constructor assigns only cam_bl_precharge_eq_drv -- confirm the others are always set before destruction.
+  if (is_fa || pure_cam)
+  {
+    delete sl_precharge_eq_drv ;
+    delete sl_data_drv ;
+    delete cam_bl_precharge_eq_drv;
+    delete ml_precharge_drv;
+    delete ml_to_ram_wl_drv;
+  }
+}
+
+
+// Computes the mat's delay components for the given input rise time; returns the output rise time (the search-path one for FA/CAM mats).
+double Mat::compute_delays(double inrisetime)
+{
+	int k;
+	double rd, C_intrinsic, C_ld, tf, R_bl_precharge,r_b_metal, R_bl, C_bl;
+	double outrisetime_search, outrisetime, row_dec_outrisetime;
+	// delay calculation for tags of fully associative cache
+	if (is_fa || pure_cam)
+	{
+		//Compute search access time
+		outrisetime_search = compute_cam_delay(inrisetime);
+		if (is_fa)
+		{
+			bl_precharge_eq_drv->compute_delay(0);
+			k = ml_to_ram_wl_drv->number_gates - 1;
+			rd = tr_R_on(ml_to_ram_wl_drv->width_n[k], NCH, 1, is_dram, false, true);
+			C_intrinsic = drain_C_(ml_to_ram_wl_drv->width_n[k], PCH, 1, 1, 4*cell.h, is_dram, false, true) +
+			drain_C_(ml_to_ram_wl_drv->width_n[k], NCH, 1, 1, 4*cell.h, is_dram, false, true);
+			C_ld = ml_to_ram_wl_drv->c_gate_load+ ml_to_ram_wl_drv->c_wire_load;
+			tf = rd * (C_intrinsic + C_ld) + ml_to_ram_wl_drv->r_wire_load * C_ld / 2;
+			delay_wl_reset = horowitz(0, tf, 0.5, 0.5, RISE);
+
+			R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);
+			r_b_metal = cam_cell.h * g_tp.wire_local.R_per_um;//dummy rows in sram are filled in
+			R_bl = subarray.num_rows * r_b_metal;
+			C_bl = subarray.C_bl;
+			delay_bl_restore = bl_precharge_eq_drv->delay +
+			         log((g_tp.sram.Vbitpre - 0.1 * dp.V_b_sense) / (g_tp.sram.Vbitpre - dp.V_b_sense))*
+			         (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+			// NOTE(review): C_intrinsic above passes width_n[k] for the PCH term as well; the RAM path uses w_dec_p for PCH -- confirm.
+
+			outrisetime_search = compute_bitline_delay(outrisetime_search);
+			outrisetime_search = compute_sa_delay(outrisetime_search);
+		}
+			outrisetime_search = compute_subarray_out_drv(outrisetime_search);
+			subarray_out_wire->set_in_rise_time(outrisetime_search);
+			outrisetime_search = subarray_out_wire->signal_rise_time();
+			delay_subarray_out_drv_htree = delay_subarray_out_drv + subarray_out_wire->delay;
+
+
+			//TODO: this is just for compute plain read/write energy for fa and cam, plain read/write access timing need to be revisited.
+			outrisetime = r_predec->compute_delays(inrisetime);
+			row_dec_outrisetime = row_dec->compute_delays(outrisetime);
+
+			outrisetime = b_mux_predec->compute_delays(inrisetime);
+			bit_mux_dec->compute_delays(outrisetime);
+
+			outrisetime = sa_mux_lev_1_predec->compute_delays(inrisetime);
+			sa_mux_lev_1_dec->compute_delays(outrisetime);
+
+			outrisetime = sa_mux_lev_2_predec->compute_delays(inrisetime);
+			sa_mux_lev_2_dec->compute_delays(outrisetime);
+
+			if (pure_cam)
+			{
+			  outrisetime = compute_bitline_delay(row_dec_outrisetime);
+			  outrisetime = compute_sa_delay(outrisetime);
+			}
+			return outrisetime_search;
+    }
+	else
+	{
+		bl_precharge_eq_drv->compute_delay(0);
+		if (row_dec->exist == true)
+		{
+			int k = row_dec->num_gates - 1;  // this k (like rd, C_intrinsic, C_ld, tf below) shadows the function-level variable
+			double rd = tr_R_on(row_dec->w_dec_n[k], NCH, 1, is_dram, false, true);
+			// TODO: this 4*cell.h number must be revisited
+			double C_intrinsic = drain_C_(row_dec->w_dec_p[k], PCH, 1, 1, 4*cell.h, is_dram, false, true) +
+			drain_C_(row_dec->w_dec_n[k], NCH, 1, 1, 4*cell.h, is_dram, false, true);
+			double C_ld = row_dec->C_ld_dec_out;
+			double tf = rd * (C_intrinsic + C_ld) + row_dec->R_wire_dec_out * C_ld / 2;
+			delay_wl_reset = horowitz(0, tf, 0.5, 0.5, RISE);
+		}
+		double R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);
+		double r_b_metal = cell.h * g_tp.wire_local.R_per_um;
+		double R_bl = subarray.num_rows * r_b_metal;
+		double C_bl = subarray.C_bl;
+		// DRAM uses a fixed factor of 2.3 on the RC term; otherwise the factor is the log of the precharge/sense voltage ratio.
+		if (is_dram)
+		{
+			delay_bl_restore = bl_precharge_eq_drv->delay + 2.3 * (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+		}
+		else
+		{
+			delay_bl_restore = bl_precharge_eq_drv->delay +
+			log((g_tp.sram.Vbitpre - 0.1 * dp.V_b_sense) / (g_tp.sram.Vbitpre - dp.V_b_sense))*
+			(R_bl_precharge * C_bl + R_bl * C_bl / 2);
+		}
+  }
+
+
+  // RAM path: predecoders/decoders, then bitline, sense amp, subarray output driver and output wire.
+  outrisetime = r_predec->compute_delays(inrisetime);
+  row_dec_outrisetime = row_dec->compute_delays(outrisetime);
+
+  outrisetime = b_mux_predec->compute_delays(inrisetime);
+  bit_mux_dec->compute_delays(outrisetime);
+
+  outrisetime = sa_mux_lev_1_predec->compute_delays(inrisetime);
+  sa_mux_lev_1_dec->compute_delays(outrisetime);
+
+  outrisetime = sa_mux_lev_2_predec->compute_delays(inrisetime);
+  sa_mux_lev_2_dec->compute_delays(outrisetime);
+
+  outrisetime = compute_bitline_delay(row_dec_outrisetime);
+  outrisetime = compute_sa_delay(outrisetime);
+  outrisetime = compute_subarray_out_drv(outrisetime);
+  subarray_out_wire->set_in_rise_time(outrisetime);
+  outrisetime = subarray_out_wire->signal_rise_time();
+
+  delay_subarray_out_drv_htree = delay_subarray_out_drv + subarray_out_wire->delay;
+  // Comparator delay is computed only for tag arrays that are not fully associative.
+  if (dp.is_tag == true && dp.fully_assoc == false)
+  {
+    compute_comparator_delay(0);
+  }
+  // No row decoder stage: wordline reset delay is the larger of the two row predecode block delays.
+  if (row_dec->exist == false)
+    {
+      delay_wl_reset = MAX(r_predec->blk1->delay, r_predec->blk2->delay);
+    }
+  return outrisetime;
+}
+
+
+// Returns the summed height of the precharge/equalization, column mux, sense amp and sense-amp mux circuitry.
+double Mat::compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h()
+{
+  // NOTE(review): "camFlag? cam_cell.w:cell.w / (...)" parses as camFlag ? cam_cell.w : (cell.w / (...)); only cell.w is divided -- confirm.
+  double height = compute_tr_width_after_folding(g_tp.w_pmos_bl_precharge, camFlag? cam_cell.w:cell.w / (2 *(RWP + ERP + SCHP))) +
+    compute_tr_width_after_folding(g_tp.w_pmos_bl_eq, camFlag? cam_cell.w:cell.w / (RWP + ERP + SCHP));  // precharge circuitry
+
+  if (deg_bl_muxing > 1)
+  {
+    height += compute_tr_width_after_folding(g_tp.w_nmos_b_mux, cell.w / (2 *(RWP + ERP)));  // col mux tr height
+    // height += deg_bl_muxing * g_tp.wire_inside_mat.pitch * (RWP + ERP);  // bit mux dec out wires height
+  }
+
+  height += height_sense_amplifier(/*camFlag? sram_cell.w:*/cell.w * deg_bl_muxing / (RWP + ERP));  // sense_amp_height
+
+  if (dp.Ndsam_lev_1 > 1)
+  {
+    height += compute_tr_width_after_folding(
+        g_tp.w_nmos_sa_mux, cell.w * dp.Ndsam_lev_1 / (RWP + ERP));  // sense_amp_mux_height
+    //height_senseamp_mux_decode_output_wires =  Ndsam * wire_inside_mat_pitch * (RWP + ERP);
+  }
+  // NOTE(review): the level-2 mux term below also scales with deg_bl_muxing, unlike the level-1 term above -- confirm.
+  if (dp.Ndsam_lev_2 > 1)
+  {
+    height += compute_tr_width_after_folding(
+        g_tp.w_nmos_sa_mux, cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP));  // sense_amp_mux_height
+    //height_senseamp_mux_decode_output_wires =  Ndsam * wire_inside_mat_pitch * (RWP + ERP);
+
+    // add height of inverter-buffers between the two levels (pass-transistors) of sense-amp mux
+    height += 2 * compute_tr_width_after_folding(
+        pmos_to_nmos_sz_ratio(is_dram) * g_tp.min_w_nmos_, cell.w * dp.Ndsam_lev_2 / (RWP + ERP));
+    height += 2 * compute_tr_width_after_folding(g_tp.min_w_nmos_, cell.w * dp.Ndsam_lev_2 / (RWP + ERP));
+  }
+
+  // TODO: this should be uncommented...
+  /*if (deg_bl_muxing * dp.Ndsam_lev_1 * dp.Ndsam_lev_2 > 1)
+    {
+  //height_write_mux_decode_output_wires = deg_bl_muxing * Ndsam * g_tp.wire_inside_mat.pitch * (RWP + EWP);
+  double width_write_driver_write_mux  = width_write_driver_or_write_mux();
+  double height_write_driver_write_mux = compute_tr_width_after_folding(2 * width_write_driver_write_mux,
+  cell.w *
+  // deg_bl_muxing *
+  dp.Ndsam_lev_1 * dp.Ndsam_lev_2 / (RWP + EWP));
+  height += height_write_driver_write_mux;
+  }*/
+
+  return height;
+}
+
+
+
+double Mat::compute_cam_delay(double inrisetime)
+{
+  // CAM search path: creates the search/match-line drivers, sets delay_matchchline, the restore/reset delays and power_matchline.searchOp; returns the ramp from ml_to_ram_wl_drv.
+  double out_time_ramp, this_delay;
+  double Rwire, tf, c_intrinsic, rd, Cwire, c_gate_load;
+
+ double  Wfaprechp, Wdummyn, Wdummyinvn, Wdummyinvp, Waddrnandn, Waddrnandp,
+     Wfanorn, Wfanorp,W_hit_miss_n, W_hit_miss_p;
+
+  double c_matchline_metal, r_matchline_metal, c_searchline_metal, r_searchline_metal,  dynSearchEng;
+  int Htagbits;
+
+  double driver_c_gate_load;
+  double driver_c_wire_load;
+  double driver_r_wire_load;
+  //double searchline_precharge_time;
+
+  double leak_power_cc_inverters_sram_cell         = 0;
+  double leak_power_acc_tr_RW_or_WR_port_sram_cell = 0;
+  double leak_power_RD_port_sram_cell              = 0;
+  double leak_power_SCHP_port_sram_cell            = 0;
+  double leak_comparator_cam_cell                  =0;
+
+  double gate_leak_comparator_cam_cell          = 0;
+  double gate_leak_power_cc_inverters_sram_cell = 0;
+  double gate_leak_power_RD_port_sram_cell      = 0;
+  double gate_leak_power_SCHP_port_sram_cell    = 0;
+
+  c_matchline_metal   = cam_cell.get_w() * g_tp.wire_local.C_per_um;
+  c_searchline_metal  = cam_cell.get_h() * g_tp.wire_local.C_per_um;
+  r_matchline_metal   = cam_cell.get_w() * g_tp.wire_local.R_per_um;
+  r_searchline_metal  = cam_cell.get_h() * g_tp.wire_local.R_per_um;
+
+  dynSearchEng = 0.0;
+  delay_matchchline = 0.0;
+  double p_to_n_sizing_r = pmos_to_nmos_sz_ratio(is_dram);
+  bool linear_scaling = false; // never set to true in this function, so only the else branch below runs
+
+  if (linear_scaling)
+  {
+
+	  Wfaprechp     = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+	  Wdummyn       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+	  Wdummyinvn    =   75 * g_ip->F_sz_um;//this was  60 micron for the 0.8 micron process
+	  Wdummyinvp    =  100 * g_ip->F_sz_um;//this was  80 micron for the 0.8 micron process
+	  Waddrnandn    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+	  Waddrnandp    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+	  Wfanorn       = 6.25 * g_ip->F_sz_um;//this was   5 micron for the 0.8 micron process
+	  Wfanorp       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+	  W_hit_miss_n    = Wdummyn;
+	  W_hit_miss_p    = g_tp.min_w_nmos_*p_to_n_sizing_r;
+	  //TODO: this number should be updated using new layout; from the NAND to output NOR should be computed using logical effort
+  }
+  else
+  {
+	  Wfaprechp     = g_tp.w_pmos_bl_precharge;//this was  10 micron for the 0.8 micron process
+	  Wdummyn       = g_tp.cam.cell_nmos_w;
+	  Wdummyinvn    =   75 * g_ip->F_sz_um;//this was  60 micron for the 0.8 micron process
+	  Wdummyinvp    =  100 * g_ip->F_sz_um;//this was  80 micron for the 0.8 micron process
+	  Waddrnandn    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+	  Waddrnandp    = 62.5 * g_ip->F_sz_um;//this was  50 micron for the 0.8 micron process
+	  Wfanorn       = 6.25 * g_ip->F_sz_um;//this was   5 micron for the 0.8 micron process
+	  Wfanorp       = 12.5 * g_ip->F_sz_um;//this was  10 micron for the 0.8 micron process
+	  W_hit_miss_n    = Wdummyn;
+	  W_hit_miss_p    = g_tp.min_w_nmos_*p_to_n_sizing_r;
+  }
+
+  Htagbits = (int)(ceil ((double) (subarray.num_cols_fa_cam) / 2.0)); // half of the CAM columns, rounded up
+
+  /* First stage, searchline is precharged. searchline data driver drives the searchline to open (if miss) the comparators.
+     search_line_delay, search_line_power, search_line_restore_delay for cycle time computation.
+     From the driver(am and an) to the comparators in all the rows including the dummy row,
+     Assuming that comparators in both the normal matching line and the dummy matching line have the same sizing */
+
+  //Searchline precharge circuitry is same as that of bitline. However, no sharing between search ports and r/w ports
+  //Searchline precharge routes horizontally
+  driver_c_gate_load = subarray.num_cols_fa_cam * gate_C(2 * g_tp.w_pmos_bl_precharge + g_tp.w_pmos_bl_eq, 0, is_dram, false, false);
+  driver_c_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.C_per_um;
+  driver_r_wire_load = subarray.num_cols_fa_cam * cam_cell.w * g_tp.wire_outside_mat.R_per_um;
+
+  sl_precharge_eq_drv = new Driver( // NOTE(review): the previous pointer value is overwritten without a delete here (same for the other three drivers) -- confirm who frees them
+      driver_c_gate_load,
+	  driver_c_wire_load,
+      driver_r_wire_load,
+      is_dram);
+
+  //searchline data driver ; subarray.num_rows + 1 is because of the dummy row
+  //data drv should only have gate_C not 2*gate_C since the two searchlines are differential--same as bitlines
+  driver_c_gate_load = (subarray.num_rows + 1) * gate_C(Wdummyn, 0, is_dram, false, false);
+  driver_c_wire_load = (subarray.num_rows + 1) * c_searchline_metal;
+  driver_r_wire_load = (subarray.num_rows + 1) * r_searchline_metal;
+  sl_data_drv = new Driver(
+      driver_c_gate_load,
+	  driver_c_wire_load,
+      driver_r_wire_load,
+      is_dram);
+
+  sl_precharge_eq_drv->compute_delay(0);
+  double R_bl_precharge = tr_R_on(g_tp.w_pmos_bl_precharge, PCH, 1, is_dram, false, false);//Assuming CAM and SRAM have same Pre_eq_dr
+  double r_b_metal = cam_cell.h * g_tp.wire_local.R_per_um;
+  double R_bl = (subarray.num_rows + 1) * r_b_metal;
+  double C_bl = subarray.C_bl_cam;
+  delay_cam_sl_restore = sl_precharge_eq_drv->delay
+                         + log(g_tp.cam.Vbitpre)* (R_bl_precharge * C_bl + R_bl * C_bl / 2);
+
+  out_time_ramp = sl_data_drv->compute_delay(inrisetime);//After entering one mat, start to consider the inrisetime from 0(0 is passed from outside)
+
+  //matchline ops delay
+  delay_matchchline += sl_data_drv->delay;
+
+  /* second stage, from the transistors in the comparators(both normal row and dummy row) to the NAND gates that combines both halves*/
+  //matchline delay, matchline power, matchline_reset for cycle time computation,
+
+  ////matchline precharge circuitry routes vertically
+  //There are two matchline precharge driver chains per subarray.
+  driver_c_gate_load = (subarray.num_rows + 1) * gate_C(Wfaprechp, 0, is_dram);
+  driver_c_wire_load = (subarray.num_rows + 1) * c_searchline_metal;
+  driver_r_wire_load = (subarray.num_rows + 1) * r_searchline_metal;
+
+  ml_precharge_drv = new Driver(
+						  driver_c_gate_load,
+  	                      driver_c_wire_load,
+                          driver_r_wire_load,
+                          is_dram);
+
+  ml_precharge_drv->compute_delay(0);
+
+
+  rd =  tr_R_on(Wdummyn, NCH, 2, is_dram);
+  c_intrinsic = Htagbits*(2*drain_C_(Wdummyn, NCH, 2, 1, g_tp.cell_h_def, is_dram)//TODO: the cell_h_def should be revisit
+				  + drain_C_(Wfaprechp, PCH, 1, 1, g_tp.cell_h_def, is_dram)/Htagbits);//since each halve only has one precharge tx per matchline
+
+  Cwire = c_matchline_metal * Htagbits;
+  Rwire = r_matchline_metal * Htagbits;
+  c_gate_load = gate_C(Waddrnandn + Waddrnandp, 0, is_dram);
+
+  double R_ml_precharge = tr_R_on(Wfaprechp, PCH, 1, is_dram);
+  //double r_ml_metal = cam_cell.w * g_tp.wire_local.R_per_um;
+  double R_ml = Rwire;
+  double C_ml = Cwire + c_intrinsic;
+  delay_cam_ml_reset = ml_precharge_drv->delay
+                           + log(g_tp.cam.Vbitpre)* (R_ml_precharge * C_ml + R_ml * C_ml / 2);//TODO: latest CAM has sense amps on matchlines too
+
+  //matchline ops delay
+  tf = rd * (c_intrinsic + Cwire / 2 + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load);
+  this_delay = horowitz(out_time_ramp, tf, VTHFA2, VTHFA3, FALL);
+  delay_matchchline += this_delay;
+  out_time_ramp = this_delay / VTHFA3;
+
+  dynSearchEng += ((c_intrinsic + Cwire + c_gate_load)*(subarray.num_rows +1)) //+ 2*drain_C_(Wdummyn, NCH, 2, 1, g_tp.cell_h_def, is_dram))//TODO: need to be precise
+					  * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd *2;//* Ntbl;//each subarray has two halves
+
+  /* third stage, from the NAND2 gates to the drivers in the dummy row */
+  rd = tr_R_on(Waddrnandn, NCH, 2, is_dram);
+  c_intrinsic = drain_C_(Waddrnandn, NCH, 2, 1, g_tp.cell_h_def, is_dram) +
+                drain_C_(Waddrnandp, PCH, 1, 1, g_tp.cell_h_def, is_dram)*2;
+  c_gate_load = gate_C(Wdummyinvn + Wdummyinvp, 0, is_dram);
+  tf = rd * (c_intrinsic + c_gate_load);
+  this_delay = horowitz(out_time_ramp, tf, VTHFA3, VTHFA4, RISE);
+  out_time_ramp = this_delay / (1 - VTHFA4);
+  delay_matchchline += this_delay;
+
+  //only the dummy row has the extra inverter between NAND and NOR gates
+  dynSearchEng += (c_intrinsic* (subarray.num_rows+1)+ c_gate_load*2) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//  * Ntbl;
+
+  /* fourth stage, from the driver in dummy matchline to the NOR2 gate which drives the wordline of the data portion */
+  rd = tr_R_on(Wdummyinvn, NCH, 1, is_dram);
+  c_intrinsic = drain_C_(Wdummyinvn, NCH, 1, 1, g_tp.cell_h_def, is_dram) + drain_C_(Wdummyinvp, NCH, 1, 1, g_tp.cell_h_def, is_dram); // NOTE(review): Wdummyinvp is passed with NCH, while the third stage passes its PMOS width with PCH -- confirm
+  Cwire = c_matchline_metal * Htagbits +  c_searchline_metal * (subarray.num_rows+1)/2;
+  Rwire = r_matchline_metal * Htagbits +  r_searchline_metal * (subarray.num_rows+1)/2;
+  c_gate_load = gate_C(Wfanorn + Wfanorp, 0, is_dram);
+  tf = rd * (c_intrinsic + Cwire + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load);
+  this_delay = horowitz (out_time_ramp, tf, VTHFA4, VTHFA5, FALL);
+  out_time_ramp = this_delay / VTHFA5;
+  delay_matchchline += this_delay;
+
+  dynSearchEng += (c_intrinsic + Cwire + subarray.num_rows*c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//* Ntbl;
+
+  /*final stage from the NOR gate to drive the wordline of the data portion */
+
+  //searchline data driver There are two matchline precharge driver chains per subarray.
+  driver_c_gate_load = gate_C(W_hit_miss_n, 0, is_dram, false, false);//nmos of the pull down logic
+  driver_c_wire_load = subarray.C_wl_ram;
+  driver_r_wire_load = subarray.R_wl_ram;
+
+  ml_to_ram_wl_drv = new Driver(
+						  driver_c_gate_load,
+  	                      driver_c_wire_load,
+                          driver_r_wire_load,
+                          is_dram);
+
+
+  // NOR gate stage; its gate load is the first stage (width_n[0] + width_p[0]) of ml_to_ram_wl_drv
+  rd = tr_R_on(Wfanorn, NCH, 1, is_dram);
+  c_intrinsic = 2* drain_C_(Wfanorn, NCH, 1, 1, g_tp.cell_h_def, is_dram) + drain_C_(Wfanorp, NCH, 1, 1, g_tp.cell_h_def, is_dram); // NOTE(review): Wfanorp is passed with NCH -- confirm
+  c_gate_load = gate_C(ml_to_ram_wl_drv->width_n[0] + ml_to_ram_wl_drv->width_p[0], 0, is_dram);
+  tf = rd * (c_intrinsic + c_gate_load);
+  this_delay = horowitz (out_time_ramp, tf, 0.5, 0.5, RISE);
+  out_time_ramp = this_delay / (1-0.5);
+  delay_matchchline += this_delay;
+
+  out_time_ramp   = ml_to_ram_wl_drv->compute_delay(out_time_ramp);
+
+  //c_gate_load energy is computed in ml_to_ram_wl_drv
+  dynSearchEng  += (c_intrinsic) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;//* Ntbl;
+
+
+  /* peripheral-- hitting logic "CMOS VLSI Design Fig11.51*/
+  /*Precharge the hitting logic */
+  c_intrinsic = 2*drain_C_(W_hit_miss_p, NCH, 2, 1, g_tp.cell_h_def, is_dram); // NOTE(review): W_hit_miss_p is passed with NCH -- confirm
+  Cwire = c_searchline_metal * subarray.num_rows;
+  Rwire = r_searchline_metal * subarray.num_rows;
+  c_gate_load = drain_C_(W_hit_miss_n, NCH, 1, 1, g_tp.cell_h_def, is_dram)* subarray.num_rows;
+
+  rd = tr_R_on(W_hit_miss_p, PCH, 1, is_dram, false, false);
+  //double r_ml_metal = cam_cell.w * g_tp.wire_local.R_per_um;
+  double R_hit_miss = Rwire;
+  double C_hit_miss = Cwire + c_intrinsic;
+  delay_hit_miss_reset = log(g_tp.cam.Vbitpre)* (rd * C_hit_miss + R_hit_miss * C_hit_miss / 2);
+  dynSearchEng  += (c_intrinsic + Cwire + c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+
+  /*hitting logic evaluation */
+  c_intrinsic = 2*drain_C_(W_hit_miss_n, NCH, 2, 1, g_tp.cell_h_def, is_dram);
+  Cwire = c_searchline_metal * subarray.num_rows;
+  Rwire = r_searchline_metal * subarray.num_rows;
+  c_gate_load = drain_C_(W_hit_miss_n, NCH, 1, 1, g_tp.cell_h_def, is_dram)* subarray.num_rows;
+
+  rd = tr_R_on(W_hit_miss_n, PCH, 1, is_dram, false, false); // NOTE(review): W_hit_miss_n is passed with PCH -- confirm
+  tf = rd * (c_intrinsic + Cwire / 2 + c_gate_load) + Rwire * (Cwire / 2 + c_gate_load);
+
+  delay_hit_miss = horowitz(0, tf, 0.5, 0.5, FALL);
+
+  if (is_fa)
+      delay_matchchline += MAX(ml_to_ram_wl_drv->delay, delay_hit_miss);
+
+  dynSearchEng  += (c_intrinsic + Cwire + c_gate_load) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd;
+
+  /* TODO: peripheral-- Priority Encoder, usually this is not necessary in processor components*/
+
+  power_matchline.searchOp.dynamic = dynSearchEng;
+
+  //leakage in one subarray
+  double Iport     = cmos_Isub_leakage(g_tp.cam.cell_a_w, 0,  1, nmos, false, true);//TODO: how much is the idle time? just by *2?
+  double Iport_erp = cmos_Isub_leakage(g_tp.cam.cell_a_w, 0,  2, nmos, false, true);
+  double Icell     = cmos_Isub_leakage(g_tp.cam.cell_nmos_w, g_tp.cam.cell_pmos_w, 1, inv, false, true)*2;
+  double Icell_comparator = cmos_Isub_leakage(Wdummyn, Wdummyn, 1, inv, false, true)*2;//approx XOR with Inv
+
+  leak_power_cc_inverters_sram_cell         = Icell * g_tp.cam_cell.Vdd;
+  leak_comparator_cam_cell                  = Icell_comparator * g_tp.cam_cell.Vdd;
+  leak_power_acc_tr_RW_or_WR_port_sram_cell = Iport * g_tp.cam_cell.Vdd;
+  leak_power_RD_port_sram_cell              = Iport_erp * g_tp.cam_cell.Vdd;
+  leak_power_SCHP_port_sram_cell            = 0;//search port and r/w port are separate, therefore no access txs in search ports
+
+  power_matchline.searchOp.leakage += leak_power_cc_inverters_sram_cell + // NOTE(review): '+=' here and '*=' below build on whatever leakage already holds -- presumably 0 on entry; confirm
+    leak_comparator_cam_cell +
+    leak_power_acc_tr_RW_or_WR_port_sram_cell +
+    leak_power_acc_tr_RW_or_WR_port_sram_cell * (RWP + EWP - 1) +
+    leak_power_RD_port_sram_cell * ERP +
+    leak_power_SCHP_port_sram_cell*SCHP;
+//  power_matchline.searchOp.leakage += leak_comparator_cam_cell;
+  power_matchline.searchOp.leakage *= (subarray.num_rows+1) * subarray.num_cols_fa_cam;//TODO:dummy line precise
+  power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(0, Wfaprechp, 1, pmos) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(Waddrnandn, Waddrnandp, 2, nand) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.leakage += (subarray.num_rows+1) * cmos_Isub_leakage(Wfanorn, Wfanorp,2, nor) * g_tp.cam_cell.Vdd;
+  //In idle states, the hit/miss txs are closed (on) therefore no Isub
+  power_matchline.searchOp.leakage += 0;// subarray.num_rows * cmos_Isub_leakage(W_hit_miss_n, 0,1, nmos) * g_tp.cam_cell.Vdd+
+    // + cmos_Isub_leakage(0, W_hit_miss_p,1, pmos) * g_tp.cam_cell.Vdd;
+
+  //in idle state, Ig_on only possibly exist in access transistors of read only ports
+  double Ig_port_erp = cmos_Ig_leakage(g_tp.cam.cell_a_w, 0, 1, nmos, false, true);
+  double Ig_cell     = cmos_Ig_leakage(g_tp.cam.cell_nmos_w, g_tp.cam.cell_pmos_w, 1, inv, false, true)*2;
+  double Ig_cell_comparator = cmos_Ig_leakage(Wdummyn, Wdummyn, 1, inv, false, true)*2;// cmos_Ig_leakage(Wdummyn, 0, 2, nmos)*2;
+
+  gate_leak_comparator_cam_cell          = Ig_cell_comparator* g_tp.cam_cell.Vdd;
+  gate_leak_power_cc_inverters_sram_cell = Ig_cell*g_tp.cam_cell.Vdd;
+  gate_leak_power_RD_port_sram_cell      = Ig_port_erp*g_tp.sram_cell.Vdd; // NOTE(review): sram_cell.Vdd here, cam_cell.Vdd on the neighbouring lines -- confirm
+  gate_leak_power_SCHP_port_sram_cell    = 0;
+
+  //cout<<"power_matchline.searchOp.leakage"<<power_matchline.searchOp.leakage<<endl;
+
+  power_matchline.searchOp.gate_leakage += gate_leak_power_cc_inverters_sram_cell;
+  power_matchline.searchOp.gate_leakage += gate_leak_comparator_cam_cell;
+  power_matchline.searchOp.gate_leakage += gate_leak_power_SCHP_port_sram_cell*SCHP + gate_leak_power_RD_port_sram_cell * ERP;
+  power_matchline.searchOp.gate_leakage *= (subarray.num_rows+1) * subarray.num_cols_fa_cam;//TODO:dummy line precise
+  power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(0, Wfaprechp,1, pmos) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(Waddrnandn, Waddrnandp, 2, nand) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.gate_leakage += (subarray.num_rows+1) * cmos_Ig_leakage(Wfanorn, Wfanorp, 2, nor) * g_tp.cam_cell.Vdd;
+  power_matchline.searchOp.gate_leakage += subarray.num_rows * cmos_Ig_leakage(W_hit_miss_n, 0,1, nmos) * g_tp.cam_cell.Vdd+
+                                       + cmos_Ig_leakage(0, W_hit_miss_p,1, pmos) * g_tp.cam_cell.Vdd; // the '+' before cmos_Ig_leakage is a unary plus (the line above already ends in '+')
+
+
+   return out_time_ramp;
+}
+
+
+double Mat::width_write_driver_or_write_mux()
+{
+  // Sizes the write-driver / write-mux NMOS from on-resistances of the SRAM cell devices
+  // (the original notes that CAM and SRAM cells share the same transistor properties).
+  // NOTE(review): the pull-up width cell_pmos_w is evaluated as NCH -- confirm intent.
+  const double r_pull_up = tr_R_on(g_tp.sram.cell_pmos_w, NCH, 1, is_dram, true);
+  const double r_access  = tr_R_on(g_tp.sram.cell_a_w,    NCH, 1, is_dram, true);
+  // Target resistance for driver plus mux: (2 * R_pull_up - R_access) / 2.
+  const double r_target = (2 * r_pull_up - r_access) / 2;
+  return R_to_w(r_target, NCH, is_dram);
+}
+
+
+
+double Mat::compute_comparators_height(
+    int tagbits,
+    int number_ways_in_mat,
+    double subarray_mem_cell_area_width)
+{ // Height = (NAND2 area * ways * tagbits / 4) spread over the given cell-area width.
+  double total_area = compute_gate_area(NAND, 2, 0, g_tp.w_comp_n, g_tp.cell_h_def);
+  total_area = total_area * number_ways_in_mat * tagbits / 4;
+  return total_area / subarray_mem_cell_area_width;
+}
+
+
+
+double Mat::compute_bitline_delay(double inrisetime)
+{
+  // Sets delay_bitline plus the bitline read/write energy and leakage in power_bitline.
+  // inrisetime is the ramp time of the wordline signal; the return value is always 0.
+  double V_b_pre, v_th_mem_cell, V_wl, tstep;
+  double dynRdEnergy = 0.0, dynWriteEnergy = 0.0;
+  double R_cell_pull_down=0.0, R_cell_acc =0.0, r_dev=0.0;
+  int deg_senseamp_muxing = dp.Ndsam_lev_1 * dp.Ndsam_lev_2;
+  // Bitline metal resistance per cell. The parentheses matter: '?:' binds looser than '*',
+  // so without them the CAM case would yield the bare cell height instead of a resistance.
+  double R_b_metal = (camFlag ? cam_cell.h : cell.h) * g_tp.wire_local.R_per_um;
+  double R_bl      = subarray.num_rows * R_b_metal;
+  double C_bl      = subarray.C_bl;
+
+  // TODO: no leakage for DRAMs?
+  double leak_power_cc_inverters_sram_cell = 0, gate_leak_power_cc_inverters_sram_cell = 0;
+  double leak_power_acc_tr_RW_or_WR_port_sram_cell = 0;
+  double leak_power_RD_port_sram_cell = 0, gate_leak_power_RD_port_sram_cell = 0;
+
+  if (is_dram == true)
+  {
+    V_b_pre = g_tp.dram.Vbitpre;
+    v_th_mem_cell = g_tp.dram_acc.Vth;
+    V_wl = g_tp.vpp;
+    //The access transistor is not folded. So we just need to specify a threshold value for the
+    //folding width that is equal to or greater than Wmemcella.
+    R_cell_acc = tr_R_on(g_tp.dram.cell_a_w, NCH, 1, true, true);
+    r_dev = g_tp.dram_cell_Vdd / g_tp.dram_cell_I_on + R_bl / 2;
+  }
+  else
+  { //SRAM
+    V_b_pre = g_tp.sram.Vbitpre;
+    v_th_mem_cell = g_tp.sram_cell.Vth;
+    V_wl = g_tp.sram_cell.Vdd;
+    R_cell_pull_down = tr_R_on(g_tp.sram.cell_nmos_w, NCH, 1, false, true);
+    R_cell_acc = tr_R_on(g_tp.sram.cell_a_w, NCH, 1, false, true);
+
+    //Leakage current of an SRAM cell
+    double Iport     = cmos_Isub_leakage(g_tp.sram.cell_a_w, 0,  1, nmos,false, true);//TODO: how much is the idle time? just by *2?
+    double Iport_erp = cmos_Isub_leakage(g_tp.sram.cell_a_w, 0,  2, nmos,false, true);
+    double Icell     = cmos_Isub_leakage(g_tp.sram.cell_nmos_w, g_tp.sram.cell_pmos_w, 1, inv,false, true)*2;//two invs per cell
+
+    leak_power_cc_inverters_sram_cell         = Icell * g_tp.sram_cell.Vdd;
+    leak_power_acc_tr_RW_or_WR_port_sram_cell = Iport * g_tp.sram_cell.Vdd;
+    leak_power_RD_port_sram_cell              = Iport_erp * g_tp.sram_cell.Vdd;
+
+
+    //in idle state, Ig_on only possibly exist in access transistors of read only ports
+    double Ig_port_erp   = cmos_Ig_leakage(g_tp.sram.cell_a_w, 0, 1, nmos,false, true);
+    double Ig_cell   = cmos_Ig_leakage(g_tp.sram.cell_nmos_w, g_tp.sram.cell_pmos_w, 1, inv,false, true); // NOTE(review): no "*2" here although Icell above doubles for the two inverters -- confirm
+
+    gate_leak_power_cc_inverters_sram_cell = Ig_cell*g_tp.sram_cell.Vdd;
+    gate_leak_power_RD_port_sram_cell      = Ig_port_erp*g_tp.sram_cell.Vdd;
+  }
+
+  // NOTE(review): the camFlag ternaries in the fold-width arguments below are left unparenthesised as in the original (both arms are widths) -- confirm intent.
+  double C_drain_bit_mux = drain_C_(g_tp.w_nmos_b_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w / (2 *(RWP + ERP + SCHP)), is_dram);
+  double R_bit_mux = tr_R_on(g_tp.w_nmos_b_mux, NCH, 1, is_dram);
+  double C_drain_sense_amp_iso = drain_C_(g_tp.w_iso, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+  double R_sense_amp_iso = tr_R_on(g_tp.w_iso, PCH, 1, is_dram);
+  double C_sense_amp_latch = gate_C(g_tp.w_sense_p + g_tp.w_sense_n, 0, is_dram) +
+    drain_C_(g_tp.w_sense_n, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram) +
+    drain_C_(g_tp.w_sense_p, PCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+  double C_drain_sense_amp_mux = drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, camFlag? cam_cell.w:cell.w * deg_bl_muxing / (RWP + ERP + SCHP), is_dram);
+
+  if (is_dram)
+  {
+    double fraction = dp.V_b_sense / ((g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl));
+    tstep = 2.3 * fraction * r_dev *
+      (g_tp.dram_cell_C * (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux)) /
+      (g_tp.dram_cell_C + (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux));
+    delay_writeback = tstep;
+    dynRdEnergy += (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) *
+      (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd /* subarray.num_cols * num_subarrays_per_mat*/;
+    dynWriteEnergy += (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch) *
+      (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd /* subarray.num_cols * num_subarrays_per_mat*/ * num_act_mats_hor_dir*100;
+    per_bitline_read_energy = (C_bl + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) *
+      (g_tp.dram_cell_Vdd / 2) * g_tp.dram_cell_Vdd;
+  }
+  else
+  {
+    double tau;
+
+    if (deg_bl_muxing > 1)
+    {
+      tau = (R_cell_pull_down + R_cell_acc) *
+        (C_bl + 2*C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) +
+        R_bl * (C_bl/2 + 2*C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) +
+        R_bit_mux * (C_drain_bit_mux + 2*C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) +
+        R_sense_amp_iso * (C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux);
+      dynRdEnergy += (C_bl + 2 * C_drain_bit_mux) * 2 * dp.V_b_sense * g_tp.sram_cell.Vdd /*
+        subarray.num_cols * num_subarrays_per_mat*/;
+      dynRdEnergy += (2 * C_drain_sense_amp_iso + C_sense_amp_latch +  C_drain_sense_amp_mux) *
+        2 * dp.V_b_sense * g_tp.sram_cell.Vdd * (1.0/*subarray.num_cols * num_subarrays_per_mat*/ / deg_bl_muxing);
+      dynWriteEnergy += ((1.0/*subarray.num_cols *num_subarrays_per_mat*/ / deg_bl_muxing) / deg_senseamp_muxing) *
+          num_act_mats_hor_dir * (C_bl + 2*C_drain_bit_mux) * g_tp.sram_cell.Vdd * g_tp.sram_cell.Vdd*2;
+      //Write Ops are differential for SRAM
+    }
+    else
+    {
+      tau = (R_cell_pull_down + R_cell_acc) *
+        (C_bl + C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) + R_bl * C_bl / 2 +
+        R_sense_amp_iso * (C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux);
+      dynRdEnergy += (C_bl + 2 * C_drain_sense_amp_iso + C_sense_amp_latch + C_drain_sense_amp_mux) *
+        2 * dp.V_b_sense * g_tp.sram_cell.Vdd /* subarray.num_cols * num_subarrays_per_mat*/;
+      dynWriteEnergy += (((1.0/*subarray.num_cols * num_subarrays_per_mat*/ / deg_bl_muxing) / deg_senseamp_muxing) *
+          num_act_mats_hor_dir * C_bl) * g_tp.sram_cell.Vdd * g_tp.sram_cell.Vdd*2;
+
+    }
+    tstep = tau * log(V_b_pre / (V_b_pre - dp.V_b_sense));
+    power_bitline.readOp.leakage =
+      leak_power_cc_inverters_sram_cell +
+      leak_power_acc_tr_RW_or_WR_port_sram_cell +
+      leak_power_acc_tr_RW_or_WR_port_sram_cell * (RWP + EWP - 1) +
+      leak_power_RD_port_sram_cell * ERP;
+    power_bitline.readOp.gate_leakage = gate_leak_power_cc_inverters_sram_cell +
+      gate_leak_power_RD_port_sram_cell * ERP;
+
+  }
+
+  // m is the slope of the wordline ramp (V_wl reached after inrisetime). When tstep is at
+  // most half the time the ramp needs to go from v_th_mem_cell to V_wl, the square-root
+  // form is used; otherwise the delay is tstep plus that half time. With inrisetime == 0
+  // and IEEE-754 doubles m is infinite and delay_bitline reduces to tstep (for tstep > 0).
+
+
+  /* take input rise time into account */
+  double m = V_wl / inrisetime;
+  if (tstep <= (0.5 * (V_wl - v_th_mem_cell) / m))
+  {
+    delay_bitline = sqrt(2 * tstep * (V_wl - v_th_mem_cell)/ m);
+  }
+  else
+  {
+    delay_bitline = tstep + (V_wl - v_th_mem_cell) / (2 * m);
+  }
+
+  bool is_fa = (dp.fully_assoc) ? true : false; // local flag; NOTE(review): presumably shadows a member of the same name -- confirm
+
+  if (dp.is_tag == false || is_fa == false)
+  { // energies are published for everything except a fully-associative tag array
+    power_bitline.readOp.dynamic  = dynRdEnergy;
+    power_bitline.writeOp.dynamic = dynWriteEnergy;
+  }
+
+  double outrisetime = 0;
+  return outrisetime;
+}
+
+
+
+double Mat::compute_sa_delay(double inrisetime)
+{
+  // Sense-amplifier stage: sets delay_sa, the read energy/leakage in power_sa and the
+  // closed/open-page leakage members. inrisetime is not used and 0 is always returned.
+  const double vdd = g_tp.peri_global.Vdd;
+
+  // Leakage currents of the isolation, sense-enable and latch transistors.
+  const double i_iso      = simplified_pmos_leakage(g_tp.w_iso, is_dram);
+  const double i_sense_en = simplified_nmos_leakage(g_tp.w_sense_en, is_dram);
+  const double i_latch_n  = simplified_nmos_leakage(g_tp.w_sense_n, is_dram);
+  const double i_latch_p  = simplified_pmos_leakage(g_tp.w_sense_p, is_dram);
+
+  // Idle phase counts only the sense-enable device; read phase counts the other three.
+  const double i_idle_phase = i_sense_en;
+  const double i_read_phase = i_iso + i_latch_n + i_latch_p;
+  leak_power_sense_amps_closed_page_state = i_idle_phase * vdd;
+  leak_power_sense_amps_open_page_state   = i_read_phase * vdd;
+
+  // Fold limit shared by every drain_C_ term below. NOTE(review): '?:' binds looser than
+  // '*' and '/', so a set camFlag selects the unscaled cam_cell.w -- kept as in the original.
+  const double fold_w = camFlag ? cam_cell.w : (cell.w * deg_bl_muxing / (RWP + ERP + SCHP));
+
+  // Load seen by the sense amp: latch gate plus latch, isolation and sa-mux drain caps.
+  double c_load = gate_C(g_tp.w_sense_p + g_tp.w_sense_n, 0, is_dram);
+  c_load += drain_C_(g_tp.w_sense_n,     NCH, 1, 0, fold_w, is_dram);
+  c_load += drain_C_(g_tp.w_sense_p,     PCH, 1, 0, fold_w, is_dram);
+  c_load += drain_C_(g_tp.w_iso,         PCH, 1, 0, fold_w, is_dram);
+  c_load += drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, fold_w, is_dram);
+
+  // Delay depends on the output time constant and on the input differential dp.V_b_sense.
+  const double time_const = c_load / g_tp.gm_sense_amp_latch;
+  delay_sa                = time_const * log(vdd / dp.V_b_sense);
+  power_sa.readOp.dynamic = c_load * vdd * vdd;
+  power_sa.readOp.leakage = i_idle_phase * vdd;
+  return 0;
+}
+
+
+
+double Mat::compute_subarray_out_drv(double inrisetime)
+{
+  // Walks four RC stages (level-1 sa-mux pass transistor, two inverter stages, level-2
+  // sa-mux pass transistor), adding each one to delay_subarray_out_drv and
+  // power_subarray_out_drv.readOp. Returns the ramp time derived from the last stage.
+  const double beta       = pmos_to_nmos_sz_ratio(is_dram);
+  const double vdd        = g_tp.peri_global.Vdd;
+  const double inv_pmos_w = beta * g_tp.min_w_nmos_;
+  const double inv_gate_w = g_tp.min_w_nmos_ + beta * g_tp.min_w_nmos_;
+  // NOTE(review): '?:' binds looser than '*' and '/'; a set camFlag picks the bare cam_cell.w.
+  const double fold_lev1 = camFlag ? cam_cell.w : (cell.w * deg_bl_muxing / (RWP + ERP + SCHP));
+  const double fold_lev2 = camFlag ? cam_cell.w : (cell.w * deg_bl_muxing * dp.Ndsam_lev_1 / (RWP + ERP + SCHP));
+
+  // Stage 1: level-1 sense-amp mux pass transistor into the inverter-buffer input.
+  double r_on   = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram);
+  double load_c = dp.Ndsam_lev_1 * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, fold_lev1, is_dram) +
+                  gate_C(inv_gate_w, 0.0, is_dram);
+  double stage_delay = horowitz(inrisetime, r_on * load_c, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += stage_delay;
+  inrisetime = stage_delay / (1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic      += load_c * 0.5 * vdd * vdd;
+  power_subarray_out_drv.readOp.leakage      += 0;  // pass-transistor leakage taken as 0 for now
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.w_nmos_sa_mux, 0, 1, nmos) * vdd;
+
+  // Stage 2: internal delay of the inverter-buffer (inverter loaded by an inverter gate).
+  r_on   = tr_R_on(g_tp.min_w_nmos_, NCH, 1, is_dram);
+  load_c = drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def, is_dram) +
+           drain_C_(inv_pmos_w, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+           gate_C(inv_gate_w, 0.0, is_dram);
+  stage_delay = horowitz(inrisetime, r_on * load_c, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += stage_delay;
+  inrisetime = stage_delay / (1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic      += load_c * 0.5 * vdd * vdd;
+  power_subarray_out_drv.readOp.leakage      += cmos_Isub_leakage(g_tp.min_w_nmos_, inv_pmos_w, 1, inv, is_dram) * vdd;
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.min_w_nmos_, inv_pmos_w, 1, inv) * vdd;
+
+  // Stage 3: inverter driving the drain of the level-2 sense-amp mux pass transistor.
+  // NOTE(review): unlike stage 2, cmos_Isub_leakage is called without is_dram here -- kept as is.
+  r_on   = tr_R_on(g_tp.min_w_nmos_, NCH, 1, is_dram);
+  load_c = drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def, is_dram) +
+           drain_C_(inv_pmos_w, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+           drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, fold_lev2, is_dram);
+  stage_delay = horowitz(inrisetime, r_on * load_c, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += stage_delay;
+  inrisetime = stage_delay / (1.0 - 0.5);
+  power_subarray_out_drv.readOp.dynamic      += load_c * 0.5 * vdd * vdd;
+  power_subarray_out_drv.readOp.leakage      += cmos_Isub_leakage(g_tp.min_w_nmos_, inv_pmos_w, 1, inv) * vdd;
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.min_w_nmos_, inv_pmos_w, 1, inv) * vdd;
+
+  // Stage 4: level-2 pass transistor into the repeater gates of subarray_out_wire.
+  r_on   = tr_R_on(g_tp.w_nmos_sa_mux, NCH, 1, is_dram);
+  load_c = dp.Ndsam_lev_2 * drain_C_(g_tp.w_nmos_sa_mux, NCH, 1, 0, fold_lev2, is_dram) +
+           gate_C(subarray_out_wire->repeater_size * (subarray_out_wire->wire_length / subarray_out_wire->repeater_spacing) * g_tp.min_w_nmos_ * (1 + beta), 0.0, is_dram);
+  stage_delay = horowitz(inrisetime, r_on * load_c, 0.5, 0.5, RISE);
+  delay_subarray_out_drv += stage_delay;
+  power_subarray_out_drv.readOp.dynamic      += load_c * 0.5 * vdd * vdd;
+  power_subarray_out_drv.readOp.leakage      += 0;  // pass-transistor leakage taken as 0 for now
+  power_subarray_out_drv.readOp.gate_leakage += cmos_Ig_leakage(g_tp.w_nmos_sa_mux, 0, 1, nmos) * vdd;
+  return stage_delay / (1.0 - 0.5);
+}
+
+
+
+double Mat::compute_comparator_delay(double inrisetime)
+{
+  // Tag comparator model: three inverter stages drive an evaluate stage (virtual-ground driver) that discharges the
+  // compare node. Side effects: sets delay_comparator, adds to power_comparator.readOp.dynamic and overwrites
+  // power_comparator.readOp.{leakage,gate_leakage}. Returns the evaluate-stage time divided by (1.0 - VTHMUXNAND).
+  const int    ways      = g_ip->tag_assoc;
+  const int    quad_bits = dp.tagbits / 4;  // 4 quarter comparators; the input tagbits is already a multiple of 4
+  const double vdd       = g_tp.peri_global.Vdd;
+  const double vth       = g_tp.peri_global.Vth;
+
+  /* Stage 1: first inverter */
+  double c_load = gate_C(g_tp.w_comp_inv_n2 + g_tp.w_comp_inv_p2, 0, is_dram) +
+                  drain_C_(g_tp.w_comp_inv_p1, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+                  drain_C_(g_tp.w_comp_inv_n1, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  double r_drive = tr_R_on(g_tp.w_comp_inv_p1, PCH, 1, is_dram);
+  double tau     = r_drive * c_load;
+  const double d_stage1 = horowitz(inrisetime, tau, VTHCOMPINV, VTHCOMPINV, FALL);
+  double ramp = d_stage1 / VTHCOMPINV;
+  power_comparator.readOp.dynamic += 0.5 * c_load * vdd * vdd * 4 * ways;
+  // Leakage currents: each degree of associativity has four quarter comparators, hence the 4 * ways factor.
+  double i_sub  = cmos_Isub_leakage(g_tp.w_comp_inv_n1, g_tp.w_comp_inv_p1, 1, inv, is_dram) * 4 * ways;
+  double i_gate = cmos_Ig_leakage(g_tp.w_comp_inv_n1, g_tp.w_comp_inv_p1, 1, inv, is_dram) * 4 * ways;
+
+  /* Stage 2: second inverter */
+  c_load = gate_C(g_tp.w_comp_inv_n3 + g_tp.w_comp_inv_p3, 0, is_dram) +
+           drain_C_(g_tp.w_comp_inv_p2, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+           drain_C_(g_tp.w_comp_inv_n2, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  r_drive = tr_R_on(g_tp.w_comp_inv_n2, NCH, 1, is_dram);
+  tau     = r_drive * c_load;
+  const double d_stage2 = horowitz(ramp, tau, VTHCOMPINV, VTHCOMPINV, RISE);
+  ramp = d_stage2 / (1.0-VTHCOMPINV);
+  power_comparator.readOp.dynamic += 0.5 * c_load * vdd * vdd * 4 * ways;
+  i_sub  += cmos_Isub_leakage(g_tp.w_comp_inv_n2, g_tp.w_comp_inv_p2, 1, inv, is_dram) * 4 * ways;
+  i_gate += cmos_Ig_leakage(g_tp.w_comp_inv_n2, g_tp.w_comp_inv_p2, 1, inv, is_dram) * 4 * ways;
+
+  /* Stage 3: third inverter */
+  c_load = gate_C(g_tp.w_eval_inv_n + g_tp.w_eval_inv_p, 0, is_dram) +
+           drain_C_(g_tp.w_comp_inv_p3, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+           drain_C_(g_tp.w_comp_inv_n3, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  r_drive = tr_R_on(g_tp.w_comp_inv_p3, PCH, 1, is_dram);
+  tau     = r_drive * c_load;
+  const double d_stage3 = horowitz(ramp, tau, VTHCOMPINV, VTHEVALINV, FALL);
+  ramp = d_stage3 / (VTHEVALINV);
+  power_comparator.readOp.dynamic += 0.5 * c_load * vdd * vdd * 4 * ways;
+  i_sub  += cmos_Isub_leakage(g_tp.w_comp_inv_n3, g_tp.w_comp_inv_p3, 1, inv, is_dram) * 4 * ways;
+  i_gate += cmos_Ig_leakage(g_tp.w_comp_inv_n3, g_tp.w_comp_inv_p3, 1, inv, is_dram) * 4 * ways;
+
+  /* Stage 4: final inverter (virtual ground driver) discharging the compare part */
+  const double r_cmp  = tr_R_on(g_tp.w_comp_n, NCH, 2, is_dram);
+  const double r_eval = tr_R_on(g_tp.w_eval_inv_n, NCH, 1, is_dram); /* was switch */
+  const double c_eval = quad_bits * (drain_C_(g_tp.w_comp_n, NCH, 1, 1, g_tp.cell_h_def, is_dram) +
+                                     drain_C_(g_tp.w_comp_n, NCH, 2, 1, g_tp.cell_h_def, is_dram)) +
+                        drain_C_(g_tp.w_eval_inv_p, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+                        drain_C_(g_tp.w_eval_inv_n, NCH, 1, 1, g_tp.cell_h_def, is_dram);
+  const double c_out  = quad_bits * (drain_C_(g_tp.w_comp_n, NCH, 1, 1, g_tp.cell_h_def, is_dram) +
+                                     drain_C_(g_tp.w_comp_n, NCH, 2, 1, g_tp.cell_h_def, is_dram)) +
+                        drain_C_(g_tp.w_comp_p, PCH, 1, 1, g_tp.cell_h_def, is_dram) +
+                        gate_C(WmuxdrvNANDn+WmuxdrvNANDp, 0, is_dram);
+  power_comparator.readOp.dynamic += 0.5 * c_eval * vdd * vdd * 4 * ways;
+  power_comparator.readOp.dynamic += c_out * vdd * vdd * (ways - 1);
+  i_sub  += cmos_Isub_leakage(g_tp.w_eval_inv_n, g_tp.w_eval_inv_p, 1, inv, is_dram) * 4 * ways;
+  i_sub  += cmos_Isub_leakage(g_tp.w_comp_n, g_tp.w_comp_n, 1, inv, is_dram) * 4 * ways;  // original note: "stack factor of 0.2" (no such factor appears in this expression)
+  i_gate += cmos_Ig_leakage(g_tp.w_eval_inv_n, g_tp.w_eval_inv_p, 1, inv, is_dram) * 4 * ways;
+  i_gate += cmos_Ig_leakage(g_tp.w_comp_n, g_tp.w_comp_n, 1, inv, is_dram) * 4 * ways;  // for gate leakage this is treated as an inverter
+
+  /* time to reach the threshold of the mux driver */
+  const double t_step = (r_eval*c_eval + (r_cmp+r_eval)*c_out) * log(1.0/VTHMUXNAND);
+  /* take the non-zero input rise time into account */
+  const double slope = vdd / ramp;
+  double t_cmp;
+  if ((t_step) <= (0.5*(vdd-vth)/slope))
+  {
+    const double v_eval = (vdd*VTHEVALINV) - vth;  // term shared by the quadratic's b and c coefficients
+    const double qb = 2*v_eval;
+    const double qc = -2*(t_step)*(vdd-vth) + 1/slope*v_eval*v_eval;
+    t_cmp = (-qb + sqrt(qb*qb - 4*slope*qc)) / (2*slope);
+  }
+  else
+  {
+    t_cmp = (t_step) + (vdd+vth)/(2*slope) - (vdd*VTHEVALINV)/slope;
+  }
+
+  delay_comparator = t_cmp + d_stage1 + d_stage2 + d_stage3;
+  power_comparator.readOp.leakage = i_sub * vdd;
+  power_comparator.readOp.gate_leakage = i_gate * vdd;
+  return t_cmp / (1.0 - VTHMUXNAND);
+}
+
+
+
+void Mat::compute_power_energy()
+{
+  // Rolls the per-component dynamic energy, sub-threshold leakage and gate leakage of this mat up into `power`
+  // and the power_* members. For CAM and FA mats, power.readOp is the plain read/write figure (only one subarray
+  // in a single mat is active) and power.searchOp is the associative-search figure (all subarrays/mats active).
+  // NOTE: several members are rescaled in place with *=, so this is only meaningful when called once per Mat.
+  // Predecoder drivers are shared by all subarrays in a mat, so their energy is added once.
+  power.readOp.dynamic += r_predec->power.readOp.dynamic +
+                          b_mux_predec->power.readOp.dynamic +
+                          sa_mux_lev_1_predec->power.readOp.dynamic +
+                          sa_mux_lev_2_predec->power.readOp.dynamic;
+
+  // Row-decoder energy; multiplied by the subarray count only for plain RAM mats.
+  power_row_decoders.readOp.dynamic = row_dec->power.readOp.dynamic;
+  if (!(is_fa || pure_cam))
+    power_row_decoders.readOp.dynamic *= num_subarrays_per_mat;
+
+  // Dynamic energy of bitline prechargers, sense amps, bitlines and subarray output drivers.
+  if (!(is_fa || pure_cam))
+  {
+    // Plain RAM mat: bitline prechargers, scaled by the number of subarrays.
+    power_bl_precharge_eq_drv.readOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic;
+    power_bl_precharge_eq_drv.readOp.dynamic *= num_subarrays_per_mat;
+
+    // Sense amps: one per deg_bl_muxing columns of each subarray.
+    num_sa_subarray = subarray.num_cols / deg_bl_muxing;
+    power_sa.readOp.dynamic *= num_sa_subarray*num_subarrays_per_mat;
+
+    // Bitlines: the stored value is scaled by the total
+    // column count over all subarrays of the mat.
+    power_bitline.readOp.dynamic *= num_subarrays_per_mat*subarray.num_cols;
+    power_bitline.writeOp.dynamic *= num_subarrays_per_mat*subarray.num_cols;
+
+    // Subarray output drivers plus their wires, scaled by num_do_b_mat.
+    power_subarray_out_drv.readOp.dynamic =
+      (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat;
+
+    power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic +
+                            power_sa.readOp.dynamic +
+                            power_bitline.readOp.dynamic +
+                            power_subarray_out_drv.readOp.dynamic;
+
+    power.readOp.dynamic += power_row_decoders.readOp.dynamic +
+                            bit_mux_dec->power.readOp.dynamic +
+                            sa_mux_lev_1_dec->power.readOp.dynamic +
+                            sa_mux_lev_2_dec->power.readOp.dynamic +
+                            power_comparator.readOp.dynamic;
+  }
+
+  else if (is_fa)
+  {
+    // Fully-associative mat: a plain read/write activates a single subarray, so the read figures
+    // below carry no num_subarrays_per_mat factor. Reads count both the RAM and the CAM bitline prechargers.
+    power_bl_precharge_eq_drv.readOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic
+             + cam_bl_precharge_eq_drv->power.readOp.dynamic;
+    power_bl_precharge_eq_drv.searchOp.dynamic = bl_precharge_eq_drv->power.readOp.dynamic;
+
+    // Sense amps: reads count the CAM + RAM columns, searches only the RAM columns.
+    num_sa_subarray = (subarray.num_cols_fa_cam + subarray.num_cols_fa_ram)/ deg_bl_muxing;
+    num_sa_subarray_search = subarray.num_cols_fa_ram/ deg_bl_muxing;
+    power_sa.searchOp.dynamic = power_sa.readOp.dynamic*num_sa_subarray_search;
+    power_sa.readOp.dynamic *= num_sa_subarray;
+
+    // Bitlines: searchOp is seeded from the still-unscaled readOp value,
+    // so it has to be copied before readOp is multiplied.
+    power_bitline.searchOp.dynamic = power_bitline.readOp.dynamic;
+    power_bitline.readOp.dynamic *= (subarray.num_cols_fa_cam+subarray.num_cols_fa_ram);
+    power_bitline.writeOp.dynamic *= (subarray.num_cols_fa_cam+subarray.num_cols_fa_ram);
+    power_bitline.searchOp.dynamic *= subarray.num_cols_fa_ram;
+
+    // Subarray output drivers: the searchOp statement must run first because it reads the unscaled readOp value.
+    power_subarray_out_drv.searchOp.dynamic =
+      (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_so_b_mat;
+    power_subarray_out_drv.readOp.dynamic =
+      (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat;
+
+
+    power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic +
+                            power_sa.readOp.dynamic +
+                            power_bitline.readOp.dynamic +
+                            power_subarray_out_drv.readOp.dynamic;
+
+    power.readOp.dynamic += power_row_decoders.readOp.dynamic +
+                            bit_mux_dec->power.readOp.dynamic +
+                            sa_mux_lev_1_dec->power.readOp.dynamic +
+                            sa_mux_lev_2_dec->power.readOp.dynamic +
+                            power_comparator.readOp.dynamic;
+
+    // Energy consumed inside the CAM during a search, scaled to all subarrays of the mat.
+    power_matchline.searchOp.dynamic *= num_subarrays_per_mat;
+    power_searchline_precharge = sl_precharge_eq_drv->power;
+    power_searchline_precharge.searchOp.dynamic = power_searchline_precharge.readOp.dynamic * num_subarrays_per_mat;
+    power_searchline = sl_data_drv->power;
+    power_searchline.searchOp.dynamic = power_searchline.readOp.dynamic*subarray.num_cols_fa_cam* num_subarrays_per_mat;
+    power_matchline_precharge  = ml_precharge_drv->power;
+    power_matchline_precharge.searchOp.dynamic = power_matchline_precharge.readOp.dynamic* num_subarrays_per_mat;
+    power_ml_to_ram_wl_drv= ml_to_ram_wl_drv->power;
+    power_ml_to_ram_wl_drv.searchOp.dynamic= ml_to_ram_wl_drv->power.readOp.dynamic;
+
+    power_cam_all_active.searchOp.dynamic = power_matchline.searchOp.dynamic;
+    power_cam_all_active.searchOp.dynamic +=power_searchline_precharge.searchOp.dynamic;
+    power_cam_all_active.searchOp.dynamic +=power_searchline.searchOp.dynamic;
+    power_cam_all_active.searchOp.dynamic +=power_matchline_precharge.searchOp.dynamic;
+
+    power.searchOp.dynamic += power_cam_all_active.searchOp.dynamic;
+    // ml_to_ram_wl_drv energy is kept in power_ml_to_ram_wl_drv only; it is not added to power.searchOp.
+
+  }
+  else
+  {
+    // Pure CAM mat: only the CAM bitline precharger is counted.
+    power_bl_precharge_eq_drv.readOp.dynamic = cam_bl_precharge_eq_drv->power.readOp.dynamic;
+    // Left unscaled: no num_subarrays_per_mat factor is applied and no
+    // searchOp share is recorded for the precharger in this branch
+    // (the statements doing so were disabled in the original code).
+
+    // Sense amps: one per deg_bl_muxing CAM columns; no search contribution.
+    num_sa_subarray = subarray.num_cols_fa_cam/ deg_bl_muxing;
+    power_sa.readOp.dynamic *= num_sa_subarray;  // no num_subarrays_per_mat factor
+    power_sa.searchOp.dynamic = 0;
+
+    power_bitline.readOp.dynamic *= subarray.num_cols_fa_cam;
+    power_bitline.searchOp.dynamic = 0;
+    power_bitline.writeOp.dynamic *= subarray.num_cols_fa_cam;
+
+    power_subarray_out_drv.searchOp.dynamic =
+      (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_so_b_mat;
+    power_subarray_out_drv.readOp.dynamic =
+      (power_subarray_out_drv.readOp.dynamic + subarray_out_wire->power.readOp.dynamic) * num_do_b_mat;
+
+    power.readOp.dynamic += power_bl_precharge_eq_drv.readOp.dynamic +
+                            power_sa.readOp.dynamic +
+                            power_bitline.readOp.dynamic +
+                            power_subarray_out_drv.readOp.dynamic;
+
+    power.readOp.dynamic += power_row_decoders.readOp.dynamic +
+                            bit_mux_dec->power.readOp.dynamic +
+                            sa_mux_lev_1_dec->power.readOp.dynamic +
+                            sa_mux_lev_2_dec->power.readOp.dynamic +
+                            power_comparator.readOp.dynamic;
+
+
+    // Energy consumed inside the CAM during a search, scaled to all subarrays of the mat.
+    power_matchline.searchOp.dynamic *= num_subarrays_per_mat;
+    power_searchline_precharge = sl_precharge_eq_drv->power;
+    power_searchline_precharge.searchOp.dynamic = power_searchline_precharge.readOp.dynamic * num_subarrays_per_mat;
+    power_searchline = sl_data_drv->power;
+    power_searchline.searchOp.dynamic = power_searchline.readOp.dynamic*subarray.num_cols_fa_cam* num_subarrays_per_mat;
+    power_matchline_precharge  = ml_precharge_drv->power;
+    power_matchline_precharge.searchOp.dynamic = power_matchline_precharge.readOp.dynamic* num_subarrays_per_mat;
+    power_ml_to_ram_wl_drv= ml_to_ram_wl_drv->power;
+    power_ml_to_ram_wl_drv.searchOp.dynamic= ml_to_ram_wl_drv->power.readOp.dynamic;
+
+    power_cam_all_active.searchOp.dynamic = power_matchline.searchOp.dynamic;
+    power_cam_all_active.searchOp.dynamic +=power_searchline_precharge.searchOp.dynamic;
+    power_cam_all_active.searchOp.dynamic +=power_searchline.searchOp.dynamic;
+    power_cam_all_active.searchOp.dynamic +=power_matchline_precharge.searchOp.dynamic;
+
+    power.searchOp.dynamic += power_cam_all_active.searchOp.dynamic;
+    // ml_to_ram_wl_drv energy is kept in power_ml_to_ram_wl_drv only; it is not added to power.searchOp.
+
+  }
+
+
+
+  // Leakage power (sub-threshold first, then gate leakage) for the same three mat kinds.
+  if (!(is_fa || pure_cam))
+  {
+    int number_output_drivers_subarray = num_sa_subarray / (dp.Ndsam_lev_1 * dp.Ndsam_lev_2);
+
+    power_bitline.readOp.leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+    power_sa.readOp.leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP);
+
+    // Output drivers (with their wires): number_output_drivers_subarray per subarray.
+    power_subarray_out_drv.readOp.leakage =
+      (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP);
+
+    power.readOp.leakage += power_bitline.readOp.leakage +
+                            power_bl_precharge_eq_drv.readOp.leakage +
+                            power_sa.readOp.leakage +
+                            power_subarray_out_drv.readOp.leakage;
+
+    // Comparator leakage, scaled by num_do_b_mat * (RWP + ERP).
+    power_comparator.readOp.leakage *= num_do_b_mat * (RWP + ERP);
+    power.readOp.leakage += power_comparator.readOp.leakage;
+
+    // Decoder leakage: row, bitline-mux and sense-amp-mux decoder values are scaled by
+    // num_rows * num_subarrays_per_mat, deg_bl_muxing and Ndsam_lev_1 / Ndsam_lev_2
+    // respectively; predecoder leakage is added unscaled.
+    power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat;
+    power_bit_mux_decoders.readOp.leakage      = bit_mux_dec->power.readOp.leakage * deg_bl_muxing;
+    power_sa_mux_lev_1_decoders.readOp.leakage = sa_mux_lev_1_dec->power.readOp.leakage * dp.Ndsam_lev_1;
+    power_sa_mux_lev_2_decoders.readOp.leakage = sa_mux_lev_2_dec->power.readOp.leakage * dp.Ndsam_lev_2;
+
+    power.readOp.leakage += r_predec->power.readOp.leakage +
+                            b_mux_predec->power.readOp.leakage +
+                            sa_mux_lev_1_predec->power.readOp.leakage +
+                            sa_mux_lev_2_predec->power.readOp.leakage +
+                            power_row_decoders.readOp.leakage +
+                            power_bit_mux_decoders.readOp.leakage +
+                            power_sa_mux_lev_1_decoders.readOp.leakage +
+                            power_sa_mux_lev_2_decoders.readOp.leakage;
+
+    // Gate leakage: mirrors the sub-threshold leakage
+    // computation above, term for term.
+    power_bitline.readOp.gate_leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.readOp.gate_leakage = bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+    power_sa.readOp.gate_leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP);
+
+    // Output drivers (with their wires): number_output_drivers_subarray per subarray.
+    power_subarray_out_drv.readOp.gate_leakage =
+      (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP);
+
+    power.readOp.gate_leakage += power_bitline.readOp.gate_leakage +
+                                 power_bl_precharge_eq_drv.readOp.gate_leakage +
+                                 power_sa.readOp.gate_leakage +
+                                 power_subarray_out_drv.readOp.gate_leakage;
+
+    // Comparator gate leakage, scaled by num_do_b_mat * (RWP + ERP).
+    power_comparator.readOp.gate_leakage *= num_do_b_mat * (RWP + ERP);
+    power.readOp.gate_leakage += power_comparator.readOp.gate_leakage;
+
+    // Decoder gate leakage, scaled like the decoder
+    // sub-threshold leakage above; predecoder gate
+    // leakage is added unscaled.
+    power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat;
+    power_bit_mux_decoders.readOp.gate_leakage      = bit_mux_dec->power.readOp.gate_leakage * deg_bl_muxing;
+    power_sa_mux_lev_1_decoders.readOp.gate_leakage = sa_mux_lev_1_dec->power.readOp.gate_leakage * dp.Ndsam_lev_1;
+    power_sa_mux_lev_2_decoders.readOp.gate_leakage = sa_mux_lev_2_dec->power.readOp.gate_leakage * dp.Ndsam_lev_2;
+
+    power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage +
+                                 b_mux_predec->power.readOp.gate_leakage +
+                                 sa_mux_lev_1_predec->power.readOp.gate_leakage +
+                                 sa_mux_lev_2_predec->power.readOp.gate_leakage +
+                                 power_row_decoders.readOp.gate_leakage +
+                                 power_bit_mux_decoders.readOp.gate_leakage +
+                                 power_sa_mux_lev_1_decoders.readOp.gate_leakage +
+                                 power_sa_mux_lev_2_decoders.readOp.gate_leakage;
+  }
+  else if (is_fa)
+  {
+    int number_output_drivers_subarray = num_sa_subarray;  // not divided by the sense-amp mux degrees here
+
+    power_bitline.readOp.leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.readOp.leakage = bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.searchOp.leakage = cam_bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+    power_sa.readOp.leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+    // Output drivers (with their wires). Unlike the plain RAM
+    // branch, the driver count is num_sa_subarray itself and
+    // the port factor also includes SCHP.
+    power_subarray_out_drv.readOp.leakage =
+      (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+    power.readOp.leakage += power_bitline.readOp.leakage +
+                            power_bl_precharge_eq_drv.readOp.leakage +
+                            power_bl_precharge_eq_drv.searchOp.leakage +
+                            power_sa.readOp.leakage +
+                            power_subarray_out_drv.readOp.leakage;
+
+    // Row decoder and row predecoder leakage; the
+    // bitline-mux and sense-amp-mux decoders are not
+    // counted in this branch.
+    power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat;
+    power.readOp.leakage += r_predec->power.readOp.leakage +
+                            power_row_decoders.readOp.leakage;
+
+    // Inside the CAM: matchline, searchline precharge, searchline data drivers
+    // (one per CAM column) and matchline precharge driver, for all subarrays.
+    // The matchline precharge driver contributes its leakage term, like the other drivers.
+    power_cam_all_active.searchOp.leakage = power_matchline.searchOp.leakage;
+    power_cam_all_active.searchOp.leakage +=sl_precharge_eq_drv->power.readOp.leakage;
+    power_cam_all_active.searchOp.leakage +=sl_data_drv->power.readOp.leakage*subarray.num_cols_fa_cam;
+    power_cam_all_active.searchOp.leakage +=ml_precharge_drv->power.readOp.leakage;
+    power_cam_all_active.searchOp.leakage *= num_subarrays_per_mat;
+
+    power.readOp.leakage += power_cam_all_active.searchOp.leakage;
+
+
+    // Gate leakage: mirrors the sub-threshold leakage
+    // computation above, term for term.
+    power_bitline.readOp.gate_leakage            *= subarray.num_rows * subarray.num_cols * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.readOp.gate_leakage = bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+    power_bl_precharge_eq_drv.searchOp.gate_leakage = cam_bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+    power_sa.readOp.gate_leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+    // Output drivers (with their wires); the driver count
+    // is num_sa_subarray and the port factor includes
+    // SCHP, as for the sub-threshold leakage.
+    power_subarray_out_drv.readOp.gate_leakage =
+      (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+    power.readOp.gate_leakage += power_bitline.readOp.gate_leakage +
+                                 power_bl_precharge_eq_drv.readOp.gate_leakage +
+                                 power_bl_precharge_eq_drv.searchOp.gate_leakage +
+                                 power_sa.readOp.gate_leakage +
+                                 power_subarray_out_drv.readOp.gate_leakage;
+
+    // Row decoder and row predecoder gate leakage; the
+    // bitline-mux and sense-amp-mux decoders are not
+    // counted in this branch.
+    power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat;
+    power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage +
+                                 power_row_decoders.readOp.gate_leakage;
+
+    // Inside the CAM: same four contributors as for the
+    // sub-threshold leakage; the matchline precharge driver
+    // contributes its gate-leakage term.
+    power_cam_all_active.searchOp.gate_leakage = power_matchline.searchOp.gate_leakage;
+    power_cam_all_active.searchOp.gate_leakage +=sl_precharge_eq_drv->power.readOp.gate_leakage;
+    power_cam_all_active.searchOp.gate_leakage +=sl_data_drv->power.readOp.gate_leakage*subarray.num_cols_fa_cam;
+    power_cam_all_active.searchOp.gate_leakage +=ml_precharge_drv->power.readOp.gate_leakage;
+    power_cam_all_active.searchOp.gate_leakage *= num_subarrays_per_mat;
+
+    power.readOp.gate_leakage += power_cam_all_active.searchOp.gate_leakage;
+
+  }
+  else
+  {
+    int number_output_drivers_subarray = num_sa_subarray;  // not divided by the sense-amp mux degrees here
+
+    // Pure CAM: bitline leakage and the RAM bitline precharger
+    // are not accounted for in this branch.
+    power_bl_precharge_eq_drv.searchOp.leakage = cam_bl_precharge_eq_drv->power.readOp.leakage * num_subarrays_per_mat;
+    power_sa.readOp.leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+
+    power_subarray_out_drv.readOp.leakage =
+      (power_subarray_out_drv.readOp.leakage + subarray_out_wire->power.readOp.leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+    power.readOp.leakage += // no power_bitline.readOp.leakage term and
+                            // no power_bl_precharge_eq_drv.readOp.leakage term here
+                            power_bl_precharge_eq_drv.searchOp.leakage +
+                            power_sa.readOp.leakage +
+                            power_subarray_out_drv.readOp.leakage;
+
+    // Row decoders: unlike the other branches a (RWP + ERP + EWP) factor is applied. NOTE(review): confirm this asymmetry is intended.
+    power_row_decoders.readOp.leakage = row_dec->power.readOp.leakage * subarray.num_rows * num_subarrays_per_mat*(RWP + ERP + EWP);
+    power.readOp.leakage += r_predec->power.readOp.leakage +
+                            power_row_decoders.readOp.leakage;
+
+    // Inside the CAM; the matchline precharge driver contributes its leakage term, like the other drivers.
+    power_cam_all_active.searchOp.leakage = power_matchline.searchOp.leakage;
+    power_cam_all_active.searchOp.leakage +=sl_precharge_eq_drv->power.readOp.leakage;
+    power_cam_all_active.searchOp.leakage +=sl_data_drv->power.readOp.leakage*subarray.num_cols_fa_cam;
+    power_cam_all_active.searchOp.leakage +=ml_precharge_drv->power.readOp.leakage;
+    power_cam_all_active.searchOp.leakage *= num_subarrays_per_mat;
+
+    power.readOp.leakage += power_cam_all_active.searchOp.leakage;
+
+    // Gate leakage: mirrors the sub-threshold leakage computation above, term for term.
+    power_bl_precharge_eq_drv.searchOp.gate_leakage = cam_bl_precharge_eq_drv->power.readOp.gate_leakage * num_subarrays_per_mat;
+    power_sa.readOp.gate_leakage                 *= num_sa_subarray*num_subarrays_per_mat*(RWP + ERP + SCHP);
+
+
+    power_subarray_out_drv.readOp.gate_leakage =
+      (power_subarray_out_drv.readOp.gate_leakage + subarray_out_wire->power.readOp.gate_leakage) *
+      number_output_drivers_subarray * num_subarrays_per_mat * (RWP + ERP + SCHP);
+
+    power.readOp.gate_leakage += // no power_bitline.readOp.gate_leakage term and
+                                 // no power_bl_precharge_eq_drv.readOp.gate_leakage term here
+                                 power_bl_precharge_eq_drv.searchOp.gate_leakage +
+                                 power_sa.readOp.gate_leakage +
+                                 power_subarray_out_drv.readOp.gate_leakage;
+
+    // Row decoders: same (RWP + ERP + EWP) factor as for the sub-threshold leakage of this branch.
+    power_row_decoders.readOp.gate_leakage = row_dec->power.readOp.gate_leakage * subarray.num_rows * num_subarrays_per_mat*(RWP + ERP + EWP);
+    power.readOp.gate_leakage += r_predec->power.readOp.gate_leakage +
+                                 power_row_decoders.readOp.gate_leakage;
+
+    // Inside the CAM; the matchline precharge driver contributes its gate-leakage term.
+    power_cam_all_active.searchOp.gate_leakage = power_matchline.searchOp.gate_leakage;
+    power_cam_all_active.searchOp.gate_leakage +=sl_precharge_eq_drv->power.readOp.gate_leakage;
+    power_cam_all_active.searchOp.gate_leakage +=sl_data_drv->power.readOp.gate_leakage*subarray.num_cols_fa_cam;
+    power_cam_all_active.searchOp.gate_leakage +=ml_precharge_drv->power.readOp.gate_leakage;
+    power_cam_all_active.searchOp.gate_leakage *= num_subarrays_per_mat;
+
+    power.readOp.gate_leakage += power_cam_all_active.searchOp.gate_leakage;
+  }
+}
+
diff --git a/src/gpuwattch/cacti/mat.h b/src/gpuwattch/cacti/mat.h
new file mode 100755
index 000000000..e408a7d92
--- /dev/null
+++ b/src/gpuwattch/cacti/mat.h
@@ -0,0 +1,148 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __MAT_H__
+#define __MAT_H__
+
+#include "component.h"
+#include "decoder.h"
+#include "wire.h"
+#include "subarray.h"
+
+class Mat : public Component  // Delay/power model of one mat: its subarray, decoders, predecoders, drivers and the per-component results.
+{
+  public:
+    Mat(const DynamicParameter & dyn_p);  // NOTE(review): dyn_p is presumably bound to the reference member dp; the constructor is defined elsewhere
+    ~Mat();
+    double compute_delays(double inrisetime);  // returns the output rise time
+    void compute_power_energy();  // fills power and the power_* members; rescales several of them in place
+
+    const DynamicParameter & dp;  // reference member: the referenced object must outlive this Mat
+
+    // TODO: clean up pointers and powerDefs below
+    Decoder * row_dec;
+    Decoder * bit_mux_dec;
+    Decoder * sa_mux_lev_1_dec;
+    Decoder * sa_mux_lev_2_dec;
+    PredecBlk * dummy_way_sel_predec_blk1;
+    PredecBlk * dummy_way_sel_predec_blk2;
+    PredecBlkDrv * way_sel_drv1;
+    PredecBlkDrv * dummy_way_sel_predec_blk_drv2;
+
+    Predec * r_predec;
+    Predec * b_mux_predec;
+    Predec * sa_mux_lev_1_predec;
+    Predec * sa_mux_lev_2_predec;
+
+    Wire   * subarray_out_wire;
+    Driver * bl_precharge_eq_drv;
+    Driver * cam_bl_precharge_eq_drv;//the bitline precharge circuit is separate for CAM and RAM arrays.
+    Driver * ml_precharge_drv;//matchline precharge driver
+    Driver * sl_precharge_eq_drv;//searchline precharge driver
+    Driver * sl_data_drv;//search line data driver
+    Driver * ml_to_ram_wl_drv;//NOTE(review): the previous comment duplicated "search line data driver"; by its name presumably the matchline-to-RAM-wordline driver -- confirm
+
+    // Per-component power results; the ones written by compute_power_energy() hold mat-level totals afterwards.
+    powerDef power_row_decoders;
+    powerDef power_bit_mux_decoders;
+    powerDef power_sa_mux_lev_1_decoders;
+    powerDef power_sa_mux_lev_2_decoders;
+    powerDef power_fa_cam;  // TODO: leakage power is not computed yet
+    powerDef power_bl_precharge_eq_drv;
+    powerDef power_subarray_out_drv;
+    powerDef power_cam_all_active;  // sum of the CAM-internal search contributions (see compute_power_energy)
+    powerDef power_searchline_precharge;
+    powerDef power_matchline_precharge;
+    powerDef power_ml_to_ram_wl_drv;  // recorded separately; compute_power_energy() does not add it to power.searchOp
+
+    double   delay_fa_tag, delay_cam;
+    double   delay_before_decoder;
+    double   delay_bitline;
+    double   delay_wl_reset;
+    double   delay_bl_restore;
+
+    double   delay_searchline;
+    double   delay_matchchline;  // NOTE(review): name looks like a typo of "delay_matchline"; left as is because the member is public
+    double   delay_cam_sl_restore;
+    double   delay_cam_ml_reset;
+    double   delay_fa_ram_wl;
+
+    double   delay_hit_miss_reset;
+    double   delay_hit_miss;
+
+    Subarray subarray;
+    powerDef power_bitline, power_searchline, power_matchline;
+    double   per_bitline_read_energy;
+    int      deg_bl_muxing;  // divisor that turns a column count into a sense-amp count in compute_power_energy()
+    int      num_act_mats_hor_dir;
+    double   delay_writeback;
+    Area     cell,cam_cell;
+    bool     is_dram,is_fa, pure_cam, camFlag;  // is_fa / pure_cam select the FA and pure-CAM branches of compute_power_energy()
+    int      num_mats;
+    powerDef power_sa;
+    double   delay_sa;
+    double   leak_power_sense_amps_closed_page_state;
+    double   leak_power_sense_amps_open_page_state;
+    double   delay_subarray_out_drv;
+    double   delay_subarray_out_drv_htree;
+    double   delay_comparator;  // set by compute_comparator_delay()
+    powerDef power_comparator;  // filled by compute_comparator_delay(), rescaled by compute_power_energy()
+    int      num_do_b_mat;  // multiplier for the read-path subarray output energy in compute_power_energy()
+    int      num_so_b_mat;  // multiplier for the search-path subarray output energy in compute_power_energy()
+    int      num_sa_subarray;  // sense amps per subarray; assigned in compute_power_energy()
+    int      num_sa_subarray_search;  // assigned in the FA branch of compute_power_energy()
+    double   C_bl;
+
+    uint32_t num_subarrays_per_mat;  // the number of subarrays in a mat
+    uint32_t num_subarrays_per_row;  // the number of subarrays in a row of a mat
+
+
+  private:
+    double compute_bit_mux_sa_precharge_sa_mux_wr_drv_wr_mux_h();
+    double width_write_driver_or_write_mux();
+    double compute_comparators_height(int tagbits, int number_ways_in_mat, double subarray_mem_cell_area_w);
+    double compute_cam_delay(double inrisetime);
+    double compute_bitline_delay(double inrisetime);
+    double compute_sa_delay(double inrisetime);
+    double compute_subarray_out_drv(double inrisetime);
+    double compute_comparator_delay(double inrisetime);
+    // NOTE(review): presumably port counts (read/write, exclusive read, exclusive write, search); they act as leakage multipliers in compute_power_energy() -- confirm where they are set
+    int RWP;
+    int ERP;
+    int EWP;
+    int SCHP;
+};
+
+
+
+#endif
diff --git a/src/gpuwattch/cacti/nuca.cc b/src/gpuwattch/cacti/nuca.cc
new file mode 100644
index 000000000..f31b0c851
--- /dev/null
+++ b/src/gpuwattch/cacti/nuca.cc
@@ -0,0 +1,612 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+
+#include "nuca.h"
+#include "Ucache.h"
+#include <assert.h>
+
+unsigned int MIN_BANKSIZE=65536; /* divisor bounding the bank-size search; sim_nuca() doubles it in place for associativity > 2 */
+#define FIXED_OVERHEAD 55e-12 /* clock skew and jitter in s. Ref: Hrishikesh et al ISCA 01 */
+#define LATCH_DELAY 28e-12 /* latch delay in s (later should use FO4 TODO) */
+#define CONTR_2_BANK_LAT 0 /* constant term added to every controller-to-bank latency in sim_nuca() */
+/* Contention table loaded by init_cont() from contention.dat; only core indices 2..4 are filled. */
+int cont_stats[2 /*l2 or l3*/][5/* cores */][ROUTER_TYPES][7 /*banks*/][8 /* cycle time */];
+
+Nuca::Nuca():deviceType(&(g_tp.peri_global)), wt_min(0), wt_max(-1), wire_vertical(), wire_horizontal()
+{ // empty [wt_min, wt_max] range and null wire slots make ~Nuca() safe when sim_nuca() never ran
+  init_cont();
+}
+
+Nuca::Nuca(
+      TechnologyParameter::DeviceType *dt
+      ):deviceType(dt), wt_min(0), wt_max(-1), wire_vertical(), wire_horizontal()
+{ // empty [wt_min, wt_max] range and null wire slots make ~Nuca() safe when sim_nuca() never ran
+  init_cont();
+}
+
+void
+Nuca::init_cont()
+{
+  /* Fills cont_stats from "contention.dat" in the working directory; exits if it cannot be opened. */
+  char line[5000], jk[5000];
+  FILE *cont = fopen("contention.dat", "r");
+  if (!cont) {
+    cout << "contention.dat file is missing!\n";
+    exit(0);
+  }
+  /* one "<label>: v0 ... v7" record per (level, core index 2..4, router type, bank index) */
+  for(int i=0; i<2; i++) {
+    for(int j=2; j<5; j++) {
+      for(int k=0; k<ROUTER_TYPES; k++) {
+        for(int l=0;l<7; l++) {
+          int *temp = cont_stats[i/*l2 or l3*/][j/*core*/][k/*64 or 128 or 256 link bw*/][l /* no banks*/];
+          const int got = fscanf(cont, "%4999[^\n]\n", line); /* outside assert(): must also run under NDEBUG; width bounds line[] */
+          assert(got != EOF);
+          if (got == 1) sscanf(line, "%[^:]: %d %d %d %d %d %d %d %d",jk, &temp[0], &temp[1], &temp[2], &temp[3],
+              &temp[4], &temp[5], &temp[6], &temp[7]);
+        }
+      }
+    }
+  }
+  fclose(cont);
+}
+
+  void
+Nuca::print_cont_stats()
+{ // Dumps cont_stats to stdout: one line per (level, core index, router type, bank index).
+  for(int i=0; i<2; i++) {
+    for(int j=2; j<5; j++) { // core indices 2..4 are the ones init_cont() fills
+      for(int k=0; k<ROUTER_TYPES; k++) {
+        for(int l=0;l<7; l++) {
+          for(int m=0;m<8; m++) { // all 8 cycle-time columns; the loop used to test and bump l instead of m
+            cout << cont_stats[i][j][k][l][m] << " ";
+          }
+          cout << endl;
+        }
+      }
+    }
+  }
+  cout << endl;
+}
+
+Nuca::~Nuca(){ // frees the Wire held in each table slot for wire types wt_min..wt_max
+  for (int wire_type = wt_min; !(wire_type > wt_max); ++wire_type) {
+    delete wire_vertical[wire_type];
+    delete wire_horizontal[wire_type];
+  }
+}
+
+/* Number of clock cycles needed to cover latency `lat` (in s) at `oper_freq` (in GHz). */
+  int
+Nuca::calc_cycles(double lat, double oper_freq)
+{
+  // Every period gives up LATCH_DELAY and FIXED_OVERHEAD; only the remainder is usable.
+  // TODO: convert latch delay to FO4
+  const double period = (1.0/(oper_freq*1e9)); /*s*/
+  const double usable_period = period - LATCH_DELAY - FIXED_OVERHEAD;
+
+  return (int)ceil(lat/usable_period); // round up: a partly used cycle costs a whole one
+}
+
+
+nuca_org_t::~nuca_org_t() { // intentionally frees nothing; the deletes below are disabled
+  // if(h_wire) delete h_wire;
+  // if(v_wire) delete v_wire;
+  // if(router) delete router;
+} // ~Nuca() deletes the wire table entries and sim_nuca() deletes the routers it created
+
+/*
+ * Version - 6.0
+ *
+ * Perform exhaustive search across different bank organizations,
+ * router configurations, grid organizations, and wire models and
+ * find an optimal NUCA organization
+ * For different bank count values
+ * 1. Optimal bank organization is calculated
+ * 2. For each bank organization, find different NUCA organizations
+ *    using various router configurations, grid organizations,
+ *    and wire models.
+ * 3. NUCA model with the least cost is picked for
+ *    this particular bank count
+ * Finally include contention statistics and find the optimal
+ *    NUCA configuration
+ */
+  void
+Nuca::sim_nuca()
+{
+  /* Searches bank counts x wire types x router types x grid shapes, then prints the best organization. */
+  int it, ro, wr;
+  int num_cyc;
+  unsigned int i, j;
+  unsigned int r, c;
+  int l2_c;
+  int bank_count = 0;
+  uca_org_t ures;
+  nuca_org_t *opt_n;
+  mem_array tag, data;
+  list<nuca_org_t *> nuca_list;
+  MCPAT_Router *router_s[ROUTER_TYPES];
+  router_s[0] = new MCPAT_Router(64.0, 8, 4, &(g_tp.peri_global));
+  router_s[0]->print_router();
+  router_s[1] = new MCPAT_Router(128.0, 8, 4, &(g_tp.peri_global));
+  router_s[1]->print_router();
+  router_s[2] = new MCPAT_Router(256.0, 8, 4, &(g_tp.peri_global));
+  router_s[2]->print_router();
+
+  int core_in; // to store no. of cores
+
+  /* to search diff grid organizations */
+  double curr_hop, totno_hops, totno_hhops, totno_vhops, tot_lat,
+         curr_acclat;
+  double avg_lat, avg_hop, avg_hhop, avg_vhop, avg_dyn_power,
+         avg_leakage_power;
+
+  double opt_acclat = INF;
+  int opt_rows = 0;
+  int opt_columns = 0;
+  double opt_avg_hop = 0;
+  double opt_dyn_power = 0, opt_leakage_power = 0;
+  min_values_t minval;
+
+  int bank_start = 0;
+
+  int flit_width = 0;
+
+  /* vertical and horizontal hop latency values */
+  int ver_hop_lat, hor_hop_lat; /* in cycles */
+
+
+  /* no. of different bank sizes to consider */
+  int iterations;
+
+  /* keep the full cache size: g_ip->cache_sz is reused below as the per-bank size */
+  g_ip->nuca_cache_sz = g_ip->cache_sz;
+  nuca_list.push_back(new nuca_org_t());
+  /* first cont_stats index: 1 when cache_level is 0, otherwise 0 */
+  if (g_ip->cache_level == 0) l2_c = 1;
+  else l2_c = 0;
+  /* second cont_stats index: core-count bucket 2, 3 or 4 */
+  if (g_ip->cores <= 4) core_in = 2;
+  else if (g_ip->cores <= 8) core_in = 3;
+  else if (g_ip->cores <= 16) core_in = 4;
+  else {cout << "Number of cores should be <= 16!\n"; exit(0);}
+
+  // NOTE(review): MIN_BANKSIZE is a global, so the scaling below accumulates across calls
+  // set the lower bound to an appropriate value. this depends on cache associativity
+  if (g_ip->assoc > 2) {
+    i = 2;
+    while (i < g_ip->assoc) { // '<' (not '!=') so a non-power-of-two associativity still terminates
+      MIN_BANKSIZE *= 2;
+      i *= 2;
+    }
+  }
+
+  iterations = (int)logtwo((int)g_ip->cache_sz/MIN_BANKSIZE);
+
+  if (g_ip->force_wiretype)
+  {
+    if (g_ip->wt == Low_swing) {
+      wt_min = Low_swing;
+      wt_max = Low_swing;
+    }
+    else {
+      wt_min = Global;
+      wt_max = Low_swing-1;
+    }
+  }
+  else {
+    wt_min = Global;
+    wt_max = Low_swing;
+  }
+  if (g_ip->nuca_bank_count != 0) { // simulate just one bank
+    if (g_ip->nuca_bank_count != 2 && g_ip->nuca_bank_count != 4 &&
+        g_ip->nuca_bank_count != 8 && g_ip->nuca_bank_count != 16 &&
+        g_ip->nuca_bank_count != 32 && g_ip->nuca_bank_count != 64) {
+      fprintf(stderr,"Incorrect bank count value! Please fix the value in cache.cfg\n");
+    } // NOTE(review): reported but not fatal; the run continues with the given count
+    bank_start = (int)logtwo((double)g_ip->nuca_bank_count);
+    iterations = bank_start+1;
+    g_ip->cache_sz = g_ip->cache_sz/g_ip->nuca_bank_count;
+  }
+  cout << "Simulating various NUCA configurations\n";
+  for (it=bank_start; it<iterations; it++) { /* different bank count values */
+    ures.tag_array2 = &tag;
+    ures.data_array2 = &data;
+    /*
+     * find the optimal bank organization
+     */
+    solve(&ures);
+//    output_UCA(&ures);
+    bank_count = g_ip->nuca_cache_sz/g_ip->cache_sz;
+    cout << "====" <<  g_ip->cache_sz << "\n";
+
+    for (wr=wt_min; wr<=wt_max; wr++) {
+
+      for (ro=0; ro<ROUTER_TYPES; ro++)
+      {
+        flit_width = (int) router_s[ro]->flit_size; //initialize router
+        nuca_list.back()->nuca_pda.cycle_time = router_s[ro]->cycle_time;
+
+        /* calculate router and wire parameters */
+
+        double vlength = ures.cache_ht; /* length of the wire (u)*/
+        double hlength = ures.cache_len; // u
+
+        /* find delay, area, and power for wires */
+        wire_vertical[wr] = new Wire((enum Wire_type) wr, vlength);
+        wire_horizontal[wr] = new Wire((enum Wire_type) wr, hlength);
+        /* NOTE(review): a Wire already stored in these slots is not freed here, because */
+        /* earlier nuca_list entries may still point at it through h_wire / v_wire.      */
+        hor_hop_lat = calc_cycles(wire_horizontal[wr]->delay,
+            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
+        ver_hop_lat = calc_cycles(wire_vertical[wr]->delay,
+            1/(nuca_list.back()->nuca_pda.cycle_time*.001));
+
+        /*
+         * assume a grid like topology and explore for optimal network
+         * configuration using different row and column count values.
+         */
+        for (c=1; c<=(unsigned int)bank_count; c++) {
+          while (bank_count%c != 0) c++;
+          r = bank_count/c;
+
+          /*
+           * to find the avg access latency of a NUCA cache, uncontended
+           * access time to each bank from the
+           * cache controller is calculated.
+           * avg latency =
+           * sum of the access latencies to individual banks)/bank
+           * count value.
+           */
+          totno_hops = totno_hhops = totno_vhops = tot_lat = 0;
+          /* walk the r x c grid, accumulating hop counts and uncontended latency */
+          for (i=0; i<r; i++) {
+            for (j=0; j<c; j++) {
+              /*
+               * vertical hops including the
+               * first hop from the cache controller
+               */
+              curr_hop = i + 1;
+              curr_hop += j; /* horizontal hops */
+              totno_hhops += j;
+              totno_vhops += (i+1);
+              curr_acclat = (i * ver_hop_lat + CONTR_2_BANK_LAT +
+                  j * hor_hop_lat);
+
+              tot_lat += curr_acclat;
+              totno_hops += curr_hop;
+            }
+          }
+          avg_lat = tot_lat/bank_count;
+          avg_hop = totno_hops/bank_count;
+          avg_hhop = totno_hhops/bank_count;
+          avg_vhop = totno_vhops/bank_count;
+
+          /* net access latency */
+          curr_acclat = 2*avg_lat + 2*(router_s[ro]->delay*avg_hop) +
+            calc_cycles(ures.access_time,
+                1/(nuca_list.back()->nuca_pda.cycle_time*.001));
+
+          /* avg access lat of nuca */
+          avg_dyn_power =
+            avg_hop *
+            (router_s[ro]->power.readOp.dynamic) + avg_hhop *
+            (wire_horizontal[wr]->power.readOp.dynamic) *
+            (g_ip->block_sz*8 + 64) + avg_vhop *
+            (wire_vertical[wr]->power.readOp.dynamic) *
+            (g_ip->block_sz*8 + 64) + ures.power.readOp.dynamic;
+
+          avg_leakage_power =
+            bank_count * router_s[ro]->power.readOp.leakage +
+            avg_hhop * (wire_horizontal[wr]->power.readOp.leakage*
+                wire_horizontal[wr]->delay) * flit_width +
+            avg_vhop * (wire_vertical[wr]->power.readOp.leakage *
+                wire_horizontal[wr]->delay); // NOTE(review): vertical term uses the horizontal delay and no flit_width - confirm
+
+          if (curr_acclat < opt_acclat) {
+            opt_acclat = curr_acclat;
+            opt_avg_hop = avg_hop;
+            opt_rows = r;
+            opt_columns = c;
+            opt_dyn_power = avg_dyn_power;
+            opt_leakage_power = avg_leakage_power;
+          }
+          totno_hops = 0;
+          tot_lat = 0;
+          totno_hhops = 0;
+          totno_vhops = 0;
+        }
+        nuca_list.back()->wire_pda.power.readOp.dynamic =
+          opt_avg_hop * flit_width *
+          (wire_horizontal[wr]->power.readOp.dynamic +
+           wire_vertical[wr]->power.readOp.dynamic);
+        nuca_list.back()->avg_hops = opt_avg_hop;
+        /* network delay/power */
+        nuca_list.back()->h_wire = wire_horizontal[wr];
+        nuca_list.back()->v_wire = wire_vertical[wr];
+        nuca_list.back()->router = router_s[ro];
+        /* bank delay/power */
+
+        nuca_list.back()->bank_pda.delay = ures.access_time;
+        nuca_list.back()->bank_pda.power = ures.power;
+        nuca_list.back()->bank_pda.area.h = ures.cache_ht;
+        nuca_list.back()->bank_pda.area.w = ures.cache_len;
+        nuca_list.back()->bank_pda.cycle_time = ures.cycle_time;
+
+        num_cyc = calc_cycles(nuca_list.back()->bank_pda.delay /*s*/,
+            1/(nuca_list.back()->nuca_pda.cycle_time*.001/*GHz*/));
+        if(num_cyc%2 != 0) num_cyc++;
+        if (num_cyc > 16) num_cyc = 16; // we have data only up to 16 cycles
+        if (num_cyc < 2) num_cyc = 2; // keep the column index num_cyc/2-1 inside 0..7
+        if (it < 7) {
+          nuca_list.back()->nuca_pda.delay = opt_acclat +
+            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
+          nuca_list.back()->contention =
+            cont_stats[l2_c][core_in][ro][it][num_cyc/2-1];
+        }
+        else { // cont_stats has bank entries 0..6 only; reuse the last one (index 7 ran past the table)
+          nuca_list.back()->nuca_pda.delay = opt_acclat +
+            cont_stats[l2_c][core_in][ro][6][num_cyc/2-1];
+          nuca_list.back()->contention =
+            cont_stats[l2_c][core_in][ro][6][num_cyc/2-1];
+        }
+        nuca_list.back()->nuca_pda.power.readOp.dynamic = opt_dyn_power;
+        nuca_list.back()->nuca_pda.power.readOp.leakage = opt_leakage_power;
+
+        /* array organization */
+        nuca_list.back()->bank_count = bank_count;
+        nuca_list.back()->rows = opt_rows;
+        nuca_list.back()->columns = opt_columns;
+        calculate_nuca_area (nuca_list.back());
+
+        minval.update_min_values(nuca_list.back());
+        nuca_list.push_back(new nuca_org_t());
+        opt_acclat = BIGNUM;
+
+      }
+    }
+    g_ip->cache_sz /= 2;
+  }
+
+  delete(nuca_list.back());
+  nuca_list.pop_back();
+  opt_n = find_optimal_nuca(&nuca_list, &minval);
+  if (opt_n == NULL) {cout << "No NUCA organization satisfies the given constraints!\n"; exit(1);}
+  print_nuca(opt_n);
+  g_ip->cache_sz = g_ip->nuca_cache_sz/opt_n->bank_count;
+  list<nuca_org_t *>::iterator niter;
+  for (niter = nuca_list.begin(); niter != nuca_list.end(); ++niter)
+  {
+    delete *niter;
+  }
+  nuca_list.clear();
+
+  for(int i=0; i < ROUTER_TYPES; i++)
+  {
+    delete router_s[i];
+  }
+  g_ip->display_ip();
+  //  g_ip->force_cache_config = true;
+  //  g_ip->ndwl = 8;
+  //  g_ip->ndbl = 16;
+  //  g_ip->nspd = 4;
+  //  g_ip->ndcm = 1;
+  //  g_ip->ndsam1 = 8;
+  //  g_ip->ndsam2 = 32;
+
+}
+
+
+  void
+Nuca::print_nuca (nuca_org_t *fr)
+{
+  /* Prints the given organization: grid, network clock, dimensions, router and link stats. */
+  Wire *hlink = fr->h_wire;
+  Wire *vlink = fr->v_wire;
+
+  printf("\n---------- CACTI version 6.5, Non-uniform Cache Access ----------\n\n");
+  printf("Optimal number of banks - %d\n", fr->bank_count);
+  printf("Grid organization rows x columns - %d x %d\n",
+      fr->rows, fr->columns);
+  printf("Network frequency - %g GHz\n", (1/fr->nuca_pda.cycle_time)*1e3);
+  printf("Cache dimension (mm x mm) - %g x %g\n",
+      fr->nuca_pda.area.h, fr->nuca_pda.area.w);
+
+  fr->router->print_router();
+
+  printf("\n\nWire stats:\n");
+  /* name the wire model of the horizontal link; any other type prints no line */
+  switch (hlink->wt) {
+    case Global:
+      printf("\tWire type - Full swing global wires with least possible delay\n");
+      break;
+    case Global_5:
+      printf("\tWire type - Full swing global wires with 5%% delay penalty\n");
+      break;
+    case Global_10:
+      printf("\tWire type - Full swing global wires with 10%% delay penalty\n");
+      break;
+    case Global_20:
+      printf("\tWire type - Full swing global wires with 20%% delay penalty\n");
+      break;
+    case Global_30:
+      printf("\tWire type - Full swing global wires with 30%% delay penalty\n");
+      break;
+    case Low_swing:
+      printf("\tWire type - Low swing wires\n");
+      break;
+    default:
+      break;
+  }
+
+  printf("\tHorizontal link delay - %g (ns)\n", hlink->delay*1e9);
+  printf("\tVertical link delay - %g (ns)\n", vlink->delay*1e9);
+  printf("\tDelay/length - %g (ns/mm)\n",
+      hlink->delay*1e9/fr->bank_pda.area.w);
+
+  printf("\tHorizontal link energy -dynamic/access %g (nJ)\n"
+      "\t                       -leakage %g (nW)\n\n",
+      hlink->power.readOp.dynamic*1e9,
+      hlink->power.readOp.leakage*1e9);
+  printf("\tVertical link energy -dynamic/access %g (nJ)\n"
+      "\t                     -leakage %g (nW)\n\n",
+      vlink->power.readOp.dynamic*1e9,
+      vlink->power.readOp.leakage*1e9);
+  printf("\n\n");
+  vlink->print_wire();
+  printf("\n\nBank stats:\n");
+}
+
+
+  nuca_org_t *
+Nuca::find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval)
+{
+  double cost = 0;
+  double min_cost = BIGNUM;
+  nuca_org_t *res = NULL;
+  float d, a, dp, lp, c;
+  int v;
+  dp = g_ip->dynamic_power_wt_nuca;
+  lp = g_ip->leakage_power_wt_nuca;
+  a = g_ip->area_wt_nuca;
+  d = g_ip->delay_wt_nuca;
+  c = g_ip->cycle_time_wt_nuca;
+  /* Returns the lowest-cost entry of *n, or NULL when the list is empty or nothing qualifies. */
+  list<nuca_org_t *>::iterator niter;
+  /* The iterator advances at the bottom of the body so the erase path can skip that step. */
+  for (niter = n->begin(); niter != n->end(); ) {
+    fprintf(stderr, "\n-----------------------------"
+        "---------------\n");
+
+
+    printf("NUCA___stats %d \tbankcount: lat = %g \tdynP = %g \twt = %d\t "
+        "bank_dpower = %g \tleak = %g \tcycle = %g\n",
+        (*niter)->bank_count,
+        (*niter)->nuca_pda.delay,
+        (*niter)->nuca_pda.power.readOp.dynamic,
+        (*niter)->h_wire->wt,
+        (*niter)->bank_pda.power.readOp.dynamic,
+        (*niter)->nuca_pda.power.readOp.leakage,
+        (*niter)->nuca_pda.cycle_time);
+
+    /* g_ip->ed selects the metric: 1 = delay x energy, 2 = delay^2 x energy, else weighted sum */
+    if (g_ip->ed == 1) {
+      cost = ((*niter)->nuca_pda.delay/minval->min_delay)*
+        ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn);
+      if (min_cost > cost) {
+        min_cost = cost;
+        res = ((*niter));
+      }
+    }
+    else if (g_ip->ed == 2) {
+      cost = ((*niter)->nuca_pda.delay/minval->min_delay)*
+        ((*niter)->nuca_pda.delay/minval->min_delay)*
+        ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn);
+      if (min_cost > cost) {
+        min_cost = cost;
+        res = ((*niter));
+      }
+    }
+    else {
+      /*
+       * check whether the current organization
+       * meets the input deviation constraints
+       */
+      if (minval->min_leakage == 0) minval->min_leakage = 0.1; //FIXME remove this after leakage modeling
+      v = check_nuca_org((*niter), minval); // runs after the fix-up above, which it divides by
+
+      if (v) {
+        cost = (d  * ((*niter)->nuca_pda.delay/minval->min_delay) +
+            c  * ((*niter)->nuca_pda.cycle_time/minval->min_cyc) +
+            dp * ((*niter)->nuca_pda.power.readOp.dynamic/minval->min_dyn) +
+            lp * ((*niter)->nuca_pda.power.readOp.leakage/minval->min_leakage) +
+            a  * ((*niter)->nuca_pda.area.get_area()/minval->min_area));
+        fprintf(stderr, "cost = %g\n", cost);
+
+        if (min_cost > cost) {
+          min_cost = cost;
+          res = ((*niter));
+        }
+      }
+      else {
+        delete *niter; // a rejected entry leaves the list for good, so free it here
+        niter = n->erase(niter); // erase() already returns the next candidate
+        continue; // no extra step: none is skipped and end() is never incremented
+      }
+    }
+    ++niter;
+  }
+  return res;
+}
+
+  int
+Nuca::check_nuca_org (nuca_org_t *n, min_values_t *minval)
+{
+  /*
+   * Returns 1 when delay, dynamic power, leakage power, cycle time and area
+   * are each within the configured percentage deviation (g_ip->*_dev_nuca)
+   * of the matching minimum in minval, and 0 as soon as one of them is not.
+   */
+  Component &stats = n->nuca_pda;
+  const bool acceptable =
+    !(((stats.delay - minval->min_delay)*100/minval->min_delay) >
+        g_ip->delay_dev_nuca) &&
+    !(((stats.power.readOp.dynamic - minval->min_dyn)/minval->min_dyn)*100 >
+        g_ip->dynamic_power_dev_nuca) &&
+    !(((stats.power.readOp.leakage - minval->min_leakage)/minval->min_leakage)*100 >
+        g_ip->leakage_power_dev_nuca) &&
+    !(((stats.cycle_time - minval->min_cyc)/minval->min_cyc)*100 >
+        g_ip->cycle_time_dev_nuca) &&
+    !(((stats.area.get_area() - minval->min_area)/minval->min_area)*100 >
+        g_ip->area_dev_nuca);
+
+  /* && short-circuits, so the checks stop at the first violated limit */
+  return acceptable ? 1 : 0;
+}
+
+  void
+Nuca::calculate_nuca_area (nuca_org_t *nuca)
+{
+  /* Per grid cell: flit_size wire tracks (width + spacing each) plus the bank dimension. */
+  Wire *row_link = nuca->h_wire;
+  Wire *col_link = nuca->v_wire;
+  MCPAT_Router *rtr = nuca->router;
+
+  /* total height = rows x cell height, total width = columns x cell width */
+  nuca->nuca_pda.area.h = nuca->rows *
+    ((row_link->wire_width + row_link->wire_spacing) * rtr->flit_size + nuca->bank_pda.area.h);
+
+  nuca->nuca_pda.area.w = nuca->columns *
+    ((col_link->wire_width + col_link->wire_spacing) * rtr->flit_size + nuca->bank_pda.area.w);
+}
+
diff --git a/src/gpuwattch/cacti/nuca.h b/src/gpuwattch/cacti/nuca.h
new file mode 100644
index 000000000..bbdee7d49
--- /dev/null
+++ b/src/gpuwattch/cacti/nuca.h
@@ -0,0 +1,102 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+
+#ifndef __NUCA_H__
+#define __NUCA_H__
+
+#include "basic_circuit.h"
+#include "component.h"
+#include "parameter.h"
+#include "assert.h"
+#include "cacti_interface.h"
+#include "wire.h"
+#include "mat.h"
+#include "io.h"
+#include "router.h"
+#include <iostream>
+
+
+
+class nuca_org_t { // one candidate NUCA configuration evaluated by Nuca::sim_nuca(), with its figures
+  public:
+  ~nuca_org_t(); // releases nothing; the pointers below are not freed by this object
+//    int size;
+    /* area, power, access time, and cycle time stats */
+    Component nuca_pda; // whole cache: delay (incl. contention), power, area, network cycle time
+    Component bank_pda; // one bank: delay, power, dimensions and cycle time of the bank solution
+    Component wire_pda; // link figures; sim_nuca() sets only power.readOp.dynamic
+    Wire *h_wire; // horizontal link model used for this candidate
+    Wire *v_wire; // vertical link model used for this candidate
+    MCPAT_Router *router; // router model; sim_nuca() deletes the routers it created before returning
+    /* for particular network configuration
+     * calculated based on a cycle accurate
+     * simulation Ref: CACTI 6 - Tech report
+     */
+    double contention; // value sim_nuca() looks up in the cont_stats table
+
+    /* grid network stats */
+    double avg_hops; // average hop count of the grid shape with the lowest access latency
+    int rows; // rows of that grid shape
+    int columns; // columns of that grid shape
+    int bank_count; // number of banks in this candidate
+};
+
+
+
+class Nuca : public Component // explores NUCA organizations and reports the best one
+{
+  public:
+    Nuca(); // device type defaults to g_tp.peri_global; loads contention.dat via init_cont()
+    Nuca(
+        TechnologyParameter::DeviceType *dt); // same, with a caller-supplied device type
+    void print_router(); // NOTE(review): declared here, but nuca.cc contains no definition
+    ~Nuca(); // deletes the wire table entries for wire types wt_min..wt_max
+    void sim_nuca(); // runs the search, prints the chosen organization, updates g_ip->cache_sz
+    void init_cont(); // reads contention.dat into cont_stats; exits if the file cannot be opened
+    int calc_cycles(double lat, double oper_freq); // latency (s) at frequency (GHz) -> whole cycles
+    void calculate_nuca_area (nuca_org_t *nuca); // fills nuca->nuca_pda.area from the grid shape
+    int check_nuca_org (nuca_org_t *n, min_values_t *minval); // 1 if within deviation limits, else 0
+    nuca_org_t * find_optimal_nuca (list<nuca_org_t *> *n, min_values_t *minval); // lowest-cost entry or NULL; may erase entries from *n
+    void print_nuca(nuca_org_t *n); // prints grid, router and wire statistics of *n
+    void print_cont_stats(); // dumps the cont_stats table to stdout
+
+  private:
+
+    TechnologyParameter::DeviceType *deviceType;
+    int wt_min, wt_max; // inclusive range of wire types, chosen in sim_nuca()
+    Wire *wire_vertical[WIRE_TYPES],
+         *wire_horizontal[WIRE_TYPES]; // indexed by wire type; entries are allocated in sim_nuca()
+
+};
+
+
+#endif
diff --git a/src/gpuwattch/cacti/out_batch_test_result.csv b/src/gpuwattch/cacti/out_batch_test_result.csv
new file mode 100644
index 000000000..74cd5f2f4
--- /dev/null
+++ b/src/gpuwattch/cacti/out_batch_test_result.csv
@@ -0,0 +1,37 @@
+Tech node (nm), Capacity (bytes), Number of banks, Associativity, Output width (bits), Access time (ns), Random cycle time (ns), Multisubbank interleave cycle time (ns), Delay request network (ns), Delay inside mat (ns), Delay reply network (ns), Tag array access time (ns), Refresh period (microsec), DRAM array availability (%), Dynamic read energy (nJ), Dynamic write energy (nJ), Dynamic read power (mW), Standby leakage per bank(mW), Leakage per bank with leak power management (mW), Refresh power as percentage of standby leakage, Area (mm2), Ndwl, Ndbl, Nspd, Ndcm, Ndsam_level_1, Ndsam_level_2, Ntwl, Ntbl, Ntspd, Ntcm, Ntsam_level_1, Ntsam_level_2, Area efficiency, Resistance per unit micron (ohm-micron), Capacitance per unit micron (fF per micron), Unit-length wire delay (ps), FO4 delay (ps), delay route to bank (including crossb delay) (ps), Crossbar delay (ps), Dyn read energy per access from closed page (nJ), Dyn read energy per access from open page (nJ), Leak power of an subbank with page closed (mW), Leak power of a subbank with page  open (mW), Leak power of request and reply networks (mW), Number of subbanks, Page size in bits, Activate power, Read power, Write power, Precharge power, tRCD, CAS latency, Precharge delay, Perc dyn energy bitlines, perc dyn energy wordlines, perc dyn energy outside mat, Area opt (perc), Delay opt (perc), Repeater opt (perc), Aspect ratio
+65, 8192, 1, 1, 512, 0.558019, 0.463081, 0.218592, 0.218592, 0.205916, 0.133511, 0, 0, 0, 0.0492188, 0.0501686, 106.285, 12.9322, 12.9322, 0, 0.437143, 2, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 9.24775, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.0492188, 0.0392399, 1.11519, 1.72608, 0, 2, 1152, 0.00415197, 0.0389469, 0.0325982, 0.00602088, 0, 0, 0, 5.23889, 1.21375, 65.8707, 50, 10, 10, 1.33868
+65, 16384, 1, 1, 512, 0.589623, 0.471871, 0.230131, 0.230131, 0.212877, 0.146614, 0, 0, 0, 0.0578158, 0.0635782, 122.525, 24.7671, 24.7671, 0, 0.508203, 2, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 15.9093, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.0578158, 0.0457606, 2.10185, 2.71275, 0, 2, 1152, 0.00429685, 0.0453733, 0.0356522, 0.0079953, 0, 0, 0, 7.87491, 1.03327, 61.5093, 50, 10, 10, 1.55217
+65, 32768, 1, 1, 512, 0.698149, 0.524383, 0.271635, 0.271635, 0.261592, 0.164922, 0, 0, 0, 0.0815853, 0.0969731, 155.583, 52.1796, 52.1796, 0, 0.664852, 2, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 24.3217, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.0815853, 0.0654673, 4.00333, 4.61423, 0, 2, 1152, 0.00444395, 0.0649402, 0.0423799, 0.0119442, 0, 0, 0, 10.4207, 0.732229, 52.01, 50, 10, 10, 2.0278
+65, 65536, 1, 1, 512, 0.885893, 0.524383, 0.365507, 0.365507, 0.261592, 0.258794, 0, 0, 0, 0.224312, 0.2397, 427.763, 120.707, 120.707, 0, 1.67509, 4, 4, 2, 1, 1, 2, 0, 0, 0, 0, 0, 0, 19.3068, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.224312, 0.208194, 4.00333, 4.61423, 16.3477, 4, 1152, 0.00543017, 0.146233, 0.123673, 0.0119442, 0, 0, 0, 3.79017, 0.266322, 82.5454, 50, 10, 10, 1.27726
+65, 131072, 1, 1, 512, 1.03673, 0.524383, 0.44429, 0.44429, 0.261592, 0.330853, 0, 0, 0, 0.214402, 0.218029, 408.866, 229.434, 229.434, 0, 2.53868, 4, 8, 2, 1, 1, 4, 0, 0, 0, 0, 0, 0, 25.4784, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.214402, 0.182166, 8.03521, 9.257, 31.7514, 4, 2304, 0.00962336, 0.148944, 0.122603, 0.0238883, 0, 0, 0, 7.9307, 0.557262, 73.9501, 50, 10, 10, 1.50017
+65, 262144, 1, 1, 512, 1.25884, 0.632833, 0.518139, 0.518139, 0.361894, 0.378806, 0, 0, 0, 0.284046, 0.299026, 448.849, 417.712, 417.712, 0, 3.92425, 4, 8, 2, 1, 1, 4, 0, 0, 0, 0, 0, 0, 32.965, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.284046, 0.235649, 15.4589, 16.6807, 31.7867, 4, 2304, 0.0101595, 0.201021, 0.159149, 0.0396837, 0, 0, 0, 11.5471, 0.42063, 69.1816, 50, 10, 10, 2.31775
+65, 524288, 1, 1, 512, 1.64314, 0.524383, 0.755022, 0.755022, 0.261592, 0.626529, 0, 0, 0, 0.437934, 0.41804, 835.141, 857.421, 857.421, 0, 7.85585, 8, 16, 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 32.9341, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.437934, 0.373461, 16.1849, 18.6285, 88.0631, 8, 4608, 0.0204885, 0.255931, 0.225422, 0.0477766, 0, 0, 0, 7.76538, 0.545646, 79.6147, 50, 10, 10, 1.29167
+65, 1048576, 1, 1, 512, 2.17546, 0.503511, 0.9816, 0.9816, 0.333412, 0.860452, 0, 0, 0, 0.646093, 0.621756, 1283.18, 1723.58, 1723.58, 0, 13.3102, 16, 16, 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 38.8762, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.646093, 0.546134, 42.7209, 45.1644, 115.538, 8, 4608, 0.0265051, 0.354566, 0.30764, 0.0802094, 0, 0, 0, 10.153, 0.419326, 78.2381, 50, 10, 10, 2.02155
+65, 2097152, 1, 1, 512, 2.85989, 0.503511, 1.32702, 1.32702, 0.333412, 1.19946, 0, 0, 0, 1.31931, 1.29497, 2620.22, 3495.59, 3495.59, 0, 28.1831, 32, 16, 8, 1, 2, 4, 0, 0, 0, 0, 0, 0, 36.7207, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 1.31931, 1.21935, 42.6061, 45.0497, 281.397, 16, 4608, 0.0361886, 0.446887, 0.400544, 0.0802094, 0, 0, 0, 4.97216, 0.205353, 89.2367, 50, 10, 10, 1.07059
+65, 4194304, 1, 1, 512, 3.77332, 0.395548, 1.80718, 1.80718, 0.233597, 1.73254, 0, 0, 0, 1.36604, 1.29911, 3453.55, 6977.29, 6977.29, 0, 52.0054, 32, 64, 8, 1, 1, 16, 0, 0, 0, 0, 0, 0, 39.7998, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 1.36604, 1.23223, 43.9657, 48.8529, 532.413, 32, 9216, 0.0511602, 0.743816, 0.703198, 0.0972372, 0, 0, 0, 4.97893, 0.396654, 88.1687, 50, 10, 10, 2.0164
+65, 8388608, 1, 1, 512, 5.05308, 0.395548, 2.44613, 2.44613, 0.233597, 2.37336, 0, 0, 0, 2.19274, 2.03172, 5543.56, 13468.7, 13468.7, 0, 90.5121, 64, 64, 16, 1, 2, 16, 0, 0, 0, 0, 0, 0, 45.7354, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 2.19274, 1.92512, 88.371, 98.1454, 653.911, 32, 18432, 0.107869, 0.869254, 0.828441, 0.194474, 0, 0, 0, 6.20359, 0.494218, 86.0504, 50, 10, 10, 1.15423
+65, 16777216, 1, 1, 512, 6.82626, 0.347216, 3.31477, 3.31477, 0.224437, 3.28706, 0, 0, 0, 3.92609, 3.79134, 11307.3, 29075.4, 29075.4, 0, 194.454, 128, 128, 16, 2, 16, 1, 0, 0, 0, 0, 0, 0, 42.5768, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 3.92609, 3.66618, 114.787, 119.674, 1689.35, 64, 9216, 0.10696, 1.86202, 1.81712, 0.20086, 0, 0, 0, 3.45584, 0.318228, 92.2076, 50, 10, 10, 1.90849
+65, 8192, 1, 1, 512, 1.2206, 0.86108, 0.362869, 0.362869, 0.561969, 0.295758, 0, 137.755, 99.98, 0.132488, 0.131928, 153.862, 36.3705, 36.3705, 0.000966049, 0.339278, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 2.12479, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.132488, 0.114294, 0.804878, 1.41577, 33.2187, 2, 1152, 0.00851843, 0.0309598, 0.0264043, 0.012956, 0, 0, 0, 5.74601, 0.739312, 82.295, 50, 10, 10, 1.57599
+65, 16384, 1, 1, 512, 1.32693, 1.04123, 0.38341, 0.38341, 0.639627, 0.303897, 0, 137.755, 99.9516, 0.141784, 0.141225, 136.17, 37.8873, 37.8873, 0.00281924, 0.351482, 32, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 4.10204, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.141784, 0.118158, 1.39436, 2.00525, 33.3247, 2, 1152, 0.00966313, 0.0338326, 0.0277497, 0.01796, 0, 0, 0, 8.89857, 0.690835, 78.4711, 50, 10, 10, 1.77746
+65, 32768, 1, 1, 512, 1.50254, 1.21307, 0.405655, 0.405655, 0.764488, 0.332397, 0, 137.755, 99.9436, 0.181437, 0.180318, 149.569, 39.3396, 39.3396, 0.00303174, 0.492011, 32, 4, 2, 1, 1, 4, 0, 0, 0, 0, 0, 0, 5.86081, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.181437, 0.136739, 1.9097, 3.13149, 33.4308, 2, 2304, 0.0143454, 0.0378775, 0.0312658, 0.0348023, 0, 0, 0, 13.9076, 0.973162, 71.2197, 50, 10, 10, 1.25825
+65, 65536, 1, 1, 512, 1.65774, 1.52867, 0.439614, 0.439614, 0.870227, 0.347895, 0, 137.755, 99.858, 0.212754, 0.211635, 139.176, 46.1763, 46.1763, 0.0107287, 0.590959, 32, 4, 2, 1, 1, 4, 0, 0, 0, 0, 0, 0, 9.759, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.212754, 0.146146, 4.98409, 6.20588, 33.5368, 2, 2304, 0.0169538, 0.0438991, 0.0343955, 0.0548184, 0, 0, 0, 21.2686, 0.829916, 63.8205, 50, 10, 10, 1.37708
+65, 131072, 1, 1, 512, 1.90476, 1.72275, 0.485739, 0.485739, 1.01444, 0.404582, 0, 137.755, 99.8399, 0.307846, 0.305608, 178.694, 49.0302, 49.0302, 0.0118726, 0.897741, 32, 4, 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 12.8482, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.307846, 0.181495, 6.30537, 8.74895, 33.6429, 2, 4608, 0.0262442, 0.051573, 0.0410116, 0.106269, 0, 0, 0, 29.3976, 1.02991, 55.171, 50, 10, 10, 0.910406
+65, 262144, 1, 1, 512, 2.01532, 1.72275, 0.539418, 0.539418, 1.01444, 0.46147, 0, 137.755, 99.8399, 0.353312, 0.351074, 205.086, 89.9678, 89.9678, 0.0129405, 1.60126, 32, 8, 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 14.4066, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.353312, 0.226962, 6.30537, 8.74895, 59.1932, 4, 4608, 0.0262442, 0.0906365, 0.0766835, 0.106269, 0, 0, 0, 25.6146, 0.897372, 60.9398, 50, 10, 10, 0.899619
+65, 524288, 1, 1, 512, 2.37224, 1.72275, 0.714788, 0.714788, 1.01444, 0.643012, 0, 137.755, 99.8399, 0.604898, 0.600422, 351.123, 131.787, 131.787, 0.0300533, 2.66594, 64, 8, 8, 1, 16, 1, 0, 0, 0, 0, 0, 0, 17.3062, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.604898, 0.352196, 12.4979, 17.3851, 71.4045, 4, 9216, 0.0536907, 0.105999, 0.0900704, 0.212538, 0, 0, 0, 29.9222, 1.04829, 55.7281, 50, 10, 10, 0.5318
+65, 1048576, 1, 1, 512, 2.638, 2.05102, 0.731091, 0.731091, 1.24824, 0.658674, 0, 137.755, 99.8094, 0.613326, 0.608851, 299.035, 188.643, 188.643, 0.0160231, 4.36241, 32, 16, 8, 1, 1, 16, 0, 0, 0, 0, 0, 0, 21.1522, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.613326, 0.367414, 8.22121, 13.1084, 111.079, 8, 9216, 0.0450162, 0.163339, 0.143879, 0.20917, 0, 0, 0, 29.511, 0.987413, 57.541, 50, 10, 10, 0.761285
+65, 2097152, 1, 1, 512, 2.86477, 2.05102, 0.857281, 0.857281, 1.24824, 0.759252, 0, 137.755, 99.8094, 0.714692, 0.710216, 348.457, 369.313, 369.313, 0.016369, 8.29269, 32, 32, 8, 1, 1, 16, 0, 0, 0, 0, 0, 0, 22.2545, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 0.714692, 0.468779, 8.22121, 13.1084, 214.185, 16, 9216, 0.0450162, 0.250113, 0.227261, 0.20917, 0, 0, 0, 25.3254, 0.847367, 63.563, 50, 10, 10, 1.10531
+65, 4194304, 1, 1, 512, 3.41675, 1.72275, 1.23308, 1.23308, 1.01444, 1.16923, 0, 137.755, 99.8399, 1.41491, 1.40595, 821.305, 810.148, 810.148, 0.0713446, 15.2612, 128, 32, 16, 1, 1, 32, 0, 0, 0, 0, 0, 0, 24.1854, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 1.41491, 0.909502, 24.9594, 34.7338, 310.328, 16, 18432, 0.109872, 0.335555, 0.308471, 0.425076, 0, 0, 0, 25.5846, 0.896322, 62.7744, 50, 10, 10, 0.596778
+65, 8388608, 1, 1, 512, 3.88406, 1.72275, 1.46478, 1.46478, 1.01444, 1.40484, 0, 137.755, 99.8399, 1.7501, 1.74115, 1015.87, 1609.73, 1609.73, 0.0718128, 29.9355, 128, 64, 16, 1, 1, 32, 0, 0, 0, 0, 0, 0, 24.6596, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 1.7501, 1.2447, 24.9594, 34.7338, 610.093, 32, 18432, 0.109872, 0.560444, 0.529084, 0.425076, 0, 0, 0, 20.6844, 0.72465, 69.9042, 50, 10, 10, 0.969704
+65, 16777216, 1, 1, 512, 5.1321, 1.71929, 2.0507, 2.0507, 1.12637, 1.95502, 0, 137.755, 99.9201, 2.4429, 2.42499, 1420.87, 2572.69, 2572.69, 0.0236893, 57.6906, 128, 128, 32, 1, 4, 16, 0, 0, 0, 0, 0, 0, 25.5916, 0.734018, 0.282409, 0.000103647, 14.8166, 0, 0, 2.4429, 1.78708, 17.9291, 37.4777, 1200.23, 64, 36864, 0.176127, 0.751377, 0.724147, 0.516423, 0, 0, 0, 16.527, 0.99162, 72.1163, 50, 10, 10, 0.739707
+65, 8192, 1, 1, 512, 2.68545, 1.91918, 0.985318, 0.985318, 0.874516, 0.825619, 0, 64000, 99.9999, 0.123888, 0.122142, 64.5528, 0.00501254, 0.00501254, 0.00512143, 0.216944, 8, 4, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0.76579, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 0.123888, 0.0861485, 0.000138175, 0.000310399, 0.00445485, 2, 1152, 0.0155969, 0.0410981, 0.0343889, 0.0231759, 0, 0, 0, 14.376, 2.33943, 63.4377, 50, 10, 10, 2.5393
+65, 16384, 1, 1, 512, 2.94812, 2.41333, 1.0064, 1.0064, 1.13271, 0.809008, 0, 64000, 99.9999, 0.168491, 0.164999, 69.817, 0.00516196, 0.00516196, 0.0089706, 0.267823, 8, 4, 2, 1, 4, 1, 0, 0, 0, 0, 0, 0, 1.24062, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 0.168491, 0.094572, 0.000199345, 0.000543793, 0.0044606, 2, 2304, 0.0299573, 0.0449509, 0.0374717, 0.0451113, 0, 0, 0, 21.1407, 3.38716, 51.1626, 50, 10, 10, 1.9591
+65, 32768, 1, 1, 512, 3.31686, 2.99032, 1.07665, 1.07665, 1.32772, 0.912501, 0, 64000, 99.9997, 0.188801, 0.185309, 63.1373, 0.00542644, 0.00542644, 0.024166, 0.282282, 8, 4, 2, 1, 1, 4, 0, 0, 0, 0, 0, 0, 2.35415, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 0.188801, 0.0977438, 0.00027431, 0.000618758, 0.00446636, 2, 2304, 0.0303117, 0.0477919, 0.0384748, 0.0621465, 0, 0, 0, 27.8894, 3.02279, 46.3457, 50, 10, 10, 2.05308
+65, 65536, 1, 1, 512, 3.75551, 4.14515, 1.19468, 1.19468, 1.61712, 0.94371, 0, 64000, 99.9992, 0.227443, 0.223951, 54.8697, 0.00621556, 0.00621556, 0.0686973, 0.307861, 8, 4, 2, 1, 1, 4, 0, 0, 0, 0, 0, 0, 4.31711, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 0.227443, 0.101854, 0.000500872, 0.00084532, 0.00447211, 2, 2304, 0.0309805, 0.0513089, 0.0402212, 0.0962169, 0, 0, 0, 38.1308, 2.50923, 39.5077, 50, 10, 10, 2.17275
+65, 131072, 1, 1, 512, 4.50009, 5.2458, 1.23846, 1.23846, 2.1965, 1.06513, 0, 64000, 99.999, 0.370867, 0.363882, 70.6978, 0.00683683, 0.00683683, 0.116486, 0.43523, 8, 4, 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 6.10744, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 0.370867, 0.121813, 0.000623297, 0.00131219, 0.00447787, 2, 4608, 0.0598007, 0.0618269, 0.0473916, 0.191193, 0, 0, 0, 46.7693, 3.05356, 28.6823, 50, 10, 10, 1.53116
+65, 262144, 1, 1, 512, 5.16174, 7.03695, 1.45885, 1.45885, 2.40362, 1.29927, 0, 64000, 99.9972, 0.558579, 0.551595, 79.378, 0.0139056, 0.0139056, 0.229481, 0.568687, 16, 4, 4, 1, 1, 8, 0, 0, 0, 0, 0, 0, 9.34835, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 0.558579, 0.170445, 0.00175071, 0.00243961, 0.0067782, 2, 4608, 0.0638026, 0.0686428, 0.0506055, 0.328715, 0, 0, 0, 55.4502, 2.04342, 27.0549, 50, 10, 10, 1.51412
+65, 524288, 1, 1, 512, 5.90729, 7.03695, 1.76253, 1.76253, 2.40362, 1.74115, 0, 64000, 99.9972, 1.04265, 1.02868, 148.168, 0.024021, 0.024021, 0.317465, 1.00377, 32, 4, 8, 1, 1, 16, 0, 0, 0, 0, 0, 0, 10.5927, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 1.04265, 0.266383, 0.00331908, 0.00469687, 0.00918596, 2, 9216, 0.128697, 0.0834746, 0.0587421, 0.657431, 0, 0, 0, 59.4127, 2.18944, 23.0386, 50, 10, 10, 0.869118
+65, 1048576, 1, 1, 512, 6.50703, 8.13816, 1.76927, 1.75421, 2.98355, 1.76927, 0, 64000, 99.9967, 1.04493, 1.03096, 128.398, 0.0331781, 0.0331781, 0.347146, 1.5832, 16, 8, 8, 1, 1, 16, 0, 0, 0, 0, 0, 0, 13.4318, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 1.04493, 0.273583, 0.00184324, 0.00322104, 0.013418, 4, 9216, 0.121765, 0.141056, 0.110773, 0.65495, 0, 0, 0, 59.2833, 2.16754, 23.7158, 50, 10, 10, 1.02491
+65, 2097152, 1, 1, 512, 6.83858, 8.13816, 2.01439, 2.01439, 2.98355, 1.84065, 0, 64000, 99.9967, 1.12553, 1.11156, 138.303, 0.0661909, 0.0661909, 0.348012, 3.01419, 16, 16, 8, 1, 16, 1, 0, 0, 0, 0, 0, 0, 14.11, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 1.12553, 0.354185, 0.00184324, 0.00322104, 0.0266709, 8, 9216, 0.121767, 0.214367, 0.179712, 0.65495, 0, 0, 0, 55.0378, 2.01232, 29.2835, 50, 10, 10, 1.22631
+65, 4194304, 1, 1, 512, 8.02853, 7.03695, 2.93388, 2.69103, 2.40362, 2.93388, 0, 64000, 99.9972, 2.19009, 2.16215, 311.228, 0.172924, 0.172924, 0.467862, 5.42702, 64, 16, 16, 1, 1, 32, 0, 0, 0, 0, 0, 0, 15.6735, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 2.19009, 0.637557, 0.00665223, 0.00940781, 0.0353868, 8, 18432, 0.259231, 0.264407, 0.215182, 1.31486, 0, 0, 0, 56.5699, 2.08468, 27.2378, 50, 10, 10, 0.679358
+65, 8388608, 1, 1, 512, 8.76424, 5.2458, 3.39393, 3.39393, 2.1965, 3.17381, 0, 64000, 99.999, 2.86984, 2.81396, 547.074, 0.226045, 0.226045, 0.338889, 10.4706, 64, 32, 32, 1, 4, 16, 0, 0, 0, 0, 0, 0, 16.2475, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 2.86984, 0.877413, 0.00476335, 0.0102745, 0.0668039, 16, 36864, 0.487267, 0.357445, 0.324833, 1.52955, 0, 0, 0, 48.3516, 3.15687, 29.554, 50, 10, 10, 0.522379
+65, 16777216, 1, 1, 512, 10.3056, 7.03695, 3.95819, 3.95819, 2.40362, 3.94377, 0, 64000, 99.9972, 4.3959, 4.34002, 624.688, 0.793739, 0.793739, 0.608271, 17.8297, 128, 32, 32, 1, 2, 32, 0, 0, 0, 0, 0, 0, 19.0829, 0.734018, 0.282409, 0.000103647, 54.5344, 0, 0, 4.3959, 1.29083, 0.0135484, 0.0190595, 0.0828256, 16, 36864, 0.521782, 0.519346, 0.473655, 2.62972, 0, 0, 0, 56.3677, 2.07723, 28.1384, 50, 10, 10, 0.589723
diff --git a/src/gpuwattch/cacti/parameter.cc b/src/gpuwattch/cacti/parameter.cc
new file mode 100644
index 000000000..3f5a80279
--- /dev/null
+++ b/src/gpuwattch/cacti/parameter.cc
@@ -0,0 +1,732 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <iostream>
+#include <string>
+#include <iomanip>
+
+#include "parameter.h"
+#include "area.h"
+
+using namespace std;
+
+
+InputParameter * g_ip;  // global input parameters; the DynamicParameter constructor below dereferences it without a null check
+TechnologyParameter g_tp;  // global technology parameters; the DynamicParameter constructor below reads wire pitch and cell dimensions from it
+
+
+
+// Prints each device parameter with its unit on its own line of stdout; every line starts with `indent` spaces.
+void TechnologyParameter::DeviceType::display(uint32_t indent)
+{
+  const string pad(indent, ' ');
+  cout << pad << "C_g_ideal = " << setw(12) << C_g_ideal << " F/um" << endl
+       << pad << "C_fringe  = " << setw(12) << C_fringe << " F/um" << endl
+       << pad << "C_overlap = " << setw(12) << C_overlap << " F/um" << endl
+       << pad << "C_junc    = " << setw(12) << C_junc << " F/um^2" << endl
+       << pad << "l_phy     = " << setw(12) << l_phy << " um" << endl
+       << pad << "l_elec    = " << setw(12) << l_elec << " um" << endl
+       << pad << "R_nch_on  = " << setw(12) << R_nch_on << " ohm-um" << endl
+       << pad << "R_pch_on  = " << setw(12) << R_pch_on << " ohm-um" << endl
+       << pad << "Vdd       = " << setw(12) << Vdd << " V" << endl
+       << pad << "Vth       = " << setw(12) << Vth << " V" << endl
+       << pad << "I_on_n    = " << setw(12) << I_on_n << " A/um" << endl
+       << pad << "I_on_p    = " << setw(12) << I_on_p << " A/um" << endl
+       << pad << "I_off_n   = " << setw(12) << I_off_n << " A/um" << endl
+       << pad << "I_off_p   = " << setw(12) << I_off_p << " A/um" << endl
+       << pad << "C_ox      = " << setw(12) << C_ox << " F/um^2" << endl
+       << pad << "t_ox      = " << setw(12) << t_ox << " um" << endl
+       << pad << "n_to_p_eff_curr_drv_ratio = " << n_to_p_eff_curr_drv_ratio << endl;
+}
+
+
+
+// Prints pitch, R_per_um and C_per_um with their units to stdout; every line starts with `indent` spaces.
+void TechnologyParameter::InterconnectType::display(uint32_t indent)
+{
+  const string pad(indent, ' ');
+  cout << pad << "pitch    = " << setw(12) << pitch << " um" << endl
+       << pad << "R_per_um = " << setw(12) << R_per_um << " ohm/um" << endl
+       << pad << "C_per_um = " << setw(12) << C_per_um << " F/um" << endl;
+}
+
+// Prints logic_scaling_co_eff and core_tx_density to stdout; every line starts with `indent` spaces.
+void TechnologyParameter::ScalingFactor::display(uint32_t indent)
+{
+  const string pad(indent, ' ');
+  cout << pad << "logic_scaling_co_eff    = " << setw(12) << logic_scaling_co_eff << endl
+       << pad << "curr_core_tx_density = " << setw(12) << core_tx_density << " # of tx/um^2" << endl;
+}
+
+// Prints the memory-cell dimensions and Vbitpre with their units to stdout; every line starts with `indent` spaces.
+void TechnologyParameter::MemoryType::display(uint32_t indent)
+{
+  const string pad(indent, ' ');
+  cout << pad << "b_w         = " << setw(12) << b_w << " um" << endl
+       << pad << "b_h         = " << setw(12) << b_h << " um" << endl
+       << pad << "cell_a_w    = " << setw(12) << cell_a_w << " um" << endl
+       << pad << "cell_pmos_w = " << setw(12) << cell_pmos_w << " um" << endl
+       << pad << "cell_nmos_w = " << setw(12) << cell_nmos_w << " um" << endl
+       << pad << "Vbitpre     = " << setw(12) << Vbitpre << " V" << endl;
+}
+
+
+
+// Prints every top-level technology parameter to stdout (each line starts with `indent` spaces), then each nested device/wire/memory member via its own display() at indent + 2.
+void TechnologyParameter::display(uint32_t indent)
+{
+  const string pad(indent, ' ');
+  cout << pad << "ram_wl_stitching_overhead_ = " << setw(12) << ram_wl_stitching_overhead_ << " um" << endl
+       << pad << "min_w_nmos_                = " << setw(12) << min_w_nmos_ << " um" << endl
+       << pad << "max_w_nmos_                = " << setw(12) << max_w_nmos_ << " um" << endl
+       << pad << "unit_len_wire_del          = " << setw(12) << unit_len_wire_del << " s/um^2" << endl
+       << pad << "FO4                        = " << setw(12) << FO4 << " s" << endl
+       << pad << "kinv                       = " << setw(12) << kinv << " s" << endl
+       << pad << "vpp                        = " << setw(12) << vpp << " V" << endl
+       << pad << "w_sense_en                 = " << setw(12) << w_sense_en << " um" << endl
+       << pad << "w_sense_n                  = " << setw(12) << w_sense_n << " um" << endl
+       << pad << "w_sense_p                  = " << setw(12) << w_sense_p << " um" << endl
+       << pad << "w_iso                      = " << setw(12) << w_iso << " um" << endl
+       << pad << "w_poly_contact             = " << setw(12) << w_poly_contact << " um" << endl
+       << pad << "spacing_poly_to_poly       = " << setw(12) << spacing_poly_to_poly << " um" << endl
+       << pad << "spacing_poly_to_contact    = " << setw(12) << spacing_poly_to_contact << " um" << endl
+       << endl
+       << pad << "w_comp_inv_p1              = " << setw(12) << w_comp_inv_p1 << " um" << endl
+       << pad << "w_comp_inv_p2              = " << setw(12) << w_comp_inv_p2 << " um" << endl
+       << pad << "w_comp_inv_p3              = " << setw(12) << w_comp_inv_p3 << " um" << endl
+       << pad << "w_comp_inv_n1              = " << setw(12) << w_comp_inv_n1 << " um" << endl
+       << pad << "w_comp_inv_n2              = " << setw(12) << w_comp_inv_n2 << " um" << endl
+       << pad << "w_comp_inv_n3              = " << setw(12) << w_comp_inv_n3 << " um" << endl
+       << pad << "w_eval_inv_p               = " << setw(12) << w_eval_inv_p << " um" << endl
+       << pad << "w_eval_inv_n               = " << setw(12) << w_eval_inv_n << " um" << endl
+       << pad << "w_comp_n                   = " << setw(12) << w_comp_n << " um" << endl
+       << pad << "w_comp_p                   = " << setw(12) << w_comp_p << " um" << endl
+       << endl
+       << pad << "dram_cell_I_on             = " << setw(12) << dram_cell_I_on << " A/um" << endl
+       << pad << "dram_cell_Vdd              = " << setw(12) << dram_cell_Vdd << " V" << endl
+       << pad << "dram_cell_I_off_worst_case_len_temp = " << setw(12) << dram_cell_I_off_worst_case_len_temp << " A/um" << endl
+       << pad << "dram_cell_C                = " << setw(12) << dram_cell_C << " F" << endl
+       << pad << "gm_sense_amp_latch         = " << setw(12) << gm_sense_amp_latch << " F/s" << endl
+       << endl
+       << pad << "w_nmos_b_mux               = " << setw(12) << w_nmos_b_mux << " um" << endl
+       << pad << "w_nmos_sa_mux              = " << setw(12) << w_nmos_sa_mux << " um" << endl
+       << pad << "w_pmos_bl_precharge        = " << setw(12) << w_pmos_bl_precharge << " um" << endl
+       << pad << "w_pmos_bl_eq               = " << setw(12) << w_pmos_bl_eq << " um" << endl
+       << pad << "MIN_GAP_BET_P_AND_N_DIFFS  = " << setw(12) << MIN_GAP_BET_P_AND_N_DIFFS << " um" << endl
+       << pad << "HPOWERRAIL                 = " << setw(12) << HPOWERRAIL << " um" << endl
+       << pad << "cell_h_def                 = " << setw(12) << cell_h_def << " um" << endl;
+
+  cout << endl
+       << pad << "SRAM cell transistor: " << endl;
+  sram_cell.display(indent + 2);
+
+  cout << endl
+       << pad << "DRAM access transistor: " << endl;
+  dram_acc.display(indent + 2);
+
+  cout << endl
+       << pad << "DRAM wordline transistor: " << endl;
+  dram_wl.display(indent + 2);
+
+  cout << endl
+       << pad << "peripheral global transistor: " << endl;
+  peri_global.display(indent + 2);
+
+  cout << endl
+       << pad << "wire local" << endl;
+  wire_local.display(indent + 2);
+
+  cout << endl
+       << pad << "wire inside mat" << endl;
+  wire_inside_mat.display(indent + 2);
+
+  cout << endl
+       << pad << "wire outside mat" << endl;
+  wire_outside_mat.display(indent + 2);
+
+  cout << endl
+       << pad << "SRAM" << endl;
+  sram.display(indent + 2);
+
+  cout << endl
+       << pad << "DRAM" << endl;
+  dram.display(indent + 2);
+}
+
+
+// Default constructor: sets use_inp_params to 0, value-initializes cell, marks the object valid. NOTE(review): the other members the parameterized constructor sets are not initialized here.
+DynamicParameter::DynamicParameter()
+  : use_inp_params(0), cell(), is_valid(true)
+{}
+
+
+
+DynamicParameter::DynamicParameter(
+    bool is_tag_,
+    int pure_ram_,
+    int pure_cam_,
+    double Nspd_,
+    unsigned int Ndwl_,
+    unsigned int Ndbl_,
+    unsigned int Ndcm_,
+    unsigned int Ndsam_lev_1_,
+    unsigned int Ndsam_lev_2_,
+    bool is_main_mem_):
+  is_tag(is_tag_), pure_ram(pure_ram_), pure_cam(pure_cam_), tagbits(0), Nspd(Nspd_), Ndwl(Ndwl_), Ndbl(Ndbl_),Ndcm(Ndcm_),
+  Ndsam_lev_1(Ndsam_lev_1_), Ndsam_lev_2(Ndsam_lev_2_),
+  number_way_select_signals_mat(0), V_b_sense(0), use_inp_params(0),
+  is_main_mem(is_main_mem_), cell(), is_valid(false)
+{
+
+	num_di_b_bank_per_port=0;
+	num_do_b_bank_per_port=0;
+	num_di_b_mat=0;
+	num_do_b_mat=0;
+	num_di_b_subbank=0;
+	num_do_b_subbank=0;
+	num_si_b_mat=0;
+	num_so_b_mat=0;
+	num_si_b_subbank=0;
+	num_so_b_subbank=0;
+	num_si_b_bank_per_port=0;
+	num_so_b_bank_per_port=0;
+
+
+  ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
+  is_dram            = ((ram_cell_tech_type == lp_dram) || (ram_cell_tech_type == comm_dram));
+
+  unsigned int capacity_per_die = g_ip->cache_sz / NUMBER_STACKED_DIE_LAYERS;  // capacity per stacked die layer
+  const TechnologyParameter::InterconnectType & wire_local = g_tp.wire_local;
+  fully_assoc = (g_ip->fully_assoc) ? true : false;
+
+
+
+
+
+  if (fully_assoc || pure_cam)
+  { // fully-assocative cache -- ref: CACTi 2.0 report
+	  if (Ndwl != 1 ||            //Ndwl is fixed to 1 for FA
+			  Ndcm != 1 ||            //Ndcm is fixed to 1 for FA
+			  Nspd < 1 || Nspd > 1 || //Nspd is fixed to 1 for FA
+			  Ndsam_lev_1 != 1 ||     //Ndsam_lev_1 is fixed to one
+			  Ndsam_lev_2 != 1 ||     //Ndsam_lev_2 is fixed to one
+			  Ndbl < 2)
+	  {
+          return;
+	  }
+  }
+
+  if ((is_dram) && (!is_tag) && (Ndcm > 1))
+  {
+	  return;  // For a DRAM array, each bitline has its own sense-amp
+  }
+
+  // If it's not an FA tag/data array, Ndwl should be at least two and Ndbl should be
+  // at least two because an array is assumed to have at least one mat. And a mat
+  // is formed out of two horizontal subarrays and two vertical subarrays
+  if (fully_assoc == false && (Ndwl < 1 || Ndbl < 1))
+  {
+	  return;
+  }
+
+  //***********compute row, col of an subarray
+  if (!(fully_assoc || pure_cam))//Not fully_asso nor cam
+  {
+	  // if data array, let tagbits = 0
+	  if (is_tag)
+	  {
+		  if (g_ip->specific_tag)
+		  {
+			  tagbits = g_ip->tag_w;
+		  }
+		  else
+		  {
+			  tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(capacity_per_die) +
+			  _log2(g_ip->tag_assoc*2 - 1) - _log2(g_ip->nbanks);
+
+		  }
+		  tagbits = (((tagbits + 3) >> 2) << 2);
+
+		  num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
+				  g_ip->block_sz * g_ip->tag_assoc * Ndbl * Nspd));// + EPSILON);
+		  num_c_subarray = (int)ceil((tagbits * g_ip->tag_assoc * Nspd / Ndwl));// + EPSILON);
+		  //burst_length = 1;
+	  }
+	  else
+	  {
+		  num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks *
+				  g_ip->block_sz * g_ip->data_assoc * Ndbl * Nspd));// + EPSILON);
+		  num_c_subarray = (int)ceil((8 * g_ip->block_sz * g_ip->data_assoc * Nspd / Ndwl));// + EPSILON); + EPSILON);
+		  // burst_length = g_ip->block_sz * 8 / g_ip->out_w;
+	  }
+
+	  if (num_r_subarray < MINSUBARRAYROWS) return;
+	  if (num_r_subarray == 0) return;
+	  if (num_r_subarray > MAXSUBARRAYROWS) return;
+	  if (num_c_subarray < MINSUBARRAYCOLS) return;
+	  if (num_c_subarray > MAXSUBARRAYCOLS) return;
+
+  }
+
+  else
+  {//either fully-asso or cam
+	  if (pure_cam)
+	  {
+		  if (g_ip->specific_tag)
+		  {
+			  tagbits = int(ceil(g_ip->tag_w/8.0)*8);
+		  }
+		  else
+		  {
+			  tagbits = int(ceil((ADDRESS_BITS + EXTRA_TAG_BITS)/8.0)*8);
+//			  cout<<"Pure CAM needs tag width to be specified"<<endl;
+//			  exit(0);
+		  }
+		  //tagbits = (((tagbits + 3) >> 2) << 2);
+
+		  tag_num_r_subarray = (int)ceil(capacity_per_die / (g_ip->nbanks*tagbits/8.0 * Ndbl));//TODO: error check input of tagbits and blocksize //TODO: for pure CAM, g_ip->block should be number of entries.
+		  //tag_num_c_subarray = (int)(tagbits  + EPSILON);
+		  tag_num_c_subarray = tagbits;
+		  if (tag_num_r_subarray == 0) return;
+		  if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
+		  if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
+		  if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
+		  num_r_subarray = tag_num_r_subarray;
+	  }
+	  else //fully associative
+	  {
+		  if (g_ip->specific_tag)
+		  {
+			  tagbits = g_ip->tag_w;
+		  }
+		  else
+		  {
+			  tagbits = ADDRESS_BITS + EXTRA_TAG_BITS - _log2(g_ip->block_sz);//TODO: should be the page_offset=log2(page size), but this info is not avail with CACTI, for McPAT this is no problem.
+		  }
+		  tagbits = (((tagbits + 3) >> 2) << 2);
+
+		  tag_num_r_subarray = (int)(capacity_per_die / (g_ip->nbanks*g_ip->block_sz * Ndbl));
+		  tag_num_c_subarray = (int)ceil((tagbits * Nspd / Ndwl));// + EPSILON);
+		  if (tag_num_r_subarray == 0) return;
+		  if (tag_num_r_subarray > MAXSUBARRAYROWS) return;
+		  if (tag_num_c_subarray < MINSUBARRAYCOLS) return;
+		  if (tag_num_c_subarray > MAXSUBARRAYCOLS) return;
+
+		  data_num_r_subarray = tag_num_r_subarray;
+		  data_num_c_subarray = 8 * g_ip->block_sz;
+		  if (data_num_r_subarray == 0) return;
+		  if (data_num_r_subarray > MAXSUBARRAYROWS) return;
+		  if (data_num_c_subarray < MINSUBARRAYCOLS) return;
+		  if (data_num_c_subarray > MAXSUBARRAYCOLS) return;
+		  num_r_subarray = tag_num_r_subarray;
+	  }
+  }
+
+  num_subarrays = Ndwl * Ndbl;
+  //****************end of computation of row, col of an subarray
+
+  // calculate wire parameters
+  if (fully_assoc || pure_cam)
+  {
+	  cam_cell.h = g_tp.cam.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
+	  + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
+	  cam_cell.w = g_tp.cam.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports-1 + g_ip->num_rd_ports + g_ip->num_wr_ports)
+	  + 2 * wire_local.pitch*(g_ip->num_search_ports-1) + wire_local.pitch * g_ip->num_se_rd_ports;
+
+	  cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +g_ip->num_rw_ports-1 + g_ip->num_rd_ports)
+	  + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
+	  cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports -1 + (g_ip->num_rd_ports - g_ip->num_se_rd_ports)
+			  + g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports + 2 * wire_local.pitch*(g_ip->num_search_ports-1);
+  }
+  else
+  {
+	  if(is_tag)
+	  {
+		  cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_rd_ports +
+				  g_ip->num_wr_ports);
+		  cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 + g_ip->num_wr_ports +
+				  (g_ip->num_rd_ports - g_ip->num_se_rd_ports)) +
+				  wire_local.pitch * g_ip->num_se_rd_ports;
+	  }
+	  else
+	  {
+		  if (is_dram)
+		  {
+			  cell.h = g_tp.dram.b_h;
+			  cell.w = g_tp.dram.b_w;
+		  }
+		  else
+		  {
+			  cell.h = g_tp.sram.b_h + 2 * wire_local.pitch * (g_ip->num_wr_ports +
+					  g_ip->num_rw_ports - 1 + g_ip->num_rd_ports);
+			  cell.w = g_tp.sram.b_w + 2 * wire_local.pitch * (g_ip->num_rw_ports - 1 +
+					  (g_ip->num_rd_ports - g_ip->num_se_rd_ports) +
+					  g_ip->num_wr_ports) + g_tp.wire_local.pitch * g_ip->num_se_rd_ports;
+		  }
+	  }
+  }
+
+  double c_b_metal = cell.h * wire_local.C_per_um;
+  double C_bl;
+
+  if (!(fully_assoc || pure_cam))
+  {
+	  if (is_dram)
+	  {
+		  deg_bl_muxing = 1;
+		  if (ram_cell_tech_type == comm_dram)
+		  {
+			  C_bl  = num_r_subarray * c_b_metal;
+			  V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C / (g_tp.dram_cell_C + C_bl);
+			  if (V_b_sense < VBITSENSEMIN)
+			  {
+				  return;
+			  }
+			  V_b_sense = VBITSENSEMIN;  // in any case, we fix sense amp input signal to a constant value
+			  dram_refresh_period = 64e-3;
+		  }
+		  else
+		  {
+			  double Cbitrow_drain_cap = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;
+			  C_bl  = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
+			  V_b_sense = (g_tp.dram_cell_Vdd/2) * g_tp.dram_cell_C /(g_tp.dram_cell_C + C_bl);
+
+			  if (V_b_sense < VBITSENSEMIN)
+			  {
+				  return; //Sense amp input signal is smaller that minimum allowable sense amp input signal
+			  }
+			  V_b_sense = VBITSENSEMIN; // in any case, we fix sense amp input signal to a constant value
+			  //v_storage_worst = g_tp.dram_cell_Vdd / 2 - VBITSENSEMIN * (g_tp.dram_cell_C + C_bl) / g_tp.dram_cell_C;
+			  //dram_refresh_period = 1.1 * g_tp.dram_cell_C * v_storage_worst / g_tp.dram_cell_I_off_worst_case_len_temp;
+			  dram_refresh_period = 0.9 * g_tp.dram_cell_C * VDD_STORAGE_LOSS_FRACTION_WORST * g_tp.dram_cell_Vdd / g_tp.dram_cell_I_off_worst_case_len_temp;
+		  }
+	  }
+	  else
+	  { //SRAM
+		  V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
+		  deg_bl_muxing = Ndcm;
+		  // "/ 2.0" below is due to the fact that two adjacent access transistors share drain
+		  // contacts in a physical layout
+		  double Cbitrow_drain_cap = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;
+		  C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
+		  dram_refresh_period = 0;
+	  }
+  }
+  else
+  {
+	  c_b_metal = cam_cell.h * wire_local.C_per_um;//IBM and SUN design, SRAM array uses dummy cells to fill the blank space due to mismatch on CAM-RAM
+	  V_b_sense = (0.05 * g_tp.sram_cell.Vdd > VBITSENSEMIN) ? 0.05 * g_tp.sram_cell.Vdd : VBITSENSEMIN;
+	  deg_bl_muxing = 1;//FA fix as 1
+	  // "/ 2.0" below is due to the fact that two adjacent access transistors share drain
+	  // contacts in a physical layout
+	  double Cbitrow_drain_cap = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;//TODO: comment out these two lines
+	  C_bl = num_r_subarray * (Cbitrow_drain_cap + c_b_metal);
+	  dram_refresh_period = 0;
+  }
+
+
+  // do/di: data in/out, for fully associative they are the data width for normal read and write
+  // so/si: search data in/out, for fully associative they are the data width for the search ops
+  // for CAM, si=di, but so = matching address. do = data out = di (for normal read/write)
+  // so/si needs broadcase while do/di do not
+
+  if (fully_assoc || pure_cam)
+  {
+	    switch (Ndbl) {
+	      case (0):
+	        cout <<  "   Invalid Ndbl \n"<<endl;
+	        exit(0);
+	        break;
+	      case (1):
+	    	  num_mats_h_dir = 1;//one subarray per mat
+	    	  num_mats_v_dir = 1;
+	        break;
+	      case (2):
+	    	  num_mats_h_dir = 1;//two subarrays per mat
+	    	  num_mats_v_dir = 1;
+	    	  break;
+	      default:
+	    	  num_mats_h_dir = int(floor(sqrt(Ndbl/4.0)));//4 subbarrys per mat
+	    	  num_mats_v_dir = int(Ndbl/4.0 / num_mats_h_dir);
+	    }
+	    num_mats = num_mats_h_dir * num_mats_v_dir;
+
+	    if (fully_assoc)
+	    {
+	    	num_so_b_mat   = data_num_c_subarray;
+	    	num_do_b_mat   = data_num_c_subarray + tagbits;
+	    }
+	    else
+	    {
+	    	num_so_b_mat = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
+	    	num_do_b_mat = tagbits;
+	    }
+  }
+  else
+  {
+	  num_mats_h_dir = MAX(Ndwl / 2, 1);
+	  num_mats_v_dir = MAX(Ndbl / 2, 1);
+	  num_mats       = num_mats_h_dir * num_mats_v_dir;
+	  num_do_b_mat   = MAX((num_subarrays/num_mats) * num_c_subarray / (deg_bl_muxing * Ndsam_lev_1 * Ndsam_lev_2), 1);
+  }
+
+  if (!(fully_assoc|| pure_cam) && (num_do_b_mat < (num_subarrays/num_mats)))
+  {
+	  return;
+  }
+
+
+  int deg_sa_mux_l1_non_assoc;
+  //TODO:the i/o for subbank is not necessary and should be removed.
+  if (!(fully_assoc || pure_cam))
+  {
+	  if (!is_tag)
+	  {
+		  if (is_main_mem == true)
+		  {
+			  num_do_b_subbank = g_ip->int_prefetch_w * g_ip->out_w;
+			  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
+		  }
+		  else
+		  {
+			  if (g_ip->fast_access == true)
+			  {
+				  num_do_b_subbank = g_ip->out_w * g_ip->data_assoc;
+				  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
+			  }
+			  else
+			  {
+
+				  num_do_b_subbank = g_ip->out_w;
+				  deg_sa_mux_l1_non_assoc = Ndsam_lev_1 / g_ip->data_assoc;
+				  if (deg_sa_mux_l1_non_assoc < 1)
+				  {
+					  return;
+				  }
+
+			  }
+		  }
+	  }
+	  else
+	  {
+		  num_do_b_subbank = tagbits * g_ip->tag_assoc;
+		  if (num_do_b_mat < tagbits)
+		  {
+			  return;
+		  }
+		  deg_sa_mux_l1_non_assoc = Ndsam_lev_1;
+		  //num_do_b_mat = g_ip->tag_assoc / num_mats_h_dir;
+	  }
+  }
+  else
+  {
+	  if (fully_assoc)
+	  {
+		  num_so_b_subbank = 8 * g_ip->block_sz;//TODO:internal perfetch should be considered also for fa
+		  num_do_b_subbank = num_so_b_subbank + tag_num_c_subarray;
+	  }
+	  else
+	  {
+		  num_so_b_subbank = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));//the address contains the matched data
+		  num_do_b_subbank = tag_num_c_subarray;
+	  }
+
+	  deg_sa_mux_l1_non_assoc = 1;
+  }
+
+  deg_senseamp_muxing_non_associativity = deg_sa_mux_l1_non_assoc;
+
+  if (fully_assoc || pure_cam)
+  {
+	  num_act_mats_hor_dir = 1;
+	  num_act_mats_hor_dir_sl = num_mats_h_dir;//TODO: this is unnecessary, since search op, num_mats is used
+  }
+  else
+  {
+	  num_act_mats_hor_dir = num_do_b_subbank / num_do_b_mat;
+	  if (num_act_mats_hor_dir == 0)
+	  {
+		  return;
+	  }
+  }
+
+  //compute num_do_mat for tag
+  if (is_tag)
+  {
+	  if (!(fully_assoc || pure_cam))
+	  {
+		  num_do_b_mat     = g_ip->tag_assoc / num_act_mats_hor_dir;
+		  num_do_b_subbank = num_act_mats_hor_dir * num_do_b_mat;
+	  }
+  }
+
+  if ((g_ip->is_cache == false && is_main_mem == true) || (PAGE_MODE == 1 && is_dram))
+  {
+	  if (num_act_mats_hor_dir * num_do_b_mat * Ndsam_lev_1 * Ndsam_lev_2 != (int)g_ip->page_sz_bits)
+	  {
+		  return;
+	  }
+  }
+
+//  if (is_tag == false && g_ip->is_cache == true && !fully_assoc && !pure_cam && //TODO: TODO burst transfer should also apply to RAM arrays
+  if (is_tag == false && g_ip->is_main_mem == true &&
+		  num_act_mats_hor_dir*num_do_b_mat*Ndsam_lev_1*Ndsam_lev_2 < ((int) g_ip->out_w * (int) g_ip->burst_len * (int) g_ip->data_assoc))
+  {
+	  return;
+  }
+
+  if (num_act_mats_hor_dir > num_mats_h_dir)
+  {
+	  return;
+  }
+
+
+  //compute di for mat subbank and bank
+  if (!(fully_assoc ||pure_cam))
+  {
+	  if(!is_tag)
+	  {
+		  if(g_ip->fast_access == true)
+		  {
+			  num_di_b_mat = num_do_b_mat / g_ip->data_assoc;
+		  }
+		  else
+		  {
+			  num_di_b_mat = num_do_b_mat;
+		  }
+	  }
+	  else
+	  {
+		  num_di_b_mat = tagbits;
+	  }
+  }
+  else
+  {
+	  if (fully_assoc)
+	  {
+		  num_di_b_mat = num_do_b_mat;
+		  //*num_subarrays/num_mats; bits per mat of CAM/FA is as same as cache,
+		  //but inside the mat wire tracks need to be reserved for search data bus
+		  num_si_b_mat = tagbits;
+	  }
+	  else
+	  {
+		  num_di_b_mat = tagbits;
+		  num_si_b_mat = tagbits;//*num_subarrays/num_mats;
+	  }
+
+  }
+
+  num_di_b_subbank       = num_di_b_mat * num_act_mats_hor_dir;//normal cache or normal r/w for FA
+  num_si_b_subbank       = num_si_b_mat; //* num_act_mats_hor_dir_sl; inside the data is broadcast
+
+  int num_addr_b_row_dec     = _log2(num_r_subarray);
+  if  ((fully_assoc ||pure_cam))
+	  num_addr_b_row_dec     +=_log2(num_subarrays/num_mats);
+  int number_subbanks        = num_mats / num_act_mats_hor_dir;
+  number_subbanks_decode = _log2(number_subbanks);//TODO: add log2(num_subarray_per_bank) to FA/CAM
+
+  num_rw_ports = g_ip->num_rw_ports;
+  num_rd_ports = g_ip->num_rd_ports;
+  num_wr_ports = g_ip->num_wr_ports;
+  num_se_rd_ports = g_ip->num_se_rd_ports;
+  num_search_ports = g_ip->num_search_ports;
+
+  if (is_dram && is_main_mem)
+  {
+	  number_addr_bits_mat = MAX((unsigned int) num_addr_b_row_dec,
+			  _log2(deg_bl_muxing) + _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2));
+  }
+  else
+  {
+	  number_addr_bits_mat = num_addr_b_row_dec + _log2(deg_bl_muxing) +
+	  _log2(deg_sa_mux_l1_non_assoc) + _log2(Ndsam_lev_2);
+  }
+
+  if (!(fully_assoc ||pure_cam))
+  {
+	  if (is_tag)
+	  {
+		  num_di_b_bank_per_port = tagbits;
+		  num_do_b_bank_per_port = g_ip->data_assoc;
+	  }
+	  else
+	  {
+		  num_di_b_bank_per_port = g_ip->out_w + g_ip->data_assoc;
+		  num_do_b_bank_per_port = g_ip->out_w;
+	  }
+  }
+  else
+  {
+	  if (fully_assoc)
+	  {
+		  num_di_b_bank_per_port = g_ip->out_w + tagbits;//TODO: out_w or block_sz?
+		  num_si_b_bank_per_port = tagbits;
+		  num_do_b_bank_per_port = g_ip->out_w + tagbits;
+		  num_so_b_bank_per_port = g_ip->out_w;
+	  }
+	  else
+	  {
+		  num_di_b_bank_per_port = tagbits;
+		  num_si_b_bank_per_port = tagbits;
+		  num_do_b_bank_per_port = tagbits;
+		  num_so_b_bank_per_port = int(ceil(log2(num_r_subarray)) + ceil(log2(num_subarrays)));
+	  }
+  }
+
+  if ((!is_tag) && (g_ip->data_assoc > 1) && (!g_ip->fast_access))
+  {
+	  number_way_select_signals_mat = g_ip->data_assoc;
+  }
+
+  // add ECC adjustment to all data signals that traverse on H-trees.
+  if (g_ip->add_ecc_b_ == true)
+  {
+	  num_do_b_mat += (int) (ceil(num_do_b_mat / num_bits_per_ecc_b_));
+	  num_di_b_mat += (int) (ceil(num_di_b_mat / num_bits_per_ecc_b_));
+	  num_di_b_subbank += (int) (ceil(num_di_b_subbank / num_bits_per_ecc_b_));
+	  num_do_b_subbank += (int) (ceil(num_do_b_subbank / num_bits_per_ecc_b_));
+	  num_di_b_bank_per_port += (int) (ceil(num_di_b_bank_per_port / num_bits_per_ecc_b_));
+	  num_do_b_bank_per_port += (int) (ceil(num_do_b_bank_per_port / num_bits_per_ecc_b_));
+
+	  num_so_b_mat += (int) (ceil(num_so_b_mat / num_bits_per_ecc_b_));
+	  num_si_b_mat += (int) (ceil(num_si_b_mat / num_bits_per_ecc_b_));
+	  num_si_b_subbank += (int) (ceil(num_si_b_subbank / num_bits_per_ecc_b_));
+	  num_so_b_subbank += (int) (ceil(num_so_b_subbank / num_bits_per_ecc_b_));
+	  num_si_b_bank_per_port += (int) (ceil(num_si_b_bank_per_port / num_bits_per_ecc_b_));
+	  num_so_b_bank_per_port += (int) (ceil(num_so_b_bank_per_port / num_bits_per_ecc_b_));
+  }
+
+  is_valid = true;
+}
+
diff --git a/src/gpuwattch/cacti/parameter.h b/src/gpuwattch/cacti/parameter.h
new file mode 100644
index 000000000..2c977eedb
--- /dev/null
+++ b/src/gpuwattch/cacti/parameter.h
@@ -0,0 +1,367 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __PARAMETER_H__
+#define __PARAMETER_H__
+
+#include "area.h"
+#include "const.h"
+#include "cacti_interface.h"
+#include "io.h"
+
+// parameters which are functions of certain device technology
+class TechnologyParameter
+{
+ public:
+  class DeviceType  // transistor parameters for one device type; ctor and reset() zero every field
+  {
+   public:
+    double C_g_ideal;
+    double C_fringe;
+    double C_overlap;
+    double C_junc;  // C_junc_area
+    double C_junc_sidewall;
+    double l_phy;
+    double l_elec;
+    double R_nch_on;
+    double R_pch_on;
+    double Vdd;
+    double Vth;
+    double I_on_n;
+    double I_on_p;
+    double I_off_n;
+    double I_off_p;
+    double I_g_on_n;
+    double I_g_on_p;
+    double C_ox;
+    double t_ox;
+    double n_to_p_eff_curr_drv_ratio;
+    double long_channel_leakage_reduction;
+
+    DeviceType(): C_g_ideal(0), C_fringe(0), C_overlap(0), C_junc(0),
+                  C_junc_sidewall(0), l_phy(0), l_elec(0), R_nch_on(0), R_pch_on(0),
+                  Vdd(0), Vth(0),
+                  I_on_n(0), I_on_p(0), I_off_n(0), I_off_p(0),I_g_on_n(0),I_g_on_p(0),
+                  C_ox(0), t_ox(0), n_to_p_eff_curr_drv_ratio(0), long_channel_leakage_reduction(0) { };
+    void reset()  // restore the all-zero state that the constructor establishes
+    {
+      C_g_ideal = 0;
+      C_fringe  = 0;
+      C_overlap = 0;
+      C_junc    = 0;
+      C_junc_sidewall = 0;  // zeroed here as well so reset() matches the constructor
+      l_phy     = 0;
+      l_elec    = 0;
+      R_nch_on  = 0;
+      R_pch_on  = 0;
+      Vdd       = 0;
+      Vth       = 0;
+      I_on_n    = 0;
+      I_on_p    = 0;
+      I_off_n   = 0;
+      I_off_p   = 0;
+      I_g_on_n   = 0;
+      I_g_on_p   = 0;
+      C_ox      = 0;
+      t_ox      = 0;
+      n_to_p_eff_curr_drv_ratio = 0;
+      long_channel_leakage_reduction = 0;
+    }
+    void display(uint32_t indent = 0);
+  };
+  class InterconnectType  // parameters of one wire class; ctor and reset() zero every field
+  {
+   public:
+    double pitch;
+    double R_per_um;
+    double C_per_um;
+    double horiz_dielectric_constant;
+    double vert_dielectric_constant;
+    double aspect_ratio;
+    double miller_value;
+    double ild_thickness;
+
+    InterconnectType(): pitch(0), R_per_um(0), C_per_um(0), horiz_dielectric_constant(0),
+                        vert_dielectric_constant(0), aspect_ratio(0), miller_value(0), ild_thickness(0) { };
+    void reset()  // zero all eight fields, the same state the constructor produces
+    {
+      pitch = 0;
+      R_per_um = 0;
+      C_per_um = 0;
+      horiz_dielectric_constant = 0;
+      vert_dielectric_constant = 0;
+      aspect_ratio = 0;
+      miller_value = 0;
+      ild_thickness = 0;
+    }
+
+    void display(uint32_t indent = 0);
+  };
+  class MemoryType  // cell dimensions and related values for one memory cell kind
+  {
+   public:
+    double b_w;
+    double b_h;
+    double cell_a_w;
+    double cell_pmos_w;
+    double cell_nmos_w;
+    double Vbitpre;
+
+    MemoryType(): b_w(0), b_h(0), cell_a_w(0), cell_pmos_w(0), cell_nmos_w(0), Vbitpre(0) { };  // zero-init like the sibling classes
+    void reset()  // zero every field, the same state the constructor produces
+    {
+      b_w = 0;
+      b_h = 0;
+      cell_a_w = 0;
+      cell_pmos_w = 0;
+      cell_nmos_w = 0;
+      Vbitpre = 0;
+    }
+    void display(uint32_t indent = 0);
+  };
+
+  class ScalingFactor  // three scaling coefficients; all start at zero
+  {
+   public:
+    double logic_scaling_co_eff;
+    double core_tx_density;
+    double long_channel_leakage_reduction;
+
+    ScalingFactor()
+      : logic_scaling_co_eff(0), core_tx_density(0), long_channel_leakage_reduction(0) { }
+
+    void reset()
+    {
+      // Return all three coefficients to the constructor's zero state.
+      long_channel_leakage_reduction = 0;
+      logic_scaling_co_eff = core_tx_density = 0;
+    }
+
+    void display(uint32_t indent = 0);
+  };
+
+  double ram_wl_stitching_overhead_;
+  double min_w_nmos_;
+  double max_w_nmos_;
+  double max_w_nmos_dec;
+  double unit_len_wire_del;
+  double FO4;
+  double kinv;
+  double vpp;
+  double w_sense_en;
+  double w_sense_n;
+  double w_sense_p;
+  double sense_delay;
+  double sense_dy_power;
+  double w_iso;
+  double w_poly_contact;
+  double spacing_poly_to_poly;
+  double spacing_poly_to_contact;
+
+  double w_comp_inv_p1;
+  double w_comp_inv_p2;
+  double w_comp_inv_p3;
+  double w_comp_inv_n1;
+  double w_comp_inv_n2;
+  double w_comp_inv_n3;
+  double w_eval_inv_p;
+  double w_eval_inv_n;
+  double w_comp_n;
+  double w_comp_p;
+
+  double dram_cell_I_on;
+  double dram_cell_Vdd;
+  double dram_cell_I_off_worst_case_len_temp;
+  double dram_cell_C;
+  double gm_sense_amp_latch;
+
+  double w_nmos_b_mux;
+  double w_nmos_sa_mux;
+  double w_pmos_bl_precharge;
+  double w_pmos_bl_eq;
+  double MIN_GAP_BET_P_AND_N_DIFFS;
+  double MIN_GAP_BET_SAME_TYPE_DIFFS;
+  double HPOWERRAIL;
+  double cell_h_def;
+
+  double chip_layout_overhead;
+  double macro_layout_overhead;
+  double sckt_co_eff;
+
+  double fringe_cap;
+
+  uint64_t h_dec;
+
+  DeviceType sram_cell;   // SRAM cell transistor
+  DeviceType dram_acc;    // DRAM access transistor
+  DeviceType dram_wl;     // DRAM wordline transistor
+  DeviceType peri_global; // peripheral global
+  DeviceType cam_cell;   // presumably the CAM cell transistor (comment used to repeat sram_cell's) -- confirm
+
+  InterconnectType wire_local;
+  InterconnectType wire_inside_mat;
+  InterconnectType wire_outside_mat;
+
+  ScalingFactor scaling_factor;
+
+  MemoryType sram;
+  MemoryType dram;
+  MemoryType cam;
+
+  void display(uint32_t indent = 0);
+
+  void reset()
+  {
+    // Zero the scalar members named below and delegate to reset() on every
+    // nested device, wire, memory and scaling object. Scalar members that are
+    // not named in this body keep whatever value they already had.
+
+    dram_cell_C    = 0;
+    dram_cell_I_on = 0;
+    dram_cell_Vdd  = 0;
+    dram_cell_I_off_worst_case_len_temp = 0;
+    vpp            = 0;
+
+    fringe_cap     = 0;
+    sense_dy_power = 0;
+    sense_delay    = 0;
+
+    sckt_co_eff           = 0;
+    macro_layout_overhead = 0;
+    chip_layout_overhead  = 0;
+
+    // device types
+    cam_cell.reset();
+    peri_global.reset();
+    dram_wl.reset();
+    dram_acc.reset();
+    sram_cell.reset();
+
+    // interconnect classes
+    wire_outside_mat.reset();
+    wire_inside_mat.reset();
+    wire_local.reset();
+
+    // memory cell geometries and scaling factors
+    cam.reset();
+    dram.reset();
+    sram.reset();
+    scaling_factor.reset();
+  }
+};
+
+
+
+class DynamicParameter  // array-organization parameters; the parameterized constructor derives most of them from its arguments
+{
+  public:
+    bool is_tag;
+    bool pure_ram;
+    bool pure_cam;
+    bool fully_assoc;
+    int tagbits;
+    int num_subarrays;  // only for leakage computation  -- the number of subarrays per bank
+    int num_mats;       // only for leakage computation  -- the number of mats per bank
+    double Nspd;
+    int Ndwl;
+    int Ndbl;
+    int Ndcm;
+    int deg_bl_muxing;  // the constructor uses 1 for DRAM and for FA/CAM arrays, Ndcm for SRAM
+    int deg_senseamp_muxing_non_associativity;
+    int Ndsam_lev_1;
+    int Ndsam_lev_2;
+    int number_addr_bits_mat;             // per port
+    int number_subbanks_decode;           // per_port
+    int num_di_b_bank_per_port;  // di/do: data in/out bit counts
+    int num_do_b_bank_per_port;
+    int num_di_b_mat;
+    int num_do_b_mat;
+    int num_di_b_subbank;
+    int num_do_b_subbank;
+
+    int num_si_b_mat;  // si/so: search data in/out bit counts (fully-associative and CAM arrays)
+    int num_so_b_mat;
+    int num_si_b_subbank;
+    int num_so_b_subbank;
+	int num_si_b_bank_per_port;
+	int num_so_b_bank_per_port;
+
+    int number_way_select_signals_mat;
+    int num_act_mats_hor_dir;
+
+    int num_act_mats_hor_dir_sl;  // set to num_mats_h_dir by the constructor for FA/CAM arrays
+    bool is_dram;
+    double V_b_sense;  // sense-amp input signal level
+    unsigned int num_r_subarray;
+    unsigned int num_c_subarray;
+    int tag_num_r_subarray;//sheng: fully associative cache tag and data must be computed together, data and tag must be separate
+    int tag_num_c_subarray;
+    int data_num_r_subarray;
+    int data_num_c_subarray;
+    int num_mats_h_dir;
+    int num_mats_v_dir;
+    uint32_t ram_cell_tech_type;
+    double dram_refresh_period;  // the constructor stores 0 here for SRAM and FA/CAM arrays
+
+    DynamicParameter();
+    DynamicParameter(
+        bool         is_tag_,
+        int          pure_ram_,
+        int          pure_cam_,
+        double       Nspd_,
+        unsigned int Ndwl_,
+        unsigned int Ndbl_,
+        unsigned int Ndcm_,
+        unsigned int Ndsam_lev_1_,
+        unsigned int Ndsam_lev_2_,
+        bool         is_main_mem_);
+
+    int use_inp_params;
+    unsigned int num_rw_ports;
+    unsigned int num_rd_ports;
+    unsigned int num_wr_ports;
+    unsigned int num_se_rd_ports;  // number of single ended read ports
+    unsigned int num_search_ports;
+    unsigned int out_w;// == nr_bits_out
+    bool   is_main_mem;
+    Area   cell, cam_cell;//cell is the sram_cell in both nomal cache/ram and FA.
+    bool   is_valid;  // set to true on the constructor's last line; its early-return paths skip that assignment
+};
+
+
+
+extern InputParameter * g_ip;
+extern TechnologyParameter g_tp;
+
+#endif
+
diff --git a/src/gpuwattch/cacti/router.cc b/src/gpuwattch/cacti/router.cc
new file mode 100644
index 000000000..9826c69f6
--- /dev/null
+++ b/src/gpuwattch/cacti/router.cc
@@ -0,0 +1,311 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include "router.h"
+
+MCPAT_Router::MCPAT_Router(
+    double flit_size_,
+    double vc_buf, /* vc size = vc_buffer_size * flit_size */
+    double vc_c,
+    TechnologyParameter::DeviceType *dt,
+    double I_,
+    double O_,
+    double M_)
+  : flit_size(flit_size_), deviceType(dt), I(I_), O(O_), M(M_)
+{
+  // Stores the buffer/VC configuration, derives transistor and track
+  // dimensions from the feature size g_ip->F_sz_um, and then runs the
+  // delay, power and area models via calc_router_parameters().
+  const double feature_um = g_ip->F_sz_um;
+
+  Vdd = deviceType->Vdd;
+  vc_count = vc_c;
+  vc_buffer_size = vc_buf;
+  min_w_pmos = dt->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
+
+  /* Crossbar parameters. A transmission gate is employed as the connector. */
+  wt = 15*feature_um*1e-6/2; /*track width*/
+  ht = 15*feature_um*1e-6/2; /*track height*/
+  NTtr = 10*feature_um*1e-6/2; /*Transmission gate's nmos tr. length*/
+  PTtr = 20*feature_um*1e-6/2; /* pmos tr. length*/
+
+  NTi = 12.5*feature_um*1e-6/2;
+  PTi = 25*feature_um*1e-6/2;
+
+  NTid = 60*feature_um*1e-6/2; //m
+  NTod = 60*feature_um*1e-6/2; // m
+  PTid = 120*feature_um*1e-6/2; // m
+  PTod = 120*feature_um*1e-6/2; // m
+
+  // I and O (crossbar input/output port counts) come from the constructor arguments.
+  // Must run last: the models read the members assigned above.
+  calc_router_parameters();
+}
+
+MCPAT_Router::~MCPAT_Router(){}  // empty body: no explicit cleanup is performed here
+
+
+double  // wire capacitance with triple spacing, for a wire of the given length
+MCPAT_Router::Cw3(double length) {
+  Wire triple_spaced(g_ip->wt, length, 1, 3, 3);
+  return triple_spaced.wire_cap(length);
+}
+
+/* Gate capacitance for width w; w is multiplied by 1e6 before being handed to gate_C(). */
+double MCPAT_Router::gate_cap(double w) {
+  const double width_u = w*1e6; /*u*/
+  return static_cast<double>(gate_C(width_u, 0));
+}
+
+/* Diffusion capacitance for width w, device type and stacking depth s, via drain_C_(). */
+double MCPAT_Router::diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/,
+    double s /*number of stacking transistors*/) {
+  const int stack_depth = (int) s;
+  return (double) drain_C_(w*1e6 /*u*/, type, stack_depth, 1, g_tp.cell_h_def);
+}
+
+
+/*crossbar related functions */
+
+// Model for simple transmission gate
+double MCPAT_Router::transmission_buf_inpcap() {
+  const double nmos_diff = diff_cap(NTtr, 0, 1);  // n-mos diffusion cap of the gate
+  return nmos_diff + diff_cap(PTtr, 1, 1);         // plus the p-mos diffusion cap
+}
+
+double MCPAT_Router::transmission_buf_outcap() {
+  const double pmos_diff = diff_cap(PTtr, 1, 1);
+  return diff_cap(NTtr, 0, 1) + pmos_diff;  // n-mos plus p-mos diffusion cap
+}
+
+double MCPAT_Router::transmission_buf_ctrcap() {
+  // sum of the gate capacitances of the NTtr and PTtr devices
+  return gate_cap(NTtr) + gate_cap(PTtr);
+}
+
+double MCPAT_Router::crossbar_inpline() {
+  double cap = Cw3(O*flit_size*wt) + O*transmission_buf_inpcap();  // wire term + O transmission-gate loads
+  cap = cap + gate_cap(NTid) + gate_cap(PTid);                     // NTid/PTid gate caps
+  return cap + diff_cap(NTid, 0, 1) + diff_cap(PTid, 1, 1);        // NTid/PTid diffusion caps
+}
+
+double MCPAT_Router::crossbar_outline() {
+  double cap = Cw3(I*flit_size*ht) + I*transmission_buf_outcap();  // wire term + I transmission-gate loads
+  cap = cap + gate_cap(NTod) + gate_cap(PTod);                     // NTod/PTod gate caps
+  return cap + diff_cap(NTod, 0, 1) + diff_cap(PTod, 1, 1);        // NTod/PTod diffusion caps
+}
+
+double MCPAT_Router::crossbar_ctrline() {
+  // Terms are accumulated in the same left-to-right order as one long sum.
+  double cap = Cw3(0.5*O*flit_size*wt) + flit_size*transmission_buf_ctrcap();
+  cap = cap + diff_cap(NTi, 0, 1) + diff_cap(PTi, 1, 1);
+  return cap + gate_cap(NTi) + gate_cap(PTi);
+}
+
+double MCPAT_Router::tr_crossbar_power() {
+  const double in_term  = crossbar_inpline()*Vdd*Vdd*flit_size/2;
+  const double out_term = crossbar_outline()*Vdd*Vdd*flit_size/2;
+  return (in_term + out_term)*2;
+}
+
+void MCPAT_Router::buffer_stats()
+{
+  // Describes the router buffer as a single-subarray, single-mat RAM with no
+  // muxing, evaluates it with Mat, and copies the power/area into `buffer`.
+  DynamicParameter dyn_p;
+  dyn_p.is_tag      = false;
+  dyn_p.pure_cam    = false;
+  dyn_p.fully_assoc = false;
+  dyn_p.pure_ram    = true;
+  dyn_p.is_dram     = false;
+  dyn_p.is_main_mem = false;
+  dyn_p.num_subarrays = 1;
+  dyn_p.num_mats = 1;
+  dyn_p.Ndbl = 1;
+  dyn_p.Ndwl = 1;
+  dyn_p.Nspd = 1;
+  dyn_p.deg_bl_muxing = 1;
+  dyn_p.deg_senseamp_muxing_non_associativity = 1;
+  dyn_p.Ndsam_lev_1 = 1;
+  dyn_p.Ndsam_lev_2 = 1;
+  dyn_p.Ndcm = 1;
+  dyn_p.number_addr_bits_mat = 8;
+  dyn_p.number_way_select_signals_mat = 1;
+  dyn_p.number_subbanks_decode = 0;
+  dyn_p.num_act_mats_hor_dir = 1;
+  dyn_p.V_b_sense = Vdd; // FIXME check power calc.
+  dyn_p.ram_cell_tech_type = 0;
+  dyn_p.num_r_subarray = (int) vc_buffer_size;
+  dyn_p.num_c_subarray = (int) flit_size * (int) vc_count;
+  dyn_p.num_mats_h_dir = 1;
+  dyn_p.num_mats_v_dir = 1;
+  dyn_p.num_do_b_subbank = (int)flit_size;
+  dyn_p.num_di_b_subbank = (int)flit_size;
+  dyn_p.num_do_b_mat = (int) flit_size;
+  dyn_p.num_di_b_mat = (int) flit_size;
+  dyn_p.num_do_b_bank_per_port = (int) flit_size;
+  dyn_p.num_di_b_bank_per_port = (int) flit_size;
+  dyn_p.out_w = (int) flit_size;
+
+  dyn_p.use_inp_params = 1;
+  dyn_p.num_wr_ports = (unsigned int) vc_count;
+  dyn_p.num_rd_ports = 1;//(unsigned int) vc_count;//based on Bill Dally's book
+  dyn_p.num_rw_ports = 0;
+  dyn_p.num_se_rd_ports =0;
+  dyn_p.num_search_ports =0;
+
+  // Cell height/width grow with the port counts; the wire_outside_mat pitch is used here.
+  dyn_p.cell.h = g_tp.sram.b_h + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_wr_ports +
+      dyn_p.num_rw_ports - 1 + dyn_p.num_rd_ports);
+  dyn_p.cell.w = g_tp.sram.b_w + 2 * g_tp.wire_outside_mat.pitch * (dyn_p.num_rw_ports - 1 +
+      (dyn_p.num_rd_ports - dyn_p.num_se_rd_ports) +
+      dyn_p.num_wr_ports) + g_tp.wire_outside_mat.pitch * dyn_p.num_se_rd_ports;
+
+  Mat buff(dyn_p);
+  buff.compute_delays(0);
+  buff.compute_power_energy();
+  buffer.power.readOp  = buff.power.readOp;
+  // The write figure is a copy of the read figure just stored above.
+  buffer.power.writeOp = buffer.power.readOp; //FIXME
+  buffer.area = buff.area;
+}
+
+
+
+void MCPAT_Router::cb_stats ()
+{
+  // The Crossbar-object model is hard-wired on by the constant condition; the
+  // transmission-gate estimate in the else branch compiles but never executes.
+  if (1) {
+    Crossbar xbar(I, O, flit_size);
+    xbar.compute_power();
+    crossbar.area  = xbar.area;
+    crossbar.delay = xbar.delay;
+    crossbar.power.readOp.gate_leakage = xbar.power.readOp.gate_leakage;
+    crossbar.power.readOp.leakage = xbar.power.readOp.leakage;
+    crossbar.power.readOp.dynamic = xbar.power.readOp.dynamic;
+  }
+  else {
+    // both leakage figures are scaled by flit_size * I * O
+    const double xpoint_scale = flit_size * I * O;
+    crossbar.power.readOp.dynamic = tr_crossbar_power();
+    crossbar.power.readOp.leakage = xpoint_scale * cmos_Isub_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
+    crossbar.power.readOp.gate_leakage = xpoint_scale * cmos_Ig_leakage(NTtr*g_tp.min_w_nmos_, PTtr*min_w_pmos, 1, tg);
+  }
+}
+
+void MCPAT_Router::get_router_power()
+{
+  // Buffer and crossbar stats come first: the arbiters below are constructed
+  // from buffer.area.w and crossbar.area.w.
+  buffer_stats();
+  cb_stats();
+
+  /* arbiter stats: one virtual-channel arbiter model and one crossbar arbiter model */
+  MCPAT_Arbiter vc_arb(vc_count, flit_size, buffer.area.w);
+  MCPAT_Arbiter xbar_arb(I, flit_size, crossbar.area.w);
+  vc_arb.compute_power();
+  xbar_arb.compute_power();
+  arbiter.power.readOp.gate_leakage = vc_arb.power.readOp.gate_leakage * I +
+    xbar_arb.power.readOp.gate_leakage * O;
+  arbiter.power.readOp.leakage = vc_arb.power.readOp.leakage * I +
+    xbar_arb.power.readOp.leakage * O;
+  arbiter.power.readOp.dynamic = vc_arb.power.readOp.dynamic * I +
+    xbar_arb.power.readOp.dynamic * O;
+
+  // Dynamic term: buffer read + write, crossbar and arbiter, scaled by MIN(I, O) and M.
+  const double dyn_sum = (buffer.power.readOp.dynamic+buffer.power.writeOp.dynamic) +
+      crossbar.power.readOp.dynamic +
+      arbiter.power.readOp.dynamic;
+  power.readOp.dynamic = dyn_sum*MIN(I, O)*M;
+
+  // NOTE(review): pppm_lkg presumably selects the leakage components only -- confirm.
+  double buffer_scale[4] = {1,I,I,1};
+  power = power + (buffer.power*buffer_scale + crossbar.power + arbiter.power)*pppm_lkg;
+}
+
+void MCPAT_Router::get_router_delay ()
+{
+  // Starts from a fixed frequency of 5 (GHz) and delay = 4, then lowers the
+  // frequency when the resulting cycle is shorter than 17 FO4 delays.
+  delay = 4;
+  FREQUENCY=5; // move this to config file --TODO
+  cycle_time = (1/(double)FREQUENCY)*1e3; //ps
+  max_cyc = 17 * g_tp.FO4; //s
+  max_cyc *= 1e12; //ps
+  if (!(cycle_time < max_cyc)) return;
+  FREQUENCY = (1/max_cyc)*1e3; //GHz
+}
+
+void MCPAT_Router::get_router_area()
+{
+  // width = buffer width + crossbar width; height = I times the buffer height
+  area.w = buffer.area.w + crossbar.area.w;
+  area.h = I * buffer.area.h;
+}
+
+void MCPAT_Router::calc_router_parameters()
+{
+  // Runs the three model stages in a fixed order: delay, then power,
+  // then area. get_router_area() reads buffer.area and crossbar.area,
+  // which are filled in during get_router_power().
+
+  get_router_delay();   // router frequency and pipeline cycles
+
+  get_router_power();   // router power stats
+
+  get_router_area();    // area stats
+}
+
+void MCPAT_Router::print_router()
+{
+  // One chained insertion; the single endl after the pipeline-stage line is kept.
+  cout << "\n\nMCPAT_Router stats:\n"
+       << "\tMCPAT_Router Area - "<< area.get_area()*1e-6<<"(mm^2)\n"
+       << "\tMaximum possible network frequency - " << (1/max_cyc)*1e3 << "GHz\n"
+       << "\tNetwork frequency - " << FREQUENCY <<" GHz\n"
+       << "\tNo. of Virtual channels - " << vc_count << "\n"
+       << "\tNo. of pipeline stages - " << delay << endl
+       << "\tLink bandwidth - " << flit_size << " (bits)\n"
+       << "\tNo. of buffer entries per virtual channel -  "<< vc_buffer_size << "\n"
+       << "\tSimple buffer Area - "<< buffer.area.get_area()*1e-6<<"(mm^2)\n"
+       << "\tSimple buffer access (Read) - " << buffer.power.readOp.dynamic * 1e9 <<" (nJ)\n"
+       << "\tSimple buffer leakage - " << buffer.power.readOp.leakage * 1e3 <<" (mW)\n"
+       << "\tCrossbar Area - "<< crossbar.area.get_area()*1e-6<<"(mm^2)\n"
+       << "\tCross bar access energy - " << crossbar.power.readOp.dynamic * 1e9<<" (nJ)\n"
+       << "\tCross bar leakage power - " << crossbar.power.readOp.leakage * 1e3<<" (mW)\n"
+       << "\tMCPAT_Arbiter access energy (VC arb + Crossbar arb) - "<<arbiter.power.readOp.dynamic * 1e9 <<" (nJ)\n"
+       << "\tMCPAT_Arbiter leakage (VC arb + Crossbar arb) - "<<arbiter.power.readOp.leakage * 1e3 <<" (mW)\n";
+
+}
+
diff --git a/src/gpuwattch/cacti/router.h b/src/gpuwattch/cacti/router.h
new file mode 100644
index 000000000..21023c496
--- /dev/null
+++ b/src/gpuwattch/cacti/router.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __ROUTER_H__
+#define __ROUTER_H__
+
+#include <assert.h>
+#include <iostream>
+#include "basic_circuit.h"
+#include "cacti_interface.h"
+#include "component.h"
+#include "mat.h"
+#include "parameter.h"
+#include "wire.h"
+#include "crossbar.h"
+#include "arbiter.h"
+
+
+
+// Router model derived from Component. Publishes arbiter, crossbar and buffer
+// sub-components plus the configuration values that print_router() reports.
+class MCPAT_Router : public Component
+{
+  public:
+    MCPAT_Router(
+        double flit_size_,
+        double vc_buf, /* vc size = vc_buffer_size * flit_size */
+        double vc_count,
+        TechnologyParameter::DeviceType *dt = &(g_tp.peri_global),
+        double I_ = 5, // presumably the crossbar input count -- TODO confirm
+        double O_ = 5, // presumably the crossbar output count -- TODO confirm
+        double M_ = 0.6); // presumably the network load kept in M -- TODO confirm
+    ~MCPAT_Router();
+    void print_router(); // prints the router summary
+
+    Component arbiter, crossbar, buffer;
+
+    double cycle_time, max_cyc; // NOTE(review): max_cyc appears to be in ps -- confirm
+    double flit_size;
+    double vc_count;
+    double vc_buffer_size; /* vc size = vc_buffer_size * flit_size */
+
+  private:
+	TechnologyParameter::DeviceType *deviceType;
+	double FREQUENCY; // move this to config file --TODO
+    double Cw3(double len);
+    double gate_cap(double w);
+    double diff_cap(double w, int type /*0 for n-mos and 1 for p-mos*/, double stack);
+    enum Wire_type wtype;
+    enum Wire_placement wire_placement;
+    //crossbar
+    double NTtr, PTtr, wt, ht, I, O, NTi, PTi, NTid, PTid, NTod, PTod, TriS1, TriS2;
+    double M; //network load
+    double transmission_buf_inpcap();
+    double transmission_buf_outcap();
+    double transmission_buf_ctrcap();
+    double crossbar_inpline();
+    double crossbar_outline();
+    double crossbar_ctrline();
+    double tr_crossbar_power();
+    void  cb_stats ();
+    double arb_power();
+    void  arb_stats ();
+    double buffer_params();
+    void buffer_stats();
+
+
+    //arbiter
+
+    //buffer
+
+    //router params
+    double Vdd;
+
+    void calc_router_parameters();
+    void get_router_area();
+    void get_router_power();
+    void get_router_delay();
+
+    double min_w_pmos;
+
+
+};
+
+#endif
diff --git a/src/gpuwattch/cacti/subarray.cc b/src/gpuwattch/cacti/subarray.cc
new file mode 100755
index 000000000..ef5737d3b
--- /dev/null
+++ b/src/gpuwattch/cacti/subarray.cc
@@ -0,0 +1,197 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+
+#include <iostream>
+#include <math.h>
+#include <assert.h>
+
+#include "subarray.h"
+
+
+// Builds one subarray from dp_: adds ECC columns when g_ip->add_ecc_b_ is set,
+// sets area.h / area.w for the RAM or CAM/FA layout, then runs compute_C().
+Subarray::Subarray(const DynamicParameter & dp_, bool is_fa_):
+  dp(dp_), num_rows(dp.num_r_subarray), num_cols(dp.num_c_subarray),
+  num_cols_fa_cam(dp.tag_num_c_subarray), num_cols_fa_ram(dp.data_num_c_subarray),
+  cell(dp.cell), cam_cell(dp.cam_cell), is_fa(is_fa_)
+{
+	// Arrays that are neither fully associative nor pure CAM take the first
+	// branch; fully associative and pure CAM arrays take the second.
+  if (!(is_fa || dp.pure_cam))
+  {
+	  num_cols +=(g_ip->add_ecc_b_ ? (int)ceil(num_cols / num_bits_per_ecc_b_) : 0);   // ECC overhead
+	  uint32_t ram_num_cells_wl_stitching =
+		  (dp.ram_cell_tech_type == lp_dram)   ? dram_num_cells_wl_stitching_ :
+	  (dp.ram_cell_tech_type == comm_dram) ? comm_dram_num_cells_wl_stitching_ : sram_num_cells_wl_stitching_;
+	  area.h = cell.h * num_rows;
+	  // NOTE(review): both operands of the division below are unsigned integers, so ceil() sees an already-truncated quotient -- confirm this is intended.
+	  area.w = cell.w * num_cols +
+	  ceil(num_cols / ram_num_cells_wl_stitching) * g_tp.ram_wl_stitching_overhead_;  // stitching overhead
+  }
+  else  //cam fa
+  {
+	  //a dummy row should not be added here since the dummy row does not need a decoder
+	  if (is_fa)// fully associative cache
+	  {
+		  num_cols_fa_cam  += g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_) : 0;
+		  num_cols_fa_ram  += (g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_ram / num_bits_per_ecc_b_) : 0);
+		  num_cols = num_cols_fa_cam + num_cols_fa_ram;
+	  }
+	  else
+	  {
+		  num_cols_fa_cam  += g_ip->add_ecc_b_ ? (int)ceil(num_cols_fa_cam / num_bits_per_ecc_b_) : 0;
+		  num_cols_fa_ram  = 0;
+		  num_cols = num_cols_fa_cam;
+	  }
+
+	  area.h = cam_cell.h * (num_rows + 1);//height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells
+	  area.w = cam_cell.w * num_cols_fa_cam + cell.w * num_cols_fa_ram
+	  + ceil((num_cols_fa_cam + num_cols_fa_ram) / sram_num_cells_wl_stitching_)*g_tp.ram_wl_stitching_overhead_
+	  + 16*g_tp.wire_local.pitch //the overhead for the NAND gate to connect the two halves
+	  + 128*g_tp.wire_local.pitch;//the overhead for the drivers from matchline to wordline of RAM
+  }
+
+  assert(area.h>0);
+  assert(area.w>0);
+  compute_C();
+}
+
+
+
+// Empty destructor: no explicit cleanup is performed here.
+Subarray::~Subarray()
+{}
+
+
+
+// Returns the summed cell area of this subarray for its organisation:
+//   - fully associative: (num_rows+1) rows of CAM height spanning both the CAM
+//     and the RAM columns, so dummy cells in the SRAM array are included;
+//   - pure CAM: CAM cells only, again over (num_rows+1) rows;
+//   - otherwise: a plain num_rows x num_cols array of RAM cells.
+double Subarray::get_total_cell_area()
+{
+  // Fully associative arrays are handled first, whatever dp.pure_cam says.
+  if (is_fa)
+  {
+    return (cam_cell.h*(num_rows+1)*(cam_cell.w*num_cols_fa_cam + cell.w*num_cols_fa_ram));
+  }
+
+  if (dp.pure_cam)
+  {
+    return (cam_cell.get_area()*(num_rows+1)*num_cols_fa_cam );
+  }
+
+  return (cell.get_area() * num_rows * num_cols);
+}
+
+
+
+// Fills in the wordline/bitline parasitics. Every path sets C_wl and C_bl; only
+// the CAM/FA path also sets R_wl, C_wl_cam, R_wl_cam, C_wl_ram, R_wl_ram and
+// C_bl_cam -- the DRAM and plain-SRAM paths do not assign those members here.
+void Subarray::compute_C()
+{
+  double c_w_metal = cell.w * g_tp.wire_local.C_per_um;
+  double r_w_metal = cell.w * g_tp.wire_local.R_per_um;
+  double C_b_metal = cell.h * g_tp.wire_local.C_per_um;
+  double C_b_row_drain_C;
+  if (dp.is_dram)
+  {
+    C_wl = (gate_C_pass(g_tp.dram.cell_a_w, g_tp.dram.b_w, true, true) + c_w_metal) * num_cols;
+    if (dp.ram_cell_tech_type == comm_dram)
+    {
+      C_bl = num_rows * C_b_metal;
+    }
+    else
+    {
+      C_b_row_drain_C = drain_C_(g_tp.dram.cell_a_w, NCH, 1, 0, cell.w, true, true) / 2.0;  // due to shared contact
+      C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
+    }
+  }
+  else
+  {
+	  if (!(is_fa ||dp.pure_cam))
+	  {
+		  C_wl = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
+				  c_w_metal) * num_cols;
+		  C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;  // due to shared contact
+		  C_bl = num_rows * (C_b_row_drain_C + C_b_metal);
+	  }
+	  else
+	  {
+		 //The following models the wordline, not the matchline
+		 //CAM portion: the per-cell metal terms are recomputed from cam_cell.w
+		 c_w_metal = cam_cell.w * g_tp.wire_local.C_per_um;
+		 r_w_metal = cam_cell.w * g_tp.wire_local.R_per_um;
+         C_wl_cam = (gate_C_pass(g_tp.cam.cell_a_w, (g_tp.cam.b_w-2*g_tp.cam.cell_a_w)/2.0, false, true)*2 +
+				  c_w_metal) * num_cols_fa_cam;
+         R_wl_cam = (r_w_metal) * num_cols_fa_cam;
+
+         if (!dp.pure_cam)
+         {
+        	 //RAM portion: the per-cell metal terms switch back to cell.w
+        	 c_w_metal = cell.w * g_tp.wire_local.C_per_um;
+        	 r_w_metal = cell.w * g_tp.wire_local.R_per_um;
+        	 C_wl_ram = (gate_C_pass(g_tp.sram.cell_a_w, (g_tp.sram.b_w-2*g_tp.sram.cell_a_w)/2.0, false, true)*2 +
+        			 c_w_metal) * num_cols_fa_ram;
+        	 R_wl_ram = (r_w_metal) * num_cols_fa_ram;
+         }
+         else
+         {
+        	 C_wl_ram = R_wl_ram =0;
+         }
+         C_wl = C_wl_cam + C_wl_ram;
+         C_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.C_per_um;
+
+         R_wl = R_wl_cam + R_wl_ram;
+         R_wl += (16+128)*g_tp.wire_local.pitch*g_tp.wire_local.R_per_um;
+
+         //there are two ways to write to a FA,
+         //1) Write to CAM array then force a match on match line to activate the corresponding wordline in RAM;
+         //2) using separate wordline for read/write and search in RAM.
+         //We are using the second approach.
+
+         //Bitline CAM portion This is bitline not searchline. We assume no sharing between bitline and searchline according to SUN's implementations.
+         C_b_metal = cam_cell.h * g_tp.wire_local.C_per_um;
+         C_b_row_drain_C = drain_C_(g_tp.cam.cell_a_w, NCH, 1, 0, cam_cell.w, false, true) / 2.0;  // due to shared contact
+         C_bl_cam = (num_rows+1) * (C_b_row_drain_C + C_b_metal);
+         //C_b_metal keeps its cam_cell.h value below: height of subarray is decided by CAM array. blank space in sram array are filled with dummy cells
+         C_b_row_drain_C = drain_C_(g_tp.sram.cell_a_w, NCH, 1, 0, cell.w, false, true) / 2.0;  // due to shared contact
+         C_bl = (num_rows +1) * (C_b_row_drain_C + C_b_metal);
+	  }
+  }
+}
+
+
diff --git a/src/gpuwattch/cacti/subarray.h b/src/gpuwattch/cacti/subarray.h
new file mode 100755
index 000000000..5fb062420
--- /dev/null
+++ b/src/gpuwattch/cacti/subarray.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __SUBARRAY_H__
+#define __SUBARRAY_H__
+
+#include "area.h"
+#include "component.h"
+#include "parameter.h"
+
+using namespace std;
+
+
+// One memory subarray (a Component): geometry taken from a DynamicParameter
+// plus the wordline/bitline R and C members that compute_C() assigns.
+class Subarray : public Component
+{
+  public:
+    Subarray(const DynamicParameter & dp, bool is_fa_);
+    ~Subarray();
+    const DynamicParameter & dp; // reference member: the constructor argument must outlive this object
+    double  get_total_cell_area();
+    unsigned int num_rows;
+    unsigned int num_cols;
+    int32_t num_cols_fa_cam;
+    int32_t num_cols_fa_ram;
+    Area    cell, cam_cell;
+
+    bool    is_fa;
+    double  C_wl, C_wl_cam, C_wl_ram;
+    double  R_wl, R_wl_cam, R_wl_ram;
+    double  C_bl, C_bl_cam;
+  private:
+    void compute_C();  // compute bitline and wordline capacitance
+};
+
+
+
+#endif
+
diff --git a/src/gpuwattch/cacti/technology.cc b/src/gpuwattch/cacti/technology.cc
new file mode 100644
index 000000000..7067470f7
--- /dev/null
+++ b/src/gpuwattch/cacti/technology.cc
@@ -0,0 +1,2917 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+#include "basic_circuit.h"
+
+#include "parameter.h"
+
+// Scattering-scaled resistivity divided by the conducting cross-section left after barrier and dishing losses.
+double wire_resistance(double resistivity, double wire_width, double wire_thickness,
+    double barrier_thickness, double dishing_thickness, double alpha_scatter) {
+  const double conducting_height = wire_thickness - barrier_thickness - dishing_thickness;
+  const double conducting_width = wire_width - 2 * barrier_thickness;
+  return alpha_scatter * resistivity / (conducting_height * conducting_width);
+}
+
+// Sums a vertical term (width / ILD thickness), a Miller-scaled sidewall term (thickness / spacing) and fringe_cap.
+double wire_capacitance(double wire_width, double wire_thickness, double wire_spacing,
+    double ild_thickness, double miller_value, double horiz_dielectric_constant,
+    double vert_dielectric_constant, double fringe_cap) {
+  const double cap_vertical =
+      2 * PERMITTIVITY_FREE_SPACE * vert_dielectric_constant * wire_width / ild_thickness;
+  const double cap_sidewall =
+      2 * PERMITTIVITY_FREE_SPACE * miller_value * horiz_dielectric_constant * wire_thickness / wire_spacing;
+  return cap_vertical + cap_sidewall + fringe_cap;
+}
+
+
+void init_tech_params(double technology, bool is_tag)
+{
+  int   iter = 0;
+  int 	tech = 0;
+  int 	tech_lo = 0;
+  int 	tech_hi = 0;
+  double curr_alpha = 0;
+  double curr_vpp = 0;
+  double wire_width = 0;
+  double wire_thickness =0;
+  double wire_spacing = 0;
+  double fringe_cap = 0;
+  double pmos_to_nmos_sizing_r = 0;
+//  double aspect_ratio,ild_thickness, miller_value = 1.5, horiz_dielectric_constant, vert_dielectric_constant;
+  double barrier_thickness = 0;
+  double dishing_thickness = 0;
+  double alpha_scatter = 0;
+  double curr_vdd_dram_cell = 0;
+  double curr_v_th_dram_access_transistor = 0;
+  double curr_I_on_dram_cell = 0;
+  double curr_c_dram_cell = 0;
+
+  uint32_t ram_cell_tech_type    = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type : g_ip->data_arr_ram_cell_tech_type;
+  uint32_t peri_global_tech_type = (is_tag) ? g_ip->tag_arr_peri_global_tech_type : g_ip->data_arr_peri_global_tech_type;
+
+  technology  = technology * 1000.0;  // in the unit of nm
+
+  // initialize parameters
+  g_tp.reset();
+  double gmp_to_gmn_multiplier_periph_global = 0;
+
+  double curr_Wmemcella_dram = 0;
+  double curr_Wmemcellpmos_dram = 0;
+  double curr_Wmemcellnmos_dram = 0;
+  double curr_area_cell_dram = 0;
+  double curr_asp_ratio_cell_dram = 0;
+  double curr_Wmemcella_sram = 0;
+  double curr_Wmemcellpmos_sram = 0;
+  double curr_Wmemcellnmos_sram = 0;
+  double curr_area_cell_sram = 0;
+  double curr_asp_ratio_cell_sram = 0;
+  double curr_I_off_dram_cell_worst_case_length_temp = 0;
+  double curr_Wmemcella_cam = 0;
+  double curr_Wmemcellpmos_cam = 0;
+  double curr_Wmemcellnmos_cam = 0;
+  double curr_area_cell_cam = 0;//Sheng: CAM data
+  double curr_asp_ratio_cell_cam = 0;
+  double SENSE_AMP_D, SENSE_AMP_P; // J
+  double area_cell_dram = 0;
+  double asp_ratio_cell_dram = 0;
+  double area_cell_sram = 0;
+  double asp_ratio_cell_sram = 0;
+  double area_cell_cam = 0;
+  double asp_ratio_cell_cam = 0;
+  double mobility_eff_periph_global = 0;
+  double Vdsat_periph_global = 0;
+  double nmos_effective_resistance_multiplier;
+  double width_dram_access_transistor;
+
+  double curr_logic_scaling_co_eff = 0;//This is based on the reported numbers of Intel Merom 65nm, Penryn45nm and IBM cell 90/65/45 date
+  double curr_core_tx_density = 0;//this is density per um^2; 90, ...22nm based on Intel Penryn
+  double curr_chip_layout_overhead = 0;
+  double curr_macro_layout_overhead = 0;
+  double curr_sckt_co_eff = 0;
+
+  if (technology < 181 && technology > 179)
+      {
+        tech_lo = 180;
+        tech_hi = 180;
+      }
+  else if (technology < 91 && technology > 89)
+  {
+    tech_lo = 90;
+    tech_hi = 90;
+  }
+  else if (technology < 66 && technology > 64)
+  {
+    tech_lo = 65;
+    tech_hi = 65;
+  }
+  else if (technology < 46 && technology > 44)
+  {
+    tech_lo = 45;
+    tech_hi = 45;
+  }
+  else if (technology < 33 && technology > 31)
+  {
+    tech_lo = 32;
+    tech_hi = 32;
+  }
+  else if (technology < 23 && technology > 21)
+  {
+    tech_lo = 22;
+    tech_hi = 22;
+    if (ram_cell_tech_type == 3 )
+    {
+       cout<<"current version does not support eDRAM technologies at 22nm"<<endl;
+       exit(0);
+    }
+  }
+//  else if (technology < 17 && technology > 15)
+//  {
+//    tech_lo = 16;
+//    tech_hi = 16;
+//  }
+  else if (technology < 180 && technology > 90)
+    {
+      tech_lo = 180;
+      tech_hi = 90;
+    }
+  else if (technology < 90 && technology > 65)
+  {
+    tech_lo = 90;
+    tech_hi = 65;
+  }
+  else if (technology < 65 && technology > 45)
+  {
+    tech_lo = 65;
+    tech_hi = 45;
+  }
+  else if (technology < 45 && technology > 32)
+  {
+    tech_lo = 45;
+    tech_hi = 32;
+  }
+  else if (technology < 32 && technology > 22)
+    {
+      tech_lo = 32;
+      tech_hi = 22;
+    }
+//  else if (technology < 22 && technology > 16)
+//    {
+//      tech_lo = 22;
+//      tech_hi = 16;
+//    }
+      else
+    {
+  	  cout<<"Invalid technology nodes"<<endl;
+  	  exit(0);
+    }
+
+  double vdd[NUMBER_TECH_FLAVORS];
+  double Lphy[NUMBER_TECH_FLAVORS];
+  double Lelec[NUMBER_TECH_FLAVORS];
+  double t_ox[NUMBER_TECH_FLAVORS];
+  double v_th[NUMBER_TECH_FLAVORS];
+  double c_ox[NUMBER_TECH_FLAVORS];
+  double mobility_eff[NUMBER_TECH_FLAVORS];
+  double Vdsat[NUMBER_TECH_FLAVORS];
+  double c_g_ideal[NUMBER_TECH_FLAVORS];
+  double c_fringe[NUMBER_TECH_FLAVORS];
+  double c_junc[NUMBER_TECH_FLAVORS];
+  double I_on_n[NUMBER_TECH_FLAVORS];
+  double Rnchannelon[NUMBER_TECH_FLAVORS];
+  double Rpchannelon[NUMBER_TECH_FLAVORS];
+  double n_to_p_eff_curr_drv_ratio[NUMBER_TECH_FLAVORS];
+  double I_off_n[NUMBER_TECH_FLAVORS][101];
+  double I_g_on_n[NUMBER_TECH_FLAVORS][101];
+  //double I_off_p[NUMBER_TECH_FLAVORS][101];
+  double gmp_to_gmn_multiplier[NUMBER_TECH_FLAVORS];
+  //double curr_sckt_co_eff[NUMBER_TECH_FLAVORS];
+  double long_channel_leakage_reduction[NUMBER_TECH_FLAVORS];
+
+  for (iter = 0; iter <= 1; ++iter)
+  {
+    // linear interpolation
+    if (iter == 0)
+    {
+      tech = tech_lo;
+      if (tech_lo == tech_hi)
+      {
+        curr_alpha = 1;
+      }
+      else
+      {
+        curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi);
+      }
+    }
+    else
+    {
+      tech = tech_hi;
+      if (tech_lo == tech_hi)
+      {
+        break;
+      }
+      else
+      {
+        curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 180)
+    {
+      //180nm technology-node. Corresponds to year 1999 in ITRS
+      //Only HP transistor was of interest that 180nm since leakage power was not a big issue. Performance was the king
+      //MASTAR does not contain data for 0.18um process. The following parameters are projected based on ITRS 2000 update and IBM 0.18 Cu Spice input
+      bool Aggre_proj = false;
+      SENSE_AMP_D = .28e-9; // s
+      SENSE_AMP_P = 14.7e-15; // J
+      vdd[0]   = 1.5;
+      Lphy[0]  = 0.12;//Lphy is the physical gate-length. micron
+      Lelec[0] = 0.10;//Lelec is the electrical gate-length. micron
+      t_ox[0]  = 1.2e-3*(Aggre_proj? 1.9/1.2:2);//micron
+      v_th[0]  = Aggre_proj? 0.36 : 0.4407;//V
+      c_ox[0]  = 1.79e-14*(Aggre_proj? 1.9/1.2:2);//F/micron2
+      mobility_eff[0] = 302.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+      Vdsat[0] = 0.128*2; //V
+      c_g_ideal[0] = (Aggre_proj? 1.9/1.2:2)*6.64e-16;//F/micron
+      c_fringe[0]  = (Aggre_proj? 1.9/1.2:2)*0.08e-15;//F/micron
+      c_junc[0] = (Aggre_proj? 1.9/1.2:2)*1e-15;//F/micron2
+      I_on_n[0] = 750e-6;//A/micron
+      //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline
+      nmos_effective_resistance_multiplier = 1.54;
+      n_to_p_eff_curr_drv_ratio[0] = 2.45;
+      gmp_to_gmn_multiplier[0] = 1.22;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1;
+      I_off_n[0][0]  = 7e-10;//A/micron
+      I_off_n[0][10] = 8.26e-10;
+      I_off_n[0][20] = 9.74e-10;
+      I_off_n[0][30] = 1.15e-9;
+      I_off_n[0][40] = 1.35e-9;
+      I_off_n[0][50] = 1.60e-9;
+      I_off_n[0][60] = 1.88e-9;
+      I_off_n[0][70] = 2.29e-9;
+      I_off_n[0][80] = 2.70e-9;
+      I_off_n[0][90] = 3.19e-9;
+      I_off_n[0][100] = 3.76e-9;
+
+      I_g_on_n[0][0]  = 1.65e-10;//A/micron
+      I_g_on_n[0][10] = 1.65e-10;
+      I_g_on_n[0][20] = 1.65e-10;
+      I_g_on_n[0][30] = 1.65e-10;
+      I_g_on_n[0][40] = 1.65e-10;
+      I_g_on_n[0][50] = 1.65e-10;
+      I_g_on_n[0][60] = 1.65e-10;
+      I_g_on_n[0][70] = 1.65e-10;
+      I_g_on_n[0][80] = 1.65e-10;
+      I_g_on_n[0][90] = 1.65e-10;
+      I_g_on_n[0][100] = 1.65e-10;
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360
+      curr_asp_ratio_cell_cam = 2.92;//2.5
+      //Empirical undifferetiated core/FU coefficient
+      curr_logic_scaling_co_eff  = 1.5;//linear scaling from 90nm
+      curr_core_tx_density       = 1.25*0.7*0.7*0.4;
+      curr_sckt_co_eff           = 1.11;
+      curr_chip_layout_overhead  = 1.0;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.0;//EDA placement and routing tool rule of thumb
+
+    }
+
+    if (tech == 90)
+    {
+      SENSE_AMP_D = .28e-9; // s
+      SENSE_AMP_P = 14.7e-15; // J
+      //90nm technology-node. Corresponds to year 2004 in ITRS
+      //ITRS HP device type
+      vdd[0]   = 1.2;
+      Lphy[0]  = 0.037;//Lphy is the physical gate-length. micron
+      Lelec[0] = 0.0266;//Lelec is the electrical gate-length. micron
+      t_ox[0]  = 1.2e-3;//micron
+      v_th[0]  = 0.23707;//V
+      c_ox[0]  = 1.79e-14;//F/micron2
+      mobility_eff[0] = 342.16 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+      Vdsat[0] = 0.128; //V
+      c_g_ideal[0] = 6.64e-16;//F/micron
+      c_fringe[0]  = 0.08e-15;//F/micron
+      c_junc[0] = 1e-15;//F/micron2
+      I_on_n[0] = 1076.9e-6;//A/micron
+      //I_on_p[0] = 712.6e-6;//A/micron
+      //Note that nmos_effective_resistance_multiplier, n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are calculated offline
+      nmos_effective_resistance_multiplier = 1.54;
+      n_to_p_eff_curr_drv_ratio[0] = 2.45;
+      gmp_to_gmn_multiplier[0] = 1.22;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1;
+      I_off_n[0][0]  = 3.24e-8;//A/micron
+      I_off_n[0][10] = 4.01e-8;
+      I_off_n[0][20] = 4.90e-8;
+      I_off_n[0][30] = 5.92e-8;
+      I_off_n[0][40] = 7.08e-8;
+      I_off_n[0][50] = 8.38e-8;
+      I_off_n[0][60] = 9.82e-8;
+      I_off_n[0][70] = 1.14e-7;
+      I_off_n[0][80] = 1.29e-7;
+      I_off_n[0][90] = 1.43e-7;
+      I_off_n[0][100] = 1.54e-7;
+
+      I_g_on_n[0][0]  = 1.65e-8;//A/micron
+      I_g_on_n[0][10] = 1.65e-8;
+      I_g_on_n[0][20] = 1.65e-8;
+      I_g_on_n[0][30] = 1.65e-8;
+      I_g_on_n[0][40] = 1.65e-8;
+      I_g_on_n[0][50] = 1.65e-8;
+      I_g_on_n[0][60] = 1.65e-8;
+      I_g_on_n[0][70] = 1.65e-8;
+      I_g_on_n[0][80] = 1.65e-8;
+      I_g_on_n[0][90] = 1.65e-8;
+      I_g_on_n[0][100] = 1.65e-8;
+
+      //ITRS LSTP device type
+      vdd[1]   = 1.3;
+      Lphy[1]  = 0.075;
+      Lelec[1] = 0.0486;
+      t_ox[1]  = 2.2e-3;
+      v_th[1]  = 0.48203;
+      c_ox[1]  = 1.22e-14;
+      mobility_eff[1] = 356.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.373;
+      c_g_ideal[1] = 9.15e-16;
+      c_fringe[1]  = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 503.6e-6;
+      nmos_effective_resistance_multiplier = 1.92;
+      n_to_p_eff_curr_drv_ratio[1] = 2.44;
+      gmp_to_gmn_multiplier[1] =0.88;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1;
+      I_off_n[1][0]  = 2.81e-12;
+      I_off_n[1][10] = 4.76e-12;
+      I_off_n[1][20] = 7.82e-12;
+      I_off_n[1][30] = 1.25e-11;
+      I_off_n[1][40] = 1.94e-11;
+      I_off_n[1][50] = 2.94e-11;
+      I_off_n[1][60] = 4.36e-11;
+      I_off_n[1][70] = 6.32e-11;
+      I_off_n[1][80] = 8.95e-11;
+      I_off_n[1][90] = 1.25e-10;
+      I_off_n[1][100] = 1.7e-10;
+
+      I_g_on_n[1][0]  = 3.87e-11;//A/micron
+      I_g_on_n[1][10] = 3.87e-11;
+      I_g_on_n[1][20] = 3.87e-11;
+      I_g_on_n[1][30] = 3.87e-11;
+      I_g_on_n[1][40] = 3.87e-11;
+      I_g_on_n[1][50] = 3.87e-11;
+      I_g_on_n[1][60] = 3.87e-11;
+      I_g_on_n[1][70] = 3.87e-11;
+      I_g_on_n[1][80] = 3.87e-11;
+      I_g_on_n[1][90] = 3.87e-11;
+      I_g_on_n[1][100] = 3.87e-11;
+
+      //ITRS LOP device type
+      vdd[2] = 0.9;
+      Lphy[2] = 0.053;
+      Lelec[2] = 0.0354;
+      t_ox[2] = 1.5e-3;
+      v_th[2] = 0.30764;
+      c_ox[2] = 1.59e-14;
+      mobility_eff[2] = 460.39 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.113;
+      c_g_ideal[2] = 8.45e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 386.6e-6;
+      nmos_effective_resistance_multiplier = 1.77;
+      n_to_p_eff_curr_drv_ratio[2] = 2.54;
+      gmp_to_gmn_multiplier[2] = 0.98;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1;
+      I_off_n[2][0] = 2.14e-9;
+      I_off_n[2][10] = 2.9e-9;
+      I_off_n[2][20] = 3.87e-9;
+      I_off_n[2][30] = 5.07e-9;
+      I_off_n[2][40] = 6.54e-9;
+      I_off_n[2][50] = 8.27e-8;
+      I_off_n[2][60] = 1.02e-7;
+      I_off_n[2][70] = 1.20e-7;
+      I_off_n[2][80] = 1.36e-8;
+      I_off_n[2][90] = 1.52e-8;
+      I_off_n[2][100] = 1.73e-8;
+
+      I_g_on_n[2][0]  = 4.31e-8;//A/micron
+      I_g_on_n[2][10] = 4.31e-8;
+      I_g_on_n[2][20] = 4.31e-8;
+      I_g_on_n[2][30] = 4.31e-8;
+      I_g_on_n[2][40] = 4.31e-8;
+      I_g_on_n[2][50] = 4.31e-8;
+      I_g_on_n[2][60] = 4.31e-8;
+      I_g_on_n[2][70] = 4.31e-8;
+      I_g_on_n[2][80] = 4.31e-8;
+      I_g_on_n[2][90] = 4.31e-8;
+      I_g_on_n[2][100] = 4.31e-8;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.4545;
+        width_dram_access_transistor = 0.14;
+        curr_I_on_dram_cell = 45e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 21.1e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.168;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.4545;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] =  323.95 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.3;
+        c_g_ideal[3] = 1.47e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 321.6e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.42e-11;
+        I_off_n[3][10] = 2.25e-11;
+        I_off_n[3][20] = 3.46e-11;
+        I_off_n[3][30] = 5.18e-11;
+        I_off_n[3][40] = 7.58e-11;
+        I_off_n[3][50] = 1.08e-10;
+        I_off_n[3][60] = 1.51e-10;
+        I_off_n[3][70] = 2.02e-10;
+        I_off_n[3][80] = 2.57e-10;
+        I_off_n[3][90] = 3.14e-10;
+        I_off_n[3][100] = 3.85e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.6;
+        Lphy[3] = 0.09;
+        Lelec[3] = 0.0576;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.09*0.09;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.7;
+        t_ox[3] = 5.5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 5.65e-15;
+        mobility_eff[3] =  302.2 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.32;
+        c_g_ideal[3] = 5.08e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1094.3e-6;
+        nmos_effective_resistance_multiplier = 1.62;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 5.80e-15;
+        I_off_n[3][10] = 1.21e-14;
+        I_off_n[3][20] = 2.42e-14;
+        I_off_n[3][30] = 4.65e-14;
+        I_off_n[3][40] = 8.60e-14;
+        I_off_n[3][50] = 1.54e-13;
+        I_off_n[3][60] = 2.66e-13;
+        I_off_n[3][70] = 4.45e-13;
+        I_off_n[3][80] = 7.17e-13;
+        I_off_n[3][90] = 1.11e-12;
+        I_off_n[3][100] = 1.67e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;//360
+      curr_asp_ratio_cell_cam = 2.92;//2.5
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff  = 1;
+      curr_core_tx_density       = 1.25*0.7*0.7;
+      curr_sckt_co_eff           = 1.1539;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+
+
+    }
+
+    if (tech == 65)
+    { //65nm technology-node. Corresponds to year 2007 in ITRS
+      //ITRS HP device type
+      SENSE_AMP_D = .2e-9; // s
+      SENSE_AMP_P = 5.7e-15; // J
+      vdd[0] = 1.1;
+      Lphy[0] = 0.025;
+      Lelec[0] = 0.019;
+      t_ox[0] = 1.1e-3;
+      v_th[0] = .19491;
+      c_ox[0] = 1.88e-14;
+      mobility_eff[0] = 436.24 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 7.71e-2;
+      c_g_ideal[0] = 4.69e-16;
+      c_fringe[0] = 0.077e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 1197.2e-6;
+      nmos_effective_resistance_multiplier = 1.50;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] = 1/3.74;
+      //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate increase by 10%, whichever comes first
+      //Ioff(Lgate normal)/Ioff(Lgate long)= 3.74.
+      I_off_n[0][0] = 1.96e-7;
+      I_off_n[0][10] = 2.29e-7;
+      I_off_n[0][20] = 2.66e-7;
+      I_off_n[0][30] = 3.05e-7;
+      I_off_n[0][40] = 3.49e-7;
+      I_off_n[0][50] = 3.95e-7;
+      I_off_n[0][60] = 4.45e-7;
+      I_off_n[0][70] = 4.97e-7;
+      I_off_n[0][80] = 5.48e-7;
+      I_off_n[0][90] = 5.94e-7;
+      I_off_n[0][100] = 6.3e-7;
+      I_g_on_n[0][0]  = 4.09e-8;//A/micron
+      I_g_on_n[0][10] = 4.09e-8;
+      I_g_on_n[0][20] = 4.09e-8;
+      I_g_on_n[0][30] = 4.09e-8;
+      I_g_on_n[0][40] = 4.09e-8;
+      I_g_on_n[0][50] = 4.09e-8;
+      I_g_on_n[0][60] = 4.09e-8;
+      I_g_on_n[0][70] = 4.09e-8;
+      I_g_on_n[0][80] = 4.09e-8;
+      I_g_on_n[0][90] = 4.09e-8;
+      I_g_on_n[0][100] = 4.09e-8;
+
+      //ITRS LSTP device type
+      vdd[1] = 1.2;
+      Lphy[1] = 0.045;
+      Lelec[1] = 0.0298;
+      t_ox[1] = 1.9e-3;
+      v_th[1] = 0.52354;
+      c_ox[1] = 1.36e-14;
+      mobility_eff[1] = 341.21 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.128;
+      c_g_ideal[1] = 6.14e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 519.2e-6;
+      nmos_effective_resistance_multiplier = 1.96;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/2.82;
+      I_off_n[1][0] = 9.12e-12;
+      I_off_n[1][10] = 1.49e-11;
+      I_off_n[1][20] = 2.36e-11;
+      I_off_n[1][30] = 3.64e-11;
+      I_off_n[1][40] = 5.48e-11;
+      I_off_n[1][50] = 8.05e-11;
+      I_off_n[1][60] = 1.15e-10;
+      I_off_n[1][70] = 1.59e-10;
+      I_off_n[1][80] = 2.1e-10;
+      I_off_n[1][90] = 2.62e-10;
+      I_off_n[1][100] = 3.21e-10;
+
+      I_g_on_n[1][0]  = 1.09e-10;//A/micron
+      I_g_on_n[1][10] = 1.09e-10;
+      I_g_on_n[1][20] = 1.09e-10;
+      I_g_on_n[1][30] = 1.09e-10;
+      I_g_on_n[1][40] = 1.09e-10;
+      I_g_on_n[1][50] = 1.09e-10;
+      I_g_on_n[1][60] = 1.09e-10;
+      I_g_on_n[1][70] = 1.09e-10;
+      I_g_on_n[1][80] = 1.09e-10;
+      I_g_on_n[1][90] = 1.09e-10;
+      I_g_on_n[1][100] = 1.09e-10;
+
+      //ITRS LOP device type
+      vdd[2] = 0.8;
+      Lphy[2] = 0.032;
+      Lelec[2] = 0.0216;
+      t_ox[2] = 1.2e-3;
+      v_th[2] = 0.28512;
+      c_ox[2] = 1.87e-14;
+      mobility_eff[2] = 495.19 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.292;
+      c_g_ideal[2] = 6e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 573.1e-6;
+      nmos_effective_resistance_multiplier = 1.82;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/2.05;
+      I_off_n[2][0] = 4.9e-9;
+      I_off_n[2][10] = 6.49e-9;
+      I_off_n[2][20] = 8.45e-9;
+      I_off_n[2][30] = 1.08e-8;
+      I_off_n[2][40] = 1.37e-8;
+      I_off_n[2][50] = 1.71e-8;
+      I_off_n[2][60] = 2.09e-8;
+      I_off_n[2][70] = 2.48e-8;
+      I_off_n[2][80] = 2.84e-8;
+      I_off_n[2][90] = 3.13e-8;
+      I_off_n[2][100] = 3.42e-8;
+
+      I_g_on_n[2][0]  = 9.61e-9;//A/micron
+      I_g_on_n[2][10] = 9.61e-9;
+      I_g_on_n[2][20] = 9.61e-9;
+      I_g_on_n[2][30] = 9.61e-9;
+      I_g_on_n[2][40] = 9.61e-9;
+      I_g_on_n[2][50] = 9.61e-9;
+      I_g_on_n[2][60] = 9.61e-9;
+      I_g_on_n[2][70] = 9.61e-9;
+      I_g_on_n[2][80] = 9.61e-9;
+      I_g_on_n[2][90] = 9.61e-9;
+      I_g_on_n[2][100] = 9.61e-9;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.43806;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 19.6e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.11;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.43806;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] =  328.32 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.43806;
+        c_g_ideal[3] = 1.46e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15 ;
+        I_on_n[3] = 399.8e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 2.23e-11;
+        I_off_n[3][10] = 3.46e-11;
+        I_off_n[3][20] = 5.24e-11;
+        I_off_n[3][30] = 7.75e-11;
+        I_off_n[3][40] = 1.12e-10;
+        I_off_n[3][50] = 1.58e-10;
+        I_off_n[3][60] = 2.18e-10;
+        I_off_n[3][70] = 2.88e-10;
+        I_off_n[3][80] = 3.63e-10;
+        I_off_n[3][90] = 4.41e-10;
+        I_off_n[3][100] = 5.36e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.3;
+        Lphy[3] = 0.065;
+        Lelec[3] = 0.0426;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.065;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.065*0.065;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.3;
+        t_ox[3] = 5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 6.16e-15;
+        mobility_eff[3] =  303.44 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.385;
+        c_g_ideal[3] = 4e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15 ;
+        I_on_n[3] = 1031e-6;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 2.39;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 1.80e-14;
+        I_off_n[3][10] = 3.64e-14;
+        I_off_n[3][20] = 7.03e-14;
+        I_off_n[3][30] = 1.31e-13;
+        I_off_n[3][40] = 2.35e-13;
+        I_off_n[3][50] = 4.09e-13;
+        I_off_n[3][60] = 6.89e-13;
+        I_off_n[3][70] = 1.13e-12;
+        I_off_n[3][80] = 1.78e-12;
+        I_off_n[3][90] = 2.71e-12;
+        I_off_n[3][100] = 3.99e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7; //Rather than scale proportionally to square of feature size, only scale linearly according to IBM cell processor
+      curr_core_tx_density      = 1.25*0.7;
+      curr_sckt_co_eff           = 1.1359;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 45)
+    { //45nm technology-node. Corresponds to year 2010 in ITRS
+      //ITRS HP device type
+      SENSE_AMP_D = .04e-9; // s
+      SENSE_AMP_P = 2.7e-15; // J
+      vdd[0] = 1.0;
+      Lphy[0] = 0.018;
+      Lelec[0] = 0.01345;
+      t_ox[0] = 0.65e-3;
+      v_th[0] = .18035;
+      c_ox[0] = 3.77e-14;
+      mobility_eff[0] = 266.68 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 9.38E-2;
+      c_g_ideal[0] = 6.78e-16;
+      c_fringe[0] = 0.05e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 2046.6e-6;
+      //There are certain problems with the ITRS PMOS numbers in MASTAR for 45nm. So we are using 65nm values of
+      //n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier for 45nm
+      nmos_effective_resistance_multiplier = 1.51;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] = 1/3.546;//Using MASTAR, @380K, increase Lgate until Ion reduces to 90%, Ioff(Lgate normal)/Ioff(Lgate long)= 3.74
+      I_off_n[0][0] = 2.8e-7;
+      I_off_n[0][10] = 3.28e-7;
+      I_off_n[0][20] = 3.81e-7;
+      I_off_n[0][30] = 4.39e-7;
+      I_off_n[0][40] = 5.02e-7;
+      I_off_n[0][50] = 5.69e-7;
+      I_off_n[0][60] = 6.42e-7;
+      I_off_n[0][70] = 7.2e-7;
+      I_off_n[0][80] = 8.03e-7;
+      I_off_n[0][90] = 8.91e-7;
+      I_off_n[0][100] = 9.84e-7;
+
+      I_g_on_n[0][0]  = 3.59e-8;//A/micron
+      I_g_on_n[0][10] = 3.59e-8;
+      I_g_on_n[0][20] = 3.59e-8;
+      I_g_on_n[0][30] = 3.59e-8;
+      I_g_on_n[0][40] = 3.59e-8;
+      I_g_on_n[0][50] = 3.59e-8;
+      I_g_on_n[0][60] = 3.59e-8;
+      I_g_on_n[0][70] = 3.59e-8;
+      I_g_on_n[0][80] = 3.59e-8;
+      I_g_on_n[0][90] = 3.59e-8;
+      I_g_on_n[0][100] = 3.59e-8;
+
+      //ITRS LSTP device type
+      vdd[1] = 1.1;
+      Lphy[1] =  0.028;
+      Lelec[1] = 0.0212;
+      t_ox[1] = 1.4e-3;
+      v_th[1] = 0.50245;
+      c_ox[1] = 2.01e-14;
+      mobility_eff[1] =  363.96 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 9.12e-2;
+      c_g_ideal[1] = 5.18e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 666.2e-6;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/2.08;
+      I_off_n[1][0] = 1.01e-11;
+      I_off_n[1][10] = 1.65e-11;
+      I_off_n[1][20] = 2.62e-11;
+      I_off_n[1][30] = 4.06e-11;
+      I_off_n[1][40] = 6.12e-11;
+      I_off_n[1][50] = 9.02e-11;
+      I_off_n[1][60] = 1.3e-10;
+      I_off_n[1][70] = 1.83e-10;
+      I_off_n[1][80] = 2.51e-10;
+      I_off_n[1][90] = 3.29e-10;
+      I_off_n[1][100] = 4.1e-10;
+
+      I_g_on_n[1][0]  = 9.47e-12;//A/micron
+      I_g_on_n[1][10] = 9.47e-12;
+      I_g_on_n[1][20] = 9.47e-12;
+      I_g_on_n[1][30] = 9.47e-12;
+      I_g_on_n[1][40] = 9.47e-12;
+      I_g_on_n[1][50] = 9.47e-12;
+      I_g_on_n[1][60] = 9.47e-12;
+      I_g_on_n[1][70] = 9.47e-12;
+      I_g_on_n[1][80] = 9.47e-12;
+      I_g_on_n[1][90] = 9.47e-12;
+      I_g_on_n[1][100] = 9.47e-12;
+
+      //ITRS LOP device type
+      vdd[2] = 0.7;
+      Lphy[2] = 0.022;
+      Lelec[2] = 0.016;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.22599;
+      c_ox[2] = 2.82e-14;//F/micron2
+      mobility_eff[2] = 508.9 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 5.71e-2;
+      c_g_ideal[2] = 6.2e-16;
+      c_fringe[2] = 0.073e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 748.9e-6;
+      nmos_effective_resistance_multiplier = 1.76;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/1.92;
+      I_off_n[2][0] = 4.03e-9;
+      I_off_n[2][10] = 5.02e-9;
+      I_off_n[2][20] = 6.18e-9;
+      I_off_n[2][30] = 7.51e-9;
+      I_off_n[2][40] = 9.04e-9;
+      I_off_n[2][50] = 1.08e-8;
+      I_off_n[2][60] = 1.27e-8;
+      I_off_n[2][70] = 1.47e-8;
+      I_off_n[2][80] = 1.66e-8;
+      I_off_n[2][90] = 1.84e-8;
+      I_off_n[2][100] = 2.03e-8;
+
+      I_g_on_n[2][0]  = 3.24e-8;//A/micron
+      I_g_on_n[2][10] = 4.01e-8;
+      I_g_on_n[2][20] = 4.90e-8;
+      I_g_on_n[2][30] = 5.92e-8;
+      I_g_on_n[2][40] = 7.08e-8;
+      I_g_on_n[2][50] = 8.38e-8;
+      I_g_on_n[2][60] = 9.82e-8;
+      I_g_on_n[2][70] = 1.14e-7;
+      I_g_on_n[2][80] = 1.29e-7;
+      I_g_on_n[2][90] = 1.43e-7;
+      I_g_on_n[2][100] = 1.54e-7;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.078;
+        Lelec[3] = 0.0504;// Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44559;
+        width_dram_access_transistor = 0.079;
+        curr_I_on_dram_cell = 36e-6;//A
+        curr_I_off_dram_cell_worst_case_length_temp = 19.5e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram  = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2.1e-3;
+        v_th[3] = 0.44559;
+        c_ox[3] = 1.41e-14;
+        mobility_eff[3] =   426.30 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.181;
+        c_g_ideal[3] = 1.10e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 456e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 2.54e-11;
+        I_off_n[3][10] = 3.94e-11;
+        I_off_n[3][20] = 5.95e-11;
+        I_off_n[3][30] = 8.79e-11;
+        I_off_n[3][40] = 1.27e-10;
+        I_off_n[3][50] = 1.79e-10;
+        I_off_n[3][60] = 2.47e-10;
+        I_off_n[3][70] = 3.31e-10;
+        I_off_n[3][80] = 4.26e-10;
+        I_off_n[3][90] = 5.27e-10;
+        I_off_n[3][100] = 6.46e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.045;
+        Lelec[3] = 0.0298;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.045;
+        curr_I_on_dram_cell = 20e-6;//A
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram  = 0;
+        curr_area_cell_dram = 6*0.045*0.045;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.7;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.98e-15;
+        mobility_eff[3] = 368.58 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.147;
+        c_g_ideal[3] = 3.59e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 999.4e-6;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.31e-14;
+        I_off_n[3][10] = 2.68e-14;
+        I_off_n[3][20] = 5.25e-14;
+        I_off_n[3][30] = 9.88e-14;
+        I_off_n[3][40] = 1.79e-13;
+        I_off_n[3][50] = 3.15e-13;
+        I_off_n[3][60] = 5.36e-13;
+        I_off_n[3][70] = 8.86e-13;
+        I_off_n[3][80] = 1.42e-12;
+        I_off_n[3][90] = 2.20e-12;
+        I_off_n[3][100] = 3.29e-12;
+      }
+
+
+      //SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7*0.7;
+      curr_core_tx_density      = 1.25;
+      curr_sckt_co_eff           = 1.1387;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 32)
+    {
+      SENSE_AMP_D = .03e-9; // s
+      SENSE_AMP_P = 2.16e-15; // J
+      //For 2013, MPU/ASIC stagger-contacted M1 half-pitch is 32 nm (so this is 32 nm
+      //technology i.e. FEATURESIZE = 0.032). Using the SOI process numbers for
+      //HP and LSTP.
+      vdd[0] = 0.9;
+      Lphy[0] = 0.013;
+      Lelec[0] = 0.01013;
+      t_ox[0] = 0.5e-3;
+      v_th[0] = 0.21835;
+      c_ox[0] = 4.11e-14;
+      mobility_eff[0] = 361.84 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 5.09E-2;
+      c_g_ideal[0] = 5.34e-16;
+      c_fringe[0] = 0.04e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] =  2211.7e-6;
+      nmos_effective_resistance_multiplier = 1.49;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+      long_channel_leakage_reduction[0] = 1/3.706;
+      //Using MASTAR, @300K (380K does not work in MASTAR), increase Lgate until Ion reduces to 95% or Lgate increase by 5% (DG device can only increase by 5%),
+      //whichever comes first
+      I_off_n[0][0] = 1.52e-7;
+      I_off_n[0][10] = 1.55e-7;
+      I_off_n[0][20] = 1.59e-7;
+      I_off_n[0][30] = 1.68e-7;
+      I_off_n[0][40] = 1.90e-7;
+      I_off_n[0][50] = 2.69e-7;
+      I_off_n[0][60] = 5.32e-7;
+      I_off_n[0][70] = 1.02e-6;
+      I_off_n[0][80] = 1.62e-6;
+      I_off_n[0][90] = 2.73e-6;
+      I_off_n[0][100] = 6.1e-6;
+
+      I_g_on_n[0][0]  = 6.55e-8;//A/micron
+      I_g_on_n[0][10] = 6.55e-8;
+      I_g_on_n[0][20] = 6.55e-8;
+      I_g_on_n[0][30] = 6.55e-8;
+      I_g_on_n[0][40] = 6.55e-8;
+      I_g_on_n[0][50] = 6.55e-8;
+      I_g_on_n[0][60] = 6.55e-8;
+      I_g_on_n[0][70] = 6.55e-8;
+      I_g_on_n[0][80] = 6.55e-8;
+      I_g_on_n[0][90] = 6.55e-8;
+      I_g_on_n[0][100] = 6.55e-8;
+
+//      32 DG
+//      I_g_on_n[0][0]  = 2.71e-9;//A/micron
+//      I_g_on_n[0][10] = 2.71e-9;
+//      I_g_on_n[0][20] = 2.71e-9;
+//      I_g_on_n[0][30] = 2.71e-9;
+//      I_g_on_n[0][40] = 2.71e-9;
+//      I_g_on_n[0][50] = 2.71e-9;
+//      I_g_on_n[0][60] = 2.71e-9;
+//      I_g_on_n[0][70] = 2.71e-9;
+//      I_g_on_n[0][80] = 2.71e-9;
+//      I_g_on_n[0][90] = 2.71e-9;
+//      I_g_on_n[0][100] = 2.71e-9;
+
+      //LSTP device type
+      vdd[1] = 1;
+      Lphy[1] = 0.020;
+      Lelec[1] = 0.0173;
+      t_ox[1] = 1.2e-3;
+      v_th[1] = 0.513;
+      c_ox[1] = 2.29e-14;
+      mobility_eff[1] =  347.46 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 8.64e-2;
+      c_g_ideal[1] = 4.58e-16;
+      c_fringe[1] = 0.053e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 683.6e-6;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1/1.93;
+      I_off_n[1][0] = 2.06e-11;
+      I_off_n[1][10] = 3.30e-11;
+      I_off_n[1][20] = 5.15e-11;
+      I_off_n[1][30] = 7.83e-11;
+      I_off_n[1][40] = 1.16e-10;
+      I_off_n[1][50] = 1.69e-10;
+      I_off_n[1][60] = 2.40e-10;
+      I_off_n[1][70] = 3.34e-10;
+      I_off_n[1][80] = 4.54e-10;
+      I_off_n[1][90] = 5.96e-10;
+      I_off_n[1][100] = 7.44e-10;
+
+      I_g_on_n[1][0]  = 3.73e-11;//A/micron
+      I_g_on_n[1][10] = 3.73e-11;
+      I_g_on_n[1][20] = 3.73e-11;
+      I_g_on_n[1][30] = 3.73e-11;
+      I_g_on_n[1][40] = 3.73e-11;
+      I_g_on_n[1][50] = 3.73e-11;
+      I_g_on_n[1][60] = 3.73e-11;
+      I_g_on_n[1][70] = 3.73e-11;
+      I_g_on_n[1][80] = 3.73e-11;
+      I_g_on_n[1][90] = 3.73e-11;
+      I_g_on_n[1][100] = 3.73e-11;
+
+
+      //LOP device type
+      vdd[2] = 0.6;
+      Lphy[2] = 0.016;
+      Lelec[2] = 0.01232;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.24227;
+      c_ox[2] = 2.84e-14;
+      mobility_eff[2] =  513.52 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 4.64e-2;
+      c_g_ideal[2] = 4.54e-16;
+      c_fringe[2] = 0.057e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 827.8e-6;
+      nmos_effective_resistance_multiplier = 1.73;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1/1.89;
+      I_off_n[2][0] = 5.94e-8;
+      I_off_n[2][10] = 7.23e-8;
+      I_off_n[2][20] = 8.7e-8;
+      I_off_n[2][30] = 1.04e-7;
+      I_off_n[2][40] = 1.22e-7;
+      I_off_n[2][50] = 1.43e-7;
+      I_off_n[2][60] = 1.65e-7;
+      I_off_n[2][70] = 1.90e-7;
+      I_off_n[2][80] = 2.15e-7;
+      I_off_n[2][90] = 2.39e-7;
+      I_off_n[2][100] = 2.63e-7;
+
+      I_g_on_n[2][0]  = 2.93e-9;//A/micron
+      I_g_on_n[2][10] = 2.93e-9;
+      I_g_on_n[2][20] = 2.93e-9;
+      I_g_on_n[2][30] = 2.93e-9;
+      I_g_on_n[2][40] = 2.93e-9;
+      I_g_on_n[2][50] = 2.93e-9;
+      I_g_on_n[2][60] = 2.93e-9;
+      I_g_on_n[2][70] = 2.93e-9;
+      I_g_on_n[2][80] = 2.93e-9;
+      I_g_on_n[2][90] = 2.93e-9;
+      I_g_on_n[2][100] = 2.93e-9;
+
+      if (ram_cell_tech_type == lp_dram)
+      {
+        //LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.056;
+        Lelec[3] = 0.0419;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44129;
+        width_dram_access_transistor = 0.056;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 18.9e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        //LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2e-3;
+        v_th[3] = 0.44467;
+        c_ox[3] = 1.48e-14;
+        mobility_eff[3] =  408.12 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.174;
+        c_g_ideal[3] = 7.45e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1055.4e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 3.57e-11;
+        I_off_n[3][10] = 5.51e-11;
+        I_off_n[3][20] = 8.27e-11;
+        I_off_n[3][30] = 1.21e-10;
+        I_off_n[3][40] = 1.74e-10;
+        I_off_n[3][50] = 2.45e-10;
+        I_off_n[3][60] = 3.38e-10;
+        I_off_n[3][70] = 4.53e-10;
+        I_off_n[3][80] = 5.87e-10;
+        I_off_n[3][90] = 7.29e-10;
+        I_off_n[3][100] = 8.87e-10;
+      }
+      else if (ram_cell_tech_type == comm_dram)
+      {
+        //COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.032;
+        Lelec[3] = 0.0205;//Assume Lelec is 30% lesser than Lphy for DRAM access and wordline transistors.
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.032;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6*0.032*0.032;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        //COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.6;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.99e-15;
+        mobility_eff[3] =  380.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.129;
+        c_g_ideal[3] = 2.56e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1024.5e-6;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0]  = 3.63e-14;
+        I_off_n[3][10] = 7.18e-14;
+        I_off_n[3][20] = 1.36e-13;
+        I_off_n[3][30] = 2.49e-13;
+        I_off_n[3][40] = 4.41e-13;
+        I_off_n[3][50] = 7.55e-13;
+        I_off_n[3][60] = 1.26e-12;
+        I_off_n[3][70] = 2.03e-12;
+        I_off_n[3][80] = 3.19e-12;
+        I_off_n[3][90] = 4.87e-12;
+        I_off_n[3][100] = 7.16e-12;
+      }
+
+      //SRAM cell properties
+      curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      //CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      //Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7*0.7*0.7;
+      curr_core_tx_density      = 1.25/0.7;
+      curr_sckt_co_eff           = 1.1111;
+      curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    }
+
+    if(tech == 22){
+        SENSE_AMP_D = .03e-9; // s
+	SENSE_AMP_P = 2.16e-15; // J
+    	//For 2016, MPU/ASIC stagger-contacted M1 half-pitch is 22 nm (so this is 22 nm
+    	//technology i.e. FEATURESIZE = 0.022). Using the DG process numbers for HP.
+    	//22 nm HP
+    	vdd[0] = 0.8;
+    	Lphy[0] = 0.009;//Lphy is the physical gate-length.
+    	Lelec[0] = 0.00468;//Lelec is the electrical gate-length.
+    	t_ox[0] = 0.55e-3;//micron
+    	v_th[0] = 0.1395;//V
+    	c_ox[0] = 3.63e-14;//F/micron2
+    	mobility_eff[0] = 426.07 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+    	Vdsat[0] = 2.33e-2; //V/micron
+    	c_g_ideal[0] = 3.27e-16;//F/micron
+    	c_fringe[0] = 0.06e-15;//F/micron
+    	c_junc[0] = 0;//F/micron2
+    	I_on_n[0] =  2626.4e-6;//A/micron
+    	//I_on_p[0] = I_on_n[0] / 2;//A/micron //This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier = 1.45;
+        n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+    	//"Dynamic" tab of Device workspace.
+        gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value.
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+        long_channel_leakage_reduction[0] = 1/3.274;
+        I_off_n[0][0] = 1.52e-7/1.5*1.2;//From 22nm, leakage current are directly from ITRS report rather than MASTAR, since MASTAR has serious bugs there.
+        I_off_n[0][10] = 1.55e-7/1.5*1.2;
+        I_off_n[0][20] = 1.59e-7/1.5*1.2;
+        I_off_n[0][30] = 1.68e-7/1.5*1.2;
+        I_off_n[0][40] = 1.90e-7/1.5*1.2;
+        I_off_n[0][50] = 2.69e-7/1.5*1.2;
+        I_off_n[0][60] = 5.32e-7/1.5*1.2;
+        I_off_n[0][70] = 1.02e-6/1.5*1.2;
+        I_off_n[0][80] = 1.62e-6/1.5*1.2;
+        I_off_n[0][90] = 2.73e-6/1.5*1.2;
+        I_off_n[0][100] = 6.1e-6/1.5*1.2;
+        //for 22nm DG HP
+        I_g_on_n[0][0]  = 1.81e-9;//A/micron
+        I_g_on_n[0][10] = 1.81e-9;
+        I_g_on_n[0][20] = 1.81e-9;
+        I_g_on_n[0][30] = 1.81e-9;
+        I_g_on_n[0][40] = 1.81e-9;
+        I_g_on_n[0][50] = 1.81e-9;
+        I_g_on_n[0][60] = 1.81e-9;
+        I_g_on_n[0][70] = 1.81e-9;
+        I_g_on_n[0][80] = 1.81e-9;
+        I_g_on_n[0][90] = 1.81e-9;
+        I_g_on_n[0][100] = 1.81e-9;
+
+    	//22 nm LSTP DG
+    	vdd[1] = 0.8;
+    	Lphy[1] = 0.014;
+    	Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+    	t_ox[1] = 1.1e-3;//micron
+    	v_th[1] = 0.40126;//V
+    	c_ox[1] = 2.30e-14;//F/micron2
+    	mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+    	Vdsat[1] = 6.64e-2; //V/micron
+    	c_g_ideal[1] = 3.22e-16;//F/micron
+    	c_fringe[1] = 0.08e-15;
+    	c_junc[1] = 0;//F/micron2
+    	I_on_n[1] = 727.6e-6;//A/micron
+    	nmos_effective_resistance_multiplier = 1.99;
+    	n_to_p_eff_curr_drv_ratio[1] = 2;
+    	gmp_to_gmn_multiplier[1] = 0.99;
+    	Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron
+    	Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron
+    	long_channel_leakage_reduction[1] = 1/1.89;
+    	I_off_n[1][0] = 2.43e-11;
+    	I_off_n[1][10] = 4.85e-11;
+    	I_off_n[1][20] = 9.68e-11;
+    	I_off_n[1][30] = 1.94e-10;
+    	I_off_n[1][40] = 3.87e-10;
+    	I_off_n[1][50] = 7.73e-10;
+    	I_off_n[1][60] = 3.55e-10;
+    	I_off_n[1][70] = 3.09e-9;
+    	I_off_n[1][80] = 6.19e-9;
+    	I_off_n[1][90] = 1.24e-8;
+    	I_off_n[1][100]= 2.48e-8;
+
+    	I_g_on_n[1][0]  = 4.51e-10;//A/micron
+    	I_g_on_n[1][10] = 4.51e-10;
+    	I_g_on_n[1][20] = 4.51e-10;
+    	I_g_on_n[1][30] = 4.51e-10;
+    	I_g_on_n[1][40] = 4.51e-10;
+    	I_g_on_n[1][50] = 4.51e-10;
+    	I_g_on_n[1][60] = 4.51e-10;
+    	I_g_on_n[1][70] = 4.51e-10;
+    	I_g_on_n[1][80] = 4.51e-10;
+    	I_g_on_n[1][90] = 4.51e-10;
+    	I_g_on_n[1][100] = 4.51e-10;
+
+    	//22 nm LOP
+    	vdd[2] = 0.6;
+    	Lphy[2] = 0.011;
+    	Lelec[2] = 0.00604;//Lelec is the electrical gate-length.
+    	t_ox[2] = 0.8e-3;//micron
+    	v_th[2] = 0.2315;//V
+    	c_ox[2] = 2.87e-14;//F/micron2
+    	mobility_eff[2] =  698.37 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+    	Vdsat[2] = 1.81e-2; //V/micron
+    	c_g_ideal[2] = 3.16e-16;//F/micron
+    	c_fringe[2] = 0.08e-15;
+    	c_junc[2] = 0;//F/micron2 This is Cj0 not Cjunc in MASTAR results->Dynamic Tab
+    	I_on_n[2] = 916.1e-6;//A/micron
+    	nmos_effective_resistance_multiplier = 1.73;
+    	n_to_p_eff_curr_drv_ratio[2] = 2;
+    	gmp_to_gmn_multiplier[2] = 1.11;
+    	Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];//ohm-micron
+    	Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];//ohm-micron
+    	long_channel_leakage_reduction[2] = 1/2.38;
+
+    	I_off_n[2][0] = 1.31e-8;
+    	I_off_n[2][10] = 2.60e-8;
+    	I_off_n[2][20] = 5.14e-8;
+    	I_off_n[2][30] = 1.02e-7;
+    	I_off_n[2][40] = 2.02e-7;
+    	I_off_n[2][50] = 3.99e-7;
+    	I_off_n[2][60] = 7.91e-7;
+    	I_off_n[2][70] = 1.09e-6;
+    	I_off_n[2][80] = 2.09e-6;
+    	I_off_n[2][90] = 4.04e-6;
+    	I_off_n[2][100]= 4.48e-6;
+
+    	I_g_on_n[2][0]  = 2.74e-9;//A/micron
+    	I_g_on_n[2][10] = 2.74e-9;
+    	I_g_on_n[2][20] = 2.74e-9;
+    	I_g_on_n[2][30] = 2.74e-9;
+    	I_g_on_n[2][40] = 2.74e-9;
+    	I_g_on_n[2][50] = 2.74e-9;
+    	I_g_on_n[2][60] = 2.74e-9;
+    	I_g_on_n[2][70] = 2.74e-9;
+    	I_g_on_n[2][80] = 2.74e-9;
+    	I_g_on_n[2][90] = 2.74e-9;
+    	I_g_on_n[2][100] = 2.74e-9;
+
+
+
+        if (ram_cell_tech_type == 3)
+              {}
+        else if (ram_cell_tech_type == 4)
+        {
+    	//22 nm commodity DRAM cell access transistor technology parameters.
+    		//parameters
+        	curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In
+    		//2005 ITRS, the value was about twice the value in 2007 ITRS
+    		Lphy[3] = 0.022;//micron
+    		Lelec[3] = 0.0181;//micron.
+    		curr_v_th_dram_access_transistor = 1;//V
+    		width_dram_access_transistor = 0.022;//micron
+    		curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always
+    		//kept constant. In reality this could perhaps be lower
+    		curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A
+    		curr_Wmemcella_dram = width_dram_access_transistor;
+    		curr_Wmemcellpmos_dram = 0;
+    		curr_Wmemcellnmos_dram = 0;
+    		curr_area_cell_dram = 6*0.022*0.022;//micron2.
+    		curr_asp_ratio_cell_dram = 0.667;
+    		curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus
+    		//kept constant.
+
+    	//22 nm commodity DRAM wordline transistor parameters obtained using MASTAR.
+    		curr_vpp = 2.3;//vpp. V
+    		t_ox[3] = 3.5e-3;//micron
+    		v_th[3] = 1.0;//V
+    		c_ox[3] = 9.06e-15;//F/micron2
+    		mobility_eff[3] =  367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs
+    		Vdsat[3] = 0.0972; //V/micron
+    		c_g_ideal[3] = 1.99e-16;//F/micron
+    		c_fringe[3] = 0.053e-15;//F/micron
+    		c_junc[3] = 1e-15;//F/micron2
+    		I_on_n[3] = 910.5e-6;//A/micron
+    		nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm.
+    		//
+    		n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm
+    		gmp_to_gmn_multiplier[3] = 0.90;
+    		Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp  / I_on_n[3];//ohm-micron
+    		Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron
+    		long_channel_leakage_reduction[3] = 1;
+    		I_off_n[3][0] = 1.1e-13; //A/micron
+    		I_off_n[3][10] = 2.11e-13;
+    		I_off_n[3][20] = 3.88e-13;
+    		I_off_n[3][30] = 6.9e-13;
+    		I_off_n[3][40] = 1.19e-12;
+    		I_off_n[3][50] = 1.98e-12;
+    		I_off_n[3][60] = 3.22e-12;
+    		I_off_n[3][70] = 5.09e-12;
+    		I_off_n[3][80] = 7.85e-12;
+    		I_off_n[3][90] = 1.18e-11;
+    		I_off_n[3][100] = 1.72e-11;
+
+    	}
+        else
+        {
+      	  //some error handler
+        }
+
+        //SRAM cell properties
+        curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_sram = 1.46;
+        //CAM cell properties //TODO: data need to be revisited
+        curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_cam = 2.92;
+        //Empirical undifferentiated core/FU coefficient
+        curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7;
+        curr_core_tx_density      = 1.25/0.7/0.7;
+        curr_sckt_co_eff           = 1.1296;
+        curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+        curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    	}
+
+    if(tech == 16){
+    	//For 2019, MPU/ASIC stagger-contacted M1 half-pitch is 16 nm (so this is 16 nm
+    	//technology i.e. FEATURESIZE = 0.016). Using the DG process numbers for HP.
+    	//16 nm HP
+    	vdd[0] = 0.7;
+    	Lphy[0] = 0.006;//Lphy is the physical gate-length.
+    	Lelec[0] = 0.00315;//Lelec is the electrical gate-length.
+    	t_ox[0] = 0.5e-3;//micron
+    	v_th[0] = 0.1489;//V
+    	c_ox[0] = 3.83e-14;//F/micron2 Cox_elec in MASTAR
+    	mobility_eff[0] = 476.15 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+    	Vdsat[0] = 1.42e-2; //V/micron calculated in spreadsheet
+    	c_g_ideal[0] = 2.30e-16;//F/micron
+    	c_fringe[0] = 0.06e-15;//F/micron MASTAR inputdynamic/3
+    	c_junc[0] = 0;//F/micron2 MASTAR result dynamic
+    	I_on_n[0] =  2768.4e-6;//A/micron
+        nmos_effective_resistance_multiplier = 1.48;//nmos_effective_resistance_multiplier  is the ratio of Ieff to Idsat where Ieff is the effective NMOS current and Idsat is the saturation current.
+        n_to_p_eff_curr_drv_ratio[0] = 2; //Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+    	//"Dynamic" tab of Device workspace.
+        gmp_to_gmn_multiplier[0] = 1.38; //Just using the 32nm SOI value.
+        Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];//ohm-micron
+        Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];//ohm-micron
+        long_channel_leakage_reduction[0] = 1/2.655;
+        I_off_n[0][0] = 1.52e-7/1.5*1.2*1.07;
+        I_off_n[0][10] = 1.55e-7/1.5*1.2*1.07;
+        I_off_n[0][20] = 1.59e-7/1.5*1.2*1.07;
+        I_off_n[0][30] = 1.68e-7/1.5*1.2*1.07;
+        I_off_n[0][40] = 1.90e-7/1.5*1.2*1.07;
+        I_off_n[0][50] = 2.69e-7/1.5*1.2*1.07;
+        I_off_n[0][60] = 5.32e-7/1.5*1.2*1.07;
+        I_off_n[0][70] = 1.02e-6/1.5*1.2*1.07;
+        I_off_n[0][80] = 1.62e-6/1.5*1.2*1.07;
+        I_off_n[0][90] = 2.73e-6/1.5*1.2*1.07;
+        I_off_n[0][100] = 6.1e-6/1.5*1.2*1.07;
+        //for 16nm DG HP
+        I_g_on_n[0][0]  = 1.07e-9;//A/micron
+        I_g_on_n[0][10] = 1.07e-9;
+        I_g_on_n[0][20] = 1.07e-9;
+        I_g_on_n[0][30] = 1.07e-9;
+        I_g_on_n[0][40] = 1.07e-9;
+        I_g_on_n[0][50] = 1.07e-9;
+        I_g_on_n[0][60] = 1.07e-9;
+        I_g_on_n[0][70] = 1.07e-9;
+        I_g_on_n[0][80] = 1.07e-9;
+        I_g_on_n[0][90] = 1.07e-9;
+        I_g_on_n[0][100] = 1.07e-9;
+
+//    	//16 nm LSTP DG
+//    	vdd[1] = 0.8;
+//    	Lphy[1] = 0.014;
+//    	Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+//    	t_ox[1] = 1.1e-3;//micron
+//    	v_th[1] = 0.40126;//V
+//    	c_ox[1] = 2.30e-14;//F/micron2
+//    	mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2 / Vs
+//    	Vdsat[1] = 6.64e-2; //V/micron
+//    	c_g_ideal[1] = 3.22e-16;//F/micron
+//    	c_fringe[1] = 0.008e-15;
+//    	c_junc[1] = 0;//F/micron2
+//    	I_on_n[1] = 727.6e-6;//A/micron
+//    	I_on_p[1] = I_on_n[1] / 2;
+//    	nmos_effective_resistance_multiplier = 1.99;
+//    	n_to_p_eff_curr_drv_ratio[1] = 2;
+//    	gmp_to_gmn_multiplier[1] = 0.99;
+//    	Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];//ohm-micron
+//    	Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];//ohm-micron
+//    	I_off_n[1][0] = 2.43e-11;
+//    	I_off_n[1][10] = 4.85e-11;
+//    	I_off_n[1][20] = 9.68e-11;
+//    	I_off_n[1][30] = 1.94e-10;
+//    	I_off_n[1][40] = 3.87e-10;
+//    	I_off_n[1][50] = 7.73e-10;
+//    	I_off_n[1][60] = 3.55e-10;
+//    	I_off_n[1][70] = 3.09e-9;
+//    	I_off_n[1][80] = 6.19e-9;
+//    	I_off_n[1][90] = 1.24e-8;
+//    	I_off_n[1][100]= 2.48e-8;
+//
+//    	//    for 22nm LSTP HP
+//    	I_g_on_n[1][0]  = 4.51e-10;//A/micron
+//    	I_g_on_n[1][10] = 4.51e-10;
+//    	I_g_on_n[1][20] = 4.51e-10;
+//    	I_g_on_n[1][30] = 4.51e-10;
+//    	I_g_on_n[1][40] = 4.51e-10;
+//    	I_g_on_n[1][50] = 4.51e-10;
+//    	I_g_on_n[1][60] = 4.51e-10;
+//    	I_g_on_n[1][70] = 4.51e-10;
+//    	I_g_on_n[1][80] = 4.51e-10;
+//    	I_g_on_n[1][90] = 4.51e-10;
+//    	I_g_on_n[1][100] = 4.51e-10;
+
+
+        if (ram_cell_tech_type == 3)
+              {}
+        else if (ram_cell_tech_type == 4)
+        {
+    	//16 nm: reuses the 22 nm commodity DRAM cell access transistor technology parameters.
+    		//parameters
+        	curr_vdd_dram_cell = 0.9;//0.45;//This value has reduced greatly in 2007 ITRS for all technology nodes. In
+    		//2005 ITRS, the value was about twice the value in 2007 ITRS
+    		Lphy[3] = 0.022;//micron
+    		Lelec[3] = 0.0181;//micron.
+    		curr_v_th_dram_access_transistor = 1;//V
+    		width_dram_access_transistor = 0.022;//micron
+    		curr_I_on_dram_cell = 20e-6; //This is a typical value that I have always
+    		//kept constant. In reality this could perhaps be lower
+    		curr_I_off_dram_cell_worst_case_length_temp = 1e-15;//A
+    		curr_Wmemcella_dram = width_dram_access_transistor;
+    		curr_Wmemcellpmos_dram = 0;
+    		curr_Wmemcellnmos_dram = 0;
+    		curr_area_cell_dram = 6*0.022*0.022;//micron2.
+    		curr_asp_ratio_cell_dram = 0.667;
+    		curr_c_dram_cell = 30e-15;//This is a typical value that I have alwaus
+    		//kept constant.
+
+    	//16 nm: reuses the 22 nm commodity DRAM wordline transistor parameters obtained using MASTAR.
+    		curr_vpp = 2.3;//vpp. V
+    		t_ox[3] = 3.5e-3;//micron
+    		v_th[3] = 1.0;//V
+    		c_ox[3] = 9.06e-15;//F/micron2
+    		mobility_eff[3] =  367.29 * (1e-2 * 1e6 * 1e-2 * 1e6);//micron2 / Vs
+    		Vdsat[3] = 0.0972; //V/micron
+    		c_g_ideal[3] = 1.99e-16;//F/micron
+    		c_fringe[3] = 0.053e-15;//F/micron
+    		c_junc[3] = 1e-15;//F/micron2
+    		I_on_n[3] = 910.5e-6;//A/micron
+    		nmos_effective_resistance_multiplier = 1.69;//Using the value from 32nm.
+    		//
+    		n_to_p_eff_curr_drv_ratio[3] = 1.95;//Using the value from 32nm
+    		gmp_to_gmn_multiplier[3] = 0.90;
+    		Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp  / I_on_n[3];//ohm-micron
+    		Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];//ohm-micron
+    		long_channel_leakage_reduction[3] = 1;
+    		I_off_n[3][0] = 1.1e-13; //A/micron
+    		I_off_n[3][10] = 2.11e-13;
+    		I_off_n[3][20] = 3.88e-13;
+    		I_off_n[3][30] = 6.9e-13;
+    		I_off_n[3][40] = 1.19e-12;
+    		I_off_n[3][50] = 1.98e-12;
+    		I_off_n[3][60] = 3.22e-12;
+    		I_off_n[3][70] = 5.09e-12;
+    		I_off_n[3][80] = 7.85e-12;
+    		I_off_n[3][90] = 1.18e-11;
+    		I_off_n[3][100] = 1.72e-11;
+
+    	}
+        else
+        {
+      	  //some error handler
+        }
+
+        //SRAM cell properties
+        curr_Wmemcella_sram    = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_sram    = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_sram = 1.46;
+        //CAM cell properties //TODO: data need to be revisited
+        curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+        curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+        curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+        curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+        curr_asp_ratio_cell_cam = 2.92;
+        //Empirical undifferentiated core/FU coefficient
+        curr_logic_scaling_co_eff = 0.7*0.7*0.7*0.7*0.7;
+        curr_core_tx_density      = 1.25/0.7/0.7/0.7;
+        curr_sckt_co_eff           = 1.1296;
+        curr_chip_layout_overhead  = 1.2;//die measurement results based on Niagara 1 and 2
+        curr_macro_layout_overhead = 1.1;//EDA placement and routing tool rule of thumb
+    	}
+
+
+    g_tp.peri_global.Vdd       += curr_alpha * vdd[peri_global_tech_type];
+    g_tp.peri_global.t_ox      += curr_alpha * t_ox[peri_global_tech_type];
+    g_tp.peri_global.Vth       += curr_alpha * v_th[peri_global_tech_type];
+    g_tp.peri_global.C_ox      += curr_alpha * c_ox[peri_global_tech_type];
+    g_tp.peri_global.C_g_ideal += curr_alpha * c_g_ideal[peri_global_tech_type];
+    g_tp.peri_global.C_fringe  += curr_alpha * c_fringe[peri_global_tech_type];
+    g_tp.peri_global.C_junc    += curr_alpha * c_junc[peri_global_tech_type];
+    g_tp.peri_global.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.peri_global.l_phy     += curr_alpha * Lphy[peri_global_tech_type];
+    g_tp.peri_global.l_elec    += curr_alpha * Lelec[peri_global_tech_type];
+    g_tp.peri_global.I_on_n    += curr_alpha * I_on_n[peri_global_tech_type];
+    g_tp.peri_global.R_nch_on  += curr_alpha * Rnchannelon[peri_global_tech_type];
+    g_tp.peri_global.R_pch_on  += curr_alpha * Rpchannelon[peri_global_tech_type];
+    g_tp.peri_global.n_to_p_eff_curr_drv_ratio
+      += curr_alpha * n_to_p_eff_curr_drv_ratio[peri_global_tech_type];
+    g_tp.peri_global.long_channel_leakage_reduction
+      += curr_alpha * long_channel_leakage_reduction[peri_global_tech_type];
+    g_tp.peri_global.I_off_n   += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_off_p   += curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_n   += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_p   += curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    gmp_to_gmn_multiplier_periph_global += curr_alpha * gmp_to_gmn_multiplier[peri_global_tech_type];
+
+    g_tp.sram_cell.Vdd       += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.sram_cell.l_phy     += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.sram_cell.l_elec    += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.sram_cell.t_ox      += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.sram_cell.Vth       += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.sram_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.sram_cell.C_fringe  += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc    += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.sram_cell.I_on_n    += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.sram_cell.R_nch_on  += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.R_pch_on  += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.sram_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.sram_cell.I_off_n   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_off_p   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_n   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_p   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram_cell_Vdd      += curr_alpha * curr_vdd_dram_cell;
+    g_tp.dram_acc.Vth       += curr_alpha * curr_v_th_dram_access_transistor;
+    g_tp.dram_acc.l_phy     += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_acc.l_elec    += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_fringe  += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc    += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.dram_cell_I_on     += curr_alpha * curr_I_on_dram_cell;
+    g_tp.dram_cell_I_off_worst_case_len_temp += curr_alpha * curr_I_off_dram_cell_worst_case_length_temp;
+    g_tp.dram_acc.I_on_n    += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_cell_C        += curr_alpha * curr_c_dram_cell;
+    g_tp.vpp                += curr_alpha * curr_vpp;
+    g_tp.dram_wl.l_phy      += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_wl.l_elec     += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_g_ideal  += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_fringe   += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc     += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.dram_wl.I_on_n     += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_nch_on   += curr_alpha * Rnchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_pch_on   += curr_alpha * Rpchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[dram_cell_tech_flavor];
+    g_tp.dram_wl.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[dram_cell_tech_flavor];
+    g_tp.dram_wl.I_off_n    += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+    g_tp.dram_wl.I_off_p    += curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+
+    g_tp.cam_cell.Vdd       += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.cam_cell.l_phy     += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.cam_cell.l_elec    += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.cam_cell.t_ox      += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.cam_cell.Vth       += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.cam_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.cam_cell.C_fringe  += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc    += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc_sidewall = 0.25e-15;  // F/micron
+    g_tp.cam_cell.I_on_n    += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.cam_cell.R_nch_on  += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.R_pch_on  += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.n_to_p_eff_curr_drv_ratio += curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.cam_cell.long_channel_leakage_reduction += curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.cam_cell.I_off_n   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_off_p   += curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_n   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_p   += curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram.cell_a_w    += curr_alpha * curr_Wmemcella_dram;
+    g_tp.dram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_dram;
+    g_tp.dram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_dram;
+    area_cell_dram        += curr_alpha * curr_area_cell_dram;
+    asp_ratio_cell_dram   += curr_alpha * curr_asp_ratio_cell_dram;
+
+    g_tp.sram.cell_a_w    += curr_alpha * curr_Wmemcella_sram;
+    g_tp.sram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_sram;
+    g_tp.sram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_sram;
+    area_cell_sram += curr_alpha * curr_area_cell_sram;
+    asp_ratio_cell_sram += curr_alpha * curr_asp_ratio_cell_sram;
+
+    g_tp.cam.cell_a_w    += curr_alpha * curr_Wmemcella_cam;//sheng
+    g_tp.cam.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_cam;
+    g_tp.cam.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_cam;
+    area_cell_cam += curr_alpha * curr_area_cell_cam;
+    asp_ratio_cell_cam += curr_alpha * curr_asp_ratio_cell_cam;
+
+    //Sense amplifier latch Gm calculation
+    mobility_eff_periph_global += curr_alpha * mobility_eff[peri_global_tech_type];
+    Vdsat_periph_global += curr_alpha * Vdsat[peri_global_tech_type];
+
+    //Empirical undifferentiated core/FU coefficient
+    g_tp.scaling_factor.logic_scaling_co_eff += curr_alpha * curr_logic_scaling_co_eff;
+    g_tp.scaling_factor.core_tx_density += curr_alpha * curr_core_tx_density;
+    g_tp.chip_layout_overhead  += curr_alpha * curr_chip_layout_overhead;
+    g_tp.macro_layout_overhead += curr_alpha * curr_macro_layout_overhead;
+    g_tp.sckt_co_eff           += curr_alpha * curr_sckt_co_eff;
+  }
+
+
+  //Currently we are not modeling the resistance/capacitance of poly anywhere.
+  //Continuous functions (or data that have already been processed) do not need linear interpolation
+  g_tp.w_comp_inv_p1 = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n1 =  7.5 * g_ip->F_sz_um;//this was  6 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p2 =   25 * g_ip->F_sz_um;//this was 20 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n2 =   15 * g_ip->F_sz_um;//this was 12 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p3 =   50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n3 =   30 * g_ip->F_sz_um;//this was 24 micron for the 0.8 micron process
+  g_tp.w_eval_inv_p  =  100 * g_ip->F_sz_um;//this was 80 micron for the 0.8 micron process
+  g_tp.w_eval_inv_n  =   50 * g_ip->F_sz_um;//this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_n     = 12.5 * g_ip->F_sz_um;//this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_p     = 37.5 * g_ip->F_sz_um;//this was 30 micron for the 0.8 micron process
+
+  g_tp.MIN_GAP_BET_P_AND_N_DIFFS = 5 * g_ip->F_sz_um;
+  g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS = 1.5 * g_ip->F_sz_um;
+  g_tp.HPOWERRAIL = 2 * g_ip->F_sz_um;
+  g_tp.cell_h_def = 50 * g_ip->F_sz_um;
+  g_tp.w_poly_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_poly = 1.5 * g_ip->F_sz_um;
+  g_tp.ram_wl_stitching_overhead_ = 7.5 * g_ip->F_sz_um;
+
+  g_tp.min_w_nmos_ = 3 * g_ip->F_sz_um / 2;
+  g_tp.max_w_nmos_ = 100 * g_ip->F_sz_um;
+  g_tp.w_iso       = 12.5*g_ip->F_sz_um;//was 10 micron for the 0.8 micron process
+  g_tp.w_sense_n   = 3.75*g_ip->F_sz_um; // sense amplifier N-trans; was 3 micron for the 0.8 micron process
+  g_tp.w_sense_p   = 7.5*g_ip->F_sz_um; // sense amplifier P-trans; was 6 micron for the 0.8 micron process
+  g_tp.w_sense_en  = 5*g_ip->F_sz_um; // Sense enable transistor of the sense amplifier; was 4 micron for the 0.8 micron process
+  g_tp.w_nmos_b_mux  = 6 * g_tp.min_w_nmos_;
+  g_tp.w_nmos_sa_mux = 6 * g_tp.min_w_nmos_;
+
+  if (ram_cell_tech_type == comm_dram)
+  {
+    g_tp.max_w_nmos_dec = 8 * g_ip->F_sz_um;
+    g_tp.h_dec          = 8;  // in the unit of memory cell height
+  }
+  else
+  {
+    g_tp.max_w_nmos_dec = g_tp.max_w_nmos_;
+    g_tp.h_dec          = 4;  // in the unit of memory cell height
+  }
+
+  g_tp.peri_global.C_overlap = 0.2 * g_tp.peri_global.C_g_ideal;
+  g_tp.sram_cell.C_overlap   = 0.2 * g_tp.sram_cell.C_g_ideal;
+  g_tp.cam_cell.C_overlap    = 0.2 * g_tp.cam_cell.C_g_ideal;
+
+  g_tp.dram_acc.C_overlap = 0.2 * g_tp.dram_acc.C_g_ideal;
+  g_tp.dram_acc.R_nch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_n;
+  //g_tp.dram_acc.R_pch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_p;
+
+  g_tp.dram_wl.C_overlap = 0.2 * g_tp.dram_wl.C_g_ideal;
+
+  double gmn_sense_amp_latch = (mobility_eff_periph_global / 2) * g_tp.peri_global.C_ox * (g_tp.w_sense_n / g_tp.peri_global.l_elec) * Vdsat_periph_global;
+  double gmp_sense_amp_latch = gmp_to_gmn_multiplier_periph_global * gmn_sense_amp_latch;
+  g_tp.gm_sense_amp_latch = gmn_sense_amp_latch + gmp_sense_amp_latch;
+
+  g_tp.dram.b_w = sqrt(area_cell_dram / (asp_ratio_cell_dram));
+  g_tp.dram.b_h = asp_ratio_cell_dram * g_tp.dram.b_w;
+  g_tp.sram.b_w = sqrt(area_cell_sram / (asp_ratio_cell_sram));
+  g_tp.sram.b_h = asp_ratio_cell_sram * g_tp.sram.b_w;
+  g_tp.cam.b_w =  sqrt(area_cell_cam / (asp_ratio_cell_cam));//Sheng
+  g_tp.cam.b_h = asp_ratio_cell_cam * g_tp.cam.b_w;
+
+  g_tp.dram.Vbitpre = g_tp.dram_cell_Vdd;
+  g_tp.sram.Vbitpre = vdd[ram_cell_tech_type];
+  g_tp.cam.Vbitpre = vdd[ram_cell_tech_type];//Sheng
+  pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  g_tp.w_pmos_bl_precharge = 6 * pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+  g_tp.w_pmos_bl_eq = pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+
+
+  double wire_pitch       [NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         wire_r_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         wire_c_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         horiz_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         vert_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         aspect_ratio[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         miller_value[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+         ild_thickness[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES];
+
+  for (iter=0; iter<=1; ++iter)
+  {
+    // linear interpolation
+    if (iter == 0)
+    {
+      tech = tech_lo;
+      if (tech_lo == tech_hi)
+      {
+        curr_alpha = 1;
+      }
+      else
+      {
+        curr_alpha = (technology - tech_hi)/(tech_lo - tech_hi);
+      }
+    }
+    else
+    {
+      tech = tech_hi;
+      if (tech_lo == tech_hi)
+      {
+        break;
+      }
+      else
+      {
+        curr_alpha = (tech_lo - technology)/(tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 180)
+    {
+    	//Aggressive projections
+    	wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron
+    	aspect_ratio[0][0] = 2.0;
+    	wire_width = wire_pitch[0][0] / 2; //micron
+    	wire_thickness = aspect_ratio[0][0] * wire_width;//micron
+    	wire_spacing = wire_pitch[0][0] - wire_width;//micron
+    	barrier_thickness = 0.017;//micron
+    	dishing_thickness = 0;//micron
+    	alpha_scatter = 1;
+    	wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+    			wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron
+    	ild_thickness[0][0] = 0.75;//micron
+    	miller_value[0][0] = 1.5;
+    	horiz_dielectric_constant[0][0] = 2.709;
+    	vert_dielectric_constant[0][0] = 3.9;
+    	fringe_cap = 0.115e-15; //F/micron
+        wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0],
+          fringe_cap);//F/micron.
+
+    	wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+    	wire_width = wire_pitch[0][1] / 2;
+    	aspect_ratio[0][1] = 2.4;
+    	wire_thickness = aspect_ratio[0][1] * wire_width;
+    	wire_spacing = wire_pitch[0][1] - wire_width;
+    	wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+    			wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+    	ild_thickness[0][1] = 0.75;//micron
+    	miller_value[0][1] = 1.5;
+    	horiz_dielectric_constant[0][1] = 2.709;
+    	vert_dielectric_constant[0][1] = 3.9;
+    	fringe_cap = 0.115e-15; //F/micron
+        wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+    	wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+    	aspect_ratio[0][2] = 2.2;
+    	wire_width = wire_pitch[0][2] / 2;
+    	wire_thickness = aspect_ratio[0][2] * wire_width;
+    	wire_spacing = wire_pitch[0][2] - wire_width;
+    	wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+    			wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+    	ild_thickness[0][2] = 1.5;
+    	miller_value[0][2] = 1.5;
+        horiz_dielectric_constant[0][2] = 2.709;
+        vert_dielectric_constant[0][2] = 3.9;
+        wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+    	//Conservative projections
+    	wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+    	aspect_ratio[1][0]= 2.0;
+    	wire_width = wire_pitch[1][0] / 2;
+    	wire_thickness = aspect_ratio[1][0] * wire_width;
+    	wire_spacing = wire_pitch[1][0] - wire_width;
+    	barrier_thickness = 0.017;
+    	dishing_thickness = 0;
+    	alpha_scatter = 1;
+    	wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+    			wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+    	ild_thickness[1][0] = 0.75;
+    	miller_value[1][0] = 1.5;
+    	horiz_dielectric_constant[1][0] = 3.038;
+    	vert_dielectric_constant[1][0] = 3.9;
+    	fringe_cap = 0.115e-15;
+        wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0],
+          fringe_cap);
+
+    	wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+    	wire_width = wire_pitch[1][1] / 2;
+    	aspect_ratio[1][1] = 2.0;
+    	wire_thickness = aspect_ratio[1][1] * wire_width;
+    	wire_spacing = wire_pitch[1][1] - wire_width;
+    	wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+    			wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+    	ild_thickness[1][1] = 0.75;
+    	miller_value[1][1] = 1.5;
+    	horiz_dielectric_constant[1][1] = 3.038;
+    	vert_dielectric_constant[1][1] = 3.9;
+        wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1],
+          fringe_cap);
+
+    	wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+    	aspect_ratio[1][2] = 2.2;
+    	wire_width = wire_pitch[1][2] / 2;
+    	wire_thickness = aspect_ratio[1][2] * wire_width;
+    	wire_spacing = wire_pitch[1][2] - wire_width;
+    	dishing_thickness = 0.1 *  wire_thickness;
+    	wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+    			wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+    	ild_thickness[1][2]  = 1.98;
+    	miller_value[1][2]  = 1.5;
+        horiz_dielectric_constant[1][2]  = 3.038;
+        vert_dielectric_constant[1][2]  = 3.9;
+        wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+    	//Nominal projections for commodity DRAM wordline/bitline
+    	wire_pitch[1][3] = 2 * 0.18;
+    	wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.18);
+    	wire_r_per_micron[1][3] = 12 / 0.18;
+    }
+    else if (tech == 90)
+    {
+      //Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//micron
+      aspect_ratio[0][0] = 2.4;
+      wire_width = wire_pitch[0][0] / 2; //micron
+      wire_thickness = aspect_ratio[0][0] * wire_width;//micron
+      wire_spacing = wire_pitch[0][0] - wire_width;//micron
+      barrier_thickness = 0.01;//micron
+      dishing_thickness = 0;//micron
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);//ohm/micron
+      ild_thickness[0][0] = 0.48;//micron
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 2.709;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15; //F/micron
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0],
+          fringe_cap);//F/micron.
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 2.4;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.48;//micron
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 2.709;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.7;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.96;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 2.709;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0]  = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.008;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0]  = 0.48;
+      miller_value[1][0]  = 1.5;
+      horiz_dielectric_constant[1][0]  = 3.038;
+      vert_dielectric_constant[1][0]  = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1]  = 0.48;
+      miller_value[1][1]  = 1.5;
+      horiz_dielectric_constant[1][1]  = 3.038;
+      vert_dielectric_constant[1][1]  = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2]  = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2]  = 1.1;
+      miller_value[1][2]  = 1.5;
+      horiz_dielectric_constant[1][2]  = 3.038;
+      vert_dielectric_constant[1][2]  = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2] , miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.09;
+      wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.09);
+      wire_r_per_micron[1][3] = 12 / 0.09;
+    }
+    else if (tech == 65)
+    {
+      //Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0]  = 2.7;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0]  * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0]  = 0.405;
+      miller_value[0][0]   = 1.5;
+      horiz_dielectric_constant[0][0]  = 2.303;
+      vert_dielectric_constant[0][0]   = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] ,
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1]  = 2.7;
+      wire_thickness = aspect_ratio[0][1]  * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1]  = 0.405;
+      miller_value[0][1]   = 1.5;
+      horiz_dielectric_constant[0][1]  = 2.303;
+      vert_dielectric_constant[0][1]   = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.8;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.81;
+      miller_value[0][2]   = 1.5;
+      horiz_dielectric_constant[0][2]  = 2.303;
+      vert_dielectric_constant[0][2]   = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.006;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.405;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.734;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.405;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.734;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.77;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.734;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.065;
+      wire_c_per_micron[1][3] = 52.5e-15 / (256 * 2 * 0.065);
+      wire_r_per_micron[1][3] = 12 / 0.065;
+    }
+    else if (tech == 45)
+    {
+      //Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0]  = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0]  * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0]  = 0.315;
+      miller_value[0][0]  = 1.5;
+      horiz_dielectric_constant[0][0]  = 1.958;
+      vert_dielectric_constant[0][0]  = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0] , miller_value[0][0] , horiz_dielectric_constant[0][0] , vert_dielectric_constant[0][0] ,
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1]  = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1]  = 0.315;
+      miller_value[0][1]  = 1.5;
+      horiz_dielectric_constant[0][1]  = 1.958;
+      vert_dielectric_constant[0][1]  = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.63;
+      miller_value[0][2]  = 1.5;
+      horiz_dielectric_constant[0][2]  = 1.958;
+      vert_dielectric_constant[0][2]  = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.004;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.315;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.46;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.315;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.46;
+      vert_dielectric_constant[1][1] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.55;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.46;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.045;
+      wire_c_per_micron[1][3] = 37.5e-15 / (256 * 2 * 0.045);
+      wire_r_per_micron[1][3] = 12 / 0.045;
+    }
+    else if (tech == 32)
+    {
+      //Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.21;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.664;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+          fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.21;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.664;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+          fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.42;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.664;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+          fringe_cap);
+
+      //Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.003;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.21;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.214;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+          fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      aspect_ratio[1][1] = 2.0;
+      wire_width = wire_pitch[1][1] / 2;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.21;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.214;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+          fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 *  wire_thickness;
+      wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+          wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.385;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.214;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+          ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+          fringe_cap);
+      //Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.032;//micron
+      wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.032);//F/micron
+      wire_r_per_micron[1][3] = 12 / 0.032;//ohm/micron
+    }
+    else if (tech == 22)
+        {
+          //Aggressive projections.
+          wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local
+          aspect_ratio[0][0] = 3.0;
+          wire_width = wire_pitch[0][0] / 2;
+          wire_thickness = aspect_ratio[0][0] * wire_width;
+          wire_spacing = wire_pitch[0][0] - wire_width;
+          barrier_thickness = 0;
+          dishing_thickness = 0;
+          alpha_scatter = 1;
+          wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][0] = 0.15;
+          miller_value[0][0] = 1.5;
+          horiz_dielectric_constant[0][0] = 1.414;
+          vert_dielectric_constant[0][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+            fringe_cap);
+
+          wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global
+          wire_width = wire_pitch[0][1] / 2;
+          aspect_ratio[0][1] = 3.0;
+          wire_thickness = aspect_ratio[0][1] * wire_width;
+          wire_spacing = wire_pitch[0][1] - wire_width;
+          wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][1] = 0.15;
+          miller_value[0][1] = 1.5;
+          horiz_dielectric_constant[0][1] = 1.414;
+          vert_dielectric_constant[0][1] = 3.9;
+          wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+            fringe_cap);
+
+          wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global
+          aspect_ratio[0][2] = 3.0;
+          wire_width = wire_pitch[0][2] / 2;
+          wire_thickness = aspect_ratio[0][2] * wire_width;
+          wire_spacing = wire_pitch[0][2] - wire_width;
+          wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][2] = 0.3;
+          miller_value[0][2] = 1.5;
+          horiz_dielectric_constant[0][2] = 1.414;
+          vert_dielectric_constant[0][2] = 3.9;
+          wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+        		  ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+        		  fringe_cap);
+
+//          //*************************
+//          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][4] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][4] - wire_width;
+//          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//        		  ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//        		  fringe_cap);
+//
+//          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][5] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][5] - wire_width;
+//          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//        		  ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//        		  fringe_cap);
+//
+//          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][6] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][6] - wire_width;
+//          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//        		  ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//        		  fringe_cap);
+          //*************************
+
+          //Conservative projections
+          wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+          aspect_ratio[1][0] = 2.0;
+          wire_width = wire_pitch[1][0] / 2;
+          wire_thickness = aspect_ratio[1][0] * wire_width;
+          wire_spacing = wire_pitch[1][0] - wire_width;
+          barrier_thickness = 0.003;
+          dishing_thickness = 0;
+          alpha_scatter = 1.05;
+          wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][0] = 0.15;
+          miller_value[1][0] = 1.5;
+          horiz_dielectric_constant[1][0] = 2.104;
+          vert_dielectric_constant[1][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+            fringe_cap);
+
+          wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+          wire_width = wire_pitch[1][1] / 2;
+          aspect_ratio[1][1] = 2.0;
+          wire_thickness = aspect_ratio[1][1] * wire_width;
+          wire_spacing = wire_pitch[1][1] - wire_width;
+          wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][1] = 0.15;
+          miller_value[1][1] = 1.5;
+          horiz_dielectric_constant[1][1] = 2.104;
+          vert_dielectric_constant[1][1] = 3.9;
+          wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+            fringe_cap);
+
+            wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+            aspect_ratio[1][2] = 2.2;
+            wire_width = wire_pitch[1][2] / 2;
+            wire_thickness = aspect_ratio[1][2] * wire_width;
+            wire_spacing = wire_pitch[1][2] - wire_width;
+            dishing_thickness = 0.1 *  wire_thickness;
+            wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+            ild_thickness[1][2] = 0.275;
+            miller_value[1][2] = 1.5;
+            horiz_dielectric_constant[1][2] = 2.104;
+            vert_dielectric_constant[1][2] = 3.9;
+            wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            		ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+            		fringe_cap);
+            //Nominal projections for commodity DRAM wordline/bitline
+            wire_pitch[1][3] = 2 * 0.022;//micron
+            wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.022);//F/micron
+            wire_r_per_micron[1][3] = 12 / 0.022;//ohm/micron
+
+            //******************
+//            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][4] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][4] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width,
+//            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//            		ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//            		fringe_cap);
+//
+//            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][5] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][5] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width,
+//            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//            		ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//            		fringe_cap);
+//
+//            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][6] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][6] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width,
+//            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//            		ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//            		fringe_cap);
+        }
+
+    else if (tech == 16)
+        {
+          //Aggressive projections.
+          wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;//local
+          aspect_ratio[0][0] = 3.0;
+          wire_width = wire_pitch[0][0] / 2;
+          wire_thickness = aspect_ratio[0][0] * wire_width;
+          wire_spacing = wire_pitch[0][0] - wire_width;
+          barrier_thickness = 0;
+          dishing_thickness = 0;
+          alpha_scatter = 1;
+          wire_r_per_micron[0][0] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][0] = 0.108;
+          miller_value[0][0] = 1.5;
+          horiz_dielectric_constant[0][0] = 1.202;
+          vert_dielectric_constant[0][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[0][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][0], miller_value[0][0], horiz_dielectric_constant[0][0], vert_dielectric_constant[0][0],
+            fringe_cap);
+
+          wire_pitch[0][1] = 4 * g_ip->F_sz_um;//semi-global
+          aspect_ratio[0][1] = 3.0;
+          wire_width = wire_pitch[0][1] / 2;
+          wire_thickness = aspect_ratio[0][1] * wire_width;
+          wire_spacing = wire_pitch[0][1] - wire_width;
+          wire_r_per_micron[0][1] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][1] = 0.108;
+          miller_value[0][1] = 1.5;
+          horiz_dielectric_constant[0][1] = 1.202;
+          vert_dielectric_constant[0][1] = 3.9;
+          wire_c_per_micron[0][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[0][1], miller_value[0][1], horiz_dielectric_constant[0][1], vert_dielectric_constant[0][1],
+            fringe_cap);
+
+          wire_pitch[0][2] = 8 * g_ip->F_sz_um;//global
+          aspect_ratio[0][2] = 3.0;
+          wire_width = wire_pitch[0][2] / 2;
+          wire_thickness = aspect_ratio[0][2] * wire_width;
+          wire_spacing = wire_pitch[0][2] - wire_width;
+          wire_r_per_micron[0][2] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[0][2] = 0.216;
+          miller_value[0][2] = 1.5;
+          horiz_dielectric_constant[0][2] = 1.202;
+          vert_dielectric_constant[0][2] = 3.9;
+          wire_c_per_micron[0][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+        		  ild_thickness[0][2], miller_value[0][2], horiz_dielectric_constant[0][2], vert_dielectric_constant[0][2],
+        		  fringe_cap);
+
+//          //*************************
+//          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][4] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][4] - wire_width;
+//          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//        		  ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//        		  fringe_cap);
+//
+//          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][5] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][5] - wire_width;
+//          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//        		  ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//        		  fringe_cap);
+//
+//          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+//          aspect_ratio = 3.0;
+//          wire_width = wire_pitch[0][6] / 2;
+//          wire_thickness = aspect_ratio * wire_width;
+//          wire_spacing = wire_pitch[0][6] - wire_width;
+//          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY, wire_width,
+//        		  wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//          ild_thickness = 0.3;
+//          wire_c_per_micron[0][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//        		  ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//        		  fringe_cap);
+          //*************************
+
+          //Conservative projections
+          wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+          aspect_ratio[1][0] = 2.0;
+          wire_width = wire_pitch[1][0] / 2;
+          wire_thickness = aspect_ratio[1][0] * wire_width;
+          wire_spacing = wire_pitch[1][0] - wire_width;
+          barrier_thickness = 0.002;
+          dishing_thickness = 0;
+          alpha_scatter = 1.05;
+          wire_r_per_micron[1][0] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][0] = 0.108;
+          miller_value[1][0] = 1.5;
+          horiz_dielectric_constant[1][0] = 1.998;
+          vert_dielectric_constant[1][0] = 3.9;
+          fringe_cap = 0.115e-15;
+          wire_c_per_micron[1][0] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][0], miller_value[1][0], horiz_dielectric_constant[1][0], vert_dielectric_constant[1][0],
+            fringe_cap);
+
+          wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+          wire_width = wire_pitch[1][1] / 2;
+          aspect_ratio[1][1] = 2.0;
+          wire_thickness = aspect_ratio[1][1] * wire_width;
+          wire_spacing = wire_pitch[1][1] - wire_width;
+          wire_r_per_micron[1][1] = wire_resistance(CU_RESISTIVITY, wire_width,
+            wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+          ild_thickness[1][1] = 0.108;
+          miller_value[1][1] = 1.5;
+          horiz_dielectric_constant[1][1] = 1.998;
+          vert_dielectric_constant[1][1] = 3.9;
+            wire_c_per_micron[1][1] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            ild_thickness[1][1], miller_value[1][1], horiz_dielectric_constant[1][1], vert_dielectric_constant[1][1],
+            fringe_cap);
+
+            wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+            aspect_ratio[1][2] = 2.2;
+            wire_width = wire_pitch[1][2] / 2;
+            wire_thickness = aspect_ratio[1][2] * wire_width;
+            wire_spacing = wire_pitch[1][2] - wire_width;
+            dishing_thickness = 0.1 *  wire_thickness;
+            wire_r_per_micron[1][2] = wire_resistance(CU_RESISTIVITY, wire_width,
+            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+            ild_thickness[1][2] = 0.198;
+            miller_value[1][2] = 1.5;
+            horiz_dielectric_constant[1][2] = 1.998;
+            vert_dielectric_constant[1][2] = 3.9;
+            wire_c_per_micron[1][2] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+            		ild_thickness[1][2], miller_value[1][2], horiz_dielectric_constant[1][2], vert_dielectric_constant[1][2],
+            		fringe_cap);
+            //Nominal projections for commodity DRAM wordline/bitline
+            wire_pitch[1][3] = 2 * 0.016;//micron
+            wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.016);//F/micron
+            wire_r_per_micron[1][3] = 12 / 0.016;//ohm/micron
+
+            //******************
+//            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][4] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][4] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY, wire_width,
+//            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][4] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//            		ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//            		fringe_cap);
+//
+//            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][5] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][5] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY, wire_width,
+//            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][5] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//            		ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//            		fringe_cap);
+//
+//            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+//            aspect_ratio = 2.2;
+//            wire_width = wire_pitch[1][6] / 2;
+//            wire_thickness = aspect_ratio * wire_width;
+//            wire_spacing = wire_pitch[1][6] - wire_width;
+//            dishing_thickness = 0.1 *  wire_thickness;
+//            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY, wire_width,
+//            		wire_thickness, barrier_thickness, dishing_thickness, alpha_scatter);
+//            ild_thickness = 0.275;
+//            wire_c_per_micron[1][6] = wire_capacitance(wire_width, wire_thickness, wire_spacing,
+//            		ild_thickness, miller_value, horiz_dielectric_constant, vert_dielectric_constant,
+//            		fringe_cap);
+        }
+    g_tp.wire_local.pitch    += curr_alpha * wire_pitch[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.R_per_um += curr_alpha * wire_r_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.C_per_um += curr_alpha * wire_c_per_micron[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+    g_tp.wire_local.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][(ram_cell_tech_type == comm_dram)?3:0];
+
+    g_tp.wire_inside_mat.pitch     += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.R_per_um  += curr_alpha* wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.C_per_um  += curr_alpha* wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+
+    g_tp.wire_outside_mat.pitch    += curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.R_per_um += curr_alpha*wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.C_per_um += curr_alpha*wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.aspect_ratio  += curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.ild_thickness += curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.miller_value   += curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.horiz_dielectric_constant += curr_alpha* horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.vert_dielectric_constant  += curr_alpha* vert_dielectric_constant [g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+
+    g_tp.unit_len_wire_del = g_tp.wire_inside_mat.R_per_um * g_tp.wire_inside_mat.C_per_um / 2;
+
+    g_tp.sense_delay               += curr_alpha *SENSE_AMP_D;
+    g_tp.sense_dy_power            += curr_alpha *SENSE_AMP_P;
+//    g_tp.horiz_dielectric_constant += horiz_dielectric_constant;
+//    g_tp.vert_dielectric_constant  += vert_dielectric_constant;
+//    g_tp.aspect_ratio              += aspect_ratio;
+//    g_tp.miller_value              += miller_value;
+//    g_tp.ild_thickness             += ild_thickness;
+
+  }
+  g_tp.fringe_cap = fringe_cap;
+
+  double rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  double p_to_n_sizing_r = pmos_to_nmos_sz_ratio();
+  double c_load = gate_C(g_tp.min_w_nmos_ * (1 + p_to_n_sizing_r), 0.0);
+  double tf = rd * c_load;
+  g_tp.kinv = horowitz(0, tf, 0.5, 0.5, RISE);
+  double KLOAD = 1;
+  c_load = KLOAD * (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+                    drain_C_(g_tp.min_w_nmos_ * p_to_n_sizing_r, PCH, 1, 1, g_tp.cell_h_def) +
+                    gate_C(g_tp.min_w_nmos_ * 4 * (1 + p_to_n_sizing_r), 0.0));
+  tf = rd * c_load;
+  g_tp.FO4 = horowitz(0, tf, 0.5, 0.5, RISE);
+}
+
diff --git a/src/gpuwattch/cacti/uca.cc b/src/gpuwattch/cacti/uca.cc
new file mode 100755
index 000000000..ed9be4993
--- /dev/null
+++ b/src/gpuwattch/cacti/uca.cc
@@ -0,0 +1,420 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#include <iostream>
+#include <math.h>
+
+#include "uca.h"
+
+
+// Constructs the array model for the organization described by dyn_p: picks
+// the port counts, sizes the per-bank address/data/search buses, builds the
+// array-level H-trees that route those buses to the banks, and finally
+// computes the delays and the power/energy figures.
+//
+// dyn_p: dynamic parameters of the organization; used to initialize dp.
+UCA::UCA(const DynamicParameter & dyn_p)
+ :dp(dyn_p), bank(dp), nbanks(g_ip->nbanks), refresh_power(0)
+{
+  // Split the banks between the vertical and horizontal directions. When
+  // log2(nbanks) is odd the extra factor of two goes to the vertical
+  // direction, unless the bank is taller than it is wide.
+  int num_banks_ver_dir = 1 << ((bank.area.h > bank.area.w) ? _log2(nbanks)/2 : (_log2(nbanks) - _log2(nbanks)/2));
+  int num_banks_hor_dir = nbanks/num_banks_ver_dir;
+
+  // Port counts come either from the dynamic parameters or from the global
+  // input parameters, depending on dp.use_inp_params.
+  if (dp.use_inp_params)
+  {
+    RWP  = dp.num_rw_ports;
+    ERP  = dp.num_rd_ports;
+    EWP  = dp.num_wr_ports;
+    SCHP = dp.num_search_ports;
+  }
+  else
+  {
+    RWP  = g_ip->num_rw_ports;
+    ERP  = g_ip->num_rd_ports;
+    EWP  = g_ip->num_wr_ports;
+    SCHP = g_ip->num_search_ports;
+  }
+
+  // Bus widths per bank: every port carries address bits, data-in is needed
+  // by the RW and write ports, data-out by the RW and read ports, and the
+  // search buses by the search ports.
+  num_addr_b_bank = (dp.number_addr_bits_mat + dp.number_subbanks_decode)*(RWP+ERP+EWP);
+  num_di_b_bank   = dp.num_di_b_bank_per_port * (RWP + EWP);
+  num_do_b_bank   = dp.num_do_b_bank_per_port * (RWP + ERP);
+  num_si_b_bank   = dp.num_si_b_bank_per_port * SCHP;
+  num_so_b_bank   = dp.num_so_b_bank_per_port * SCHP;
+
+  // Both dimensions handed to the H-tree constructors are twice the bank count.
+  const int htree_ver_dim = num_banks_ver_dir*2;
+  const int htree_hor_dim = num_banks_hor_dir*2;
+
+  if (!dp.fully_assoc && !dp.pure_cam)
+  {
+    // With fast access, a non-tag array's data-out bus is widened by the
+    // data associativity.
+    if (g_ip->fast_access && dp.is_tag == false)
+    {
+      num_do_b_bank *= g_ip->data_assoc;
+    }
+
+    // No search buses are routed in this case (search widths passed as 0).
+    htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
+        htree_ver_dim, htree_hor_dim, Add_htree, true);
+    htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
+        htree_ver_dim, htree_hor_dim, Data_in_htree, true);
+    htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, 0, num_do_b_bank, 0,
+        htree_ver_dim, htree_hor_dim, Data_out_htree, true);
+
+    // The search H-trees are not built here; give the pointers a defined
+    // (null) value instead of leaving them indeterminate.
+    htree_in_search  = 0;
+    htree_out_search = 0;
+  }
+  else
+  {
+    // Fully-associative or pure-CAM array: the search buses are routed as
+    // well, and two extra H-trees carry the search traffic.
+    htree_in_add   = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+        htree_ver_dim, htree_hor_dim, Add_htree, true);
+    htree_in_data  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+        htree_ver_dim, htree_hor_dim, Data_in_htree, true);
+    htree_out_data = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+        htree_ver_dim, htree_hor_dim, Data_out_htree, true);
+    htree_in_search  = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+        htree_ver_dim, htree_hor_dim, Data_in_htree, true);
+    htree_out_search = new Htree2(g_ip->wt, bank.area.w, bank.area.h,
+        num_addr_b_bank, num_di_b_bank, num_si_b_bank, num_do_b_bank, num_so_b_bank,
+        htree_ver_dim, htree_hor_dim, Data_out_htree, true);
+  }
+
+  // The array footprint is taken from the data-in H-tree.
+  area.w = htree_in_data->area.w;
+  area.h = htree_in_data->area.h;
+
+  area_all_dataramcells = bank.mat.subarray.get_total_cell_area() * dp.num_subarrays * g_ip->nbanks;
+//  cout<<"area cell"<<area_all_dataramcells<<endl;
+//  cout<<area.get_area()<<endl;
+  // delay calculation
+  double inrisetime = 0.0;
+  compute_delays(inrisetime);
+  compute_power_energy();
+}
+
+
+
+// Releases the H-tree networks allocated by the constructor.
+UCA::~UCA()
+{
+  delete htree_in_add;
+  delete htree_in_data;
+  delete htree_out_data;
+
+  // The search H-trees are allocated by the constructor only for
+  // fully-associative / pure-CAM arrays; the pointers are not assigned on the
+  // other path, so they are released under that same condition only.
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+    delete htree_in_search;
+    delete htree_out_search;
+  }
+}
+
+
+
+// Derives the timing figures of the array (access time, random cycle time,
+// multi-subbank interleave cycle time and precharge delay) from the bank
+// delays and the array-level H-tree delays.
+//
+// inrisetime: input rise time, forwarded to bank.compute_delays().
+// Returns the value produced by bank.compute_delays().
+double UCA::compute_delays(double inrisetime)
+{
+  const double bank_outrisetime = bank.compute_delays(inrisetime);
+
+  // Address path from the array edge to a mat: array-level H-tree followed
+  // by the bank-level H-tree.
+  const double addr_to_mat     = htree_in_add->delay + bank.htree_in_add->delay;
+  const double addr_to_row_dec = addr_to_mat + bank.mat.r_predec->delay;
+
+  delay_array_to_sa_mux_lev_1_decoder =
+    addr_to_mat + bank.mat.sa_mux_lev_1_predec->delay + bank.mat.sa_mux_lev_1_dec->delay;
+  delay_array_to_sa_mux_lev_2_decoder =
+    addr_to_mat + bank.mat.sa_mux_lev_2_predec->delay + bank.mat.sa_mux_lev_2_dec->delay;
+
+  const double mat_internal = bank.mat.row_dec->delay + bank.mat.delay_bitline + bank.mat.delay_sa;
+
+  // The subarray output driver waits for the slowest of four paths.
+  const double row_path = addr_to_row_dec + mat_internal;
+  const double col_path = addr_to_mat + bank.mat.b_mux_predec->delay +
+                          bank.mat.bit_mux_dec->delay + bank.mat.delay_sa;
+  delay_before_subarray_output_driver =
+    MAX(MAX(row_path, col_path),
+        MAX(delay_array_to_sa_mux_lev_1_decoder,    // sa_mux_lev_1_path
+            delay_array_to_sa_mux_lev_2_decoder));  // sa_mux_lev_2_path
+
+  delay_from_subarray_out_drv_to_out =
+    bank.mat.delay_subarray_out_drv_htree + bank.htree_out_data->delay + htree_out_data->delay;
+
+  // Provisional value only: both branches below assign access_time again.
+  access_time = bank.mat.delay_comparator;
+
+  if (!dp.fully_assoc)
+  {
+    access_time = delay_before_subarray_output_driver + delay_from_subarray_out_drv_to_out; //data_acc_path
+  }
+  else
+  {
+    // delay of FA contains both CAM tag and RAM data
+    const double cam_mat_delay = bank.mat.delay_bitline + bank.mat.delay_matchchline; // delay of CAM
+    access_time  = htree_in_add->delay + bank.htree_in_add->delay;
+    // delay of fully-associative data array
+    access_time += cam_mat_delay + delay_from_subarray_out_drv_to_out;
+  }
+
+  if (dp.is_main_mem)
+  {
+    // For main memory the access time is the sum of t_rcd and the CAS latency.
+    const double t_rcd       = addr_to_row_dec + mat_internal;
+    const double cas_latency = MAX(delay_array_to_sa_mux_lev_1_decoder, delay_array_to_sa_mux_lev_2_decoder) +
+                               delay_from_subarray_out_drv_to_out;
+    access_time = t_rcd + cas_latency;
+  }
+
+  // Random cycle time: start from the mat-internal work plus the reset and
+  // restore phases, then clamp from below by the predecoder delays.
+  double random_cycle;
+  if (dp.fully_assoc)
+  {
+    const double cam_mat_delay = bank.mat.delay_bitline + bank.mat.delay_matchchline;
+    random_cycle = cam_mat_delay + bank.mat.delay_cam_sl_restore + bank.mat.delay_cam_ml_reset + bank.mat.delay_bl_restore
+                   + bank.mat.delay_hit_miss_reset + bank.mat.delay_wl_reset;
+    //TODO: Sheng revisit whether distinguish cam and ram bitline etc.
+  }
+  else
+  {
+    random_cycle = mat_internal + bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;//TODO: Sheng: revisit
+    if (dp.is_dram)
+    {
+      random_cycle += bank.mat.delay_writeback;
+    }
+    // The row predecoder only bounds the cycle on this (non fully-associative) path.
+    random_cycle = MAX(random_cycle, bank.mat.r_predec->delay);
+  }
+  random_cycle = MAX(random_cycle, bank.mat.b_mux_predec->delay);
+  random_cycle = MAX(random_cycle, bank.mat.sa_mux_lev_1_predec->delay);
+  random_cycle = MAX(random_cycle, bank.mat.sa_mux_lev_2_predec->delay);
+
+  // Applies only when the input parameter "repeaters_in_htree" is set to false --Nav
+  if (!g_ip->rpters_in_htree)
+  {
+    random_cycle = MAX(random_cycle, bank.htree_in_add->max_unpipelined_link_delay);
+  }
+  cycle_time = random_cycle;
+
+  if (dp.is_main_mem)
+  {
+    multisubbank_interleave_cycle_time = htree_in_add->delay;
+    precharge_delay = htree_in_add->delay +
+                      bank.htree_in_add->delay + bank.mat.delay_writeback +
+                      bank.mat.delay_wl_reset + bank.mat.delay_bl_restore;
+    // Main memory: the cycle time computed above is replaced.
+    cycle_time = access_time + precharge_delay;
+  }
+  else
+  {
+    // Interleaving is bounded by the slower of the request and reply networks.
+    const double delay_req_network = addr_to_row_dec;
+    const double delay_rep_network = delay_from_subarray_out_drv_to_out;
+    multisubbank_interleave_cycle_time = MAX(delay_req_network, delay_rep_network);
+    precharge_delay = 0;
+  }
+
+  return bank_outrisetime;
+}
+
+
+
+// note: currently, power numbers are for a bank of an array
+void UCA::compute_power_energy()
+{
+  bank.compute_power_energy();
+  power = bank.power;
+
+  power_routing_to_bank.readOp.dynamic  = htree_in_add->power.readOp.dynamic + htree_out_data->power.readOp.dynamic;
+  power_routing_to_bank.writeOp.dynamic = htree_in_add->power.readOp.dynamic + htree_in_data->power.readOp.dynamic;
+  if (dp.fully_assoc || dp.pure_cam)
+      power_routing_to_bank.searchOp.dynamic= htree_in_search->power.searchOp.dynamic + htree_out_search->power.searchOp.dynamic;
+
+  power_routing_to_bank.readOp.leakage += htree_in_add->power.readOp.leakage +
+                                          htree_in_data->power.readOp.leakage +
+                                          htree_out_data->power.readOp.leakage;
+
+  power_routing_to_bank.readOp.gate_leakage += htree_in_add->power.readOp.gate_leakage +
+                                          htree_in_data->power.readOp.gate_leakage +
+                                          htree_out_data->power.readOp.gate_leakage;
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+	power_routing_to_bank.readOp.leakage += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
+	power_routing_to_bank.readOp.gate_leakage += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
+  }
+
+  power.searchOp.dynamic += power_routing_to_bank.searchOp.dynamic;
+  power.readOp.dynamic += power_routing_to_bank.readOp.dynamic;
+  power.readOp.leakage += power_routing_to_bank.readOp.leakage;
+  power.readOp.gate_leakage += power_routing_to_bank.readOp.gate_leakage;
+
+  // calculate total write energy per access
+  power.writeOp.dynamic = power.readOp.dynamic
+                        - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
+                        + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
+                        - power_routing_to_bank.readOp.dynamic
+                        + power_routing_to_bank.writeOp.dynamic
+                        + bank.htree_in_data->power.readOp.dynamic
+                        - bank.htree_out_data->power.readOp.dynamic;
+
+  if (dp.is_dram == false)
+  {
+    power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
+  }
+
+  dyn_read_energy_from_closed_page = power.readOp.dynamic;
+  dyn_read_energy_from_open_page   = power.readOp.dynamic -
+                                     (bank.mat.r_predec->power.readOp.dynamic +
+                                      bank.mat.power_row_decoders.readOp.dynamic +
+                                      bank.mat.power_bl_precharge_eq_drv.readOp.dynamic +
+                                      bank.mat.power_sa.readOp.dynamic +
+                                      bank.mat.power_bitline.readOp.dynamic) * dp.num_act_mats_hor_dir;
+
+  dyn_read_energy_remaining_words_in_burst =
+    (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1) *
+    ((bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic +
+      bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic +
+      bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
+      bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
+      bank.mat.power_subarray_out_drv.readOp.dynamic)     * dp.num_act_mats_hor_dir +
+     bank.htree_out_data->power.readOp.dynamic +
+     power_routing_to_bank.readOp.dynamic);
+  dyn_read_energy_from_closed_page += dyn_read_energy_remaining_words_in_burst;
+  dyn_read_energy_from_open_page   += dyn_read_energy_remaining_words_in_burst;
+
+  activate_energy = htree_in_add->power.readOp.dynamic +
+                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_act +
+                    (bank.mat.r_predec->power.readOp.dynamic +
+                     bank.mat.power_row_decoders.readOp.dynamic +
+                     bank.mat.power_sa.readOp.dynamic) * dp.num_act_mats_hor_dir;
+  read_energy    = (htree_in_add->power.readOp.dynamic +
+                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
+                    (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic  +
+                     bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic  +
+                     bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
+                     bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic +
+                     bank.mat.power_subarray_out_drv.readOp.dynamic) * dp.num_act_mats_hor_dir +
+                    bank.htree_out_data->power.readOp.dynamic +
+                    htree_in_data->power.readOp.dynamic) * g_ip->burst_len;
+  write_energy   = (htree_in_add->power.readOp.dynamic +
+                    bank.htree_in_add->power_bit.readOp.dynamic * bank.num_addr_b_routed_to_mat_for_rd_or_wr +
+                    htree_in_data->power.readOp.dynamic +
+                    bank.htree_in_data->power.readOp.dynamic +
+                    (bank.mat.sa_mux_lev_1_predec->power.readOp.dynamic  +
+                     bank.mat.sa_mux_lev_2_predec->power.readOp.dynamic  +
+                     bank.mat.power_sa_mux_lev_1_decoders.readOp.dynamic +
+                     bank.mat.power_sa_mux_lev_2_decoders.readOp.dynamic) * dp.num_act_mats_hor_dir) * g_ip->burst_len;
+  precharge_energy = (bank.mat.power_bitline.readOp.dynamic +
+                      bank.mat.power_bl_precharge_eq_drv.readOp.dynamic) * dp.num_act_mats_hor_dir;
+
+  leak_power_subbank_closed_page =
+    (bank.mat.r_predec->power.readOp.leakage +
+     bank.mat.b_mux_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
+     bank.mat.power_row_decoders.readOp.leakage +
+     bank.mat.power_bit_mux_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
+     bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
+
+  leak_power_subbank_closed_page +=
+    (bank.mat.r_predec->power.readOp.gate_leakage +
+     bank.mat.b_mux_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
+     bank.mat.power_row_decoders.readOp.gate_leakage +
+     bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage) * dp.num_act_mats_hor_dir; //+
+     //bank.mat.leak_power_sense_amps_closed_page_state) * dp.num_act_mats_hor_dir;
+
+  leak_power_subbank_open_page =
+    (bank.mat.r_predec->power.readOp.leakage +
+     bank.mat.b_mux_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.leakage +
+     bank.mat.power_row_decoders.readOp.leakage +
+     bank.mat.power_bit_mux_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.leakage +
+     bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
+
+  leak_power_subbank_open_page +=
+    (bank.mat.r_predec->power.readOp.gate_leakage +
+     bank.mat.b_mux_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_1_predec->power.readOp.gate_leakage +
+     bank.mat.sa_mux_lev_2_predec->power.readOp.gate_leakage +
+     bank.mat.power_row_decoders.readOp.gate_leakage +
+     bank.mat.power_bit_mux_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_1_decoders.readOp.gate_leakage +
+     bank.mat.power_sa_mux_lev_2_decoders.readOp.gate_leakage ) * dp.num_act_mats_hor_dir;
+     //bank.mat.leak_power_sense_amps_open_page_state) * dp.num_act_mats_hor_dir;
+
+  leak_power_request_and_reply_networks =
+    power_routing_to_bank.readOp.leakage +
+    bank.htree_in_add->power.readOp.leakage +
+    bank.htree_in_data->power.readOp.leakage +
+    bank.htree_out_data->power.readOp.leakage;
+
+  leak_power_request_and_reply_networks +=
+    power_routing_to_bank.readOp.gate_leakage +
+    bank.htree_in_add->power.readOp.gate_leakage +
+    bank.htree_in_data->power.readOp.gate_leakage +
+    bank.htree_out_data->power.readOp.gate_leakage;
+
+  if (dp.fully_assoc || dp.pure_cam)
+  {
+	leak_power_request_and_reply_networks += htree_in_search->power.readOp.leakage + htree_out_search->power.readOp.leakage;
+	leak_power_request_and_reply_networks += htree_in_search->power.readOp.gate_leakage + htree_out_search->power.readOp.gate_leakage;
+  }
+
+
+  if (dp.is_dram)
+  { // if DRAM, add contribution of power spent in row predecoder drivers, blocks and decoders to refresh power
+    refresh_power  = (bank.mat.r_predec->power.readOp.dynamic * dp.num_act_mats_hor_dir +
+                      bank.mat.row_dec->power.readOp.dynamic) * dp.num_r_subarray * dp.num_subarrays;
+    refresh_power += bank.mat.per_bitline_read_energy * dp.num_c_subarray * dp.num_r_subarray * dp.num_subarrays;
+    refresh_power += bank.mat.power_bl_precharge_eq_drv.readOp.dynamic * dp.num_act_mats_hor_dir;
+    refresh_power += bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
+    refresh_power /= dp.dram_refresh_period;
+  }
+
+
+  if (dp.is_tag == false)
+  {
+    power.readOp.dynamic  = dyn_read_energy_from_closed_page;
+    power.writeOp.dynamic = dyn_read_energy_from_closed_page
+      - dyn_read_energy_remaining_words_in_burst
+      - bank.mat.power_bitline.readOp.dynamic * dp.num_act_mats_hor_dir
+      + bank.mat.power_bitline.writeOp.dynamic * dp.num_act_mats_hor_dir
+      + (power_routing_to_bank.writeOp.dynamic -
+         power_routing_to_bank.readOp.dynamic -
+         bank.htree_out_data->power.readOp.dynamic +
+         bank.htree_in_data->power.readOp.dynamic) *
+        (MAX((g_ip->burst_len / g_ip->int_prefetch_w), 1) - 1); //FIXME
+
+    if (dp.is_dram == false)
+    {
+      power.writeOp.dynamic -= bank.mat.power_sa.readOp.dynamic * dp.num_act_mats_hor_dir;
+    }
+  }
+
+  // if DRAM, add refresh power to total leakage
+  if (dp.is_dram)
+  {
+    power.readOp.leakage += refresh_power;
+  }
+
+  // TODO: below should be  avoided.
+  /*if (dp.is_main_mem)
+  {
+    power.readOp.leakage += MAIN_MEM_PER_CHIP_STANDBY_CURRENT_mA * 1e-3 * g_tp.peri_global.Vdd / g_ip->nbanks;
+  }*/
+
+  assert(power.readOp.dynamic  > 0);
+  assert(power.writeOp.dynamic > 0);
+  assert(power.readOp.leakage  > 0);
+}
+
diff --git a/src/gpuwattch/cacti/uca.h b/src/gpuwattch/cacti/uca.h
new file mode 100755
index 000000000..93ae3c5a9
--- /dev/null
+++ b/src/gpuwattch/cacti/uca.h
@@ -0,0 +1,96 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __UCA_H__
+#define __UCA_H__
+
+#include "area.h"
+#include "bank.h"
+#include "component.h"
+#include "parameter.h"
+#include "htree2.h"
+
+
+class UCA : public Component  // Component combining a Bank, its H-tree pointers and the derived energy/leakage/delay figures
+{
+  public:
+    UCA(const DynamicParameter & dyn_p);
+    ~UCA();
+    double compute_delays(double inrisetime);  // returns outrisetime
+    void   compute_power_energy();             // fills power.* and the energy/leakage members declared below
+    // parameter set and bank sub-model; compute_power_energy() reads dp.* flags and bank.mat / bank.htree_* values
+    DynamicParameter dp;
+    Bank   bank;
+    // H-tree objects held by raw pointer -- NOTE(review): presumably created in the constructor and released in ~UCA(); confirm in uca.cc
+    Htree2   * htree_in_add;
+    Htree2   * htree_in_data;
+    Htree2   * htree_out_data;
+    Htree2   * htree_in_search;   // dereferenced by the visible part of compute_power_energy() only when dp.fully_assoc || dp.pure_cam
+    Htree2   * htree_out_search;  // same guard as htree_in_search
+    // routing contribution that compute_power_energy() adds into power.searchOp / power.readOp
+    powerDef power_routing_to_bank;
+
+    uint32_t nbanks;
+    // NOTE(review): the counters below look like per-bank bit widths and port counts -- inferred from the names only; confirm against uca.cc
+    int   num_addr_b_bank;
+    int   num_di_b_bank;
+    int   num_do_b_bank;
+    int   num_si_b_bank;
+    int   num_so_b_bank;
+    int   RWP, ERP, EWP,SCHP;
+    double area_all_dataramcells;
+    // dynamic read energies assigned by compute_power_energy()
+    double dyn_read_energy_from_closed_page;
+    double dyn_read_energy_from_open_page;
+    double dyn_read_energy_remaining_words_in_burst;
+    // per-operation energies and leakage totals assigned by compute_power_energy()
+    double refresh_power;  // only for DRAM (computed when dp.is_dram, then added to power.readOp.leakage)
+    double activate_energy;
+    double read_energy;
+    double write_energy;
+    double precharge_energy;
+    double leak_power_subbank_closed_page;
+    double leak_power_subbank_open_page;
+    double leak_power_request_and_reply_networks;
+    // NOTE(review): presumably written by compute_delays(); that function is not visible from this header -- confirm
+    double delay_array_to_sa_mux_lev_1_decoder;
+    double delay_array_to_sa_mux_lev_2_decoder;
+    double delay_before_subarray_output_driver;
+    double delay_from_subarray_out_drv_to_out;
+    double access_time;
+    double precharge_delay;
+    double multisubbank_interleave_cycle_time;
+};
+
+#endif
+
diff --git a/src/gpuwattch/cacti/wire.cc b/src/gpuwattch/cacti/wire.cc
new file mode 100644
index 000000000..3da3e849a
--- /dev/null
+++ b/src/gpuwattch/cacti/wire.cc
@@ -0,0 +1,833 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "wire.h"
+#include "cmath"
+// use this constructor to calculate wire stats: builds a wire of type wire_model and length wl via calculate_wire_stats()
+Wire::Wire(
+    enum Wire_type wire_model,
+    double wl,  // length; stored as wl*1e-6 for the calculation and multiplied back by 1e6 before returning
+    int n,      // stored in nsense; multiplies sense_amp_input_cap() in low_swing_model()
+    double w_s, // width scale factor (w_scale)
+    double s_s, // spacing scale factor (s_scale)
+    enum Wire_placement wp,
+    double _resistivity,
+    TechnologyParameter::DeviceType *dt  // dereferenced below, so it must not be null
+    ):wt(wire_model), wire_length(wl*1e-6), nsense(n), w_scale(w_s), s_scale(s_s),
+    resistivity(_resistivity), deviceType(dt)
+{
+  // in_rise_time == 0 makes low_swing_model() fall back to signal_rise_time() for its input ramp.
+  wire_placement = wp;
+  min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio*g_tp.min_w_nmos_;
+  in_rise_time   = 0;
+  out_rise_time  = 0;
+  if (initialized != 1) {  // static flag has not yet been set by the other constructor
+    cout << "Wire not initialized. Initializing it with default values\n";
+    Wire winit;  // NOTE(review): presumably resolves to the (w_s, s_s, wp, resis, dt) constructor through default arguments in wire.h -- confirm
+  }
+  calculate_wire_stats();
+  // change everything back to seconds, microns, and Joules
+  repeater_spacing *= 1e6;
+  wire_length      *= 1e6;
+  wire_width       *= 1e6;
+  wire_spacing     *= 1e6;
+  assert(wire_length > 0);
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+}
+
+    // the following values are for peripheral global technology
+    // specified in the input config file
+    Component Wire::global;     // calculate_wire_stats() multiplies its delay/power by wire_length and reads area.w / area.h as repeater spacing / size
+    Component Wire::global_5;   // used the same way for wt == Global_5
+    Component Wire::global_10;  // used the same way for wt == Global_10
+    Component Wire::global_20;  // used the same way for wt == Global_20
+    Component Wire::global_30;  // used the same way for wt == Global_30
+    Component Wire::low_swing;
+    // the three statics below are written by the (w_s, s_s, wp, resis, dt) constructor
+    int Wire::initialized;            // zero-initialized; set to 1 by that constructor and tested by the other one
+    double Wire::wire_width_init;     // wire_width as left after init_wire()
+    double Wire::wire_spacing_init;   // wire_spacing as left after init_wire()
+
+
+Wire::Wire(double w_s, double s_s, enum Wire_placement wp, double resis, TechnologyParameter::DeviceType *dt)
+{
+  // Record the caller's scaling, placement, resistivity and device inputs.
+  deviceType     = dt;
+  resistivity    = resis;
+  wire_placement = wp;
+  s_scale        = s_s;
+  w_scale        = w_s;
+  out_rise_time  = 0;
+  in_rise_time   = 0;
+  min_w_pmos     = deviceType->n_to_p_eff_curr_drv_ratio * g_tp.min_w_nmos_;
+
+  // Pitch of the wire class selected by the placement.
+  double pitch;
+  if (wire_placement == outside_mat)     pitch = g_tp.wire_outside_mat.pitch;
+  else if (wire_placement == inside_mat) pitch = g_tp.wire_inside_mat.pitch;
+  else                                   pitch = g_tp.wire_local.pitch;
+
+  // Width and spacing both start from that pitch and are scaled separately.
+  wire_width   = pitch * (w_scale * 1e-6/2) /* (m) */;
+  wire_spacing = pitch * (s_scale * 1e-6/2) /* (m) */;
+
+  // Set the static flag before init_wire() runs, then publish the geometry used.
+  initialized = 1;
+  init_wire();
+  wire_spacing_init = wire_spacing;
+  wire_width_init   = wire_width;
+
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+}
+
+
+
+Wire::~Wire()  // intentionally empty: nothing is released here (deviceType is not freed)
+{
+}
+
+
+
+void
+Wire::calculate_wire_stats()
+{
+  // Sets width/spacing from the placement, then fills delay, power, area and repeater data for wire type wt.
+  if (wire_placement == outside_mat) {
+    wire_width = g_tp.wire_outside_mat.pitch;
+  }
+  else if (wire_placement == inside_mat) {
+    wire_width = g_tp.wire_inside_mat.pitch;
+  }
+  else {
+    wire_width = g_tp.wire_local.pitch;
+  }
+  // spacing starts from the same pitch; width and spacing are then scaled independently
+  wire_spacing = wire_width;
+
+  wire_width   *= (w_scale * 1e-6/2) /* (m) */;
+  wire_spacing *= (s_scale * 1e-6/2) /* (m) */;
+
+  // Non-low-swing types: multiply the matching static reference wire's delay/power by wire_length;
+  if (wt != Low_swing) {
+	  // its area.w / area.h fields are read as repeater spacing / repeater size.
+	  //    delay_optimal_wire();
+
+	  if (wt == Global) {
+		  delay = global.delay * wire_length;
+		  power.readOp.dynamic = global.power.readOp.dynamic * wire_length;
+		  power.readOp.leakage = global.power.readOp.leakage * wire_length;
+		  power.readOp.gate_leakage = global.power.readOp.gate_leakage * wire_length;
+		  repeater_spacing = global.area.w;
+		  repeater_size = global.area.h;
+		  area.set_area((wire_length/repeater_spacing) *
+				  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+						  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+	  }
+	  else if (wt == Global_5) {
+		  delay = global_5.delay * wire_length;
+		  power.readOp.dynamic = global_5.power.readOp.dynamic * wire_length;
+		  power.readOp.leakage = global_5.power.readOp.leakage * wire_length;
+		  power.readOp.gate_leakage = global_5.power.readOp.gate_leakage * wire_length;
+		  repeater_spacing = global_5.area.w;
+		  repeater_size = global_5.area.h;
+		  area.set_area((wire_length/repeater_spacing) *
+				  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+						  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+	  }
+	  else if (wt == Global_10) {
+		  delay = global_10.delay * wire_length;
+		  power.readOp.dynamic = global_10.power.readOp.dynamic * wire_length;
+		  power.readOp.leakage = global_10.power.readOp.leakage * wire_length;
+		  power.readOp.gate_leakage = global_10.power.readOp.gate_leakage * wire_length;
+		  repeater_spacing = global_10.area.w;
+		  repeater_size = global_10.area.h;
+		  area.set_area((wire_length/repeater_spacing) *
+				  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+						  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+	  }
+	  else if (wt == Global_20) {
+		  delay = global_20.delay * wire_length;
+		  power.readOp.dynamic = global_20.power.readOp.dynamic * wire_length;
+		  power.readOp.leakage = global_20.power.readOp.leakage * wire_length;
+		  power.readOp.gate_leakage = global_20.power.readOp.gate_leakage * wire_length;
+		  repeater_spacing = global_20.area.w;
+		  repeater_size = global_20.area.h;
+		  area.set_area((wire_length/repeater_spacing) *
+				  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+						  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+	  }
+	  else if (wt == Global_30) {
+		  delay = global_30.delay * wire_length;
+		  power.readOp.dynamic = global_30.power.readOp.dynamic * wire_length;
+		  power.readOp.leakage = global_30.power.readOp.leakage * wire_length;
+		  power.readOp.gate_leakage = global_30.power.readOp.gate_leakage * wire_length;
+		  repeater_spacing = global_30.area.w;
+		  repeater_size = global_30.area.h;
+		  area.set_area((wire_length/repeater_spacing) *
+				  compute_gate_area(INV, 1, min_w_pmos * repeater_size,
+						  g_tp.min_w_nmos_ * repeater_size, g_tp.cell_h_def));
+	  }
+    out_rise_time = delay*repeater_spacing/deviceType->Vth;  // NOTE(review): for a wt outside the five Global variants nothing above assigned delay/repeater_spacing -- confirm such types never get here
+  }
+  else if (wt == Low_swing) {
+    low_swing_model ();
+    repeater_spacing = wire_length;
+    repeater_size = 1;
+  }
+  else {
+    assert(0);  // unreachable: the two conditions above are complementary
+  }
+}
+
+
+
+/*
+ * The fall time of an input signal to the first stage of a circuit is
+ * assumed to be same as the fall time of the output signal of two
+ * inverters connected in series (refer: CACTI 1 Technical report,
+ * section 6.1.3)
+ */
+  double
+Wire::signal_fall_time ()
+{
+  // Load on each min-sized inverter: both drain capacitances plus the next gate.
+  const double stage_cap =
+      drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+      gate_C(min_w_pmos + g_tp.min_w_nmos_, 0);
+  // Vth/Vdd ratio, passed twice to each horowitz() call.
+  const double vth_ratio = deviceType->Vth/deviceType->Vdd;
+
+  // Stage 1: time constant with the PMOS on-resistance; result divided by (Vdd - Vth).
+  const double pull_up_tc = stage_cap * tr_R_on(min_w_pmos, PCH, 1);
+  const double stage1_ramp = horowitz (0, pull_up_tc, vth_ratio, vth_ratio, FALL) /
+    (deviceType->Vdd - deviceType->Vth);
+
+  // Stage 2: time constant with the NMOS on-resistance, fed by stage 1's ramp; result divided by Vth.
+  const double pull_down_tc = stage_cap * tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  const double stage2_ramp = horowitz (stage1_ramp, pull_down_tc, vth_ratio, vth_ratio, RISE) /
+    deviceType->Vth;
+  return stage2_ramp;
+}
+
+
+
+double Wire::signal_rise_time ()
+{
+  // Load on each min-sized inverter: both drain capacitances plus the next gate.
+  const double stage_cap =
+      drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+      gate_C(min_w_pmos + g_tp.min_w_nmos_, 0);
+  // Vth/Vdd ratio, passed twice to each horowitz() call.
+  const double vth_ratio = deviceType->Vth/deviceType->Vdd;
+
+  // Stage 1: time constant with the NMOS on-resistance; result divided by Vth.
+  const double pull_down_tc = stage_cap * tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  const double stage1_ramp = horowitz (0, pull_down_tc, vth_ratio, vth_ratio, RISE) /
+    deviceType->Vth;
+
+  // Stage 2: time constant with the PMOS on-resistance, fed by stage 1's ramp; result divided by (Vdd - Vth).
+  const double pull_up_tc = stage_cap * tr_R_on(min_w_pmos, PCH, 1);
+  const double stage2_ramp = horowitz (stage1_ramp, pull_up_tc, vth_ratio, vth_ratio, FALL) /
+    (deviceType->Vdd - deviceType->Vth);
+  return stage2_ramp; //sec
+}
+
+
+
+/* Wire resistance and capacitance calculations
+ *   wire width
+ *
+ *    /__/
+ *   |  |
+ *   |  |  height = ASPECT_RATIO*wire width (ASPECT_RATIO = 2.2, ref: ITRS)
+ *   |__|/
+ *
+ *   spacing between wires in same level = wire width
+ *   spacing between wires in adjacent levels = wire width---this is incorrect,
+ *   according to R.Ho's paper and thesis. ILD != wire width
+ *
+ */
+
+double Wire::wire_cap (double len /* in m */, bool call_from_outside)
+{
+  // Capacitance (F) of a wire of length len (m): per-metre sidewall +
+  // adjacent-level + fringe capacitance, multiplied by len.
+  //
+  // call_from_outside selects the unit of the stored geometry: when true,
+  // wire_width / wire_spacing are taken to be in the 1e6-scaled form and are
+  // converted with 1e-6; when false they are used as stored.  The conversion
+  // is done on local copies: scaling the members by 1e-6 and back by 1e6 (as
+  // this function used to) does not always restore the exact double, so
+  // repeated outside calls could drift the stored width and spacing.
+  //TODO: this should be consistent with the wire_res in technology file
+  double sidewall, adj, tot_cap;
+  double wire_height;
+  double epsilon0 = 8.8542e-12;
+  double aspect_ratio, horiz_dielectric_constant, vert_dielectric_constant, miller_value,ild_thickness;
+
+  switch (wire_placement)
+  {
+    case outside_mat:
+      {
+        aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
+        horiz_dielectric_constant = g_tp.wire_outside_mat.horiz_dielectric_constant;
+        vert_dielectric_constant = g_tp.wire_outside_mat.vert_dielectric_constant;
+        miller_value = g_tp.wire_outside_mat.miller_value;
+        ild_thickness = g_tp.wire_outside_mat.ild_thickness;
+        break;
+      }
+    case inside_mat :
+      {
+        aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
+        horiz_dielectric_constant = g_tp.wire_inside_mat.horiz_dielectric_constant;
+        vert_dielectric_constant = g_tp.wire_inside_mat.vert_dielectric_constant;
+        miller_value = g_tp.wire_inside_mat.miller_value;
+        ild_thickness = g_tp.wire_inside_mat.ild_thickness;
+        break;
+      }
+    default:
+      {
+        aspect_ratio = g_tp.wire_local.aspect_ratio;
+        horiz_dielectric_constant = g_tp.wire_local.horiz_dielectric_constant;
+        vert_dielectric_constant = g_tp.wire_local.vert_dielectric_constant;
+        miller_value = g_tp.wire_local.miller_value;
+        ild_thickness = g_tp.wire_local.ild_thickness;
+        break;
+      }
+  }
+
+  const double width   = call_from_outside ? wire_width   * 1e-6 : wire_width;
+  const double spacing = call_from_outside ? wire_spacing * 1e-6 : wire_spacing;
+
+  wire_height = width/w_scale*aspect_ratio;
+  /*
+   * assuming height does not change. wire_width = width_original*w_scale
+   * So wire_height does not change as wire width increases
+   */
+
+// capacitance between wires in the same level
+//  sidewall = 2*miller_value * horiz_dielectric_constant * (wire_height/wire_spacing)
+//    * epsilon0;
+
+  sidewall = miller_value * horiz_dielectric_constant * (wire_height/spacing)
+    * epsilon0;
+
+  // capacitance between wires in adjacent levels
+  //adj = miller_value * vert_dielectric_constant *w_scale * epsilon0;
+  //adj = 2*vert_dielectric_constant *wire_width/(ild_thickness*1e-6) * epsilon0;
+
+  adj = miller_value *vert_dielectric_constant *width/(ild_thickness*1e-6) * epsilon0;
+  //Change ild_thickness from micron to M
+
+  //tot_cap =  (sidewall + adj + (deviceType->C_fringe * 1e6)); //F/m
+  tot_cap =  (sidewall + adj + (g_tp.fringe_cap * 1e6)); //F/m
+  return (tot_cap*len); // (F)
+}
+
+
+  double
+Wire::wire_res (double len /*(in m)*/)
+{
+
+	  double aspect_ratio,alpha_scatter =1.05, dishing_thickness=0, barrier_thickness=0;
+	  //TODO: this should be consistent with the wire_res in technology file
+	  //The whole computation should be consistent with the wire_res in technology.cc too!
+
+	  switch (wire_placement)
+	  {
+	  case outside_mat:
+	  {
+		  aspect_ratio = g_tp.wire_outside_mat.aspect_ratio;
+		  break;
+	  }
+	  case inside_mat :
+	  {
+		  aspect_ratio = g_tp.wire_inside_mat.aspect_ratio;
+		  break;
+	  }
+	  default:
+	  {
+		  aspect_ratio = g_tp.wire_local.aspect_ratio;
+		  break;
+	  }
+	  }
+	  return (alpha_scatter * resistivity * 1e-6 * len/((aspect_ratio*wire_width/w_scale-dishing_thickness - barrier_thickness)*
+			  (wire_width-2*barrier_thickness)));
+}
+
+/*
+ * Calculates the delay, power and area of the transmitter circuit.
+ *
+ * The transmitter delay is the sum of nand gate delay, inverter delay
+ * low swing nmos delay, and the wire delay
+ * (ref: Technical report 6)
+ */
+  void
+Wire::low_swing_model()
+{
+  double len = wire_length;
+  double beta = pmos_to_nmos_sz_ratio();
+  // Fills delay and power.readOp for the low-swing wire and records the
+  // per-stage figures in transmitter, l_wire and sense_amp.
+  double inputrise = (in_rise_time == 0) ? signal_rise_time() : in_rise_time;
+
+  /* Final nmos low swing driver size calculation:
+   * Try to size the driver such that the delay
+   * is less than 8FO4.
+   * If the driver size is greater than
+   * the max allowable size, assume max size for the driver.
+   * In either case, recalculate the delay using
+   * the final driver size assuming slow input with
+   * finite rise time instead of ideal step input
+   *
+   * (ref: Technical report 6)
+   */
+  double cwire = wire_cap(len); /* load capacitance */
+  double rwire = wire_res(len);
+
+#define RES_ADJ (8.6) // Increase in resistance due to low driving vol.
+
+  double driver_res = (-8*g_tp.FO4/(log(0.5) * cwire))/RES_ADJ;
+  double nsize = R_to_w(driver_res, NCH);
+
+  nsize = MIN(nsize, g_tp.max_w_nmos_);
+  nsize = MAX(nsize, g_tp.min_w_nmos_);
+
+  if(rwire*cwire > 8*g_tp.FO4)
+  {
+    nsize = g_tp.max_w_nmos_;
+  }
+
+  // size the inverter appropriately to minimize the transmitter delay
+  // Note - In order to minimize leakage, we are not adding a set of inverters to
+  // bring down delay. Instead, we are sizing the single gate based on its
+  // logical effort, (2+beta)/(1+beta) for the NAND ("2+beta/1+beta" parsed as 2+2*beta).
+  const double nand_effort = (2 + beta)/(1 + beta);
+  double st_eff   = sqrt(nand_effort*gate_C(nsize, 0)/(gate_C(2*g_tp.min_w_nmos_, 0) + gate_C(2*min_w_pmos, 0)));
+  double req_cin  = (nand_effort*gate_C(nsize, 0))/st_eff;
+  double inv_size = req_cin/(gate_C(min_w_pmos, 0) + gate_C(g_tp.min_w_nmos_, 0));
+  inv_size = MAX(inv_size, 1);
+
+  /* nand gate delay */
+  double res_eq = (2 * tr_R_on(g_tp.min_w_nmos_, NCH, 1));
+  double cap_eq = 2 * drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(2*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    gate_C(inv_size*g_tp.min_w_nmos_, 0) +
+    gate_C(inv_size*min_w_pmos, 0);
+
+  double timeconst = res_eq * cap_eq;
+
+  delay = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
+      deviceType->Vth/deviceType->Vdd, RISE);
+  double temp_power = cap_eq*deviceType->Vdd*deviceType->Vdd;
+
+  inputrise = delay / (deviceType->Vdd - deviceType->Vth); /* for the next stage */
+
+  /* Inverter delay:
+   * The load capacitance of this inv depends on
+   * the gate capacitance of the final stage nmos
+   * transistor which in turn depends on nsize
+   */
+  res_eq = tr_R_on(inv_size*min_w_pmos, PCH, 1);
+  cap_eq = drain_C_(inv_size*min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(inv_size*g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+    gate_C(nsize, 0);
+  timeconst = res_eq * cap_eq;
+
+  delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
+      deviceType->Vth/deviceType->Vdd, FALL);
+  temp_power += cap_eq*deviceType->Vdd*deviceType->Vdd;
+
+  // transmitter = the NAND and inverter stages accumulated so far
+  transmitter.delay = delay;
+  transmitter.power.readOp.dynamic = temp_power*2; /* since it is a diff. model*/
+  transmitter.power.readOp.leakage = deviceType->Vdd *
+    (4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
+     4 * cmos_Isub_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
+
+  transmitter.power.readOp.gate_leakage = deviceType->Vdd *
+    (4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 2, nand) +
+     4 * cmos_Ig_leakage(g_tp.min_w_nmos_, min_w_pmos, 1, inv));
+
+  inputrise = delay / deviceType->Vth;
+
+  /* nmos delay + wire delay */
+  cap_eq = cwire + drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2 +
+    nsense * sense_amp_input_cap(); //+receiver cap
+  /*
+   * NOTE: nmos is used as both pull up and pull down transistor
+   * in the transmitter. This is because for low voltage swing, drive
+   * resistance of nmos is less than pmos
+   * (for a detailed graph ref: On-Chip Wires: Scaling and Efficiency)
+   */
+  timeconst = (tr_R_on(nsize, NCH, 1)*RES_ADJ) * (cwire +
+      drain_C_(nsize, NCH, 1, 1, g_tp.cell_h_def)*2) +
+    rwire*cwire/2 +
+    (tr_R_on(nsize, NCH, 1)*RES_ADJ + rwire) *
+    nsense * sense_amp_input_cap();
+
+  /*
+   * since we are pre-equalizing and overdriving the low
+   * swing wires, the net time constant is less
+   * than the actual value
+   */
+  delay += horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd, .25, 0);
+#define VOL_SWING .1
+  temp_power += cap_eq*VOL_SWING*.400; /* .4v is the over drive voltage */
+  temp_power *= 2; /* differential wire */
+
+  l_wire.delay = delay - transmitter.delay;
+  l_wire.power.readOp.dynamic = temp_power - transmitter.power.readOp.dynamic;
+  l_wire.power.readOp.leakage = deviceType->Vdd*
+    (4* cmos_Isub_leakage(nsize, 0, 1, nmos));
+
+  l_wire.power.readOp.gate_leakage = deviceType->Vdd*
+    (4* cmos_Ig_leakage(nsize, 0, 1, nmos));
+
+  //double rt = horowitz(inputrise, timeconst, deviceType->Vth/deviceType->Vdd,
+  //    deviceType->Vth/deviceType->Vdd, RISE)/deviceType->Vth;
+
+  delay += g_tp.sense_delay;
+
+  sense_amp.delay = g_tp.sense_delay;
+  out_rise_time = g_tp.sense_delay/(deviceType->Vth);
+  sense_amp.power.readOp.dynamic = g_tp.sense_dy_power;
+  sense_amp.power.readOp.leakage = 0; //FIXME
+  sense_amp.power.readOp.gate_leakage = 0;
+
+  power.readOp.dynamic = temp_power + sense_amp.power.readOp.dynamic;
+  power.readOp.leakage = transmitter.power.readOp.leakage +
+                         l_wire.power.readOp.leakage +
+                         sense_amp.power.readOp.leakage;
+  power.readOp.gate_leakage = transmitter.power.readOp.gate_leakage +
+                         l_wire.power.readOp.gate_leakage +
+                         sense_amp.power.readOp.gate_leakage;
+}
+
+double Wire::sense_amp_input_cap()
+{
+  // Sum of the drain/gate capacitances of the sense-amp devices (w_iso, w_sense_en + w_sense_n, w_sense_n, w_sense_p).
+  const double iso_and_gates = drain_C_(g_tp.w_iso, PCH, 1, 1, g_tp.cell_h_def) + gate_C(g_tp.w_sense_en + g_tp.w_sense_n, 0);
+  const double with_n_drain  = iso_and_gates + drain_C_(g_tp.w_sense_n, NCH, 1, 1, g_tp.cell_h_def);
+  const double total_cap     = with_n_drain + drain_C_(g_tp.w_sense_p, PCH, 1, 1, g_tp.cell_h_def);
+  return total_cap;
+}
+
+
+void Wire::delay_optimal_wire ()  // sizes and spaces repeaters for wire_length and stores delay, power, area, repeater_size/spacing
+{
+  double len       = wire_length;
+  //double min_wire_width = wire_width; //m
+  double beta = pmos_to_nmos_sz_ratio();
+  double switching = 0;  // switching energy
+  double short_ckt = 0;  // short-circuit energy
+  double tc        = 0;  // time constant
+  // input cap of min sized driver
+  double input_cap = gate_C(g_tp.min_w_nmos_ + min_w_pmos, 0);
+
+   // output parasitic capacitance of
+   // the min. sized driver
+  double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
+  // drive resistance
+  double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
+      tr_R_on(min_w_pmos, PCH, 1))/2;
+  double wr = wire_res(len); //ohm
+
+  // wire capacitance for the whole length len; wc/len below is the per-metre value
+  double wc = wire_cap(len);
+
+  // size the repeater such that the delay of the wire is minimum
+  double repeater_scaling = sqrt(out_res*wc/(wr*input_cap)); // len will cancel
+
+   // calc the optimum spacing between the repeaters (m)
+   // per-metre R and C come from wr/len and wc/len
+  repeater_spacing = sqrt(2 * out_res * (out_cap + input_cap)/
+      ((wr/len)*(wc/len)));
+  repeater_size = repeater_scaling;
+
+  switching = (repeater_scaling * (input_cap + out_cap) +
+      repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
+  // time constant of one repeater segment: driver self-load, driver into wire, wire into next gate, distributed wire RC
+  tc = out_res * (input_cap + out_cap) +
+    out_res * wc/len * repeater_spacing/repeater_scaling +
+    wr/len * repeater_spacing * input_cap * repeater_scaling +
+    0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
+  // len/repeater_spacing is the number of repeater segments along the wire
+  delay = 0.693 * tc * len/repeater_spacing;
+  // NOTE: the macro below stays defined for the remainder of this translation unit.
+#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
+  short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
+    repeater_scaling * tc;
+
+  area.set_area((len/repeater_spacing) *
+                compute_gate_area(INV, 1, min_w_pmos * repeater_scaling,
+                                          g_tp.min_w_nmos_ * repeater_scaling, g_tp.cell_h_def));
+  power.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
+  power.readOp.leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
+  power.readOp.gate_leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_scaling, beta*g_tp.min_w_nmos_*repeater_scaling, 1, inv));
+}
+
+
+
+// Seeds the wire design points: the delay-optimal one (global), a sweep of
+// sub-optimal repeater sizes/spacings (repeated_wire), and the low-swing wire.
+void Wire::init_wire()
+{
+  wire_length = 1;
+  delay_optimal_wire();
+
+  const double best_size = repeater_size;
+  const double best_spacing_um = repeater_spacing * 1e6; // in microns
+
+  powerDef point_power;
+  double point_delay;
+  // Keep one blank Component at the tail; each sweep point fills it and appends a new one.
+  repeated_wire.push_back(Component());
+  for (double spacing_um = best_spacing_um; spacing_um < 4*best_spacing_um; spacing_um += 100) {
+    for (double size = best_size; size > 1; size--) {
+      const double spacing_m = spacing_um*1e-6; // m
+      point_power = wire_model(spacing_m, size, &point_delay);
+      // The first point of the sweep is the delay-optimal design itself.
+      const bool is_optimal_point = (spacing_um == best_spacing_um) && (size == best_size);
+      if (is_optimal_point) {
+        global.delay = point_delay;
+        global.power = point_power;
+        global.area.h = best_size;
+        global.area.w = spacing_m;
+      }
+
+      Component &slot = repeated_wire.back();
+      slot.delay = point_delay;
+      slot.power.readOp = point_power.readOp;
+      slot.area.w = spacing_m;
+      slot.area.h = size;
+      repeated_wire.push_back(Component());
+    }
+  }
+  repeated_wire.pop_back(); // drop the unused trailing slot
+  update_fullswing();
+  Wire *ls_wire = new Wire(Low_swing, 0.001/* 1 mm*/, 1);
+  low_swing.delay = ls_wire->delay;
+  low_swing.power = ls_wire->power;
+  delete ls_wire;
+}
+
+
+
+// Selects the lowest-cost repeated-wire design for each delay-overhead budget.
+//
+// Budgets of 30%, 20%, 10% and 5% over global.delay are processed loosest
+// first.  For each budget, entries of repeated_wire slower than the budget are
+// erased and the cheapest remaining entry is copied into global_30, global_20,
+// global_10 or global_5 respectively.  Cost is the sum of dynamic and leakage
+// power, each normalised to the delay-optimal design (global).
+//
+// Ties keep the entry encountered first (strict '<' comparison).
+//
+// Side effects: repeated_wire is pruned in place, so on return it only holds
+// entries within the tightest (5%) budget.  A global_XX member is left
+// untouched when no entry meets its budget.
+//
+// NOTE: global.power.readOp.dynamic and .leakage are used as divisors and are
+// not checked for zero here.
+void Wire::update_fullswing()
+{
+  // Allowed delay overhead and the design point it fills, loosest budget first.
+  const double overhead[4] = {.3, .2, .1, .05};
+  Component *const target[4] = {&global_30, &global_20, &global_10, &global_5};
+
+  for (int k = 0; k < 4; k++) {
+    // Delay budget for this pass.
+    const double threshold = global.delay + global.delay*overhead[k];
+    double cost = BIGNUM;
+
+    list<Component>::iterator citer = repeated_wire.begin();
+    while (citer != repeated_wire.end()) {
+      if (citer->delay > threshold) {
+        // Too slow for this budget (and therefore for every tighter one).
+        // erase() returns the element after the erased one, so continue from
+        // there without stepping the iterator back: decrementing begin() is
+        // undefined behaviour when the first list element is erased.
+        citer = repeated_wire.erase(citer);
+        continue;
+      }
+
+      // Cost of this candidate relative to the delay-optimal design.
+      const double ncost =
+          citer->power.readOp.dynamic/global.power.readOp.dynamic +
+          citer->power.readOp.leakage/global.power.readOp.leakage;
+      if (ncost < cost) {
+        cost = ncost;
+        // Only delay, power and area are taken from the candidate.
+        target[k]->delay = citer->delay;
+        target[k]->power = citer->power;
+        target[k]->area  = citer->area;
+      }
+
+      ++citer;
+    }
+  }
+}
+
+// Evaluates one repeated-wire design point: returns its power and writes its delay
+// through *delay. space/size are stored into repeater_spacing/repeater_size.
+powerDef Wire::wire_model (double space, double size, double *delay)
+{
+  powerDef ptemp;
+  double len = 1; // evaluated for a unit-length wire
+  //double min_wire_width = wire_width; //m
+  double beta = pmos_to_nmos_sz_ratio();
+  // switching energy
+  double switching = 0;
+  // short-circuit energy
+  double short_ckt = 0;
+  // time constant
+  double tc = 0;
+  // input cap of min sized driver
+  double input_cap = gate_C (g_tp.min_w_nmos_ +
+      min_w_pmos, 0);
+
+   // output parasitic capacitance of
+   // the min. sized driver
+  double out_cap = drain_C_(min_w_pmos, PCH, 1, 1, g_tp.cell_h_def) +
+    drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def);
+  // drive resistance
+  double out_res = (tr_R_on(g_tp.min_w_nmos_, NCH, 1) +
+      tr_R_on(min_w_pmos, PCH, 1))/2;
+  double wr = wire_res(len); //ohm
+
+  // wire capacitance over len; wc/len below is the per-length value
+  double wc = wire_cap(len);
+  // NOTE: overwrites the members; space is presumably in metres (init_wire() passes um * 1e-6) -- confirm
+  repeater_spacing = space;
+  repeater_size = size;
+
+  switching = (repeater_size * (input_cap + out_cap) +
+      repeater_spacing * (wc/len)) * deviceType->Vdd * deviceType->Vdd;
+  // NOTE(review): the third term uses out_cap; delay_optimal_wire() uses input_cap there -- confirm intended
+  tc = out_res * (input_cap + out_cap) +
+    out_res * wc/len * repeater_spacing/repeater_size +
+    wr/len * repeater_spacing * out_cap * repeater_size +
+    0.5 * (wr/len) * (wc/len)* repeater_spacing * repeater_spacing;
+
+  *delay = 0.693 * tc * len/repeater_spacing;
+
+#define Ishort_ckt 65e-6 /* across all tech Ref:Banerjee et al. {IEEE TED} */
+  short_ckt = deviceType->Vdd * g_tp.min_w_nmos_ * Ishort_ckt * 1.0986 *
+    repeater_size * tc;
+
+  ptemp.readOp.dynamic = ((len/repeater_spacing)*(switching + short_ckt));
+  ptemp.readOp.leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Isub_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv));
+
+  ptemp.readOp.gate_leakage = ((len/repeater_spacing)*
+      deviceType->Vdd*
+      cmos_Ig_leakage(g_tp.min_w_nmos_*repeater_size, beta*g_tp.min_w_nmos_*repeater_size, 1, inv));
+
+  return ptemp;
+}
+
+// Prints delay, dynamic energy and leakage for every wire design point to stdout.
+void Wire::print_wire()
+{
+  // Width/spacing are scaled by 1e6 everywhere so the value matches the "microns" label.
+  cout << "\nWire Properties:\n\n";
+  cout << "  Delay Optimal\n\tRepeater size - "<< global.area.h <<
+    " \n\tRepeater spacing - " << global.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global.delay*1e6 <<  " (ns/mm)"
+    " \n\tPowerD - " << global.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+
+  cout << "  5% Overhead\n\tRepeater size - "<< global_5.area.h <<
+    " \n\tRepeater spacing - " << global_5.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_5.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_5.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_5.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_5.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  10% Overhead\n\tRepeater size - "<< global_10.area.h <<
+    " \n\tRepeater spacing - " << global_10.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_10.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_10.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_10.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_10.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  20% Overhead\n\tRepeater size - "<< global_20.area.h <<
+    " \n\tRepeater spacing - " << global_20.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_20.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_20.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_20.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_20.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  30% Overhead\n\tRepeater size - "<< global_30.area.h <<
+    " \n\tRepeater spacing - " << global_30.area.w*1e3 << " (mm)"
+    " \n\tDelay - " << global_30.delay *1e6<<  " (ns/mm)"
+    " \n\tPowerD - " << global_30.power.readOp.dynamic *1e6<< " (nJ/mm)"
+    " \n\tPowerL - " << global_30.power.readOp.leakage << " (mW/mm)"
+    " \n\tPowerLgate - " << global_30.power.readOp.gate_leakage << " (mW/mm)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 << " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 << " microns\n";
+  cout <<endl;
+  cout << "  Low-swing wire (1 mm) - Note: Unlike repeated wires, \n\tdelay and power "
+            "values of low-swing wires do not\n\thave a linear relationship with length." <<
+      " \n\tdelay - " << low_swing.delay *1e9<<  " (ns)"
+      " \n\tpowerD - " << low_swing.power.readOp.dynamic *1e9<< " (nJ)"
+      " \n\tPowerL - " << low_swing.power.readOp.leakage << " (mW)"
+      " \n\tPowerLgate - " << low_swing.power.readOp.gate_leakage << " (mW)\n";
+  cout << "\tWire width - " <<wire_width_init*1e6 * 2 /* differential */<< " microns\n";
+  cout << "\tWire spacing - " <<wire_spacing_init*1e6 * 2 /* differential */<< " microns\n";
+  cout <<endl;
+  cout <<endl;
+
+}
+
diff --git a/src/gpuwattch/cacti/wire.h b/src/gpuwattch/cacti/wire.h
new file mode 100644
index 000000000..ace7c9cee
--- /dev/null
+++ b/src/gpuwattch/cacti/wire.h
@@ -0,0 +1,123 @@
+/*****************************************************************************
+ *                                McPAT/CACTI
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+
+
+#ifndef __WIRE_H__
+#define __WIRE_H__
+
+#include "basic_circuit.h"
+#include "component.h"
+#include "parameter.h"
+#include "assert.h"
+#include "cacti_interface.h"
+#include <iostream>
+#include <list>
+// Wire model: repeated (full-swing) and low-swing wire delay, power and area.
+class Wire : public Component
+{
+  public:
+    Wire(enum Wire_type wire_model, double len = 0/* in u; NOTE(review): init_wire() passes 0.001 for 1 mm -- confirm unit */,
+         int nsense = 1/* no. of sense amps connected to the low-swing wire */,
+         double width_scaling = 1,
+         double spacing_scaling = 1,
+         enum Wire_placement wire_placement = outside_mat,
+         double resistivity = CU_RESISTIVITY,
+         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+    ~Wire();
+
+    Wire( double width_scaling = 1,
+         double spacing_scaling = 1,
+         enum Wire_placement wire_placement = outside_mat,
+         double resistivity = CU_RESISTIVITY,
+         TechnologyParameter::DeviceType *dt = &(g_tp.peri_global)
+    ); // should be used only once for initializing static members
+    void init_wire(); // fills global, global_5..global_30 (via update_fullswing) and low_swing
+
+    void calculate_wire_stats();
+    void delay_optimal_wire(); // delay-optimal repeater sizing for wire_length; sets delay, area, power
+    double wire_cap(double len, bool call_from_outside=false);
+    double wire_res(double len);
+    void low_swing_model();
+    double signal_fall_time();
+    double signal_rise_time();
+    double sense_amp_input_cap(); // capacitance loading the sense-amp input
+
+    enum Wire_type wt;
+    double wire_spacing;
+    double wire_width;
+    enum Wire_placement wire_placement;
+    double repeater_size;    // set by delay_optimal_wire(), overwritten by wire_model()
+    double repeater_spacing; // set by delay_optimal_wire(), overwritten by wire_model()
+    double wire_length;
+    double in_rise_time, out_rise_time;
+
+    void set_in_rise_time(double rt)
+    {
+      in_rise_time = rt;
+    }
+    static Component global;    // delay-optimal repeated wire (set in init_wire)
+    static Component global_5;  // cheapest design within 5% delay overhead (update_fullswing)
+    static Component global_10; // cheapest design within 10% delay overhead
+    static Component global_20; // cheapest design within 20% delay overhead
+    static Component global_30; // cheapest design within 30% delay overhead
+    static Component low_swing; // 1 mm low-swing wire (set in init_wire)
+    static double wire_width_init;   // reported by print_wire()
+    static double wire_spacing_init; // reported by print_wire()
+    void print_wire(); // writes all design points to stdout
+
+  private:
+
+    int nsense; // no. of sense amps connected to a low-swing wire if it
+                // is broadcasting data to multiple destinations
+    // width and spacing scaling factor can be used
+    // to model low level wires or special
+    // fat wires
+    double w_scale, s_scale;
+    double resistivity;
+    powerDef wire_model (double space, double size, double *delay); // evaluates one (spacing, size) design
+    list <Component> repeated_wire; // candidates swept by init_wire(), pruned by update_fullswing()
+    void update_fullswing(); // picks global_5..global_30 from repeated_wire
+    static int initialized;
+
+
+    //low-swing
+    Component transmitter;
+    Component l_wire;
+    Component sense_amp;
+
+    double min_w_pmos;
+
+    TechnologyParameter::DeviceType *deviceType; // supplies Vdd/Vth to the models
+
+};
+
+#endif
diff --git a/src/gpuwattch/core.cc b/src/gpuwattch/core.cc
new file mode 100644
index 000000000..5a28e9a0a
--- /dev/null
+++ b/src/gpuwattch/core.cc
@@ -0,0 +1,6813 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ *        Jingwen Leng, University of Texas, Austin
+ *        Syed Gilani, University of Wisconsin–Madison
+ *        Tayler Hetherington, University of British Columbia
+ *        Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#include "core.h"
+#include "XML_Parse.h"
+#include "cacti/basic_circuit.h"
+#include "const.h"
+#include "io.h"
+#include "parameter.h"
+#include <algorithm>
+#include <assert.h>
+#include <cmath>
+#include <iostream>
+#include <string>
+//#include "globalvar.h"
+// double exClockRate;
+//*********************
+// Operand collector (OC) modelling (Syed Gilani)
+//*********************
+// The OCs are modelled similar to the GPGPU-Sim v3.x documentation and
+// nVIDIA patents.
+// the OC need the following GPGPU-Sim config options:
+//-gpgpu_num_reg_banks                    8 # Number of register banks (default
+//= 8) -gpgpu_reg_bank_use_warp_id                    0 # Use warp ID in mapping
+// registers to banks (default = off) -gpgpu_operand_collector_num_units_sp 6 #
+// number of collector units (default = 4)
+// -gpgpu_operand_collector_num_units_sfu 8 # number of collector units (default
+// = 4) -gpgpu_operand_collector_num_units_mem                    2 # number of
+// collector units (default = 2) -gpgpu_operand_collector_num_units_gen 0 #
+// number of collector units (default = 0)
+//-gpgpu_operand_collector_num_in_ports_sp                    1 # number of
+// collector unit in ports (default = 1)
+//-gpgpu_operand_collector_num_in_ports_sfu                    1 # number of
+// collector unit in ports (default = 1)
+//-gpgpu_operand_collector_num_in_ports_mem                    1 # number of
+// collector unit in ports (default = 1)
+//-gpgpu_operand_collector_num_in_ports_gen                    0 # number of
+// collector unit in ports (default = 0)
+//-gpgpu_operand_collector_num_out_ports_sp                    1 # number of
+// collector unit in ports (default = 1)
+//-gpgpu_operand_collector_num_out_ports_sfu                    1 # number of
+// collector unit in ports (default = 1)
+//-gpgpu_operand_collector_num_out_ports_mem                    1 # number of
+// collector unit in ports (default = 1)
+//-gpgpu_operand_collector_num_out_ports_gen                    0 # number of
+// collector unit in ports (default = 0)
+
+// The total number of collector units and their input ports, and the number of
+// register file banks determine the crossbar size.
+
+InstFetchU::InstFetchU(ParseXML *XML_interface, int ithCore_,
+                       InputParameter *interface_ip_,
+                       const CoreDynParam &dyn_p_, bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), IB(0), BTB(0), ID_inst(0), ID_operand(0), ID_misc(0),
+      exist(exist_) {
+  if (!exist)
+    return;
+  int idx, tag, data, size, line, assoc, banks;
+  bool debug = false, is_default = true;
+
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+
+  cache_p = (Cache_policy)XML->sys.core[ithCore].icache.icache_config[7];
+  // Assuming all L1 caches are virtually idxed physically tagged.
+  // cache
+
+  size = (int)XML->sys.core[ithCore].icache.icache_config[0];
+  line = (int)XML->sys.core[ithCore].icache.icache_config[1];
+  assoc = (int)XML->sys.core[ithCore].icache.icache_config[2];
+  banks = (int)XML->sys.core[ithCore].icache.icache_config[3];
+  idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+  tag = debug ? 51
+              : (int)XML->sys.physical_address_width - idx -
+                    int(ceil(log2(line))) + EXTRA_TAG_BITS;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.cache_sz =
+      debug ? 32768 : (int)XML->sys.core[ithCore].icache.icache_config[0];
+  interface_ip.line_sz =
+      debug ? 64 : (int)XML->sys.core[ithCore].icache.icache_config[1];
+  interface_ip.assoc =
+      debug ? 8 : (int)XML->sys.core[ithCore].icache.icache_config[2];
+  interface_ip.nbanks =
+      debug ? 1 : (int)XML->sys.core[ithCore].icache.icache_config[3];
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode =
+      0; // debug?0:XML->sys.core[ithCore].icache.icache_config[5];
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 3.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[5] / clockRate;
+  interface_ip.is_cache = true;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = false;
+  //  interface_ip.obj_func_dyn_energy = 0;
+  //  interface_ip.obj_func_dyn_power  = 0;
+  //  interface_ip.obj_func_leak_power = 0;
+  //  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports =
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  icache.caches = new ArrayST(&interface_ip, "icache", Core_device,
+                              coredynp.opt_local, coredynp.core_ty);
+  scktRatio = g_tp.sckt_co_eff;
+  chip_PR_overhead = g_tp.chip_layout_overhead;
+  macro_PR_overhead = g_tp.macro_layout_overhead;
+  icache.area.set_area(icache.area.get_area() +
+                       icache.caches->local_result.area);
+  area.set_area(area.get_area() + icache.caches->local_result.area);
+  // output_data_csv(icache.caches.local_result);
+
+  /*
+   *iCache controllers
+   *miss buffer Each MSHR contains enough state
+   *to handle one or more accesses of any type to a single memory line.
+   *Due to the generality of the MSHR mechanism,
+   *the amount of state involved is non-trivial:
+   *including the address, pointers to the cache entry and destination register,
+   *written data, and various other pieces of state.
+   */
+  interface_ip.num_search_ports =
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = (XML->sys.physical_address_width) + int(ceil(log2(size / line))) +
+         icache.caches->l_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].icache.buffer_sizes[0] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[4] /
+                  clockRate; // means cycle time
+  interface_ip.latency = debug
+                             ? 1.0 / clockRate
+                             : XML->sys.core[ithCore].icache.icache_config[5] /
+                                   clockRate; // means access time
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports =
+      XML->sys.core[ithCore].number_instruction_fetch_ports;
+  icache.missb = new ArrayST(&interface_ip, "icacheMissBuffer", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+  icache.area.set_area(icache.area.get_area() +
+                       icache.missb->local_result.area);
+  area.set_area(area.get_area() + icache.missb->local_result.area);
+  // output_data_csv(icache.missb.local_result);
+
+  // fill buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = icache.caches->l_ip.line_sz;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz = data * XML->sys.core[ithCore].icache.buffer_sizes[1];
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports =
+      XML->sys.core[ithCore].number_instruction_fetch_ports;
+  icache.ifb = new ArrayST(&interface_ip, "icacheFillBuffer", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+  icache.area.set_area(icache.area.get_area() + icache.ifb->local_result.area);
+  area.set_area(area.get_area() + icache.ifb->local_result.area);
+  // output_data_csv(icache.ifb.local_result);
+
+  // prefetch buffer
+  tag = XML->sys.physical_address_width +
+        EXTRA_TAG_BITS; // check with previous entries to decide wthether to
+                        // merge.
+  data = icache.caches->l_ip
+             .line_sz; // separate queue to prevent from cache polution.
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].icache.buffer_sizes[2] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports =
+      XML->sys.core[ithCore].number_instruction_fetch_ports;
+  icache.prefetchb =
+      new ArrayST(&interface_ip, "icacheprefetchBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  icache.area.set_area(icache.area.get_area() +
+                       icache.prefetchb->local_result.area);
+  area.set_area(area.get_area() + icache.prefetchb->local_result.area);
+  // output_data_csv(icache.prefetchb.local_result);
+
+  // Instruction buffer
+  data =
+      XML->sys.core[ithCore].instruction_length *
+      XML->sys.core[ithCore]
+          .peak_issue_width; // icache.caches.l_ip.line_sz; //multiple
+                             // threads timing sharing the instruction buffer.
+  interface_ip.is_cache = false;
+  interface_ip.pure_ram = true;
+  interface_ip.pure_cam = false;
+  interface_ip.line_sz = int(ceil(data / 8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].number_hardware_threads *
+                  XML->sys.core[ithCore].instruction_buffer_size *
+                  interface_ip.line_sz >
+              64
+          ? XML->sys.core[ithCore].number_hardware_threads *
+                XML->sys.core[ithCore].instruction_buffer_size *
+                interface_ip.line_sz
+          : 64;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  // NOTE: Assuming IB is time slice shared among threads, every fetch op will
+  // at least fetch "fetch width" instructions.
+  interface_ip.num_rw_ports =
+      debug
+          ? 1
+          : XML->sys.core[ithCore]
+                .number_instruction_fetch_ports; // XML->sys.core[ithCore].fetch_width;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  IB = new ArrayST(&interface_ip, "InstBuffer", Core_device, coredynp.opt_local,
+                   coredynp.core_ty);
+  IB->area.set_area(IB->area.get_area() + IB->local_result.area);
+  area.set_area(area.get_area() + IB->local_result.area);
+  // output_data_csv(IB.IB.local_result);
+
+  //	  inst_decoder.opcode_length = XML->sys.core[ithCore].opcode_width;
+  //	  inst_decoder.init_decoder(is_default, &interface_ip);
+  //	  inst_decoder.full_decoder_power();
+
+  if (coredynp.predictionW > 0) {
+    /*
+     * BTB branch target buffer, accessed during IF stage. Virtually indexed and
+     * virtually tagged It is only a cache without all the buffers in the cache
+     * controller since it is more like a look up table than a cache with cache
+     * controller. When access miss, no load from other places such as main
+     * memory (not actively fill the misses), it is passively updated under two
+     * circumstances: 1)  when BPT@ID stage finds out current is a taken branch
+     * while BTB missed 2)  When BPT@ID stage predicts differently than BTB 3)
+     * When ID stage finds out current instruction is not a branch while BTB had
+     * a hit.(mark as invalid) 4)  when EXEU find out wrong target has been
+     * provided from BTB.
+     *
+     */
+    size = XML->sys.core[ithCore].BTB.BTB_config[0];
+    line = XML->sys.core[ithCore].BTB.BTB_config[1];
+    assoc = XML->sys.core[ithCore].BTB.BTB_config[2];
+    banks = XML->sys.core[ithCore].BTB.BTB_config[3];
+    idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+    //    	  tag							   =
+    //    debug?51:XML->sys.virtual_address_width-idx-int(ceil(log2(line))) +
+    //    int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads)))
+    //    +EXTRA_TAG_BITS;
+    tag = debug ? 51
+                : XML->sys.virtual_address_width +
+                      int(ceil(log2(
+                          XML->sys.core[ithCore].number_hardware_threads))) +
+                      EXTRA_TAG_BITS;
+    interface_ip.is_cache = true;
+    interface_ip.pure_ram = false;
+    interface_ip.pure_cam = false;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.cache_sz = debug ? 32768 : size;
+    interface_ip.line_sz = debug ? 64 : line;
+    interface_ip.assoc = debug ? 8 : assoc;
+    interface_ip.nbanks = debug ? 1 : banks;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode =
+        0; // debug?0:XML->sys.core[ithCore].dcache.dcache_config[5];
+    interface_ip.throughput =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].BTB.BTB_config[4] / clockRate;
+    interface_ip.latency =
+        debug ? 3.0 / clockRate
+              : XML->sys.core[ithCore].BTB.BTB_config[5] / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 1;
+    interface_ip.num_rd_ports = coredynp.predictionW;
+    interface_ip.num_wr_ports = coredynp.predictionW;
+    interface_ip.num_se_rd_ports = 0;
+    BTB = new ArrayST(&interface_ip, "Branch Target Buffer", Core_device,
+                      coredynp.opt_local, coredynp.core_ty);
+    BTB->area.set_area(BTB->area.get_area() + BTB->local_result.area);
+    area.set_area(area.get_area() + BTB->local_result.area);
+    /// cout<<"area="<<area<<endl;
+
+    BPT = new BranchPredictor(XML, ithCore, &interface_ip, coredynp);
+    area.set_area(area.get_area() + BPT->area.get_area());
+  }
+
+  ID_inst = new inst_decoder(is_default, &interface_ip, coredynp.opcode_length,
+                             1 /*Decoder should not know how many by itself*/,
+                             coredynp.x86, Core_device, coredynp.core_ty);
+
+  ID_operand =
+      new inst_decoder(is_default, &interface_ip, coredynp.arch_ireg_width, 1,
+                       coredynp.x86, Core_device, coredynp.core_ty);
+
+  ID_misc = new inst_decoder(is_default, &interface_ip,
+                             8 /* Prefix field etc upto 14B*/, 1, coredynp.x86,
+                             Core_device, coredynp.core_ty);
+  // TODO: X86 decoder should decode the inst in cyclic mode under the control
+  // of squencer. So the dynamic power should be multiplied by a few times.
+  area.set_area(area.get_area() +
+                (ID_inst->area.get_area() + ID_operand->area.get_area() +
+                 ID_misc->area.get_area()) *
+                    coredynp.decodeW);
+}
+
+BranchPredictor::BranchPredictor(ParseXML *XML_interface, int ithCore_,
+                                 InputParameter *interface_ip_,
+                                 const CoreDynParam &dyn_p_, bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), globalBPT(0), localBPT(0), L1_localBPT(0),
+      L2_localBPT(0), chooser(0), RAS(0), exist(exist_) {
+  /*
+   * Branch predictor, accessed during the ID stage.
+   * McPAT models the tournament branch predictor used in the Alpha 21264: a
+   * global predictor, a two-level local predictor, and a chooser, plus a RAS
+   * (return address stack) for function calls. Each table is built as an
+   * ArrayST from the local interface_ip copy; its area is added to this unit.
+   * On multithreaded cores the tables are tagged by thread ID and modeled as
+   * 1-way associative caches; the RAS is instead duplicated per thread.
+   * TODO: Data width needs to be computed more precisely.
+   */
+  if (!exist) // disabled unit: every sub-array pointer stays null
+    return;
+  int tag, data;
+
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  interface_ip.assoc = 1; // kept at 1 for every table built below
+  interface_ip.pure_cam = false;
+  if (coredynp.multithreaded) {
+
+    tag = int(log2(coredynp.num_hthreads) + EXTRA_TAG_BITS); // thread-ID tag
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+
+    interface_ip.is_cache = true;
+    interface_ip.pure_ram = false;
+  } else {
+    interface_ip.is_cache = false; // single thread: untagged plain RAM
+    interface_ip.pure_ram = true;
+  }
+  // Global predictor: line = global_predictor_bits rounded up to whole bytes.
+  data =
+      int(ceil(XML->sys.core[ithCore].predictor.global_predictor_bits / 8.0));
+  interface_ip.line_sz = data;
+  interface_ip.cache_sz =
+      data * XML->sys.core[ithCore].predictor.global_predictor_entries;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = coredynp.predictionW;
+  interface_ip.num_wr_ports = coredynp.predictionW;
+  interface_ip.num_se_rd_ports = 0;
+  globalBPT = new ArrayST(&interface_ip, "Global Predictor", Core_device,
+                          coredynp.opt_local, coredynp.core_ty);
+  globalBPT->area.set_area(globalBPT->area.get_area() +
+                           globalBPT->local_result.area);
+  area.set_area(area.get_area() + globalBPT->local_result.area);
+
+  // Local BPT (level 1): line width taken from local_predictor_size[0].
+  data =
+      int(ceil(XML->sys.core[ithCore].predictor.local_predictor_size[0] / 8.0));
+  interface_ip.line_sz = data;
+  interface_ip.cache_sz =
+      data * XML->sys.core[ithCore].predictor.local_predictor_entries;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = coredynp.predictionW;
+  interface_ip.num_wr_ports = coredynp.predictionW;
+  interface_ip.num_se_rd_ports = 0;
+  L1_localBPT = new ArrayST(&interface_ip, "L1 local Predictor", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+  L1_localBPT->area.set_area(L1_localBPT->area.get_area() +
+                             L1_localBPT->local_result.area);
+  area.set_area(area.get_area() + L1_localBPT->local_result.area);
+
+  // Local BPT (level 2): local_predictor_size[1] wide, same entry count as L1.
+  data =
+      int(ceil(XML->sys.core[ithCore].predictor.local_predictor_size[1] / 8.0));
+  interface_ip.line_sz = data;
+  interface_ip.cache_sz =
+      data * XML->sys.core[ithCore].predictor.local_predictor_entries;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = coredynp.predictionW;
+  interface_ip.num_wr_ports = coredynp.predictionW;
+  interface_ip.num_se_rd_ports = 0;
+  L2_localBPT = new ArrayST(&interface_ip, "L2 local Predictor", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+  L2_localBPT->area.set_area(L2_localBPT->area.get_area() +
+                             L2_localBPT->local_result.area);
+  area.set_area(area.get_area() + L2_localBPT->local_result.area);
+
+  // Chooser: sized from chooser_predictor_bits and chooser_predictor_entries.
+  data =
+      int(ceil(XML->sys.core[ithCore].predictor.chooser_predictor_bits / 8.0));
+  interface_ip.line_sz = data;
+  interface_ip.cache_sz =
+      data * XML->sys.core[ithCore].predictor.chooser_predictor_entries;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = coredynp.predictionW;
+  interface_ip.num_wr_ports = coredynp.predictionW;
+  interface_ip.num_se_rd_ports = 0;
+  chooser = new ArrayST(&interface_ip, "Predictor Chooser", Core_device,
+                        coredynp.opt_local, coredynp.core_ty);
+  chooser->area.set_area(chooser->area.get_area() + chooser->local_result.area);
+  area.set_area(area.get_area() + chooser->local_result.area);
+
+  // RAS: always plain RAM, pc_width wide; area is scaled by num_hthreads below.
+  interface_ip.is_cache = false;
+  interface_ip.pure_ram = true;
+  data = int(ceil(coredynp.pc_width / 8.0));
+  interface_ip.line_sz = data;
+  interface_ip.cache_sz = data * XML->sys.core[ithCore].RAS_size;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = coredynp.predictionW;
+  interface_ip.num_wr_ports = coredynp.predictionW;
+  interface_ip.num_se_rd_ports = 0;
+  RAS = new ArrayST(&interface_ip, "RAS", Core_device, coredynp.opt_local,
+                    coredynp.core_ty);
+  RAS->area.set_area(RAS->area.get_area() +
+                     RAS->local_result.area * coredynp.num_hthreads);
+  area.set_area(area.get_area() +
+                RAS->local_result.area * coredynp.num_hthreads);
+}
+
+SchedulerU::SchedulerU(ParseXML *XML_interface, int ithCore_,
+                       InputParameter *interface_ip_,
+                       const CoreDynParam &dyn_p_, bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), int_inst_window(0), fp_inst_window(0), ROB(0),
+      instruction_selection(0), exist(exist_) { // issue windows, ROB, select
+  if (!exist) // disabled unit: every sub-structure pointer stays null
+    return;
+  int tag, data;
+  bool is_default = true;
+  string tmp_name;
+
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  if ((coredynp.core_ty == Inorder && coredynp.multithreaded)) {
+    // Instruction issue queue for in-order multithreaded cores (the only
+    // in-order configuration handled here); one unified window is modeled.
+    tag = int(log2(XML->sys.core[ithCore].number_hardware_threads) *
+              coredynp.perThreadState); // This is the normal thread state bits
+                                        // based on Niagara Design
+    data = XML->sys.core[ithCore].instruction_length;
+    // NOTE: x86 inst can be very lengthy, up to 15B. Source: Intel® 64 and
+    // IA-32 Architectures Software Developer’s Manual
+    interface_ip.is_cache = true;
+    interface_ip.pure_cam = false;
+    interface_ip.pure_ram = false;
+    interface_ip.line_sz = int(ceil(data / 8.0));
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.cache_sz =
+        XML->sys.core[ithCore].instruction_window_size * interface_ip.line_sz >
+                64
+            ? XML->sys.core[ithCore].instruction_window_size *
+                  interface_ip.line_sz
+            : 64; // i.e. max(instruction_window_size * line_sz, 64)
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 1;
+    interface_ip.throughput = 1.0 / clockRate;
+    interface_ip.latency = 1.0 / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 0;
+    interface_ip.num_rd_ports = coredynp.peak_issueW;
+    interface_ip.num_wr_ports = coredynp.peak_issueW;
+    interface_ip.num_se_rd_ports = 0;
+    interface_ip.num_search_ports = coredynp.peak_issueW;
+    int_inst_window = new ArrayST(&interface_ip, "InstFetchQueue", Core_device,
+                                  coredynp.opt_local, coredynp.core_ty);
+    int_inst_window->area.set_area(int_inst_window->area.get_area() +
+                                   int_inst_window->local_result.area *
+                                       coredynp.num_pipelines);
+    area.set_area(area.get_area() +
+                  int_inst_window->local_result.area * coredynp.num_pipelines);
+    // output_data_csv(iRS.RS.local_result);
+    Iw_height = int_inst_window->local_result.cache_ht;
+
+    /*
+     * Selection logic.
+     * In a single-issue in-order multithreaded processor like Niagara, the
+     * issue width is 1 * number_of_threads, since the processor has to pick
+     * among multiple ready instructions (even though they come from different
+     * threads); hence peak_issueW is scaled by number_hardware_threads below.
+     * SMT processors, in contrast, do not distinguish threads at issue.
+     */
+
+    instruction_selection = new selection_logic(
+        is_default, XML->sys.core[ithCore].instruction_window_size,
+        coredynp.peak_issueW * XML->sys.core[ithCore].number_hardware_threads,
+        &interface_ip, Core_device, coredynp.core_ty);
+  }
+
+  if (coredynp.core_ty == OOO) {
+    /*
+     * CAM-based instruction window.
+     * For physical-register-file based OOO it is the instruction issue queue,
+     * holding only phy-reg tags; for RS-based OOO it is the reservation
+     * station, holding both tags and values of phy regs. It is written once
+     * and read twice (two operands) before an instruction can be issued.
+     * X86 instructions can be very long, up to 15B: add inst length in XML.
+     */
+    if (coredynp.scheu_ty == PhysicalRegFile) {
+      tag = coredynp.phy_ireg_width;
+      // Each time only half of the tag is compared, but two tags should be
+      // stored. This underestimates the search power.
+      data =
+          int((ceil((coredynp.instruction_length +
+                     2 * (coredynp.phy_ireg_width - coredynp.arch_ireg_width)) /
+                    2.0) /
+               8.0)); // NOTE(review): ceil is applied before /8.0 here, so
+      // int() truncates, unlike the else branch below. Data width being
+      // divided by 2 means the whole data is read out only after both operands
+      // are available; this is modeled as two readouts of half the data width.
+      tmp_name = "InstIssueQueue";
+    } else {
+      tag = coredynp.phy_ireg_width;
+      // Each time only half of the tag is compared, but two tags should be
+      // stored. This underestimates the search power.
+      data =
+          int(ceil(((coredynp.instruction_length +
+                     2 * (coredynp.phy_ireg_width - coredynp.arch_ireg_width) +
+                     2 * coredynp.int_data_width) /
+                    2.0) /
+                   8.0));
+      // Data width being divided by 2 means the whole data is read out only
+      // after both operands are available. This is modeled using two
+      // equivalent readouts with half of the data width.
+
+      tmp_name = "IntReservationStation";
+    }
+    interface_ip.is_cache = true;
+    interface_ip.pure_cam = false;
+    interface_ip.pure_ram = false;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz =
+        data * XML->sys.core[ithCore].instruction_window_size;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.access_mode = 0;
+    interface_ip.throughput = 2 * 1.0 / clockRate;
+    interface_ip.latency = 2 * 1.0 / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 0;
+    interface_ip.num_rd_ports = coredynp.peak_issueW;
+    interface_ip.num_wr_ports = coredynp.peak_issueW;
+    interface_ip.num_se_rd_ports = 0;
+    interface_ip.num_search_ports = coredynp.peak_issueW;
+    int_inst_window = new ArrayST(&interface_ip, tmp_name, Core_device,
+                                  coredynp.opt_local, coredynp.core_ty);
+    int_inst_window->area.set_area(int_inst_window->area.get_area() +
+                                   int_inst_window->local_result.area *
+                                       coredynp.num_pipelines);
+    area.set_area(area.get_area() +
+                  int_inst_window->local_result.area * coredynp.num_pipelines);
+    Iw_height = int_inst_window->local_result.cache_ht;
+    // FP inst window
+    if (coredynp.scheu_ty == PhysicalRegFile) {
+      tag = 2 * coredynp.phy_freg_width; // TODO: each time only half of the tag
+                                         // is compared
+      data =
+          int(ceil((coredynp.instruction_length +
+                    2 * (coredynp.phy_freg_width - coredynp.arch_freg_width)) /
+                   8.0));
+      tmp_name = "FPIssueQueue";
+    } else {
+      tag = 2 * coredynp.phy_ireg_width; // NOTE(review): ireg for FP? confirm
+      data =
+          int(ceil((coredynp.instruction_length +
+                    2 * (coredynp.phy_freg_width - coredynp.arch_freg_width) +
+                    2 * coredynp.fp_data_width) /
+                   8.0));
+      tmp_name = "FPReservationStation";
+    }
+    interface_ip.is_cache = true;
+    interface_ip.pure_cam = false;
+    interface_ip.pure_ram = false;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz =
+        data * XML->sys.core[ithCore].fp_instruction_window_size;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.access_mode = 0;
+    interface_ip.throughput = 1.0 / clockRate;
+    interface_ip.latency = 1.0 / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 0;
+    interface_ip.num_rd_ports = coredynp.fp_issueW;
+    interface_ip.num_wr_ports = coredynp.fp_issueW;
+    interface_ip.num_se_rd_ports = 0;
+    interface_ip.num_search_ports = coredynp.fp_issueW;
+    fp_inst_window = new ArrayST(&interface_ip, tmp_name, Core_device,
+                                 coredynp.opt_local, coredynp.core_ty);
+    fp_inst_window->area.set_area(fp_inst_window->area.get_area() +
+                                  fp_inst_window->local_result.area *
+                                      coredynp.num_fp_pipelines);
+    area.set_area(area.get_area() + fp_inst_window->local_result.area *
+                                        coredynp.num_fp_pipelines);
+    fp_Iw_height = fp_inst_window->local_result.cache_ht;
+
+    if (XML->sys.core[ithCore].ROB_size > 0) {
+      /*
+       *  If ROB_size = 0, the target processor does not support hardware-based
+       *  speculation, i.e., the processor allows OOO issue as well as OOO
+       *  completion, which means a branch must be resolved before instructions
+       *  are issued into the instruction window, since there is no chance to
+       *  flush the mispredicted branch path after instructions are issued in
+       *  this situation.
+       *
+       *  ROB size = in-flight instructions. The ROB is unified for int and fp
+       *  instructions. One old approach is to combine the RAT and ROB as a huge
+       *  CAM structure, as in AMD K7. However, this approach was abandoned due
+       *  to its high power and poor scalability. McPAT models the ROB as a
+       *  circular buffer that is written once when an instruction is issued and
+       *  read once when the instruction is committed.
+       */
+      int robExtra = int(ceil(5 + log2(coredynp.num_hthreads)));
+      // 5 bits are: busy, Issued, Finished, speculative, valid
+      if (coredynp.scheu_ty == PhysicalRegFile) {
+        // PC is used to identify the instruction for exception recovery.
+        // inst is used to map the renamed dest. registers, so that the commit
+        // stage can know which reg/RRAT to update.
+        // Disabled alternative formula:
+        //   data = int(ceil((robExtra + coredynp.pc_width
+        //                    + coredynp.instruction_length
+        //                    + 2 * coredynp.phy_ireg_width) / 8.0));
+        data = int(ceil(
+            (robExtra + coredynp.pc_width + coredynp.phy_ireg_width) / 8.0));
+      } else {
+        // In RS-based OOO, the ROB also contains the value of the dest. reg.
+        // Disabled alternative formula:
+        //   data = int(ceil((robExtra + coredynp.pc_width
+        //                    + coredynp.instruction_length
+        //   + 2 * coredynp.phy_ireg_width + coredynp.fp_data_width) / 8.0));
+        data = int(ceil((robExtra + coredynp.pc_width +
+                         coredynp.phy_ireg_width + coredynp.fp_data_width) /
+                        8.0));
+      }
+      interface_ip.is_cache = false;
+      interface_ip.pure_cam = false;
+      interface_ip.pure_ram = true;
+      interface_ip.line_sz = data;
+      interface_ip.cache_sz =
+          data * XML->sys.core[ithCore]
+                     .ROB_size; // The XML ROB size is for all threads
+      interface_ip.assoc = 1;
+      interface_ip.nbanks = 1;
+      interface_ip.out_w = interface_ip.line_sz * 8;
+      interface_ip.access_mode = 1;
+      interface_ip.throughput = 1.0 / clockRate;
+      interface_ip.latency = 1.0 / clockRate;
+      interface_ip.obj_func_dyn_energy = 0;
+      interface_ip.obj_func_dyn_power = 0;
+      interface_ip.obj_func_leak_power = 0;
+      interface_ip.obj_func_cycle_t = 1;
+      interface_ip.num_rw_ports = 0;
+      interface_ip.num_rd_ports = coredynp.peak_commitW;
+      interface_ip.num_wr_ports = coredynp.peak_issueW;
+      interface_ip.num_se_rd_ports = 0;
+      interface_ip.num_search_ports = 0;
+      ROB = new ArrayST(&interface_ip, "ReorderBuffer", Core_device,
+                        coredynp.opt_local, coredynp.core_ty);
+      ROB->area.set_area(ROB->area.get_area() +
+                         ROB->local_result.area * coredynp.num_pipelines);
+      area.set_area(area.get_area() +
+                    ROB->local_result.area * coredynp.num_pipelines);
+      ROB_height = ROB->local_result.cache_ht;
+    }
+
+    instruction_selection = new selection_logic(
+        is_default, XML->sys.core[ithCore].instruction_window_size,
+        coredynp.peak_issueW, &interface_ip, Core_device, coredynp.core_ty);
+  }
+}
+
+LoadStoreU::LoadStoreU(ParseXML *XML_interface, int ithCore_,
+                       InputParameter *interface_ip_,
+                       const CoreDynParam &dyn_p_, bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), LSQ(0), exist(exist_) {
+  if (!exist)
+    return;
+  int idx, tag, data, size, line, assoc;
+  bool debug = false;
+  int ldst_opcode = XML->sys.core[ithCore].opcode_width; // 16;
+
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  cache_p = (Cache_policy)XML->sys.core[ithCore].dcache.dcache_config[7];
+
+  interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports;
+  interface_ip.is_cache = true;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = false;
+
+  // Crossbar based interconnect for shared memory accesses, added by Syed
+  // Crossbar
+
+  if (XML->sys.architecture == 1) {
+    xbar_shared = new Crossbar(
+        coredynp.num_fpus, coredynp.num_fpus, 32,
+        &(g_tp.peri_global)); // Syed: coredynp.num_fpus is used as simd_width
+  } else {
+    xbar_shared = new Crossbar(
+        coredynp.num_fpus, coredynp.num_fpus, 32,
+        &(g_tp.peri_global)); // Syed: coredynp.num_fpus is used as simd_width
+  }
+
+  // TODO: Check if this line should be changed to
+  // new
+  // Crossbar(simd_width,shared_memory_banks,word_length*simd_width,&(g_tp.peri_global));
+
+  // shared memory added by Jingwen
+  size = (int)XML->sys.core[ithCore].sharedmemory.dcache_config[0];
+  line = (int)XML->sys.core[ithCore].sharedmemory.dcache_config[1];
+  assoc = (int)XML->sys.core[ithCore].sharedmemory.dcache_config[2];
+  idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+  tag = debug ? 51
+              : XML->sys.physical_address_width - idx - int(ceil(log2(line))) +
+                    EXTRA_TAG_BITS;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = 1;
+  interface_ip.cache_sz =
+      debug ? 32768 : (int)XML->sys.core[ithCore].sharedmemory.dcache_config[0];
+  interface_ip.line_sz =
+      debug ? 64 : (int)XML->sys.core[ithCore].sharedmemory.dcache_config[1];
+  interface_ip.assoc =
+      debug ? 8 : (int)XML->sys.core[ithCore].sharedmemory.dcache_config[2];
+  interface_ip.nbanks =
+      debug ? 1 : (int)XML->sys.core[ithCore].sharedmemory.dcache_config[3];
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode =
+      0; // debug?0:XML->sys.core[ithCore].sharedmemory.dcache_config[5];
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 3.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[5] / clockRate;
+  interface_ip.is_cache = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug
+          ? 1
+          : XML->sys.core[ithCore]
+                .memory_ports; // usually In-order has 1 and OOO has 2 at least.
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  sharedmemory.caches = new ArrayST(&interface_ip, "sharedmemory", Core_device,
+                                    coredynp.opt_local, coredynp.core_ty);
+  sharedmemory.area.set_area(sharedmemory.area.get_area() +
+                             sharedmemory.caches->local_result.area);
+  area.set_area(area.get_area() + sharedmemory.caches->local_result.area +
+                xbar_shared->area.get_area());
+
+  // shared memory buffer
+  // miss buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = (XML->sys.physical_address_width) + int(ceil(log2(size / line))) +
+         sharedmemory.caches->l_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = 1;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz = XML->sys.core[ithCore].sharedmemory.buffer_sizes[0] *
+                          interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  sharedmemory.missb =
+      new ArrayST(&interface_ip, "SharedmemoryMissBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  sharedmemory.area.set_area(sharedmemory.area.get_area() +
+                             sharedmemory.missb->local_result.area);
+  area.set_area(area.get_area() + sharedmemory.missb->local_result.area);
+
+  // sharedmemory fill buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = sharedmemory.caches->l_ip.line_sz;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = 1;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz =
+      data * XML->sys.core[ithCore].sharedmemory.buffer_sizes[1];
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  sharedmemory.ifb =
+      new ArrayST(&interface_ip, "SharedMemoryFillBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  sharedmemory.area.set_area(sharedmemory.area.get_area() +
+                             sharedmemory.ifb->local_result.area);
+  area.set_area(area.get_area() + sharedmemory.ifb->local_result.area);
+
+  // sharedmemory prefetch buffer
+  tag = XML->sys.physical_address_width +
+        EXTRA_TAG_BITS; // check with previous entries to decide wthether to
+                        // merge.
+  data = sharedmemory.caches->l_ip
+             .line_sz; // separate queue to prevent from cache polution.
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = 1;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz = XML->sys.core[ithCore].sharedmemory.buffer_sizes[2] *
+                          interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  sharedmemory.prefetchb =
+      new ArrayST(&interface_ip, "dcacheprefetchBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  sharedmemory.area.set_area(sharedmemory.area.get_area() +
+                             sharedmemory.prefetchb->local_result.area);
+  area.set_area(area.get_area() + sharedmemory.prefetchb->local_result.area);
+
+  // shared memory WBB
+  if (cache_p == Write_back) {
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = sharedmemory.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = 1;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz =
+        XML->sys.core[ithCore].sharedmemory.buffer_sizes[3] *
+        interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 2;
+    interface_ip.throughput =
+        debug
+            ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[4] / clockRate;
+    interface_ip.latency =
+        debug
+            ? 1.0 / clockRate
+            : XML->sys.core[ithCore].sharedmemory.dcache_config[5] / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = XML->sys.core[ithCore].memory_ports;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    sharedmemory.wbb = new ArrayST(&interface_ip, "dcacheWBB", Core_device,
+                                   coredynp.opt_local, coredynp.core_ty);
+    sharedmemory.area.set_area(sharedmemory.area.get_area() +
+                               sharedmemory.wbb->local_result.area);
+    area.set_area(area.get_area() + sharedmemory.wbb->local_result.area);
+    // output_data_csv(sharedmemory.wbb.local_result);
+  }
+
+  /*
+   * ccache starts here
+   */
+  // Constant cache
+  size = (int)XML->sys.core[ithCore].ccache.dcache_config[0];
+  line = (int)XML->sys.core[ithCore].ccache.dcache_config[1];
+  assoc = (int)XML->sys.core[ithCore].ccache.dcache_config[2];
+  idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+  tag = debug ? 51
+              : XML->sys.physical_address_width - idx - int(ceil(log2(line))) +
+                    EXTRA_TAG_BITS;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.cache_sz =
+      debug ? 32768 : (int)XML->sys.core[ithCore].ccache.dcache_config[0];
+  interface_ip.line_sz =
+      debug ? 64 : (int)XML->sys.core[ithCore].ccache.dcache_config[1];
+  interface_ip.assoc =
+      debug ? 8 : (int)XML->sys.core[ithCore].ccache.dcache_config[2];
+  interface_ip.nbanks =
+      debug ? 1 : (int)XML->sys.core[ithCore].ccache.dcache_config[3];
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode =
+      0; // debug?0:XML->sys.core[ithCore].ccache.dcache_config[5];
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 3.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[5] / clockRate;
+  interface_ip.is_cache = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug
+          ? 1
+          : XML->sys.core[ithCore]
+                .memory_ports; // usually In-order has 1 and OOO has 2 at least.
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  ccache.caches = new ArrayST(&interface_ip, "ccache", Core_device,
+                              coredynp.opt_local, coredynp.core_ty);
+  ccache.area.set_area(ccache.area.get_area() +
+                       ccache.caches->local_result.area);
+  area.set_area(area.get_area() + ccache.caches->local_result.area);
+  // output_data_csv(ccache.caches.local_result);
+
+  // cCache controllers
+  // miss buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = (XML->sys.physical_address_width) + int(ceil(log2(size / line))) +
+         ccache.caches->l_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].ccache.buffer_sizes[0] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  ccache.missb = new ArrayST(&interface_ip, "ccacheMissBuffer", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+  ccache.area.set_area(ccache.area.get_area() +
+                       ccache.missb->local_result.area);
+  area.set_area(area.get_area() + ccache.missb->local_result.area);
+  // output_data_csv(ccache.missb.local_result);
+
+  // fill buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = ccache.caches->l_ip.line_sz;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz = data * XML->sys.core[ithCore].ccache.buffer_sizes[1];
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  ccache.ifb = new ArrayST(&interface_ip, "ccacheFillBuffer", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+  ccache.area.set_area(ccache.area.get_area() + ccache.ifb->local_result.area);
+  area.set_area(area.get_area() + ccache.ifb->local_result.area);
+  // output_data_csv(ccache.ifb.local_result);
+
+  // prefetch buffer
+  tag = XML->sys.physical_address_width +
+        EXTRA_TAG_BITS; // check with previous entries to decide wthether to
+                        // merge.
+  data = ccache.caches->l_ip
+             .line_sz; // separate queue to prevent from cache polution.
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].ccache.buffer_sizes[2] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].ccache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  ccache.prefetchb =
+      new ArrayST(&interface_ip, "ccacheprefetchBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  ccache.area.set_area(ccache.area.get_area() +
+                       ccache.prefetchb->local_result.area);
+  area.set_area(area.get_area() + ccache.prefetchb->local_result.area);
+  // output_data_csv(ccache.prefetchb.local_result);
+
+  // WBB
+  if (cache_p == Write_back) {
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = ccache.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz =
+        XML->sys.core[ithCore].ccache.buffer_sizes[3] * interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 2;
+    interface_ip.throughput =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].ccache.dcache_config[4] / clockRate;
+    interface_ip.latency =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].ccache.dcache_config[5] / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = XML->sys.core[ithCore].memory_ports;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    ccache.wbb = new ArrayST(&interface_ip, "ccacheWBB", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+    ccache.area.set_area(ccache.area.get_area() +
+                         ccache.wbb->local_result.area);
+    area.set_area(area.get_area() + ccache.wbb->local_result.area);
+    // output_data_csv(ccache.wbb.local_result);
+  }
+
+  /*
+   * tcache starts here
+   */
+  // Texture cache
+  size = (int)XML->sys.core[ithCore].tcache.dcache_config[0];
+  line = (int)XML->sys.core[ithCore].tcache.dcache_config[1];
+  assoc = (int)XML->sys.core[ithCore].tcache.dcache_config[2];
+  idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+  tag = debug ? 51
+              : XML->sys.physical_address_width - idx - int(ceil(log2(line))) +
+                    EXTRA_TAG_BITS;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.cache_sz =
+      debug ? 32768 : (int)XML->sys.core[ithCore].tcache.dcache_config[0];
+  interface_ip.line_sz =
+      debug ? 64 : (int)XML->sys.core[ithCore].tcache.dcache_config[1];
+  interface_ip.assoc =
+      debug ? 8 : (int)XML->sys.core[ithCore].tcache.dcache_config[2];
+  interface_ip.nbanks =
+      debug ? 1 : (int)XML->sys.core[ithCore].tcache.dcache_config[3];
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode =
+      0; // debug?0:XML->sys.core[ithCore].tcache.dcache_config[5];
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 3.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[5] / clockRate;
+  interface_ip.is_cache = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug
+          ? 1
+          : XML->sys.core[ithCore]
+                .memory_ports; // usually In-order has 1 and OOO has 2 at least.
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  tcache.caches = new ArrayST(&interface_ip, "tcache", Core_device,
+                              coredynp.opt_local, coredynp.core_ty);
+  tcache.area.set_area(tcache.area.get_area() +
+                       tcache.caches->local_result.area);
+  area.set_area(area.get_area() + tcache.caches->local_result.area);
+  // output_data_csv(tcache.caches.local_result);
+
+  // tCache controllers
+  // miss buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = (XML->sys.physical_address_width) + int(ceil(log2(size / line))) +
+         tcache.caches->l_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].tcache.buffer_sizes[0] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  tcache.missb = new ArrayST(&interface_ip, "tcacheMissBuffer", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+  tcache.area.set_area(tcache.area.get_area() +
+                       tcache.missb->local_result.area);
+  area.set_area(area.get_area() + tcache.missb->local_result.area);
+  // output_data_csv(tcache.missb.local_result);
+
+  // fill buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = tcache.caches->l_ip.line_sz;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz = data * XML->sys.core[ithCore].tcache.buffer_sizes[1];
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  tcache.ifb = new ArrayST(&interface_ip, "tcacheFillBuffer", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+  tcache.area.set_area(tcache.area.get_area() + tcache.ifb->local_result.area);
+  area.set_area(area.get_area() + tcache.ifb->local_result.area);
+  // output_data_csv(tcache.ifb.local_result);
+
+  // prefetch buffer
+  tag = XML->sys.physical_address_width +
+        EXTRA_TAG_BITS; // check with previous entries to decide wthether to
+                        // merge.
+  data = tcache.caches->l_ip
+             .line_sz; // separate queue to prevent from cache polution.
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].tcache.buffer_sizes[2] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].tcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  tcache.prefetchb =
+      new ArrayST(&interface_ip, "tcacheprefetchBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  tcache.area.set_area(tcache.area.get_area() +
+                       tcache.prefetchb->local_result.area);
+  area.set_area(area.get_area() + tcache.prefetchb->local_result.area);
+  // output_data_csv(tcache.prefetchb.local_result);
+
+  // WBB
+  if (cache_p == Write_back) {
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = tcache.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz =
+        XML->sys.core[ithCore].tcache.buffer_sizes[3] * interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 2;
+    interface_ip.throughput =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].tcache.dcache_config[4] / clockRate;
+    interface_ip.latency =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].tcache.dcache_config[5] / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = XML->sys.core[ithCore].memory_ports;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    tcache.wbb = new ArrayST(&interface_ip, "tcacheWBB", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+    tcache.area.set_area(tcache.area.get_area() +
+                         tcache.wbb->local_result.area);
+    area.set_area(area.get_area() + tcache.wbb->local_result.area);
+    // output_data_csv(tcache.wbb.local_result);
+  }
+
+  /*
+   * dcache starts here
+   */
+  // Dcache
+  size = (int)XML->sys.core[ithCore].dcache.dcache_config[0];
+  line = (int)XML->sys.core[ithCore].dcache.dcache_config[1];
+  assoc = (int)XML->sys.core[ithCore].dcache.dcache_config[2];
+  idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+  tag = debug ? 51
+              : XML->sys.physical_address_width - idx - int(ceil(log2(line))) +
+                    EXTRA_TAG_BITS;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.cache_sz =
+      debug ? 32768 : (int)XML->sys.core[ithCore].dcache.dcache_config[0];
+  interface_ip.line_sz =
+      debug ? 64 : (int)XML->sys.core[ithCore].dcache.dcache_config[1];
+  interface_ip.assoc =
+      debug ? 8 : (int)XML->sys.core[ithCore].dcache.dcache_config[2];
+  interface_ip.nbanks =
+      debug ? 1 : (int)XML->sys.core[ithCore].dcache.dcache_config[3];
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode =
+      0; // debug?0:XML->sys.core[ithCore].dcache.dcache_config[5];
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 3.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[5] / clockRate;
+  interface_ip.is_cache = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      debug
+          ? 1
+          : XML->sys.core[ithCore]
+                .memory_ports; // usually In-order has 1 and OOO has 2 at least.
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  dcache.caches = new ArrayST(&interface_ip, "dcache", Core_device,
+                              coredynp.opt_local, coredynp.core_ty);
+  dcache.area.set_area(dcache.area.get_area() +
+                       dcache.caches->local_result.area);
+  area.set_area(area.get_area() + dcache.caches->local_result.area);
+  // output_data_csv(dcache.caches.local_result);
+
+  // dCache controllers
+  // miss buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = (XML->sys.physical_address_width) + int(ceil(log2(size / line))) +
+         dcache.caches->l_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].dcache.buffer_sizes[0] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  dcache.missb = new ArrayST(&interface_ip, "dcacheMissBuffer", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+  dcache.area.set_area(dcache.area.get_area() +
+                       dcache.missb->local_result.area);
+  area.set_area(area.get_area() + dcache.missb->local_result.area);
+  // output_data_csv(dcache.missb.local_result);
+
+  // fill buffer
+  tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data = dcache.caches->l_ip.line_sz;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz = data * XML->sys.core[ithCore].dcache.buffer_sizes[1];
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  dcache.ifb = new ArrayST(&interface_ip, "dcacheFillBuffer", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+  dcache.area.set_area(dcache.area.get_area() + dcache.ifb->local_result.area);
+  area.set_area(area.get_area() + dcache.ifb->local_result.area);
+  // output_data_csv(dcache.ifb.local_result);
+
+  // prefetch buffer
+  tag = XML->sys.physical_address_width +
+        EXTRA_TAG_BITS; // check with previous entries to decide wthether to
+                        // merge.
+  data = dcache.caches->l_ip
+             .line_sz; // separate queue to prevent from cache polution.
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].dcache.buffer_sizes[2] * interface_ip.line_sz;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 2;
+  interface_ip.throughput =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = debug ? 1 : XML->sys.core[ithCore].memory_ports;
+  ;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  dcache.prefetchb =
+      new ArrayST(&interface_ip, "dcacheprefetchBuffer", Core_device,
+                  coredynp.opt_local, coredynp.core_ty);
+  dcache.area.set_area(dcache.area.get_area() +
+                       dcache.prefetchb->local_result.area);
+  area.set_area(area.get_area() + dcache.prefetchb->local_result.area);
+  // output_data_csv(dcache.prefetchb.local_result);
+
+  // WBB
+  if (cache_p == Write_back) {
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = dcache.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz =
+        XML->sys.core[ithCore].dcache.buffer_sizes[3] * interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 2;
+    interface_ip.throughput =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].dcache.dcache_config[4] / clockRate;
+    interface_ip.latency =
+        debug ? 1.0 / clockRate
+              : XML->sys.core[ithCore].dcache.dcache_config[5] / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = XML->sys.core[ithCore].memory_ports;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    dcache.wbb = new ArrayST(&interface_ip, "dcacheWBB", Core_device,
+                             coredynp.opt_local, coredynp.core_ty);
+    dcache.area.set_area(dcache.area.get_area() +
+                         dcache.wbb->local_result.area);
+    area.set_area(area.get_area() + dcache.wbb->local_result.area);
+    // output_data_csv(dcache.wbb.local_result);
+  }
+
+  /*
+   * LSU--in-order processors do not have separate load queue: unified lsq
+   * partitioned among threads
+   * it is actually the store queue but for inorder processors it serves as both
+   * loadQ and StoreQ
+   */
+  tag = ldst_opcode + XML->sys.virtual_address_width +
+        int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +
+        EXTRA_TAG_BITS;
+  data = XML->sys.machine_bits;
+  interface_ip.is_cache = true;
+  interface_ip.line_sz = int(ceil(data / 32.0)) * 4;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.cache_sz = XML->sys.core[ithCore].store_buffer_size *
+                          interface_ip.line_sz *
+                          XML->sys.core[ithCore].number_hardware_threads;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 1;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = XML->sys.core[ithCore].memory_ports;
+  interface_ip.num_wr_ports = XML->sys.core[ithCore].memory_ports;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports;
+  LSQ = new ArrayST(&interface_ip, "Load(Store)Queue", Core_device,
+                    coredynp.opt_local, coredynp.core_ty);
+  LSQ->area.set_area(LSQ->area.get_area() + LSQ->local_result.area);
+  area.set_area(area.get_area() + LSQ->local_result.area);
+  area.set_area(area.get_area() * cdb_overhead);
+  // output_data_csv(LSQ.LSQ.local_result);
+  lsq_height =
+      LSQ->local_result.cache_ht *
+      sqrt(cdb_overhead); /*XML->sys.core[ithCore].number_hardware_threads*/
+
+  if ((coredynp.core_ty == OOO) &&
+      (XML->sys.core[ithCore].load_buffer_size > 0)) {
+    interface_ip.line_sz = int(ceil(data / 32.0)) * 4;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.cache_sz = XML->sys.core[ithCore].load_buffer_size *
+                            interface_ip.line_sz *
+                            XML->sys.core[ithCore].number_hardware_threads;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 1;
+    interface_ip.throughput = 1.0 / clockRate;
+    interface_ip.latency = 1.0 / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 0;
+    interface_ip.num_rd_ports = XML->sys.core[ithCore].memory_ports;
+    interface_ip.num_wr_ports = XML->sys.core[ithCore].memory_ports;
+    interface_ip.num_se_rd_ports = 0;
+    interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports;
+    LoadQ = new ArrayST(&interface_ip, "LoadQueue", Core_device,
+                        coredynp.opt_local, coredynp.core_ty);
+    LoadQ->area.set_area(LoadQ->area.get_area() + LoadQ->local_result.area);
+    area.set_area(area.get_area() + LoadQ->local_result.area);
+    area.set_area(area.get_area() * cdb_overhead);
+    // output_data_csv(LoadQ.LoadQ.local_result);
+    lsq_height =
+        (LSQ->local_result.cache_ht + LoadQ->local_result.cache_ht) *
+        sqrt(cdb_overhead); /*XML->sys.core[ithCore].number_hardware_threads*/
+  }
+}
+
+MemManU::MemManU(ParseXML *XML_interface, int ithCore_,
+                 InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+                 bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), itlb(0), dtlb(0), exist(exist_) {
+  if (!exist)
+    return; // exist_ is false: build nothing, itlb/dtlb stay null
+  int tag, data; // tag width and entry payload width, recomputed per TLB
+  bool debug = false; // never set true here: the XML branches below are taken
+  // Builds the ITLB, then the DTLB, each as an ArrayST from interface_ip.
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  // is_cache/pure_cam/pure_ram are assigned only here, for both TLBs.
+  interface_ip.is_cache = true;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = false;
+  interface_ip.specific_tag = 1;
+  // ITLB: TLBs are partitioned among threads according to Niagara and Nehalem
+  tag = XML->sys.virtual_address_width -
+        int(floor(log2(XML->sys.virtual_memory_page_size))) +
+        int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +
+        EXTRA_TAG_BITS; // VA page-number bits + thread-id bits + extra
+  data = XML->sys.physical_address_width -
+         int(floor(log2(XML->sys.virtual_memory_page_size))); // PA page-number bits
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].itlb.number_entries *
+      interface_ip.line_sz; //*XML->sys.core[ithCore].number_hardware_threads;
+  interface_ip.assoc = 0; // NOTE(review): presumably 0 = fully associative; confirm
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = // ITLB timing comes from icache_config[4] and [5]
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].icache.icache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = // write/search ports = instruction fetch ports
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports =
+      debug ? 1 : XML->sys.core[ithCore].number_instruction_fetch_ports;
+  itlb = new ArrayST(&interface_ip, "ITLB", Core_device, coredynp.opt_local,
+                     coredynp.core_ty);
+  itlb->area.set_area(itlb->area.get_area() + itlb->local_result.area);
+  area.set_area(area.get_area() + itlb->local_result.area); // add ITLB to area
+  // output_data_csv(itlb.tlb.local_result);
+
+  // DTLB: same tag/data sizing as the ITLB, but sized by dtlb.number_entries
+  tag = XML->sys.virtual_address_width -
+        int(floor(log2(XML->sys.virtual_memory_page_size))) +
+        int(ceil(log2(XML->sys.core[ithCore].number_hardware_threads))) +
+        EXTRA_TAG_BITS;
+  data = XML->sys.physical_address_width -
+         int(floor(log2(XML->sys.virtual_memory_page_size)));
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.line_sz =
+      int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz =
+      XML->sys.core[ithCore].dtlb.number_entries *
+      interface_ip.line_sz; //*XML->sys.core[ithCore].number_hardware_threads;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = // DTLB timing comes from dcache_config[4] and [5]
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[4] / clockRate;
+  interface_ip.latency =
+      debug ? 1.0 / clockRate
+            : XML->sys.core[ithCore].dcache.dcache_config[5] / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = XML->sys.core[ithCore].memory_ports;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports = XML->sys.core[ithCore].memory_ports;
+  dtlb = new ArrayST(&interface_ip, "DTLB", Core_device, coredynp.opt_local,
+                     coredynp.core_ty);
+  dtlb->area.set_area(dtlb->area.get_area() + dtlb->local_result.area);
+  area.set_area(area.get_area() + dtlb->local_result.area); // add DTLB to area
+  // output_data_csv(dtlb.tlb.local_result);
+}
+//#define FERMI
+
+RegFU::RegFU(ParseXML *XML_interface, int ithCore_,
+             InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+             double exClockRate, bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), IRF(0), FRF(0), RFWIN(0), exist(exist_) {
+  /* Builds the register-file unit of core ithCore: RF crossbar and arbiter, integer RF (IRF), operand collectors (OPC), FP RF (FRF) and, when coredynp.regWindowing is set, the register-window array (RFWIN).
+   * Processors have separate architectural register files for each thread;
+   * therefore, the bypass buses need to travel across all the register files.
+   */
+  if (!exist) // nothing is built for a non-existent unit; IRF, FRF and RFWIN keep their null initial values
+    return;
+  int data;
+  clockRate = exClockRate; // the caller-supplied rate is used instead of coredynp.clockRate
+  executionTime = coredynp.executionTime;
+  /*********************************************************************************
+   * OC stage modelling (Syed Gilani)
+   *********************************************************************************/
+
+  // Crossbar: the bank and collector-unit counts are halved when XML->sys.architecture == 1; the third argument is 128 in both cases
+
+  if (XML->sys.architecture == 1) {
+    xbar_rfu = new Crossbar(XML->sys.core[ithCore].rf_banks / 2,
+                            XML->sys.core[ithCore].collector_units / 2, (128),
+                            &(g_tp.peri_global));
+  } else {
+    xbar_rfu = new Crossbar(XML->sys.core[ithCore].rf_banks,
+                            XML->sys.core[ithCore].collector_units, (128),
+                            &(g_tp.peri_global));
+  }
+
+  // new
+  // Crossbar(simd_width,shared_memory_banks,word_length*simd_width,&(g_tp.peri_global));
+
+  // Arbiter: always sized from the full rf_banks / collector_units counts, unlike the crossbar above
+  arbiter_rfu = new MCPAT_Arbiter(XML->sys.core[ithCore].rf_banks,
+                                  XML->sys.core[ithCore].collector_units, 1,
+                                  &(g_tp.peri_global));
+
+  // RF banks modelled here for GPGPU-Sim (Syed Gilani)
+  //
+  //**********************************IRF***************************************
+  data = coredynp.int_data_width; // NOTE(review): not read in the IRF section; overwritten with 32 before the OPC section
+  // data               *= 8;
+  interface_ip.is_cache = false;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = true;
+
+  interface_ip.line_sz =
+      16; // int(ceil(data/32.0))*4 * XML->sys.core[ithCore].simd_width/4 ;//2
+          // for Tesla as RF width half of SIMD width
+
+  interface_ip.line_sz =
+      16; // int(ceil(data/32.0))*4 * XML->sys.core[ithCore].simd_width/2 ;//2
+          // for Tesla as RF width half of SIMD width. NOTE(review): repeats the assignment above with the same value
+
+  interface_ip.cache_sz = coredynp.num_IRF_entry * 4;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = XML->sys.core[ithCore].rf_banks;
+
+  interface_ip.out_w =
+      interface_ip.line_sz *
+      8; // interface_ip.line_sz*XML->sys.core[ithCore].simd_width/4;
+         // //2 for Tesla and 4 for Fermi
+
+  interface_ip.out_w =
+      interface_ip.line_sz *
+      8; // interface_ip.line_sz*XML->sys.core[ithCore].simd_width/2;
+         // //2 for Tesla and 4 for Fermi. NOTE(review): repeats the assignment above with the same value
+
+  interface_ip.access_mode = 1;
+  interface_ip.throughput = 1 / (clockRate);
+  interface_ip.latency = 8.0 / (clockRate);
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 1;
+  interface_ip.obj_func_cycle_t = 0;
+  interface_ip.num_rw_ports =
+      0; // this is the transfer port for saving/restoring states when
+         // exceptions happen.
+  interface_ip.num_rd_ports = 1; // 2*coredynp.peak_issueW;
+  interface_ip.num_wr_ports = 1; // coredynp.peak_issueW;
+  interface_ip.num_se_rd_ports = 0;
+  IRF = new ArrayST(&interface_ip, "Integer Register File", Core_device,
+                    coredynp.opt_local, coredynp.core_ty);
+
+  IRF->area.set_area(IRF->area.get_area() + IRF->local_result.area *
+                                                coredynp.num_pipelines *
+                                                cdb_overhead);
+
+  area.set_area(area.get_area() + IRF->local_result.area + // unit total adds the unscaled IRF result area plus the crossbar and arbiter areas
+                xbar_rfu->area.get_area() + arbiter_rfu->area.get_area());
+  if (XML->sys.architecture == 1) { // scale the IRF dynamic read/write figures: 0.33 for architecture 1, 0.55 otherwise
+    IRF->local_result.power.readOp.dynamic *= .33;
+    IRF->local_result.power.writeOp.dynamic *= .33;
+  } else {
+    IRF->local_result.power.readOp.dynamic *= .55;
+    IRF->local_result.power.writeOp.dynamic *= .55;
+  }
+
+  /**
+   * Operand collectors (32-bit wide, 8 entry banks )
+   */
+  data = 32; // NOTE(review): not read in the OPC section; overwritten before the FRF section
+  // data               *= 8;
+  interface_ip.is_cache = false;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = true;
+
+  interface_ip.line_sz =
+      4; // int(ceil(data/32.0))*4 * XML->sys.core[ithCore].simd_width/4 ;//2
+         // for Tesla as RF width half of SIMD width
+
+  interface_ip.line_sz =
+      4; // int(ceil(data/32.0))*4 * XML->sys.core[ithCore].simd_width/2 ;//2
+         // for Tesla as RF width half of SIMD width. NOTE(review): repeats the assignment above with the same value
+
+  interface_ip.cache_sz = 8 * 4;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+
+  interface_ip.out_w =
+      interface_ip
+          .line_sz; // interface_ip.line_sz*XML->sys.core[ithCore].simd_width/4;
+                    // //2 for Tesla and 4 for Fermi
+
+  interface_ip.out_w =
+      interface_ip
+          .line_sz; // interface_ip.line_sz*XML->sys.core[ithCore].simd_width/2;
+                    // //2 for Tesla and 4 for Fermi. NOTE(review): no "* 8" here, unlike the IRF/FRF/RFWIN out_w assignments -- confirm the intended unit
+
+  interface_ip.access_mode = 1;
+  interface_ip.throughput = 1.0 / (clockRate);
+  interface_ip.latency = 1.0 / (clockRate);
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 1;
+  interface_ip.obj_func_cycle_t = 0;
+  interface_ip.num_rw_ports =
+      0; // this is the transfer port for saving/restoring states when
+         // exceptions happen.
+  interface_ip.num_rd_ports = 1; // 2*coredynp.peak_issueW;
+  interface_ip.num_wr_ports = 1; // coredynp.peak_issueW;
+  interface_ip.num_se_rd_ports = 0;
+  OPC = new ArrayST(&interface_ip, "Operand collectors", Core_device,
+                    coredynp.opt_local, coredynp.core_ty);
+
+  OPC->area.set_area(OPC->area.get_area() + OPC->local_result.area *
+                                                coredynp.num_pipelines *
+                                                cdb_overhead);
+
+  area.set_area(area.get_area() + OPC->local_result.area);
+
+  /********
+   * For GPGPUSim (Syed Gilani)
+   * Do not include FRF in final results for GPU. Only model the IRF
+   ********/
+
+  //**********************************FRF***************************************
+  data = coredynp.fp_data_width;
+  // data               *= 8;
+  interface_ip.is_cache = false;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = true;
+  interface_ip.line_sz = int(ceil(data / 32.0)) * 4;
+  interface_ip.cache_sz = coredynp.num_FRF_entry * interface_ip.line_sz;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 1;
+  interface_ip.throughput = 1.0 / clockRate;
+  interface_ip.latency = 1.0 / clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      1; // this is the transfer port for saving/restoring states when
+         // exceptions happen.
+  interface_ip.num_rd_ports = 2 * XML->sys.core[ithCore].issue_width;
+  // interface_ip.num_rd_ports    = 1;
+  interface_ip.num_wr_ports = XML->sys.core[ithCore].issue_width;
+  // interface_ip.num_wr_ports    = 1;
+  interface_ip.num_se_rd_ports = 0;
+  FRF = new ArrayST(&interface_ip, "Floating point Register File", Core_device, // the FRF is constructed, but its area is not added to this unit (the additions below are commented out)
+                    coredynp.opt_local, coredynp.core_ty);
+  // FRF->area.set_area(FRF->area.get_area()+
+  // FRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_fp_pipelines*cdb_overhead);
+  // area.set_area(area.get_area()+
+  // FRF->local_result.area*XML->sys.core[ithCore].number_hardware_threads*coredynp.num_fp_pipelines*cdb_overhead);
+  // area.set_area(area.get_area()*cdb_overhead);
+  // output_data_csv(FRF.RF.local_result);
+  int_regfile_height = IRF->local_result.cache_ht *
+                       XML->sys.core[ithCore].number_hardware_threads *
+                       sqrt(cdb_overhead);
+  fp_regfile_height = 0; // the FRF contributes no height; the original formula is kept commented out below
+  // fp_regfile_height =
+  // FRF->local_result.cache_ht*XML->sys.core[ithCore].number_hardware_threads*sqrt(cdb_overhead);
+  // since a EXU is associated with each pipeline, the cdb should not have
+  // longer length.
+  if (coredynp.regWindowing) {
+    //*********************************REG_WIN************************************
+    data =
+        coredynp
+            .int_data_width; // ECC, and usually 2 regs are transferred together
+                             // during window shifting. Niagara Mega cell
+    interface_ip.is_cache = false;
+    interface_ip.pure_cam = false;
+    interface_ip.pure_ram = true;
+    interface_ip.line_sz = int(ceil(data / 8.0));
+    interface_ip.cache_sz = XML->sys.core[ithCore].register_windows_size *
+                            IRF->l_ip.cache_sz *
+                            XML->sys.core[ithCore].number_hardware_threads;
+    interface_ip.assoc = 1;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8;
+    interface_ip.access_mode = 1;
+    interface_ip.throughput = 4.0 / clockRate;
+    interface_ip.latency = 4.0 / clockRate;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports =
+        1; // this is the transfer port for saving/restoring states when
+           // exceptions happen.
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    RFWIN = new ArrayST(&interface_ip, "RegWindow", Core_device,
+                        coredynp.opt_local, coredynp.core_ty);
+    RFWIN->area.set_area(RFWIN->area.get_area() +
+                         RFWIN->local_result.area * coredynp.num_pipelines);
+    area.set_area(area.get_area() +
+                  RFWIN->local_result.area * coredynp.num_pipelines);
+    // output_data_csv(RFWIN.RF.local_result);
+  }
+}
+
+EXECU::EXECU(ParseXML *XML_interface, int ithCore_,
+             InputParameter *interface_ip_, double lsq_height_,
+             const CoreDynParam &dyn_p_, double exClockRate, bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      lsq_height(lsq_height_), coredynp(dyn_p_), rfu(0), scheu(0), fp_u(0),
+      exeu(0), mul(0), int_bypass(0), intTagBypass(0), int_mul_bypass(0),
+      intTag_mul_Bypass(0), fp_bypass(0), fpTagBypass(0), exist(exist_),
+      rf_fu_clockRate(exClockRate) { // Builds the execution unit of core ithCore: register-file unit, scheduler, ALU/FPU/MUL functional units and the bypass interconnects
+  if (!exist) // nothing is built for a non-existent unit; all sub-unit pointers keep their null initial values
+    return;
+  double fu_height = 0.0; // NOTE(review): accumulated below but never read in this constructor
+  clockRate = coredynp.clockRate; // this unit uses the core clock; rfu and the functional units are given exClockRate instead
+  // cout <<"EXECU exClockRate: "<<exClockRate<<endl;
+  executionTime = coredynp.executionTime;
+  rfu = new RegFU(XML, ithCore, &interface_ip, coredynp, exClockRate);
+  scheu = new SchedulerU(XML, ithCore, &interface_ip, coredynp);
+  exeu = new FunctionalUnit(XML, ithCore, &interface_ip, coredynp, ALU,
+                            exClockRate);
+  area.set_area(area.get_area() + exeu->area.get_area() + rfu->area.get_area() +
+                scheu->area.get_area());
+  fu_height = exeu->FU_height;
+  if (coredynp.num_fpus > 0) { // FPU is optional; its area is added to the unit total
+    fp_u = new FunctionalUnit(XML, ithCore, &interface_ip, coredynp, FPU,
+                              exClockRate);
+    area.set_area(area.get_area() + fp_u->area.get_area());
+  }
+  if (coredynp.num_muls > 0) { // multiplier is optional; its area is added to the unit total
+    mul = new FunctionalUnit(XML, ithCore, &interface_ip, coredynp, MUL,
+                             exClockRate);
+    area.set_area(area.get_area() + mul->area.get_area());
+    fu_height += mul->FU_height;
+  }
+  /*
+   * broadcast logic, including int-broadcast; int_tag-broadcast; fp-broadcast;
+   * fp_tag-broadcast integer by pass has two paths and fp has 3 paths. on the
+   * same bus there are multiple tri-state drivers and muxes that go to
+   * different components on the same bus
+   */
+  if (XML->sys.Embedded) { // wire type and timing targets for the bypass interconnects created below
+    interface_ip.wt = Global_30;
+    interface_ip.wire_is_mat_type = 0;
+    interface_ip.wire_os_mat_type = 0;
+    interface_ip.throughput = 1.0 / clockRate;
+    interface_ip.latency = 1.0 / clockRate;
+  } else {
+    interface_ip.wt = Global;
+    interface_ip.wire_is_mat_type =
+        2; // start from semi-global since local wires are already used
+    interface_ip.wire_os_mat_type = 2;
+    interface_ip.throughput = 10.0 / clockRate; // Do not care
+    interface_ip.latency = 10.0 / clockRate;
+  }
+
+  if (coredynp.core_ty == Inorder) { // in-order: the data bypass sums RF + FU + LSQ heights; the tag bypass also adds scheu->Iw_height
+    int_bypass = new interconnect(
+        "Int Bypass Data", Core_device, 1, 1,
+        int(ceil(XML->sys.machine_bits / 32.0) * 32),
+        rfu->int_regfile_height + exeu->FU_height + lsq_height, &interface_ip,
+        3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+    bypass.area.set_area(bypass.area.get_area() + int_bypass->area.get_area());
+    intTagBypass = new interconnect(
+        "Int Bypass tag", Core_device, 1, 1, coredynp.perThreadState,
+        rfu->int_regfile_height + exeu->FU_height + lsq_height +
+            scheu->Iw_height,
+        &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+    bypass.area.set_area(bypass.area.get_area() +
+                         intTagBypass->area.get_area());
+
+    if (coredynp.num_muls > 0) {
+      int_mul_bypass = new interconnect(
+          "Mul Bypass Data", Core_device, 1, 1,
+          int(ceil(XML->sys.machine_bits / 32.0) * 32 * 1.5),
+          rfu->fp_regfile_height + exeu->FU_height + mul->FU_height + // NOTE(review): uses fp_regfile_height, whereas the OOO branches use int_regfile_height for the mul bypass -- confirm intended
+              lsq_height,
+          &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+      bypass.area.set_area(bypass.area.get_area() +
+                           int_mul_bypass->area.get_area());
+      intTag_mul_Bypass = new interconnect(
+          "Mul Bypass tag", Core_device, 1, 1, coredynp.perThreadState,
+          rfu->fp_regfile_height + exeu->FU_height + mul->FU_height + // NOTE(review): same fp_regfile_height usage as the mul data bypass above
+              lsq_height + scheu->Iw_height,
+          &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+      bypass.area.set_area(bypass.area.get_area() +
+                           intTag_mul_Bypass->area.get_area());
+    }
+
+    /*
+    if (coredynp.num_fpus>0)
+    {
+            fp_bypass    = new interconnect("FP Bypass Data" , Core_device, 1,
+    1, int(ceil(XML->sys.machine_bits/32.0)*32*1.5), rfu->fp_regfile_height +
+    fp_u->FU_height, &interface_ip, 3, false, 1.0, coredynp.opt_local,
+    coredynp.core_ty); bypass.area.set_area(bypass.area.get_area()
+    +fp_bypass->area.get_area()); fpTagBypass  = new interconnect("FP Bypass
+    tag"  , Core_device, 1, 1, coredynp.perThreadState, rfu->fp_regfile_height +
+    fp_u->FU_height + lsq_height + scheu->Iw_height, &interface_ip, 3,
+                            false, 1.0, coredynp.opt_local, coredynp.core_ty);
+            bypass.area.set_area(bypass.area.get_area()
+    +fpTagBypass->area.get_area());
+    }*/
+  }      /* if (coredynp.core_ty==Inorder) */
+  else { // OOO
+    if (coredynp.scheu_ty == PhysicalRegFile) {
+      /* For physical register based OOO,
+       * data broadcast interconnects cover across functional units, lsq, inst
+       * windows and register files, while tag broadcast interconnects also
+       * cover across ROB
+       */
+      int_bypass = new interconnect(
+          "Int Bypass Data", Core_device, 1, 1,
+          int(ceil(coredynp.int_data_width)),
+          rfu->int_regfile_height + exeu->FU_height + lsq_height, &interface_ip,
+          3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+      bypass.area.set_area(bypass.area.get_area() +
+                           int_bypass->area.get_area());
+      intTagBypass = new interconnect(
+          "Int Bypass tag", Core_device, 1, 1, coredynp.phy_ireg_width,
+          rfu->int_regfile_height + exeu->FU_height + lsq_height +
+              scheu->Iw_height + scheu->ROB_height,
+          &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty); // NOTE(review): unlike every other bypass in this constructor, this interconnect's area is not added to bypass.area -- confirm intended
+
+      if (coredynp.num_muls > 0) {
+        int_mul_bypass = new interconnect(
+            "Mul Bypass Data", Core_device, 1, 1,
+            int(ceil(coredynp.int_data_width)),
+            rfu->int_regfile_height + exeu->FU_height + mul->FU_height +
+                lsq_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        intTag_mul_Bypass = new interconnect(
+            "Mul Bypass tag", Core_device, 1, 1, coredynp.phy_ireg_width,
+            rfu->int_regfile_height + exeu->FU_height + mul->FU_height +
+                lsq_height + scheu->Iw_height + scheu->ROB_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        bypass.area.set_area(bypass.area.get_area() +
+                             int_mul_bypass->area.get_area());
+        bypass.area.set_area(bypass.area.get_area() +
+                             intTag_mul_Bypass->area.get_area());
+      }
+
+      if (coredynp.num_fpus > 0) {
+        fp_bypass = new interconnect("FP Bypass Data", Core_device, 1, 1,
+                                     int(ceil(coredynp.fp_data_width)),
+                                     rfu->fp_regfile_height + fp_u->FU_height,
+                                     &interface_ip, 3, false, 1.0,
+                                     coredynp.opt_local, coredynp.core_ty);
+        fpTagBypass = new interconnect(
+            "FP Bypass tag", Core_device, 1, 1, coredynp.phy_freg_width,
+            rfu->fp_regfile_height + fp_u->FU_height + lsq_height +
+                scheu->fp_Iw_height + scheu->ROB_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        bypass.area.set_area(bypass.area.get_area() +
+                             fp_bypass->area.get_area());
+        bypass.area.set_area(bypass.area.get_area() +
+                             fpTagBypass->area.get_area());
+      }
+    } else {
+      /*
+       * In RS based processor both data and tag are broadcast together,
+       * covering functional units, lsq, inst windows, register files, and ROBs
+       */
+      int_bypass = new interconnect(
+          "Int Bypass Data", Core_device, 1, 1,
+          int(ceil(coredynp.int_data_width)),
+          rfu->int_regfile_height + exeu->FU_height + lsq_height +
+              scheu->Iw_height + scheu->ROB_height,
+          &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+      intTagBypass = new interconnect(
+          "Int Bypass tag", Core_device, 1, 1, coredynp.phy_ireg_width,
+          rfu->int_regfile_height + exeu->FU_height + lsq_height +
+              scheu->Iw_height + scheu->ROB_height,
+          &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+      bypass.area.set_area(bypass.area.get_area() +
+                           int_bypass->area.get_area());
+      bypass.area.set_area(bypass.area.get_area() +
+                           intTagBypass->area.get_area());
+      if (coredynp.num_muls > 0) {
+        int_mul_bypass = new interconnect(
+            "Mul Bypass Data", Core_device, 1, 1,
+            int(ceil(coredynp.int_data_width)),
+            rfu->int_regfile_height + exeu->FU_height + mul->FU_height +
+                lsq_height + scheu->Iw_height + scheu->ROB_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        intTag_mul_Bypass = new interconnect(
+            "Mul Bypass tag", Core_device, 1, 1, coredynp.phy_ireg_width,
+            rfu->int_regfile_height + exeu->FU_height + mul->FU_height +
+                lsq_height + scheu->Iw_height + scheu->ROB_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        bypass.area.set_area(bypass.area.get_area() +
+                             int_mul_bypass->area.get_area());
+        bypass.area.set_area(bypass.area.get_area() +
+                             intTag_mul_Bypass->area.get_area());
+      }
+
+      if (coredynp.num_fpus > 0) {
+        fp_bypass = new interconnect(
+            "FP Bypass Data", Core_device, 1, 1,
+            int(ceil(coredynp.fp_data_width)),
+            rfu->fp_regfile_height + fp_u->FU_height + lsq_height +
+                scheu->fp_Iw_height + scheu->ROB_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        fpTagBypass = new interconnect(
+            "FP Bypass tag", Core_device, 1, 1, coredynp.phy_freg_width,
+            rfu->fp_regfile_height + fp_u->FU_height + lsq_height +
+                scheu->fp_Iw_height + scheu->ROB_height,
+            &interface_ip, 3, false, 1.0, coredynp.opt_local, coredynp.core_ty);
+        bypass.area.set_area(bypass.area.get_area() +
+                             fp_bypass->area.get_area());
+        bypass.area.set_area(bypass.area.get_area() +
+                             fpTagBypass->area.get_area());
+      }
+    } /* else */
+
+  } /* else */
+  area.set_area(area.get_area() /*+ bypass.area.get_area()*/); // the bypass-area term is commented out, so this call leaves the unit area unchanged
+}
+
+RENAMINGU::RENAMINGU(ParseXML *XML_interface, int ithCore_,
+                     InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+                     bool exist_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), iFRAT(0), fFRAT(0), iRRAT(0), fRRAT(0), ifreeL(0),
+      ffreeL(0), idcl(0), fdcl(0), RAHT(0), exist(exist_) {
+  /*
+   * Although renaming logic maybe be used in in-order processors,
+   * McPAT assumes no renaming logic is used since the performance gain is very
+   * limited and the only major inorder processor with renaming logic is
+   * Itainium that is a VLIW processor and different from current McPAT's model.
+   * physical register base OOO must have Dual-RAT architecture or equivalent
+   * structure.FRAT:FrontRAT, RRAT:RetireRAT; i,f prefix mean int and fp RAT for
+   * all Renaming logic, random accessible checkpointing is used, but only
+   * update when instruction retires. FRAT will be read twice and written once
+   * per instruction; RRAT will be write once per instruction when committing
+   * and reads out all when context switch checkpointing is implicit Renaming
+   * logic is duplicated for each different hardware threads
+   *
+   * No Dual-RAT is needed in RS-based OOO processors,
+   * however, RAT needs to do associative search in RAT, when instruction
+   * commits and ROB release the entry, to make sure all the renamings
+   * associated with the ROB to be released are updated at the same time. RAM
+   * scheme has # ARchi Reg entry with each entry hold phy reg tag, CAM scheme
+   * has # Phy Reg entry with each entry hold ARchi reg tag,
+   *
+   * Both RAM and CAM have same DCL
+   */
+  if (!exist)
+    return;
+  int tag, data, out_w;
+  //	interface_ip.wire_is_mat_type = 0;
+  //	interface_ip.wire_os_mat_type = 0;
+  //	interface_ip.wt               = Global_30;
+  clockRate = coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  if (coredynp.core_ty == OOO) {
+    // integer pipeline
+    if (coredynp.scheu_ty == PhysicalRegFile) {
+      if (coredynp.rm_ty ==
+          RAMbased) { // FRAT with global checkpointing (GCs) please see paper
+                      // tech report for detailed explaintions
+        data =
+            33; // int(ceil(coredynp.phy_ireg_width*(1+coredynp.globalCheckpoint)/8.0));
+        //			data
+        //= int(ceil(coredynp.phy_ireg_width/8.0));
+        out_w = 1; // int(ceil(coredynp.phy_ireg_width/8.0));
+        interface_ip.is_cache = false;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = true;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz =
+            data * XML->sys.core[ithCore].archi_Regs_IRF_size;
+        interface_ip.assoc = 1;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // the extra one port is for GCs
+        interface_ip.num_rd_ports = 2 * coredynp.decodeW;
+        interface_ip.num_wr_ports = coredynp.decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        iFRAT->area.set_area(
+            iFRAT->area.get_area() +
+            iFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + iFRAT->area.get_area());
+
+        //			//RAHT According to Intel, combine GC with FRAT
+        // is very costly. 			data =
+        // int(ceil(coredynp.phy_ireg_width/8.0)*coredynp.num_IRF_entry);
+        // out_w = data; 			interface_ip.is_cache
+        //= false; 			interface_ip.pure_cam            =
+        // false; 			interface_ip.pure_ram            = true;
+        // interface_ip.line_sz = data; 			interface_ip.cache_sz =
+        // data*coredynp.globalCheckpoint; interface_ip.assoc = 1;
+        // interface_ip.nbanks              = 1;
+        // interface_ip.out_w =
+        // out_w*8; 			interface_ip.access_mode         = 0;
+        // interface_ip.throughput = 1.0/clockRate;
+        // interface_ip.latency = 1.0/clockRate;
+        // interface_ip.obj_func_dyn_energy = 0;
+        //interface_ip.obj_func_dyn_power  = 0; 			interface_ip.obj_func_leak_power
+        //= 0; 			interface_ip.obj_func_cycle_t    = 1; 			interface_ip.num_rw_ports
+        //= 1;//the extra one
+        // port is for GCs 			interface_ip.num_rd_ports    =
+        // 2*coredynp.decodeW; 			interface_ip.num_wr_ports    =
+        // coredynp.decodeW;
+        //			interface_ip.num_se_rd_ports = 0;
+        //			iFRAT = new ArrayST(&interface_ip, "Int
+        // FrontRAT");
+        // iFRAT->area.set_area(iFRAT->area.get_area()+
+        // iFRAT->local_result.area*XML->sys.core[ithCore].number_hardware_threads);
+        //			area.set_area(area.get_area()+
+        // iFRAT->area.get_area());
+
+        // FRAT floating point
+        data = int(ceil(coredynp.phy_freg_width *
+                        (1 + coredynp.globalCheckpoint) / 8.0));
+        out_w = int(ceil(coredynp.phy_freg_width / 8.0));
+        interface_ip.is_cache = false;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = true;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz =
+            data * XML->sys.core[ithCore].archi_Regs_FRF_size;
+        interface_ip.assoc = 1;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // the extra one port is for GCs
+        interface_ip.num_rd_ports = 2 * coredynp.fp_decodeW;
+        interface_ip.num_wr_ports = coredynp.fp_decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        fFRAT->area.set_area(
+            fFRAT->area.get_area() +
+            fFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + fFRAT->area.get_area());
+
+      } else if ((coredynp.rm_ty == CAMbased)) {
+        // FRAT
+        tag = coredynp.arch_ireg_width;
+        data = int(
+            ceil((coredynp.arch_ireg_width + 1 * coredynp.globalCheckpoint) /
+                 8.0)); // the address of CAM needed to be sent out
+        out_w = int(ceil(coredynp.arch_ireg_width / 8.0));
+        interface_ip.is_cache = true;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = false;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz = data * XML->sys.core[ithCore].phy_Regs_IRF_size;
+        interface_ip.assoc = 0;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.specific_tag = 1;
+        interface_ip.tag_w = tag;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // for GCs
+        interface_ip.num_rd_ports = coredynp.decodeW;
+        interface_ip.num_wr_ports = coredynp.decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        interface_ip.num_search_ports = 2 * coredynp.decodeW;
+        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        iFRAT->area.set_area(
+            iFRAT->area.get_area() +
+            iFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + iFRAT->area.get_area());
+
+        // FRAT for FP
+        tag = coredynp.arch_freg_width;
+        data = int(
+            ceil((coredynp.arch_freg_width + 1 * coredynp.globalCheckpoint) /
+                 8.0)); // the address of CAM needed to be sent out
+        out_w = int(ceil(coredynp.arch_freg_width / 8.0));
+        interface_ip.is_cache = true;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = false;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz = data * XML->sys.core[ithCore].phy_Regs_FRF_size;
+        interface_ip.assoc = 0;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.specific_tag = 1;
+        interface_ip.tag_w = tag;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // for GCs
+        interface_ip.num_rd_ports = coredynp.fp_decodeW;
+        interface_ip.num_wr_ports = coredynp.fp_decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        interface_ip.num_search_ports = 2 * coredynp.fp_decodeW;
+        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        fFRAT->area.set_area(
+            fFRAT->area.get_area() +
+            fFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + fFRAT->area.get_area());
+      }
+
+      // RRAT is always RAM based, does not have GCs, and is used only for
+      // record latest non-speculative mapping
+      data = int(ceil(coredynp.phy_ireg_width / 8.0));
+      interface_ip.is_cache = false;
+      interface_ip.pure_cam = false;
+      interface_ip.pure_ram = true;
+      interface_ip.line_sz = data;
+      interface_ip.cache_sz = data *
+                              XML->sys.core[ithCore].archi_Regs_IRF_size *
+                              2; // HACK to make it as least 64B
+      interface_ip.assoc = 1;
+      interface_ip.nbanks = 1;
+      interface_ip.out_w = interface_ip.line_sz * 8;
+      interface_ip.access_mode = 1;
+      interface_ip.throughput = 1.0 / clockRate;
+      interface_ip.latency = 1.0 / clockRate;
+      interface_ip.obj_func_dyn_energy = 0;
+      interface_ip.obj_func_dyn_power = 0;
+      interface_ip.obj_func_leak_power = 0;
+      interface_ip.obj_func_cycle_t = 1;
+      interface_ip.num_rw_ports = 0;
+      interface_ip.num_rd_ports = XML->sys.core[ithCore].commit_width;
+      interface_ip.num_wr_ports = XML->sys.core[ithCore].commit_width;
+      interface_ip.num_se_rd_ports = 0;
+      iRRAT = new ArrayST(&interface_ip, "Int RetireRAT", Core_device,
+                          coredynp.opt_local, coredynp.core_ty);
+      iRRAT->area.set_area(iRRAT->area.get_area() +
+                           iRRAT->local_result.area *
+                               XML->sys.core[ithCore].number_hardware_threads);
+      area.set_area(area.get_area() + iRRAT->area.get_area());
+
+      // RRAT for FP
+      data = int(ceil(coredynp.phy_freg_width / 8.0));
+      interface_ip.is_cache = false;
+      interface_ip.pure_cam = false;
+      interface_ip.pure_ram = true;
+      interface_ip.line_sz = data;
+      interface_ip.cache_sz = data *
+                              XML->sys.core[ithCore].archi_Regs_FRF_size *
+                              2; // HACK to make it as least 64B
+      interface_ip.assoc = 1;
+      interface_ip.nbanks = 1;
+      interface_ip.out_w = interface_ip.line_sz * 8;
+      interface_ip.access_mode = 1;
+      interface_ip.throughput = 1.0 / clockRate;
+      interface_ip.latency = 1.0 / clockRate;
+      interface_ip.obj_func_dyn_energy = 0;
+      interface_ip.obj_func_dyn_power = 0;
+      interface_ip.obj_func_leak_power = 0;
+      interface_ip.obj_func_cycle_t = 1;
+      interface_ip.num_rw_ports = 0;
+      interface_ip.num_rd_ports = coredynp.fp_decodeW;
+      interface_ip.num_wr_ports = coredynp.fp_decodeW;
+      interface_ip.num_se_rd_ports = 0;
+      fRRAT = new ArrayST(&interface_ip, "Int RetireRAT", Core_device,
+                          coredynp.opt_local, coredynp.core_ty);
+      fRRAT->area.set_area(fRRAT->area.get_area() +
+                           fRRAT->local_result.area *
+                               XML->sys.core[ithCore].number_hardware_threads);
+      area.set_area(area.get_area() + fRRAT->area.get_area());
+
+      // Freelist of renaming unit always RAM based
+      // Recycle happens at two places: 1)when DCL check there are WAW, the
+      // Phyregisters/ROB directly recycles into freelist
+      // 2)When instruction commits the Phyregisters/ROB needed to be recycled.
+      // therefore num_wr port = decode-1(-1 means at least one phy reg will be
+      // used for the current renaming group) + commit width
+      data = int(ceil(coredynp.phy_ireg_width / 8.0));
+      interface_ip.is_cache = false;
+      interface_ip.pure_cam = false;
+      interface_ip.pure_ram = true;
+      interface_ip.line_sz = data;
+      interface_ip.cache_sz = data * coredynp.num_ifreelist_entries;
+      interface_ip.assoc = 1;
+      interface_ip.nbanks = 1;
+      interface_ip.out_w = interface_ip.line_sz * 8;
+      interface_ip.access_mode = 1;
+      interface_ip.throughput = 1.0 / clockRate;
+      interface_ip.latency = 1.0 / clockRate;
+      interface_ip.obj_func_dyn_energy = 0;
+      interface_ip.obj_func_dyn_power = 0;
+      interface_ip.obj_func_leak_power = 0;
+      interface_ip.obj_func_cycle_t = 1;
+      interface_ip.num_rw_ports = 1; // TODO
+      interface_ip.num_rd_ports = coredynp.decodeW;
+      interface_ip.num_wr_ports =
+          coredynp.decodeW - 1 + XML->sys.core[ithCore].commit_width;
+      // every cycle, (coredynp.decodeW -1) inst may need to send back it dest
+      // tags, committW insts needs to update freelist buffers
+      interface_ip.num_se_rd_ports = 0;
+      ifreeL = new ArrayST(&interface_ip, "Int Free List", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+      ifreeL->area.set_area(ifreeL->area.get_area() +
+                            ifreeL->local_result.area *
+                                XML->sys.core[ithCore].number_hardware_threads);
+      area.set_area(area.get_area() + ifreeL->area.get_area());
+
+      // freelist for FP
+      data = int(ceil(coredynp.phy_freg_width / 8.0));
+      interface_ip.is_cache = false;
+      interface_ip.pure_cam = false;
+      interface_ip.pure_ram = true;
+      interface_ip.line_sz = data;
+      interface_ip.cache_sz = data * coredynp.num_ffreelist_entries;
+      interface_ip.assoc = 1;
+      interface_ip.nbanks = 1;
+      interface_ip.out_w = interface_ip.line_sz * 8;
+      interface_ip.access_mode = 1;
+      interface_ip.throughput = 1.0 / clockRate;
+      interface_ip.latency = 1.0 / clockRate;
+      interface_ip.obj_func_dyn_energy = 0;
+      interface_ip.obj_func_dyn_power = 0;
+      interface_ip.obj_func_leak_power = 0;
+      interface_ip.obj_func_cycle_t = 1;
+      interface_ip.num_rw_ports = 1;
+      interface_ip.num_rd_ports = coredynp.fp_decodeW;
+      interface_ip.num_wr_ports =
+          coredynp.fp_decodeW - 1 + XML->sys.core[ithCore].commit_width;
+      interface_ip.num_se_rd_ports = 0;
+      ffreeL = new ArrayST(&interface_ip, "Int Free List", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+      ffreeL->area.set_area(ffreeL->area.get_area() +
+                            ffreeL->local_result.area *
+                                XML->sys.core[ithCore].number_hardware_threads);
+      area.set_area(area.get_area() + ffreeL->area.get_area());
+
+      idcl = new dep_resource_conflict_check(
+          &interface_ip, coredynp,
+          coredynp.phy_ireg_width); // TODO:Separate 2 sections See TR
+      fdcl = new dep_resource_conflict_check(&interface_ip, coredynp,
+                                             coredynp.phy_freg_width);
+
+    } else if (coredynp.scheu_ty == ReservationStation) {
+      if (coredynp.rm_ty == RAMbased) {
+        /*
+         * however, RAT needs to do associative search in RAT, when instruction
+         * commits and ROB release the entry, to make sure all the renamings
+         * associated with the ROB to be released are updated to ARF at the same
+         * time. RAM based RAT for RS base OOO does not save the search
+         * operations. Its advantage is to have less entries than CAM based RAT
+         * so that it is more scalable as number of ROB/physical regs increases.
+         */
+        tag = coredynp.phy_ireg_width;
+        data = int(ceil(coredynp.phy_ireg_width *
+                        (1 + coredynp.globalCheckpoint) / 8.0));
+        out_w = int(ceil(coredynp.phy_ireg_width / 8.0));
+        interface_ip.is_cache = true;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = false;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz =
+            data * XML->sys.core[ithCore].archi_Regs_IRF_size;
+        interface_ip.assoc = 0;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // the extra one port is for GCs
+        interface_ip.num_rd_ports = 2 * coredynp.decodeW;
+        interface_ip.num_wr_ports = coredynp.decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        interface_ip.num_search_ports = coredynp.commitW; // TODO
+        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        iFRAT->local_result.adjust_area();
+        iFRAT->area.set_area(
+            iFRAT->area.get_area() +
+            iFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + iFRAT->area.get_area());
+
+        // FP
+        tag = coredynp.phy_freg_width;
+        data = int(ceil(coredynp.phy_freg_width *
+                        (1 + coredynp.globalCheckpoint) / 8.0));
+        out_w = int(ceil(coredynp.phy_freg_width / 8.0));
+        interface_ip.is_cache = true;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = false;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz =
+            data * XML->sys.core[ithCore].archi_Regs_FRF_size;
+        interface_ip.assoc = 0;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // the extra one port is for GCs
+        interface_ip.num_rd_ports = 2 * coredynp.fp_decodeW;
+        interface_ip.num_wr_ports = coredynp.fp_decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        interface_ip.num_search_ports =
+            coredynp.fp_decodeW; // actually is fp commit width
+        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        fFRAT->local_result.adjust_area();
+        fFRAT->area.set_area(
+            fFRAT->area.get_area() +
+            fFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + fFRAT->area.get_area());
+
+      } else if ((coredynp.rm_ty == CAMbased)) {
+        // FRAT
+        tag = coredynp.arch_ireg_width;
+        data = int(ceil(coredynp.arch_ireg_width +
+                        1 * coredynp.globalCheckpoint /
+                            8.0)); // the address of CAM needed to be sent out
+        out_w = int(ceil(coredynp.arch_ireg_width / 8.0));
+        interface_ip.is_cache = true;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = false;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz = data * XML->sys.core[ithCore].phy_Regs_IRF_size;
+        interface_ip.assoc = 0;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.specific_tag = 1;
+        interface_ip.tag_w = tag;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // for GCs
+        interface_ip.num_rd_ports =
+            XML->sys.core[ithCore].decode_width; // 0;TODO
+        interface_ip.num_wr_ports = XML->sys.core[ithCore].decode_width;
+        interface_ip.num_se_rd_ports = 0;
+        interface_ip.num_search_ports = 2 * XML->sys.core[ithCore].decode_width;
+        iFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        iFRAT->area.set_area(
+            iFRAT->area.get_area() +
+            iFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + iFRAT->area.get_area());
+
+        // FRAT
+        tag = coredynp.arch_freg_width;
+        data = int(ceil(coredynp.arch_freg_width +
+                        1 * coredynp.globalCheckpoint /
+                            8.0)); // the address of CAM needed to be sent out
+        out_w = int(ceil(coredynp.arch_freg_width / 8.0));
+        interface_ip.is_cache = true;
+        interface_ip.pure_cam = false;
+        interface_ip.pure_ram = false;
+        interface_ip.line_sz = data;
+        interface_ip.cache_sz = data * XML->sys.core[ithCore].phy_Regs_FRF_size;
+        interface_ip.assoc = 0;
+        interface_ip.nbanks = 1;
+        interface_ip.out_w = out_w * 8;
+        interface_ip.specific_tag = 1;
+        interface_ip.tag_w = tag;
+        interface_ip.access_mode = 2;
+        interface_ip.throughput = 1.0 / clockRate;
+        interface_ip.latency = 1.0 / clockRate;
+        interface_ip.obj_func_dyn_energy = 0;
+        interface_ip.obj_func_dyn_power = 0;
+        interface_ip.obj_func_leak_power = 0;
+        interface_ip.obj_func_cycle_t = 1;
+        interface_ip.num_rw_ports = 1; // for GCs
+        interface_ip.num_rd_ports =
+            XML->sys.core[ithCore].decode_width; // 0;TODO;
+        interface_ip.num_wr_ports = coredynp.fp_decodeW;
+        interface_ip.num_se_rd_ports = 0;
+        interface_ip.num_search_ports = 2 * coredynp.fp_decodeW;
+        fFRAT = new ArrayST(&interface_ip, "Int FrontRAT", Core_device,
+                            coredynp.opt_local, coredynp.core_ty);
+        fFRAT->area.set_area(
+            fFRAT->area.get_area() +
+            fFRAT->local_result.area *
+                XML->sys.core[ithCore].number_hardware_threads);
+        area.set_area(area.get_area() + fFRAT->area.get_area());
+      }
+      // No RRAT for RS based OOO
+      // Freelist of renaming unit of RS based OOO is unifed for both int and fp
+      // renaming unit since the ROB is unified
+      data = int(ceil(coredynp.phy_ireg_width / 8.0));
+      interface_ip.is_cache = false;
+      interface_ip.pure_cam = false;
+      interface_ip.pure_ram = true;
+      interface_ip.line_sz = data;
+      interface_ip.cache_sz = data * coredynp.num_ifreelist_entries;
+      interface_ip.assoc = 1;
+      interface_ip.nbanks = 1;
+      interface_ip.out_w = interface_ip.line_sz * 8;
+      interface_ip.access_mode = 1;
+      interface_ip.throughput = 1.0 / clockRate;
+      interface_ip.latency = 1.0 / clockRate;
+      interface_ip.obj_func_dyn_energy = 0;
+      interface_ip.obj_func_dyn_power = 0;
+      interface_ip.obj_func_leak_power = 0;
+      interface_ip.obj_func_cycle_t = 1;
+      interface_ip.num_rw_ports = 1; // TODO
+      interface_ip.num_rd_ports = XML->sys.core[ithCore].decode_width;
+      interface_ip.num_wr_ports = XML->sys.core[ithCore].decode_width - 1 +
+                                  XML->sys.core[ithCore].commit_width;
+      interface_ip.num_se_rd_ports = 0;
+      ifreeL = new ArrayST(&interface_ip, "Unified Free List", Core_device,
+                           coredynp.opt_local, coredynp.core_ty);
+      ifreeL->area.set_area(ifreeL->area.get_area() +
+                            ifreeL->local_result.area *
+                                XML->sys.core[ithCore].number_hardware_threads);
+      area.set_area(area.get_area() + ifreeL->area.get_area());
+
+      idcl = new dep_resource_conflict_check(
+          &interface_ip, coredynp,
+          coredynp.phy_ireg_width); // TODO:Separate 2 sections See TR
+      fdcl = new dep_resource_conflict_check(&interface_ip, coredynp,
+                                             coredynp.phy_freg_width);
+    }
+  }
+  if (coredynp.core_ty == Inorder && coredynp.issueW > 1) {
+    /* Dependency check logic will only present when decode(issue) width>1.
+     *  Multiple issue in order processor can do without renaming, but dcl is a
+     * must.
+     */
+    idcl = new dep_resource_conflict_check(
+        &interface_ip, coredynp,
+        coredynp.phy_ireg_width); // TODO:Separate 2 sections See TR
+    fdcl = new dep_resource_conflict_check(&interface_ip, coredynp,
+                                           coredynp.phy_freg_width);
+  }
+}
+// Builds core `ithCore_`: creates its sub-units and accumulates their areas.
+Core::Core(ParseXML *XML_interface, int ithCore_, InputParameter *interface_ip_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      ifu(0), lsu(0), mmu(0), exu(0), rnu(0), corepipe(0), undiffCore(0),
+      l2cache(0) {
+  /**
+   * Testing: (to be removed) added by syed
+   */
+  // XML->sys.core[ithCore].simd_width=8;// (8)
+  // XML->sys.core[ithCore].collector_units=4;// (4)
+  // XML->sys.core[ithCore].core_clock_ratio=2.0;// (2.0)
+  // XML->sys.core[ithCore].warp_size=32;// (32)
+  // (the four overrides above are disabled test values)
+  /*
+   * initialize, compute and optimize individual components.
+   */
+  // Idle figures start at zero; a private L2 is built only when configured.
+  IdleCoreEnergy = 0;
+  IdlePower_PerCore = 0;
+  double pipeline_area_per_unit;
+  if (XML->sys.Private_L2) {
+    l2cache = new SharedCache(XML, ithCore, &interface_ip);
+  }
+  //  interface_ip.wire_is_mat_type = 2;
+  //  interface_ip.wire_os_mat_type = 2;
+  //  interface_ip.wt               =Global_30;
+  set_core_param();
+  clockRate = coredynp.clockRate;
+  exClockRate = clockRate * XML->sys.core[ithCore].core_clock_ratio;
+  // EXECU receives exClockRate, i.e. clockRate scaled by core_clock_ratio.
+  executionTime = coredynp.executionTime;
+  ifu = new InstFetchU(XML, ithCore, &interface_ip, coredynp);
+  lsu = new LoadStoreU(XML, ithCore, &interface_ip, coredynp);
+  mmu = new MemManU(XML, ithCore, &interface_ip, coredynp);
+  exu = new EXECU(XML, ithCore, &interface_ip, lsu->lsq_height, coredynp,
+                  exClockRate, true);
+  // undiffCore is always built; rnu is built for OOO cores only.
+  undiffCore = new UndiffCore(XML, ithCore, &interface_ip, coredynp);
+  if (coredynp.core_ty == OOO) {
+    rnu = new RENAMINGU(XML, ithCore, &interface_ip, coredynp);
+  }
+  corepipe = new Pipeline(&interface_ip, coredynp);
+  // Pipeline area is split into 5 shares for OOO (one goes to rnu), else 4.
+  if (coredynp.core_ty == OOO) {
+    pipeline_area_per_unit =
+        (corepipe->area.get_area() * coredynp.num_pipelines) / 5.0;
+    if (rnu->exist) {
+      rnu->area.set_area(rnu->area.get_area() + pipeline_area_per_unit);
+    }
+  } else {
+    pipeline_area_per_unit =
+        (corepipe->area.get_area() * coredynp.num_pipelines) / 4.0;
+  }
+  // Each existing unit absorbs one pipeline share, then joins the core area.
+  // area.set_area(area.get_area()+ corepipe->area.get_area());
+  if (ifu->exist) {
+    ifu->area.set_area(ifu->area.get_area() + pipeline_area_per_unit);
+    area.set_area(area.get_area() + ifu->area.get_area());
+  }
+  if (lsu->exist) {
+    lsu->area.set_area(lsu->area.get_area() + pipeline_area_per_unit);
+    area.set_area(area.get_area() + lsu->area.get_area());
+  }
+  if (exu->exist) {
+    exu->area.set_area(exu->area.get_area() + pipeline_area_per_unit);
+    area.set_area(area.get_area() + exu->area.get_area());
+  }
+  if (mmu->exist) {
+    mmu->area.set_area(mmu->area.get_area() + pipeline_area_per_unit);
+    area.set_area(area.get_area() + mmu->area.get_area());
+  }
+  // rnu already took its pipeline share above; here it only joins the total.
+  if (coredynp.core_ty == OOO) {
+    if (rnu->exist) {
+
+      area.set_area(area.get_area() + rnu->area.get_area());
+    }
+  }
+
+  if (undiffCore->exist) {
+    area.set_area(area.get_area() + undiffCore->area.get_area());
+  }
+
+  if (XML->sys.Private_L2) {
+    area.set_area(area.get_area() + l2cache->area.get_area());
+  }
+  //  //clock power
+  //  clockNetwork.init_wire_external(is_default, &interface_ip);
+  //  clockNetwork.clk_area           =area*1.1;//10% of placement overhead.
+  //  rule of thumb clockNetwork.end_wiring_level   =5;//toplevel metal
+  //  clockNetwork.start_wiring_level =5;//toplevel metal
+  //  clockNetwork.num_regs           = corepipe.tot_stage_vector;
+  //  clockNetwork.optimize_wire();
+}
+// Computes predictor-array power: peak (TDP) when is_tdp, else runtime.
+void BranchPredictor::computeEnergy(bool is_tdp) {
+  if (!exist)
+    return;
+  double r_access;
+  double w_access;
+  if (is_tdp) { // peak: accesses derived from predictionW and BR_duty_cycle
+    r_access = coredynp.predictionW * coredynp.BR_duty_cycle;
+    w_access = 0 * coredynp.BR_duty_cycle; // no writes are charged at peak
+    globalBPT->stats_t.readAc.access = r_access;
+    globalBPT->stats_t.writeAc.access = w_access;
+    globalBPT->tdp_stats = globalBPT->stats_t;
+
+    L1_localBPT->stats_t.readAc.access = r_access;
+    L1_localBPT->stats_t.writeAc.access = w_access;
+    L1_localBPT->tdp_stats = L1_localBPT->stats_t;
+
+    L2_localBPT->stats_t.readAc.access = r_access;
+    L2_localBPT->stats_t.writeAc.access = w_access;
+    L2_localBPT->tdp_stats = L2_localBPT->stats_t;
+
+    chooser->stats_t.readAc.access = r_access;
+    chooser->stats_t.writeAc.access = w_access;
+    chooser->tdp_stats = chooser->stats_t;
+
+    RAS->stats_t.readAc.access = r_access;
+    RAS->stats_t.writeAc.access = w_access;
+    RAS->tdp_stats = RAS->stats_t;
+  } else { // runtime: access counts come from the XML core statistics
+    // The resolution of BPT accesses is coarse, but this is
+    // because most simulators cannot track finer grained details
+    r_access = XML->sys.core[ithCore].branch_instructions;
+    w_access =
+        XML->sys.core[ithCore].branch_mispredictions +
+        0.1 * XML->sys.core[ithCore]
+                  .branch_instructions; // 10% of BR will flip internal bits//0
+    globalBPT->stats_t.readAc.access = r_access;
+    globalBPT->stats_t.writeAc.access = w_access;
+    globalBPT->rtp_stats = globalBPT->stats_t;
+
+    L1_localBPT->stats_t.readAc.access = r_access;
+    L1_localBPT->stats_t.writeAc.access = w_access;
+    L1_localBPT->rtp_stats = L1_localBPT->stats_t;
+
+    L2_localBPT->stats_t.readAc.access = r_access;
+    L2_localBPT->stats_t.writeAc.access = w_access;
+    L2_localBPT->rtp_stats = L2_localBPT->stats_t;
+
+    chooser->stats_t.readAc.access = r_access;
+    chooser->stats_t.writeAc.access = w_access;
+    chooser->rtp_stats = chooser->stats_t;
+    // The RAS is driven by function_calls for both reads and writes.
+    RAS->stats_t.readAc.access = XML->sys.core[ithCore].function_calls;
+    RAS->stats_t.writeAc.access = XML->sys.core[ithCore].function_calls;
+    RAS->rtp_stats = RAS->stats_t;
+  }
+  // Rebuild each array's power_t from scratch on every call.
+  globalBPT->power_t.reset();
+  L1_localBPT->power_t.reset();
+  L2_localBPT->power_t.reset();
+  chooser->power_t.reset();
+  RAS->power_t.reset();
+  // Read and write contributions both accumulate into readOp.dynamic.
+  globalBPT->power_t.readOp.dynamic +=
+      globalBPT->local_result.power.readOp.dynamic *
+          globalBPT->stats_t.readAc.access +
+      globalBPT->stats_t.writeAc.access *
+          globalBPT->local_result.power.writeOp.dynamic;
+  L1_localBPT->power_t.readOp.dynamic +=
+      L1_localBPT->local_result.power.readOp.dynamic *
+          L1_localBPT->stats_t.readAc.access +
+      L1_localBPT->stats_t.writeAc.access *
+          L1_localBPT->local_result.power.writeOp.dynamic;
+
+  L2_localBPT->power_t.readOp.dynamic +=
+      L2_localBPT->local_result.power.readOp.dynamic *
+          L2_localBPT->stats_t.readAc.access +
+      L2_localBPT->stats_t.writeAc.access *
+          L2_localBPT->local_result.power.writeOp.dynamic;
+
+  chooser->power_t.readOp.dynamic +=
+      chooser->local_result.power.readOp.dynamic *
+          chooser->stats_t.readAc.access +
+      chooser->stats_t.writeAc.access *
+          chooser->local_result.power.writeOp.dynamic;
+  RAS->power_t.readOp.dynamic +=
+      RAS->local_result.power.readOp.dynamic * RAS->stats_t.readAc.access +
+      RAS->stats_t.writeAc.access * RAS->local_result.power.writeOp.dynamic;
+  // Per-array totals, then the unit total over all five arrays (L2 included).
+  if (is_tdp) {
+    globalBPT->power =
+        globalBPT->power_t + globalBPT->local_result.power * pppm_lkg;
+    L1_localBPT->power =
+        L1_localBPT->power_t + L1_localBPT->local_result.power * pppm_lkg;
+    L2_localBPT->power =
+        L2_localBPT->power_t + L2_localBPT->local_result.power * pppm_lkg;
+    chooser->power = chooser->power_t + chooser->local_result.power * pppm_lkg;
+    RAS->power =
+        RAS->power_t + RAS->local_result.power * coredynp.pppm_lkg_multhread;
+
+    power = power + globalBPT->power + L1_localBPT->power +
+            L2_localBPT->power + chooser->power + RAS->power;
+  } else {
+    globalBPT->rt_power =
+        globalBPT->power_t + globalBPT->local_result.power * pppm_lkg;
+    L1_localBPT->rt_power =
+        L1_localBPT->power_t + L1_localBPT->local_result.power * pppm_lkg;
+    L2_localBPT->rt_power =
+        L2_localBPT->power_t + L2_localBPT->local_result.power * pppm_lkg;
+    chooser->rt_power =
+        chooser->power_t + chooser->local_result.power * pppm_lkg;
+    RAS->rt_power =
+        RAS->power_t + RAS->local_result.power * coredynp.pppm_lkg_multhread;
+    rt_power = rt_power + globalBPT->rt_power + L1_localBPT->rt_power +
+               L2_localBPT->rt_power + chooser->rt_power + RAS->rt_power;
+  }
+}
+// Prints area, dynamic power and leakage of each predictor array (TDP only).
+void BranchPredictor::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist)
+    return;
+  string indent_str(indent, ' '); // note: plevel is not used in this function
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device; // picks leakage figure
+  if (is_tdp) {
+    cout << indent_str << "Global Predictor:" << endl;
+    cout << indent_str_next << "Area = " << globalBPT->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << globalBPT->power.readOp.dynamic * clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? globalBPT->power.readOp.longer_channel_leakage
+                          : globalBPT->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << globalBPT->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << globalBPT->rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+    cout << indent_str << "Local Predictor:" << endl;
+    cout << indent_str << "L1_Local Predictor:" << endl;
+    cout << indent_str_next << "Area = " << L1_localBPT->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << L1_localBPT->power.readOp.dynamic * clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? L1_localBPT->power.readOp.longer_channel_leakage
+                          : L1_localBPT->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << L1_localBPT->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << L1_localBPT->rt_power.readOp.dynamic / executionTime << " W"
+         << endl;
+    cout << endl;
+    cout << indent_str << "L2_Local Predictor:" << endl;
+    cout << indent_str_next << "Area = " << L2_localBPT->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << L2_localBPT->power.readOp.dynamic * clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? L2_localBPT->power.readOp.longer_channel_leakage
+                          : L2_localBPT->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << L2_localBPT->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << L2_localBPT->rt_power.readOp.dynamic / executionTime << " W"
+         << endl;
+    cout << endl;
+
+    cout << indent_str << "Chooser:" << endl;
+    cout << indent_str_next << "Area = " << chooser->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << chooser->power.readOp.dynamic * clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? chooser->power.readOp.longer_channel_leakage
+                          : chooser->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << chooser->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << chooser->rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+    cout << indent_str << "RAS:" << endl;
+    cout << indent_str_next << "Area = " << RAS->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << RAS->power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? RAS->power.readOp.longer_channel_leakage
+                          : RAS->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << RAS->power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next
+         << "Runtime Dynamic = " << RAS->rt_power.readOp.dynamic / executionTime
+         << " W" << endl;
+    cout << endl;
+  } else { // runtime report is disabled: everything below is commented out
+    //		cout << indent_str_next << "Global Predictor    Peak Dynamic = "
+    //<< globalBPT->rt_power.readOp.dynamic*clockRate << " W" << endl;
+    // cout << indent_str_next << "Global Predictor    Subthreshold Leakage = "
+    // << globalBPT->rt_power.readOp.leakage <<" W" << endl; 		cout <<
+    // indent_str_next
+    //<< "Global Predictor    Gate Leakage = " <<
+    // globalBPT->rt_power.readOp.gate_leakage << " W" << endl;
+    // cout
+    // << indent_str_next << "Local Predictor   Peak Dynamic = " <<
+    // L1_localBPT->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout
+    // << indent_str_next << "Local Predictor   Subthreshold Leakage = " <<
+    // L1_localBPT->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next << "Local Predictor   Gate Leakage = " <<
+    // L1_localBPT->rt_power.readOp.gate_leakage  << " W" << endl;
+    // cout
+    // << indent_str_next << "Chooser   Peak Dynamic = " <<
+    // chooser->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout
+    // << indent_str_next << "Chooser   Subthreshold Leakage = " <<
+    // chooser->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next
+    //<< "Chooser   Gate Leakage = " << chooser->rt_power.readOp.gate_leakage <<
+    //" W" << endl; 		cout << indent_str_next << "RAS   Peak Dynamic =
+    //"
+    //<< RAS->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout << indent_str_next << "RAS   Subthreshold Leakage = " <<
+    // RAS->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next
+    // << "RAS   Gate Leakage = " << RAS->rt_power.readOp.gate_leakage  << " W"
+    //<< endl;
+  }
+}
+
+/*
+ * Set the access statistics of every array in the instruction-fetch unit
+ * (icache with its miss/fill/prefetch buffers, instruction buffer, BTB and
+ * the three instruction decoders) and derive their energy from them.
+ *
+ * is_tdp == true : statistics come from port counts and pipeline widths;
+ *                  the results are accumulated into `power`.
+ * is_tdp == false: statistics come from the XML activity counters of core
+ *                  `ithCore`; `rt_power` is reset and then re-accumulated.
+ *
+ * executionTime is refreshed on every call, before the `exist` check.
+ */
+void InstFetchU::computeEnergy(bool is_tdp) {
+  // presumably target_core_clockrate is in MHz, making this seconds — TODO
+  // confirm against the XML schema
+  executionTime =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6); // Syed
+  if (!exist)
+    return;
+  if (is_tdp) {
+    // init stats for Peak
+    icache.caches->stats_t.readAc.access =
+        icache.caches->l_ip.num_rw_ports * coredynp.IFU_duty_cycle;
+    icache.caches->stats_t.readAc.miss = 0;
+    icache.caches->stats_t.readAc.hit = icache.caches->stats_t.readAc.access -
+                                        icache.caches->stats_t.readAc.miss;
+    icache.caches->tdp_stats = icache.caches->stats_t;
+
+    icache.missb->stats_t.readAc.access = icache.missb->stats_t.readAc.hit =
+        icache.missb->l_ip.num_search_ports;
+    icache.missb->stats_t.writeAc.access = icache.missb->stats_t.writeAc.hit =
+        icache.missb->l_ip.num_search_ports;
+    icache.missb->tdp_stats = icache.missb->stats_t;
+
+    icache.ifb->stats_t.readAc.access = icache.ifb->stats_t.readAc.hit =
+        icache.ifb->l_ip.num_search_ports;
+    icache.ifb->stats_t.writeAc.access = icache.ifb->stats_t.writeAc.hit =
+        icache.ifb->l_ip.num_search_ports;
+    icache.ifb->tdp_stats = icache.ifb->stats_t;
+
+    icache.prefetchb->stats_t.readAc.access =
+        icache.prefetchb->stats_t.readAc.hit =
+            icache.prefetchb->l_ip.num_search_ports;
+    // Use the prefetch buffer's own hit counter and port count here; the
+    // write side previously went through icache.ifb (copy-paste slip), which
+    // left prefetchb's writeAc.hit unset.
+    icache.prefetchb->stats_t.writeAc.access =
+        icache.prefetchb->stats_t.writeAc.hit =
+            icache.prefetchb->l_ip.num_search_ports;
+    icache.prefetchb->tdp_stats = icache.prefetchb->stats_t;
+
+    IB->stats_t.readAc.access = IB->stats_t.writeAc.access =
+        XML->sys.core[ithCore].peak_issue_width;
+    IB->tdp_stats = IB->stats_t;
+
+    if (coredynp.predictionW > 0) {
+      // Peak uses predictionW reads and no writes instead of the XML
+      // BTB.read_accesses / BTB.write_accesses counters.
+      // NOTE(review): BTB->tdp_stats is not recorded here, unlike the other
+      // arrays — confirm nothing downstream reads it.
+      BTB->stats_t.readAc.access = coredynp.predictionW;
+      BTB->stats_t.writeAc.access = 0;
+    }
+
+    ID_inst->stats_t.readAc.access = coredynp.decodeW;
+    ID_operand->stats_t.readAc.access = coredynp.decodeW;
+    ID_misc->stats_t.readAc.access = coredynp.decodeW;
+    ID_inst->tdp_stats = ID_inst->stats_t;
+    ID_operand->tdp_stats = ID_operand->stats_t;
+    ID_misc->tdp_stats = ID_misc->stats_t;
+
+  } /* if (is_tdp) */
+  else {
+    rt_power.reset();
+    icache.rt_power.reset(); // Jingwen
+    // init stats for Runtime Dynamic (RTP)
+    icache.caches->stats_t.readAc.access =
+        XML->sys.core[ithCore].icache.read_accesses;
+    icache.caches->stats_t.readAc.miss =
+        XML->sys.core[ithCore].icache.read_misses;
+
+    icache.caches->stats_t.readAc.hit = icache.caches->stats_t.readAc.access -
+                                        icache.caches->stats_t.readAc.miss;
+    icache.caches->rtp_stats = icache.caches->stats_t;
+
+    // Every icache read miss is counted as one read and one write of each
+    // of the three side buffers.
+    icache.missb->stats_t.readAc.access = icache.caches->stats_t.readAc.miss;
+    icache.missb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss;
+    icache.missb->rtp_stats = icache.missb->stats_t;
+
+    icache.ifb->stats_t.readAc.access = icache.caches->stats_t.readAc.miss;
+    icache.ifb->stats_t.writeAc.access = icache.caches->stats_t.readAc.miss;
+    icache.ifb->rtp_stats = icache.ifb->stats_t;
+
+    icache.prefetchb->stats_t.readAc.access =
+        icache.caches->stats_t.readAc.miss;
+    icache.prefetchb->stats_t.writeAc.access =
+        icache.caches->stats_t.readAc.miss;
+    icache.prefetchb->rtp_stats = icache.prefetchb->stats_t;
+
+    IB->stats_t.readAc.access = IB->stats_t.writeAc.access =
+        XML->sys.core[ithCore].total_instructions;
+    IB->rtp_stats = IB->stats_t;
+    if (coredynp.predictionW > 0) {
+      // Taken from the BTB counters rather than branch_instructions /
+      // branch_mispredictions.
+      BTB->stats_t.readAc.access = XML->sys.core[ithCore].BTB.read_accesses;
+      BTB->stats_t.writeAc.access = XML->sys.core[ithCore].BTB.write_accesses;
+      BTB->rtp_stats = BTB->stats_t;
+    }
+    ID_inst->stats_t.readAc.access = XML->sys.core[ithCore].total_instructions;
+    ID_operand->stats_t.readAc.access =
+        XML->sys.core[ithCore].total_instructions;
+    ID_misc->stats_t.readAc.access = XML->sys.core[ithCore].total_instructions;
+    ID_inst->rtp_stats = ID_inst->stats_t;
+    ID_operand->rtp_stats = ID_operand->stats_t;
+    ID_misc->rtp_stats = ID_misc->stats_t;
+  }
+
+  // power_t is the scratch accumulator for this call. The decoders'
+  // power_t is deliberately not reset: it carries a value between calls
+  // (see the TDP branch below).
+  icache.power_t.reset();
+  IB->power_t.reset();
+  if (coredynp.predictionW > 0) {
+    BTB->power_t.reset();
+  }
+
+  icache.power_t.readOp.dynamic +=
+      (icache.caches->stats_t.readAc.hit *
+           icache.caches->local_result.power.readOp.dynamic +
+       icache.caches->stats_t.readAc.miss *
+           icache.caches->local_result.power.readOp
+               .dynamic + // assume tag data accessed in parallel
+       icache.caches->stats_t.readAc.miss *
+           icache.caches->local_result.power.writeOp
+               .dynamic); // read miss in Icache cause a write to Icache
+  icache.power_t.readOp.dynamic +=
+      icache.missb->stats_t.readAc.access *
+          icache.missb->local_result.power.searchOp.dynamic +
+      icache.missb->stats_t.writeAc.access *
+          icache.missb->local_result.power.writeOp
+              .dynamic; // each access to missb involves a CAM and a write
+  icache.power_t.readOp.dynamic +=
+      icache.ifb->stats_t.readAc.access *
+          icache.ifb->local_result.power.searchOp.dynamic +
+      icache.ifb->stats_t.writeAc.access *
+          icache.ifb->local_result.power.writeOp.dynamic;
+  icache.power_t.readOp.dynamic +=
+      icache.prefetchb->stats_t.readAc.access *
+          icache.prefetchb->local_result.power.searchOp.dynamic +
+      icache.prefetchb->stats_t.writeAc.access *
+          icache.prefetchb->local_result.power.writeOp.dynamic;
+  IB->power_t.readOp.dynamic +=
+      IB->local_result.power.readOp.dynamic * IB->stats_t.readAc.access +
+      IB->stats_t.writeAc.access * IB->local_result.power.writeOp.dynamic;
+  if (coredynp.predictionW > 0) {
+    BTB->power_t.readOp.dynamic +=
+        BTB->local_result.power.readOp.dynamic * BTB->stats_t.readAc.access +
+        BTB->stats_t.writeAc.access * BTB->local_result.power.writeOp.dynamic;
+
+    BPT->computeEnergy(is_tdp);
+  }
+
+  if (is_tdp) {
+    // All four icache arrays are scaled by pppm_lkg (an alternative that
+    // applied pppm_Isub to the three buffers is not used).
+    icache.power = icache.power_t + (icache.caches->local_result.power +
+                                     icache.missb->local_result.power +
+                                     icache.ifb->local_result.power +
+                                     icache.prefetchb->local_result.power) *
+                                        pppm_lkg;
+
+    IB->power = IB->power_t + IB->local_result.power * pppm_lkg;
+    power = power + icache.power + IB->power;
+    if (coredynp.predictionW > 0) {
+      BTB->power = BTB->power_t + BTB->local_result.power * pppm_lkg;
+      power = power + BTB->power + BPT->power;
+    }
+
+    // Stash the decoders' unscaled dynamic value in power_t before scaling
+    // `power` by the peak access count; the runtime branch multiplies the
+    // stashed value by the runtime access count.
+    // NOTE(review): this appears to require the TDP pass to run before the
+    // runtime pass — confirm with the caller.
+    ID_inst->power_t.readOp.dynamic = ID_inst->power.readOp.dynamic;
+    ID_operand->power_t.readOp.dynamic = ID_operand->power.readOp.dynamic;
+    ID_misc->power_t.readOp.dynamic = ID_misc->power.readOp.dynamic;
+
+    ID_inst->power.readOp.dynamic *= ID_inst->tdp_stats.readAc.access;
+    ID_operand->power.readOp.dynamic *= ID_operand->tdp_stats.readAc.access;
+    ID_misc->power.readOp.dynamic *= ID_misc->tdp_stats.readAc.access;
+
+    power = power + (ID_inst->power + ID_operand->power + ID_misc->power);
+  } /* if (is_tdp) */
+  else {
+    // Same pppm_lkg scaling of the four icache arrays as in the TDP branch.
+    icache.rt_power = icache.power_t + (icache.caches->local_result.power +
+                                        icache.missb->local_result.power +
+                                        icache.ifb->local_result.power +
+                                        icache.prefetchb->local_result.power) *
+                                           pppm_lkg;
+
+    // Only the dynamic term is set for the instruction buffer here; the
+    // power_t + local_result.power * pppm_lkg form used for the other
+    // arrays is not applied.
+    IB->rt_power.readOp.dynamic =
+        IB->local_result.power.readOp.dynamic * IB->rtp_stats.readAc.access;
+    IB->rt_power.readOp.dynamic +=
+        IB->local_result.power.writeOp.dynamic * IB->rtp_stats.writeAc.access;
+    rt_power = rt_power + icache.rt_power + IB->rt_power;
+    if (coredynp.predictionW > 0) {
+      BTB->rt_power = BTB->power_t + BTB->local_result.power * pppm_lkg;
+      rt_power = rt_power + BTB->rt_power + BPT->rt_power;
+    }
+
+    ID_inst->rt_power.readOp.dynamic =
+        ID_inst->power_t.readOp.dynamic * ID_inst->rtp_stats.readAc.access;
+    ID_operand->rt_power.readOp.dynamic = ID_operand->power_t.readOp.dynamic *
+                                          ID_operand->rtp_stats.readAc.access;
+    ID_misc->rt_power.readOp.dynamic =
+        ID_misc->power_t.readOp.dynamic * ID_misc->rtp_stats.readAc.access;
+
+    rt_power = rt_power +
+               (ID_inst->rt_power + ID_operand->rt_power + ID_misc->rt_power);
+  }
+}
+
+/*
+ * Write the instruction-fetch unit's report to cout: one section each for
+ * the instruction cache, the branch target buffer and branch predictor
+ * (only when coredynp.predictionW > 0), the instruction buffer and the
+ * instruction decoder.
+ *
+ * indent  number of spaces before section titles; entries get two more.
+ * plevel  when > 3 the branch predictor prints its own breakdown.
+ * is_tdp  only the true case produces output.
+ */
+void InstFetchU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist)
+    return;
+  const string title_pad(indent, ' ');
+  const string entry_pad(indent + 2, ' ');
+  const bool use_long_channel = XML->sys.longer_channel_device;
+
+  // The per-component runtime listing is disabled, so nothing is printed
+  // outside TDP mode.
+  if (!is_tdp)
+    return;
+
+  // ---- Instruction cache ----
+  const double icache_sub_leak =
+      use_long_channel ? icache.power.readOp.longer_channel_leakage
+                       : icache.power.readOp.leakage;
+  cout << title_pad << "Instruction Cache:" << endl
+       << entry_pad << "Area = " << icache.area.get_area() * 1e-6 << " mm^2"
+       << endl
+       << entry_pad
+       << "Peak Dynamic = " << icache.power.readOp.dynamic * clockRate << " W"
+       << endl
+       << entry_pad << "Subthreshold Leakage = " << icache_sub_leak << " W"
+       << endl
+       << entry_pad << "Gate Leakage = " << icache.power.readOp.gate_leakage
+       << " W" << endl
+       << entry_pad << "Runtime Dynamic = "
+       << icache.rt_power.readOp.dynamic / executionTime << " W" << endl
+       << endl;
+
+  // ---- Branch target buffer and branch predictor ----
+  if (coredynp.predictionW > 0) {
+    const double btb_sub_leak =
+        use_long_channel ? BTB->power.readOp.longer_channel_leakage
+                         : BTB->power.readOp.leakage;
+    cout << title_pad << "Branch Target Buffer:" << endl
+         << entry_pad << "Area = " << BTB->area.get_area() * 1e-6 << " mm^2"
+         << endl
+         << entry_pad
+         << "Peak Dynamic = " << BTB->power.readOp.dynamic * clockRate << " W"
+         << endl
+         << entry_pad << "Subthreshold Leakage = " << btb_sub_leak << " W"
+         << endl
+         << entry_pad << "Gate Leakage = " << BTB->power.readOp.gate_leakage
+         << " W" << endl
+         << entry_pad << "Runtime Dynamic = "
+         << BTB->rt_power.readOp.dynamic / executionTime << " W" << endl
+         << endl;
+
+    if (BPT->exist) {
+      const double bpt_sub_leak =
+          use_long_channel ? BPT->power.readOp.longer_channel_leakage
+                           : BPT->power.readOp.leakage;
+      cout << title_pad << "Branch Predictor:" << endl
+           << entry_pad << "Area = " << BPT->area.get_area() * 1e-6 << " mm^2"
+           << endl
+           << entry_pad
+           << "Peak Dynamic = " << BPT->power.readOp.dynamic * clockRate
+           << " W" << endl
+           << entry_pad << "Subthreshold Leakage = " << bpt_sub_leak << " W"
+           << endl
+           << entry_pad << "Gate Leakage = " << BPT->power.readOp.gate_leakage
+           << " W" << endl
+           << entry_pad << "Runtime Dynamic = "
+           << BPT->rt_power.readOp.dynamic / executionTime << " W" << endl
+           << endl;
+      if (plevel > 3) {
+        BPT->displayEnergy(indent + 4, plevel, is_tdp);
+      }
+    }
+  }
+
+  // ---- Instruction buffer ----
+  const double ib_sub_leak = use_long_channel
+                                 ? IB->power.readOp.longer_channel_leakage
+                                 : IB->power.readOp.leakage;
+  cout << title_pad << "Instruction Buffer:" << endl
+       << entry_pad << "Area = " << IB->area.get_area() * 1e-6 << " mm^2"
+       << endl
+       << entry_pad
+       << "Peak Dynamic = " << IB->power.readOp.dynamic * clockRate << " W"
+       << endl
+       << entry_pad << "Subthreshold Leakage = " << ib_sub_leak << " W" << endl
+       << entry_pad << "Gate Leakage = " << IB->power.readOp.gate_leakage
+       << " W" << endl
+       << entry_pad
+       << "Runtime Dynamic = " << IB->rt_power.readOp.dynamic / executionTime
+       << " W" << endl
+       << endl;
+
+  // ---- Instruction decoder: instruction, operand and misc parts summed ----
+  double decoder_sub_leak;
+  if (use_long_channel) {
+    decoder_sub_leak = ID_inst->power.readOp.longer_channel_leakage +
+                       ID_operand->power.readOp.longer_channel_leakage +
+                       ID_misc->power.readOp.longer_channel_leakage;
+  } else {
+    decoder_sub_leak = ID_inst->power.readOp.leakage +
+                       ID_operand->power.readOp.leakage +
+                       ID_misc->power.readOp.leakage;
+  }
+  cout << title_pad << "Instruction Decoder:" << endl
+       << entry_pad << "Area = "
+       << (ID_inst->area.get_area() + ID_operand->area.get_area() +
+           ID_misc->area.get_area()) *
+              coredynp.decodeW * 1e-6
+       << " mm^2" << endl
+       << entry_pad << "Peak Dynamic = "
+       << (ID_inst->power.readOp.dynamic + ID_operand->power.readOp.dynamic +
+           ID_misc->power.readOp.dynamic) *
+              clockRate
+       << " W" << endl
+       << entry_pad << "Subthreshold Leakage = " << decoder_sub_leak << " W"
+       << endl
+       << entry_pad << "Gate Leakage = "
+       << (ID_inst->power.readOp.gate_leakage +
+           ID_operand->power.readOp.gate_leakage +
+           ID_misc->power.readOp.gate_leakage)
+       << " W" << endl
+       << entry_pad << "Runtime Dynamic = "
+       << (ID_inst->rt_power.readOp.dynamic +
+           ID_operand->rt_power.readOp.dynamic +
+           ID_misc->rt_power.readOp.dynamic) /
+              executionTime
+       << " W" << endl
+       << endl;
+}
+
+void RENAMINGU::computeEnergy(bool is_tdp) {
+  if (!exist)
+    return;
+  double pppm_t[4] = {1, 1, 1, 1};
+  if (is_tdp) { // init stats for Peak
+    if (coredynp.core_ty == OOO) {
+      if (coredynp.scheu_ty == PhysicalRegFile) {
+        if (coredynp.rm_ty == RAMbased) {
+          iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_rd_ports;
+          iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports;
+          iFRAT->tdp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_rd_ports;
+          fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports;
+          fFRAT->tdp_stats = fFRAT->stats_t;
+
+        } else if ((coredynp.rm_ty == CAMbased)) {
+          iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_search_ports;
+          iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports;
+          iFRAT->tdp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_search_ports;
+          fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports;
+          fFRAT->tdp_stats = fFRAT->stats_t;
+        }
+
+        iRRAT->stats_t.readAc.access = iRRAT->l_ip.num_rd_ports;
+        iRRAT->stats_t.writeAc.access = iRRAT->l_ip.num_wr_ports;
+        iRRAT->tdp_stats = iRRAT->stats_t;
+
+        fRRAT->stats_t.readAc.access = fRRAT->l_ip.num_rd_ports;
+        fRRAT->stats_t.writeAc.access = fRRAT->l_ip.num_wr_ports;
+        fRRAT->tdp_stats = fRRAT->stats_t;
+
+        ifreeL->stats_t.readAc.access =
+            coredynp.decodeW; // ifreeL->l_ip.num_rd_ports;;
+        ifreeL->stats_t.writeAc.access =
+            coredynp.decodeW; // ifreeL->l_ip.num_wr_ports;
+        ifreeL->tdp_stats = ifreeL->stats_t;
+
+        ffreeL->stats_t.readAc.access =
+            coredynp.decodeW; // ffreeL->l_ip.num_rd_ports;
+        ffreeL->stats_t.writeAc.access =
+            coredynp.decodeW; // ffreeL->l_ip.num_wr_ports;
+        ffreeL->tdp_stats = ffreeL->stats_t;
+      } else if (coredynp.scheu_ty == ReservationStation) {
+        if (coredynp.rm_ty == RAMbased) {
+          iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_rd_ports;
+          iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports;
+          iFRAT->stats_t.searchAc.access = iFRAT->l_ip.num_search_ports;
+          iFRAT->tdp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_rd_ports;
+          fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports;
+          fFRAT->stats_t.searchAc.access = fFRAT->l_ip.num_search_ports;
+          fFRAT->tdp_stats = fFRAT->stats_t;
+
+        } else if ((coredynp.rm_ty == CAMbased)) {
+          iFRAT->stats_t.readAc.access = iFRAT->l_ip.num_search_ports;
+          iFRAT->stats_t.writeAc.access = iFRAT->l_ip.num_wr_ports;
+          iFRAT->tdp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = fFRAT->l_ip.num_search_ports;
+          fFRAT->stats_t.writeAc.access = fFRAT->l_ip.num_wr_ports;
+          fFRAT->tdp_stats = fFRAT->stats_t;
+        }
+        // Unified free list for both int and fp
+        ifreeL->stats_t.readAc.access =
+            coredynp.decodeW; // ifreeL->l_ip.num_rd_ports;
+        ifreeL->stats_t.writeAc.access =
+            coredynp.decodeW; // ifreeL->l_ip.num_wr_ports;
+        ifreeL->tdp_stats = ifreeL->stats_t;
+      }
+      idcl->stats_t.readAc.access = coredynp.decodeW;
+      fdcl->stats_t.readAc.access = coredynp.decodeW;
+      idcl->tdp_stats = idcl->stats_t;
+      fdcl->tdp_stats = fdcl->stats_t;
+    } else {
+      if (coredynp.issueW > 1) {
+        idcl->stats_t.readAc.access = coredynp.decodeW;
+        fdcl->stats_t.readAc.access = coredynp.decodeW;
+        idcl->tdp_stats = idcl->stats_t;
+        fdcl->tdp_stats = fdcl->stats_t;
+      }
+    }
+
+  } else { // init stats for Runtime Dynamic (RTP)
+    if (coredynp.core_ty == OOO) {
+      if (coredynp.scheu_ty == PhysicalRegFile) {
+        if (coredynp.rm_ty == RAMbased) {
+          iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads;
+          iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes;
+          iFRAT->rtp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads;
+          fFRAT->stats_t.writeAc.access =
+              XML->sys.core[ithCore].fp_rename_writes;
+          fFRAT->rtp_stats = fFRAT->stats_t;
+        } else if ((coredynp.rm_ty == CAMbased)) {
+          iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads;
+          iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes;
+          iFRAT->rtp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads;
+          fFRAT->stats_t.writeAc.access =
+              XML->sys.core[ithCore].fp_rename_writes;
+          fFRAT->rtp_stats = fFRAT->stats_t;
+        }
+
+        iRRAT->stats_t.readAc.access =
+            XML->sys.core[ithCore]
+                .rename_writes; // Hack, should be (context switch + branch
+                                // mispredictions)*16
+        iRRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes;
+        iRRAT->rtp_stats = iRRAT->stats_t;
+
+        fRRAT->stats_t.readAc.access =
+            XML->sys.core[ithCore]
+                .fp_rename_writes; // Hack, should be (context switch + branch
+                                   // mispredictions)*16
+        fRRAT->stats_t.writeAc.access = XML->sys.core[ithCore].fp_rename_writes;
+        fRRAT->rtp_stats = fRRAT->stats_t;
+
+        ifreeL->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads;
+        ifreeL->stats_t.writeAc.access =
+            2 * XML->sys.core[ithCore].rename_writes;
+        ifreeL->rtp_stats = ifreeL->stats_t;
+
+        ffreeL->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads;
+        ffreeL->stats_t.writeAc.access =
+            2 * XML->sys.core[ithCore].fp_rename_writes;
+        ffreeL->rtp_stats = ffreeL->stats_t;
+      } else if (coredynp.scheu_ty == ReservationStation) {
+        if (coredynp.rm_ty == RAMbased) {
+          iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads;
+          iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes;
+          iFRAT->stats_t.searchAc.access =
+              XML->sys.core[ithCore]
+                  .committed_int_instructions; // hack: not all committed
+                                               // instructions use regs.
+          iFRAT->rtp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads;
+          fFRAT->stats_t.writeAc.access =
+              XML->sys.core[ithCore].fp_rename_writes;
+          fFRAT->stats_t.searchAc.access =
+              XML->sys.core[ithCore].committed_fp_instructions;
+          fFRAT->rtp_stats = fFRAT->stats_t;
+        } else if ((coredynp.rm_ty == CAMbased)) {
+          iFRAT->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads;
+          iFRAT->stats_t.writeAc.access = XML->sys.core[ithCore].rename_writes;
+          iFRAT->rtp_stats = iFRAT->stats_t;
+
+          fFRAT->stats_t.readAc.access = XML->sys.core[ithCore].fp_rename_reads;
+          fFRAT->stats_t.writeAc.access =
+              XML->sys.core[ithCore].fp_rename_writes;
+          fFRAT->rtp_stats = fFRAT->stats_t;
+        }
+        // Unified free list for both int and fp since the ROB act as physcial
+        // registers
+        ifreeL->stats_t.readAc.access = XML->sys.core[ithCore].rename_reads +
+                                        XML->sys.core[ithCore].fp_rename_reads;
+        ifreeL->stats_t.writeAc.access =
+            2 * (XML->sys.core[ithCore].rename_writes +
+                 XML->sys.core[ithCore]
+                     .fp_rename_writes); // HACK: 2-> since some of renaming in
+                                         // the same group are terminated early
+        ifreeL->rtp_stats = ifreeL->stats_t;
+      }
+      idcl->stats_t.readAc.access = 3 * coredynp.decodeW * coredynp.decodeW *
+                                    XML->sys.core[ithCore].rename_reads;
+      fdcl->stats_t.readAc.access = 3 * coredynp.fp_issueW *
+                                    coredynp.fp_issueW *
+                                    XML->sys.core[ithCore].fp_rename_writes;
+      idcl->rtp_stats = idcl->stats_t;
+      fdcl->rtp_stats = fdcl->stats_t;
+    } else {
+      if (coredynp.issueW > 1) {
+        idcl->stats_t.readAc.access =
+            2 * XML->sys.core[ithCore].int_instructions;
+        fdcl->stats_t.readAc.access = XML->sys.core[ithCore].fp_instructions;
+        idcl->rtp_stats = idcl->stats_t;
+        fdcl->rtp_stats = fdcl->stats_t;
+      }
+    }
+  }
+  /* Compute engine */
+  if (coredynp.core_ty == OOO) {
+    if (coredynp.scheu_ty == PhysicalRegFile) {
+      if (coredynp.rm_ty == RAMbased) {
+        iFRAT->power_t.reset();
+        fFRAT->power_t.reset();
+
+        iFRAT->power_t.readOp.dynamic +=
+            (iFRAT->stats_t.readAc.access *
+                 (iFRAT->local_result.power.readOp.dynamic +
+                  idcl->power.readOp.dynamic) +
+             iFRAT->stats_t.writeAc.access *
+                 iFRAT->local_result.power.writeOp.dynamic);
+        fFRAT->power_t.readOp.dynamic +=
+            (fFRAT->stats_t.readAc.access *
+                 (fFRAT->local_result.power.readOp.dynamic +
+                  fdcl->power.readOp.dynamic) +
+             fFRAT->stats_t.writeAc.access *
+                 fFRAT->local_result.power.writeOp.dynamic);
+      } else if ((coredynp.rm_ty == CAMbased)) {
+        iFRAT->power_t.reset();
+        fFRAT->power_t.reset();
+        iFRAT->power_t.readOp.dynamic +=
+            (iFRAT->stats_t.readAc.access *
+                 (iFRAT->local_result.power.searchOp.dynamic +
+                  idcl->power.readOp.dynamic) +
+             iFRAT->stats_t.writeAc.access *
+                 iFRAT->local_result.power.writeOp.dynamic);
+        fFRAT->power_t.readOp.dynamic +=
+            (fFRAT->stats_t.readAc.access *
+                 (fFRAT->local_result.power.searchOp.dynamic +
+                  fdcl->power.readOp.dynamic) +
+             fFRAT->stats_t.writeAc.access *
+                 fFRAT->local_result.power.writeOp.dynamic);
+      }
+
+      iRRAT->power_t.reset();
+      fRRAT->power_t.reset();
+      ifreeL->power_t.reset();
+      ffreeL->power_t.reset();
+
+      iRRAT->power_t.readOp.dynamic +=
+          (iRRAT->stats_t.readAc.access *
+               iRRAT->local_result.power.readOp.dynamic +
+           iRRAT->stats_t.writeAc.access *
+               iRRAT->local_result.power.writeOp.dynamic);
+      fRRAT->power_t.readOp.dynamic +=
+          (fRRAT->stats_t.readAc.access *
+               fRRAT->local_result.power.readOp.dynamic +
+           fRRAT->stats_t.writeAc.access *
+               fRRAT->local_result.power.writeOp.dynamic);
+      ifreeL->power_t.readOp.dynamic +=
+          (ifreeL->stats_t.readAc.access *
+               ifreeL->local_result.power.readOp.dynamic +
+           ifreeL->stats_t.writeAc.access *
+               ifreeL->local_result.power.writeOp.dynamic);
+      ffreeL->power_t.readOp.dynamic +=
+          (ffreeL->stats_t.readAc.access *
+               ffreeL->local_result.power.readOp.dynamic +
+           ffreeL->stats_t.writeAc.access *
+               ffreeL->local_result.power.writeOp.dynamic);
+
+    } else if (coredynp.scheu_ty == ReservationStation) {
+      if (coredynp.rm_ty == RAMbased) {
+        iFRAT->power_t.reset();
+        fFRAT->power_t.reset();
+
+        iFRAT->power_t.readOp.dynamic +=
+            (iFRAT->stats_t.readAc.access *
+                 (iFRAT->local_result.power.readOp.dynamic +
+                  idcl->power.readOp.dynamic) +
+             iFRAT->stats_t.writeAc.access *
+                 iFRAT->local_result.power.writeOp.dynamic +
+             iFRAT->stats_t.searchAc.access *
+                 iFRAT->local_result.power.searchOp.dynamic);
+        fFRAT->power_t.readOp.dynamic +=
+            (fFRAT->stats_t.readAc.access *
+                 (fFRAT->local_result.power.readOp.dynamic +
+                  fdcl->power.readOp.dynamic) +
+             fFRAT->stats_t.writeAc.access *
+                 fFRAT->local_result.power.writeOp.dynamic +
+             fFRAT->stats_t.searchAc.access *
+                 fFRAT->local_result.power.searchOp.dynamic);
+      } else if ((coredynp.rm_ty == CAMbased)) {
+        iFRAT->power_t.reset();
+        fFRAT->power_t.reset();
+        iFRAT->power_t.readOp.dynamic +=
+            (iFRAT->stats_t.readAc.access *
+                 (iFRAT->local_result.power.searchOp.dynamic +
+                  idcl->power.readOp.dynamic) +
+             iFRAT->stats_t.writeAc.access *
+                 iFRAT->local_result.power.writeOp.dynamic);
+        fFRAT->power_t.readOp.dynamic +=
+            (fFRAT->stats_t.readAc.access *
+                 (fFRAT->local_result.power.searchOp.dynamic +
+                  fdcl->power.readOp.dynamic) +
+             fFRAT->stats_t.writeAc.access *
+                 fFRAT->local_result.power.writeOp.dynamic);
+      }
+      ifreeL->power_t.reset();
+      ifreeL->power_t.readOp.dynamic +=
+          (ifreeL->stats_t.readAc.access *
+               ifreeL->local_result.power.readOp.dynamic +
+           ifreeL->stats_t.writeAc.access *
+               ifreeL->local_result.power.writeOp.dynamic);
+    }
+
+  } else {
+    if (coredynp.issueW > 1) {
+      idcl->power_t.reset();
+      fdcl->power_t.reset();
+      set_pppm(pppm_t, idcl->stats_t.readAc.access, coredynp.num_hthreads,
+               coredynp.num_hthreads, idcl->stats_t.readAc.access);
+      idcl->power_t = idcl->power * pppm_t;
+      set_pppm(pppm_t, fdcl->stats_t.readAc.access, coredynp.num_hthreads,
+               coredynp.num_hthreads, idcl->stats_t.readAc.access);
+      fdcl->power_t = fdcl->power * pppm_t;
+    }
+  }
+
+  // assign value to tpd and rtp
+  if (is_tdp) {
+    if (coredynp.core_ty == OOO) {
+      if (coredynp.scheu_ty == PhysicalRegFile) {
+        iFRAT->power =
+            iFRAT->power_t +
+            (iFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            idcl->power_t;
+        fFRAT->power =
+            fFRAT->power_t +
+            (fFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            fdcl->power_t;
+        iRRAT->power = iRRAT->power_t +
+                       iRRAT->local_result.power * coredynp.pppm_lkg_multhread;
+        fRRAT->power = fRRAT->power_t +
+                       fRRAT->local_result.power * coredynp.pppm_lkg_multhread;
+        ifreeL->power = ifreeL->power_t + ifreeL->local_result.power *
+                                              coredynp.pppm_lkg_multhread;
+        ffreeL->power = ffreeL->power_t + ffreeL->local_result.power *
+                                              coredynp.pppm_lkg_multhread;
+        power = power + (iFRAT->power + fFRAT->power) +
+                (iRRAT->power + fRRAT->power) + (ifreeL->power + ffreeL->power);
+      } else if (coredynp.scheu_ty == ReservationStation) {
+        iFRAT->power =
+            iFRAT->power_t +
+            (iFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            idcl->power_t;
+        fFRAT->power =
+            fFRAT->power_t +
+            (fFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            fdcl->power_t;
+        ifreeL->power = ifreeL->power_t + ifreeL->local_result.power *
+                                              coredynp.pppm_lkg_multhread;
+        power = power + (iFRAT->power + fFRAT->power) + ifreeL->power;
+      }
+    } else {
+      power = power + idcl->power_t + fdcl->power_t;
+    }
+
+  } else {
+    if (coredynp.core_ty == OOO) {
+      if (coredynp.scheu_ty == PhysicalRegFile) {
+        iFRAT->rt_power =
+            iFRAT->power_t +
+            (iFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            idcl->power_t;
+        fFRAT->rt_power =
+            fFRAT->power_t +
+            (fFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            fdcl->power_t;
+        iRRAT->rt_power = iRRAT->power_t + iRRAT->local_result.power *
+                                               coredynp.pppm_lkg_multhread;
+        fRRAT->rt_power = fRRAT->power_t + fRRAT->local_result.power *
+                                               coredynp.pppm_lkg_multhread;
+        ifreeL->rt_power = ifreeL->power_t + ifreeL->local_result.power *
+                                                 coredynp.pppm_lkg_multhread;
+        ffreeL->rt_power = ffreeL->power_t + ffreeL->local_result.power *
+                                                 coredynp.pppm_lkg_multhread;
+        rt_power = rt_power + (iFRAT->rt_power + fFRAT->rt_power) +
+                   (iRRAT->rt_power + fRRAT->rt_power) +
+                   (ifreeL->rt_power + ffreeL->rt_power);
+      } else if (coredynp.scheu_ty == ReservationStation) {
+        iFRAT->rt_power =
+            iFRAT->power_t +
+            (iFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            idcl->power_t;
+        fFRAT->rt_power =
+            fFRAT->power_t +
+            (fFRAT->local_result.power) * coredynp.pppm_lkg_multhread +
+            fdcl->power_t;
+        ifreeL->rt_power = ifreeL->power_t + ifreeL->local_result.power *
+                                                 coredynp.pppm_lkg_multhread;
+        rt_power =
+            rt_power + (iFRAT->rt_power + fFRAT->rt_power) + ifreeL->rt_power;
+      }
+    } else {
+      rt_power = rt_power + idcl->power_t + fdcl->power_t;
+    }
+  }
+}
+
+void RENAMINGU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist) // Prints the renaming unit's power report to cout.
+    return;   // No output when `exist` is false; `plevel` is never read here.
+  string indent_str(indent, ' ');          // prefix for section headings
+  string indent_str_next(indent + 2, ' '); // prefix for value lines
+  bool long_channel = XML->sys.longer_channel_device; // selects leakage field
+
+  if (is_tdp) {
+    // TDP view: one titled section per structure; area is shown for OOO only.
+    if (coredynp.core_ty == OOO) {
+      cout << indent_str << "Int Front End RAT:" << endl;
+      cout << indent_str_next << "Area = " << iFRAT->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << iFRAT->power.readOp.dynamic * clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? iFRAT->power.readOp.longer_channel_leakage
+                            : iFRAT->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << iFRAT->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << iFRAT->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+      cout << indent_str << "FP Front End RAT:" << endl;
+      cout << indent_str_next << "Area = " << fFRAT->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << fFRAT->power.readOp.dynamic * clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? fFRAT->power.readOp.longer_channel_leakage
+                            : fFRAT->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << fFRAT->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << fFRAT->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+      cout << indent_str << "Free List:" << endl;
+      cout << indent_str_next << "Area = " << ifreeL->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << ifreeL->power.readOp.dynamic * clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? ifreeL->power.readOp.longer_channel_leakage
+                            : ifreeL->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << ifreeL->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << ifreeL->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+
+      if (coredynp.scheu_ty == PhysicalRegFile) { // retire RATs + FP free list
+        cout << indent_str << "Int Retire RAT: " << endl;
+        cout << indent_str_next << "Area = " << iRRAT->area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next
+             << "Peak Dynamic = " << iRRAT->power.readOp.dynamic * clockRate
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? iRRAT->power.readOp.longer_channel_leakage
+                              : iRRAT->power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << iRRAT->power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next << "Runtime Dynamic = "
+             << iRRAT->rt_power.readOp.dynamic / executionTime << " W" << endl;
+        cout << endl;
+        cout << indent_str << "FP Retire RAT:" << endl;
+        cout << indent_str_next << "Area = " << fRRAT->area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next
+             << "Peak Dynamic = " << fRRAT->power.readOp.dynamic * clockRate
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? fRRAT->power.readOp.longer_channel_leakage
+                              : fRRAT->power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << fRRAT->power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next << "Runtime Dynamic = "
+             << fRRAT->rt_power.readOp.dynamic / executionTime << " W" << endl;
+        cout << endl;
+        cout << indent_str << "FP Free List:" << endl;
+        cout << indent_str_next << "Area = " << ffreeL->area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next
+             << "Peak Dynamic = " << ffreeL->power.readOp.dynamic * clockRate
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? ffreeL->power.readOp.longer_channel_leakage
+                              : ffreeL->power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << ffreeL->power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next << "Runtime Dynamic = "
+             << ffreeL->rt_power.readOp.dynamic / executionTime << " W" << endl;
+        cout << endl;
+      }
+    } else { // non-OOO core: only the int/FP DCL entries are reported
+      cout << indent_str << "Int DCL:" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << idcl->power.readOp.dynamic * clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? idcl->power.readOp.longer_channel_leakage
+                            : idcl->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << idcl->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << idcl->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << indent_str << "FP DCL:" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << fdcl->power.readOp.dynamic * clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? fdcl->power.readOp.longer_channel_leakage
+                            : fdcl->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << fdcl->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << fdcl->rt_power.readOp.dynamic / executionTime << " W" << endl;
+    }
+  } else { // !is_tdp: flat listing; every value is read from rt_power
+    if (coredynp.core_ty == OOO) {
+      cout << indent_str_next << "Int Front End RAT    Peak Dynamic = "
+           << iFRAT->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Int Front End RAT    Subthreshold Leakage = "
+           << iFRAT->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "Int Front End RAT    Gate Leakage = "
+           << iFRAT->rt_power.readOp.gate_leakage << " W" << endl;
+      cout << indent_str_next << "FP Front End RAT   Peak Dynamic = "
+           << fFRAT->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "FP Front End RAT   Subthreshold Leakage = "
+           << fFRAT->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "FP Front End RAT   Gate Leakage = "
+           << fFRAT->rt_power.readOp.gate_leakage << " W" << endl;
+      cout << indent_str_next << "Free List   Peak Dynamic = "
+           << ifreeL->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Free List   Subthreshold Leakage = "
+           << ifreeL->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "Free List   Gate Leakage = "
+           << ifreeL->rt_power.readOp.gate_leakage << " W" << endl;
+      if (coredynp.scheu_ty == PhysicalRegFile) { // retire RATs + FP free list
+        cout << indent_str_next << "Int Retire RAT   Peak Dynamic = "
+             << iRRAT->rt_power.readOp.dynamic * clockRate << " W" << endl;
+        cout << indent_str_next << "Int Retire RAT   Subthreshold Leakage = "
+             << iRRAT->rt_power.readOp.leakage << " W" << endl;
+        cout << indent_str_next << "Int Retire RAT   Gate Leakage = "
+             << iRRAT->rt_power.readOp.gate_leakage << " W" << endl;
+        cout << indent_str_next << "FP Retire RAT   Peak Dynamic = "
+             << fRRAT->rt_power.readOp.dynamic * clockRate << " W" << endl;
+        cout << indent_str_next << "FP Retire RAT   Subthreshold Leakage = "
+             << fRRAT->rt_power.readOp.leakage << " W" << endl;
+        cout << indent_str_next << "FP Retire RAT   Gate Leakage = "
+             << fRRAT->rt_power.readOp.gate_leakage << " W" << endl;
+        cout << indent_str_next << "FP Free List   Peak Dynamic = "
+             << ffreeL->rt_power.readOp.dynamic * clockRate << " W" << endl;
+        cout << indent_str_next << "FP Free List   Subthreshold Leakage = "
+             << ffreeL->rt_power.readOp.leakage << " W" << endl;
+        cout << indent_str_next << "FP Free List   Gate Leakage = "
+             << ffreeL->rt_power.readOp.gate_leakage << " W" << endl;
+      }
+    } else { // non-OOO core: DCL entries only
+      cout << indent_str_next << "Int DCL   Peak Dynamic = "
+           << idcl->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Int DCL   Subthreshold Leakage = "
+           << idcl->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next
+           << "Int DCL   Gate Leakage = " << idcl->rt_power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next << "FP DCL   Peak Dynamic = "
+           << fdcl->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "FP DCL   Subthreshold Leakage = "
+           << fdcl->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next
+           << "FP DCL   Gate Leakage = " << fdcl->rt_power.readOp.gate_leakage
+           << " W" << endl;
+    }
+  }
+}
+
+void SchedulerU::computeEnergy(bool is_tdp) {
+  if (!exist) // Computes scheduler power into `power` (TDP) or `rt_power`.
+    return;   // Nothing is computed when `exist` is false.
+  double ROB_duty_cycle; // multiplies the ROB's TDP access counts below
+  //	ROB_duty_cycle = ((coredynp.ALU_duty_cycle +
+  // coredynp.num_muls>0?coredynp.MUL_duty_cycle:0
+  //			+ coredynp.num_fpus>0?coredynp.FPU_duty_cycle:0))*1.1<1
+  //? (coredynp.ALU_duty_cycle + coredynp.num_muls>0?coredynp.MUL_duty_cycle:0
+  //					+
+  // coredynp.num_fpus>0?coredynp.FPU_duty_cycle:0)*1.1:1;
+  ROB_duty_cycle = 1; // constant; the expression above is commented out
+  // init stats: TDP uses width/port-based counts, runtime uses XML counters
+  if (is_tdp) {
+    if (coredynp.core_ty == OOO) {
+      int_inst_window->stats_t.readAc.access =
+          coredynp.issueW *
+          coredynp.num_pipelines; // int_inst_window->l_ip.num_search_ports;
+      int_inst_window->stats_t.writeAc.access =
+          coredynp.issueW *
+          coredynp.num_pipelines; // int_inst_window->l_ip.num_wr_ports;
+      int_inst_window->stats_t.searchAc.access =
+          coredynp.issueW * coredynp.num_pipelines;
+      int_inst_window->tdp_stats = int_inst_window->stats_t;
+      fp_inst_window->stats_t.readAc.access =
+          fp_inst_window->l_ip.num_rd_ports * coredynp.num_fp_pipelines;
+      fp_inst_window->stats_t.writeAc.access =
+          fp_inst_window->l_ip.num_wr_ports * coredynp.num_fp_pipelines;
+      fp_inst_window->stats_t.searchAc.access =
+          fp_inst_window->l_ip.num_search_ports * coredynp.num_fp_pipelines;
+      fp_inst_window->tdp_stats = fp_inst_window->stats_t;
+
+      if (XML->sys.core[ithCore].ROB_size > 0) {
+        ROB->stats_t.readAc.access =
+            coredynp.commitW * coredynp.num_pipelines * ROB_duty_cycle;
+        ROB->stats_t.writeAc.access =
+            coredynp.issueW * coredynp.num_pipelines * ROB_duty_cycle;
+        ROB->tdp_stats = ROB->stats_t;
+
+        /*
+         * When inst commits, ROB must be read.
+         * Because for Physical register based cores, physical register tag in
+         * ROB need to be read out and write into RRAT/CAM based RAT. For RS
+         * based cores, register content that stored in ROB must be read out and
+         * stored in architectural registers.
+         *
+         * if no-register is involved, the ROB read out operation when
+         * instruction commits can be ignored. assuming 20% insts. belong this
+         * type.
+         * TODO: ROB duty_cycle need to be revisited
+         */
+      }
+
+    } else if (coredynp.multithreaded) { // non-OOO: int window only
+      int_inst_window->stats_t.readAc.access =
+          coredynp.issueW *
+          coredynp.num_pipelines; // int_inst_window->l_ip.num_search_ports;
+      int_inst_window->stats_t.writeAc.access =
+          coredynp.issueW *
+          coredynp.num_pipelines; // int_inst_window->l_ip.num_wr_ports;
+      int_inst_window->stats_t.searchAc.access =
+          coredynp.issueW * coredynp.num_pipelines;
+      int_inst_window->tdp_stats = int_inst_window->stats_t;
+    }
+
+  } else { // rtp
+    if (coredynp.core_ty == OOO) {
+      int_inst_window->stats_t.readAc.access =
+          XML->sys.core[ithCore].inst_window_reads;
+      int_inst_window->stats_t.writeAc.access =
+          XML->sys.core[ithCore].inst_window_writes;
+      int_inst_window->stats_t.searchAc.access =
+          XML->sys.core[ithCore].inst_window_wakeup_accesses;
+      int_inst_window->rtp_stats = int_inst_window->stats_t;
+      fp_inst_window->stats_t.readAc.access =
+          XML->sys.core[ithCore].fp_inst_window_reads;
+      fp_inst_window->stats_t.writeAc.access =
+          XML->sys.core[ithCore].fp_inst_window_writes;
+      fp_inst_window->stats_t.searchAc.access =
+          XML->sys.core[ithCore].fp_inst_window_wakeup_accesses;
+      fp_inst_window->rtp_stats = fp_inst_window->stats_t;
+
+      if (XML->sys.core[ithCore].ROB_size > 0) {
+
+        ROB->stats_t.readAc.access = XML->sys.core[ithCore].ROB_reads;
+        ROB->stats_t.writeAc.access = XML->sys.core[ithCore].ROB_writes;
+        /* ROB need to be updated in RS based OOO when new values are produced,
+         * this update may happen before the commit stage when ROB entry is
+         * released
+         * 1. ROB write at instruction inserted in
+         * 2. ROB write as results produced (for RS based OOO only)
+         * 3. ROB read  as instruction committed. For RS based OOO, data values
+         * are read out and sent to ARF For Physical reg based OOO, no data
+         * stored in ROB, but register tags need to be read out and used to set
+         * the RRAT and to recycle the register tag to free list buffer
+         */
+        ROB->rtp_stats = ROB->stats_t;
+      }
+
+    } else if (coredynp.multithreaded) { // counts come from instruction totals
+      int_inst_window->stats_t.readAc.access =
+          XML->sys.core[ithCore].int_instructions +
+          XML->sys.core[ithCore].fp_instructions;
+      int_inst_window->stats_t.writeAc.access =
+          XML->sys.core[ithCore].int_instructions +
+          XML->sys.core[ithCore].fp_instructions;
+      int_inst_window->stats_t.searchAc.access =
+          2 * (XML->sys.core[ithCore].int_instructions +
+               XML->sys.core[ithCore].fp_instructions);
+      int_inst_window->rtp_stats = int_inst_window->stats_t;
+    }
+  }
+
+  // computation engine: power_t.readOp.dynamic = sum(per-op dynamic * accesses)
+  if (coredynp.core_ty == OOO) {
+    int_inst_window->power_t.reset();
+    fp_inst_window->power_t.reset();
+
+    /* Each instruction is written to the scheduler, read out once all
+     * resources and source operands are ready, and searched twice (one search
+     * per source operand). NOTE(review): the int window weights the selection
+     * logic by readAc, the FP and multithreaded paths by writeAc -- confirm.
+     */
+    int_inst_window->power_t.readOp.dynamic +=
+        int_inst_window->local_result.power.readOp.dynamic *
+            int_inst_window->stats_t.readAc.access +
+        int_inst_window->local_result.power.searchOp.dynamic *
+            int_inst_window->stats_t.searchAc.access +
+        int_inst_window->local_result.power.writeOp.dynamic *
+            int_inst_window->stats_t.writeAc.access +
+        int_inst_window->stats_t.readAc.access *
+            instruction_selection->power.readOp.dynamic;
+
+    fp_inst_window->power_t.readOp.dynamic +=
+        fp_inst_window->local_result.power.readOp.dynamic *
+            fp_inst_window->stats_t.readAc.access +
+        fp_inst_window->local_result.power.searchOp.dynamic *
+            fp_inst_window->stats_t.searchAc.access +
+        fp_inst_window->local_result.power.writeOp.dynamic *
+            fp_inst_window->stats_t.writeAc.access +
+        fp_inst_window->stats_t.writeAc.access *
+            instruction_selection->power.readOp.dynamic;
+
+    if (XML->sys.core[ithCore].ROB_size > 0) {
+      ROB->power_t.reset();
+      ROB->power_t.readOp.dynamic +=
+          ROB->local_result.power.readOp.dynamic * ROB->stats_t.readAc.access +
+          ROB->stats_t.writeAc.access * ROB->local_result.power.writeOp.dynamic;
+    }
+
+  } else if (coredynp.multithreaded) { // int window only
+    int_inst_window->power_t.reset();
+    int_inst_window->power_t.readOp.dynamic +=
+        int_inst_window->local_result.power.readOp.dynamic *
+            int_inst_window->stats_t.readAc.access +
+        int_inst_window->local_result.power.searchOp.dynamic *
+            int_inst_window->stats_t.searchAc.access +
+        int_inst_window->local_result.power.writeOp.dynamic *
+            int_inst_window->stats_t.writeAc.access +
+        int_inst_window->stats_t.writeAc.access *
+            instruction_selection->power.readOp.dynamic;
+  }
+
+  // assign values: combine power_t with pppm_lkg-scaled terms, add to total
+  if (is_tdp) {
+    if (coredynp.core_ty == OOO) {
+      int_inst_window->power =
+          int_inst_window->power_t +
+          (int_inst_window->local_result.power + instruction_selection->power) *
+              pppm_lkg;
+      fp_inst_window->power =
+          fp_inst_window->power_t +
+          (fp_inst_window->local_result.power + instruction_selection->power) *
+              pppm_lkg;
+      power = power + int_inst_window->power + fp_inst_window->power;
+      if (XML->sys.core[ithCore].ROB_size > 0) {
+        ROB->power = ROB->power_t + ROB->local_result.power * pppm_lkg;
+        power = power + ROB->power;
+      }
+
+    } else if (coredynp.multithreaded) {
+      //			set_pppm(pppm_t,
+      // XML->sys.core[ithCore].issue_width,1, 1, 1);
+      int_inst_window->power =
+          int_inst_window->power_t +
+          (int_inst_window->local_result.power + instruction_selection->power) *
+              pppm_lkg;
+      power = power + int_inst_window->power;
+    }
+
+  } else { // rtp
+    if (coredynp.core_ty == OOO) {
+      int_inst_window->rt_power =
+          int_inst_window->power_t +
+          (int_inst_window->local_result.power + instruction_selection->power) *
+              pppm_lkg;
+      fp_inst_window->rt_power =
+          fp_inst_window->power_t +
+          (fp_inst_window->local_result.power + instruction_selection->power) *
+              pppm_lkg;
+      rt_power =
+          rt_power + int_inst_window->rt_power + fp_inst_window->rt_power;
+      if (XML->sys.core[ithCore].ROB_size > 0) {
+        ROB->rt_power = ROB->power_t + ROB->local_result.power * pppm_lkg;
+        rt_power = rt_power + ROB->rt_power;
+      }
+
+    } else if (coredynp.multithreaded) {
+      //			set_pppm(pppm_t,
+      // XML->sys.core[ithCore].issue_width,1, 1, 1);
+      int_inst_window->rt_power =
+          int_inst_window->power_t +
+          (int_inst_window->local_result.power + instruction_selection->power) *
+              pppm_lkg;
+      rt_power = rt_power + int_inst_window->rt_power;
+    }
+  }
+  //	set_pppm(pppm_t, XML->sys.core[ithCore].issue_width,1, 1, 1);
+  //	cout<<"Scheduler
+  // power="<<power.readOp.dynamic<<"leakage="<<power.readOp.leakage<<endl;
+  //	cout<<"IW="<<int_inst_window->local_result.power.searchOp.dynamic *
+  // int_inst_window->stats_t.readAc.access +
+  //    + int_inst_window->local_result.power.writeOp.dynamic *
+  //    int_inst_window->stats_t.writeAc.access<<"leakage="<<int_inst_window->local_result.power.readOp.leakage<<endl;
+  //	cout<<"selection"<<instruction_selection->power.readOp.dynamic<<"leakage"<<instruction_selection->power.readOp.leakage<<endl;
+}
+
+void SchedulerU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist) // Prints the scheduler's power report to cout.
+    return;   // No output when `exist` is false; `plevel` is never read here.
+  string indent_str(indent, ' ');          // prefix for section headings
+  string indent_str_next(indent + 2, ' '); // prefix for value lines
+  bool long_channel = XML->sys.longer_channel_device; // selects leakage field
+  // TDP view prints titled sections; otherwise a flat rt_power listing.
+  if (is_tdp) {
+    if (coredynp.core_ty == OOO) {
+      cout << indent_str << "Instruction Window:" << endl;
+      cout << indent_str_next
+           << "Area = " << int_inst_window->area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next << "Peak Dynamic = "
+           << int_inst_window->power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel
+                   ? int_inst_window->power.readOp.longer_channel_leakage
+                   : int_inst_window->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << int_inst_window->power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << int_inst_window->rt_power.readOp.dynamic / executionTime << " W"
+           << endl;
+      cout << endl;
+      cout << indent_str << "FP Instruction Window:" << endl;
+      cout << indent_str_next
+           << "Area = " << fp_inst_window->area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next << "Peak Dynamic = "
+           << fp_inst_window->power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel
+                   ? fp_inst_window->power.readOp.longer_channel_leakage
+                   : fp_inst_window->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << fp_inst_window->power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << fp_inst_window->rt_power.readOp.dynamic / executionTime << " W"
+           << endl;
+      cout << endl;
+      if (XML->sys.core[ithCore].ROB_size > 0) { // ROB shown only if sized
+        cout << indent_str << "ROB:" << endl;
+        cout << indent_str_next << "Area = " << ROB->area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next
+             << "Peak Dynamic = " << ROB->power.readOp.dynamic * clockRate
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? ROB->power.readOp.longer_channel_leakage
+                              : ROB->power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << ROB->power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next << "Runtime Dynamic = "
+             << ROB->rt_power.readOp.dynamic / executionTime << " W" << endl;
+        cout << endl;
+      }
+    } else if (coredynp.multithreaded) { // non-OOO: int window only
+      cout << indent_str << "Instruction Window:" << endl;
+      cout << indent_str_next
+           << "Area = " << int_inst_window->area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next << "Peak Dynamic = "
+           << int_inst_window->power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel
+                   ? int_inst_window->power.readOp.longer_channel_leakage
+                   : int_inst_window->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << int_inst_window->power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << int_inst_window->rt_power.readOp.dynamic / executionTime << " W"
+           << endl;
+      cout << endl;
+    }
+  } else { // !is_tdp: labels say "Peak" but the values come from rt_power
+    if (coredynp.core_ty == OOO) {
+      cout << indent_str_next << "Instruction Window    Peak Dynamic = "
+           << int_inst_window->rt_power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Instruction Window    Subthreshold Leakage = "
+           << int_inst_window->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "Instruction Window    Gate Leakage = "
+           << int_inst_window->rt_power.readOp.gate_leakage << " W" << endl;
+      cout << indent_str_next << "FP Instruction Window   Peak Dynamic = "
+           << fp_inst_window->rt_power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next
+           << "FP Instruction Window   Subthreshold Leakage = "
+           << fp_inst_window->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "FP Instruction Window   Gate Leakage = "
+           << fp_inst_window->rt_power.readOp.gate_leakage << " W" << endl;
+      if (XML->sys.core[ithCore].ROB_size > 0) { // ROB shown only if sized
+        cout << indent_str_next << "ROB   Peak Dynamic = "
+             << ROB->rt_power.readOp.dynamic * clockRate << " W" << endl;
+        cout << indent_str_next
+             << "ROB   Subthreshold Leakage = " << ROB->rt_power.readOp.leakage
+             << " W" << endl;
+        cout << indent_str_next
+             << "ROB   Gate Leakage = " << ROB->rt_power.readOp.gate_leakage
+             << " W" << endl;
+      }
+    } else if (coredynp.multithreaded) { // non-OOO: int window only
+      cout << indent_str_next << "Instruction Window    Peak Dynamic = "
+           << int_inst_window->rt_power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Instruction Window    Subthreshold Leakage = "
+           << int_inst_window->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "Instruction Window    Gate Leakage = "
+           << int_inst_window->rt_power.readOp.gate_leakage << " W" << endl;
+    }
+  }
+}
+
+void LoadStoreU::computeEnergy(bool is_tdp) {
+  if (!exist)
+    return;
+
+  executionTime =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6); // Syed
+
+  // RF crossbar power (Syed)
+  xbar_shared->compute_power();
+
+  if (is_tdp) {
+
+    // init stats for Peak
+    // added by Jingwen
+    sharedmemory.caches->stats_t.readAc.access =
+        0.67 * sharedmemory.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    sharedmemory.caches->stats_t.readAc.miss = 0;
+    sharedmemory.caches->stats_t.readAc.hit =
+        sharedmemory.caches->stats_t.readAc.access -
+        sharedmemory.caches->stats_t.readAc.miss;
+    sharedmemory.caches->stats_t.writeAc.access =
+        0.33 * sharedmemory.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    sharedmemory.caches->stats_t.writeAc.miss = 0;
+    sharedmemory.caches->stats_t.writeAc.hit =
+        sharedmemory.caches->stats_t.writeAc.access -
+        sharedmemory.caches->stats_t.writeAc.miss;
+    sharedmemory.caches->tdp_stats = sharedmemory.caches->stats_t;
+
+    sharedmemory.missb->stats_t.readAc.access =
+        sharedmemory.missb->l_ip.num_search_ports;
+    sharedmemory.missb->stats_t.writeAc.access =
+        sharedmemory.missb->l_ip.num_search_ports;
+    sharedmemory.missb->tdp_stats = sharedmemory.missb->stats_t;
+
+    sharedmemory.ifb->stats_t.readAc.access =
+        sharedmemory.ifb->l_ip.num_search_ports;
+    sharedmemory.ifb->stats_t.writeAc.access =
+        sharedmemory.ifb->l_ip.num_search_ports;
+    sharedmemory.ifb->tdp_stats = sharedmemory.ifb->stats_t;
+
+    sharedmemory.prefetchb->stats_t.readAc.access =
+        sharedmemory.prefetchb->l_ip.num_search_ports;
+    sharedmemory.prefetchb->stats_t.writeAc.access =
+        sharedmemory.ifb->l_ip.num_search_ports;
+    sharedmemory.prefetchb->tdp_stats = sharedmemory.prefetchb->stats_t;
+    if (cache_p == Write_back) {
+      sharedmemory.wbb->stats_t.readAc.access =
+          sharedmemory.wbb->l_ip.num_search_ports;
+      sharedmemory.wbb->stats_t.writeAc.access =
+          sharedmemory.wbb->l_ip.num_search_ports;
+      sharedmemory.wbb->tdp_stats = sharedmemory.wbb->stats_t;
+    }
+
+    // init stats for Peak
+    dcache.caches->stats_t.readAc.access =
+        0.67 * dcache.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    dcache.caches->stats_t.readAc.miss = 0;
+    dcache.caches->stats_t.readAc.hit = dcache.caches->stats_t.readAc.access -
+                                        dcache.caches->stats_t.readAc.miss;
+    dcache.caches->stats_t.writeAc.access =
+        0.33 * dcache.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    dcache.caches->stats_t.writeAc.miss = 0;
+    dcache.caches->stats_t.writeAc.hit = dcache.caches->stats_t.writeAc.access -
+                                         dcache.caches->stats_t.writeAc.miss;
+    dcache.caches->tdp_stats = dcache.caches->stats_t;
+
+    dcache.missb->stats_t.readAc.access = dcache.missb->l_ip.num_search_ports;
+    dcache.missb->stats_t.writeAc.access = dcache.missb->l_ip.num_search_ports;
+    dcache.missb->tdp_stats = dcache.missb->stats_t;
+
+    dcache.ifb->stats_t.readAc.access = dcache.ifb->l_ip.num_search_ports;
+    dcache.ifb->stats_t.writeAc.access = dcache.ifb->l_ip.num_search_ports;
+    dcache.ifb->tdp_stats = dcache.ifb->stats_t;
+
+    dcache.prefetchb->stats_t.readAc.access =
+        dcache.prefetchb->l_ip.num_search_ports;
+    dcache.prefetchb->stats_t.writeAc.access =
+        dcache.ifb->l_ip.num_search_ports;
+    dcache.prefetchb->tdp_stats = dcache.prefetchb->stats_t;
+    if (cache_p == Write_back) {
+      dcache.wbb->stats_t.readAc.access = dcache.wbb->l_ip.num_search_ports;
+      dcache.wbb->stats_t.writeAc.access = dcache.wbb->l_ip.num_search_ports;
+      dcache.wbb->tdp_stats = dcache.wbb->stats_t;
+    }
+
+    // init stats for Peak - ccache
+    ccache.caches->stats_t.readAc.access =
+        0.67 * ccache.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    ccache.caches->stats_t.readAc.miss = 0;
+    ccache.caches->stats_t.readAc.hit = ccache.caches->stats_t.readAc.access -
+                                        ccache.caches->stats_t.readAc.miss;
+    ccache.caches->stats_t.writeAc.access =
+        0.33 * ccache.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    ccache.caches->stats_t.writeAc.miss = 0;
+    ccache.caches->stats_t.writeAc.hit = ccache.caches->stats_t.writeAc.access -
+                                         ccache.caches->stats_t.writeAc.miss;
+    ccache.caches->tdp_stats = ccache.caches->stats_t;
+
+    ccache.missb->stats_t.readAc.access = ccache.missb->l_ip.num_search_ports;
+    ccache.missb->stats_t.writeAc.access = ccache.missb->l_ip.num_search_ports;
+    ccache.missb->tdp_stats = ccache.missb->stats_t;
+
+    ccache.ifb->stats_t.readAc.access = ccache.ifb->l_ip.num_search_ports;
+    ccache.ifb->stats_t.writeAc.access = ccache.ifb->l_ip.num_search_ports;
+    ccache.ifb->tdp_stats = ccache.ifb->stats_t;
+
+    ccache.prefetchb->stats_t.readAc.access =
+        ccache.prefetchb->l_ip.num_search_ports;
+    ccache.prefetchb->stats_t.writeAc.access =
+        ccache.ifb->l_ip.num_search_ports;
+    ccache.prefetchb->tdp_stats = ccache.prefetchb->stats_t;
+    if (cache_p == Write_back) {
+      ccache.wbb->stats_t.readAc.access = ccache.wbb->l_ip.num_search_ports;
+      ccache.wbb->stats_t.writeAc.access = ccache.wbb->l_ip.num_search_ports;
+      ccache.wbb->tdp_stats = ccache.wbb->stats_t;
+    }
+
+    // init stats for Peak - tcache
+    tcache.caches->stats_t.readAc.access =
+        0.67 * tcache.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    tcache.caches->stats_t.readAc.miss = 0;
+    tcache.caches->stats_t.readAc.hit = tcache.caches->stats_t.readAc.access -
+                                        tcache.caches->stats_t.readAc.miss;
+    tcache.caches->stats_t.writeAc.access =
+        0.33 * tcache.caches->l_ip.num_rw_ports * coredynp.LSU_duty_cycle;
+    tcache.caches->stats_t.writeAc.miss = 0;
+    tcache.caches->stats_t.writeAc.hit = tcache.caches->stats_t.writeAc.access -
+                                         tcache.caches->stats_t.writeAc.miss;
+    tcache.caches->tdp_stats = tcache.caches->stats_t;
+
+    tcache.missb->stats_t.readAc.access = tcache.missb->l_ip.num_search_ports;
+    tcache.missb->stats_t.writeAc.access = tcache.missb->l_ip.num_search_ports;
+    tcache.missb->tdp_stats = tcache.missb->stats_t;
+
+    tcache.ifb->stats_t.readAc.access = tcache.ifb->l_ip.num_search_ports;
+    tcache.ifb->stats_t.writeAc.access = tcache.ifb->l_ip.num_search_ports;
+    tcache.ifb->tdp_stats = tcache.ifb->stats_t;
+
+    tcache.prefetchb->stats_t.readAc.access =
+        tcache.prefetchb->l_ip.num_search_ports;
+    tcache.prefetchb->stats_t.writeAc.access =
+        tcache.ifb->l_ip.num_search_ports;
+    tcache.prefetchb->tdp_stats = tcache.prefetchb->stats_t;
+    if (cache_p == Write_back) {
+      tcache.wbb->stats_t.readAc.access = tcache.wbb->l_ip.num_search_ports;
+      tcache.wbb->stats_t.writeAc.access = tcache.wbb->l_ip.num_search_ports;
+      tcache.wbb->tdp_stats = tcache.wbb->stats_t;
+    }
+
+    LSQ->stats_t.readAc.access = LSQ->stats_t.writeAc.access =
+        LSQ->l_ip.num_search_ports * coredynp.LSU_duty_cycle;
+    LSQ->tdp_stats = LSQ->stats_t;
+    if ((coredynp.core_ty == OOO) &&
+        (XML->sys.core[ithCore].load_buffer_size > 0)) {
+      LoadQ->stats_t.readAc.access = LoadQ->stats_t.writeAc.access =
+          LoadQ->l_ip.num_search_ports * coredynp.LSU_duty_cycle;
+      LoadQ->tdp_stats = LoadQ->stats_t;
+    }
+  } else {
+    // init stats for Runtime Dynamic (RTP)
+
+    sharedmemory.caches->stats_t.readAc.access =
+        XML->sys.core[ithCore].sharedmemory.read_accesses;
+    sharedmemory.caches->stats_t.readAc.miss =
+        XML->sys.core[ithCore].sharedmemory.read_misses;
+    sharedmemory.caches->stats_t.readAc.hit =
+        sharedmemory.caches->stats_t.readAc.access -
+        sharedmemory.caches->stats_t.readAc.miss;
+    sharedmemory.caches->stats_t.writeAc.access =
+        XML->sys.core[ithCore].sharedmemory.write_accesses;
+    sharedmemory.caches->stats_t.writeAc.miss =
+        XML->sys.core[ithCore].sharedmemory.write_misses;
+    sharedmemory.caches->stats_t.writeAc.hit =
+        sharedmemory.caches->stats_t.writeAc.access -
+        sharedmemory.caches->stats_t.writeAc.miss;
+    sharedmemory.caches->rtp_stats = sharedmemory.caches->stats_t;
+
+    dcache.caches->stats_t.readAc.access =
+        XML->sys.core[ithCore].dcache.read_accesses;
+    dcache.caches->stats_t.readAc.miss =
+        XML->sys.core[ithCore].dcache.read_misses;
+    dcache.caches->stats_t.readAc.hit = dcache.caches->stats_t.readAc.access -
+                                        dcache.caches->stats_t.readAc.miss;
+    dcache.caches->stats_t.writeAc.access =
+        XML->sys.core[ithCore].dcache.write_accesses;
+    dcache.caches->stats_t.writeAc.miss =
+        XML->sys.core[ithCore].dcache.write_misses;
+    dcache.caches->stats_t.writeAc.hit = dcache.caches->stats_t.writeAc.access -
+                                         dcache.caches->stats_t.writeAc.miss;
+    dcache.caches->rtp_stats = dcache.caches->stats_t;
+
+    ccache.caches->stats_t.readAc.access =
+        XML->sys.core[ithCore].ccache.read_accesses;
+    ccache.caches->stats_t.readAc.miss =
+        XML->sys.core[ithCore].ccache.read_misses;
+    ccache.caches->stats_t.readAc.hit = ccache.caches->stats_t.readAc.access -
+                                        ccache.caches->stats_t.readAc.miss;
+    ccache.caches->stats_t.writeAc.access =
+        XML->sys.core[ithCore].ccache.write_accesses;
+    ccache.caches->stats_t.writeAc.miss =
+        XML->sys.core[ithCore].ccache.write_misses;
+    ccache.caches->stats_t.writeAc.hit = ccache.caches->stats_t.writeAc.access -
+                                         ccache.caches->stats_t.writeAc.miss;
+    ccache.caches->rtp_stats = ccache.caches->stats_t;
+
+    tcache.caches->stats_t.readAc.access =
+        XML->sys.core[ithCore].tcache.read_accesses;
+    tcache.caches->stats_t.readAc.miss =
+        XML->sys.core[ithCore].tcache.read_misses;
+    tcache.caches->stats_t.readAc.hit = tcache.caches->stats_t.readAc.access -
+                                        tcache.caches->stats_t.readAc.miss;
+    tcache.caches->stats_t.writeAc.access =
+        XML->sys.core[ithCore].tcache.write_accesses;
+    tcache.caches->stats_t.writeAc.miss =
+        XML->sys.core[ithCore].tcache.write_misses;
+    tcache.caches->stats_t.writeAc.hit = tcache.caches->stats_t.writeAc.access -
+                                         tcache.caches->stats_t.writeAc.miss;
+    tcache.caches->rtp_stats = tcache.caches->stats_t;
+
+    if (cache_p == Write_back) {
+
+      sharedmemory.missb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.missb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.missb->rtp_stats = sharedmemory.missb->stats_t;
+      sharedmemory.ifb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.ifb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.ifb->rtp_stats = sharedmemory.ifb->stats_t;
+      sharedmemory.prefetchb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.prefetchb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.prefetchb->rtp_stats = sharedmemory.prefetchb->stats_t;
+      sharedmemory.wbb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.wbb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.writeAc.miss;
+      sharedmemory.wbb->rtp_stats = sharedmemory.wbb->stats_t;
+
+      dcache.missb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss;
+      dcache.missb->stats_t.writeAc.access =
+          dcache.caches->stats_t.writeAc.miss;
+      dcache.missb->rtp_stats = dcache.missb->stats_t;
+      dcache.ifb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss;
+      dcache.ifb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss;
+      dcache.ifb->rtp_stats = dcache.ifb->stats_t;
+      dcache.prefetchb->stats_t.readAc.access =
+          dcache.caches->stats_t.writeAc.miss;
+      dcache.prefetchb->stats_t.writeAc.access =
+          dcache.caches->stats_t.writeAc.miss;
+      dcache.prefetchb->rtp_stats = dcache.prefetchb->stats_t;
+      dcache.wbb->stats_t.readAc.access = dcache.caches->stats_t.writeAc.miss;
+      dcache.wbb->stats_t.writeAc.access = dcache.caches->stats_t.writeAc.miss;
+      dcache.wbb->rtp_stats = dcache.wbb->stats_t;
+
+      ccache.missb->stats_t.readAc.access = ccache.caches->stats_t.writeAc.miss;
+      ccache.missb->stats_t.writeAc.access =
+          ccache.caches->stats_t.writeAc.miss;
+      ccache.missb->rtp_stats = ccache.missb->stats_t;
+      ccache.ifb->stats_t.readAc.access = ccache.caches->stats_t.writeAc.miss;
+      ccache.ifb->stats_t.writeAc.access = ccache.caches->stats_t.writeAc.miss;
+      ccache.ifb->rtp_stats = ccache.ifb->stats_t;
+      ccache.prefetchb->stats_t.readAc.access =
+          ccache.caches->stats_t.writeAc.miss;
+      ccache.prefetchb->stats_t.writeAc.access =
+          ccache.caches->stats_t.writeAc.miss;
+      ccache.prefetchb->rtp_stats = ccache.prefetchb->stats_t;
+      ccache.wbb->stats_t.readAc.access = ccache.caches->stats_t.writeAc.miss;
+      ccache.wbb->stats_t.writeAc.access = ccache.caches->stats_t.writeAc.miss;
+      ccache.wbb->rtp_stats = ccache.wbb->stats_t;
+
+      tcache.missb->stats_t.readAc.access = tcache.caches->stats_t.writeAc.miss;
+      tcache.missb->stats_t.writeAc.access =
+          tcache.caches->stats_t.writeAc.miss;
+      tcache.missb->rtp_stats = tcache.missb->stats_t;
+      tcache.ifb->stats_t.readAc.access = tcache.caches->stats_t.writeAc.miss;
+      tcache.ifb->stats_t.writeAc.access = tcache.caches->stats_t.writeAc.miss;
+      tcache.ifb->rtp_stats = tcache.ifb->stats_t;
+      tcache.prefetchb->stats_t.readAc.access =
+          tcache.caches->stats_t.writeAc.miss;
+      tcache.prefetchb->stats_t.writeAc.access =
+          tcache.caches->stats_t.writeAc.miss;
+      tcache.prefetchb->rtp_stats = tcache.prefetchb->stats_t;
+      tcache.wbb->stats_t.readAc.access = tcache.caches->stats_t.writeAc.miss;
+      tcache.wbb->stats_t.writeAc.access = tcache.caches->stats_t.writeAc.miss;
+      tcache.wbb->rtp_stats = tcache.wbb->stats_t;
+    } else {
+      sharedmemory.missb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.readAc.miss;
+      sharedmemory.missb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.readAc.miss;
+      sharedmemory.missb->rtp_stats = sharedmemory.missb->stats_t;
+      sharedmemory.ifb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.readAc.miss;
+      sharedmemory.ifb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.readAc.miss;
+      sharedmemory.ifb->rtp_stats = sharedmemory.ifb->stats_t;
+      sharedmemory.prefetchb->stats_t.readAc.access =
+          sharedmemory.caches->stats_t.readAc.miss;
+      sharedmemory.prefetchb->stats_t.writeAc.access =
+          sharedmemory.caches->stats_t.readAc.miss;
+      sharedmemory.prefetchb->rtp_stats = sharedmemory.prefetchb->stats_t;
+
+      dcache.missb->stats_t.readAc.access = dcache.caches->stats_t.readAc.miss;
+      dcache.missb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss;
+      dcache.missb->rtp_stats = dcache.missb->stats_t;
+      dcache.ifb->stats_t.readAc.access = dcache.caches->stats_t.readAc.miss;
+      dcache.ifb->stats_t.writeAc.access = dcache.caches->stats_t.readAc.miss;
+      dcache.ifb->rtp_stats = dcache.ifb->stats_t;
+      dcache.prefetchb->stats_t.readAc.access =
+          dcache.caches->stats_t.readAc.miss;
+      dcache.prefetchb->stats_t.writeAc.access =
+          dcache.caches->stats_t.readAc.miss;
+      dcache.prefetchb->rtp_stats = dcache.prefetchb->stats_t;
+
+      ccache.missb->stats_t.readAc.access = ccache.caches->stats_t.readAc.miss;
+      ccache.missb->stats_t.writeAc.access = ccache.caches->stats_t.readAc.miss;
+      ccache.missb->rtp_stats = ccache.missb->stats_t;
+      ccache.ifb->stats_t.readAc.access = ccache.caches->stats_t.readAc.miss;
+      ccache.ifb->stats_t.writeAc.access = ccache.caches->stats_t.readAc.miss;
+      ccache.ifb->rtp_stats = ccache.ifb->stats_t;
+      ccache.prefetchb->stats_t.readAc.access =
+          ccache.caches->stats_t.readAc.miss;
+      ccache.prefetchb->stats_t.writeAc.access =
+          ccache.caches->stats_t.readAc.miss;
+      ccache.prefetchb->rtp_stats = ccache.prefetchb->stats_t;
+
+      tcache.missb->stats_t.readAc.access = tcache.caches->stats_t.readAc.miss;
+      tcache.missb->stats_t.writeAc.access = tcache.caches->stats_t.readAc.miss;
+      tcache.missb->rtp_stats = tcache.missb->stats_t;
+      tcache.ifb->stats_t.readAc.access = tcache.caches->stats_t.readAc.miss;
+      tcache.ifb->stats_t.writeAc.access = tcache.caches->stats_t.readAc.miss;
+      tcache.ifb->rtp_stats = tcache.ifb->stats_t;
+      tcache.prefetchb->stats_t.readAc.access =
+          tcache.caches->stats_t.readAc.miss;
+      tcache.prefetchb->stats_t.writeAc.access =
+          tcache.caches->stats_t.readAc.miss;
+      tcache.prefetchb->rtp_stats = tcache.prefetchb->stats_t;
+    }
+
+    LSQ->stats_t.readAc.access = (XML->sys.core[ithCore].load_instructions +
+                                  XML->sys.core[ithCore].store_instructions) *
+                                 2; // flush overhead considered
+    LSQ->stats_t.writeAc.access = (XML->sys.core[ithCore].load_instructions +
+                                   XML->sys.core[ithCore].store_instructions) *
+                                  2;
+    LSQ->rtp_stats = LSQ->stats_t;
+
+    if ((coredynp.core_ty == OOO) &&
+        (XML->sys.core[ithCore].load_buffer_size > 0)) {
+      LoadQ->stats_t.readAc.access = XML->sys.core[ithCore].load_instructions +
+                                     XML->sys.core[ithCore].store_instructions;
+      LoadQ->stats_t.writeAc.access = XML->sys.core[ithCore].load_instructions +
+                                      XML->sys.core[ithCore].store_instructions;
+      LoadQ->rtp_stats = LoadQ->stats_t;
+    }
+  }
+
+  sharedmemory.power_t.reset();
+  dcache.power_t.reset();
+  ccache.power_t.reset();
+  tcache.power_t.reset();
+  LSQ->power_t.reset();
+
+  sharedmemory.power_t.readOp.dynamic +=
+      (sharedmemory.caches->stats_t.readAc.hit *
+           sharedmemory.caches->local_result.power.readOp.dynamic +
+       sharedmemory.caches->stats_t.readAc.miss *
+           sharedmemory.caches->local_result.power.readOp.dynamic +
+       sharedmemory.caches->stats_t.writeAc.miss *
+           sharedmemory.caches->local_result.tag_array2->power.readOp.dynamic +
+       sharedmemory.caches->stats_t.writeAc.access *
+           sharedmemory.caches->local_result.power.writeOp.dynamic +
+       xbar_shared->power.readOp.dynamic *
+           (sharedmemory.caches->stats_t.readAc.hit +
+            sharedmemory.caches->stats_t.writeAc.hit));
+
+  dcache.power_t.readOp.dynamic +=
+      (dcache.caches->stats_t.readAc.hit *
+           dcache.caches->local_result.power.readOp.dynamic +
+       dcache.caches->stats_t.readAc.miss *
+           dcache.caches->local_result.power.readOp.dynamic +
+       dcache.caches->stats_t.writeAc.miss *
+           dcache.caches->local_result.tag_array2->power.readOp.dynamic +
+       dcache.caches->stats_t.writeAc.access *
+           dcache.caches->local_result.power.writeOp.dynamic +
+       xbar_shared->power.readOp.dynamic *
+           (dcache.caches->stats_t.readAc.hit +
+            dcache.caches->stats_t.writeAc.hit));
+  ccache.power_t.readOp.dynamic +=
+      (ccache.caches->stats_t.readAc.hit *
+           ccache.caches->local_result.power.readOp.dynamic +
+       ccache.caches->stats_t.readAc.miss *
+           ccache.caches->local_result.power.readOp.dynamic +
+       ccache.caches->stats_t.writeAc.miss *
+           ccache.caches->local_result.tag_array2->power.readOp.dynamic +
+       ccache.caches->stats_t.writeAc.access *
+           ccache.caches->local_result.power.writeOp.dynamic +
+       xbar_shared->power.readOp.dynamic * (ccache.caches->stats_t.readAc.hit));
+
+  tcache.power_t.readOp.dynamic +=
+      (tcache.caches->stats_t.readAc.hit *
+           tcache.caches->local_result.power.readOp.dynamic +
+       tcache.caches->stats_t.readAc.miss *
+           tcache.caches->local_result.power.readOp.dynamic +
+       tcache.caches->stats_t.writeAc.miss *
+           tcache.caches->local_result.tag_array2->power.readOp.dynamic +
+       tcache.caches->stats_t.writeAc.access *
+           tcache.caches->local_result.power.writeOp.dynamic +
+       xbar_shared->power.readOp.dynamic *
+           (tcache.caches->stats_t.readAc.hit +
+            tcache.caches->stats_t.writeAc.hit));
+
+  if (cache_p == Write_back) { // write miss will generate a write later
+    dcache.power_t.readOp.dynamic +=
+        dcache.caches->stats_t.writeAc.miss *
+        dcache.caches->local_result.power.writeOp.dynamic;
+    ccache.power_t.readOp.dynamic +=
+        ccache.caches->stats_t.writeAc.miss *
+        ccache.caches->local_result.power.writeOp.dynamic;
+    tcache.power_t.readOp.dynamic +=
+        tcache.caches->stats_t.writeAc.miss *
+        tcache.caches->local_result.power.writeOp.dynamic;
+    sharedmemory.power_t.readOp.dynamic +=
+        sharedmemory.caches->stats_t.writeAc.miss *
+        sharedmemory.caches->local_result.power.writeOp.dynamic;
+  }
+
+  sharedmemory.power_t.readOp.dynamic +=
+      sharedmemory.missb->stats_t.readAc.access *
+          sharedmemory.missb->local_result.power.searchOp.dynamic +
+      sharedmemory.missb->stats_t.writeAc.access *
+          sharedmemory.missb->local_result.power.writeOp
+              .dynamic; // each access to missb involves a CAM and a write
+  sharedmemory.power_t.readOp.dynamic +=
+      sharedmemory.ifb->stats_t.readAc.access *
+          sharedmemory.ifb->local_result.power.searchOp.dynamic +
+      sharedmemory.ifb->stats_t.writeAc.access *
+          sharedmemory.ifb->local_result.power.writeOp.dynamic;
+  sharedmemory.power_t.readOp.dynamic +=
+      sharedmemory.prefetchb->stats_t.readAc.access *
+          sharedmemory.prefetchb->local_result.power.searchOp.dynamic +
+      sharedmemory.prefetchb->stats_t.writeAc.access *
+          sharedmemory.prefetchb->local_result.power.writeOp.dynamic;
+  if (cache_p == Write_back) {
+    sharedmemory.power_t.readOp.dynamic +=
+        sharedmemory.wbb->stats_t.readAc.access *
+            sharedmemory.wbb->local_result.power.searchOp.dynamic +
+        sharedmemory.wbb->stats_t.writeAc.access *
+            sharedmemory.wbb->local_result.power.writeOp.dynamic;
+  }
+
+  dcache.power_t.readOp.dynamic +=
+      dcache.missb->stats_t.readAc.access *
+          dcache.missb->local_result.power.searchOp.dynamic +
+      dcache.missb->stats_t.writeAc.access *
+          dcache.missb->local_result.power.writeOp
+              .dynamic; // each access to missb involves a CAM and a write
+  dcache.power_t.readOp.dynamic +=
+      dcache.ifb->stats_t.readAc.access *
+          dcache.ifb->local_result.power.searchOp.dynamic +
+      dcache.ifb->stats_t.writeAc.access *
+          dcache.ifb->local_result.power.writeOp.dynamic;
+  dcache.power_t.readOp.dynamic +=
+      dcache.prefetchb->stats_t.readAc.access *
+          dcache.prefetchb->local_result.power.searchOp.dynamic +
+      dcache.prefetchb->stats_t.writeAc.access *
+          dcache.prefetchb->local_result.power.writeOp.dynamic;
+  if (cache_p == Write_back) {
+    dcache.power_t.readOp.dynamic +=
+        dcache.wbb->stats_t.readAc.access *
+            dcache.wbb->local_result.power.searchOp.dynamic +
+        dcache.wbb->stats_t.writeAc.access *
+            dcache.wbb->local_result.power.writeOp.dynamic;
+  }
+
+  ccache.power_t.readOp.dynamic +=
+      ccache.missb->stats_t.readAc.access *
+          ccache.missb->local_result.power.searchOp.dynamic +
+      ccache.missb->stats_t.writeAc.access *
+          ccache.missb->local_result.power.writeOp
+              .dynamic; // each access to missb involves a CAM and a write
+  ccache.power_t.readOp.dynamic +=
+      ccache.ifb->stats_t.readAc.access *
+          ccache.ifb->local_result.power.searchOp.dynamic +
+      ccache.ifb->stats_t.writeAc.access *
+          ccache.ifb->local_result.power.writeOp.dynamic;
+  ccache.power_t.readOp.dynamic +=
+      ccache.prefetchb->stats_t.readAc.access *
+          ccache.prefetchb->local_result.power.searchOp.dynamic +
+      ccache.prefetchb->stats_t.writeAc.access *
+          ccache.prefetchb->local_result.power.writeOp.dynamic;
+  if (cache_p == Write_back) {
+    ccache.power_t.readOp.dynamic +=
+        ccache.wbb->stats_t.readAc.access *
+            ccache.wbb->local_result.power.searchOp.dynamic +
+        ccache.wbb->stats_t.writeAc.access *
+            ccache.wbb->local_result.power.writeOp.dynamic;
+  }
+
+  tcache.power_t.readOp.dynamic +=
+      tcache.missb->stats_t.readAc.access *
+          tcache.missb->local_result.power.searchOp.dynamic +
+      tcache.missb->stats_t.writeAc.access *
+          tcache.missb->local_result.power.writeOp
+              .dynamic; // each access to missb involves a CAM and a write
+  tcache.power_t.readOp.dynamic +=
+      tcache.ifb->stats_t.readAc.access *
+          tcache.ifb->local_result.power.searchOp.dynamic +
+      tcache.ifb->stats_t.writeAc.access *
+          tcache.ifb->local_result.power.writeOp.dynamic;
+  tcache.power_t.readOp.dynamic +=
+      tcache.prefetchb->stats_t.readAc.access *
+          tcache.prefetchb->local_result.power.searchOp.dynamic +
+      tcache.prefetchb->stats_t.writeAc.access *
+          tcache.prefetchb->local_result.power.writeOp.dynamic;
+  if (cache_p == Write_back) {
+    tcache.power_t.readOp.dynamic +=
+        tcache.wbb->stats_t.readAc.access *
+            tcache.wbb->local_result.power.searchOp.dynamic +
+        tcache.wbb->stats_t.writeAc.access *
+            tcache.wbb->local_result.power.writeOp.dynamic;
+  }
+
+  if ((coredynp.core_ty == OOO) &&
+      (XML->sys.core[ithCore].load_buffer_size > 0)) {
+    LoadQ->power_t.reset();
+    LoadQ->power_t.readOp.dynamic +=
+        LoadQ->stats_t.readAc.access *
+            (LoadQ->local_result.power.searchOp.dynamic +
+             LoadQ->local_result.power.readOp.dynamic) +
+        LoadQ->stats_t.writeAc.access *
+            LoadQ->local_result.power.writeOp
+                .dynamic; // every memory access invloves at least two
+                          // operations on LoadQ
+
+    LSQ->power_t.readOp.dynamic +=
+        LSQ->stats_t.readAc.access * (LSQ->local_result.power.searchOp.dynamic +
+                                      LSQ->local_result.power.readOp.dynamic) +
+        LSQ->stats_t.writeAc.access *
+            LSQ->local_result.power.writeOp
+                .dynamic; // every memory access invloves at least two
+                          // operations on LSQ
+
+  } else {
+    //	LSQ->power_t.readOp.dynamic  +=
+    // LSQ->stats_t.readAc.access*(LSQ->local_result.power.searchOp.dynamic +
+    // LSQ->local_result.power.readOp.dynamic)
+    //	        +
+    // LSQ->stats_t.writeAc.access*LSQ->local_result.power.writeOp.dynamic;
+    // (every memory access involves at least two operations on LSQ)
+    // Disabled: no LSQ in GPUs (Syed)
+  }
+
+  if (is_tdp) {
+    //    	dcache.power = dcache.power_t +
+    //    (dcache.caches->local_result.power)*pppm_lkg +
+    //    			(dcache.missb->local_result.power +
+    //    			dcache.ifb->local_result.power +
+    //    			dcache.prefetchb->local_result.power +
+    //    			dcache.wbb->local_result.power)*pppm_Isub;
+
+    sharedmemory.power =
+        sharedmemory.power_t +
+        (sharedmemory.caches->local_result.power +
+         sharedmemory.missb->local_result.power +
+         sharedmemory.ifb->local_result.power +
+         sharedmemory.prefetchb->local_result.power + xbar_shared->power) *
+            pppm_lkg;
+    if (cache_p == Write_back) {
+      sharedmemory.power =
+          sharedmemory.power + sharedmemory.wbb->local_result.power * pppm_lkg;
+    }
+
+    dcache.power = dcache.power_t + (dcache.caches->local_result.power +
+                                     dcache.missb->local_result.power +
+                                     dcache.ifb->local_result.power +
+                                     dcache.prefetchb->local_result.power) *
+                                        pppm_lkg;
+    if (cache_p == Write_back) {
+      dcache.power = dcache.power + dcache.wbb->local_result.power * pppm_lkg;
+    }
+
+    ccache.power = ccache.power_t + (ccache.caches->local_result.power +
+                                     ccache.missb->local_result.power +
+                                     ccache.ifb->local_result.power +
+                                     ccache.prefetchb->local_result.power) *
+                                        pppm_lkg;
+    if (cache_p == Write_back) {
+      ccache.power = ccache.power + ccache.wbb->local_result.power * pppm_lkg;
+    }
+
+    tcache.power = tcache.power_t + (tcache.caches->local_result.power +
+                                     tcache.missb->local_result.power +
+                                     tcache.ifb->local_result.power +
+                                     tcache.prefetchb->local_result.power) *
+                                        pppm_lkg;
+    if (cache_p == Write_back) {
+      tcache.power = tcache.power + tcache.wbb->local_result.power * pppm_lkg;
+    }
+
+    LSQ->power = LSQ->power_t + LSQ->local_result.power * pppm_lkg;
+    // No LSQ in GPUs (Syed)
+    LSQ->power.reset();
+    power = power + dcache.power + LSQ->power + sharedmemory.power +
+            ccache.power + tcache.power;
+
+    if ((coredynp.core_ty == OOO) &&
+        (XML->sys.core[ithCore].load_buffer_size > 0)) {
+      LoadQ->power = LoadQ->power_t + LoadQ->local_result.power * pppm_lkg;
+      power = power + LoadQ->power;
+    }
+  } else {
+    //    	dcache.rt_power = dcache.power_t +
+    //    (dcache.caches->local_result.power +
+    //    dcache.missb->local_result.power
+    //    + 			dcache.ifb->local_result.power +
+    //    			dcache.prefetchb->local_result.power +
+    //    			dcache.wbb->local_result.power)*pppm_lkg;
+    rt_power.reset();
+    sharedmemory.rt_power.reset(); // Jingwen
+    tcache.rt_power.reset();
+    ccache.rt_power.reset();
+    dcache.rt_power.reset();
+    LSQ->rt_power.reset();
+
+    sharedmemory.rt_power =
+        sharedmemory.power_t + (sharedmemory.caches->local_result.power +
+                                sharedmemory.missb->local_result.power +
+                                sharedmemory.ifb->local_result.power +
+                                sharedmemory.prefetchb->local_result.power) *
+                                   pppm_lkg;
+
+    if (cache_p == Write_back) {
+      sharedmemory.rt_power = sharedmemory.rt_power +
+                              sharedmemory.wbb->local_result.power * pppm_lkg;
+    }
+
+    dcache.rt_power = dcache.power_t + (dcache.caches->local_result.power +
+                                        dcache.missb->local_result.power +
+                                        dcache.ifb->local_result.power +
+                                        dcache.prefetchb->local_result.power) *
+                                           pppm_lkg;
+    if (cache_p == Write_back) {
+      dcache.rt_power =
+          dcache.rt_power + dcache.wbb->local_result.power * pppm_lkg;
+    }
+
+    ccache.rt_power = ccache.power_t + (ccache.caches->local_result.power +
+                                        ccache.missb->local_result.power +
+                                        ccache.ifb->local_result.power +
+                                        ccache.prefetchb->local_result.power) *
+                                           pppm_lkg;
+    if (cache_p == Write_back) {
+      ccache.rt_power =
+          ccache.rt_power + ccache.wbb->local_result.power * pppm_lkg;
+    }
+
+    tcache.rt_power = tcache.power_t + (tcache.caches->local_result.power +
+                                        tcache.missb->local_result.power +
+                                        tcache.ifb->local_result.power +
+                                        tcache.prefetchb->local_result.power) *
+                                           pppm_lkg;
+    if (cache_p == Write_back) {
+      tcache.rt_power =
+          tcache.rt_power + tcache.wbb->local_result.power * pppm_lkg;
+    }
+
+    LSQ->rt_power = LSQ->power_t + LSQ->local_result.power * pppm_lkg;
+    LSQ->rt_power.reset();
+    rt_power = rt_power + dcache.rt_power + LSQ->rt_power +
+               sharedmemory.rt_power + ccache.rt_power + tcache.rt_power;
+
+    if ((coredynp.core_ty == OOO) &&
+        (XML->sys.core[ithCore].load_buffer_size > 0)) {
+      LoadQ->rt_power = LoadQ->power_t + LoadQ->local_result.power * pppm_lkg;
+      rt_power = rt_power + LoadQ->rt_power;
+    }
+  }
+}
+
+// Prints the report for the load-store unit's sub-blocks (shared memory, data,
+// constant and texture caches, and the load/store queues).
+// is_tdp selects the detailed per-block sections; otherwise a compact list of
+// the rt_power fields is printed. plevel is not read by this function.
+void LoadStoreU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist)
+    return;
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device;
+
+  if (is_tdp) {
+    // Every section below reads the fields of the block named in its heading.
+    cout << indent_str << "Shared Memory:" << endl;
+    cout << indent_str_next << "Area = " << sharedmemory.area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << sharedmemory.power.readOp.dynamic * clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? sharedmemory.power.readOp.longer_channel_leakage
+                          : sharedmemory.power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = "
+         << sharedmemory.power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << sharedmemory.rt_power.readOp.dynamic / executionTime << " W"
+         << endl;
+    cout << endl;
+
+    cout << indent_str << "Data Cache:" << endl;
+    cout << indent_str_next << "Area = " << dcache.area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << dcache.power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? dcache.power.readOp.longer_channel_leakage
+                          : dcache.power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = "
+         << dcache.power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << dcache.rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+
+    cout << indent_str << "Constant Cache:" << endl;
+    cout << indent_str_next << "Area = " << ccache.area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << ccache.power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? ccache.power.readOp.longer_channel_leakage
+                          : ccache.power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = "
+         << ccache.power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << ccache.rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << indent_str_next
+         << "Runtime Dynamic Energy = " << ccache.rt_power.readOp.dynamic
+         << " J" << endl;
+    cout << indent_str_next << "Execution Time = " << executionTime << " s"
+         << endl;
+    cout << endl;
+
+    cout << indent_str << "Texture Cache:" << endl;
+    cout << indent_str_next << "Area = " << tcache.area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << tcache.power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? tcache.power.readOp.longer_channel_leakage
+                          : tcache.power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = "
+         << tcache.power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << tcache.rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+
+    if (coredynp.core_ty == Inorder) {
+      cout << indent_str << "Load/Store Queue:" << endl;
+      cout << indent_str_next << "Area = " << LSQ->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << LSQ->power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? LSQ->power.readOp.longer_channel_leakage
+                            : LSQ->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next << "Gate Leakage = "
+           << LSQ->power.readOp.gate_leakage << " W" << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << LSQ->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+    } else {
+      if (XML->sys.core[ithCore].load_buffer_size > 0) {
+        cout << indent_str << "LoadQ:" << endl;
+        cout << indent_str_next << "Area = " << LoadQ->area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next
+             << "Peak Dynamic = " << LoadQ->power.readOp.dynamic * clockRate
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? LoadQ->power.readOp.longer_channel_leakage
+                              : LoadQ->power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << LoadQ->power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next << "Runtime Dynamic = "
+             << LoadQ->rt_power.readOp.dynamic / executionTime << " W" << endl;
+        cout << endl;
+      }
+      cout << indent_str << "StoreQ:" << endl;
+      cout << indent_str_next << "Area = " << LSQ->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << LSQ->power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? LSQ->power.readOp.longer_channel_leakage
+                            : LSQ->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << LSQ->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << LSQ->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+    }
+  } else {
+    // Compact form: rt_power fields only, one line per value.
+    cout << indent_str_next << "Shared Memory    Peak Dynamic = "
+         << sharedmemory.rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Shared Memory    Subthreshold Leakage = "
+         << sharedmemory.rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Shared Memory    Gate Leakage = "
+         << sharedmemory.rt_power.readOp.gate_leakage << " W" << endl;
+
+    cout << indent_str_next << "Data Cache    Peak Dynamic = "
+         << dcache.rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Data Cache    Subthreshold Leakage = "
+         << dcache.rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Data Cache    Gate Leakage = "
+         << dcache.rt_power.readOp.gate_leakage << " W" << endl;
+
+    cout << indent_str_next << "Constant Cache    Peak Dynamic = "
+         << ccache.rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Constant Cache    Subthreshold Leakage = "
+         << ccache.rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Constant Cache    Gate Leakage = "
+         << ccache.rt_power.readOp.gate_leakage << " W" << endl;
+
+    cout << indent_str_next << "Texture Cache    Peak Dynamic = "
+         << tcache.rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Texture Cache    Subthreshold Leakage = "
+         << tcache.rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Texture Cache    Gate Leakage = "
+         << tcache.rt_power.readOp.gate_leakage << " W" << endl;
+
+    if (coredynp.core_ty == Inorder) {
+      cout << indent_str_next << "Load/Store Queue   Peak Dynamic = "
+           << LSQ->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next << "Load/Store Queue   Subthreshold Leakage = "
+           << LSQ->rt_power.readOp.leakage << " W" << endl;
+      cout << indent_str_next << "Load/Store Queue   Gate Leakage = "
+           << LSQ->rt_power.readOp.gate_leakage << " W" << endl;
+    } else {
+      // LoadQ is only touched when a load buffer is configured (as above).
+      if (XML->sys.core[ithCore].load_buffer_size > 0) {
+        cout << indent_str_next << "LoadQ   Peak Dynamic = "
+             << LoadQ->rt_power.readOp.dynamic * clockRate << " W" << endl;
+        cout << indent_str_next << "LoadQ   Subthreshold Leakage = "
+             << LoadQ->rt_power.readOp.leakage << " W" << endl;
+        cout << indent_str_next
+             << "LoadQ   Gate Leakage = " << LoadQ->rt_power.readOp.gate_leakage
+             << " W" << endl;
+      }
+      cout << indent_str_next << "StoreQ   Peak Dynamic = "
+           << LSQ->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << indent_str_next
+           << "StoreQ   Subthreshold Leakage = " << LSQ->rt_power.readOp.leakage
+           << " W" << endl;
+      cout << indent_str_next
+           << "StoreQ   Gate Leakage = " << LSQ->rt_power.readOp.gate_leakage
+           << " W" << endl;
+    }
+  }
+}
+
+// Computes the energy of the instruction and data TLBs of this unit.
+// is_tdp == true : access counts are derived from the search-port counts
+//                  (no misses) and the result is added to power.
+// is_tdp == false: access/miss counts are read from the XML statistics of
+//                  core ithCore and the result is added to rt_power.
+void MemManU::computeEnergy(bool is_tdp) {
+  if (!exist)
+    return;
+
+  // Load the access and miss counts for the requested mode.
+  if (!is_tdp) {
+    itlb->stats_t.readAc.access = XML->sys.core[ithCore].itlb.total_accesses;
+    itlb->stats_t.readAc.miss = XML->sys.core[ithCore].itlb.total_misses;
+    dtlb->stats_t.readAc.access = XML->sys.core[ithCore].dtlb.total_accesses;
+    dtlb->stats_t.readAc.miss = XML->sys.core[ithCore].dtlb.total_misses;
+  } else {
+    itlb->stats_t.readAc.access = itlb->l_ip.num_search_ports;
+    itlb->stats_t.readAc.miss = 0;
+    dtlb->stats_t.readAc.access =
+        dtlb->l_ip.num_search_ports * coredynp.LSU_duty_cycle;
+    dtlb->stats_t.readAc.miss = 0;
+  }
+
+  // Hits are the accesses that did not miss, in either mode.
+  itlb->stats_t.readAc.hit =
+      itlb->stats_t.readAc.access - itlb->stats_t.readAc.miss;
+  dtlb->stats_t.readAc.hit =
+      dtlb->stats_t.readAc.access - dtlb->stats_t.readAc.miss;
+
+  // Dynamic energy: every access pays a search, every miss pays a write.
+  // The search term uses total accesses rather than hits (original note:
+  // a fully-associative structure spends most of its power in the tag).
+  itlb->power_t.reset();
+  dtlb->power_t.reset();
+  itlb->power_t.readOp.dynamic +=
+      itlb->stats_t.readAc.access * itlb->local_result.power.searchOp.dynamic +
+      itlb->stats_t.readAc.miss * itlb->local_result.power.writeOp.dynamic;
+  dtlb->power_t.readOp.dynamic +=
+      dtlb->stats_t.readAc.access * dtlb->local_result.power.searchOp.dynamic +
+      dtlb->stats_t.readAc.miss * dtlb->local_result.power.writeOp.dynamic;
+
+  // Snapshot the stats and combine with local_result.power scaled by pppm_lkg.
+  if (!is_tdp) {
+    itlb->rtp_stats = itlb->stats_t;
+    dtlb->rtp_stats = dtlb->stats_t;
+    itlb->rt_power = itlb->power_t + itlb->local_result.power * pppm_lkg;
+    dtlb->rt_power = dtlb->power_t + dtlb->local_result.power * pppm_lkg;
+    rt_power = rt_power + itlb->rt_power + dtlb->rt_power;
+  } else {
+    itlb->tdp_stats = itlb->stats_t;
+    dtlb->tdp_stats = dtlb->stats_t;
+    itlb->power = itlb->power_t + itlb->local_result.power * pppm_lkg;
+    dtlb->power = dtlb->power_t + dtlb->local_result.power * pppm_lkg;
+    power = power + itlb->power + dtlb->power;
+  }
+}
+
+// Prints the instruction/data TLB report for this unit.
+// With is_tdp set, each TLB gets a section listing area, peak dynamic power,
+// subthreshold and gate leakage, and runtime dynamic power; otherwise a
+// compact list of the rt_power fields is printed. plevel is not read here.
+void MemManU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist)
+    return;
+  const string head_pad(indent, ' ');
+  const string item_pad(indent + 2, ' ');
+  const bool use_long_channel = XML->sys.longer_channel_device;
+
+  if (!is_tdp) {
+    // Compact form: runtime-power fields only, one line per value.
+    cout << item_pad << "Itlb    Peak Dynamic = "
+         << itlb->rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << item_pad << "Itlb    Subthreshold Leakage = "
+         << itlb->rt_power.readOp.leakage << " W" << endl;
+    cout << item_pad << "Itlb    Gate Leakage = "
+         << itlb->rt_power.readOp.gate_leakage << " W" << endl;
+    cout << item_pad << "Dtlb   Peak Dynamic = "
+         << dtlb->rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << item_pad << "Dtlb   Subthreshold Leakage = "
+         << dtlb->rt_power.readOp.leakage << " W" << endl;
+    cout << item_pad << "Dtlb   Gate Leakage = "
+         << dtlb->rt_power.readOp.gate_leakage << " W" << endl;
+    return;
+  }
+
+  // Detailed form: instruction TLB section.
+  cout << head_pad << "Itlb:" << endl;
+  cout << item_pad << "Area = " << itlb->area.get_area() * 1e-6 << " mm^2"
+       << endl;
+  cout << item_pad << "Peak Dynamic = "
+       << itlb->power.readOp.dynamic * clockRate << " W" << endl;
+  cout << item_pad << "Subthreshold Leakage = "
+       << (use_long_channel ? itlb->power.readOp.longer_channel_leakage
+                            : itlb->power.readOp.leakage)
+       << " W" << endl;
+  cout << item_pad << "Gate Leakage = " << itlb->power.readOp.gate_leakage
+       << " W" << endl;
+  cout << item_pad << "Runtime Dynamic = "
+       << itlb->rt_power.readOp.dynamic / executionTime << " W" << endl;
+  cout << endl;
+  // Detailed form: data TLB section.
+  cout << head_pad << "Dtlb:" << endl;
+  cout << item_pad << "Area = " << dtlb->area.get_area() * 1e-6 << " mm^2"
+       << endl;
+  cout << item_pad << "Peak Dynamic = "
+       << dtlb->power.readOp.dynamic * clockRate << " W" << endl;
+  cout << item_pad << "Subthreshold Leakage = "
+       << (use_long_channel ? dtlb->power.readOp.longer_channel_leakage
+                            : dtlb->power.readOp.leakage)
+       << " W" << endl;
+  cout << item_pad << "Gate Leakage = " << dtlb->power.readOp.gate_leakage
+       << " W" << endl;
+  cout << item_pad << "Runtime Dynamic = "
+       << dtlb->rt_power.readOp.dynamic / executionTime << " W" << endl;
+  cout << endl;
+}
+
+void RegFU::computeEnergy(bool is_tdp) {
+  /*
+   * Computes energy for IRF, OPC, the RF crossbar/arbiter and (when enabled)
+   * RFWIN: peak values when is_tdp, runtime values otherwise. Architectural
+   * and physical RF cannot coexist, so the same RF stats serve either one.
+   */
+  if (!exist)
+    return;
+  // Run length in seconds: total cycles over the core clock (scaled by 1e6).
+  executionTime =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6); // Syed
+  // RF crossbar power (Syed Gilani)
+  xbar_rfu->compute_power();
+
+  // Arbiter power
+  arbiter_rfu->compute_power();
+
+  if (is_tdp) {
+    // RF power -- modified by Syed
+    // init stats for Peak
+
+    IRF->stats_t.readAc.access = 4;
+    IRF->stats_t.writeAc.access = 4;
+    IRF->tdp_stats = IRF->stats_t;
+    // NOTE(review): the next three lines overwrite the 4/4 values set above.
+    IRF->stats_t.readAc.access = 2;
+    IRF->stats_t.writeAc.access = 1;
+    IRF->tdp_stats = IRF->stats_t;
+
+    OPC->stats_t.readAc.access = 32;
+    OPC->stats_t.writeAc.access = 32;
+    OPC->tdp_stats = OPC->stats_t;
+
+    // Commented by Syed (GPUs have a single RF which we model by IRF)
+    // FRF->stats_t.readAc.access  =
+    // FRF->l_ip.num_rd_ports*coredynp.FPU_duty_cycle*1.05*coredynp.num_fp_pipelines;
+    // FRF->stats_t.writeAc.access  =
+    // FRF->l_ip.num_wr_ports*coredynp.FPU_duty_cycle*1.05*coredynp.num_fp_pipelines;
+    // FRF->tdp_stats = FRF->stats_t;
+    if (coredynp.regWindowing) {
+      RFWIN->stats_t.readAc.access = 0;  // 0.5*RFWIN->l_ip.num_rw_ports;
+      RFWIN->stats_t.writeAc.access = 0; // 0.5*RFWIN->l_ip.num_rw_ports;
+      RFWIN->tdp_stats = RFWIN->stats_t;
+    }
+  } /* if (is_tdp) */
+  else {
+    // init stats for Runtime Dynamic (RTP)
+    // in Tesla each RF operand accesses 2 banks, so multiply acceses by 2
+    // (read and write energies of reg_file are per bank)(Tesla) : Syed
+    // Also, for a SIMD width of 8 and warp size of 32 threads, 4 accesses
+    // (each accessing 2 banks) need to be performed per operand
+    if (XML->sys.architecture == 1) {
+      IRF->stats_t.readAc.access =
+          (XML->sys.core[ithCore].int_regfile_reads / 32) * (4 * 2); /// 1.5;
+      IRF->stats_t.writeAc.access =
+          (XML->sys.core[ithCore].int_regfile_writes / 32) * (4 * 2); /// 1.5;
+    } else { // NOTE(review): same formula as the architecture == 1 branch
+      IRF->stats_t.readAc.access =
+          (XML->sys.core[ithCore].int_regfile_reads / 32) *
+          (2 * 4); /// 1.5;//TODO: no diff on archi and phy
+      IRF->stats_t.writeAc.access =
+          (XML->sys.core[ithCore].int_regfile_writes / 32) * (2 * 4); /// 1.5;
+    }
+    IRF->rtp_stats = IRF->stats_t;
+
+    OPC->stats_t.readAc.access =
+        (XML->sys.core[ithCore].int_regfile_reads) /*/1.5*/ +
+        XML->sys.core[ithCore]
+            .non_rf_operands; /// 1.5;//TODO: no diff on archi and phy
+    OPC->stats_t.writeAc.access = 0;
+    OPC->rtp_stats = OPC->stats_t;
+
+    // cout<< "IRF read energy: "<<
+    // IRF->local_result.power.readOp.dynamic<<endl; cout<< "IRF write energy:
+    // "<<    IRF->local_result.power.writeOp.dynamic<<endl;
+    // FRF->stats_t.readAc.access  = XML->sys.core[ithCore].float_regfile_reads;
+    // FRF->stats_t.writeAc.access  =
+    // XML->sys.core[ithCore].float_regfile_writes; FRF->rtp_stats =
+    // FRF->stats_t;
+    if (coredynp.regWindowing) {
+      RFWIN->stats_t.readAc.access = XML->sys.core[ithCore].function_calls * 16;
+      RFWIN->stats_t.writeAc.access =
+          XML->sys.core[ithCore].function_calls * 16;
+      RFWIN->rtp_stats = RFWIN->stats_t;
+
+      IRF->stats_t.readAc.access = XML->sys.core[ithCore].int_regfile_reads +
+                                   XML->sys.core[ithCore].function_calls * 16;
+      IRF->stats_t.writeAc.access = XML->sys.core[ithCore].int_regfile_writes +
+                                    XML->sys.core[ithCore].function_calls * 16;
+      IRF->rtp_stats = IRF->stats_t;
+    }
+  }
+  IRF->power_t.reset();
+  FRF->power_t.reset(); // FRF is reset here; its other uses are commented out
+  OPC->power_t.reset();
+  // IRF->power_t  =  IRF->power_t + IRF->local_result.power;// +
+  // xbar_rfu->power + arbiter_rfu->power;
+  // Dynamic term = access counts times the local_result per-op dynamic values.
+  IRF->power_t.readOp.dynamic =
+      (IRF->stats_t.readAc.access * IRF->local_result.power.readOp.dynamic +
+       IRF->stats_t.writeAc.access * IRF->local_result.power.writeOp.dynamic);
+  OPC->power_t.readOp.dynamic =
+      (OPC->stats_t.readAc.access * OPC->local_result.power.readOp.dynamic);
+
+  if (coredynp.regWindowing) {
+    RFWIN->power_t.reset();
+    RFWIN->power_t.readOp.dynamic +=
+        (RFWIN->stats_t.readAc.access *
+             RFWIN->local_result.power.readOp.dynamic +
+         RFWIN->stats_t.writeAc.access *
+             RFWIN->local_result.power.writeOp.dynamic);
+  }
+
+  if (is_tdp) {
+
+    // cout<<"pre: IRF_power_t: "<<IRF->power.readOp.dynamic<<"
+    // ("<<IRF->power.readOp.dynamic*clockRate<<") "<<" IRF_localresult: "<<
+    //                   IRF->local_result.power.readOp.dynamic<<endl;
+    // Syed: removed the multiplication of power by hardware threads
+    // since the one IRF  is shared by all threads in GPUs
+
+    // FRF->power  =  FRF->power_t + FRF->local_result.power
+    // *coredynp.pppm_lkg_multhread;
+    // pppm_lkg_banks is filled from collector_units and applied to OPC only.
+    double pppm_lkg_banks[4];
+    set_pppm(pppm_lkg_banks, 0, XML->sys.core[ithCore].collector_units,
+             XML->sys.core[ithCore].collector_units);
+    IRF->power = (IRF->power_t) + IRF->local_result.power * pppm_lkg;
+    IRF->power.readOp.dynamic = IRF->power_t.readOp.dynamic * 1;
+    OPC->power = (OPC->power_t) + OPC->local_result.power * pppm_lkg_banks;
+    OPC->power.readOp.dynamic = OPC->power_t.readOp.dynamic * 1;
+
+    power = power + (IRF->power + OPC->power);
+
+    if (coredynp.regWindowing) {
+      RFWIN->power = RFWIN->power_t + RFWIN->local_result.power * pppm_lkg;
+      power = power + RFWIN->power;
+    }
+  } /* if (is_tdp) */
+  else {
+    // Removed *coredynp.pppm_lkg_multhread since all hardware threads shared
+    // the same IRF
+    IRF->rt_power =
+        IRF->power_t +
+        IRF->local_result.power * pppm_lkg; /* *coredynp.pppm_lkg_multhread;*/
+    OPC->rt_power = OPC->power_t + OPC->local_result.power * pppm_lkg;
+    if (XML->sys.architecture == 1) {
+      // Each warp operand accesses the crossbar
+      xbar_rfu->rt_power.readOp.dynamic =
+          ((XML->sys.core[ithCore].int_regfile_reads / (32 /**1.5*/)) +
+           (XML->sys.core[ithCore].non_rf_operands / (32 /**1.5*/))) *
+          xbar_rfu->power.readOp.dynamic;
+    } else { // NOTE(review): same formula as the architecture == 1 branch
+      xbar_rfu->rt_power.readOp.dynamic =
+          ((XML->sys.core[ithCore].int_regfile_reads / (32 /**1.5*/)) +
+           (XML->sys.core[ithCore].non_rf_operands / (32 /**1.5*/))) *
+          xbar_rfu->power.readOp.dynamic;
+    }
+    arbiter_rfu->rt_power.readOp.dynamic =
+        ((XML->sys.core[ithCore].int_regfile_reads / (32 /**1.5*/)) +
+         (XML->sys.core[ithCore].non_rf_operands / (32 /**1.5*/))) *
+        arbiter_rfu->power.readOp.dynamic;
+    // NOTE(review): the sum below uses IRF/OPC power_t, not their rt_power.
+    rt_power =
+        rt_power + (IRF->power_t /*+ FRF->power_t*/ + xbar_rfu->rt_power +
+                    arbiter_rfu->rt_power + OPC->power_t);
+    if (coredynp.regWindowing) {
+      RFWIN->rt_power = RFWIN->power_t + RFWIN->local_result.power * pppm_lkg;
+      rt_power = rt_power + RFWIN->rt_power;
+    }
+  }
+}
+
+// Prints the register-file report for this unit; plevel is not read here.
+// With is_tdp set, one section each is printed for the register file banks
+// (IRF), the RF crossbar, the RF arbiter and, when coredynp.regWindowing is
+// set, the register windows (RFWIN). Otherwise a compact list of rt_power
+// fields is printed for IRF, FRF and, optionally, RFWIN.
+void RegFU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist)
+    return;
+  const string head_pad(indent, ' ');
+  const string item_pad(indent + 2, ' ');
+  const bool use_long_channel = XML->sys.longer_channel_device;
+
+  if (!is_tdp) {
+    // Compact form: runtime-power fields only, one line per value.
+    cout << item_pad << "Integer RF    Peak Dynamic = "
+         << IRF->rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << item_pad << "Integer RF    Subthreshold Leakage = "
+         << IRF->rt_power.readOp.leakage << " W" << endl;
+    cout << item_pad << "Integer RF    Gate Leakage = "
+         << IRF->rt_power.readOp.gate_leakage << " W" << endl;
+    cout << item_pad << "Floating Point RF   Peak Dynamic = "
+         << FRF->rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << item_pad << "Floating Point RF   Subthreshold Leakage = "
+         << FRF->rt_power.readOp.leakage << " W" << endl;
+    cout << item_pad << "Floating Point RF   Gate Leakage = "
+         << FRF->rt_power.readOp.gate_leakage << " W" << endl;
+    if (coredynp.regWindowing) {
+      cout << item_pad << "Register Windows   Peak Dynamic = "
+           << RFWIN->rt_power.readOp.dynamic * clockRate << " W" << endl;
+      cout << item_pad << "Register Windows   Subthreshold Leakage = "
+           << RFWIN->rt_power.readOp.leakage << " W" << endl;
+      cout << item_pad << "Register Windows   Gate Leakage = "
+           << RFWIN->rt_power.readOp.gate_leakage << " W" << endl;
+    }
+    return;
+  }
+
+  // Detailed form, section 1: the register file banks (IRF).
+  cout << head_pad << "Register file banks: " << endl;
+  cout << item_pad << "Area = " << IRF->area.get_area() * 1e-6 << " mm^2"
+       << endl;
+  cout << item_pad << "Peak Dynamic = "
+       << IRF->power.readOp.dynamic * clockRate << " W" << endl;
+  cout << item_pad << "Subthreshold Leakage = "
+       << (use_long_channel ? IRF->power.readOp.longer_channel_leakage
+                            : IRF->power.readOp.leakage)
+       << " W" << endl;
+  cout << item_pad << "Gate Leakage = " << IRF->power.readOp.gate_leakage
+       << " W" << endl;
+  cout << item_pad << "Runtime Dynamic = "
+       << IRF->rt_power.readOp.dynamic / executionTime << " W" << endl;
+  cout << endl;
+  // Detailed form, section 2: the register-file crossbar.
+  cout << head_pad << "Crossbar (Integer RF):" << endl;
+  cout << item_pad << "Area = " << xbar_rfu->area.get_area() * 1e-6 << " mm^2"
+       << endl;
+  cout << item_pad << "Peak Dynamic = "
+       << xbar_rfu->power.readOp.dynamic * clockRate << " W" << endl;
+  cout << item_pad << "Subthreshold Leakage = "
+       << (use_long_channel ? xbar_rfu->power.readOp.longer_channel_leakage
+                            : xbar_rfu->power.readOp.leakage)
+       << " W" << endl;
+  cout << item_pad << "Gate Leakage = " << xbar_rfu->power.readOp.gate_leakage
+       << " W" << endl;
+  cout << item_pad << "Runtime Dynamic = "
+       << xbar_rfu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+  cout << endl;
+  // Detailed form, section 3: the register-file arbiter.
+  cout << head_pad << "Arbiter (Integer RF):" << endl;
+  cout << item_pad << "Area = " << arbiter_rfu->area.get_area() * 1e-6
+       << " mm^2" << endl;
+  cout << item_pad << "Peak Dynamic = "
+       << arbiter_rfu->power.readOp.dynamic * clockRate << " W" << endl;
+  cout << item_pad << "Subthreshold Leakage = "
+       << (use_long_channel ? arbiter_rfu->power.readOp.longer_channel_leakage
+                            : arbiter_rfu->power.readOp.leakage)
+       << " W" << endl;
+  cout << item_pad << "Gate Leakage = "
+       << arbiter_rfu->power.readOp.gate_leakage << " W" << endl;
+  cout << item_pad << "Runtime Dynamic = "
+       << arbiter_rfu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+  cout << endl;
+
+  // Floating-point RF section, disabled in the original source:
+  // cout << head_pad << "Floating Point RF:" << endl;
+  // cout << item_pad << "Area = " << FRF->area.get_area() * 1e-6 << " mm^2"
+  //      << endl;
+  // cout << item_pad << "Peak Dynamic = "
+  //      << FRF->power.readOp.dynamic * clockRate << " W" << endl;
+  // cout << item_pad << "Subthreshold Leakage = "
+  //      << (use_long_channel ? FRF->power.readOp.longer_channel_leakage
+  //                           : FRF->power.readOp.leakage)
+  //      << " W" << endl;
+  // cout << item_pad << "Gate Leakage = " << FRF->power.readOp.gate_leakage
+  //      << " W" << endl;
+  // cout << item_pad << "Runtime Dynamic = "
+  //      << FRF->rt_power.readOp.dynamic / executionTime << " W" << endl;
+  // cout << endl;
+
+  // Detailed form, optional section: the register windows (RFWIN).
+  if (coredynp.regWindowing) {
+    cout << head_pad << "Register Windows:" << endl;
+    cout << item_pad << "Area = " << RFWIN->area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << item_pad << "Peak Dynamic = "
+         << RFWIN->power.readOp.dynamic * clockRate << " W" << endl;
+    cout << item_pad << "Subthreshold Leakage = "
+         << (use_long_channel ? RFWIN->power.readOp.longer_channel_leakage
+                              : RFWIN->power.readOp.leakage)
+         << " W" << endl;
+    cout << item_pad << "Gate Leakage = " << RFWIN->power.readOp.gate_leakage
+         << " W" << endl;
+    cout << item_pad << "Runtime Dynamic = "
+         << RFWIN->rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+  }
+}
+
+void EXECU::computeEnergy(bool is_tdp) {
+  if (!exist)
+    return;
+  // Sums rfu, scheu, exeu and (if present) fp_u / mul energy for this mode.
+  double pppm_t[4] = {1, 1, 1, 1};
+  double pppm_freqScaling[4] = {rf_fu_clockRate / clockRate, 1, 1, 1};
+  executionTime =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6); // Syed
+  // Clear the sub-units' runtime power before recomputing it below.
+  //	rfu->power.reset();
+  rfu->rt_power.reset();
+  //	scheu->power.reset();
+  scheu->rt_power.reset();
+  //	exeu->power.reset();
+  exeu->rt_power.reset();
+  // Each sub-unit computes its own energy for the requested mode.
+  rfu->computeEnergy(is_tdp);
+  scheu->computeEnergy(is_tdp);
+  exeu->computeEnergy(is_tdp);
+  if (coredynp.num_fpus > 0) {
+    fp_u->rt_power.reset();
+    fp_u->computeEnergy(is_tdp);
+  }
+  if (coredynp.num_muls > 0) {
+    mul->rt_power.reset();
+    mul->computeEnergy(is_tdp);
+  }
+  bypass.rt_power.reset();
+  // NOTE(review): pppm_t is passed to set_pppm below but never read afterwards.
+  if (is_tdp) {
+    set_pppm(
+        pppm_t, 2 * coredynp.ALU_cdb_duty_cycle, 2, 2,
+        2 * coredynp
+                .ALU_cdb_duty_cycle); // 2 means two source operands needs to be
+                                      // passed for each int instruction.
+    // bypass.power = bypass.power + intTagBypass->power*pppm_t +
+    // int_bypass->power*pppm_t;
+    if (coredynp.num_muls > 0) {
+      set_pppm(
+          pppm_t, 2 * coredynp.MUL_cdb_duty_cycle, 2, 2,
+          2 * coredynp
+                  .MUL_cdb_duty_cycle); // 2 means two source operands needs to
+                                        // be passed for each int instruction.
+      // No conventional bypassing in GPU (Syed)
+      // bypass.power = bypass.power + intTag_mul_Bypass->power*pppm_t +
+      // int_mul_bypass->power*pppm_t;
+      power = power + mul->power * pppm_freqScaling;
+    }
+
+    if (coredynp.num_fpus > 0) {
+      set_pppm(
+          pppm_t, 3 * coredynp.FPU_cdb_duty_cycle, 3, 3,
+          3 * coredynp
+                  .FPU_cdb_duty_cycle); // 3 means three source operands needs
+                                        // to be passed for each fp instruction.
+      // No conventional bypassing in GPU (Syed)
+      // bypass.power = bypass.power + fp_bypass->power*pppm_t  +
+      // fpTagBypass->power*pppm_t ;
+      power = power + fp_u->power * pppm_freqScaling;
+    }
+    // No conventional bypassing in GPU (Syed)
+    // Peak total: every term except scheu is scaled by pppm_freqScaling.
+    power = power + rfu->power * pppm_freqScaling +
+            exeu->power * pppm_freqScaling /*+ bypass.power*/ + scheu->power;
+
+  } else {
+    set_pppm(pppm_t, XML->sys.core[ithCore].cdb_alu_accesses, 2, 2,
+             XML->sys.core[ithCore].cdb_alu_accesses);
+    // bypass.rt_power = bypass.rt_power + intTagBypass->power*pppm_t;
+    // bypass.rt_power = bypass.rt_power + int_bypass->power*pppm_t;
+    // Runtime total: only rfu and exeu are scaled; mul, fp_u and scheu are not.
+    if (coredynp.num_muls > 0) {
+      set_pppm(pppm_t, XML->sys.core[ithCore].cdb_mul_accesses, 2, 2,
+               XML->sys.core[ithCore]
+                   .cdb_mul_accesses); // 2 means two source operands needs to
+                                       // be passed for each int instruction.
+      // bypass.rt_power = bypass.rt_power + intTag_mul_Bypass->power*pppm_t +
+      // int_mul_bypass->power*pppm_t;
+      rt_power = rt_power + mul->rt_power;
+    }
+
+    if (coredynp.num_fpus > 0) {
+      set_pppm(pppm_t, XML->sys.core[ithCore].cdb_fpu_accesses, 3, 3,
+               XML->sys.core[ithCore].cdb_fpu_accesses);
+      // bypass.rt_power = bypass.rt_power + fp_bypass->power*pppm_t;
+      // bypass.rt_power = bypass.rt_power + fpTagBypass->power*pppm_t;
+      rt_power = rt_power + fp_u->rt_power;
+    }
+    // No conventional bypassing in GPU (Syed)
+    rt_power = rt_power + rfu->rt_power * pppm_freqScaling +
+               exeu->rt_power * pppm_freqScaling +
+               /*bypass.rt_power +*/ scheu->rt_power;
+  }
+}
+
+// Prints the execution-unit report: register files, instruction scheduler,
+// the functional units and the results broadcast bus.
+// is_tdp selects the detailed sections; otherwise a compact list of rt_power
+// fields is printed. In detailed mode plevel > 3 adds rfu/scheu breakdowns.
+void EXECU::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  if (!exist)
+    return;
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device;
+
+  if (is_tdp) {
+    cout << indent_str << "Register Files:" << endl;
+    cout << indent_str_next << "Area = " << rfu->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << rfu->power.readOp.dynamic * rf_fu_clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? rfu->power.readOp.longer_channel_leakage
+                          : rfu->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << rfu->power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next
+         << "Runtime Dynamic = " << rfu->rt_power.readOp.dynamic / executionTime
+         << " W" << endl;
+    cout << endl;
+    if (plevel > 3) {
+      // Pass plevel in its own slot; is_tdp used to land in the plevel slot.
+      rfu->displayEnergy(indent + 4, plevel, is_tdp);
+    }
+    cout << indent_str << "Instruction Scheduler:" << endl;
+    cout << indent_str_next << "Area = " << scheu->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << scheu->power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? scheu->power.readOp.longer_channel_leakage
+                          : scheu->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = "
+         << scheu->power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << scheu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+    if (plevel > 3) {
+      // NOTE(review): confirm this overload; it may need plevel as 2nd arg too.
+      scheu->displayEnergy(indent + 4, is_tdp);
+    }
+    exeu->displayEnergy(indent, is_tdp);
+    if (coredynp.num_fpus > 0) {
+      fp_u->displayEnergy(indent, is_tdp);
+    }
+    if (coredynp.num_muls > 0) {
+      mul->displayEnergy(indent, is_tdp);
+    }
+    cout << indent_str << "Results Broadcast Bus:" << endl;
+    cout << indent_str_next << "Area Overhead = "
+         << bypass.area.get_area() * 1e-6 << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << bypass.power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? bypass.power.readOp.longer_channel_leakage
+                          : bypass.power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = "
+         << bypass.power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << bypass.rt_power.readOp.dynamic / executionTime << " W" << endl;
+    cout << endl;
+  } else {
+    cout << indent_str_next << "Register Files    Peak Dynamic = "
+         << rfu->rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Register Files    Subthreshold Leakage = "
+         << rfu->rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Register Files    Gate Leakage = "
+         << rfu->rt_power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Instruction Sheduler   Peak Dynamic = "
+         << scheu->rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Instruction Sheduler   Subthreshold Leakage = "
+         << scheu->rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Instruction Sheduler   Gate Leakage = "
+         << scheu->rt_power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Results Broadcast Bus   Peak Dynamic = "
+         << bypass.rt_power.readOp.dynamic * clockRate << " W" << endl;
+    cout << indent_str_next << "Results Broadcast Bus   Subthreshold Leakage = "
+         << bypass.rt_power.readOp.leakage << " W" << endl;
+    cout << indent_str_next << "Results Broadcast Bus   Gate Leakage = "
+         << bypass.rt_power.readOp.gate_leakage << " W" << endl;
+  }
+}
+
+// Jingwen: recomputes this core's runtime power (rt_power) from the current XML statistics.
+void Core::compute() {
+  // power_point_product_masks: per-field factors applied to corepipe->power below
+  double pppm_t[4] = {1, 1, 1, 1};
+  double rtp_pipeline_coe;
+  double num_units = 4.0; // the pipeline power is shared out in num_units equal parts
+  Pipeline_energy = 0;
+
+  // Set pipeline duty cycle for this interval
+  coredynp.pipeline_duty_cycle = XML->sys.core[ithCore].pipeline_duty_cycle;
+  rt_power.reset();
+  ifu->rt_power.reset();
+  lsu->rt_power.reset();
+  mmu->rt_power.reset();
+  exu->rt_power.reset();
+  // Re-run each unit's runtime (is_tdp == false) computation.
+  ifu->computeEnergy(false);
+  lsu->computeEnergy(false);
+  mmu->computeEnergy(false);
+  exu->computeEnergy(false);
+
+  if (XML->sys.homogeneous_cores == 1) {
+    rtp_pipeline_coe = coredynp.pipeline_duty_cycle * XML->sys.total_cycles *
+                       XML->sys.number_of_cores;
+  } else {
+    // Non-homogeneous configuration: use this core's own cycle count.
+    rtp_pipeline_coe = coredynp.pipeline_duty_cycle * coredynp.total_cycles;
+    // Jingwen: abort when the per-core and system-wide cycle counts disagree
+    if (coredynp.total_cycles != XML->sys.total_cycles) {
+      cout << "total cycle not match!" << endl;
+      exit(1);
+    }
+  }
+
+  set_pppm(pppm_t, coredynp.num_pipelines * rtp_pipeline_coe / num_units,
+           coredynp.num_pipelines / num_units,
+           coredynp.num_pipelines / num_units,
+           coredynp.num_pipelines / num_units);
+  // Each existing unit gets the masked pipeline power added to its rt_power and to the core total.
+  if (ifu->exist) {
+    Pipeline_energy += corepipe->power.readOp.dynamic *
+                       (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+    ifu->rt_power = ifu->rt_power + corepipe->power * pppm_t;
+    rt_power = rt_power + ifu->rt_power;
+  }
+
+  if (lsu->exist) {
+    Pipeline_energy += corepipe->power.readOp.dynamic *
+                       (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+    lsu->rt_power = lsu->rt_power + corepipe->power * pppm_t;
+    rt_power = rt_power + lsu->rt_power;
+  }
+  if (exu->exist) {
+    Pipeline_energy += corepipe->power.readOp.dynamic *
+                       (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+    exu->rt_power = exu->rt_power + corepipe->power * pppm_t;
+    rt_power = rt_power + exu->rt_power;
+  }
+  if (mmu->exist) {
+    Pipeline_energy += corepipe->power.readOp.dynamic *
+                       (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+    mmu->rt_power = mmu->rt_power + corepipe->power * pppm_t;
+    rt_power = rt_power + mmu->rt_power;
+  }
+
+  rt_power = rt_power + undiffCore->power;
+
+  if (XML->sys.Private_L2) {
+    // The private L2 contributes its own runtime power.
+    l2cache->computeEnergy(false);
+    rt_power = rt_power + l2cache->rt_power;
+  }
+  // Idle-core contribution: idle core count * idle_core_power * executionTime.
+  IdleCoreEnergy =
+      XML->sys.num_idle_cores * XML->sys.idle_core_power * executionTime;
+
+  rt_power.readOp.dynamic += IdleCoreEnergy;
+}
+
+void Core::computeEnergy(bool is_tdp) {
+  // is_tdp == true accumulates into 'power'; is_tdp == false resets and accumulates 'rt_power'.
+  double pppm_t[4] = {1, 1, 1, 1}; // power_point_product_masks
+  double rtp_pipeline_coe;
+  double num_units = 4.0; // raised to 5.0 below when the core type is OOO
+  Pipeline_energy = 0;
+
+  if (XML->sys.homogeneous_cores == 1) {
+    rtp_pipeline_coe = coredynp.pipeline_duty_cycle * XML->sys.total_cycles *
+                       XML->sys.number_of_cores;
+  } else {
+    rtp_pipeline_coe = coredynp.pipeline_duty_cycle * coredynp.total_cycles;
+  }
+
+  if (is_tdp) {
+    ifu->computeEnergy(is_tdp);
+    lsu->computeEnergy(is_tdp);
+    mmu->computeEnergy(is_tdp);
+    exu->computeEnergy(is_tdp);
+
+    if (coredynp.core_ty == OOO) {
+      num_units = 5.0;
+      rnu->computeEnergy(is_tdp);
+      set_pppm(pppm_t, coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units);
+      if (rnu->exist) {
+        rnu->power = rnu->power + corepipe->power * pppm_t;
+        power = power + rnu->power;
+      }
+    }
+
+    if (ifu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      set_pppm(pppm_t,
+               coredynp.num_pipelines / num_units * coredynp.IFU_duty_cycle,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units);
+      //			cout << "IFU = " <<
+      // ifu->power.readOp.dynamic*clockRate  << " W" << endl;
+      ifu->power = ifu->power + corepipe->power * pppm_t;
+      //			cout << "IFU = " <<
+      // ifu->power.readOp.dynamic*clockRate  << " W" << endl;
+      // cout << "1/4 pipe = " <<
+      // corepipe->power.readOp.dynamic*clockRate/num_units  << " W" << endl;
+      power = power + ifu->power;
+      //			cout << "core = " <<
+      // power.readOp.dynamic*clockRate  << " W" << endl;
+    }
+    if (lsu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      set_pppm(pppm_t,
+               coredynp.num_pipelines / num_units * coredynp.LSU_duty_cycle,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units);
+      lsu->power = lsu->power + corepipe->power * pppm_t;
+      //			cout << "LSU = " <<
+      // lsu->power.readOp.dynamic*clockRate  << " W" << endl;
+      power = power + lsu->power;
+      //			cout << "core = " <<
+      // power.readOp.dynamic*clockRate  << " W" << endl;
+    }
+    if (exu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      set_pppm(pppm_t,
+               coredynp.num_pipelines / num_units * coredynp.ALU_duty_cycle,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units);
+      // cout<<"ExPowerScalingFactor:"<<coredynp.num_pipelines/num_units*coredynp.ALU_duty_cycle<<endl;
+      exu->power = exu->power + corepipe->power * pppm_t;
+      //			cout << "EXE = " <<
+      // exu->power.readOp.dynamic*clockRate  << " W" << endl;
+      power = power + exu->power;
+      //			cout << "core = " <<
+      // power.readOp.dynamic*clockRate  << " W" << endl;
+    }
+    if (mmu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      set_pppm(pppm_t,
+               coredynp.num_pipelines / num_units *
+                   (0.5 + 0.5 * coredynp.LSU_duty_cycle),
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units);
+      mmu->power = mmu->power + corepipe->power * pppm_t;
+      //			cout << "MMU = " <<
+      // mmu->power.readOp.dynamic*clockRate  << " W" << endl;
+      power = power + mmu->power;
+      //			cout << "core = " <<
+      // power.readOp.dynamic*clockRate  << " W" << endl;
+    }
+
+    power = power + undiffCore->power;
+
+    if (XML->sys.Private_L2) {
+      // The first mask entry rescales the L2 figures by the L2-to-core clock ratio.
+      l2cache->computeEnergy(is_tdp);
+      set_pppm(pppm_t, l2cache->cachep.clockRate / clockRate, 1, 1, 1);
+      // l2cache->power = l2cache->power*pppm_t;
+      power = power + l2cache->power * pppm_t;
+    }
+
+  } else {
+    rt_power.reset();
+
+    ifu->computeEnergy(is_tdp);
+    lsu->computeEnergy(is_tdp);
+    mmu->computeEnergy(is_tdp);
+    exu->computeEnergy(is_tdp);
+    if (coredynp.core_ty == OOO) {
+      num_units = 5.0;
+      rnu->computeEnergy(is_tdp);
+      set_pppm(pppm_t, coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units); // NOTE(review): no rtp_pipeline_coe here, unlike the else branch — confirm intended
+      if (rnu->exist) {
+        rnu->rt_power = rnu->rt_power + corepipe->power * pppm_t;
+
+        rt_power = rt_power + rnu->rt_power;
+      }
+    } else {
+      // Non-OOO core: rtp_pipeline_coe is folded into the first mask entry.
+      set_pppm(pppm_t, coredynp.num_pipelines * rtp_pipeline_coe / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units,
+               coredynp.num_pipelines / num_units);
+    }
+
+    if (ifu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      ifu->rt_power = ifu->rt_power + corepipe->power * pppm_t;
+      rt_power = rt_power + ifu->rt_power;
+    }
+    if (lsu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      lsu->rt_power = lsu->rt_power + corepipe->power * pppm_t;
+      rt_power = rt_power + lsu->rt_power;
+    }
+    if (exu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      exu->rt_power = exu->rt_power + corepipe->power * pppm_t;
+      rt_power = rt_power + exu->rt_power;
+    }
+    if (mmu->exist) {
+      Pipeline_energy +=
+          corepipe->power.readOp.dynamic *
+          (coredynp.num_pipelines * rtp_pipeline_coe / num_units);
+      mmu->rt_power = mmu->rt_power + corepipe->power * pppm_t;
+      rt_power = rt_power + mmu->rt_power;
+    }
+
+    rt_power = rt_power + undiffCore->power;
+    //		cout << "EXE = " << exu->power.readOp.dynamic*clockRate  << " W"
+    //<< endl;
+    if (XML->sys.Private_L2) {
+      l2cache->computeEnergy(is_tdp);
+      // set_pppm(pppm_t,1/l2cache->cachep.executionTime, 1,1,1);
+      // l2cache->rt_power = l2cache->rt_power*pppm_t;
+      rt_power = rt_power + l2cache->rt_power;
+    }
+  }
+}
+
+void Core::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  string indent_str(indent, ' ');          // prefix for headings and core-level figures
+  string indent_str_next(indent + 2, ' '); // prefix for per-unit figures
+  bool long_channel = XML->sys.longer_channel_device; // picks which leakage field is printed
+  if (is_tdp) { // full report to cout; the else branch below is entirely commented out
+    cout << "Core:" << endl;
+    cout << indent_str << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic * clockRate
+         << " W" << endl;
+    cout << indent_str << "Subthreshold Leakage = "
+         << (long_channel ? power.readOp.longer_channel_leakage
+                          : power.readOp.leakage)
+         << " W" << endl;
+    // cout << indent_str << "Subthreshold Leakage = " <<
+    // power.readOp.longer_channel_leakage <<" W" << endl;
+    cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str
+         << "Runtime Dynamic = " << rt_power.readOp.dynamic / executionTime
+         << " W" << endl;
+    cout << endl;
+    if (ifu->exist) {
+      cout << indent_str << "Instruction Fetch Unit:" << endl;
+      cout << indent_str_next << "Area = " << ifu->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << ifu->power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? ifu->power.readOp.longer_channel_leakage
+                            : ifu->power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // ifu->power.readOp.longer_channel_leakage <<" W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << ifu->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << ifu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+      if (plevel > 2) { // above level 2 the unit prints its own breakdown
+        ifu->displayEnergy(indent + 4, plevel, is_tdp);
+      }
+    }
+    if (coredynp.core_ty == OOO) {
+      if (rnu->exist) {
+        cout << indent_str << "Renaming Unit:" << endl;
+        cout << indent_str_next << "Area = " << rnu->area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next
+             << "Peak Dynamic = " << rnu->power.readOp.dynamic * clockRate
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? rnu->power.readOp.longer_channel_leakage
+                              : rnu->power.readOp.leakage)
+             << " W" << endl;
+        // cout << indent_str_next << "Subthreshold Leakage = " <<
+        // rnu->power.readOp.longer_channel_leakage  << " W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << rnu->power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next << "Runtime Dynamic = "
+             << rnu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+        cout << endl;
+        if (plevel > 2) {
+          rnu->displayEnergy(indent + 4, plevel, is_tdp);
+        }
+      }
+    }
+    if (lsu->exist) {
+      cout << indent_str << "Load Store Unit:" << endl;
+      cout << indent_str_next << "Area = " << lsu->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << lsu->power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? lsu->power.readOp.longer_channel_leakage
+                            : lsu->power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // lsu->power.readOp.longer_channel_leakage  << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << lsu->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << lsu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+      if (plevel > 2) {
+        lsu->displayEnergy(indent + 4, plevel, is_tdp);
+      }
+    }
+    if (mmu->exist) {
+      cout << indent_str << "Memory Management Unit:" << endl;
+      cout << indent_str_next << "Area = " << mmu->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << mmu->power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? mmu->power.readOp.longer_channel_leakage
+                            : mmu->power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // mmu->power.readOp.longer_channel_leakage   << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << mmu->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << mmu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+      if (plevel > 2) {
+        mmu->displayEnergy(indent + 4, plevel, is_tdp);
+      }
+    }
+    if (exu->exist) {
+      cout << indent_str << "Execution Unit:" << endl;
+      cout << indent_str_next << "Area = " << exu->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << exu->power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next
+           << "Peak Dynamic Energy = " << exu->power.readOp.dynamic << " W"
+           << endl;
+      cout << indent_str_next << "clock Rate = " << clockRate << " W" << endl;
+      // NOTE(review): the two prints above label an "Energy" value and a clock rate with " W" — confirm units.
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? exu->power.readOp.longer_channel_leakage
+                            : exu->power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // exu->power.readOp.longer_channel_leakage << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << exu->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << exu->rt_power.readOp.dynamic / executionTime << " W" << endl;
+      cout << endl;
+      if (plevel > 2) {
+        exu->displayEnergy(indent + 4, plevel, is_tdp);
+      }
+    }
+    //		if (plevel >2)
+    //		{
+    //			if (undiffCore->exist)
+    //			{
+    //				cout << indent_str << "Undifferentiated Core" <<
+    // endl; 				cout << indent_str_next << "Area = " <<
+    // undiffCore->area.get_area()*1e-6<< " mm^2" << endl;
+    // cout
+    // << indent_str_next << "Peak Dynamic = " <<
+    // undiffCore->power.readOp.dynamic*clockRate << " W" << endl;
+    ////				cout << indent_str_next << "Subthreshold Leakage
+    ///=
+    ///"
+    ///<< undiffCore->power.readOp.leakage <<" W" << endl;
+    //				cout << indent_str_next << "Subthreshold Leakage
+    //=
+    //"
+    //								<<
+    //(long_channel?
+    // undiffCore->power.readOp.longer_channel_leakage:undiffCore->power.readOp.leakage)
+    //<< " W" << endl; 				cout << indent_str_next << "Gate Leakage
+    //=
+    //"
+    //<< undiffCore->power.readOp.gate_leakage << " W" << endl;
+    //				//		cout << indent_str_next << "Runtime Dynamic
+    //=
+    //"
+    //<< undiffCore->rt_power.readOp.dynamic/executionTime << " W" << endl;
+    // cout
+    //<<endl;
+    //			}
+    //		}
+    if (XML->sys.Private_L2) {
+      // NOTE(review): the indent passed here is the literal 4, not derived from 'indent' — confirm.
+      l2cache->displayEnergy(4, is_tdp);
+    }
+
+    cout << indent_str << "Idle Core: " << endl;
+    cout << indent_str_next
+         << "Runtime Dynamic = " << IdleCoreEnergy / executionTime << " W\n"
+         << endl;
+
+  } else {
+    //		cout << indent_str_next << "Instruction Fetch Unit    Peak
+    //Dynamic
+    //=
+    //"
+    //<< ifu->rt_power.readOp.dynamic*clockRate << " W" << endl;
+    // cout
+    //<< indent_str_next << "Instruction Fetch Unit    Subthreshold Leakage = "
+    // << ifu->rt_power.readOp.leakage <<" W" << endl; 		cout <<
+    // indent_str_next << "Instruction Fetch Unit    Gate Leakage = " <<
+    // ifu->rt_power.readOp.gate_leakage << " W" << endl; 		cout <<
+    // indent_str_next
+    //<< "Load Store Unit   Peak Dynamic = " <<
+    // lsu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout
+    // << indent_str_next << "Load Store Unit   Subthreshold Leakage = " <<
+    // lsu->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next
+    // << "Load Store Unit   Gate Leakage = " <<
+    // lsu->rt_power.readOp.gate_leakage
+    //<< " W" << endl; 		cout << indent_str_next << "Memory Management
+    // Unit Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate  << " W"
+    // <<
+    // endl; 		cout << indent_str_next << "Memory Management Unit
+    // Subthreshold Leakage = " << mmu->rt_power.readOp.leakage  << " W" <<
+    // endl; 		cout
+    // << indent_str_next << "Memory Management Unit   Gate Leakage = " <<
+    // mmu->rt_power.readOp.gate_leakage  << " W" << endl; 		cout <<
+    // indent_str_next << "Execution Unit   Peak Dynamic = " <<
+    // exu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout
+    // << indent_str_next << "Execution Unit   Subthreshold Leakage = " <<
+    // exu->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next
+    // << "Execution Unit   Gate Leakage = " <<
+    // exu->rt_power.readOp.gate_leakage
+    //<< " W" << endl;
+  }
+}
+InstFetchU ::~InstFetchU() {
+  // Nothing is released unless 'exist' is set; every freed pointer is cleared.
+  if (exist) {
+    if (IB != 0) {
+      delete IB;
+      IB = 0;
+    }
+    if (ID_inst != 0) {
+      delete ID_inst;
+      ID_inst = 0;
+    }
+    if (ID_operand != 0) {
+      delete ID_operand;
+      ID_operand = 0;
+    }
+    if (ID_misc != 0) {
+      delete ID_misc;
+      ID_misc = 0;
+    }
+    if (coredynp.predictionW > 0) {
+      if (BTB != 0) {
+        delete BTB;
+        BTB = 0;
+      }
+      if (BPT != 0) {
+        delete BPT;
+        BPT = 0;
+      }
+    }
+  }
+}
+
+BranchPredictor ::~BranchPredictor() {
+  // Nothing is released unless 'exist' is set; every freed pointer is cleared.
+  if (exist) {
+    if (globalBPT != 0) {
+      delete globalBPT;
+      globalBPT = 0;
+    }
+    if (localBPT != 0) {
+      delete localBPT;
+      localBPT = 0;
+    }
+    if (L1_localBPT != 0) {
+      delete L1_localBPT;
+      L1_localBPT = 0;
+    }
+    if (L2_localBPT != 0) {
+      delete L2_localBPT;
+      L2_localBPT = 0;
+    }
+    if (chooser != 0) {
+      delete chooser;
+      chooser = 0;
+    }
+    if (RAS != 0) {
+      delete RAS;
+      RAS = 0;
+    }
+  }
+}
+
+RENAMINGU ::~RENAMINGU() {
+  // Frees every table this unit points to and clears the pointers; no-op when !exist.
+  if (!exist)
+    return;
+  if (iFRAT) {
+    delete iFRAT;
+    iFRAT = 0;
+  }
+  if (fFRAT) {
+    delete fFRAT;
+    fFRAT = 0;
+  }
+  if (iRRAT) {
+    delete iRRAT;
+    iRRAT = 0;
+  }
+  if (fRRAT) { // previously a second iFRAT check, so fRRAT was never freed
+    delete fRRAT;
+    fRRAT = 0;
+  }
+  if (ifreeL) {
+    delete ifreeL;
+    ifreeL = 0;
+  }
+  if (ffreeL) {
+    delete ffreeL;
+    ffreeL = 0;
+  }
+  if (idcl) {
+    delete idcl;
+    idcl = 0;
+  }
+  if (fdcl) {
+    delete fdcl;
+    fdcl = 0;
+  }
+  if (RAHT) {
+    delete RAHT;
+    RAHT = 0;
+  }
+}
+
+LoadStoreU ::~LoadStoreU() {
+  // Nothing is released unless 'exist' is set; the freed pointer is cleared.
+  if (exist) {
+    if (LSQ != 0) {
+      delete LSQ;
+      LSQ = 0;
+    }
+  }
+}
+
+MemManU ::~MemManU() {
+  // Nothing is released unless 'exist' is set; every freed pointer is cleared.
+  if (exist) {
+    if (itlb != 0) {
+      delete itlb;
+      itlb = 0;
+    }
+    if (dtlb != 0) {
+      delete dtlb;
+      dtlb = 0;
+    }
+  }
+}
+
+RegFU ::~RegFU() {
+  // Nothing is released unless 'exist' is set; every freed pointer is cleared.
+  if (exist) {
+    if (IRF != 0) {
+      delete IRF;
+      IRF = 0;
+    }
+    if (FRF != 0) {
+      delete FRF;
+      FRF = 0;
+    }
+    if (RFWIN != 0) {
+      delete RFWIN;
+      RFWIN = 0;
+    }
+  }
+}
+
+SchedulerU ::~SchedulerU() {
+  // Frees the scheduler's structures and clears the pointers; no-op when !exist.
+  if (!exist)
+    return;
+  if (int_inst_window) {
+    delete int_inst_window;
+    int_inst_window = 0;
+  }
+  if (fp_inst_window) {
+    delete fp_inst_window; // previously deleted int_inst_window here, leaking this one
+    fp_inst_window = 0;
+  }
+  if (ROB) {
+    delete ROB;
+    ROB = 0;
+  }
+  if (instruction_selection) {
+    delete instruction_selection;
+    instruction_selection = 0;
+  }
+}
+
+EXECU ::~EXECU() {
+  // Nothing is released unless 'exist' is set; every freed pointer is cleared.
+  if (exist) {
+    if (int_bypass != 0) {
+      delete int_bypass;
+      int_bypass = 0;
+    }
+    if (intTagBypass != 0) {
+      delete intTagBypass;
+      intTagBypass = 0;
+    }
+    if (int_mul_bypass != 0) {
+      delete int_mul_bypass;
+      int_mul_bypass = 0;
+    }
+    if (intTag_mul_Bypass != 0) {
+      delete intTag_mul_Bypass;
+      intTag_mul_Bypass = 0;
+    }
+    if (fp_bypass != 0) {
+      delete fp_bypass;
+      fp_bypass = 0;
+    }
+    if (fpTagBypass != 0) {
+      delete fpTagBypass;
+      fpTagBypass = 0;
+    }
+    if (fp_u != 0) {
+      delete fp_u;
+      fp_u = 0;
+    }
+    if (exeu != 0) {
+      delete exeu;
+      exeu = 0;
+    }
+    if (mul != 0) {
+      delete mul;
+      mul = 0;
+    }
+    if (rfu != 0) {
+      delete rfu;
+      rfu = 0;
+    }
+    if (scheu != 0) {
+      delete scheu;
+      scheu = 0;
+    }
+  }
+}
+
+Core ::~Core() {
+  // Releases each sub-model in the same order as before and clears its pointer.
+  // 'delete' on a null pointer is a no-op, so no per-pointer guard is needed.
+  // Instruction fetch unit.
+  delete ifu;
+  ifu = 0;
+
+  // Load/store unit.
+  delete lsu;
+  lsu = 0;
+
+  // Renaming unit.
+  delete rnu;
+  rnu = 0;
+
+  // Memory management unit.
+  delete mmu;
+  mmu = 0;
+
+  // Execution unit.
+  delete exu;
+  exu = 0;
+
+  // Pipeline model (corepipe).
+  delete corepipe;
+  corepipe = 0;
+
+  // Undifferentiated-core model.
+  delete undiffCore;
+  undiffCore = 0;
+
+  // L2 cache model.
+  delete l2cache;
+  l2cache = 0;
+}
+
+void Core::set_core_param() { // fills coredynp from XML->sys and XML->sys.core[ithCore]
+  coredynp.opt_local = XML->sys.core[ithCore].opt_local;
+  coredynp.x86 = XML->sys.core[ithCore].x86;
+  coredynp.Embedded = XML->sys.Embedded;
+  coredynp.core_ty = (enum Core_type)XML->sys.core[ithCore].machine_type;
+  coredynp.rm_ty = (enum Renaming_type)XML->sys.core[ithCore].rename_scheme;
+  coredynp.fetchW = XML->sys.core[ithCore].fetch_width;
+  coredynp.decodeW = XML->sys.core[ithCore].decode_width;
+  coredynp.issueW = XML->sys.core[ithCore].issue_width;
+  coredynp.peak_issueW = XML->sys.core[ithCore].peak_issue_width;
+  coredynp.commitW = XML->sys.core[ithCore].commit_width;
+  coredynp.peak_commitW = XML->sys.core[ithCore].peak_issue_width; // NOTE(review): read from peak_issue_width — confirm intended
+  coredynp.predictionW = XML->sys.core[ithCore].prediction_width;
+  coredynp.fp_issueW = XML->sys.core[ithCore].fp_issue_width;
+  coredynp.fp_decodeW = XML->sys.core[ithCore].fp_issue_width; // NOTE(review): read from fp_issue_width — confirm intended
+  coredynp.num_alus = XML->sys.core[ithCore].ALU_per_core;
+  coredynp.num_fpus = XML->sys.core[ithCore].FPU_per_core;
+  coredynp.num_muls = XML->sys.core[ithCore].MUL_per_core;
+
+  coredynp.num_hthreads = XML->sys.core[ithCore].number_hardware_threads;
+  coredynp.multithreaded = coredynp.num_hthreads > 1 ? true : false;
+  coredynp.instruction_length = XML->sys.core[ithCore].instruction_length; // overwritten with 32 further down
+  coredynp.pc_width = XML->sys.virtual_address_width;
+
+  coredynp.opcode_length = XML->sys.core[ithCore].opcode_width;
+  coredynp.micro_opcode_length = XML->sys.core[ithCore].micro_opcode_width;
+  coredynp.num_pipelines = XML->sys.core[ithCore].pipelines_per_core[0];
+  coredynp.pipeline_stages = XML->sys.core[ithCore].pipeline_depth[0];
+  coredynp.num_fp_pipelines = XML->sys.core[ithCore].pipelines_per_core[1];
+  coredynp.fp_pipeline_stages = XML->sys.core[ithCore].pipeline_depth[1];
+  coredynp.int_data_width = int(ceil(XML->sys.machine_bits / 32.0)) * 32; // rounded up to a multiple of 32
+  coredynp.fp_data_width = coredynp.int_data_width;
+  coredynp.v_address_width = XML->sys.virtual_address_width;
+  coredynp.p_address_width = XML->sys.physical_address_width;
+
+  coredynp.scheu_ty =
+      (enum Scheduler_type)XML->sys.core[ithCore].instruction_window_scheme;
+  coredynp.arch_ireg_width =
+      int(ceil(log2(XML->sys.core[ithCore].archi_Regs_IRF_size)));
+  coredynp.arch_freg_width =
+      int(ceil(log2(XML->sys.core[ithCore].archi_Regs_FRF_size)));
+  coredynp.num_IRF_entry = XML->sys.core[ithCore].archi_Regs_IRF_size;
+  coredynp.num_FRF_entry = XML->sys.core[ithCore].archi_Regs_FRF_size;
+  coredynp.pipeline_duty_cycle = XML->sys.core[ithCore].pipeline_duty_cycle;
+  coredynp.total_cycles = XML->sys.core[ithCore].total_cycles;
+  coredynp.busy_cycles = XML->sys.core[ithCore].busy_cycles;
+  coredynp.idle_cycles = XML->sys.core[ithCore].idle_cycles;
+
+  // Max power duty cycle for peak power estimation
+  //	if (coredynp.core_ty==OOO)
+  //	{
+  //		coredynp.IFU_duty_cycle = 1;
+  //		coredynp.LSU_duty_cycle = 1;
+  //		coredynp.MemManU_I_duty_cycle =1;
+  //		coredynp.MemManU_D_duty_cycle =1;
+  //		coredynp.ALU_duty_cycle =1;
+  //		coredynp.MUL_duty_cycle =1;
+  //		coredynp.FPU_duty_cycle =1;
+  //		coredynp.ALU_cdb_duty_cycle =1;
+  //		coredynp.MUL_cdb_duty_cycle =1;
+  //		coredynp.FPU_cdb_duty_cycle =1;
+  //	}
+  //	else
+  //	{
+  coredynp.IFU_duty_cycle = XML->sys.core[ithCore].IFU_duty_cycle;
+  coredynp.BR_duty_cycle = XML->sys.core[ithCore].BR_duty_cycle;
+  coredynp.LSU_duty_cycle = XML->sys.core[ithCore].LSU_duty_cycle;
+  coredynp.MemManU_I_duty_cycle = XML->sys.core[ithCore].MemManU_I_duty_cycle;
+  coredynp.MemManU_D_duty_cycle = XML->sys.core[ithCore].MemManU_D_duty_cycle;
+  coredynp.ALU_duty_cycle = XML->sys.core[ithCore].ALU_duty_cycle;
+  coredynp.MUL_duty_cycle = XML->sys.core[ithCore].MUL_duty_cycle;
+  coredynp.FPU_duty_cycle = XML->sys.core[ithCore].FPU_duty_cycle;
+  coredynp.ALU_cdb_duty_cycle = XML->sys.core[ithCore].ALU_cdb_duty_cycle;
+  coredynp.MUL_cdb_duty_cycle = XML->sys.core[ithCore].MUL_cdb_duty_cycle;
+  coredynp.FPU_cdb_duty_cycle = XML->sys.core[ithCore].FPU_cdb_duty_cycle;
+  //	}
+
+  if (!((coredynp.core_ty == OOO) || (coredynp.core_ty == Inorder))) {
+    cout << "Invalid Core Type" << endl;
+    exit(1); // configuration error: exit with a failure status (was exit(0))
+  }
+  //	if (coredynp.core_ty==OOO)
+  //	{
+  //		cout<<"OOO processor models are being updated and will be
+  // available in next release"<<endl; 		exit(0);
+  //	}
+  if (!((coredynp.scheu_ty == PhysicalRegFile) ||
+        (coredynp.scheu_ty == ReservationStation))) {
+    cout << "Invalid OOO Scheduler Type" << endl;
+    exit(1); // configuration error: exit with a failure status (was exit(0))
+  }
+
+  if (!((coredynp.rm_ty == RAMbased) || (coredynp.rm_ty == CAMbased))) {
+    cout << "Invalid OOO Renaming Type" << endl;
+    exit(1); // configuration error: exit with a failure status (was exit(0))
+  }
+
+  if (coredynp.core_ty == OOO) {
+    if (coredynp.scheu_ty == PhysicalRegFile) {
+      coredynp.phy_ireg_width =
+          int(ceil(log2(XML->sys.core[ithCore].phy_Regs_IRF_size)));
+      coredynp.phy_freg_width =
+          int(ceil(log2(XML->sys.core[ithCore].phy_Regs_FRF_size)));
+      coredynp.num_ifreelist_entries = coredynp.num_IRF_entry =
+          XML->sys.core[ithCore].phy_Regs_IRF_size;
+      coredynp.num_ffreelist_entries = coredynp.num_FRF_entry =
+          XML->sys.core[ithCore].phy_Regs_FRF_size;
+    } else if (coredynp.scheu_ty ==
+               ReservationStation) { // ROB serves as Phy RF in RS based OOO
+      coredynp.phy_ireg_width =
+          int(ceil(log2(XML->sys.core[ithCore].ROB_size)));
+      coredynp.phy_freg_width =
+          int(ceil(log2(XML->sys.core[ithCore].ROB_size)));
+      coredynp.num_ifreelist_entries = XML->sys.core[ithCore].ROB_size;
+      coredynp.num_ffreelist_entries = XML->sys.core[ithCore].ROB_size;
+    }
+  }
+  coredynp.globalCheckpoint = 32; // best check pointing entries for a 4~8 issue
+                                  // OOO should be 16~48;See TR for reference.
+  coredynp.perThreadState = 8;
+  coredynp.instruction_length = 32; // replaces the XML value assigned above
+  coredynp.clockRate = XML->sys.core[ithCore].clock_rate;
+  coredynp.clockRate *= 1e6; // presumably MHz -> Hz; confirm against the XML schema
+  coredynp.regWindowing = (XML->sys.core[ithCore].register_windows_size > 0 &&
+                           coredynp.core_ty == Inorder)
+                              ? true
+                              : false;
+  coredynp.executionTime = XML->sys.total_cycles / coredynp.clockRate;
+  set_pppm(coredynp.pppm_lkg_multhread, 0, coredynp.num_hthreads,
+           coredynp.num_hthreads, 0);
+}
diff --git a/src/gpuwattch/core.h b/src/gpuwattch/core.h
new file mode 100644
index 000000000..b63abce2a
--- /dev/null
+++ b/src/gpuwattch/core.h
@@ -0,0 +1,556 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ *      Jingwen Leng, University of Texas, Austin
+ *      Syed Gilani, University of Wisconsin–Madison
+ *      Tayler Hetherington, University of British Columbia
+ *      Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#ifndef CORE_H_
+#define CORE_H_
+
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "cacti/arbiter.h"
+#include "cacti/crossbar.h"
+#include "cacti/parameter.h"
+#include "interconnect.h"
+#include "logic.h"
+#include "noc.h"
+#include "sharedcache.h"
+
+class BranchPredictor : public Component { // Component holding the branch-prediction arrays declared below
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  ArrayST *globalBPT; // predictor arrays, held through raw pointers;
+  ArrayST *localBPT; // NOTE(review): allocation and release happen in the out-of-line
+  ArrayST *L1_localBPT; // constructor/destructor, which are not visible from this header
+  ArrayST *L2_localBPT;
+  ArrayST *chooser;
+  ArrayST *RAS;
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  // Constructor; the trailing flag defaults to true and is spelled exist_ like in the sibling units.
+  BranchPredictor(ParseXML *XML_interface, int ithCore_,
+                  InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+                  bool exist_ = true);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~BranchPredictor();
+};
+
+class InstFetchU : public Component { // Instruction-fetch unit: instruction cache, IB/BTB arrays, branch predictor and three decoders
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  enum Cache_policy cache_p;
+  InstCache icache; // held by value; Core's icache accessors read icache.caches/missb/ifb/prefetchb
+  ArrayST *IB; // read by Core::get_coefficient_tot_insts()
+  ArrayST *BTB;
+  BranchPredictor *BPT;
+  inst_decoder *ID_inst; // the three decoders' power_t.readOp.dynamic values
+  inst_decoder *ID_operand; // are summed by Core::get_coefficient_tot_insts()
+  inst_decoder *ID_misc;
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  // Constructor; the trailing flag defaults to true and is spelled exist_ like in the sibling units.
+  InstFetchU(ParseXML *XML_interface, int ithCore_,
+             InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+             bool exist_ = true);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~InstFetchU();
+};
+
+class SchedulerU : public Component { // Scheduler unit: instruction windows, ROB and selection logic
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  double Iw_height, fp_Iw_height, ROB_height;
+  ArrayST *int_inst_window; // read by Core::get_coefficient_fpint_insts()
+  ArrayST *fp_inst_window;
+  ArrayST *ROB;
+  selection_logic *instruction_selection; // read by Core::get_coefficient_fpint_insts()
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  // Constructor; exist_ defaults to true.
+  SchedulerU(ParseXML *XML_interface, int ithCore_,
+             InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+             bool exist_ = true);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~SchedulerU();
+};
+
+class RENAMINGU : public Component { // Renaming unit: RAT arrays, free lists, dependency-check logic and RAHT
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  double clockRate, executionTime;
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  ArrayST *iFRAT; // i* / f* pairs: presumably the integer and floating-point variants - confirm
+  ArrayST *fFRAT;
+  ArrayST *iRRAT;
+  ArrayST *fRRAT;
+  ArrayST *ifreeL;
+  ArrayST *ffreeL;
+  dep_resource_conflict_check *idcl;
+  dep_resource_conflict_check *fdcl;
+  ArrayST *RAHT; // register alias history table Used to store GC
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  // Constructor; exist_ defaults to true.
+  RENAMINGU(ParseXML *XML_interface, int ithCore_,
+            InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+            bool exist_ = true);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~RENAMINGU();
+};
+
+class LoadStoreU : public Component { // Load/store unit: four DataCache members, queues, NoCs and a shared crossbar
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  enum Cache_policy cache_p; // single policy value; Core's dcache/tcache/ccache miss accessors all compare it to Write_back
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  double lsq_height;
+  DataCache dcache; // the four caches are held by value
+  DataCache ccache;
+  DataCache tcache;
+  DataCache sharedmemory;
+  ArrayST *LSQ; // it is actually the store queue but for inorder processors it
+                // serves as both loadQ and StoreQ
+  ArrayST *LoadQ;
+  vector<NoC *> nocs;
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  Crossbar *xbar_shared; // its power.readOp.dynamic is added to every *_readhits/_writehits sum in Core
+  Component noc;
+  LoadStoreU(ParseXML *XML_interface, int ithCore_,
+             InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+             bool exist_ = true);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  void displayDeviceType(int device_type_,
+                         uint32_t indent); // Added by Syed Gilani
+
+  ~LoadStoreU();
+};
+
+class MemManU : public Component { // Memory-management unit: holds the itlb and dtlb arrays
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  ArrayST *itlb;
+  ArrayST *dtlb;
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  // NOTE(review): exist_ defaults to false here, whereas the sibling units default it to true - confirm this is intended.
+  MemManU(ParseXML *XML_interface, int ithCore_, InputParameter *interface_ip_,
+          const CoreDynParam &dyn_p_, bool exist_ = false);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~MemManU();
+};
+
+class RegFU : public Component { // Register-file unit: IRF/FRF/RFWIN arrays, operand collectors, crossbar and arbiter
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  double int_regfile_height, fp_regfile_height;
+  ArrayST *IRF; // read by Core::get_coefficient_regreads_accesses() and get_coefficient_regwrites_accesses()
+  ArrayST *FRF;
+  ArrayST *RFWIN;
+  ArrayST *OPC; // Operand collectors
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  double exClockRate; // NOTE(review): the constructor parameter below has the same name and shadows this member inside the constructor
+  // OC Modelling (Syed)
+  Crossbar *xbar_rfu; // xbar_rfu and arbiter_rfu are read by Core's register-access accessors
+  MCPAT_Arbiter *arbiter_rfu;
+  RegFU(ParseXML *XML_interface, int ithCore_, InputParameter *interface_ip_,
+        const CoreDynParam &dyn_p_, double exClockRate, bool exist_ = true);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~RegFU();
+};
+
+class EXECU : public Component { // Execution unit: register file unit, scheduler, functional units and bypass interconnects
+public:
+  ParseXML *XML; // parsed XML input; presumably the XML_interface constructor argument - confirm in the .cc
+  int ithCore; // presumably the index of the owning core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  double lsq_height; // presumably set from the lsq_height_ constructor argument - confirm
+  CoreDynParam coredynp; // CoreDynParam stored by value
+  RegFU *rfu; // rfu and scheu are dereferenced by Core's get_coefficient_*() accessors
+  SchedulerU *scheu;
+  FunctionalUnit *fp_u; // per_access_energy read by Core::get_coefficient_fpu_accesses()
+  FunctionalUnit *exeu; // per_access_energy read by Core::get_coefficient_ialu_accesses()
+  FunctionalUnit *mul; // per_access_energy read by Core::get_coefficient_sfu_accesses()
+  interconnect *int_bypass;
+  interconnect *intTagBypass;
+  interconnect *int_mul_bypass;
+  interconnect *intTag_mul_Bypass;
+  interconnect *fp_bypass;
+  interconnect *fpTagBypass;
+  bool exist; // presumably mirrors the exist_ constructor argument - confirm
+  double rf_fu_clockRate;
+  Component bypass;
+  // Unlike the sibling units, exist_ has no default value here and must be passed explicitly.
+  EXECU(ParseXML *XML_interface, int ithCore_, InputParameter *interface_ip_,
+        double lsq_height_, const CoreDynParam &dyn_p_, double exClockRate,
+        bool exist_);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~EXECU();
+};
+
+class Core : public Component { // Per-core component: aggregates the sub-units below and exposes the get_coefficient_*() sums
+public:
+  ParseXML *XML; // parsed XML input; XML->sys is read by get_coefficient_duty_cycle()
+  int ithCore; // presumably the index of this core in XML->sys.core[] - confirm
+  InputParameter interface_ip; // InputParameter stored by value
+  double clockRate, executionTime;
+  double exClockRate;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  InstFetchU *ifu; // sub-units are held through raw pointers; the accessors below
+  LoadStoreU *lsu; // dereference ifu, lsu, exu and corepipe without null checks
+  MemManU *mmu;
+  EXECU *exu;
+  RENAMINGU *rnu;
+  double IdleCoreEnergy;
+  double IdlePower_PerCore;
+  Pipeline *corepipe; // power.readOp.dynamic is read by get_coefficient_duty_cycle()
+  UndiffCore *undiffCore;
+  SharedCache *l2cache;
+  CoreDynParam coredynp; // num_pipelines is read by get_coefficient_duty_cycle()
+  double Pipeline_energy;
+  // full_decoder 	inst_decoder;
+  // clock_network	clockNetwork;
+  Core(ParseXML *XML_interface, int ithCore_, InputParameter *interface_ip_);
+  void set_core_param();
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  // The get_coefficient_ accessors below add up selected "dynamic" fields of sub-unit results and return the sum as float.
+  float get_coefficient_icache_hits() { // icache array readOp only
+    // return 1.5*ifu->icache.caches->local_result.power.readOp.dynamic;
+    return ifu->icache.caches->local_result.power.readOp.dynamic;
+  }
+  // icache miss: array writeOp + readOp, plus searchOp and writeOp of the missb, ifb and prefetchb buffers.
+  float get_coefficient_icache_misses() {
+    float value = 0;
+    value += ifu->icache.caches->local_result.power.writeOp.dynamic;
+    value += ifu->icache.caches->local_result.power.readOp.dynamic;
+    value += ifu->icache.missb->local_result.power.searchOp.dynamic;
+    value += ifu->icache.missb->local_result.power.writeOp.dynamic;
+    value += ifu->icache.ifb->local_result.power.searchOp.dynamic;
+    value += ifu->icache.ifb->local_result.power.writeOp.dynamic;
+    value += ifu->icache.prefetchb->local_result.power.searchOp.dynamic;
+    value += ifu->icache.prefetchb->local_result.power.writeOp.dynamic;
+    return value;
+  }
+  // Per-instruction term: IB readOp + writeOp plus the readOp of the three decoders (ID_inst, ID_operand, ID_misc).
+  float get_coefficient_tot_insts() {
+    float value = 0;
+    value += ifu->IB->local_result.power.readOp.dynamic;
+    value += ifu->IB->local_result.power.writeOp.dynamic;
+    value += ifu->ID_inst->power_t.readOp.dynamic;
+    value += ifu->ID_operand->power_t.readOp.dynamic;
+    value += ifu->ID_misc->power_t.readOp.dynamic;
+    return value;
+  }
+  // int_inst_window readOp + 2 x searchOp + writeOp, plus instruction_selection readOp.
+  float get_coefficient_fpint_insts() {
+    float value = 0;
+    value += exu->scheu->int_inst_window->local_result.power.readOp.dynamic;
+    value +=
+        2 * exu->scheu->int_inst_window->local_result.power.searchOp.dynamic;
+    value += exu->scheu->int_inst_window->local_result.power.writeOp.dynamic;
+    value += exu->scheu->instruction_selection->power.readOp.dynamic;
+    return value;
+  }
+  // dcache read hit: dcache array readOp plus xbar_shared readOp.
+  float get_coefficient_dcache_readhits() {
+    float value = 0;
+    value += lsu->dcache.caches->local_result.power.readOp.dynamic;
+    value += lsu->xbar_shared->power.readOp.dynamic;
+    // return 0.5*value;
+    return value;
+  }
+  float get_coefficient_dcache_readmisses() { // array readOp; missb/ifb/prefetchb searchOp + writeOp are added only when cache_p is not Write_back
+    float value = 0;
+    value += lsu->dcache.caches->local_result.power.readOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->dcache.missb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->dcache.ifb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->dcache.prefetchb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->dcache.missb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->dcache.ifb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->dcache.prefetchb->local_result.power.writeOp.dynamic;
+
+    // return 0.5*value;
+    return value;
+  }
+  float get_coefficient_dcache_writehits() { // dcache array writeOp plus xbar_shared readOp
+    float value = 0;
+    value += lsu->dcache.caches->local_result.power.writeOp.dynamic;
+    value += lsu->xbar_shared->power.readOp.dynamic;
+    return value;
+  }
+  float get_coefficient_dcache_writemisses() { // array writeOp + tag_array2 readOp; under Write_back also a second array writeOp and missb/ifb/prefetchb/wbb searchOp + writeOp
+    float value = 0;
+    value += lsu->dcache.caches->local_result.power.writeOp.dynamic;
+    value += lsu->dcache.caches->local_result.tag_array2->power.readOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.caches->local_result.power.writeOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.missb->local_result.power.searchOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.ifb->local_result.power.searchOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.prefetchb->local_result.power.searchOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.wbb->local_result.power.searchOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.missb->local_result.power.writeOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.ifb->local_result.power.writeOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.prefetchb->local_result.power.writeOp.dynamic
+                 : 0;
+    value += (lsu->cache_p == Write_back)
+                 ? lsu->dcache.wbb->local_result.power.writeOp.dynamic
+                 : 0;
+    // return 1.6*value;
+    return value;
+  }
+  // tcache read hit: tcache array readOp plus xbar_shared readOp.
+  float get_coefficient_tcache_readhits() {
+    float value = 0;
+    value += lsu->tcache.caches->local_result.power.readOp.dynamic;
+    value += lsu->xbar_shared->power.readOp.dynamic;
+    // return 0.2*value;
+    return value;
+  }
+  float get_coefficient_tcache_readmisses() { // tcache array readOp; missb/ifb/prefetchb searchOp + writeOp are added only when cache_p is not Write_back
+    float value = 0;
+    value += lsu->tcache.caches->local_result.power.readOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.missb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.missb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.ifb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.ifb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.prefetchb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.prefetchb->local_result.power.writeOp.dynamic;
+
+    // return 0.2*value;
+    return value;
+  }
+  float get_coefficient_tcache_readmisses1() { // only the array-readOp term of get_coefficient_tcache_readmisses()
+
+    return lsu->tcache.caches->local_result.power.readOp.dynamic;
+  }
+  float get_coefficient_tcache_readmisses2() { // only the buffer terms of get_coefficient_tcache_readmisses(); 0 under Write_back
+    float value = 0;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.missb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.missb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.ifb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.ifb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.prefetchb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->tcache.prefetchb->local_result.power.writeOp.dynamic;
+    return value;
+  }
+  // ccache read hit: ccache array readOp plus xbar_shared readOp.
+  float get_coefficient_ccache_readhits() {
+    // return 1.2*lsu->ccache.caches->local_result.power.readOp.dynamic+lsu->xbar_shared->power.readOp.dynamic;
+    // return 1.2*lsu->ccache.caches->local_result.power.readOp.dynamic+lsu->xbar_shared->power.readOp.dynamic;
+    return lsu->ccache.caches->local_result.power.readOp.dynamic +
+           lsu->xbar_shared->power.readOp.dynamic;
+  }
+  float get_coefficient_ccache_readmisses() { // ccache array readOp; missb/ifb/prefetchb searchOp + writeOp are added only when cache_p is not Write_back
+    float value = 0;
+    value += lsu->ccache.caches->local_result.power.readOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->ccache.missb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->ccache.ifb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->ccache.prefetchb->local_result.power.searchOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->ccache.missb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->ccache.ifb->local_result.power.writeOp.dynamic;
+    value += (lsu->cache_p == Write_back)
+                 ? 0
+                 : lsu->ccache.prefetchb->local_result.power.writeOp.dynamic;
+    return value;
+  }
+  // sharedmemory read hit: sharedmemory array readOp plus xbar_shared readOp.
+  float get_coefficient_sharedmemory_readhits() {
+    float value = 0;
+    value += lsu->sharedmemory.caches->local_result.power.readOp.dynamic;
+    value += lsu->xbar_shared->power.readOp.dynamic;
+    // return 3*value;
+    return value;
+  }
+  // Always returns 0: every LSQ term in the body is commented out.
+  float get_coefficient_lsq_accesses() {
+    float value = 0;
+    // Changed by Syed -- We have removed LSQ
+    // value+=2*lsu->LSQ->local_result.power.searchOp.dynamic;
+    // value+=2*lsu->LSQ->local_result.power.readOp.dynamic;
+    // value+=2*lsu->LSQ->local_result.power.writeOp.dynamic;
+    return value;
+  }
+  // Register read: (IRF readOp / 32) x (4 x 2), plus xbar_rfu readOp / 32, arbiter_rfu readOp / 32 and the full OPC readOp.
+  float get_coefficient_regreads_accesses() {
+    float value = 0;
+    value += ((exu->rfu->IRF->local_result.power.readOp.dynamic / 32) *
+              (4 * 2) /*/1.5*/);
+    value += exu->rfu->xbar_rfu->power.readOp.dynamic / (32 /**1.5*/);
+    value += (exu->rfu->arbiter_rfu->power.readOp.dynamic / 32 /**1.5)*/);
+    value += exu->rfu->OPC->local_result.power.readOp.dynamic /*/1.5*/;
+    return value;
+  }
+  // Register write: (IRF writeOp / 32) x (4 x 2).
+  float get_coefficient_regwrites_accesses() {
+    return ((exu->rfu->IRF->local_result.power.writeOp.dynamic / 32) *
+            (4 * 2) /*/1.5*/);
+  }
+  // Same as the register-read sum without its IRF term: xbar_rfu readOp / 32 + arbiter_rfu readOp / 32 + OPC readOp.
+  float get_coefficient_noregfileops_accesses() {
+    return ((exu->rfu->xbar_rfu->power.readOp.dynamic / (32 /**1.5*/)) +
+            (exu->rfu->arbiter_rfu->power.readOp.dynamic / (32 /**1.5*/)) +
+            (exu->rfu->OPC->local_result.power.readOp.dynamic /*/(1.5)*/));
+  }
+  // The next three accessors return a functional unit's per_access_energy scaled by g_tp.sckt_co_eff (exeu, mul, fp_u).
+  float get_coefficient_ialu_accesses() {
+    // return 10*exu->exeu->per_access_energy*g_tp.sckt_co_eff;
+    return exu->exeu->per_access_energy * g_tp.sckt_co_eff;
+  }
+
+  float get_coefficient_sfu_accesses() { // uses exu->mul
+    return exu->mul->per_access_energy * g_tp.sckt_co_eff;
+    // return 2.6*exu->mul->per_access_energy*g_tp.sckt_co_eff;
+  }
+
+  float get_coefficient_fpu_accesses() { // uses exu->fp_u
+    // return 3.2*exu->fp_u->per_access_energy*g_tp.sckt_co_eff;
+    return exu->fp_u->per_access_energy * g_tp.sckt_co_eff;
+  }
+  // total_cycles x number_of_cores x num_pipelines / num_units (4.0) x corepipe readOp.dynamic x 3, accumulated in float.
+  float get_coefficient_duty_cycle() {
+    float value = 0;
+    float num_units = 4.0;
+    value = XML->sys.total_cycles * XML->sys.number_of_cores;
+    value *= coredynp.num_pipelines;
+    value /= num_units;
+    value *= corepipe->power.readOp.dynamic;
+    value *= 3;
+    return value;
+    // return 1.5*value;
+  }
+
+  void compute();
+  ~Core();
+};
+
+#endif /* CORE_H_ */
diff --git a/src/gpuwattch/fermi.xml b/src/gpuwattch/fermi.xml
new file mode 100755
index 000000000..5b09d457b
--- /dev/null
+++ b/src/gpuwattch/fermi.xml
@@ -0,0 +1,497 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="GPU_Architecture" value="1"/><!-- 0-G80; 1-Fermi; others not supported -->
+		<param name="number_of_cores" value="16"/>
+		<param name="architecture" value="1"/> <!-- fermi:1 quadro:2 other: undefined-->
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="1"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/> 
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="40"/><!-- nm -->
+		<param name="target_core_clockrate" value="700"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="32"/>
+		<param name="virtual_address_width" value="32"/>
+		<param name="physical_address_width" value="32"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="total_cycles_match_mcpat"/>
+		<stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
+		<stat name="busy_cycles"  value="busy_cycles_match_mcpat"/>
+			<!--This page size (B) is completely different from the page size in the Main memory section. This page size is the size of
+			virtual memory from the OS/architecture perspective; the page size in the Main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="700"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+			the default value is machine_bits, if not set -->
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="32"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+			it may be more than one only in SMT processors. The number of BTB ports always equals the number of fetch ports, since
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once.-->
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determins the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determins the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="2"/>
+			<!-- issue_width determins the number of ports of Issue window and other logic 
+			as in the complexity effective proccessors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="2"/>
+			<!-- commit_width determins the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
+			These parameters are reserved for future use.-->
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="8,8"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="32"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="4"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="32"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="1"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="1"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="1"/>
+			<param name="fp_instruction_window_size" value="1"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="0"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+
+			<!-- SM parameters  Added by Syed Gilani -->
+			<param name="rf_banks" value="32"/>
+			<param name="simd_width" value="32"/>
+			<param name="collector_units" value="32"/>
+			<param name="core_clock_ratio" value="2"/>
+			<param name="warp_size" value="32"/>
+
+			<param name="archi_Regs_IRF_size" value="32768"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="32"/>
+			<param name="phy_Regs_FRF_size" value="32"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha).
+			They will always try to execute out of order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determins the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="1"/>						
+			<!-- general stats, define the simulation period; total, idle, and busy cycles are required for the sanity check  -->
+			<!-- please note: if the target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="total_instructions_match_mcpat"/>
+			<stat name="int_instructions" value="int_instruction_match_mcpat"/>
+			<stat name="fp_instructions" value="flt_instruction_match_mcpat"/>
+			<stat name="branch_instructions" value="branch_instruction_match_mcpat"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="load_instruction_match_mcpat"/>
+			<stat name="store_instructions" value="store_instruction_match_mcpat"/>
+			<stat name="committed_instructions" value="total_instructions_match_mcpat"/>
+			<stat name="committed_int_instructions" value="int_instruction_match_mcpat"/>
+			<stat name="committed_fp_instructions" value="flt_instruction_match_mcpat"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only;
+				please ignore them if the cores are homogeneous -->
+			<stat name="total_cycles" value="total_cycles_match_mcpat"/>
+		    <stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
+		    <stat name="busy_cycles"  value="busy_cycles_match_mcpat"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS and Phy based OoOs have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="int_register_read_access_match_mcpat"/>
+			<stat name="float_regfile_reads" value="int_register_write_access_match_mcpat"/>
+			<stat name="int_regfile_writes" value="float_register_read_access_match_mcpat"/>
+			<stat name="float_regfile_writes" value="float_register_write_access_match_mcpat"/>
+			
+			<!-- The following stat is for operand collector power - Added by Syed -->
+			<stat name="non_rf_operands" value="0"/>
+
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="0"/>
+			<stat name="context_switches" value="0"/> <!--not used in the McPAT -->
+			<!-- Number of Windowes switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="ialu_accesses_match_mcpat"/>			
+			<stat name="fpu_accesses" value="fpu_accesses_match_mcpat"/>
+			<stat name="mul_accesses" value="mul_accesses_match_mcpat"/>
+			<stat name="cdb_alu_accesses" value="0"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  Currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="1"/><!-- FPU numbers are already average -->
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="15"/>
+			<stat name="num_idle_cores" value="0"/><!-- Average Number of idle cores during this period -->  
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="0"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to icache although writes happen to it after miss, 
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy: 0 no write or write-through with non-write allocate; 1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="total_instructions_match_mcpat"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="0"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.ccache" name="ccache">
+			        <!-- all the buffer related are optional -->
+				<param name="ccache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="ccache_read_accesses_match_mcpat"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="ccache_read_misses_match_mcpat"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.tcache" name="tcache">
+			        <!-- all the buffer related are optional -->
+				<param name="tcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="tcache_read_accesses_match_mcpat"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="tcache_read_misses_match_mcpat"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+      <!-- model the shared memory by mimicking dcache -->
+			<component id="system.core0.sharedmemory" name="sharedmemory">
+			        <!-- all the buffer related are optional -->
+				<param name="sharedmemory_config" value="49152,16,1,16,1,3,16,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="sharedmemory_read_access_match_mcpat"/>
+				<stat name="write_accesses" value="sharedmemory_write_access_match_mcpat"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="dcache_read_access_match_mcpat"/>
+				<stat name="write_accesses" value="dcache_write_access_match_mcpat"/>
+				<stat name="read_misses" value="dcache_read_miss_match_mcpat"/>
+				<stat name="write_misses" value="dcache_write_miss_match_mcpat"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="0"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="131072,256,8,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1400"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+
+
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="700"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="2"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="6"/>
+			<param name="output_ports" value="6"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="32"/>
+			<param name="input_buffer_entries_per_vc" value="1"/><!--VCs within the same port share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="0"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.6"/>
+		</component>	
+<!--**********************************************************************-->
+<!--**********************************************************************-->
+
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="40"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="6"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- current version of McPAT uses published values for base parameters of memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="1848"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="29568"/><!--MB/S  Syed: GTX 470 has 177.4GB/s mem transfer rate with 6 MCs -->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="6"/><!-- 6 GDDR5 memory controllers  -->
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="2"/>
+			<param name="number_ranks" value="1"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="16"/>
+			<param name="IO_buffer_size_per_channel" value="16"/>
+			<param name="databus_width" value="32"/>
+			<param name="addressbus_width" value="32"/>
+			<param name="PRT_entries" value="32"/>
+
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="memory_accesses_match_mcpat"/>
+			<stat name="memory_reads" value="memory_reads_match_mcpat"/>
+			<stat name="memory_writes" value="memory_writes_match_mcpat"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+			the average power per MC or per channel. This is sufficient for most applications. 
+			Further tracking can be easily added in later versions. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates 
+			the average power per NIC or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates 
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates 
+			the average power per fc or per channel. This is sufficient for most applications -->  			
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/globalvar.h b/src/gpuwattch/globalvar.h
new file mode 100644
index 000000000..e0b76c2c6
--- /dev/null
+++ b/src/gpuwattch/globalvar.h
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef GLOBALVAR_H_
+#define GLOBALVAR_H_
+
+#ifdef GLOBALVAR /* includer defined GLOBALVAR before this header: EXTERN expands to nothing, */
+#define EXTERN /* so the declaration below carries no `extern` specifier in that translation unit */
+#else
+#define EXTERN extern /* every other includer only sees an extern declaration */
+#endif
+
+EXTERN bool opt_for_clk; /* NOTE(review): presumably an "optimize for clock rate" switch; its semantics are not shown in this header - confirm */
+
+#endif /* GLOBALVAR_H_ */
diff --git a/src/gpuwattch/gpgpu.xml b/src/gpuwattch/gpgpu.xml
new file mode 100644
index 000000000..313527c73
--- /dev/null
+++ b/src/gpuwattch/gpgpu.xml
@@ -0,0 +1,477 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="30"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="4"/> <!-- This number means how many L2 clusters; in each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/>
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="1"/>
+		<param name="core_tech_node" value="45"/><!-- nm -->
+		<param name="target_core_clockrate" value="1300"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="32"/>
+		<param name="virtual_address_width" value="32"/>
+		<param name="physical_address_width" value="32"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="4144"/>
+		<stat name="idle_cycles" value="0"/>
+		<stat name="busy_cycles"  value="4144"/>
+			<!--This page size(B) is completely different from the page size in the main memory section. This page size is the size of 
+			virtual memory from OS/Archi perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="1300"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller; 
+			default value is machine_bits, if not set --> 
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="32"/>
+			<!-- number_instruction_fetch_ports(icache ports) is always 1 in single-thread processor,
+			it may only be more than one in SMT processors. BTB ports always equal fetch ports since 
+			branch information in consecutive branch instructions in the same fetch group can be read out from BTB once.--> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of Issue window and other logic 
+			as in the complexity effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines. 
+			These parameters are reserved for future use.--> 
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="6,6"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="1"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="1"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="1"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="1"/>
+			<param name="fp_instruction_window_size" value="1"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="0"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+			<param name="archi_Regs_IRF_size" value="32"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="32"/>
+			<param name="phy_Regs_FRF_size" value="32"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either inorder(Pentium Pro) or (OoO)out-of-order(Alpha).
+			They will always try to execute out-of-order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="1"/>						
+			<!-- general stats, defines simulation periods; requires total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="58880"/>
+			<stat name="int_instructions" value="25600"/>
+			<stat name="fp_instructions" value="30720"/>
+			<stat name="branch_instructions" value="0"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="15360"/>
+			<stat name="store_instructions" value="2560"/>
+			<stat name="committed_instructions" value="58880"/>
+			<stat name="committed_int_instructions" value="25600"/>
+			<stat name="committed_fp_instructions" value="30720"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only, 
+				please ignore them if homogeneous cores -->
+			<stat name="total_cycles" value="4144"/>
+		    <stat name="idle_cycles" value="0"/>
+		    <stat name="busy_cycles"  value="4144"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats, both RS and Phy based OoOs have ROB
+			performance simulator should capture the difference on accesses,
+			otherwise, McPAT has to guess based on number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses -->
+			<stat name="int_regfile_reads" value="40960"/>
+			<stat name="float_regfile_reads" value="28160"/>
+			<stat name="int_regfile_writes" value="25600"/>
+			<stat name="float_regfile_writes" value="28160"/>
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="0"/>
+			<stat name="context_switches" value="0"/> <!--not used in the McPAT -->
+			<!-- Number of window switches (number of function calls and returns) -->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="20480"/>			
+			<stat name="fpu_accesses" value="12800"/>
+			<stat name="mul_accesses" value="10000"/>
+			<stat name="cdb_alu_accesses" value="0"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should 
+			make sure all the numbers are final numbers, 
+			including the explicit read/write accesses, 
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on param and stats of last level cache
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="0.4"/>
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="1"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy: 0 = no write, or write-through with no-write-allocate; 1 = write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="29440"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="1"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.ccache" name="ccache">
+			        <!-- all the buffer related are optional -->
+				<param name="ccache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="0"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.tcache" name="tcache">
+			        <!-- all the buffer related are optional -->
+				<param name="tcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="0"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+      <!-- model the shared memory by mimicking dcache -->
+			<component id="system.core0.sharedmemory" name="sharedmemory">
+			        <!-- all the buffer related are optional -->
+				<param name="sharedmemory_config" value="16384,16,1,16,1,3,16,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="5120"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="15360"/>
+				<stat name="write_accesses" value="2560"/>
+				<stat name="read_misses" value="720"/>
+				<stat name="write_misses" value="120"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- Although there are multiple access types, the
+				performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- Although there are multiple access types, the
+				performance simulator needs to cast them into reads or writes,
+				e.g. invalidates can be considered writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="1300"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="2"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="8"/>
+			<param name="output_ports" value="5"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="136"/>
+			<param name="input_buffer_entries_per_vc" value="2"/><!-- VCs within the same port share input buffers whose size is proportional to the number of VCs -->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.6"/>
+		</component>
+		
+<!--**********************************************************************-->
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="32"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for the base parameters of the memory controller;
+			improvements on the MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="200"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="4"/>
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="128"/>
+			<param name="addressbus_width" value="51"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="33333"/>
+			<stat name="memory_reads" value="16667"/>
+			<stat name="memory_writes" value="16667"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Finer-grained tracking can easily be added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+			the average power per NIC or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!-- Per-controller sustainable peak rate, MB/s -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Ratio of total achieved load to total achievable bandwidth -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+			the average power per flash controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/gpgpu_sim.verify b/src/gpuwattch/gpgpu_sim.verify
new file mode 100644
index 000000000..3b326563d
--- /dev/null
+++ b/src/gpuwattch/gpgpu_sim.verify
@@ -0,0 +1,3 @@
+/********************** GPGPU-Sim Verification File **********************/
+This file ensures that the GPGPU-Sim version of McPAT is included.
+/*************************************************************************/
diff --git a/src/gpuwattch/gpgpu_sim_wrapper.cc b/src/gpuwattch/gpgpu_sim_wrapper.cc
new file mode 100644
index 000000000..fff123170
--- /dev/null
+++ b/src/gpuwattch/gpgpu_sim_wrapper.cc
@@ -0,0 +1,870 @@
+// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy,
+// The University of British Columbia
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution. Neither the name of
+// The University of British Columbia nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#include "gpgpu_sim_wrapper.h"
+#include <sys/stat.h>
+#define SP_BASE_POWER 0
+#define SFU_BASE_POWER 0
+
+static const char *pwr_cmp_label[] = { // header labels, same order as pwr_cmp_t
+    "IBP,", "ICP,",  "DCP,",   "TCP,",   "CCP,",        "SHRDP,",
+    "RFP,", "SPP,",  "SFUP,",  "FPUP,",  "SCHEDP,",     "L2CP,",
+    "MCP,", "NOCP,", "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONST_DYNAMICP"};
+
+enum pwr_cmp_t { // indices into the per-component power vectors
+  IBP = 0,
+  ICP,
+  DCP,
+  TCP,
+  CCP,
+  SHRDP,
+  RFP,
+  SPP,
+  SFUP,
+  FPUP,
+  SCHEDP,
+  L2CP,
+  MCP,
+  NOCP,
+  DRAMP,
+  PIPEP,
+  IDLE_COREP,
+  CONST_DYNAMICP,
+  NUM_COMPONENTS_MODELLED // entry count; the constructor sizes the vectors with it
+};
+
+// Puts every member into a known default state, parses the XML configuration
+// when power simulation is enabled, and builds the Processor from it.
+gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled,
+                                     char *xmlfile) {
+  kernel_sample_count = 0;
+  total_sample_count = 0;
+  kernel_tot_power = 0;
+  num_pwr_cmps = NUM_COMPONENTS_MODELLED;
+  num_perf_counters = NUM_PERFORMANCE_COUNTERS;
+
+  // Initialize per-component counter/power vectors
+  avg_max_min_counters<double> init;
+  kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init);
+  kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init);
+
+  kernel_power = init;  // Per-kernel powers
+  gpu_tot_power = init; // Global powers
+
+  sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0);
+
+  sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 0);
+  initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0);
+  effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0);
+
+  const_dynamic_power = 0;
+  proc_power = 0;
+
+  // File names and tracing options start unset/disabled; init_mcpat() assigns
+  // the real values on its first call.
+  g_power_filename = NULL;
+  g_power_trace_filename = NULL;
+  g_metric_trace_filename = NULL;
+  g_steady_state_tracking_filename = NULL;
+  xml_filename = xmlfile;
+  g_power_simulation_enabled = power_simulation_enabled;
+  g_power_trace_enabled = false;
+  g_steady_power_levels_enabled = false;
+  g_power_trace_zlevel = 0;
+  g_power_per_cycle_dump = false;
+  gpu_steady_power_deviation = 0;
+  gpu_steady_min_period = 0;
+  gpu_stat_sample_freq = 0;
+  p = new ParseXML();
+  if (g_power_simulation_enabled) {
+    p->parse(xml_filename);
+  }
+  proc = new Processor(p); // built even when the XML file was not parsed
+  power_trace_file = NULL;
+  metric_trace_file = NULL;
+  steady_state_tacking_file = NULL;
+  has_written_avg = false;
+  init_inst_val = false;
+}
+
+gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} // NOTE(review): p/proc not freed
+
+bool gpgpu_sim_wrapper::sanity_check(double a, double b) {
+  if (b == 0)
+    return (abs(a - b) < 0.00001);
+  else
+    return (abs(a - b) / abs(b) < 0.00001);
+
+  return false;
+}
+// Prepares the wrapper for the next kernel. reset_counters() runs on every
+// call; the option capture, trace-file header output and power-file setup run
+// only on the first call (guarded by the function-local static mcpat_init).
+void gpgpu_sim_wrapper::init_mcpat(
+    char *xmlfile, char *powerfilename, char *power_trace_filename,
+    char *metric_trace_filename, char *steady_state_filename,
+    bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled,
+    bool power_per_cycle_dump, double steady_power_deviation,
+    double steady_min_period, int zlevel, double init_val,
+    int stat_sample_freq) {
+  reset_counters();
+  static bool mcpat_init = true;
+
+  // Rewrite the ctime() text in place: separators become '-' and the string
+  // ends at the first newline. NOTE(review): `date` is not used afterwards.
+  time_t curr_time;
+  time(&curr_time);
+  char *date = ctime(&curr_time);
+  char *s = date;
+  while (s != NULL && *s) { // ctime() may return NULL on failure
+    if (*s == ' ' || *s == '\t' || *s == ':')
+      *s = '-';
+    if (*s == '\n' || *s == '\r')
+      *s = 0;
+    s++;
+  }
+
+  if (mcpat_init) {
+    g_power_filename = powerfilename;
+    g_power_trace_filename = power_trace_filename;
+    g_metric_trace_filename = metric_trace_filename;
+    g_steady_state_tracking_filename = steady_state_filename;
+    xml_filename = xmlfile;
+    g_power_simulation_enabled = power_sim_enabled;
+    g_power_trace_enabled = trace_enabled;
+    g_steady_power_levels_enabled = steady_state_enabled;
+    g_power_trace_zlevel = zlevel;
+    g_power_per_cycle_dump = power_per_cycle_dump;
+    gpu_steady_power_deviation = steady_power_deviation;
+    gpu_steady_min_period = steady_min_period;
+    gpu_stat_sample_freq = stat_sample_freq;
+
+    // p->sys.total_cycles=gpu_stat_sample_freq*4;
+    p->sys.total_cycles = gpu_stat_sample_freq;
+    power_trace_file = NULL;
+    metric_trace_file = NULL;
+    steady_state_tacking_file = NULL;
+
+    if (g_power_trace_enabled) {
+      power_trace_file = gzopen(g_power_trace_filename, "w");
+      metric_trace_file = gzopen(g_metric_trace_filename, "w");
+      if ((power_trace_file == NULL) || (metric_trace_file == NULL)) {
+        printf("error - could not open trace files \n");
+        exit(1);
+      }
+      gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY);
+      // Labels are passed as "%s" arguments, never as the format string.
+      gzprintf(power_trace_file, "power,");
+      for (unsigned i = 0; i < num_pwr_cmps; i++) {
+        gzprintf(power_trace_file, "%s", pwr_cmp_label[i]);
+      }
+      gzprintf(power_trace_file, "\n");
+
+      gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY);
+      for (unsigned i = 0; i < num_perf_counters; i++) {
+        gzprintf(metric_trace_file, "%s", perf_count_label[i]);
+      }
+      gzprintf(metric_trace_file, "\n");
+      // Only the header rows are written here, so both files are closed again.
+      gzclose(power_trace_file);
+      gzclose(metric_trace_file);
+    }
+    if (g_steady_power_levels_enabled) {
+      steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w");
+      if (steady_state_tacking_file == NULL) {
+        printf("error - could not open trace files \n");
+        exit(1);
+      }
+      gzsetparams(steady_state_tacking_file, g_power_trace_zlevel,
+                  Z_DEFAULT_STRATEGY);
+      gzprintf(steady_state_tacking_file, "start,end,power,IPC,");
+      for (unsigned i = 0; i < num_perf_counters; i++) {
+        gzprintf(steady_state_tacking_file, "%s", perf_count_label[i]);
+      }
+      gzprintf(steady_state_tacking_file, "\n");
+      gzclose(steady_state_tacking_file);
+    }
+
+    mcpat_init = false;
+    has_written_avg = false;
+    powerfile.open(g_power_filename);
+    int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+    assert(flg == 0);
+  }
+  sample_val = 0;
+  init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn;
+}
+
+// Zeroes the current sample and returns all per-kernel statistics to a freshly
+// constructed avg_max_min_counters value.
+void gpgpu_sim_wrapper::reset_counters() {
+  avg_max_min_counters<double> cleared;
+
+  kernel_power = cleared;
+  kernel_tot_power = 0;
+  kernel_sample_count = 0;
+  // Per-performance-counter entries
+  for (unsigned c = 0; c < num_perf_counters; ++c) {
+    kernel_cmp_perf_counters[c] = cleared;
+    sample_perf_counters[c] = 0;
+  }
+  // Per-power-component entries
+  for (unsigned c = 0; c < num_pwr_cmps; ++c) {
+    kernel_cmp_pwr[c] = cleared;
+    sample_cmp_pwr[c] = 0;
+  }
+}
+
+// Core 0 cycle/instruction inputs; int and fp counts both scale by FP_INT.
+void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles,
+                                       double busy_cycles, double tot_inst,
+                                       double int_inst, double fp_inst,
+                                       double load_inst, double store_inst,
+                                       double committed_inst) {
+  sample_perf_counters[TOT_INST] = tot_inst;
+  sample_perf_counters[FP_INT] = int_inst + fp_inst;
+  const double scaled_tot = tot_inst * p->sys.scaling_coefficients[TOT_INST];
+  const double scaled_int = int_inst * p->sys.scaling_coefficients[FP_INT];
+  p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes;
+  p->sys.core[0].total_cycles = tot_cycles;
+  p->sys.core[0].busy_cycles = busy_cycles;
+  p->sys.core[0].total_instructions = scaled_tot;
+  p->sys.core[0].int_instructions = scaled_int;
+  p->sys.core[0].fp_instructions = fp_inst * p->sys.scaling_coefficients[FP_INT];
+  p->sys.core[0].load_instructions = load_inst;
+  p->sys.core[0].store_instructions = store_inst;
+  p->sys.core[0].committed_instructions = committed_inst;
+}
+
+void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes,
+                                          double ops) {
+  sample_perf_counters[REG_RD] = reads; // raw counts kept for the sample
+  sample_perf_counters[REG_WR] = writes;
+  sample_perf_counters[NON_REG_OPs] = ops;
+  const double scaled_rd = reads * p->sys.scaling_coefficients[REG_RD];
+  const double scaled_wr = writes * p->sys.scaling_coefficients[REG_WR];
+  const double scaled_ops = ops * p->sys.scaling_coefficients[NON_REG_OPs];
+  p->sys.core[0].int_regfile_reads = scaled_rd; // scaled counts go to core 0
+  p->sys.core[0].int_regfile_writes = scaled_wr;
+  p->sys.core[0].non_rf_operands = scaled_ops;
+}
+
+// icache counts for core 0: read accesses = scaled hits + scaled misses.
+void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) {
+  const double scaled_hits = hits * p->sys.scaling_coefficients[IC_H];
+  const double scaled_misses = misses * p->sys.scaling_coefficients[IC_M];
+  sample_perf_counters[IC_H] = hits;
+  sample_perf_counters[IC_M] = misses;
+  p->sys.core[0].icache.read_accesses = scaled_hits + scaled_misses;
+  p->sys.core[0].icache.read_misses = scaled_misses;
+}
+
+// ccache counts for core 0: read accesses = scaled hits + scaled misses.
+void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) {
+  // TODO: coalescing logic is counted as part of the caches power (this is not
+  // valid for no-caches architectures)
+  const double scaled_hits = hits * p->sys.scaling_coefficients[CC_H];
+  const double scaled_misses = misses * p->sys.scaling_coefficients[CC_M];
+  sample_perf_counters[CC_H] = hits;
+  sample_perf_counters[CC_M] = misses;
+  p->sys.core[0].ccache.read_accesses = scaled_hits + scaled_misses;
+  p->sys.core[0].ccache.read_misses = scaled_misses;
+}
+
+// tcache counts for core 0: read accesses = scaled hits + scaled misses.
+void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) {
+  // TODO: coalescing logic is counted as part of the caches power (this is not
+  // valid for no-caches architectures)
+  const double scaled_hits = hits * p->sys.scaling_coefficients[TC_H];
+  const double scaled_misses = misses * p->sys.scaling_coefficients[TC_M];
+  sample_perf_counters[TC_H] = hits;
+  sample_perf_counters[TC_M] = misses;
+  p->sys.core[0].tcache.read_accesses = scaled_hits + scaled_misses;
+  p->sys.core[0].tcache.read_misses = scaled_misses;
+}
+
+void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) {
+  sample_perf_counters[SHRD_ACC] = accesses; // raw count kept for the sample
+  const double scaled = accesses * p->sys.scaling_coefficients[SHRD_ACC];
+  p->sys.core[0].sharedmemory.read_accesses = scaled; // recorded as reads
+}
+
+// dcache counts for core 0. Read/write accesses are the sum of the scaled
+// hits and misses; the unscaled inputs are kept in the sample counters.
+void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses,
+                                          double write_hits,
+                                          double write_misses) {
+  // TODO: coalescing logic is counted as part of the caches power (this is not
+  // valid for no-caches architectures)
+  const double rd_hit = read_hits * p->sys.scaling_coefficients[DC_RH];
+  const double rd_miss = read_misses * p->sys.scaling_coefficients[DC_RM];
+  const double wr_hit = write_hits * p->sys.scaling_coefficients[DC_WH];
+  const double wr_miss = write_misses * p->sys.scaling_coefficients[DC_WM];
+  sample_perf_counters[DC_RH] = read_hits;
+  sample_perf_counters[DC_RM] = read_misses;
+  sample_perf_counters[DC_WH] = write_hits;
+  sample_perf_counters[DC_WM] = write_misses;
+  p->sys.core[0].dcache.read_accesses = rd_hit + rd_miss;
+  p->sys.core[0].dcache.read_misses = rd_miss;
+  p->sys.core[0].dcache.write_accesses = wr_hit + wr_miss;
+  p->sys.core[0].dcache.write_misses = wr_miss;
+}
+
+// L2 counts: each input is scaled once, then combined into the access totals.
+void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses,
+                                          double write_hits,
+                                          double write_misses) {
+  const double rd_hit = read_hits * p->sys.scaling_coefficients[L2_RH];
+  const double rd_miss = read_misses * p->sys.scaling_coefficients[L2_RM];
+  const double wr_hit = write_hits * p->sys.scaling_coefficients[L2_WH];
+  const double wr_miss = write_misses * p->sys.scaling_coefficients[L2_WM];
+  p->sys.l2.total_accesses = rd_hit + rd_miss + wr_hit + wr_miss;
+  p->sys.l2.read_accesses = rd_hit + rd_miss;
+  p->sys.l2.write_accesses = wr_hit + wr_miss;
+  p->sys.l2.read_hits = rd_hit;
+  p->sys.l2.read_misses = rd_miss;
+  p->sys.l2.write_hits = wr_hit;
+  p->sys.l2.write_misses = wr_miss;
+  sample_perf_counters[L2_RH] = read_hits;
+  sample_perf_counters[L2_RM] = read_misses;
+  sample_perf_counters[L2_WH] = write_hits;
+  sample_perf_counters[L2_WM] = write_misses;
+}
+
+void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) {
+  sample_perf_counters[IDLE_CORE_N] = num_idle_core; // kept for the sample
+  p->sys.num_idle_cores = num_idle_core;             // stored unscaled
+}
+
+void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) {
+  sample_perf_counters[PIPE_A] = duty_cycle; // raw value kept for the sample
+  const double scaled = duty_cycle * p->sys.scaling_coefficients[PIPE_A];
+  p->sys.core[0].pipeline_duty_cycle = scaled;
+}
+
+void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes,
+                                           double dram_precharge) {
+  sample_perf_counters[MEM_PRE] = dram_precharge; // raw inputs for the sample
+  sample_perf_counters[MEM_WR] = writes;
+  sample_perf_counters[MEM_RD] = reads;
+  p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE];
+  p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR];
+  p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD];
+  p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] +
+                              writes * p->sys.scaling_coefficients[MEM_WR];
+}
+
+void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses,
+                                            double ialu_accesses,
+                                            double sfu_accesses) { // Records execution-unit access counts for the current sample.
+  p->sys.core[0].fpu_accesses =
+      fpu_accesses * p->sys.scaling_coefficients[FPU_ACC];
+  // Integer ALU (not present in Tesla); scaled with the SP_ACC coefficient
+  p->sys.core[0].ialu_accesses =
+      ialu_accesses * p->sys.scaling_coefficients[SP_ACC];
+  // SFU accesses are stored in the core's mul_accesses field
+  p->sys.core[0].mul_accesses =
+      sfu_accesses * p->sys.scaling_coefficients[SFU_ACC];
+
+  sample_perf_counters[SP_ACC] = ialu_accesses; // samples keep the unscaled inputs
+  sample_perf_counters[SFU_ACC] = sfu_accesses;
+  sample_perf_counters[FPU_ACC] = fpu_accesses;
+}
+
+void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane,
+                                               double sfu_avg_active_lane) { // Stores the average active-lane counts on core[0].
+  p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; // unscaled; no sample counter is written here
+  p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane;
+}
+
+void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_reads,
+                                      double noc_tot_writes) { // Records interconnect traffic for the current sample.
+  p->sys.NoC[0].total_accesses =
+      noc_tot_reads * p->sys.scaling_coefficients[NOC_A] +
+      noc_tot_writes * p->sys.scaling_coefficients[NOC_A]; // reads and writes share the NOC_A coefficient
+  sample_perf_counters[NOC_A] = noc_tot_reads + noc_tot_writes; // unscaled total
+}
+
+void gpgpu_sim_wrapper::power_metrics_calculations() { // Folds the current sample into the per-kernel and global power statistics.
+  total_sample_count++;
+  kernel_sample_count++;
+
+  // Current sample power: processor dynamic power plus the constant-dynamic
+  double sample_power =
+      proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONST_DYNAMICP];
+
+  // Average power
+  // Previous + new + constant dynamic power (e.g., dynamic clocking power)
+  kernel_tot_power += sample_power;
+  kernel_power.avg = kernel_tot_power / kernel_sample_count;
+  for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) {
+    kernel_cmp_pwr[ind].avg += (double)sample_cmp_pwr[ind]; // running sum; divided by the sample count when printed
+  }
+
+  for (unsigned ind = 0; ind < num_perf_counters; ++ind) {
+    kernel_cmp_perf_counters[ind].avg += (double)sample_perf_counters[ind]; // running sum as well
+  }
+
+  // Max Power: on a new peak, snapshot every component and counter of this sample
+  if (sample_power > kernel_power.max) {
+    kernel_power.max = sample_power;
+    for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) {
+      kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind];
+    }
+    for (unsigned ind = 0; ind < num_perf_counters; ++ind) {
+      kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind];
+    }
+  }
+
+  // Min Power: a stored minimum of exactly 0 is treated as "not set yet"
+  if (sample_power < kernel_power.min || (kernel_power.min == 0)) {
+    kernel_power.min = sample_power;
+    for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) {
+      kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind];
+    }
+    for (unsigned ind = 0; ind < num_perf_counters; ++ind) {
+      kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind];
+    }
+  }
+
+  gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); // running sum; divided by total_sample_count when printed
+  gpu_tot_power.max =
+      (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max;
+  gpu_tot_power.min =
+      ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0))
+          ? sample_power
+          : gpu_tot_power.min; // same "0 means unset" rule as the kernel minimum
+}
+
+void gpgpu_sim_wrapper::print_trace_files() {
+  // Appends one CSV row of raw counters to the metric trace and one CSV row
+  // (total power, then each component) to the power trace. open_files() opens
+  // nothing when tracing is off, so return early instead of writing through
+  // handles that were never opened or were already closed.
+  if (!(g_power_simulation_enabled && g_power_trace_enabled)) return;
+  open_files();
+  for (unsigned i = 0; i < num_perf_counters; ++i) {
+    gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]);
+  }
+  gzprintf(metric_trace_file, "\n");
+  gzprintf(power_trace_file, "%f,", proc_power);
+  for (unsigned i = 0; i < num_pwr_cmps; ++i) {
+    gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]);
+  }
+  gzprintf(power_trace_file, "\n"); close_files();
+}
+
+void gpgpu_sim_wrapper::update_coefficients() {
+  // Refreshes initpower_coeff (values read from proc) and effpower_coeff (the same values times the XML scaling coefficients).
+  initpower_coeff[FP_INT] = proc->cores[0]->get_coefficient_fpint_insts();
+  effpower_coeff[FP_INT] =
+      initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT];
+
+  initpower_coeff[TOT_INST] = proc->cores[0]->get_coefficient_tot_insts();
+  effpower_coeff[TOT_INST] =
+      initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST];
+
+  initpower_coeff[REG_RD] =
+      proc->cores[0]->get_coefficient_regreads_accesses() *
+      (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); // register-file terms carry the rf_fu/core clock ratio
+  initpower_coeff[REG_WR] =
+      proc->cores[0]->get_coefficient_regwrites_accesses() *
+      (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate);
+  initpower_coeff[NON_REG_OPs] =
+      proc->cores[0]->get_coefficient_noregfileops_accesses() *
+      (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate);
+  effpower_coeff[REG_RD] =
+      initpower_coeff[REG_RD] * p->sys.scaling_coefficients[REG_RD];
+  effpower_coeff[REG_WR] =
+      initpower_coeff[REG_WR] * p->sys.scaling_coefficients[REG_WR];
+  effpower_coeff[NON_REG_OPs] =
+      initpower_coeff[NON_REG_OPs] * p->sys.scaling_coefficients[NON_REG_OPs];
+
+  initpower_coeff[IC_H] = proc->cores[0]->get_coefficient_icache_hits();
+  initpower_coeff[IC_M] = proc->cores[0]->get_coefficient_icache_misses();
+  effpower_coeff[IC_H] =
+      initpower_coeff[IC_H] * p->sys.scaling_coefficients[IC_H];
+  effpower_coeff[IC_M] =
+      initpower_coeff[IC_M] * p->sys.scaling_coefficients[IC_M];
+
+  initpower_coeff[CC_H] = (proc->cores[0]->get_coefficient_ccache_readhits() +
+                           proc->get_coefficient_readcoalescing()); // cache read terms add the read-coalescing coefficient
+  initpower_coeff[CC_M] = (proc->cores[0]->get_coefficient_ccache_readmisses() +
+                           proc->get_coefficient_readcoalescing());
+  effpower_coeff[CC_H] =
+      initpower_coeff[CC_H] * p->sys.scaling_coefficients[CC_H];
+  effpower_coeff[CC_M] =
+      initpower_coeff[CC_M] * p->sys.scaling_coefficients[CC_M];
+
+  initpower_coeff[TC_H] = (proc->cores[0]->get_coefficient_tcache_readhits() +
+                           proc->get_coefficient_readcoalescing());
+  initpower_coeff[TC_M] = (proc->cores[0]->get_coefficient_tcache_readmisses() +
+                           proc->get_coefficient_readcoalescing());
+  effpower_coeff[TC_H] =
+      initpower_coeff[TC_H] * p->sys.scaling_coefficients[TC_H];
+  effpower_coeff[TC_M] =
+      initpower_coeff[TC_M] * p->sys.scaling_coefficients[TC_M];
+
+  initpower_coeff[SHRD_ACC] =
+      proc->cores[0]->get_coefficient_sharedmemory_readhits();
+  effpower_coeff[SHRD_ACC] =
+      initpower_coeff[SHRD_ACC] * p->sys.scaling_coefficients[SHRD_ACC];
+
+  initpower_coeff[DC_RH] = (proc->cores[0]->get_coefficient_dcache_readhits() +
+                            proc->get_coefficient_readcoalescing());
+  initpower_coeff[DC_RM] =
+      (proc->cores[0]->get_coefficient_dcache_readmisses() +
+       proc->get_coefficient_readcoalescing());
+  initpower_coeff[DC_WH] = (proc->cores[0]->get_coefficient_dcache_writehits() +
+                            proc->get_coefficient_writecoalescing()); // data-cache write terms add the write-coalescing coefficient
+  initpower_coeff[DC_WM] =
+      (proc->cores[0]->get_coefficient_dcache_writemisses() +
+       proc->get_coefficient_writecoalescing());
+  effpower_coeff[DC_RH] =
+      initpower_coeff[DC_RH] * p->sys.scaling_coefficients[DC_RH];
+  effpower_coeff[DC_RM] =
+      initpower_coeff[DC_RM] * p->sys.scaling_coefficients[DC_RM];
+  effpower_coeff[DC_WH] =
+      initpower_coeff[DC_WH] * p->sys.scaling_coefficients[DC_WH];
+  effpower_coeff[DC_WM] =
+      initpower_coeff[DC_WM] * p->sys.scaling_coefficients[DC_WM];
+
+  initpower_coeff[L2_RH] = proc->get_coefficient_l2_read_hits();
+  initpower_coeff[L2_RM] = proc->get_coefficient_l2_read_misses();
+  initpower_coeff[L2_WH] = proc->get_coefficient_l2_write_hits();
+  initpower_coeff[L2_WM] = proc->get_coefficient_l2_write_misses();
+  effpower_coeff[L2_RH] =
+      initpower_coeff[L2_RH] * p->sys.scaling_coefficients[L2_RH];
+  effpower_coeff[L2_RM] =
+      initpower_coeff[L2_RM] * p->sys.scaling_coefficients[L2_RM];
+  effpower_coeff[L2_WH] =
+      initpower_coeff[L2_WH] * p->sys.scaling_coefficients[L2_WH];
+  effpower_coeff[L2_WM] =
+      initpower_coeff[L2_WM] * p->sys.scaling_coefficients[L2_WM];
+
+  initpower_coeff[IDLE_CORE_N] =
+      p->sys.idle_core_power * proc->cores[0]->executionTime; // pre-multiplied; the final loop divides by executionTime again
+  effpower_coeff[IDLE_CORE_N] =
+      initpower_coeff[IDLE_CORE_N] * p->sys.scaling_coefficients[IDLE_CORE_N];
+
+  initpower_coeff[PIPE_A] = proc->cores[0]->get_coefficient_duty_cycle();
+  effpower_coeff[PIPE_A] =
+      initpower_coeff[PIPE_A] * p->sys.scaling_coefficients[PIPE_A];
+
+  initpower_coeff[MEM_RD] = proc->get_coefficient_mem_reads();
+  initpower_coeff[MEM_WR] = proc->get_coefficient_mem_writes();
+  initpower_coeff[MEM_PRE] = proc->get_coefficient_mem_pre();
+  effpower_coeff[MEM_RD] =
+      initpower_coeff[MEM_RD] * p->sys.scaling_coefficients[MEM_RD];
+  effpower_coeff[MEM_WR] =
+      initpower_coeff[MEM_WR] * p->sys.scaling_coefficients[MEM_WR];
+  effpower_coeff[MEM_PRE] =
+      initpower_coeff[MEM_PRE] * p->sys.scaling_coefficients[MEM_PRE];
+
+  initpower_coeff[SP_ACC] =
+      proc->cores[0]->get_coefficient_ialu_accesses() *
+      (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate);
+  // SFU and FPU coefficients are taken as returned; no clock-rate ratio is applied to them.
+  initpower_coeff[SFU_ACC] = proc->cores[0]->get_coefficient_sfu_accesses();
+  initpower_coeff[FPU_ACC] = proc->cores[0]->get_coefficient_fpu_accesses();
+
+  effpower_coeff[SP_ACC] =
+      initpower_coeff[SP_ACC] * p->sys.scaling_coefficients[SP_ACC];
+  effpower_coeff[SFU_ACC] =
+      initpower_coeff[SFU_ACC] * p->sys.scaling_coefficients[SFU_ACC];
+  effpower_coeff[FPU_ACC] =
+      initpower_coeff[FPU_ACC] * p->sys.scaling_coefficients[FPU_ACC];
+
+  initpower_coeff[NOC_A] = proc->get_coefficient_noc_accesses();
+  effpower_coeff[NOC_A] =
+      initpower_coeff[NOC_A] * p->sys.scaling_coefficients[NOC_A];
+
+  const_dynamic_power =
+      proc->get_const_dynamic_power() / (proc->cores[0]->executionTime);
+
+  for (unsigned i = 0; i < num_perf_counters; i++) { // every coefficient is divided by the core-0 execution time
+    initpower_coeff[i] /= (proc->cores[0]->executionTime);
+    effpower_coeff[i] /= (proc->cores[0]->executionTime);
+  }
+}
+
+void gpgpu_sim_wrapper::update_components_power() {
+  // Refreshes the coefficients, then fills sample_cmp_pwr with each component's dynamic value divided by the core-0 execution time.
+  update_coefficients();
+
+  proc_power = proc->rt_power.readOp.dynamic;
+
+  sample_cmp_pwr[IBP] =
+      (proc->cores[0]->ifu->IB->rt_power.readOp.dynamic +
+       proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic +
+       proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic +
+       proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic +
+       proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic) /
+      (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[ICP] = proc->cores[0]->ifu->icache.rt_power.readOp.dynamic /
+                        (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[DCP] = proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic /
+                        (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[TCP] = proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic /
+                        (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[CCP] = proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic /
+                        (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[SHRDP] =
+      proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic /
+      (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[RFP] =
+      (proc->cores[0]->exu->rfu->rt_power.readOp.dynamic /
+       (proc->cores[0]->executionTime)) *
+      (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); // additionally scaled by the rf_fu/core clock ratio
+
+  sample_cmp_pwr[SPP] =
+      (proc->cores[0]->exu->exeu->rt_power.readOp.dynamic /
+       (proc->cores[0]->executionTime)) *
+      (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); // same clock-ratio scaling as RFP
+
+  sample_cmp_pwr[SFUP] = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic /
+                          (proc->cores[0]->executionTime));
+
+  sample_cmp_pwr[FPUP] = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic /
+                          (proc->cores[0]->executionTime));
+
+  sample_cmp_pwr[SCHEDP] = proc->cores[0]->exu->scheu->rt_power.readOp.dynamic /
+                           (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[L2CP] = (proc->XML->sys.number_of_L2s > 0)
+                             ? proc->l2array[0]->rt_power.readOp.dynamic /
+                                   (proc->cores[0]->executionTime)
+                             : 0; // reported as 0 when the XML declares no L2
+
+  sample_cmp_pwr[MCP] = (proc->mc->rt_power.readOp.dynamic -
+                         proc->mc->dram->rt_power.readOp.dynamic) /
+                        (proc->cores[0]->executionTime); // DRAM share is subtracted here and reported separately as DRAMP
+
+  sample_cmp_pwr[NOCP] =
+      proc->nocs[0]->rt_power.readOp.dynamic / (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[DRAMP] =
+      proc->mc->dram->rt_power.readOp.dynamic / (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[PIPEP] =
+      proc->cores[0]->Pipeline_energy / (proc->cores[0]->executionTime);
+
+  sample_cmp_pwr[IDLE_COREP] =
+      proc->cores[0]->IdleCoreEnergy / (proc->cores[0]->executionTime);
+
+  // This constant dynamic power (e.g., clock power) part is estimated via
+  // regression model.
+  sample_cmp_pwr[CONST_DYNAMICP] = 0;
+  double cnst_dyn =
+      proc->get_const_dynamic_power() / (proc->cores[0]->executionTime);
+  // If the regression scaling term is greater than the recorded constant
+  // dynamic power then use the difference (other portion already added to
+  // dynamic power). Else, all the constant dynamic power is accounted for, add
+  // nothing.
+  if (p->sys.scaling_coefficients[CONST_DYNAMICN] > cnst_dyn)
+    sample_cmp_pwr[CONST_DYNAMICP] =
+        (p->sys.scaling_coefficients[CONST_DYNAMICN] - cnst_dyn);
+
+  proc_power += sample_cmp_pwr[CONST_DYNAMICP];
+
+  double sum_pwr_cmp = 0;
+  for (unsigned i = 0; i < num_pwr_cmps; i++) {
+    sum_pwr_cmp += sample_cmp_pwr[i];
+  }
+  bool check = false;
+  check = sanity_check(sum_pwr_cmp, proc_power); // component sum is compared against the total
+  assert("Total Power does not equal the sum of the components\n" && (check)); // fires when sanity_check returns false
+}
+
+void gpgpu_sim_wrapper::compute() { proc->compute(); } // Thin forwarder to the wrapped Processor's compute().
+void gpgpu_sim_wrapper::print_power_kernel_stats(
+    double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value,
+    const std::string &kernel_info_string, bool print_trace) { // Writes the per-kernel power report; the two cycle arguments are not used in this body.
+  detect_print_steady_state(1, init_value); // position 1 flushes the pending steady-state window
+  if (g_power_simulation_enabled) {
+    // Report layout: kernel header, average/max/min blocks (components, then counters), then cumulative totals.
+    powerfile << kernel_info_string << std::endl;
+
+    sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); // result is not inspected here
+    powerfile << "Kernel Average Power Data:" << std::endl;
+    powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl;
+
+    for (unsigned i = 0; i < num_pwr_cmps; ++i) {
+      powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = "
+                << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; // .avg holds a running sum, so divide here
+    }
+    for (unsigned i = 0; i < num_perf_counters; ++i) {
+      powerfile << "gpu_avg_" << perf_count_label[i] << " = "
+                << kernel_cmp_perf_counters[i].avg / kernel_sample_count
+                << std::endl;
+    }
+
+    powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl;
+    powerfile << "kernel_max_power = " << kernel_power.max << std::endl;
+    for (unsigned i = 0; i < num_pwr_cmps; ++i) {
+      powerfile << "gpu_max_" << pwr_cmp_label[i] << " = "
+                << kernel_cmp_pwr[i].max << std::endl;
+    }
+    for (unsigned i = 0; i < num_perf_counters; ++i) {
+      powerfile << "gpu_max_" << perf_count_label[i] << " = "
+                << kernel_cmp_perf_counters[i].max << std::endl;
+    }
+
+    powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl;
+    powerfile << "kernel_min_power = " << kernel_power.min << std::endl;
+    for (unsigned i = 0; i < num_pwr_cmps; ++i) {
+      powerfile << "gpu_min_" << pwr_cmp_label[i] << " = "
+                << kernel_cmp_pwr[i].min << std::endl;
+    }
+    for (unsigned i = 0; i < num_perf_counters; ++i) {
+      powerfile << "gpu_min_" << perf_count_label[i] << " = "
+                << kernel_cmp_perf_counters[i].min << std::endl;
+    }
+
+    powerfile << std::endl
+              << "Accumulative Power Statistics Over Previous Kernels:"
+              << std::endl;
+    powerfile << "gpu_tot_avg_power = "
+              << gpu_tot_power.avg / total_sample_count << std::endl; // running sum divided by the total sample count
+    powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl;
+    powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl;
+    powerfile << std::endl << std::endl;
+    powerfile.flush();
+
+    if (print_trace) {
+      print_trace_files();
+    }
+  }
+}
+void gpgpu_sim_wrapper::dump() { // Prints the processor's energy report when per-cycle dumping is enabled.
+  if (g_power_per_cycle_dump)
+    proc->displayEnergy(2, 5); // NOTE(review): meaning of the 2 and 5 arguments is defined by Processor::displayEnergy - confirm there
+}
+
+void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { // Emits the current steady-state window (if long enough) and resets the window.
+  double temp_avg = sample_val / (double)samples.size(); // mean of the accumulated power samples
+  double temp_ipc = (init_val - init_inst_val) /
+                    (double)(samples.size() * gpu_stat_sample_freq); // change in init_val per sampled cycle over the window
+
+  if ((samples.size() >
+       gpu_steady_min_period)) { // If steady state occurred for some time,
+                                 // print to file
+    has_written_avg = true;
+    gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start,
+             total_sample_count, temp_avg, temp_ipc);
+    for (unsigned i = 0; i < num_perf_counters; ++i) {
+      gzprintf(steady_state_tacking_file, "%f,",
+               samples_counter.at(i) / ((double)samples.size())); // per-counter mean over the window
+    }
+    gzprintf(steady_state_tacking_file, "\n");
+  } else {
+    if (!has_written_avg && position) // only for a non-zero position, and only if no average was ever written
+      gzprintf(steady_state_tacking_file,
+               "ERROR! Not enough steady state points to generate average\n");
+  }
+
+  sample_start = 0; // reset the window; init_val becomes the new baseline
+  sample_val = 0;
+  init_inst_val = init_val;
+  samples.clear();
+  samples_counter.clear();
+  pwr_counter.clear();
+  assert(samples.size() == 0);
+}
+
+void gpgpu_sim_wrapper::detect_print_steady_state(int position,
+                                                  double init_val) { // position 0: fold in the current sample; otherwise: flush the window.
+  // Steady-state tracking; does nothing unless both feature flags are set
+  if (g_power_simulation_enabled && g_steady_power_levels_enabled) {
+    steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); // reopened in append mode on every call
+    if (position == 0) {
+      if (samples.size() == 0) {
+        // First sample
+        sample_start = total_sample_count;
+        sample_val = proc->rt_power.readOp.dynamic;
+        init_inst_val = init_val;
+        samples.push_back(proc->rt_power.readOp.dynamic);
+        assert(samples_counter.size() == 0);
+        assert(pwr_counter.size() == 0);
+
+        for (unsigned i = 0; i < (num_perf_counters); ++i) {
+          samples_counter.push_back(sample_perf_counters[i]);
+        }
+
+        for (unsigned i = 0; i < (num_pwr_cmps); ++i) {
+          pwr_counter.push_back(sample_cmp_pwr[i]);
+        }
+        assert(pwr_counter.size() == (double)num_pwr_cmps);
+        assert(samples_counter.size() == (double)num_perf_counters);
+      } else {
+        // Get current average
+        double temp_avg = sample_val / (double)samples.size();
+        double deviation = proc->rt_power.readOp.dynamic - temp_avg; // kept in double: unqualified abs() may pick the int overload
+        if ((deviation < 0 ? -deviation : deviation) <
+            gpu_steady_power_deviation) { // Value is within threshold
+          sample_val += proc->rt_power.readOp.dynamic;
+          samples.push_back(proc->rt_power.readOp.dynamic);
+          for (unsigned i = 0; i < (num_perf_counters); ++i) {
+            samples_counter.at(i) += sample_perf_counters[i];
+          }
+
+          for (unsigned i = 0; i < (num_pwr_cmps); ++i) {
+            pwr_counter.at(i) += sample_cmp_pwr[i];
+          }
+
+        } else { // Value exceeds threshold, not considered steady state
+          print_steady_state(position, init_val);
+        }
+      }
+    } else {
+      print_steady_state(position, init_val);
+    }
+    gzclose(steady_state_tacking_file);
+  }
+}
+
+void gpgpu_sim_wrapper::open_files() { // Opens both trace files in append mode; a no-op unless power simulation and tracing are enabled.
+  if (g_power_simulation_enabled) {
+    if (g_power_trace_enabled) {
+      power_trace_file = gzopen(g_power_trace_filename, "a"); // gzopen results are not checked here
+      metric_trace_file = gzopen(g_metric_trace_filename, "a");
+    }
+  }
+}
+void gpgpu_sim_wrapper::close_files() { // Closes both trace files under the same condition open_files() uses to open them.
+  if (g_power_simulation_enabled) {
+    if (g_power_trace_enabled) {
+      gzclose(power_trace_file); // the handles are not reset after closing
+      gzclose(metric_trace_file);
+    }
+  }
+}
diff --git a/src/gpuwattch/gpgpu_sim_wrapper.h b/src/gpuwattch/gpgpu_sim_wrapper.h
new file mode 100644
index 000000000..ea3945b54
--- /dev/null
+++ b/src/gpuwattch/gpgpu_sim_wrapper.h
@@ -0,0 +1,167 @@
+// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy,
+// The University of British Columbia
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// Redistributions of source code must retain the above copyright notice, this
+// list of conditions and the following disclaimer.
+// Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution. Neither the name of
+// The University of British Columbia nor the names of its contributors may be
+// used to endorse or promote products derived from this software without
+// specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GPGPU_SIM_WRAPPER_H_
+#define GPGPU_SIM_WRAPPER_H_
+
+#include "processor.h"
+#include <assert.h>
+#include <fstream>
+#include <iostream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <zlib.h>
+
+using namespace std;
+
+template <typename T> struct avg_max_min_counters { // Average/maximum/minimum triple for one tracked quantity.
+  T avg;
+  T max;
+  T min;
+
+  avg_max_min_counters() { // all three fields start at zero
+    avg = 0;
+    max = 0;
+    min = 0;
+  }
+};
+
+class gpgpu_sim_wrapper { // Bridges simulator activity counters to the power model held in proc and reports the resulting statistics.
+public:
+  gpgpu_sim_wrapper(bool power_simulation_enabled, char *xmlfile);
+  ~gpgpu_sim_wrapper();
+
+  void init_mcpat(char *xmlfile, char *powerfile, char *power_trace_file,
+                  char *metric_trace_file, char *steady_state_file,
+                  bool power_sim_enabled, bool trace_enabled,
+                  bool steady_state_enabled, bool power_per_cycle_dump,
+                  double steady_power_deviation, double steady_min_period,
+                  int zlevel, double init_val, int stat_sample_freq);
+  void detect_print_steady_state(int position, double init_val); // position 0 adds a sample; any other value flushes the window
+  void close_files();
+  void open_files();
+  void compute();
+  void dump();
+  void print_trace_files();
+  void update_components_power();
+  void update_coefficients();
+  void reset_counters();
+  void print_power_kernel_stats(double gpu_sim_cycle, double gpu_tot_sim_cycle,
+                                double init_value,
+                                const std::string &kernel_info_string,
+                                bool print_trace);
+  void power_metrics_calculations();
+  void set_inst_power(bool clk_gated_lanes, double tot_cycles,
+                      double busy_cycles, double tot_inst, double int_inst,
+                      double fp_inst, double load_inst, double store_inst,
+                      double committed_inst);
+  void set_regfile_power(double reads, double writes, double ops);
+  void set_icache_power(double accesses, double misses);
+  void set_ccache_power(double accesses, double misses);
+  void set_tcache_power(double accesses, double misses);
+  void set_shrd_mem_power(double accesses);
+  void set_l1cache_power(double read_accesses, double read_misses,
+                         double write_accesses, double write_misses);
+  void set_l2cache_power(double read_accesses, double read_misses,
+                         double write_accesses, double write_misses);
+  void set_idle_core_power(double num_idle_core);
+  void set_duty_cycle_power(double duty_cycle);
+  void set_mem_ctrl_power(double reads, double writes, double dram_precharge);
+  void set_exec_unit_power(double fpu_accesses, double ialu_accesses,
+                           double sfu_accesses);
+  void set_active_lanes_power(double sp_avg_active_lane,
+                              double sfu_avg_active_lane);
+  void set_NoC_power(double noc_tot_reads, double noc_tot_writes); // parameter names match the definition
+  bool sanity_check(double a, double b);
+
+private:
+  void print_steady_state(int position, double init_val);
+
+  Processor *proc;
+  ParseXML *p;
+  // power parameters
+  double const_dynamic_power;
+  double proc_power;
+
+  unsigned num_perf_counters; // # of performance counters
+  unsigned num_pwr_cmps;      // # of components modelled
+  int kernel_sample_count;    // # of samples per kernel
+  int total_sample_count;     // # of samples per benchmark
+
+  std::vector<avg_max_min_counters<double> >
+      kernel_cmp_pwr; // Per-kernel component power avg/max/min values
+  std::vector<avg_max_min_counters<double> >
+      kernel_cmp_perf_counters; // Per-kernel component avg/max/min performance
+                                // counters
+
+  double kernel_tot_power; // Total per-kernel power
+  avg_max_min_counters<double>
+      kernel_power; // Per-kernel power avg/max/min values
+  avg_max_min_counters<double>
+      gpu_tot_power; // Global GPU power avg/max/min values (across kernels)
+
+  bool has_written_avg; // set once a steady-state average row has been written
+
+  std::vector<double> sample_cmp_pwr; // Current sample component powers
+  std::vector<double>
+      sample_perf_counters; // Current sample component perf. counts
+  std::vector<double> initpower_coeff; // per-counter coefficients as read from proc
+  std::vector<double> effpower_coeff;  // initpower_coeff times the scaling coefficients
+
+  // For calculating steady-state average
+  unsigned sample_start;
+  double sample_val;
+  double init_inst_val;
+  std::vector<double> samples;
+  std::vector<double> samples_counter;
+  std::vector<double> pwr_counter;
+
+  char *xml_filename;
+  char *g_power_filename;
+  char *g_power_trace_filename;
+  char *g_metric_trace_filename;
+  char *g_steady_state_tracking_filename;
+  bool g_power_simulation_enabled;
+  bool g_steady_power_levels_enabled;
+  bool g_power_trace_enabled;
+  bool g_power_per_cycle_dump;
+  double gpu_steady_power_deviation;
+  double gpu_steady_min_period;
+  int g_power_trace_zlevel;
+  double gpu_stat_sample_frequency; // NOTE(review): not referenced in the visible part of the .cc - confirm it is still needed
+  int gpu_stat_sample_freq;
+
+  std::ofstream powerfile;
+  gzFile power_trace_file;
+  gzFile metric_trace_file;
+  gzFile steady_state_tacking_file;
+};
+
+#endif /* GPGPU_SIM_WRAPPER_H_ */
diff --git a/src/gpuwattch/gpgpu_static.xml b/src/gpuwattch/gpgpu_static.xml
new file mode 100644
index 000000000..befa95e96
--- /dev/null
+++ b/src/gpuwattch/gpgpu_static.xml
@@ -0,0 +1,492 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="number_of_cores" value="30"/>
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="0"/> <!-- This number means how many L2 clusters in each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/> 
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="0"/>
+		<param name="core_tech_node" value="65"/><!-- nm -->
+		<param name="target_core_clockrate" value="650"/><!--MHz -->
+		<param name="temperature" value="380"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="32"/>
+		<param name="virtual_address_width" value="32"/>
+		<param name="physical_address_width" value="32"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="total_cycles_match_mcpat"/>
+		<stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
+		<stat name="busy_cycles"  value="busy_cycles_match_mcpat"/>
+			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of
+			virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="650"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+			default value is machine_bits, if not set -->
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="32"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+			it may only be more than one in SMT processors. BTB ports always equal fetch ports since
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB once. -->
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of Issue window and other logic
+			as in the complexity effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- Current version of McPAT does not distinguish int and floating point pipelines.
+			These parameters are reserved for future use. -->
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="8,8"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="0"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="8"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="1"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="1"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="1"/>
+			<param name="fp_instruction_window_size" value="1"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="0"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+
+			<!-- SM parameters  Added by Syed Gilani -->
+			<param name="rf_banks" value="4"/>
+			<param name="simd_width" value="8"/>
+			<param name="collector_units" value="4"/>
+			<param name="core_clock_ratio" value="2"/>
+			<param name="warp_size" value="32"/>
+
+			<param name="archi_Regs_IRF_size" value="16384"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="32"/>
+			<param name="phy_Regs_FRF_size" value="32"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in-order (Pentium Pro) or out-of-order (OoO, Alpha).
+			They will always try to execute out-of-order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of load and store buffer
+			as well as the ports of Dcache which is connected to LSU -->
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="1"/>						
+			<!-- general stats, defines simulation periods; require total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="total_instructions_match_mcpat"/>
+			<stat name="int_instructions" value="int_instruction_match_mcpat"/>
+			<stat name="fp_instructions" value="flt_instruction_match_mcpat"/>
+			<stat name="branch_instructions" value="branch_instruction_match_mcpat"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="load_instruction_match_mcpat"/>
+			<stat name="store_instructions" value="store_instruction_match_mcpat"/>
+			<stat name="committed_instructions" value="total_instructions_match_mcpat"/>
+			<stat name="committed_int_instructions" value="int_instruction_match_mcpat"/>
+			<stat name="committed_fp_instructions" value="flt_instruction_match_mcpat"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogenous -->
+			<!-- the following cycle stats are used for heterogeneous cores only,
+				please ignore them if the cores are homogeneous -->
+			<stat name="total_cycles" value="total_cycles_match_mcpat"/>
+		    <stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
+		    <stat name="busy_cycles"  value="busy_cycles_match_mcpat"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats, both RS and Phy based OoOs have ROB;
+			the performance simulator should capture the difference on accesses,
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!--  RF accesses: each stat takes the placeholder of the same register type (int/float) and direction (read/write) -->
+			<stat name="int_regfile_reads" value="int_register_read_access_match_mcpat"/>
+			<stat name="float_regfile_reads" value="float_register_read_access_match_mcpat"/>
+			<stat name="int_regfile_writes" value="int_register_write_access_match_mcpat"/>
+			<stat name="float_regfile_writes" value="float_register_write_access_match_mcpat"/>
+			
+			<!-- The following stat is for operand collector power - Added by Syed -->
+			<stat name="non_rf_operands" value="0"/>
+
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="0"/>
+			<stat name="context_switches" value="0"/> <!--not used in the McPAT -->
+			<!-- Number of Windowes switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="ialu_accesses_match_mcpat"/>			
+			<stat name="fpu_accesses" value="fpu_accesses_match_mcpat"/>
+			<stat name="mul_accesses" value="mul_accesses_match_mcpat"/>
+			<stat name="cdb_alu_accesses" value="0"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  currently the performance simulator should
+			make sure all the numbers are final numbers,
+			including the explicit read/write accesses,
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="1"/><!-- FPU numbers are already average -->
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="1"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache although writes happen to it after a miss,
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="total_instructions_match_mcpat"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="1"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.ccache" name="ccache">
+			        <!-- all the buffer related are optional -->
+				<param name="ccache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="ccache_read_accesses_match_mcpat"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="ccache_read_misses_match_mcpat"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.tcache" name="tcache">
+			        <!-- all the buffer related are optional -->
+				<param name="tcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="tcache_read_accesses_match_mcpat"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="tcache_read_misses_match_mcpat"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+      <!--model the shared memory by mimicking dcache-->
+			<component id="system.core0.sharedmemory" name="sharedmemory">
+			        <!-- all the buffer related are optional -->
+				<param name="sharedmemory_config" value="16384,16,1,16,1,3,16,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="sharedmemory_read_access_match_mcpat"/>
+				<stat name="write_accesses" value="sharedmemory_write_access_match_mcpat"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="dcache_read_access_match_mcpat"/>
+				<stat name="write_accesses" value="dcache_write_access_match_mcpat"/>
+				<stat name="read_misses" value="dcache_read_miss_match_mcpat"/>
+				<stat name="write_misses" value="dcache_write_miss_match_mcpat"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types,
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+
+
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="650"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="4"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="7"/>
+			<param name="output_ports" value="7"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="32"/>
+			<param name="input_buffer_entries_per_vc" value="1"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.6"/>
+		</component>	
+<!--**********************************************************************-->
+<!--**********************************************************************-->
+
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="65"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank = 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- current version of McPAT uses published values for base parameters of memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="837"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="29566"/><!--MB/S  Syed: GTX 470 has 177.4GB/s mem transfer rate with 6 MCs -->
+			<param name="block_size" value="64"/><!--B-->
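+			<param name="number_mcs" value="8"/><!-- NOTE(review): value is 8, but the peak_transfer_rate note above assumes 6 GDDR5 memory controllers (GTX 470); confirm which is intended -->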
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="1"/>
+			<param name="number_ranks" value="2"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="32"/>
+			<param name="IO_buffer_size_per_channel" value="32"/>
+			<param name="databus_width" value="64"/>
+			<param name="addressbus_width" value="32"/>
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="memory_accesses_match_mcpat"/>
+			<stat name="memory_reads" value="memory_reads_match_mcpat"/>
+			<stat name="memory_writes" value="memory_writes_match_mcpat"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates
+			the average power per MC or per channel. This is sufficient for most applications.
+			Further trackdown can be easily added in later versions. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates
+			the average power per NIC or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates
+			the average power per fc or per channel. This is sufficient for most applications -->
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/interconnect.cc b/src/gpuwattch/interconnect.cc
new file mode 100644
index 000000000..5698c02a6
--- /dev/null
+++ b/src/gpuwattch/interconnect.cc
@@ -0,0 +1,191 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "interconnect.h"
+#include "globalvar.h"
+#include "wire.h"
+#include <assert.h>
+#include <iostream>
+
+interconnect::interconnect(string name_, enum Device_ty device_ty_,
+                           double base_w, double base_h, int data_w, double len,
+                           const InputParameter *configure_interface,
+                           int start_wiring_level_, bool pipelinable_,
+                           double route_over_perc_, bool opt_local_,
+                           enum Core_type core_ty_, enum Wire_type wire_model,
+                           double width_s, double space_s,
+                           TechnologyParameter::DeviceType *dt)
+    : name(name_), device_ty(device_ty_), in_rise_time(0), out_rise_time(0),
+      base_width(base_w), base_height(base_h), data_width(data_w),
+      wt(wire_model), width_scaling(width_s), space_scaling(space_s),
+      start_wiring_level(start_wiring_level_), length(len),
+      // interconnect_latency(1e-12),
+      // interconnect_throughput(1e-12),
+      opt_local(opt_local_), core_ty(core_ty_), pipelinable(pipelinable_),
+      route_over_perc(route_over_perc_), deviceType(dt) {
+  // Models one bit of a data_w-bit link of length len, then scales by data_w.
+  wt = Global; // NOTE: overrides the wire_model argument stored above
+  l_ip = *configure_interface; // dereferenced unconditionally: must not be NULL
+  local_result = init_interface(&l_ip);
+
+  max_unpipelined_link_delay = 0; // TODO
+  min_w_nmos = g_tp.min_w_nmos_;
+  min_w_pmos = deviceType->n_to_p_eff_curr_drv_ratio * min_w_nmos;
+
+  latency = l_ip.latency;
+  throughput = l_ip.throughput;
+  latency_overflow = false;
+  throughput_overflow = false;
+
+  /*
+   * TODO: Add wiring option from semi-global to global automatically
+   * And directly jump to global if semi-global cannot satisfy timing
+   * Fat wires only available for global wires, thus
+   * if signal wiring layer starts from semi-global,
+   * the next layer up will be global, i.e., semi-global does
+   * not have fat wires.
+   */
+  if (pipelinable == false)
+  // Non-pipelinable wires, such as bypass logic, are latency-constrained
+  {
+    compute();
+    if (opt_for_clk && opt_local) {
+      // Double width and spacing until the delay fits or scaling reaches 3.0.
+      while (delay > latency && width_scaling < 3.0) {
+        width_scaling *= 2;
+        space_scaling *= 2;
+        Wire winit(width_scaling, space_scaling); // never read; ctor effects only
+        compute();
+      }
+      if (delay > latency) {
+        latency_overflow = true;
+      }
+    }
+  } else // Pipelinable wires, such as buses, are throughput-constrained
+  {
+    /*
+     * TODO: Add pipe regs power, area, and timing;
+     * Pipelinable wires optimize latency first.
+     */
+    compute();
+    if (opt_for_clk && opt_local) {
+      while (delay > throughput && width_scaling < 3.0) {
+        width_scaling *= 2;
+        space_scaling *= 2;
+        Wire winit(width_scaling, space_scaling); // never read; ctor effects only
+        compute();
+      }
+      if (delay > throughput)
+      // insert pipeline stages; each stage adds 5% of the unpipelined delay
+      {
+        num_pipe_stages = (int)ceil(delay / throughput);
+        assert(num_pipe_stages > 0);
+        delay = delay / num_pipe_stages + num_pipe_stages * 0.05 * delay;
+      }
+    }
+  }
+
+  power_bit = power; // keep the per-bit figures before scaling to the bus width
+  power.readOp.dynamic *= data_width;
+  power.readOp.leakage *= data_width;
+  power.readOp.gate_leakage *= data_width;
+  area.set_area(area.get_area() * data_width);
+  no_device_under_wire_area.h *= data_width;
+
+  if (latency_overflow == true)
+    cout << "Warning: " << name
+         << " wire structure cannot satisfy latency constraint." << endl;
+
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(device_ty, core_ty);
+
+  double sckRation = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= sckRation;
+  power.writeOp.dynamic *= sckRation;
+  power.searchOp.dynamic *= sckRation;
+
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage * long_channel_device_reduction;
+
+  if (pipelinable) // Only global wires has the option to choose whether routing
+                   // over or not
+    area.set_area(area.get_area() * route_over_perc +
+                  no_device_under_wire_area.get_area() * (1 - route_over_perc));
+  // No parentheses: "Wire wreset();" would declare a function, not an object.
+  Wire wreset;
+}
+
+// Evaluate a single cacti Wire for the current length and width/spacing
+// scaling, and copy its delay, read power and area into this component.
+void interconnect::compute() {
+  // Automatic storage: the Wire is destroyed on every exit path, so there
+  // is no new/delete pair to keep balanced.
+  Wire bit_wire(wt, length, 1, width_scaling, space_scaling);
+  delay = bit_wire.delay;
+  power.readOp.dynamic = bit_wire.power.readOp.dynamic;
+  power.readOp.leakage = bit_wire.power.readOp.leakage;
+  power.readOp.gate_leakage = bit_wire.power.readOp.gate_leakage;
+
+  area.set_area(bit_wire.area.get_area());
+  // Footprint of the wire itself: (width + spacing) high, `length` wide.
+  no_device_under_wire_area.h = (bit_wire.wire_width + bit_wire.wire_spacing);
+  no_device_under_wire_area.w = length;
+}
+
+// Re-evaluate the wire at a new temperature and rebuild the link's power.
+void interconnect::leakage_feedback(double temperature) {
+  // Snap to the nearest multiple of 10 before re-running init_interface().
+  const unsigned int tens =
+      static_cast<unsigned int>(round(temperature / 10.0));
+  l_ip.temp = tens * 10;
+  uca_org_t ignored_result = init_interface(&l_ip); // return value unused
+  compute();
+
+  // compute() refreshed the per-bit figures: save them, then scale to the bus.
+  power_bit = power;
+  power.readOp.dynamic *= data_width;
+  power.readOp.leakage *= data_width;
+  power.readOp.gate_leakage *= data_width;
+  assert(power.readOp.dynamic > 0);
+  assert(power.readOp.leakage > 0);
+  assert(power.readOp.gate_leakage > 0);
+
+  const double lc_reduction =
+      longer_channel_device_reduction(device_ty, core_ty);
+  const double sckt_factor = g_tp.sckt_co_eff;
+  power.searchOp.dynamic *= sckt_factor;
+  power.writeOp.dynamic *= sckt_factor;
+  power.readOp.dynamic *= sckt_factor;
+  power.readOp.longer_channel_leakage = power.readOp.leakage * lc_reduction;
+}
diff --git a/src/gpuwattch/interconnect.h b/src/gpuwattch/interconnect.h
new file mode 100644
index 000000000..fddc50c39
--- /dev/null
+++ b/src/gpuwattch/interconnect.h
@@ -0,0 +1,100 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#ifndef __INTERCONNECT_H__
+#define __INTERCONNECT_H__
+
+#include "assert.h"
+#include "basic_components.h"
+#include "cacti/basic_circuit.h"
+#include "cacti/cacti_interface.h"
+#include "cacti/component.h"
+#include "cacti/parameter.h"
+#include "cacti/subarray.h"
+#include "cacti/wire.h"
+
+// leakage power includes entire htree in a bank (when uca_tree == false)
+// leakage power includes only part to one bank when uca_tree == true
+
+class interconnect : public Component { // wire link model built on cacti Wire
+public:
+  interconnect(string name_, enum Device_ty device_ty_, double base_w = 0,
+               double base_h = 0, int data_w = 0, double len = 0,
+               const InputParameter *configure_interface = NULL,
+               int start_wiring_level_ = 0, bool pipelinable_ = false,
+               double route_over_perc_ = 0.5, bool opt_local_ = true,
+               enum Core_type core_ty_ = Inorder,
+               enum Wire_type wire_model = Global, double width_s = 1.0,
+               double space_s = 1.0,
+               TechnologyParameter::DeviceType *dt = &(g_tp.peri_global));
+  // NOTE: the constructor dereferences configure_interface; pass a real one.
+  ~interconnect(){};
+
+  void compute(); // evaluates one Wire and stores its delay, power and area
+  string name;
+  enum Device_ty device_ty;
+  double in_rise_time, out_rise_time; // both start at 0
+  InputParameter l_ip;                // private copy of *configure_interface
+  uca_org_t local_result;             // result of init_interface(&l_ip)
+  Area no_device_under_wire_area;     // h = wire width + spacing, w = length
+  void set_in_rise_time(double rt) { in_rise_time = rt; }
+
+  void leakage_feedback(double temperature); // recompute power for a new temp
+  double max_unpipelined_link_delay;         // set to 0 by the ctor (TODO there)
+  powerDef power_bit; // copy of `power` taken before scaling by data_width
+
+  double wire_bw;      // not initialised by the constructor
+  double init_wire_bw; // bus width at root
+  double base_width;
+  double base_height;
+  int data_width; // multiplier applied to the per-wire power and area
+  enum Wire_type wt; // the constructor forces this to Global
+  double width_scaling, space_scaling; // doubled by the ctor while optimising
+  int start_wiring_level;
+  double length;
+  double min_w_nmos;
+  double min_w_pmos;
+  double latency, throughput; // targets copied from l_ip
+  bool latency_overflow;      // true if the latency target could not be met
+  bool throughput_overflow;   // initialised to false; not set elsewhere here
+  double interconnect_latency;    // not initialised by the constructor
+  double interconnect_throughput; // not initialised by the constructor
+  bool opt_local;
+  enum Core_type core_ty;
+  bool pipelinable; // selects the throughput- vs latency-constrained path
+  double route_over_perc; // weight used for the area of pipelinable links
+  int num_pipe_stages; // only assigned when pipeline stages are inserted
+
+private:
+  TechnologyParameter::DeviceType *deviceType;
+};
+
+#endif
diff --git a/src/gpuwattch/iocontrollers.cc b/src/gpuwattch/iocontrollers.cc
new file mode 100644
index 000000000..df3ef606a
--- /dev/null
+++ b/src/gpuwattch/iocontrollers.cc
@@ -0,0 +1,505 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#include "iocontrollers.h"
+#include "XML_Parse.h"
+#include "basic_components.h"
+#include "cacti/basic_circuit.h"
+#include "const.h"
+#include "io.h"
+#include "logic.h"
+#include "parameter.h"
+#include <algorithm>
+#include <assert.h>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+/*
+SUN Niagara 2 I/O power analysis:
+total signal bits: 711
+Total FBDIMM bits: (14+10)*2*8= 384
+PCIe bits:         (8 + 8)*2 = 32
+10Gb NIC:          (4*2+4*2)*2 = 32
+Debug I/Os:        168
+Other I/Os:        711- 32-32 - 384 - 168 = 95
+
+According to "Implementation of an 8-Core, 64-Thread, Power-Efficient SPARC
+Server on a Chip" 90% of I/Os are SerDes (the calculation is
+(384+64)/(711-168)=83%, about the same as the 90% reported in the paper)
+--> around 80Pins are common I/Os.
+Common I/Os consumes 71mW/Gb/s according to Cadence ChipEstimate @65nm
+Niagara 2 I/O clock is 1/4 of core clock. --> 87pin (<--((711-168)*17%)) *
+71mW/Gb/s *0.25*1.4Ghz = 2.17W
+
+Total dynamic power of FBDIMM, NIC, PCIe = 84*0.132 + 84*0.049*0.132 = 11.14
+- 2.17 = 8.98 Further, if assuming I/O logic power is about 50% of I/Os then
+Total energy of FBDIMM, NIC, PCIe = 11.14 - 2.17*1.5 = 7.89
+ */
+
+/*
+ * A bug in Cadence ChipEstimator: after updating the clock rate in the clock
+ * tab, the user needs to re-select the IP clock (the same clk) and then click
+ * Estimate; otherwise the new clock rate may not be propagated into the IPs.
+ *
+ */
+
+NIUController::NIUController(ParseXML *XML_interface,
+                             InputParameter *interface_ip_)
+    : XML(XML_interface), interface_ip(*interface_ip_) {
+  local_result = init_interface(&interface_ip);
+
+  double frontend_area, mac_area, SerDer_area;
+  double frontend_dyn, mac_dyn, SerDer_dyn;
+  double frontend_gates, mac_gates;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  double NMOS_sizing, PMOS_sizing;
+  // Builds the NIU area and the power_t template from MAC, PCS and SerDes parts.
+  set_niu_param();
+
+  if (niup.type == 0) // high performance NIU
+  {
+    // Area estimation based on average of die photo from Niagara 2 and Cadence
+    // ChipEstimate using 65nm.
+    mac_area = (1.53 + 0.3) / 2 * (interface_ip.F_sz_um / 0.065) *
+               (interface_ip.F_sz_um / 0.065);
+    // Area estimation based on average of die photo from Niagara 2, ISSCC "An
+    // 800mW 10Gb Ethernet Transceiver in 0.13μm CMOS" and"A 1.2-V-Only 900-mW
+    // 10 Gb Ethernet Transceiver and XAUI Interface With Robust VCO Tuning
+    // Technique" Frontend is PCS
+    frontend_area = (9.8 + (6 + 18) * 65 / 130 * 65 / 130) / 3 *
+                    (interface_ip.F_sz_um / 0.065) *
+                    (interface_ip.F_sz_um / 0.065);
+    // Area estimation based on average of die photo from Niagara 2 and Cadence
+    // ChipEstimate hard IP @65nm. SerDer is very hard to scale
+    SerDer_area = (1.39 + 0.36) * (interface_ip.F_sz_um /
+                                   0.065); //* (interface_ip.F_sz_um/0.065);
+    // total area
+    area.set_area((mac_area + frontend_area + SerDer_area) * 1e6);
+    // Power
+    // Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T =
+    // P/F = 1.37/1Ghz = 1.37e-9);
+    mac_dyn = 2.19e-9 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
+              1.1 *
+              (interface_ip.F_sz_nm /
+               65.0); // niup.clockRate; //2.19W@1GHz fully active according to
+                      // Cadence ChipEstimate @65nm
+    // Cadence ChipEstimate using 65nm soft IP;
+    frontend_dyn = 0.27e-9 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
+                   1.1 * (interface_ip.F_sz_nm / 65.0); // niup.clockRate;
+    // according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS..." ISSCC 2006
+    // SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+    SerDer_dyn = 0.01 * 10 * sqrt(interface_ip.F_sz_um / 0.09) *
+                 g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
+    SerDer_dyn /=
+        niup.clockRate; // convert to energy per clock cycle of whole NIU
+
+    // Cadence ChipEstimate using 65nm
+    mac_gates = 111700;
+    frontend_gates = 320000;
+    NMOS_sizing = 5 * g_tp.min_w_nmos_;
+    PMOS_sizing = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+
+  } else { // Low power implementations are mostly from Cadence ChipEstimator;
+           // Ignore the multiple IP effect
+    // ---When there are multiple IP (same kind or not) selected, Cadence
+    // ChipEstimator results are not a simple summation of all IPs. Ignore this
+    // effect
+    mac_area =
+        0.24 * (interface_ip.F_sz_um / 0.065) * (interface_ip.F_sz_um / 0.065);
+    frontend_area = 0.1 * (interface_ip.F_sz_um / 0.065) *
+                    (interface_ip.F_sz_um / 0.065); // Frontend is the PCS layer
+    SerDer_area =
+        0.35 * (interface_ip.F_sz_um / 0.065) * (interface_ip.F_sz_um / 0.065);
+    // Compare 130um implementation in "A 1.2-V-Only 900-mW 10 Gb Ethernet
+    // Transceiver and XAUI Interface With Robust VCO Tuning Technique" and the
+    // ChipEstimator XAUI PHY hard IP, confirm that even PHY can scale perfectly
+    // with the technology total area
+    area.set_area((mac_area + frontend_area + SerDer_area) * 1e6);
+    // Power
+    // Cadence ChipEstimate using 65nm (mac, front_end are all energy. E=P*T =
+    // P/F = 1.37/1Ghz = 1.37e-9);
+    mac_dyn = 1.257e-9 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
+              1.1 *
+              (interface_ip.F_sz_nm /
+               65.0); // niup.clockRate; //2.19W@1GHz fully active according to
+                      // Cadence ChipEstimate @65nm
+    // Cadence ChipEstimate using 65nm soft IP;
+    frontend_dyn = 0.6e-9 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
+                   1.1 * (interface_ip.F_sz_nm / 65.0); // niup.clockRate;
+    // SerDer_dyn is power not energy, scaling from 216mw/10Gb/s @130nm
+    SerDer_dyn = 0.0216 * 10 * (interface_ip.F_sz_um / 0.13) *
+                 g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
+    SerDer_dyn /=
+        niup.clockRate; // convert to energy per clock cycle of whole NIU
+
+    mac_gates = 111700;
+    frontend_gates = 52000;
+
+    NMOS_sizing = g_tp.min_w_nmos_;
+    PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+  }
+  // power_t holds per-cycle dynamic energy plus leakage in watts.
+  power_t.readOp.dynamic = mac_dyn + frontend_dyn + SerDer_dyn;
+  power_t.readOp.leakage =
+      (mac_gates + frontend_gates + frontend_gates) * // NOTE(review): frontend_gates is added twice - confirm intended
+      cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+      g_tp.peri_global.Vdd; // unit W
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage =
+      power_t.readOp.leakage * long_channel_device_reduction;
+  power_t.readOp.gate_leakage =
+      (mac_gates + frontend_gates + frontend_gates) * // NOTE(review): same doubled term as above
+      cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+      g_tp.peri_global.Vdd; // unit W
+}
+
+// Fill `power` (TDP) or `rt_power` (runtime) from the precomputed power_t.
+void NIUController::computeEnergy(bool is_tdp) {
+  if (!is_tdp) {
+    rt_power = power_t; // runtime: weight the dynamic part by achieved load
+    rt_power.readOp.dynamic *= niup.perc_load;
+    return;
+  }
+
+  power = power_t; // TDP: weight the dynamic part by the duty cycle
+  power.readOp.dynamic *= niup.duty_cycle;
+}
+
+// Print the NIU area and power summary to stdout. Only the TDP pass prints;
+// plevel is accepted but not consulted.
+void NIUController::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  // Nothing is printed for the runtime (non-TDP) pass.
+  if (!is_tdp)
+    return;
+
+  const string pad(indent, ' ');
+
+  // Report the long-channel leakage figure when the XML config enables it.
+  const double subthreshold = XML->sys.longer_channel_device
+                                  ? power.readOp.longer_channel_leakage
+                                  : power.readOp.leakage;
+
+  // Dynamic figures are stored per clock cycle, so they are multiplied by the
+  // clock rate before being printed in watts.
+  const double peak_dynamic = power.readOp.dynamic * niup.clockRate;
+  const double runtime_dynamic = rt_power.readOp.dynamic * niup.clockRate;
+
+  cout << "NIU:" << endl;
+  cout << pad << "Area = " << area.get_area() * 1e-6 << " mm^2" << endl;
+  cout << pad << "Peak Dynamic = " << peak_dynamic << " W" << endl;
+  cout << pad << "Subthreshold Leakage = " << subthreshold << " W" << endl;
+  cout << pad << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+  cout << pad << "Runtime Dynamic = " << runtime_dynamic << " W" << endl;
+  cout << endl;
+}
+
+// Copy the NIU settings from the parsed XML (sys.niu) into niup.
+void NIUController::set_niu_param() {
+  niup.type = XML->sys.niu.type;
+  niup.num_units = XML->sys.niu.number_units;
+  niup.perc_load = XML->sys.niu.total_load_perc;
+  niup.duty_cycle = XML->sys.niu.duty_cycle;
+  // The XML clock rate is scaled by 1e6 (presumably MHz -> Hz; confirm).
+  niup.clockRate = XML->sys.niu.clockrate;
+  niup.clockRate *= 1e6;
+}
+
+PCIeController::PCIeController(ParseXML *XML_interface,
+                               InputParameter *interface_ip_)
+    : XML(XML_interface), interface_ip(*interface_ip_) {
+  local_result = init_interface(&interface_ip);
+  double ctrl_area, SerDer_area;
+  double ctrl_dyn, SerDer_dyn;
+  double ctrl_gates, SerDer_gates;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  double NMOS_sizing, PMOS_sizing;
+  // Builds the PCIe area and the power_t template, scaled by pciep.num_channels.
+  /* Assuming PCIe is bit-slice based architecture
+   * This is the reason for /8 in both area and power calculation
+   * to get per lane numbers
+   */
+
+  set_pcie_param();
+  if (pciep.type == 0) // high performance PCIe controller
+  {
+    // Area estimation based on average of die photo from Niagara 2 and Cadence
+    // ChipEstimate @ 65nm.
+    ctrl_area = (5.2 + 0.5) / 2 * (interface_ip.F_sz_um / 0.065) *
+                (interface_ip.F_sz_um / 0.065);
+    // Area estimation based on average of die photo from Niagara 2, and Cadence
+    // ChipEstimate @ 65nm. Area estimation based on average of die photo from
+    // Niagara 2 and Cadence ChipEstimate hard IP @65nm. SerDer is very hard to
+    // scale
+    SerDer_area = (3.03 + 0.36) * (interface_ip.F_sz_um /
+                                   0.065); //* (interface_ip.F_sz_um/0.065);
+    // total area
+    // Power
+    // Cadence ChipEstimate using 65nm the controller includes everything: the
+    // PHY, the data link and transaction layer
+    ctrl_dyn = 3.75e-9 / 8 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
+               1.1 * (interface_ip.F_sz_nm / 65.0);
+    //	  //Cadence ChipEstimate using 65nm soft IP;
+    //	  frontend_dyn =
+    // 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+    // SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+    SerDer_dyn = 0.01 * 4 * (interface_ip.F_sz_um / 0.09) *
+                 g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd /
+                 1.2;              // PCIe 2.0 max per lane speed is 4Gb/s
+    SerDer_dyn /= pciep.clockRate; // convert to energy per clock cycle
+
+    // power_t.readOp.dynamic = (ctrl_dyn)*pciep.num_channels;
+    // Cadence ChipEstimate using 65nm
+    ctrl_gates = 900000 / 8 * pciep.num_channels;
+    //	  frontend_gates   = 120000/8;
+    SerDer_gates = 200000 / 8 * pciep.num_channels; // read below when withPHY
+    NMOS_sizing = 5 * g_tp.min_w_nmos_;
+    PMOS_sizing = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+  } else {
+    ctrl_area =
+        0.412 * (interface_ip.F_sz_um / 0.065) * (interface_ip.F_sz_um / 0.065);
+    // Area estimation based on average of die photo from Niagara 2, and Cadence
+    // ChipEstimate @ 65nm.
+    SerDer_area =
+        0.36 * (interface_ip.F_sz_um / 0.065) * (interface_ip.F_sz_um / 0.065);
+    // total area
+    // Power
+    // Cadence ChipEstimate using 65nm the controller includes everything: the
+    // PHY, the data link and transaction layer
+    ctrl_dyn = 2.21e-9 / 8 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd /
+               1.1 * (interface_ip.F_sz_nm / 65.0);
+    //	  //Cadence ChipEstimate using 65nm soft IP;
+    //	  frontend_dyn =
+    // 0.27e-9/8*g_tp.peri_global.Vdd/1.1*g_tp.peri_global.Vdd/1.1*(interface_ip.F_sz_nm/65.0);
+    // SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+    SerDer_dyn = 0.01 * 4 * (interface_ip.F_sz_um / 0.09) *
+                 g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd /
+                 1.2;              // PCIe 2.0 max per lane speed is 4Gb/s
+    SerDer_dyn /= pciep.clockRate; // convert to energy per clock cycle
+
+    // Cadence ChipEstimate using 65nm
+    ctrl_gates = 200000 / 8 * pciep.num_channels;
+    //	  frontend_gates   = 120000/8;
+    SerDer_gates = 200000 / 8 * pciep.num_channels;
+    NMOS_sizing = g_tp.min_w_nmos_;
+    PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+  }
+  area.set_area(((ctrl_area + (pciep.withPHY ? SerDer_area : 0)) / 8 *
+                 pciep.num_channels) *
+                1e6);
+  power_t.readOp.dynamic =
+      (ctrl_dyn + (pciep.withPHY ? SerDer_dyn : 0)) * pciep.num_channels;
+  power_t.readOp.leakage =
+      (ctrl_gates + (pciep.withPHY ? SerDer_gates : 0)) *
+      cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+      g_tp.peri_global.Vdd; // unit W
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage =
+      power_t.readOp.leakage * long_channel_device_reduction;
+  power_t.readOp.gate_leakage =
+      (ctrl_gates + (pciep.withPHY ? SerDer_gates : 0)) *
+      cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+      g_tp.peri_global.Vdd; // unit W
+}
+
+// Fill `power` (TDP) or `rt_power` (runtime) from the precomputed power_t.
+void PCIeController::computeEnergy(bool is_tdp) {
+  if (!is_tdp) {
+    rt_power = power_t; // runtime: weight the dynamic part by achieved load
+    rt_power.readOp.dynamic *= pciep.perc_load;
+    return;
+  }
+
+  power = power_t; // TDP: weight the dynamic part by the duty cycle
+  power.readOp.dynamic *= pciep.duty_cycle;
+}
+
+// Print the PCIe area and power summary to stdout. Only the TDP pass prints;
+// plevel is accepted but not consulted.
+void PCIeController::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  // Nothing is printed for the runtime (non-TDP) pass.
+  if (!is_tdp)
+    return;
+
+  const string pad(indent, ' ');
+
+  // Report the long-channel leakage figure when the XML config enables it.
+  const double subthreshold = XML->sys.longer_channel_device
+                                  ? power.readOp.longer_channel_leakage
+                                  : power.readOp.leakage;
+
+  // Dynamic figures are stored per clock cycle, so they are multiplied by the
+  // clock rate before being printed in watts.
+  const double peak_dynamic = power.readOp.dynamic * pciep.clockRate;
+  const double runtime_dynamic = rt_power.readOp.dynamic * pciep.clockRate;
+
+  cout << "PCIe:" << endl;
+  cout << pad << "Area = " << area.get_area() * 1e-6 << " mm^2" << endl;
+  cout << pad << "Peak Dynamic = " << peak_dynamic << " W" << endl;
+  cout << pad << "Subthreshold Leakage = " << subthreshold << " W" << endl;
+  cout << pad << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+  cout << pad << "Runtime Dynamic = " << runtime_dynamic << " W" << endl;
+  cout << endl;
+}
+
+// Copy the PCIe settings from the parsed XML (sys.pcie) into pciep.
+void PCIeController::set_pcie_param() {
+  pciep.type = XML->sys.pcie.type;
+  pciep.withPHY = XML->sys.pcie.withPHY;
+  pciep.num_units = XML->sys.pcie.number_units;
+  pciep.num_channels = XML->sys.pcie.num_channels;
+  pciep.perc_load = XML->sys.pcie.total_load_perc;
+  pciep.duty_cycle = XML->sys.pcie.duty_cycle;
+  // The XML clock rate is scaled by 1e6 (presumably MHz -> Hz; confirm).
+  pciep.clockRate = XML->sys.pcie.clockrate;
+  pciep.clockRate *= 1e6;
+}
+
+FlashController::FlashController(ParseXML *XML_interface,
+                                 InputParameter *interface_ip_)
+    : XML(XML_interface), interface_ip(*interface_ip_) {
+  local_result = init_interface(&interface_ip);
+  double ctrl_area, SerDer_area;
+  double ctrl_dyn, SerDer_dyn;
+  double ctrl_gates, SerDer_gates;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  double NMOS_sizing, PMOS_sizing;
+  // Builds the flash controller area and the power_t template.
+  /* The PHY figures are taken from an 8-lane PCIe PHY.
+   * This is the reason for /8 in the SerDer area and gate counts
+   * to get per lane numbers
+   */
+
+  set_fc_param();
+  if (fcp.type == 0) // high performance flash controller: not modelled
+  {
+    cout << "Current McPAT does not support high performance flash contorller "
+            "since even low power designs are enough for maintain throughput"
+         << endl;
+    exit(1); // unsupported configuration: terminate with a failure status
+    NMOS_sizing = 5 * g_tp.min_w_nmos_;
+    PMOS_sizing = 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+  } else {
+    ctrl_area =
+        0.243 * (interface_ip.F_sz_um / 0.065) * (interface_ip.F_sz_um / 0.065);
+    // Area estimation based on Cadence ChipEstimate @ 65nm: NANDFLASH-CTRL from
+    // CAST
+    SerDer_area = 0.36 / 8 * (interface_ip.F_sz_um / 0.065) *
+                  (interface_ip.F_sz_um / 0.065);
+    // based On PCIe PHY TSMC65GP from Cadence ChipEstimate @ 65nm, it support
+    // 8x lanes with each lane speed up to 250MB/s (PCIe1.1x) This is already
+    // saturate the 200MB/s of the flash controller core above.
+    ctrl_gates = 129267;
+    SerDer_gates = 200000 / 8;
+    NMOS_sizing = g_tp.min_w_nmos_;
+    PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+
+    // Power
+    // Cadence ChipEstimate using 65nm the controller 125mW for every 200MB/s
+    // This is power not energy!
+    ctrl_dyn = 0.125 * g_tp.peri_global.Vdd / 1.1 * g_tp.peri_global.Vdd / 1.1 *
+               (interface_ip.F_sz_nm / 65.0);
+    // SerDer_dyn is power not energy, scaling from 10mw/Gb/s @90nm
+    SerDer_dyn = 0.01 * 1.6 * (interface_ip.F_sz_um / 0.09) *
+                 g_tp.peri_global.Vdd / 1.2 * g_tp.peri_global.Vdd / 1.2;
+    // max  Per controller speed is 1.6Gb/s (200MB/s)
+  }
+  double number_channel = 1 + (fcp.num_channels - 1) * 0.2; // extra channels cost 20% each
+  area.set_area((ctrl_area + (fcp.withPHY ? SerDer_area : 0)) * 1e6 *
+                number_channel);
+  power_t.readOp.dynamic =
+      (ctrl_dyn + (fcp.withPHY ? SerDer_dyn : 0)) * number_channel;
+  power_t.readOp.leakage =
+      ((ctrl_gates + (fcp.withPHY ? SerDer_gates : 0)) * number_channel) *
+      cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+      g_tp.peri_global.Vdd; // unit W
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage =
+      power_t.readOp.leakage * long_channel_device_reduction;
+  power_t.readOp.gate_leakage =
+      ((ctrl_gates + (fcp.withPHY ? SerDer_gates : 0)) * number_channel) *
+      cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+      g_tp.peri_global.Vdd; // unit W
+}
+
+// Fill `power` (TDP) or `rt_power` (runtime) from the precomputed power_t.
+void FlashController::computeEnergy(bool is_tdp) {
+  if (!is_tdp) {
+    rt_power = power_t; // runtime: weight the dynamic part by achieved load
+    rt_power.readOp.dynamic *= fcp.perc_load;
+    return;
+  }
+
+  power = power_t; // TDP: weight the dynamic part by the duty cycle
+  power.readOp.dynamic *= fcp.duty_cycle;
+}
+
+// Print the flash controller area and power summary to stdout. Only the TDP
+// pass prints; plevel is accepted but not consulted.
+void FlashController::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  // Nothing is printed for the runtime (non-TDP) pass.
+  if (!is_tdp)
+    return;
+
+  const string pad(indent, ' ');
+
+  // Report the long-channel leakage figure when the XML config enables it.
+  const double subthreshold = XML->sys.longer_channel_device
+                                  ? power.readOp.longer_channel_leakage
+                                  : power.readOp.leakage;
+
+  // Unlike the NIU and PCIe reports, dynamic values are printed as stored:
+  // this model keeps them as power, so no clock-rate multiply is applied.
+  cout << "Flash Controller:" << endl;
+  cout << pad << "Area = " << area.get_area() * 1e-6 << " mm^2" << endl;
+  cout << pad << "Peak Dynamic = " << power.readOp.dynamic << " W" << endl;
+  cout << pad << "Subthreshold Leakage = " << subthreshold << " W" << endl;
+  cout << pad << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+  cout << pad << "Runtime Dynamic = " << rt_power.readOp.dynamic << " W"
+       << endl;
+  cout << endl;
+}
+
+// Copy the flash-controller settings from the parsed XML (sys.flashc) into
+// fcp. No clock rate is read here.
+void FlashController::set_fc_param() {
+  fcp.type = XML->sys.flashc.type;
+  fcp.withPHY = XML->sys.flashc.withPHY;
+  fcp.num_mcs = XML->sys.flashc.number_mcs;
+  fcp.perc_load = XML->sys.flashc.total_load_perc;
+  fcp.duty_cycle = XML->sys.flashc.duty_cycle;
+  // One channel per 200 units of peak transfer rate. NOTE(review): ceil() only
+  // rounds up if peakDataTransferRate is a floating-point member - confirm.
+  fcp.peakDataTransferRate = XML->sys.flashc.peak_transfer_rate;
+  fcp.num_channels = ceil(fcp.peakDataTransferRate / 200);
+}
diff --git a/src/gpuwattch/iocontrollers.h b/src/gpuwattch/iocontrollers.h
new file mode 100644
index 000000000..89d0bb5f1
--- /dev/null
+++ b/src/gpuwattch/iocontrollers.h
@@ -0,0 +1,84 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#ifndef IOCONTROLLERS_H_
+#define IOCONTROLLERS_H_
+
+#include "XML_Parse.h"
+#include "cacti/parameter.h"
+#include "array.h"
+#include "basic_components.h"
+#include <vector>
+
+// Component models for the NIU, PCIe and flash I/O controllers. The include
+// guard closes at the end of the file so the classes are declared only once.
+class NIUController : public Component {
+public:
+  ParseXML *XML;              // parsed XML configuration
+  InputParameter interface_ip; // copy of the caller's input parameters
+  NIUParam niup;              // NIU-specific parameters
+  powerDef power_t;           // scratch power value
+  uca_org_t local_result;
+  NIUController(ParseXML *XML_interface, InputParameter *interface_ip_);
+  void set_niu_param();
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~NIUController(){};
+};
+
+class PCIeController : public Component {
+public:
+  ParseXML *XML;              // parsed XML configuration
+  InputParameter interface_ip; // copy of the caller's input parameters
+  PCIeParam pciep;            // PCIe-specific parameters
+  powerDef power_t;           // scratch power value
+  uca_org_t local_result;
+  PCIeController(ParseXML *XML_interface, InputParameter *interface_ip_);
+  void set_pcie_param();
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~PCIeController(){};
+};
+
+class FlashController : public Component {
+public:
+  ParseXML *XML;              // parsed XML configuration
+  InputParameter interface_ip; // copy of the caller's input parameters
+  MCParam fcp;                // flash controller parameters (see set_fc_param)
+  powerDef power_t;           // scratch power value
+  uca_org_t local_result;
+  FlashController(ParseXML *XML_interface, InputParameter *interface_ip_);
+  void set_fc_param(); // fills fcp from XML->sys.flashc
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~FlashController(){};
+};
+
+#endif /* IOCONTROLLERS_H_ */
diff --git a/src/gpuwattch/logic.cc b/src/gpuwattch/logic.cc
new file mode 100644
index 000000000..91ca53817
--- /dev/null
+++ b/src/gpuwattch/logic.cc
@@ -0,0 +1,1351 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ * Modified by:
+ *   Jingwen Leng, University of Texas, Austin
+ *   Syed Gilani, University of Wisconsin–Madison
+ *   Tayler Hetherington, University of British Columbia
+ *   Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+#include "logic.h"
+#define SP_BASE_POWER 0
+#define SFU_BASE_POWER 0
+//.67
+
+// extern double exClockRate;
+// selection_logic
+// Builds the selection logic model: copies the input parameters, initialises
+// the interface, computes the raw selection power, then scales the dynamic
+// components by g_tp.sckt_co_eff and derives the longer-channel leakage from
+// the plain leakage.
+selection_logic::selection_logic(bool _is_default, int win_entries_,
+                                 int issue_width_,
+                                 const InputParameter *configure_interface,
+                                 enum Device_ty device_ty_,
+                                 enum Core_type core_ty_)
+    : is_default(_is_default), win_entries(win_entries_),
+      issue_width(issue_width_), device_ty(device_ty_), core_ty(core_ty_) {
+  l_ip = *configure_interface;
+  local_result = init_interface(&l_ip);
+  selection_power();
+
+  // Scale every dynamic component by g_tp.sckt_co_eff.
+  const double socket_coeff = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= socket_coeff;
+  power.writeOp.dynamic *= socket_coeff;
+  power.searchOp.dynamic *= socket_coeff;
+
+  // Longer-channel leakage is the plain leakage times a reduction factor.
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage * longer_channel_device_reduction(device_ty, core_ty);
+}
+
+void selection_logic::selection_power() { // Sets power.readOp.{dynamic,leakage,
+  // gate_leakage}; based on the cost-effective superscalar processor TR pp27-31
+  double Ctotal, Cor, Cpencode;
+  int num_arbiter;
+  double WSelORn, WSelORprequ, WSelPn, WSelPp, WSelEnn, WSelEnp;
+  // Transistor widths: the 0.8 micron reference values scaled by l_ip.F_sz_um.
+  // TODO: the 0.8um process data is used.
+  WSelORn = 12.5 * l_ip.F_sz_um; // this was 10 micron for the 0.8 micron
+                                 // process
+  WSelORprequ =
+      50 * l_ip.F_sz_um;        // this was 40 micron for the 0.8 micron process
+  WSelPn = 12.5 * l_ip.F_sz_um; // this was 10 micron for the 0.8 micron process
+  WSelPp = 18.75 * l_ip.F_sz_um; // this was 15 micron for the 0.8 micron
+                                 // process
+  WSelEnn = 6.25 * l_ip.F_sz_um; // this was 5 micron for the 0.8 micron process
+  WSelEnp = 12.5 * l_ip.F_sz_um; // this was 10 micron for the 0.8 micron
+                                 // process
+  // Count arbiters level by level, dividing the entry count by 4 each time.
+  Ctotal = 0;
+  num_arbiter = 1;
+  while (win_entries > 4) { // NOTE(review): shrinks the win_entries member
+    win_entries = (int)ceil((double)win_entries / 4.0);
+    num_arbiter += win_entries;
+  }
+  // 4-input OR logic generating anyreq. NOTE(review): this gate_leakage value
+  Cor = 4 * drain_C_(WSelORn, NCH, 1, 1, g_tp.cell_h_def) +
+        drain_C_(WSelORprequ, PCH, 1, 1, g_tp.cell_h_def);
+  power.readOp.gate_leakage = // ...is overwritten at the end of the function.
+      cmos_Ig_leakage(WSelORn, WSelORprequ, 4, nor) * g_tp.peri_global.Vdd;
+
+  // The total capacitance of the 4-bit priority encoder
+  Cpencode =
+      drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(WSelPp, PCH, 1, 1, g_tp.cell_h_def) +
+      2 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(WSelPp, PCH, 2, 1, g_tp.cell_h_def) +
+      3 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(WSelPp, PCH, 3, 1, g_tp.cell_h_def) +
+      4 * drain_C_(WSelPn, NCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(WSelPp, PCH, 4, 1,
+               g_tp.cell_h_def) + // precompute priority logic
+      2 * 4 * gate_C(WSelEnn + WSelEnp, 20.0) +
+      4 * drain_C_(WSelEnn, NCH, 1, 1, g_tp.cell_h_def) +
+      2 * 4 * drain_C_(WSelEnp, PCH, 1, 1, g_tp.cell_h_def) + // enable logic
+      (2 * 4 + 2 * 3 + 2 * 2 + 2) *
+          gate_C(WSelPn + WSelPp, 10.0); // requests signal
+
+  Ctotal += issue_width * num_arbiter * (Cor + Cpencode);
+
+  power.readOp.dynamic =
+      Ctotal * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd *
+      2; // 2 means the arbitration signal needs to travel round trip
+  power.readOp.leakage =
+      issue_width * num_arbiter *
+      (cmos_Isub_leakage(
+           WSelPn, WSelPp, 2,
+           nor) /*approximate precompute with a nor gate*/ // grant1p
+       + cmos_Isub_leakage(WSelPn, WSelPp, 3, nor)         // grant2p
+       + cmos_Isub_leakage(WSelPn, WSelPp, 4, nor)         // grant3p
+       + cmos_Isub_leakage(WSelEnn, WSelEnp, 2, nor) * 4   // enable logic
+       + cmos_Isub_leakage(WSelEnn, WSelEnp, 1, inv) * 2 *
+             3 // for each grant there are two inverters, there are 3 grant
+               // signals
+       ) *
+      g_tp.peri_global.Vdd;
+  power.readOp.gate_leakage =
+      issue_width * num_arbiter *
+      (cmos_Ig_leakage(
+           WSelPn, WSelPp, 2,
+           nor) /*approximate precompute with a nor gate*/ // grant1p
+       + cmos_Ig_leakage(WSelPn, WSelPp, 3, nor)           // grant2p
+       + cmos_Ig_leakage(WSelPn, WSelPp, 4, nor)           // grant3p
+       + cmos_Ig_leakage(WSelEnn, WSelEnp, 2, nor) * 4     // enable logic
+       + cmos_Ig_leakage(WSelEnn, WSelEnp, 1, inv) * 2 *
+             3 // for each grant there are two inverters, there are 3 grant
+               // signals
+       ) *
+      g_tp.peri_global.Vdd;
+}
+
+// Sets up the dependency/resource conflict check model: sizes the comparator
+// and NOR transistors from the feature size, widens compare_bits by a fixed
+// 32 bits, computes the conflict-check power and finally scales the dynamic
+// components by g_tp.sckt_co_eff.
+dep_resource_conflict_check::dep_resource_conflict_check(
+    const InputParameter *configure_interface, const CoreDynParam &dyn_p_,
+    int compare_bits_, bool _is_default)
+    : l_ip(*configure_interface), coredynp(dyn_p_), compare_bits(compare_bits_),
+      is_default(_is_default) {
+  // Transistor widths, scaled from a 0.8 micron reference process (the
+  // reference widths were 20, 20, 80, 40, 5.4 and 30.5 micron respectively).
+  Wcompn = 25 * l_ip.F_sz_um;
+  Wevalinvp = 25 * l_ip.F_sz_um;
+  Wevalinvn = 100 * l_ip.F_sz_um;
+  Wcomppreequ = 50 * l_ip.F_sz_um;
+  WNORn = 6.75 * l_ip.F_sz_um;
+  WNORp = 38.125 * l_ip.F_sz_um;
+
+  local_result = init_interface(&l_ip);
+
+  // TODO: opcode bits + log(shared resources) + REG TAG BITS --> opcode
+  // comparator. Every core type currently adds the same amount.
+  compare_bits += 16 + 8 + 8;
+
+  conflict_check_power();
+  const double socket_coeff = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= socket_coeff;
+  power.writeOp.dynamic *= socket_coeff;
+  power.searchOp.dynamic *= socket_coeff;
+}
+
+// Computes dynamic, leakage, longer-channel leakage and gate leakage of the
+// dependency-check comparators and stores them in power.readOp.
+void dep_resource_conflict_check::conflict_check_power() {
+  // 2(N*N-N) comparators handle source-to-dest comparison and (N*N-N) handle
+  // dest-to-dest comparison, N being the decode width. With a decode width of
+  // 1 the count is zero, i.e. there is no dependency-check logic.
+  const int num_comparators =
+      3 * ((coredynp.decodeW) * (coredynp.decodeW) - coredynp.decodeW);
+  const double total_cap = num_comparators * compare_cap();
+  const double vdd = g_tp.peri_global.Vdd;
+
+  // No clock-rate or activity-factor term is applied to the dynamic value.
+  power.readOp.dynamic = total_cap * vdd * vdd;
+
+  // Both leakage terms scale with 2 * compare_bits per comparator.
+  const int leaky_devices = num_comparators * compare_bits * 2;
+  power.readOp.leakage = leaky_devices * simplified_nmos_leakage(Wcompn, false);
+
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage *
+      longer_channel_device_reduction(Core_device, coredynp.core_ty);
+  power.readOp.gate_leakage =
+      leaky_devices * cmos_Ig_leakage(Wcompn, 0, 2, nmos);
+}
+
+/* Estimate the capacitance of one comparator (this comparator is similar to
+   the tag-match structure in a CAM). Side effect: rescales the WNORp member. */
+double dep_resource_conflict_check::compare_cap() {
+  // Resize the big NOR gate at the DCL according to its fan-in.
+  WNORp = WNORp * compare_bits / 2.0;
+
+  /* top part of comparator */
+  const double top_cap =
+      (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
+                        drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def) +
+                        drain_C_(Wcomppreequ, NCH, 1, 1, g_tp.cell_h_def)) +
+      gate_C(WNORn + WNORp, 10.0) +
+      drain_C_(WNORp, NCH, 2, 1, g_tp.cell_h_def) +
+      compare_bits * drain_C_(WNORn, NCH, 2, 1, g_tp.cell_h_def);
+  /* bottom part of comparator */
+  const double bottom_cap =
+      (compare_bits) * (drain_C_(Wcompn, NCH, 1, 1, g_tp.cell_h_def) +
+                        drain_C_(Wcompn, NCH, 2, 1, g_tp.cell_h_def)) +
+      drain_C_(Wevalinvp, PCH, 1, 1, g_tp.cell_h_def) +
+      drain_C_(Wevalinvn, NCH, 1, 1, g_tp.cell_h_def);
+  return (top_cap + bottom_cap);
+}
+
+// Recomputes the leakage figures in power.readOp for a new temperature. The
+// temperature is rounded to the nearest multiple of 10 before use.
+void dep_resource_conflict_check::leakage_feedback(double temperature) {
+  l_ip.temp = (unsigned int)round(temperature / 10.0) * 10;
+  // The call re-initialises the interface for l_ip; its result is not needed.
+  uca_org_t unused_result = init_interface(&l_ip);
+
+  // Same leakage equations as conflict_check_power(): 3*(N*N-N) comparators,
+  // N being the decode width.
+  const int num_comparators =
+      3 * ((coredynp.decodeW) * (coredynp.decodeW) - coredynp.decodeW);
+  const int leaky_devices = num_comparators * compare_bits * 2;
+
+  power.readOp.leakage = leaky_devices * simplified_nmos_leakage(Wcompn, false);
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage *
+      longer_channel_device_reduction(Core_device, coredynp.core_ty);
+  power.readOp.gate_leakage = leaky_devices * cmos_Ig_leakage(Wcompn, 0, 2, nmos);
+}
+
+// TODO: add inverter and transmission gate base DFF.
+
+// NAND2-based D flip-flop model; the constructor stores the sizing and sets area.
+DFFCell::DFFCell(bool _is_dram, double _WdecNANDn, double _WdecNANDp,
+                 double _cell_load, const InputParameter *configure_interface)
+    : is_dram(_is_dram), cell_load(_cell_load), WdecNANDn(_WdecNANDn),
+      WdecNANDp(_WdecNANDp) {
+  l_ip = *configure_interface;
+  const double area_np = compute_gate_area(NAND, 2, WdecNANDn, WdecNANDp, g_tp.cell_h_def);
+  const double area_nn = compute_gate_area(NAND, 2, WdecNANDn, WdecNANDn, g_tp.cell_h_def);
+  area.set_area(5 * area_np + area_nn);
+}
+
+// Node capacitance: NAND drain cap (PMOS term scaled by fan_in) + fan_out gate caps.
+double DFFCell::fpfp_node_cap(unsigned int fan_in, unsigned int fan_out) {
+  /* part 1: drain cap of NAND gate */
+  const double drain_part =
+      drain_C_(WdecNANDn, NCH, 2, 1, g_tp.cell_h_def, is_dram) +
+      fan_in * drain_C_(WdecNANDp, PCH, 1, 1, g_tp.cell_h_def, is_dram);
+  /* part 2: gate cap of NAND gates */
+  const double gate_part = fan_out * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
+  double node_cap = 0;
+  node_cap += drain_part;
+  node_cap += gate_part;
+  return node_cap;
+}
+
+// Fills in the per-flip-flop energy terms (e_switch, e_keep_0, e_keep_1,
+// e_clock) and the clock load from the node capacitances of the DFF.
+void DFFCell::compute_DFF_cell() {
+  const double vdd = g_tp.peri_global.Vdd;
+
+  /* node 5 and node 6 are identical to node 1 in capacitance */
+  const double node1_cap = fpfp_node_cap(2, 1);
+  const double node5_cap = node1_cap;
+  const double node6_cap = node1_cap;
+  const double node2_cap = fpfp_node_cap(2, 3);
+  const double node3_cap = fpfp_node_cap(3, 2);
+  const double node4_cap = fpfp_node_cap(2, 2);
+
+  // cap-load of the clock signal in each DFF; the clock signal is actually
+  // connected to only one NAND2
+  clock_cap = 2 * gate_C(WdecNANDn + WdecNANDp, 0, is_dram);
+  e_switch.readOp.dynamic += (node4_cap + node1_cap + node2_cap + node3_cap +
+                              node5_cap + node6_cap + 2 * cell_load) *
+                             0.5 * vdd * vdd;
+
+  /* no 1/2 for e_keep and e_clock because clock signal switches twice in one
+   * cycle */
+  e_keep_1.readOp.dynamic += node3_cap * vdd * vdd;
+  e_keep_0.readOp.dynamic += node2_cap * vdd * vdd;
+  e_clock.readOp.dynamic += clock_cap * vdd * vdd;
+
+  /* static power: 5 NAND2 and 1 NAND3 in a DFF */
+  e_switch.readOp.leakage +=
+      (cmos_Isub_leakage(WdecNANDn, WdecNANDp, 2, nand) * 5 +
+       cmos_Isub_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
+      vdd;
+  e_switch.readOp.gate_leakage +=
+      (cmos_Ig_leakage(WdecNANDn, WdecNANDp, 2, nand) * 5 +
+       cmos_Ig_leakage(WdecNANDn, WdecNANDn, 3, nand)) *
+      vdd;
+}
+
+// Builds the pipeline-register model: initialises the interface, picks the
+// NAND transistor widths, derives the load seen by each pipeline stage and
+// then runs compute().
+Pipeline::Pipeline(const InputParameter *configure_interface,
+                   const CoreDynParam &dyn_p_, enum Device_ty device_ty_,
+                   bool _is_core_pipeline, bool _is_default)
+    : l_ip(*configure_interface), coredynp(dyn_p_), device_ty(device_ty_),
+      is_core_pipeline(_is_core_pipeline), is_default(_is_default),
+      num_piperegs(0.0) {
+  local_result = init_interface(&l_ip);
+
+  // Non-embedded designs use widths scaled from the 0.8 micron process;
+  // embedded designs derive both widths from g_tp.min_w_nmos_.
+  process_ind = !coredynp.Embedded;
+  if (process_ind) {
+    WNANDn = 25 * l_ip.F_sz_um;   // this was 20 micron for the 0.8 micron process
+    WNANDp = 37.5 * l_ip.F_sz_um; // this was 30 micron for the 0.8 micron process
+  } else {
+    WNANDn = g_tp.min_w_nmos_;
+    WNANDp = g_tp.min_w_nmos_ * pmos_to_nmos_sz_ratio();
+  }
+
+  load_per_pipeline_stage = 2 * gate_C(WNANDn + WNANDp, 0, false);
+  compute();
+}
+
+// Derives the pipeline registers' dynamic power, leakage and area from a
+// single DFF cell model multiplied by num_piperegs.
+void Pipeline::compute() {
+  compute_stage_vector();
+  DFFCell dff(false, WNANDn, WNANDp, load_per_pipeline_stage, &l_ip);
+  dff.compute_DFF_cell();
+
+  // Pipeline power: currently we average over all the possible states of the
+  // DFFs in the pipeline. A better way would be to consider the Hamming
+  // distance of two consecutive signals; however, McPAT has no plan to do
+  // this in the near future as it focuses on worst-case power.
+  const double clock_dynamic = num_piperegs * dff.e_clock.readOp.dynamic;
+  const double reg_dynamic =
+      num_piperegs *
+          (dff.e_switch.readOp.dynamic + dff.e_keep_0.readOp.dynamic +
+           dff.e_keep_1.readOp.dynamic) /
+          3 +
+      clock_dynamic;
+  power.readOp.dynamic += reg_dynamic;
+  power.readOp.leakage += num_piperegs * dff.e_switch.readOp.leakage;
+  power.readOp.gate_leakage += num_piperegs * dff.e_switch.readOp.gate_leakage;
+  area.set_area(num_piperegs * dff.area.get_area());
+
+  // Longer-channel leakage is the plain leakage times a reduction factor.
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage *
+      longer_channel_device_reduction(device_ty, coredynp.core_ty);
+
+  // Apply g_tp.sckt_co_eff to every dynamic component.
+  const double socket_coeff = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= socket_coeff;
+  power.writeOp.dynamic *= socket_coeff;
+  power.searchOp.dynamic *= socket_coeff;
+
+  // Non-embedded designs also pay the macro layout overhead in area.
+  if (!coredynp.Embedded)
+    area.set_area(area.get_area() * g_tp.macro_layout_overhead);
+}
+
+void Pipeline::compute_stage_vector() { // Estimates the num_piperegs member
+  double num_stages, tot_stage_vector, per_stage_vector;
+  int opcode_length =
+      coredynp.x86 ? coredynp.micro_opcode_length : coredynp.opcode_length;
+  // x86 cores use the micro-op opcode length. NOTE(review): the core-pipeline
+  // branches accumulate onto num_piperegs with +=, they do not reset it.
+  if (!is_core_pipeline) {
+    num_piperegs = l_ip.pipeline_stages *
+                   l_ip.per_stage_vector; // The number of pipeline stages are
+                                          // calculated based on the achievable
+                                          // throughput and required throughput
+  } else {
+    if (coredynp.core_ty == Inorder) {
+      /* assume 6 pipe stages and try to estimate bits per pipe stage */
+      /* pipe stage 0/IF */
+      num_piperegs += coredynp.pc_width * 2 * coredynp.num_hthreads;
+      /* pipe stage IF/ID */
+      num_piperegs += coredynp.fetchW *
+                      (coredynp.instruction_length + coredynp.pc_width) *
+                      coredynp.num_hthreads;
+      /* pipe stage IF/ThreadSEL */
+      if (coredynp.multithreaded)
+        num_piperegs += coredynp.num_hthreads *
+                        coredynp.perThreadState; // 8 bit thread states
+      /* pipe stage ID/EXE */
+      num_piperegs += coredynp.decodeW *
+                      (coredynp.instruction_length + coredynp.pc_width +
+                       pow(2.0, opcode_length) + 2 * coredynp.int_data_width) *
+                      coredynp.num_hthreads;
+      /* pipe stage EXE/MEM */
+      num_piperegs +=
+          coredynp.issueW *
+          (3 * coredynp.arch_ireg_width + pow(2.0, opcode_length) +
+           8 * 2 * coredynp.int_data_width /*+2*powers (2,reg_length)*/);
+      /* pipe stage MEM/WB the 2^opcode_length means the total decoded signal
+       * for the opcode*/
+      num_piperegs +=
+          coredynp.issueW *
+          (2 * coredynp.int_data_width + pow(2.0, opcode_length) +
+           8 * 2 * coredynp.int_data_width /*+2*powers (2,reg_length)*/);
+      //		/* pipe stage 5/6 */
+      //		num_piperegs += issueWidth*(data_width + powers
+      //(2,opcode_length)/*+2*powers (2,reg_length)*/);
+      //		/* pipe stage 6/7 */
+      //		num_piperegs += issueWidth*(data_width + powers
+      //(2,opcode_length)/*+2*powers (2,reg_length)*/);
+      //		/* pipe stage 7/8 */
+      //		num_piperegs += issueWidth*(data_width + powers
+      //(2,opcode_length)/**2*powers (2,reg_length)*/);
+      //		/* assume 50% extra in control signals (rule of thumb)
+      //*/
+      num_stages = 6;
+
+    } else {
+      /* assume 12 stage pipe stages and try to estimate bits per pipe stage */
+      /*OOO: Fetch, decode, rename, IssueQ, dispatch, regread, EXE, MEM, WB, CM
+       */
+
+      /* pipe stage 0/IF */
+      num_piperegs +=
+          coredynp.pc_width * 2 * coredynp.num_hthreads; // PC and Next PC
+      /* pipe stage IF/ID */
+      num_piperegs +=
+          coredynp.fetchW * (coredynp.instruction_length + coredynp.pc_width) *
+          coredynp.num_hthreads; // PC is used to feed branch predictor in ID
+      /* pipe stage ID/Renaming */
+      num_piperegs +=
+          coredynp.decodeW * (coredynp.instruction_length + coredynp.pc_width) *
+          coredynp.num_hthreads; // PC is for branch exe in later stage.
+      /* pipe stage Renaming/wire_drive */
+      num_piperegs +=
+          coredynp.decodeW * (coredynp.instruction_length + coredynp.pc_width);
+      /* pipe stage Renaming/IssueQ */
+      num_piperegs += coredynp.issueW *
+                      (coredynp.instruction_length + coredynp.pc_width +
+                       3 * coredynp.phy_ireg_width) *
+                      coredynp.num_hthreads; // 3*coredynp.phy_ireg_width means
+                                             // 2 sources and 1 dest
+      /* pipe stage IssueQ/Dispatch */
+      num_piperegs += coredynp.issueW * (coredynp.instruction_length +
+                                         3 * coredynp.phy_ireg_width);
+      /* pipe stage Dispatch/EXE */
+
+      num_piperegs += coredynp.issueW *
+                      (3 * coredynp.phy_ireg_width + coredynp.pc_width +
+                       pow(2.0, opcode_length) /*+2*powers (2,reg_length)*/);
+      /* 2^opcode_length means the total decoded signal for the opcode*/
+      num_piperegs += coredynp.issueW *
+                      (2 * coredynp.int_data_width +
+                       pow(2.0, opcode_length) /*+2*powers (2,reg_length)*/);
+      /*2 source operands in EXE; Assume 2EXE stages* since we do not really
+       * distinguish OP*/
+      num_piperegs += coredynp.issueW *
+                      (2 * coredynp.int_data_width +
+                       pow(2.0, opcode_length) /*+2*powers (2,reg_length)*/);
+      /* pipe stage EXE/MEM, data need to be read/write, address*/
+      num_piperegs +=
+          coredynp.issueW *
+          (coredynp.int_data_width + coredynp.v_address_width +
+           pow(2.0,
+               opcode_length) /*+2*powers (2,reg_length)*/); // memory Opcode
+                                                             // still need to be
+                                                             // passed
+      /* pipe stage MEM/WB; result data, writeback regs */
+      num_piperegs +=
+          coredynp.issueW * (coredynp.int_data_width + coredynp.phy_ireg_width /* powers (2,opcode_length) + (2,opcode_length)+2*powers (2,reg_length)*/);
+      /* pipe stage WB/CM ; result data, regs need to be updated, address for
+       * resolve memory ops in ROB's top*/
+      num_piperegs +=
+          coredynp.commitW *
+          (coredynp.int_data_width + coredynp.v_address_width + coredynp.phy_ireg_width /*+ powers (2,opcode_length)*2*powers (2,reg_length)*/) *
+          coredynp.num_hthreads;
+      //		if (multithreaded)
+      //		{
+      //
+      //		}
+      num_stages = 12;
+    }
+
+    /* assume 50% extra in control registers and interrupt registers (rule of
+     * thumb) */
+    num_piperegs = num_piperegs * 1.5;
+    tot_stage_vector = num_piperegs;
+    per_stage_vector = tot_stage_vector / num_stages;
+    // Deeper pipelines than the assumed depth scale the count per stage.
+    if (coredynp.core_ty == Inorder) {
+      if (coredynp.pipeline_stages > 6)
+        num_piperegs = per_stage_vector * coredynp.pipeline_stages;
+    } else // OOO
+    {
+      if (coredynp.pipeline_stages > 12)
+        num_piperegs = per_stage_vector * coredynp.pipeline_stages;
+    }
+  }
+}
+
+FunctionalUnit::FunctionalUnit(ParseXML *XML_interface, int ithCore_,
+                               InputParameter *interface_ip_,
+                               const CoreDynParam &dyn_p_,
+                               enum FU_type fu_type_, double exClockRate)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), fu_type(fu_type_) {
+  double area_t; //, leakage, gate_leakage;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  clockRate = exClockRate; // coredynp.clockRate;
+  executionTime = coredynp.executionTime;
+  // cout<<"FU executionTime: "<<executionTime<<endl;
+
+  // XML_interface=_XML_interface;
+  uca_org_t result2;
+  result2 = init_interface(&interface_ip);
+  if (XML->sys.Embedded) {
+    if (fu_type == FPU) {
+      num_fu = coredynp.num_fpus;
+      // area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is
+      // um^2
+      area_t = 4.47 * 1e6 *
+               (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
+                90.0); // this is um^2 The base number
+      // 4.47 contains both VFP and NEON processing unit, VFP is about 40% and
+      // NEON is about 60%
+      if (g_ip->F_sz_nm > 90)
+        area_t = 4.47 * 1e6 *
+                 g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2
+      leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+                cmos_Isub_leakage(5 * g_tp.min_w_nmos_,
+                                  5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r,
+                                  1, inv) *
+                g_tp.peri_global.Vdd / 2; // unit W
+      gate_leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+                     cmos_Ig_leakage(
+                         5 * g_tp.min_w_nmos_,
+                         5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+                     g_tp.peri_global.Vdd / 2; // unit W
+      // energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction
+      // in FPU usually it can have up to 20 cycles.
+      //			base_energy = coredynp.core_ty==Inorder? 0:
+      // 89e-3*3; //W The base energy of ALU average numbers from Intel 4G and
+      // 773Mhz (Wattch) 			base_energy
+      //*=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+      base_energy = 0;
+      per_access_energy =
+          1.15 / 1e9 / 4 / 1.3 / 1.3 * g_tp.peri_global.Vdd *
+          g_tp.peri_global.Vdd *
+          (g_ip->F_sz_nm /
+           90.0); // g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9;
+                  // //This is per Hz energy(nJ)
+      // per_access_energy*=3;
+      // FPU power from Sandia's processor sizing tech report
+      FU_height = (18667 * num_fu) * interface_ip.F_sz_um; // FPU from Sun's
+                                                           // data
+    } else if (fu_type == ALU) {
+      num_fu = coredynp.num_alus;
+      // FIXME: The first area_t = is from updated McAPAT, the second is from
+      // our changes (conflict from base) area_t =
+      // 280*260*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU +
+      // MUl
+      area_t =
+          71.85 * 71.85 * num_fu *
+          g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2 ALU + MUl
+      leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+                cmos_Isub_leakage(20 * g_tp.min_w_nmos_,
+                                  20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r,
+                                  1, inv) *
+                g_tp.peri_global.Vdd / 2; // unit W
+      gate_leakage =
+          area_t * (g_tp.scaling_factor.core_tx_density) *
+          cmos_Ig_leakage(20 * g_tp.min_w_nmos_,
+                          20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1,
+                          inv) *
+          g_tp.peri_global.Vdd / 2;
+      leakage = 0;
+      //			base_energy = coredynp.core_ty==Inorder?
+      // 0:89e-3; //W The base energy of ALU average numbers from Intel 4G and
+      // 773Mhz (Wattch) 			base_energy
+      //*=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+      base_energy = 0;
+      // per_access_energy
+      // = 1.15/3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9;
+      // //This is per cycle energy(nJ)
+      per_access_energy = 1.29 / 1e12 / 1.3 / 1.3 * g_tp.peri_global.Vdd *
+                          g_tp.peri_global.Vdd * (g_ip->F_sz_nm / 90.0);
+      // per_access_energy*=3;
+      FU_height = (6222 * num_fu) * interface_ip.F_sz_um; // integer ALU
+
+    } else if (fu_type == MUL) {
+      num_fu = coredynp.num_muls;
+      area_t =
+          280 * 260 * 3 *
+          g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2 ALU + MUl
+      leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+                cmos_Isub_leakage(20 * g_tp.min_w_nmos_,
+                                  20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r,
+                                  1, inv) *
+                g_tp.peri_global.Vdd / 2; // unit W
+      gate_leakage =
+          area_t * (g_tp.scaling_factor.core_tx_density) *
+          cmos_Ig_leakage(20 * g_tp.min_w_nmos_,
+                          20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1,
+                          inv) *
+          g_tp.peri_global.Vdd / 2;
+      //			base_energy = coredynp.core_ty==Inorder?
+      // 0:89e-3*2; //W The base energy of ALU average numbers from Intel 4G and
+      // 773Mhz (Wattch) 			base_energy
+      //*=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);
+      base_energy = 0;
+      per_access_energy =
+          1.15 * 2 / 3 / 1e9 / 1.3 / 1.3 * g_tp.peri_global.Vdd *
+          g_tp.peri_global.Vdd *
+          (g_ip->F_sz_nm /
+           90.0); //(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2)/24;//0.00649*1e-9;
+                  ////This is per cycle energy(nJ), coefficient based on Wattch
+                  //(24 is the division ny latency: Syed)
+      // per_access_energy*=3;
+      FU_height =
+          (9334 * num_fu) * interface_ip.F_sz_um; // divider/mul from Sun's data
+    } else {
+      cout << "Unknown Functional Unit Type" << endl;
+      exit(0);
+    }
+    per_access_energy *= 0.5; // According to ARM data embedded processor has
+                              // much lower per acc energy
+  }                           /* if (XML->sys.Embedded) */
+  else {
+    if (fu_type == FPU) {
+      num_fu = coredynp.num_fpus;
+
+      /*
+      num_fu/=2; //2 DP FPUs combine to for a SP FPU
+      //area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is
+      um^2 area_t = 8.47*1e6*(g_ip->F_sz_nm*g_ip->F_sz_nm/90.0/90.0);//this is
+      um^2 if (g_ip->F_sz_nm>90) area_t
+      = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 leakage
+      = area_t
+      *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_,
+      5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      inv)*g_tp.peri_global.Vdd/2;//unit W gate_leakage = area_t
+      *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_,
+      5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      inv)*g_tp.peri_global.Vdd/2;//unit W
+      //energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction in
+      FPU usually it can have up to 20 cycles. base_energy =
+      coredynp.core_ty==Inorder? 0: 89e-3*3; //W The base energy of ALU average
+      numbers from Intel 4G and 773Mhz (Wattch) base_energy
+      *=(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2); per_access_energy
+      = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9;
+      //This is per op energy(nJ)
+      FU_height=(38667*num_fu)*interface_ip.F_sz_um;//FPU from Sun's data
+      */
+      // area_t = 8.47*1e6*g_tp.scaling_factor.logic_scaling_co_eff;//this is
+      // um^2
+      num_fu = num_fu / 2; // 2 DP FPUs combine to for a SP FPU
+      area_t = 8.47 * 1e6 *
+               (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 / 90.0); // this is um^2
+      if (g_ip->F_sz_nm > 90)
+        area_t = 8.47 * 1e6 *
+                 g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2
+      leakage =
+          37e-3; // area_t
+                 // *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(5*g_tp.min_w_nmos_,
+                 // 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+                 // inv)*g_tp.peri_global.Vdd/2;//unit W
+      gate_leakage =
+          0; // area_t
+             // *(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(5*g_tp.min_w_nmos_,
+             // 5*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+             // inv)*g_tp.peri_global.Vdd/2;//unit W
+      // energy = 0.3529/10*1e-9;//this is the energy(nJ) for a FP instruction
+      // in FPU usually it can have up to 20 cycles.
+      base_energy = coredynp.core_ty == Inorder
+                        ? 0
+                        : 89e-3 * 3; // W The base energy of ALU average numbers
+                                     // from Intel 4G and 773Mhz (Wattch)
+
+      base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / 1.2);
+
+      // Base energy (if the pipeline is not clock gated)
+      // TODO: add a check for clockgating enable
+      base_energy = SP_BASE_POWER;
+
+      // per_access_energy
+      // = 1.15*3/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9;
+      // //This is per op energy(nJ)
+      per_access_energy =
+          3.9 * 14.91 / 1e12 / 1.08 / 1.08 * g_tp.peri_global.Vdd *
+          g_tp.peri_global.Vdd *
+          (g_ip->F_sz_nm /
+           90.0); //;4.34 is scaling factor based on hardware measurements
+                  // ALU instrucitons are also executed on FPUs so add 30%
+                  // overhead for supporting ALU instrcutions per_access_energy
+                  // = 1.3*per_access_energy;
+
+      // ALU instrucitons are also executed on FPUs so add 10% overhead for
+      // supporting ALU instrcutions
+      leakage = 1.1 * leakage;
+      // cout<<"FPU Per access erngy: "<<per_access_energy/2;
+      // Divide per access energy by 2 so we have 4 DP units capapble of
+      // doing 8 SP operations
+      // per_access_energy = per_access_energy/2;
+      FU_height = (38667 * num_fu) * interface_ip.F_sz_um; // FPU from Sun's
+                                                           // data
+      per_access_energy *= 2;
+    } else if (fu_type == ALU) {
+      num_fu = coredynp.num_alus;
+      // FIXME: The first area_t = is from updated McPAT. The second is from our
+      // McPAT changes. Fix after merge area_t =
+      // 280*260*2*g_tp.scaling_factor.logic_scaling_co_eff;//this is um^2 ALU +
+      // MUl leakage = area_t
+      // *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_,
+      // 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      // inv)*g_tp.peri_global.Vdd/2;//unit W gate_leakage =
+      // area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_,
+      // 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      // inv)*g_tp.peri_global.Vdd/2;
+      area_t =
+          71.85 * 71.85 * num_fu *
+          g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2 ALU + MUl
+      // leakage = area_t
+      // *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_,
+      // 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      // inv)*g_tp.peri_global.Vdd/2;//unit W gate_leakage =
+      // area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_,
+      // 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      // inv)*g_tp.peri_global.Vdd/2; Following leakage numbers are based on
+      // Syntheis based power-estimation (SYED)
+      leakage = 2.58e-5;
+      gate_leakage = 0;
+      base_energy = coredynp.core_ty == Inorder
+                        ? 0
+                        : 89e-3; // W The base energy of ALU average numbers
+                                 // from Intel 4G and 773Mhz (Wattch)
+      base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / 1.2);
+      // per_access_energy
+      // = 1.15/1e9/4/1.3/1.3*g_tp.peri_global.Vdd*g_tp.peri_global.Vdd*(g_ip->F_sz_nm/90.0);//(g_tp.peri_global.Vdd*g_tp.peri_global.Vdd/1.2/1.2);//0.00649*1e-9;
+      // //This is per cycle energy(nJ)
+      per_access_energy = 0.8 * 1.29 / 1e12 / 1.3 / 1.3 * g_tp.peri_global.Vdd *
+                          g_tp.peri_global.Vdd * (g_ip->F_sz_nm / 90.0);
+      FU_height = (6222 * num_fu) * interface_ip.F_sz_um; // integer ALU
+      per_access_energy *= 2;
+    } else if (fu_type == MUL) {
+      num_fu = coredynp.num_muls;
+      area_t =
+          280 * 260 * 2 * 3 *
+          g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2 ALU + MUl
+      // leakage = area_t
+      // *(g_tp.scaling_factor.core_tx_density)*cmos_Isub_leakage(20*g_tp.min_w_nmos_,
+      // 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+      // inv)*g_tp.peri_global.Vdd/2;//unit W
+      leakage = 37e-3;
+      gate_leakage =
+          0; // area_t*(g_tp.scaling_factor.core_tx_density)*cmos_Ig_leakage(20*g_tp.min_w_nmos_,
+             // 20*g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, 1,
+             // inv)*g_tp.peri_global.Vdd/2;
+      base_energy = coredynp.core_ty == Inorder
+                        ? 0
+                        : 89e-3 * 2; // W The base energy of ALU average numbers
+                                     // from Intel 4G and 773Mhz (Wattch)
+      base_energy *= (g_tp.peri_global.Vdd * g_tp.peri_global.Vdd / 1.2 / 1.2);
+      base_energy = SFU_BASE_POWER;
+      // SFU is modelled as a double preicison FPU
+      per_access_energy =
+          8 * 14.91 / 1e12 / 1.08 / 1.08 * g_tp.peri_global.Vdd *
+          g_tp.peri_global.Vdd *
+          (g_ip->F_sz_nm /
+           90.0); // 1.5 is scaling factor based on hardware measuremetns
+      FU_height =
+          (9334 * num_fu) * interface_ip.F_sz_um; // divider/mul from Sun's data
+      per_access_energy *= 2;
+    }
+
+    else {
+      cout << "Unknown Functional Unit Type" << endl;
+      exit(0);
+    }
+  }
+  // IEXEU, simple ALU and FPU
+  //  double C_ALU, C_EXEU, C_FPU; //Lum Equivalent capacitance of IEXEU and
+  //  FPU. Based on Intel and Sun 90nm process fabracation.
+  //
+  //  C_ALU	  = 0.025e-9;//F
+  //  C_EXEU  = 0.05e-9; //F
+  //  C_FPU	  = 0.35e-9;//F
+  area.set_area(area_t * num_fu);
+  leakage *= num_fu;
+  gate_leakage *= num_fu;
+  double macro_layout_overhead = g_tp.macro_layout_overhead;
+  //	if (!XML->sys.Embedded)
+  area.set_area(area.get_area() * macro_layout_overhead);
+}
+
+void FunctionalUnit::computeEnergy(bool is_tdp) {
+  // Fills `power` (is_tdp) or `rt_power` (!is_tdp) for this functional unit.
+  executionTime =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6); // Syed
+  double pppm_t[4] = {1, 1, 1, 1};
+  double FU_duty_cycle = 0; // defined even if fu_type matches no branch below
+  if (is_tdp) {
+    // Peak estimate: the access count is set to num_fu for every unit type.
+    set_pppm(pppm_t, 2, 2, 2, 2); // 2 means two source operands needs to be
+                                  // passed for each int instruction.
+    if (fu_type == FPU) {
+      stats_t.readAc.access = num_fu;
+      tdp_stats = stats_t;
+      // Syed: FPU power numbers are already average
+      // so activity factor is already accounted for
+      FU_duty_cycle = coredynp.FPU_duty_cycle;
+    } else if (fu_type == ALU) {
+      stats_t.readAc.access = 1 * num_fu;
+      tdp_stats = stats_t;
+      FU_duty_cycle = coredynp.ALU_duty_cycle;
+    } else if (fu_type == MUL) {
+      stats_t.readAc.access = num_fu;
+      tdp_stats = stats_t;
+      FU_duty_cycle = coredynp.MUL_duty_cycle;
+    }
+
+    // Dynamic term: per-access energy times the access count, plus base_energy
+    // divided by clockRate; displayEnergy() multiplies by clockRate to print W.
+    power.readOp.dynamic =
+        per_access_energy * stats_t.readAc.access + base_energy / clockRate;
+    double sckRation = g_tp.sckt_co_eff;
+    power.readOp.dynamic *= sckRation * FU_duty_cycle;
+    power.writeOp.dynamic *= sckRation;
+    power.searchOp.dynamic *= sckRation;
+
+    power.readOp.leakage = leakage;
+    power.readOp.gate_leakage = gate_leakage;
+    double long_channel_device_reduction =
+        longer_channel_device_reduction(Core_device, coredynp.core_ty);
+    power.readOp.longer_channel_leakage =
+        power.readOp.leakage * long_channel_device_reduction;
+
+  } else {
+    if (fu_type == FPU) {
+      // The FPU access count reported for this core is used unchanged
+      // (no halving is applied here).
+      stats_t.readAc.access = XML->sys.core[ithCore].fpu_accesses;
+      rtp_stats = stats_t;
+      // cout<<"FPU: --accesses "<<stats_t.readAc.access <<endl;
+
+    } else if (fu_type == ALU) {
+      stats_t.readAc.access = XML->sys.core[ithCore].ialu_accesses;
+      rtp_stats = stats_t;
+      // cout<<"ALU: --accesses "<<stats_t.readAc.access <<endl;
+    } else if (fu_type == MUL) {
+      stats_t.readAc.access = XML->sys.core[ithCore].mul_accesses;
+      rtp_stats = stats_t;
+      // cout<<"MUL: --accesses "<<stats_t.readAc.access <<endl;
+    }
+
+    // Only the ALU folds base_energy * executionTime into the scaled term;
+    // FPU and MUL add a lane-weighted base term after scaling (see below).
+
+    if (fu_type == ALU) {
+      rt_power.readOp.dynamic = per_access_energy * stats_t.readAc.access +
+                                base_energy * executionTime;
+    } else {
+      rt_power.readOp.dynamic = per_access_energy * stats_t.readAc.access;
+    }
+
+    double sckRation = g_tp.sckt_co_eff;
+    rt_power.readOp.dynamic *= sckRation;
+    rt_power.writeOp.dynamic *= sckRation;
+    rt_power.searchOp.dynamic *= sckRation;
+    // Base term for FPU/MUL, weighted by (32 - average active lanes).
+    if (fu_type == FPU) {
+      rt_power.readOp.dynamic +=
+          base_energy * executionTime *
+          (32 - XML->sys.core[ithCore].sp_average_active_lanes);
+    }
+    if (fu_type == MUL) {
+      if (XML->sys.core[ithCore].sfu_average_active_lanes >= 1)
+        rt_power.readOp.dynamic +=
+            base_energy * executionTime *
+            (32 - XML->sys.core[ithCore].sfu_average_active_lanes);
+    }
+
+  } /* else */
+}
+
+void FunctionalUnit::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device;
+  // Prints this unit's area, peak dynamic power, leakage and runtime dynamic
+  // power to cout, but only when is_tdp is true; plevel is not read here.
+  // Headings are indented by `indent` spaces, values by `indent + 2`.
+  if (is_tdp) {
+    if (fu_type == FPU) {
+      cout << indent_str
+           << "Floating Point Units (FPUs) (Count: " << coredynp.num_fpus
+           << " ):" << endl;
+      cout << indent_str_next << "Area = " << area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << power.readOp.dynamic * clockRate << " W"
+           << endl;
+      cout << indent_str_next << "Peak Energy = " << power.readOp.dynamic
+           << " J" << endl;
+
+      // Leakage line: the longer-channel figure is printed when long_channel
+      // is set, otherwise the plain value. The Peak Energy line above is
+      // printed for the FPU only.
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? power.readOp.longer_channel_leakage
+                            : power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << rt_power.readOp.dynamic / executionTime
+           << " W" << endl;
+      cout << endl;
+    } else if (fu_type == ALU) {
+      cout << indent_str << "Integer ALUs (Count: " << coredynp.num_alus
+           << " ):" << endl;
+      cout << indent_str_next << "Area = " << area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << power.readOp.dynamic * clockRate << " W"
+           << endl;
+      // Same leakage selection as the FPU branch: longer-channel value when
+      // long_channel is set, plain subthreshold leakage otherwise.
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? power.readOp.longer_channel_leakage
+                            : power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << rt_power.readOp.dynamic / executionTime
+           << " W" << endl;
+      cout << endl;
+    } else if (fu_type == MUL) {
+      cout << indent_str
+           << "Complex ALUs (Mul/Div) (Count: " << coredynp.num_muls
+           << " ):" << endl;
+      cout << indent_str_next << "Area = " << area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << power.readOp.dynamic * clockRate << " W"
+           << endl;
+      // NOTE(review): Runtime Dynamic divides by executionTime in all three
+      // branches; confirm computeEnergy() has set it before this is called.
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? power.readOp.longer_channel_leakage
+                            : power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << rt_power.readOp.dynamic / executionTime
+           << " W" << endl;
+      cout << endl;
+    }
+
+  } else { // nothing is printed when is_tdp is false
+  }
+}
+
+void FunctionalUnit::leakage_feedback(double temperature) {
+  // Recomputes leakage for `temperature`, rounded to the nearest multiple of 10.
+  interface_ip.temp = (unsigned int)round(temperature / 10.0) * 10;
+  // Re-run interface initialisation with the updated temperature.
+  uca_org_t init_result = init_interface(&interface_ip); // init_result is dummy
+
+  // Analytic per-type area and leakage (cf. the FunctionalUnit constructor).
+  double area_t, leakage, gate_leakage;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+
+  if (fu_type == FPU) {
+    area_t = 4.47 * 1e6 *
+             (g_ip->F_sz_nm * g_ip->F_sz_nm / 90.0 /
+              90.0); // this is um^2 The base number
+    if (g_ip->F_sz_nm > 90)
+      area_t =
+          4.47 * 1e6 * g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2
+    leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+              cmos_Isub_leakage(5 * g_tp.min_w_nmos_,
+                                5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1,
+                                inv) *
+              g_tp.peri_global.Vdd / 2; // unit W
+    gate_leakage =
+        area_t * (g_tp.scaling_factor.core_tx_density) *
+        cmos_Ig_leakage(5 * g_tp.min_w_nmos_,
+                        5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+        g_tp.peri_global.Vdd / 2; // unit W
+  } else if (fu_type == ALU) {
+    area_t = 280 * 260 * 2 * num_fu *
+             g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2 ALU + MUl
+    leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+              cmos_Isub_leakage(20 * g_tp.min_w_nmos_,
+                                20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r,
+                                1, inv) *
+              g_tp.peri_global.Vdd / 2; // unit W
+    gate_leakage =
+        area_t * (g_tp.scaling_factor.core_tx_density) *
+        cmos_Ig_leakage(20 * g_tp.min_w_nmos_,
+                        20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+        g_tp.peri_global.Vdd / 2;
+  } else if (fu_type == MUL) {
+    area_t = 280 * 260 * 2 * 3 * num_fu *
+             g_tp.scaling_factor.logic_scaling_co_eff; // this is um^2 ALU + MUl
+    leakage = area_t * (g_tp.scaling_factor.core_tx_density) *
+              cmos_Isub_leakage(20 * g_tp.min_w_nmos_,
+                                20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r,
+                                1, inv) *
+              g_tp.peri_global.Vdd / 2; // unit W
+    gate_leakage =
+        area_t * (g_tp.scaling_factor.core_tx_density) *
+        cmos_Ig_leakage(20 * g_tp.min_w_nmos_,
+                        20 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+        g_tp.peri_global.Vdd / 2;
+  } else {
+    cout << "Unknown Functional Unit Type" << endl;
+    exit(1);
+  }
+  // Totals over num_fu units; longer-channel leakage = leakage * reduction.
+  power.readOp.leakage = leakage * num_fu;
+  power.readOp.gate_leakage = gate_leakage * num_fu;
+  power.readOp.longer_channel_leakage = power.readOp.leakage *
+      longer_channel_device_reduction(Core_device, coredynp.core_ty);
+}
+
+UndiffCore::UndiffCore(ParseXML *XML_interface, int ithCore_,
+                       InputParameter *interface_ip_,
+                       const CoreDynParam &dyn_p_, bool exist_, bool embedded_)
+    : XML(XML_interface), ithCore(ithCore_), interface_ip(*interface_ip_),
+      coredynp(dyn_p_), core_ty(coredynp.core_ty), embedded(XML->sys.Embedded),
+      pipeline_stage(coredynp.pipeline_stages),
+      num_hthreads(coredynp.num_hthreads), issue_width(coredynp.issueW),
+      exist(exist_)
+// Derives undifferentiated-core area and leakage; returns early if !exist.
+{
+  if (!exist)
+    return;
+  double undifferentiated_core = 0;
+  double core_tx_density = 0;
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  double undifferentiated_core_coe;
+  // NOTE: embedded_ is never read; `embedded` comes from XML->sys.Embedded.
+  uca_org_t result2;
+  result2 = init_interface(&interface_ip);
+
+  // Compute undifferentiated core area at 90nm.
+  if (embedded == false) {
+    // Based on the results of polynomial/log curve fitting based on
+    // undifferentiated core of Niagara, Niagara2, Merom, Penyrn, Prescott,
+    // Opteron die measurements
+    if (core_ty == OOO) {
+      // undifferentiated_core = (0.0764*pipeline_stage*pipeline_stage
+      // -2.3685*pipeline_stage + 10.405);//OOO
+      undifferentiated_core = (3.57 * log(pipeline_stage) - 1.2643) > 0
+                                  ? (3.57 * log(pipeline_stage) - 1.2643)
+                                  : 0;
+    } else if (core_ty == Inorder) {
+      // undifferentiated_core = (0.1238*pipeline_stage + 7.2572)*0.9;//inorder
+      undifferentiated_core = (-2.19 * log(pipeline_stage) + 6.55) > 0
+                                  ? (-2.19 * log(pipeline_stage) + 6.55)
+                                  : 0;
+    } else {
+      cout << "invalid core type" << endl;
+      exit(0); // NOTE(review): error path terminates with exit status 0
+    }
+    undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0716);
+  } else {
+    // Based on the results in paper "parametrized processor models" Sandia Labs
+    if (XML->sys.opt_clockrate)
+      undifferentiated_core_coe = 0.05;
+    else
+      undifferentiated_core_coe = 0;
+    undifferentiated_core =
+        (0.4109 * pipeline_stage - 0.776) * undifferentiated_core_coe;
+    undifferentiated_core *= (1 + logtwo(num_hthreads) * 0.0426);
+  }
+
+  undifferentiated_core *= g_tp.scaling_factor.logic_scaling_co_eff *
+                           1e6; // change from mm^2 to um^2
+  core_tx_density = g_tp.scaling_factor.core_tx_density;
+  // undifferentiated_core = 3*1e6;
+  // undifferentiated_core *=
+  // g_tp.scaling_factor.logic_scaling_co_eff;//(g_ip->F_sz_um*g_ip->F_sz_um/0.09/0.09)*;
+  power.readOp.leakage =
+      undifferentiated_core *
+      (core_tx_density)*cmos_Isub_leakage(
+          5 * g_tp.min_w_nmos_, 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1,
+          inv) *
+      g_tp.peri_global.Vdd; // unit W
+  power.readOp.gate_leakage =
+      undifferentiated_core *
+      (core_tx_density)*cmos_Ig_leakage(
+          5 * g_tp.min_w_nmos_, 5 * g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1,
+          inv) *
+      g_tp.peri_global.Vdd;
+
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Core_device, coredynp.core_ty);
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage * long_channel_device_reduction;
+  area.set_area(undifferentiated_core);
+
+  scktRatio = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= scktRatio;
+  power.writeOp.dynamic *= scktRatio;
+  power.searchOp.dynamic *= scktRatio;
+  macro_PR_overhead = g_tp.macro_layout_overhead;
+  area.set_area(area.get_area() * macro_PR_overhead);
+  // Disabled short-circuit / dynamic power experiment, kept for reference:
+  //    double vt=g_tp.peri_global.Vth;
+  //    double velocity_index=1.1;
+  //    double c_in=gate_C(g_tp.min_w_nmos_,
+  // g_tp.min_w_nmos_*pmos_to_nmos_sizing_r , 0.0, false);    double
+  // c_out= drain_C_(g_tp.min_w_nmos_, NCH, 2, 1, g_tp.cell_h_def, false) +
+  // drain_C_(g_tp.min_w_nmos_*pmos_to_nmos_sizing_r, PCH, 1, 1,
+  // g_tp.cell_h_def, false) + c_in;    double w_nmos=g_tp.min_w_nmos_;
+  // double w_pmos=g_tp.min_w_nmos_*pmos_to_nmos_sizing_r;    double
+  // i_on_n=1.0;    double
+  // i_on_p=1.0;    double i_on_n_in=1.0;    double
+  // i_on_p_in=1; double vdd=g_tp.peri_global.Vdd;
+
+  //    power.readOp.sc=shortcircuit_simple(vt, velocity_index, c_in,
+  // c_out, w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
+  //    power.readOp.dynamic=c_out*vdd*vdd/2;
+
+  //    cout<<power.readOp.dynamic << "dynamic" <<endl;
+  //    cout<<power.readOp.sc << "sc" << endl;
+
+  //    power.readOp.sc=shortcircuit(vt, velocity_index, c_in, c_out,
+  // w_nmos,w_pmos, i_on_n, i_on_p,i_on_n_in, i_on_p_in, vdd);
+  //    power.readOp.dynamic=c_out*vdd*vdd/2;
+  //
+  //    cout<<power.readOp.dynamic << "dynamic" <<endl;
+  //    cout<<power.readOp.sc << "sc" << endl;
+}
+
+void UndiffCore::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device;
+  // Prints area, peak dynamic power and leakage to cout; plevel is not read.
+  if (is_tdp) {
+    cout << indent_str << "UndiffCore:" << endl;
+    cout << indent_str_next << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << power.readOp.dynamic * clockRate << " W"
+         << endl;
+    // TDP pass: report the longer-channel leakage when long_channel is set,
+    // otherwise the plain subthreshold leakage.
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? power.readOp.longer_channel_leakage
+                          : power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage
+         << " W" << endl;
+    // cout << indent_str_next << "Runtime Dynamic = " <<
+    // rt_power.readOp.dynamic/executionTime << " W" << endl;
+    cout << endl;
+  } else { // non-TDP pass: same lines, but always the plain leakage value
+    cout << indent_str << "UndiffCore:" << endl;
+    cout << indent_str_next << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << power.readOp.dynamic * clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage
+         << " W" << endl;
+    // cout << indent_str_next << "Runtime Dynamic = " <<
+    // rt_power.readOp.dynamic/executionTime << " W" << endl;
+    cout << endl;
+  }
+}
+
+inst_decoder::inst_decoder(bool _is_default,
+                           const InputParameter *configure_interface,
+                           int opcode_length_, int num_decoders_, bool x86_,
+                           enum Device_ty device_ty_, enum Core_type core_ty_)
+    : is_default(_is_default), opcode_length(opcode_length_),
+      num_decoders(num_decoders_), x86(x86_), device_ty(device_ty_),
+      core_ty(core_ty_) {
+  /*
+   * Instruction decoder is different from n to 2^n decoders
+   * that are commonly used in row decoders in memory arrays.
+   * The RISC instruction decoder is typically a very simple device.
+   * We can decode an instruction by simply
+   * separating the machine word into small parts using wire slices.
+   * The RISC instruction decoder can be approximated by the n to 2^n decoders,
+   * although this approximation usually underestimates power since each
+   * decoded instruction normally has more than 1 active signal.
+   *
+   * However, decoding a CISC instruction word is much more difficult
+   * than the RISC case. A CISC decoder is typically set up as a state machine.
+   * The machine reads the opcode field to determine
+   * what type of instruction it is,
+   * and where the other data values are.
+   * The instruction word is read in piece by piece,
+   * and decisions are made at each stage as to
+   * how the remainder of the instruction word will be read.
+   * (sequencer and ROM are usually needed)
+   * An x86 decoder can be even more complex since
+   * it involves both decoding instructions into u-ops and
+   * merging u-ops when doing micro-ops fusion.
+   */
+  bool is_dram = false;
+  double pmos_to_nmos_sizing_r;
+  double load_nmos_width, load_pmos_width;
+  double C_driver_load, R_wire_load;
+  Area cell;
+
+  l_ip = *configure_interface;
+  local_result = init_interface(&l_ip);
+  cell.h = g_tp.cell_h_def;
+  cell.w = g_tp.cell_h_def;
+  // Opcodes wider than 18 bits: count ceil(len / 18) segments, clamp len to 18.
+  num_decoder_segments = (int)ceil(opcode_length / 18.0);
+  if (opcode_length > 18)
+    opcode_length = 18;
+  num_decoded_signals = (int)pow(2.0, opcode_length);
+  pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  load_nmos_width = g_tp.max_w_nmos_ / 2;
+  load_pmos_width = g_tp.max_w_nmos_ * pmos_to_nmos_sizing_r;
+  C_driver_load =
+      1024 * gate_C(load_nmos_width + load_pmos_width, 0,
+                    is_dram); // TODO: this number 1024 needs to be revisited
+  R_wire_load = 3000 * l_ip.F_sz_um * g_tp.wire_outside_mat.R_per_um;
+
+  final_dec = new Decoder(num_decoded_signals, false, C_driver_load,
+                          R_wire_load, false /*is_fa*/, false /*is_dram*/,
+                          false /*wl_tr*/, // to use peri device
+                          cell);
+  // Two predecode blocks are built against final_dec, each with its own driver.
+  PredecBlk *predec_blk1 =
+      new PredecBlk(num_decoded_signals, final_dec,
+                    0, // Assuming predec and dec are back to back
+                    0,
+                    1, // Each Predec only drives one final dec
+                    false /*is_dram*/, true);
+  PredecBlk *predec_blk2 =
+      new PredecBlk(num_decoded_signals, final_dec,
+                    0, // Assuming predec and dec are back to back
+                    0,
+                    1, // Each Predec only drives one final dec
+                    false /*is_dram*/, false);
+
+  PredecBlkDrv *predec_blk_drv1 = new PredecBlkDrv(0, predec_blk1, false);
+  PredecBlkDrv *predec_blk_drv2 = new PredecBlkDrv(0, predec_blk2, false);
+
+  pre_dec = new Predec(predec_blk_drv1, predec_blk_drv2);
+  // Area: final decoder and predecoders, scaled by segment and decoder counts.
+  double area_decoder = final_dec->area.get_area() * num_decoded_signals *
+                        num_decoder_segments * num_decoders;
+  // double w_decoder    = area_decoder / area.get_h();
+  double area_pre_dec =
+      (predec_blk_drv1->area.get_area() + predec_blk_drv2->area.get_area() +
+       predec_blk1->area.get_area() + predec_blk2->area.get_area()) *
+      num_decoder_segments * num_decoders;
+  area.set_area(area.get_area() + area_decoder + area_pre_dec);
+  double macro_layout_overhead = g_tp.macro_layout_overhead;
+  double chip_PR_overhead = g_tp.chip_layout_overhead;
+  area.set_area(area.get_area() * macro_layout_overhead * chip_PR_overhead);
+  // Accumulate decoder power, then scale dynamic terms by g_tp.sckt_co_eff.
+  inst_decoder_delay_power();
+
+  double sckRation = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= sckRation;
+  power.writeOp.dynamic *= sckRation;
+  power.searchOp.dynamic *= sckRation;
+
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(device_ty, core_ty);
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage * long_channel_device_reduction;
+}
+
+void inst_decoder::inst_decoder_delay_power() {
+  // Adds the pre-decoder and final-decoder power to this component's power.
+  // An x86 decoder is modelled with two sequencer passes, otherwise one.
+  const double passes = x86 ? 2 : 1;
+  const double seg_passes = passes * num_decoder_segments;
+  double weights[4] = {1, 1, 1, 1};
+
+  set_pppm(weights, seg_passes, num_decoder_segments, seg_passes,
+           num_decoder_segments);
+  power = power + pre_dec->power * weights;
+  set_pppm(weights, seg_passes, num_decoder_segments * num_decoded_signals,
+           num_decoder_segments * num_decoded_signals, seg_passes);
+  power = power + final_dec->power * weights;
+}
+void inst_decoder::leakage_feedback(double temperature) {
+  // Re-initialise the interface at the temperature rounded to the nearest 10,
+  // let both decoder stages refresh themselves, then rebuild power from them.
+  l_ip.temp = (unsigned int)round(temperature / 10.0) * 10;
+  uca_org_t unused_result = init_interface(&l_ip); // return value not needed
+
+  final_dec->leakage_feedback(temperature);
+  pre_dec->leakage_feedback(temperature);
+
+  const double passes = x86 ? 2 : 1; // sequencer passes
+  const double seg_passes = passes * num_decoder_segments;
+  double weights[4] = {1, 1, 1, 1};
+
+  // power is overwritten here, whereas inst_decoder_delay_power() accumulates.
+  set_pppm(weights, seg_passes, num_decoder_segments, seg_passes,
+           num_decoder_segments);
+  power = pre_dec->power * weights;
+  set_pppm(weights, seg_passes, num_decoder_segments * num_decoded_signals,
+           num_decoder_segments * num_decoded_signals, seg_passes);
+  power = power + final_dec->power * weights;
+
+  // Apply the g_tp.sckt_co_eff factor to every dynamic term.
+  const double sckt = g_tp.sckt_co_eff;
+  power.readOp.dynamic *= sckt;
+  power.writeOp.dynamic *= sckt;
+  power.searchOp.dynamic *= sckt;
+
+  // Longer-channel leakage = leakage scaled by the device reduction factor.
+  power.readOp.longer_channel_leakage =
+      power.readOp.leakage * longer_channel_device_reduction(device_ty, core_ty);
+}
+
+inst_decoder::~inst_decoder() {
+  local_result.cleanup();
+  // final_dec and pre_dec were allocated with new in the constructor.
+  delete final_dec;
+  // The blocks and drivers are deleted through pre_dec before pre_dec itself.
+  delete pre_dec->blk1;
+  delete pre_dec->blk2;
+  delete pre_dec->drv1;
+  delete pre_dec->drv2;
+  delete pre_dec;
+}
diff --git a/src/gpuwattch/logic.h b/src/gpuwattch/logic.h
new file mode 100644
index 000000000..c93aedcf4
--- /dev/null
+++ b/src/gpuwattch/logic.h
@@ -0,0 +1,238 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ *   Jingwen Leng, University of Texas, Austin
+ *   Syed Gilani, University of Wisconsin–Madison
+ *   Tayler Hetherington, University of British Columbia
+ *   Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+#ifndef LOGIC_H_
+#define LOGIC_H_
+
+#include "XML_Parse.h"
+#include "arch_const.h"
+#include "basic_components.h"
+#include "cacti/basic_circuit.h"
+#include "cacti/cacti_interface.h"
+#include "cacti/component.h"
+#include "cacti/const.h"
+#include "cacti/decoder.h"
+#include "cacti/parameter.h"
+#include "xmlParser.h"
+#include <cassert>
+#include <cmath>
+#include <cstring>
+#include <iostream>
+
+using namespace std;
+
+class selection_logic : public Component { // Component subclass
+public:
+  selection_logic(
+      bool _is_default, int win_entries_, int issue_width_,
+      const InputParameter *configure_interface,
+      enum Device_ty device_ty_ = Core_device,
+      enum Core_type core_ty_ = Inorder); //, const ParseXML *_XML_interface);
+  bool is_default;
+  InputParameter l_ip; // held by value, not as a pointer
+  uca_org_t local_result;
+  const ParseXML *XML_interface; // XML ctor argument is commented out above
+  int win_entries; // presumably set from win_entries_ — confirm in logic.cc
+  int issue_width; // presumably set from issue_width_ — confirm in logic.cc
+  int num_threads; // no matching constructor argument is declared
+  enum Device_ty device_ty;
+  enum Core_type core_ty;
+  // NOTE(review): no destructor is declared for this class.
+  void selection_power();
+  void leakage_feedback(double temperature); // TODO
+};
+
+class dep_resource_conflict_check : public Component { // Component subclass
+public:
+  dep_resource_conflict_check(const InputParameter *configure_interface,
+                              const CoreDynParam &dyn_p_, int compare_bits_,
+                              bool _is_default = true);
+  InputParameter l_ip;
+  uca_org_t local_result; // released by the destructor via cleanup()
+  double WNORn, WNORp, Wevalinvp, Wevalinvn, Wcompn, Wcompp, Wcomppreequ;
+  CoreDynParam coredynp; // held by value
+  int compare_bits; // presumably set from compare_bits_ — confirm in logic.cc
+  bool is_default;
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  // The two methods below are only declared in this header.
+  void conflict_check_power();
+  double compare_cap();
+  ~dep_resource_conflict_check() { local_result.cleanup(); }
+  // Only declared here; the definition is not part of this header.
+  void leakage_feedback(double temperature);
+};
+
+class inst_decoder : public Component { // Component subclass
+public:
+  inst_decoder(bool _is_default, const InputParameter *configure_interface,
+               int opcode_length_, int num_decoders_, bool x86_,
+               enum Device_ty device_ty_ = Core_device,
+               enum Core_type core_ty_ = Inorder);
+  inst_decoder(); // NOTE(review): default ctor declared; definition not seen
+  bool is_default;
+  int opcode_length;
+  int num_decoders;
+  bool x86; // leakage_feedback() uses 2 sequencer passes when true, else 1
+  int num_decoder_segments;
+  int num_decoded_signals;
+  InputParameter l_ip;
+  uca_org_t local_result; // cleaned up in ~inst_decoder()
+  enum Device_ty device_ty;
+  enum Core_type core_ty;
+  // Raw pointers; ~inst_decoder() deletes both, plus pre_dec's blk/drv members.
+  Decoder *final_dec;
+  Predec *pre_dec;
+  // NOTE(review): no copy constructor/assignment is declared for this owner.
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  void inst_decoder_delay_power();
+  ~inst_decoder();
+  void leakage_feedback(double temperature); // rounds temperature to tens
+};
+
+class DFFCell : public Component { // Component subclass
+public:
+  DFFCell(bool _is_dram, double _WdecNANDn, double _WdecNANDp,
+          double _cell_load, const InputParameter *configure_interface);
+  InputParameter l_ip; // held by value
+  bool is_dram;
+  double cell_load;
+  double WdecNANDn;
+  double WdecNANDp;
+  double clock_cap;
+  int model;
+  int n_switch; // presumably pairs with e_switch below — confirm in logic.cc
+  int n_keep_1; // presumably pairs with e_keep_1
+  int n_keep_0; // presumably pairs with e_keep_0
+  int n_clock;  // presumably pairs with e_clock
+  powerDef e_switch;
+  powerDef e_keep_1;
+  powerDef e_keep_0;
+  powerDef e_clock;
+  // The two methods below are only declared in this header.
+  double fpfp_node_cap(unsigned int fan_in, unsigned int fan_out);
+  void compute_DFF_cell(void);
+};
+
+class Pipeline : public Component { // Component subclass
+public:
+  Pipeline(const InputParameter *configure_interface,
+           const CoreDynParam &dyn_p_, enum Device_ty device_ty_ = Core_device,
+           bool _is_core_pipeline = true, bool _is_default = true);
+  InputParameter l_ip;
+  uca_org_t local_result; // released by ~Pipeline() via cleanup()
+  CoreDynParam coredynp;  // held by value
+  enum Device_ty device_ty;
+  bool is_core_pipeline, is_default;
+  double num_piperegs;
+  // Disabled members: int pipeline_stages;
+  // int tot_stage_vector, per_stage_vector;
+  bool process_ind;
+  double WNANDn;
+  double WNANDp;
+  double load_per_pipeline_stage;
+  // Disabled members: int Hthread, num_thread, fetchWidth, decodeWidth,
+  // issueWidth, commitWidth, instruction_length; int PC_width, opcode_length,
+  // num_arch_reg_tag, data_width, num_phsical_reg_tag, address_width;
+  // bool thread_clock_gated; bool in_order, multithreaded;
+  void compute_stage_vector();
+  void compute();
+  ~Pipeline() { local_result.cleanup(); }; // trailing ';' is an empty decl
+};
+
+// class core_pipeline :public pipeline{
+// public:
+//	int  Hthread,  num_thread, fetchWidth, decodeWidth, issueWidth,
+// commitWidth, instruction_length; 	int  PC_width, opcode_length,
+// num_arch_reg_tag, data_width,num_phsical_reg_tag, address_width; 	bool
+// thread_clock_gated; 	bool in_order, multithreaded; 	core_pipeline(bool
+//_is_default, const InputParameter *configure_interface); 	virtual void
+// compute_stage_vector();
+//
+//};
+
+class FunctionalUnit : public Component { // Component subclass
+public:
+  ParseXML *XML; // raw pointer; no destructor is declared in this class
+  int ithCore;
+  InputParameter interface_ip; // held by value
+  CoreDynParam coredynp;       // held by value
+  double FU_height;
+  double clockRate, executionTime;
+  double num_fu;
+  double energy, base_energy, per_access_energy, leakage, gate_leakage;
+  bool is_default;
+  enum FU_type fu_type;
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  // NOTE: the constructor's fu_type parameter has the same name as the member.
+  FunctionalUnit(ParseXML *XML_interface, int ithCore_,
+                 InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+                 enum FU_type fu_type, double exClockRate);
+  void computeEnergy(bool is_tdp = true); // is_tdp defaults to true
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  void leakage_feedback(double temperature);
+};
+
+class UndiffCore : public Component { // Component subclass
+public:
+  UndiffCore(ParseXML *XML_interface, int ithCore_,
+             InputParameter *interface_ip_, const CoreDynParam &dyn_p_,
+             bool exist_ = true, bool embedded_ = false);
+  ParseXML *XML; // raw pointer; the destructor below does not delete it
+  int ithCore;
+  InputParameter interface_ip; // held by value
+  CoreDynParam coredynp;       // held by value
+  double clockRate, executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  enum Core_type core_ty;
+  bool opt_performance, embedded;
+  double pipeline_stage, num_hthreads, issue_width;
+  bool is_default;
+  // Only displayEnergy() is declared; there is no computeEnergy() here.
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~UndiffCore(){}; // empty destructor
+  bool exist; // presumably set from exist_ — confirm in logic.cc
+};
+#endif /* LOGIC_H_ */
diff --git a/src/gpuwattch/main.cc b/src/gpuwattch/main.cc
new file mode 100644
index 000000000..9ff79fc1c
--- /dev/null
+++ b/src/gpuwattch/main.cc
@@ -0,0 +1,95 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+#include "XML_Parse.h"
+#include "globalvar.h"
+#include "io.h"
+#include "processor.h"
+#include "version.h"
+#include "xmlParser.h"
+#include <iostream>
+
+using namespace std;
+
+void print_usage(char *argv0);
+
+int main(int argc, char *argv[]) {
+  // Parses the command line, loads the XML description, prints the results.
+  char *fb = 0; // input XML path; stays null until -infile supplies one
+  bool infile_specified = false;
+  int plevel = 2;
+  opt_for_clk = true;
+  // cout.precision(10);
+  if (argc <= 1 || argv[1] == string("-h") || argv[1] == string("--help")) {
+    print_usage(argv[0]);
+  }
+
+  // Each option consumes the argument after it, so an option is honoured only
+  // when a value follows; a dangling option at the end is ignored and never
+  // reads argv[argc], which is a null pointer.
+  for (int32_t i = 0; i < argc; i++) {
+    const string arg(argv[i]);
+    const bool has_value = (i + 1 < argc);
+    if (arg == "-infile" && has_value) {
+      infile_specified = true;
+      fb = argv[++i];
+    } else if (arg == "-print_level" && has_value) {
+      plevel = atoi(argv[++i]);
+    } else if (arg == "-opt_for_clk" && has_value) {
+      opt_for_clk = (atoi(argv[++i]) != 0);
+    }
+  }
+  // print_usage() exits, so fb is non-null past this point.
+  if (infile_specified == false) {
+    print_usage(argv[0]);
+  }
+
+  cout << "McPAT (version " << VER_MAJOR << "." << VER_MINOR << " of "
+       << VER_UPDATE << ") is computing the target processor...\n " << endl;
+
+  // parse XML-based interface
+  ParseXML *p1 = new ParseXML();
+  p1->parse(fb);
+  Processor proc(p1);
+  proc.displayEnergy(2, plevel);
+  delete p1;
+  return 0;
+}
+
+void print_usage(char *argv0) {
+  static const char *const synopsis =
+      "  mcpat -infile <input file name>  -print_level < level of details "
+      "0~5 >  -opt_for_clk < 0 (optimize for ED^2P only)/1 (optimzed for "
+      "target clock rate)>";
+  cerr << "How to use McPAT:" << endl << synopsis << endl;
+  // Disabled hint: "Note:default print level is at processor level, please
+  // increase it to see the details". argv0 is accepted but not printed.
+  exit(1); // always terminates the process with status 1
+}
diff --git a/src/gpuwattch/makefile b/src/gpuwattch/makefile
new file mode 100644
index 000000000..ab718ccc4
--- /dev/null
+++ b/src/gpuwattch/makefile
@@ -0,0 +1,35 @@
+TAR = mcpat
+# Wrapper makefile: every target re-invokes make on $(TAR).mk with a TAG.
+.PHONY: all dbg opt depend clean clean_dbg clean_opt clean_cacti
+
+all: opt
+
+dbg: $(TAR).mk obj_dbg
+	@$(MAKE) TAG=dbg -C . -f $(TAR).mk
+
+opt: $(TAR).mk obj_opt
+	@$(MAKE) TAG=opt -C . -f $(TAR).mk
+# obj_dbg / obj_opt are real directories, so they are deliberately not phony.
+obj_dbg:
+	mkdir $@
+
+obj_opt:
+	mkdir $@
+
+depend:
+	@$(MAKE) TAG=opt -C . -f $(TAR).mk depend
+
+clean: clean_dbg clean_opt clean_cacti
+# Each clean_* runs the sub-make's clean, then removes its object directory.
+clean_dbg: obj_dbg
+	@$(MAKE) TAG=dbg -C . -f $(TAR).mk clean
+	rm -rf $<
+
+clean_opt: obj_opt
+	@$(MAKE) TAG=opt -C . -f $(TAR).mk clean
+	rm -rf $<
+
+clean_cacti: cacti
+	rm -rf cacti/obj_opt
+
+
diff --git a/src/gpuwattch/mcpat.mk b/src/gpuwattch/mcpat.mk
new file mode 100644
index 000000000..a09c23b4c
--- /dev/null
+++ b/src/gpuwattch/mcpat.mk
@@ -0,0 +1,105 @@
+
+OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch
+TARGET = mcpat
+SHELL = /bin/sh
+.PHONY: all depend clean
+.SUFFIXES: .cc .o
+# NTHREADS defaults to 4 unless the caller already defined it.
+ifndef NTHREADS
+  NTHREADS = 4
+endif
+
+# NOTE: LIBS holds -I search paths and INCS holds -lm; both reach the link line only.
+LIBS = -I/usr/lib/ -I/usr/lib64/
+INCS = -lm
+
+CC=
+CXX=
+# Pick -m64 or -m32 from the host's "getconf LONG_BIT".
+ifeq ($(shell getconf LONG_BIT),64) 
+	CXX = g++ -m64
+	CC  = gcc -m64
+else 
+	CXX = g++ -m32
+	CC  = gcc -m32
+endif 
+# TAG=dbg selects -O0 with NTHREADS forced to 1; any other TAG builds with -O3.
+ifeq ($(TAG),dbg)
+  DBG = -Wall 
+  OPT = -ggdb -fPIC -g -O0 -DNTHREADS=1 -Icacti -lz
+else
+  DBG = 
+  OPT = -O3 -fPIC -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti -lz
+  #OPT = -O0 -DNTHREADS=$(NTHREADS)
+endif
+
+#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
+CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
+
+
+
+# Prerequisites not found in this directory are also searched for under cacti/.
+VPATH = cacti
+
+SRCS  = \
+  Ucache.cc \
+  XML_Parse.cc \
+  arbiter.cc \
+  area.cc \
+  array.cc \
+  bank.cc \
+  basic_circuit.cc \
+  basic_components.cc \
+  cacti_interface.cc \
+  component.cc \
+  core.cc \
+  crossbar.cc \
+  decoder.cc \
+  htree2.cc \
+  interconnect.cc \
+  io.cc \
+  iocontrollers.cc \
+  logic.cc \
+  main.cc \
+  mat.cc \
+  memoryctrl.cc \
+  noc.cc \
+  nuca.cc \
+  parameter.cc \
+  processor.cc \
+  router.cc \
+  sharedcache.cc \
+  subarray.cc \
+  technology.cc \
+  uca.cc \
+  wire.cc \
+  xmlParser.cc \
+  gpgpu_sim_wrapper.cc \
+
+
+
+OBJS = $(patsubst %.cc,$(OUTPUT_DIR)/%.o,$(SRCS))
+# Objects and the linked binary are all written under $(OUTPUT_DIR).
+all: $(OUTPUT_DIR)/$(TARGET)
+
+$(OUTPUT_DIR)/$(TARGET) : $(OBJS)
+	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
+
+#obj_$(TAG)/%.o : %.cc
+#	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
+
+$(OUTPUT_DIR)/%.o : %.cc
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(OUTPUT_DIR)/Makefile.makedepend: depend
+
+depend:
+	touch $(OUTPUT_DIR)/Makefile.makedepend
+	makedepend -f$(OUTPUT_DIR)/Makefile.makedepend -p$(OUTPUT_DIR)/ $(SRCS) 2> /dev/null
+	$(MAKE) -C ./cacti/ depend
+# clean covers $(OUTPUT_DIR), where the rules above write, plus stale files here.
+clean:
+	-rm -f *.o $(TARGET) $(OUTPUT_DIR)/*.o $(OUTPUT_DIR)/$(TARGET)
+	rm -f Makefile.makedepend Makefile.makedepend.bak $(OUTPUT_DIR)/Makefile.makedepend $(OUTPUT_DIR)/Makefile.makedepend.bak
+
+include $(OUTPUT_DIR)/Makefile.makedepend
diff --git a/src/gpuwattch/mcpatXeonCore.mk b/src/gpuwattch/mcpatXeonCore.mk
new file mode 100644
index 000000000..20cf0ddc8
--- /dev/null
+++ b/src/gpuwattch/mcpatXeonCore.mk
@@ -0,0 +1,81 @@
+TARGET = mcpatXeonCore
+SHELL = /bin/sh
+.PHONY: all depend clean
+.SUFFIXES: .cc .o
+# NOTE: "depend" is listed in .PHONY but this file defines no depend rule.
+ifndef NTHREADS
+  NTHREADS = 4
+endif
+
+# INCS holds -lm and LIBS is empty; both are used on the link line only.
+LIBS = 
+INCS = -lm
+# TAG=dbg selects -O0 with NTHREADS forced to 1; any other TAG builds with -O3.
+ifeq ($(TAG),dbg)
+  DBG = -Wall 
+  OPT = -ggdb -g -O0 -DNTHREADS=1 -Icacti
+else
+  DBG = 
+  OPT = -O3 -msse2 -mfpmath=sse -DNTHREADS=$(NTHREADS) -Icacti
+  #OPT = -O0 -DNTHREADS=$(NTHREADS)
+endif
+
+#CXXFLAGS = -Wall -Wno-unknown-pragmas -Winline $(DBG) $(OPT) 
+CXXFLAGS = -Wno-unknown-pragmas $(DBG) $(OPT) 
+CXX = g++ -m32
+CC  = gcc -m32
+# The compilers above are always -m32 here; there is no LONG_BIT probe.
+VPATH = cacti
+
+SRCS  = \
+  Ucache.cc \
+  XML_Parse.cc \
+  arbiter.cc \
+  area.cc \
+  array.cc \
+  bank.cc \
+  basic_circuit.cc \
+  basic_components.cc \
+  cacti_interface.cc \
+  component.cc \
+  core.cc \
+  crossbar.cc \
+  decoder.cc \
+  htree2.cc \
+  interconnect.cc \
+  io.cc \
+  iocontrollers.cc \
+  logic.cc \
+  main.cc \
+  mat.cc \
+  memoryctrl.cc \
+  noc.cc \
+  nuca.cc \
+  parameter.cc \
+  processor.cc \
+  router.cc \
+  sharedcache.cc \
+  subarray.cc \
+  technology_xeon_core.cc \
+  uca.cc \
+  wire.cc \
+  xmlParser.cc 
+
+OBJS = $(patsubst %.cc,obj_$(TAG)/%.o,$(SRCS))
+# Build inside obj_$(TAG)/, then copy the binary next to this makefile.
+all: obj_$(TAG)/$(TARGET)
+	cp -f obj_$(TAG)/$(TARGET) $(TARGET)
+
+obj_$(TAG)/$(TARGET) : $(OBJS)
+	$(CXX) $(OBJS) -o $@ $(INCS) $(CXXFLAGS) $(LIBS) -pthread
+
+#obj_$(TAG)/%.o : %.cc
+#	$(CXX) -c $(CXXFLAGS) $(INCS) -o $@ $<
+
+obj_$(TAG)/%.o : %.cc
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+# clean removes only files in this directory; obj_$(TAG)/ is not touched here.
+clean:
+	-rm -f *.o $(TARGET)
+
+
diff --git a/src/gpuwattch/memoryctrl.cc b/src/gpuwattch/memoryctrl.cc
new file mode 100644
index 000000000..60317fd52
--- /dev/null
+++ b/src/gpuwattch/memoryctrl.cc
@@ -0,0 +1,1254 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ *   Jingwen Leng, University of Texas, Austin
+ *   Syed Gilani, University of Wisconsin–Madison
+ *   Tayler Hetherington, University of British Columbia
+ *   Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+#include "io.h"
+#include "parameter.h"
+#include "const.h"
+#include "logic.h"
+#include "cacti/basic_circuit.h"
+#include <iostream>
+#include <algorithm>
+#include "XML_Parse.h"
+#include <string>
+#include <cmath>
+#include <assert.h>
+#include "memoryctrl.h"
+#include "basic_components.h"
+/* overview of MC models:
+ * McPAT memory controllers are modeled according to a large number of
+ * industrial data points. The basic memory controller architecture is based on
+ * the Synopsys designs (DesignWare DDR2/DDR3-Lite memory controllers and
+ * DDR2/DDR3-Lite protocol controllers) as in the Cadence ChipEstimator Tool.
+ *
+ * An MC has 3 parts as shown in this design. McPAT models both high performance
+ * MC based on Niagara processor designs and curve fitting, and low power MC
+ * based on data points in the Cadence ChipEstimator Tool.
+ *
+ * The frontend is modeled analytically, the backend is modeled empirically
+ * according to DDR2/DDR3-Lite protocol controllers in the Cadence ChipEstimator
+ * Tool. The PHY is modeled based on "A 100mW 9.6Gb/s Transceiver in 90nm CMOS
+ * for next-generation memory interfaces," ISSCC 2006, and "A 14mW 6.25Gb/s
+ * Transceiver in 90nm CMOS for Serial Chip-to-Chip Communication," ISSCC 2007.
+ *
+ * In the Cadence ChipEstimator Tool there are two types of memory controllers:
+ * the full memory controllers that include the frontend, such as the DesignWare
+ * DDR2/DDR3-Lite memory controllers, and the backend-only memory controllers,
+ * such as the DDR2/DDR3-Lite protocol controllers (except DesignWare
+ * DDR2/DDR3-Lite memory controllers, all memory controller IP in the tool are
+ * backend memory controllers such as DDRC 1600A and DDRC 800A). Thus, to
+ * some extent the area and power difference between DesignWare DDR2/DDR3-Lite
+ * memory controllers and DDR2/DDR3-Lite protocol controllers can be an
+ * estimate of the frontend power and area, which is very close to the
+ * analytically modeled results of the frontend for Niagara2@65nm.
+ *
+ */
+
+MCBackend::MCBackend(InputParameter *interface_ip_, const MCParam &mcp_,
+                     enum MemoryCtrl_type mc_type_)
+    : l_ip(*interface_ip_), mc_type(mc_type_), mcp(mcp_) {
+  // Initialise the interface from the private l_ip copy, then run compute().
+  local_result = init_interface(&l_ip);
+  compute();
+}
+
+void MCBackend::compute() {
+  // Sets area and power_t.readOp (dynamic, leakage, gate leakage) for mc_type.
+  double C_MCB, mc_power, backend_dyn,
+      backend_gates; //, refresh_period,refresh_freq;//Equivalent per bit Cap
+                     // for backend,
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  double NMOS_sizing, PMOS_sizing;
+  // mcp.type 0 and non-zero use different fits; any mc_type other than MC exits.
+  if (mc_type == MC) {
+    if (mcp.type == 0) {
+      // area =
+      // (2.2927*log(peakDataTransferRate)-14.504)*memDataWidth/144.0*(l_ip.F_sz_um/0.09);
+      area.set_area((2.7927 * log(mcp.peakDataTransferRate * 2) - 19.862) /
+                    2.0 * mcp.dataBusWidth / 128.0 * (l_ip.F_sz_um / 0.09) *
+                    mcp.num_channels * 1e6); // um^2
+      // assuming the approximately same scaling factor as seen in processors.
+      // C_MCB=0.2/1.3/1.3/266/64/0.09*g_ip.F_sz_um;//based on AMD Geode
+      // processor which has a very basic mc on chip. C_MCB
+      // = 1.6/200/1e6/144/1.2/1.2*g_ip.F_sz_um/0.19;//Based on Niagara power
+      // numbers.The base power (W) is divided by device frequency and vdd and
+      // scale to target process. mc_power = 0.0291*2;//29.1mW@200MHz @130nm
+      // From Power Analysis of SystemLevel OnChip Communication Architectures
+      // by Lahiri et
+      mc_power =
+          4.32 *
+          0.1; // 4.32W@1GhzMHz @65nm Cadence ChipEstimator 10% for backend
+      C_MCB = mc_power / 1e9 / 72 / 1.1 / 1.1 * l_ip.F_sz_um / 0.065;
+      power_t.readOp.dynamic =
+          C_MCB * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd *
+          (mcp.dataBusWidth /*+mcp.addressBusWidth*/); // per access energy in
+                                                       // memory controller
+      power_t.readOp.leakage =
+          area.get_area() / 2 * (g_tp.scaling_factor.core_tx_density) *
+          cmos_Isub_leakage(g_tp.min_w_nmos_,
+                            g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+          g_tp.peri_global.Vdd; // unit W
+      power_t.readOp.gate_leakage =
+          area.get_area() / 2 * (g_tp.scaling_factor.core_tx_density) *
+          cmos_Ig_leakage(g_tp.min_w_nmos_,
+                          g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+          g_tp.peri_global.Vdd; // unit W
+
+    } else {
+      NMOS_sizing = g_tp.min_w_nmos_;
+      PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+      area.set_area(0.15 * mcp.dataBusWidth / 72.0 * (l_ip.F_sz_um / 0.065) *
+                    (l_ip.F_sz_um / 0.065) * mcp.num_channels * 1e6); // um^2
+      backend_dyn =
+          0.9e-9 / 800e6 * mcp.clockRate / 12800 * mcp.peakDataTransferRate *
+          mcp.dataBusWidth / 72.0 * g_tp.peri_global.Vdd / 1.1 *
+          g_tp.peri_global.Vdd / 1.1 *
+          (l_ip.F_sz_nm / 65.0); // Average on DDR2/3 protocol controller and
+                                 // DDRC 1600/800A in Cadence ChipEstimate
+      // Scaling to technology and DIMM feature. The base IP support
+      // DDR3-1600(PC3 12800)
+      backend_gates =
+          50000 * mcp.dataBusWidth / 64.0; // NOTE(review): comment said 5000
+
+      power_t.readOp.dynamic = backend_dyn;
+      power_t.readOp.leakage =
+          (backend_gates)*cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+          g_tp.peri_global.Vdd; // unit W
+      power_t.readOp.gate_leakage =
+          (backend_gates)*cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+          g_tp.peri_global.Vdd; // unit W
+    }
+  } else { // skip old model
+    cout << "Unknown memory controllers" << endl;
+    exit(1); // error path: non-zero status so callers can detect the failure
+    area.set_area(0.243 * mcp.dataBusWidth /
+                  8); // area based on Cadence ChipEstimator for 8bit bus
+    // Unreachable legacy model kept for reference (mc_power is never set here:
+    // "mc_power = 4.32*0.1" for 4.32W@1GHz @65nm is commented out).
+    C_MCB = mc_power / 1e9 / 72 / 1.1 / 1.1 * l_ip.F_sz_um / 0.065;
+    power_t.readOp.leakage =
+        area.get_area() / 2 * (g_tp.scaling_factor.core_tx_density) *
+        cmos_Isub_leakage(g_tp.min_w_nmos_,
+                          g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+        g_tp.peri_global.Vdd; // unit W
+    power_t.readOp.gate_leakage =
+        area.get_area() / 2 * (g_tp.scaling_factor.core_tx_density) *
+        cmos_Ig_leakage(g_tp.min_w_nmos_,
+                        g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+        g_tp.peri_global.Vdd; // unit W
+    power_t.readOp.dynamic *= 1.2;
+    power_t.readOp.leakage *= 1.2;
+    power_t.readOp.gate_leakage *= 1.2;
+    // flash controller has about 20% more backend power since BCH ECC in flash
+    // is complex and power hungry
+  }
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage =
+      power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+void MCBackend::computeEnergy(bool is_tdp) {
+  // backend uses internal data buswidth; is_tdp selects peak vs runtime power
+  if (is_tdp) {
+    power.reset(); // Jingwen
+    // init stats for Peak
+    stats_t.readAc.access = 0.5 * mcp.num_channels;
+    stats_t.writeAc.access = 0.5 * mcp.num_channels;
+    tdp_stats = stats_t;
+  } else {
+    rt_power.reset(); // Jingwen
+    // init stats for runtime power (RTP)
+    // Jingwen: should use stats from XML object, modified in
+    // MemoryController::computeEnergy
+    stats_t.readAc.access = mcp.reads;
+    stats_t.writeAc.access = mcp.writes;
+    tdp_stats = stats_t; // NOTE(review): runtime stats go to tdp_stats — confirm
+  }
+  if (is_tdp) {
+    power = power_t; // start from per-access values, then scale dynamic below
+    power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access) *
+                           power_t.readOp.dynamic;
+
+  } else {
+    rt_power.readOp.dynamic = (stats_t.readAc.access + stats_t.writeAc.access) *
+                              mcp.llcBlockSize * 8.0 / mcp.dataBusWidth *
+                              power_t.readOp.dynamic;
+    rt_power = rt_power + power_t * pppm_lkg;
+    rt_power.readOp.dynamic = // reads `power`, which only the is_tdp path sets
+        rt_power.readOp.dynamic + power.readOp.dynamic * 0.1 * mcp.clockRate *
+                                      mcp.num_mcs * mcp.executionTime;
+    // Assume 10% of peak power is consumed by routine job including memory
+    // refreshing and scrubbing
+  }
+}
+
+MCPHY::MCPHY(InputParameter *interface_ip_, const MCParam &mcp_,
+             enum MemoryCtrl_type mc_type_)
+    : l_ip(*interface_ip_), mc_type(mc_type_), mcp(mcp_) {
+  // Initialise the interface from the private l_ip copy, then run compute().
+  local_result = init_interface(&l_ip);
+  compute();
+}
+
+void MCPHY::compute() {
+  // PHY uses internal data buswidth but the actual off-chip datawidth is
+  // 64bits + ecc
+  double pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  /*
+   * according to "A 100mW 9.6Gb/s Transceiver in 90nm CMOS for next-generation
+   * memory interfaces ," ISSCC 2006; From Cadence ChipEstimator for normal I/O
+   * around 0.4~0.8 mW/Gb/s
+   */
+  double power_per_gb_per_s, phy_gates, NMOS_sizing, PMOS_sizing;
+  // mc_type MC: mcp.type 0 vs non-zero use different fits; other types: area only.
+  if (mc_type == MC) {
+    if (mcp.type == 0) {
+      power_per_gb_per_s = mcp.LVDS ? 0.01 : 0.04;
+      // Based on die photos from Niagara 1 and 2.
+      // TODO merge this into undifferentiated core.PHY only achieves square
+      // root of the ideal scaling. area =
+      // (6.4323*log(peakDataTransferRate)-34.76)*memDataWidth/128.0*(l_ip.F_sz_um/0.09);
+      area.set_area((6.4323 * log(mcp.peakDataTransferRate * 2) - 48.134) *
+                    mcp.dataBusWidth / 128.0 * (l_ip.F_sz_um / 0.09) *
+                    mcp.num_channels * 1e6 / 2); // TODO:/2
+      // This is from curve fitting based on Niagara 1 and 2's PHY die photo.
+      // This is power not energy, 10mw/Gb/s @90nm for each channel and scaling
+      // down power.readOp.dynamic = 0.02*memAccesses*llcBlocksize*8;//change
+      // from Bytes to bits.
+      power_t.readOp.dynamic = power_per_gb_per_s * sqrt(l_ip.F_sz_um / 0.09) *
+                               g_tp.peri_global.Vdd / 1.2 *
+                               g_tp.peri_global.Vdd / 1.2;
+      power_t.readOp.leakage =
+          area.get_area() / 2 * (g_tp.scaling_factor.core_tx_density) *
+          cmos_Isub_leakage(g_tp.min_w_nmos_,
+                            g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+          g_tp.peri_global.Vdd; // unit W
+      power_t.readOp.gate_leakage =
+          area.get_area() / 2 * (g_tp.scaling_factor.core_tx_density) *
+          cmos_Ig_leakage(g_tp.min_w_nmos_,
+                          g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r, 1, inv) *
+          g_tp.peri_global.Vdd; // unit W
+
+    } else {
+      NMOS_sizing = g_tp.min_w_nmos_;
+      PMOS_sizing = g_tp.min_w_nmos_ * pmos_to_nmos_sizing_r;
+      // Designware/synopsis 16bit DDR3 PHY is 1.3mm (WITH IOs) at 40nm for upto
+      // DDR3 2133 (PC3 17066)
+      double non_IO_percentage = 0.2;
+      area.set_area(1.3 * non_IO_percentage / 2133.0e6 * mcp.clockRate / 17066 *
+                    mcp.peakDataTransferRate * mcp.dataBusWidth / 16.0 *
+                    (l_ip.F_sz_um / 0.040) * (l_ip.F_sz_um / 0.040) *
+                    mcp.num_channels * 1e6); // um^2
+      phy_gates = 200000 * mcp.dataBusWidth / 64.0;
+      power_per_gb_per_s = 0.01;
+      // This is power not energy, 10mw/Gb/s @90nm for each channel and scaling
+      // down
+      power_t.readOp.dynamic = power_per_gb_per_s * (l_ip.F_sz_um / 0.09) *
+                               g_tp.peri_global.Vdd / 1.2 *
+                               g_tp.peri_global.Vdd / 1.2;
+      power_t.readOp.leakage =
+          (mcp.withPHY ? phy_gates : 0) *
+          cmos_Isub_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+          g_tp.peri_global.Vdd; // unit W
+      power_t.readOp.gate_leakage =
+          (mcp.withPHY ? phy_gates : 0) *
+          cmos_Ig_leakage(NMOS_sizing, PMOS_sizing, 2, nand) *
+          g_tp.peri_global.Vdd; // unit W
+    }
+
+  } else {
+    area.set_area(0.4e6 / 2 * mcp.dataBusWidth /
+                  8); // area based on Cadence ChipEstimator for 8bit bus
+  }
+  // NOTE: the non-MC branch above assigns no power_t.readOp values here.
+  //  double phy_factor = (int)ceil(mcp.dataBusWidth/72.0);//Previous phy power
+  //  numbers are based on 72 bit DIMM interface power_t.readOp.dynamic *=
+  //  phy_factor; power_t.readOp.leakage *= phy_factor;
+  //  power_t.readOp.gate_leakage *= phy_factor;
+
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Uncore_device);
+  power_t.readOp.longer_channel_leakage =
+      power_t.readOp.leakage * long_channel_device_reduction;
+}
+
+void MCPHY::computeEnergy(bool is_tdp) { // is_tdp: peak path vs runtime path
+  if (is_tdp) {
+    power.reset(); // Jingwen
+    // init stats for Peak
+    stats_t.readAc.access = 0.5 * mcp.num_channels; // time share on buses
+    stats_t.writeAc.access = 0.5 * mcp.num_channels;
+    tdp_stats = stats_t;
+  } else {
+    rt_power.reset(); // Jingwen
+    // init stats for runtime power (RTP)
+    // Jingwen: should use stats from XML object, modified in
+    // MemoryController::computeEnergy
+    stats_t.readAc.access = mcp.reads;
+    stats_t.writeAc.access = mcp.writes;
+    tdp_stats = stats_t; // NOTE(review): runtime stats go to tdp_stats — confirm
+  }
+
+  if (is_tdp) {
+    double data_transfer_unit = (mc_type == MC) ? 72 : 16; /*DIMM data width*/
+    power = power_t;
+    power.readOp.dynamic =
+        power.readOp.dynamic *
+        (mcp.peakDataTransferRate * 8 * 1e6 / 1e9 /*change to Gbs*/) *
+        mcp.dataBusWidth / data_transfer_unit * mcp.num_channels /
+        mcp.clockRate;
+    // divide by clock rate is for match the final computation where *clock is
+    // used
+    //(stats_t.readAc.access*power_t.readOp.dynamic+
+    //  stats_t.writeAc.access*power_t.readOp.dynamic);
+
+  } else {
+    rt_power = power_t;
+    // rt_power.readOp.dynamic =
+    //    (stats_t.readAc.access*power_t.readOp.dynamic+
+    //     stats_t.writeAc.access*power_t.readOp.dynamic);
+    // The expression below divides by mcp.executionTime and multiplies it back.
+    rt_power.readOp.dynamic = power_t.readOp.dynamic *
+                              (stats_t.readAc.access + stats_t.writeAc.access) *
+                              (mcp.llcBlockSize) * 8 / 1e9 / mcp.executionTime *
+                              (mcp.executionTime);
+    rt_power.readOp.dynamic = // reads `power`, which only the is_tdp path sets
+        rt_power.readOp.dynamic + power.readOp.dynamic * 0.1 * mcp.clockRate *
+                                      mcp.num_mcs * mcp.executionTime;
+  }
+}
+
+MCFrontEnd::MCFrontEnd(ParseXML *XML_interface, InputParameter *interface_ip_,
+                       const MCParam &mcp_, enum MemoryCtrl_type mc_type_)
+    : XML(XML_interface), interface_ip(*interface_ip_), mc_type(mc_type_),
+      mcp(mcp_), MC_arb(0), frontendBuffer(0), readBuffer(0), writeBuffer(0),
+      coalesce_scale(1.0) {
+  /* Builds the per-MC frontend arrays (reorder buffer, read/write buffers,
+   * PRT, thread masks, pending-request counts) and the request arbiter.
+   * All computations are for a single MC. */
+
+  int tag, data;
+  bool is_default = true; // indication for default setup
+
+  /* MC frontend channels share the same engines but are logically
+   * partitioned. For all hardware inside the MC, different channels do not
+   * share resources.
+   * TODO: add a decoding/mux stage to steer memory requests to different
+   * channels.
+   */
+
+  // memory request reorder buffer (entry = address + opcode, in bytes)
+  tag = mcp.addressBusWidth + EXTRA_TAG_BITS + mcp.opcodeW;
+  data = int(ceil((XML->sys.physical_address_width + mcp.opcodeW) / 8.0));
+  interface_ip.cache_sz = data * XML->sys.mc.req_window_size_per_channel;
+  interface_ip.line_sz = data;
+  interface_ip.assoc = 0;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = 1.0 / mcp.clockRate;
+  interface_ip.latency = 1.0 / mcp.clockRate;
+  interface_ip.is_cache = true;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = false;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports = XML->sys.mc.memory_channels_per_mc;
+  frontendBuffer =
+      new ArrayST(&interface_ip, "MC ReorderBuffer", Uncore_device);
+  frontendBuffer->area.set_area(frontendBuffer->area.get_area() +
+                                frontendBuffer->local_result.area *
+                                    XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area() + frontendBuffer->local_result.area *
+                                      XML->sys.mc.memory_channels_per_mc);
+
+  // selection and arbitration logic
+  MC_arb =
+      new selection_logic(is_default, XML->sys.mc.req_window_size_per_channel,
+                          1, &interface_ip, Uncore_device);
+
+  // read buffers.
+  data = (int)ceil(mcp.dataBusWidth / 8.0); // bus width in bytes (8 bits per
+                                            // byte); key-word-first support
+  interface_ip.cache_sz =
+      data * XML->sys.mc.IO_buffer_size_per_channel; //*llcBlockSize;
+  interface_ip.line_sz = data;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 1;
+  interface_ip.throughput = 1.0 / mcp.clockRate;
+  interface_ip.latency = 2.0 / mcp.clockRate;
+  interface_ip.is_cache = false;
+  interface_ip.pure_cam = false;
+  interface_ip.pure_ram = true;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports =
+      0; // XML->sys.mc.memory_channels_per_mc*2>2?2:XML->sys.mc.memory_channels_per_mc*2;
+  interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports = 0;
+  readBuffer = new ArrayST(&interface_ip, "MC ReadBuffer", Uncore_device);
+  readBuffer->area.set_area(readBuffer->area.get_area() +
+                            readBuffer->local_result.area *
+                                XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area() + readBuffer->local_result.area *
+                                      XML->sys.mc.memory_channels_per_mc);
+
+  // write buffer (is_cache/pure_cam/pure_ram carry over from read buffer)
+  data = (int)ceil(mcp.dataBusWidth / 8.0); // bus width in bytes (8 bits per
+                                            // byte); key-word-first support
+  interface_ip.cache_sz =
+      data * XML->sys.mc.IO_buffer_size_per_channel; //*llcBlockSize;
+  interface_ip.line_sz = data;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = 1.0 / mcp.clockRate;
+  interface_ip.latency = 2.0 / mcp.clockRate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = XML->sys.mc.memory_channels_per_mc;
+  interface_ip.num_wr_ports = interface_ip.num_rd_ports;
+  interface_ip.num_se_rd_ports = 0;
+  writeBuffer = new ArrayST(&interface_ip, "MC writeBuffer", Uncore_device);
+  writeBuffer->area.set_area(writeBuffer->area.get_area() +
+                             writeBuffer->local_result.area *
+                                 XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area() + writeBuffer->local_result.area *
+                                      XML->sys.mc.memory_channels_per_mc);
+
+  // SRAM structures for memory coalescing --Syed Gilani
+  // Pending Request Table (base addresses, offset addresses, thread IDs),
+  // Thread Masks
+  //***PRT
+  // We assume 24 bits of base address and 8 bits of offset address.
+  // These values are used for coalescing memory requests to the same base
+  // address block. TIDs are assumed to be 8 bits
+  /*Contents of each PRT entry
+   *
+   *  Warp ID (6 bits) | Memory address (32 bits) per thread | Request Size
+   * (2-bits) per thread | line size= 6+ 32*16 + 2*16  ~ 64 bytes
+   *
+   * NOTE: PRT, thread-mask and PRC timing use the core clock, not the MC clock.
+   */
+  data =
+      64; // bytes per PRT entry (see the entry layout above)
+  interface_ip.cache_sz = data * XML->sys.mc.PRT_entries; // PRT table;
+  interface_ip.line_sz = data;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = 1.0 / XML->sys.target_core_clockrate;
+  interface_ip.latency = 2.0 / XML->sys.target_core_clockrate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = 1;
+  interface_ip.num_wr_ports = 1;
+  interface_ip.num_se_rd_ports = 0;
+  PRT = new ArrayST(&interface_ip, "MC PRT", Uncore_device);
+  PRT->area.set_area(PRT->area.get_area() +
+                     PRT->local_result.area *
+                         XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area() +
+                PRT->local_result.area * XML->sys.mc.memory_channels_per_mc);
+
+  //***ThreadMasks storage (coalesced threads whose memory requests are
+  // satisfied by each memory access)
+  /* contents of the thread masks Array
+   *  16-bit bit masks for up to 16 memory requests of a warp | Number of
+   * pending memory requests (5 bits)
+   *
+   *  16*PRT_entry thread masks, each entry has 16 mask bits.
+   *
+   */
+  data = 2; // bytes per 16-bit mask
+  interface_ip.cache_sz = data * XML->sys.mc.PRT_entries * 16; // 16 masks/entry
+  interface_ip.line_sz = data;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = 1.0 / XML->sys.target_core_clockrate;
+  interface_ip.latency = 2.0 / XML->sys.target_core_clockrate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = 1;
+  interface_ip.num_wr_ports = 1;
+  interface_ip.num_se_rd_ports = 0;
+  threadMasks = new ArrayST(&interface_ip, "MC ThreadMasks", Uncore_device);
+  threadMasks->area.set_area(threadMasks->area.get_area() +
+                             threadMasks->local_result.area *
+                                 XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area() + threadMasks->local_result.area *
+                                      XML->sys.mc.memory_channels_per_mc);
+
+  //***Number of pending requests per PRT entry
+  /*
+   * 1-byte data, PRT entries deep
+   */
+  data = 1;
+  interface_ip.cache_sz = data * XML->sys.mc.PRT_entries; // one byte per entry
+  interface_ip.line_sz = data;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.access_mode = 0;
+  interface_ip.throughput = 1.0 / XML->sys.target_core_clockrate;
+  interface_ip.latency = 2.0 / XML->sys.target_core_clockrate;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 0;
+  interface_ip.num_rd_ports = 1;
+  interface_ip.num_wr_ports = 1;
+  interface_ip.num_se_rd_ports = 0;
+  PRC = new ArrayST(&interface_ip, "MC PendingRequestCount", Uncore_device);
+  PRC->area.set_area(PRC->area.get_area() +
+                     PRC->local_result.area *
+                         XML->sys.mc.memory_channels_per_mc);
+  area.set_area(area.get_area() +
+                PRC->local_result.area * XML->sys.mc.memory_channels_per_mc);
+}
+
+void DRAM::computeEnergy(bool is_tdp) {
+  if (!is_tdp) {
+    // Runtime: event counts from the XML stats times per-event coefficients.
+    rt_power.reset();
+    dramp.executionTime =
+        XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+
+    power_t.reset();
+    power_t.readOp.dynamic += dramp.rd_coeff * XML->sys.mc.memory_reads;
+    power_t.readOp.dynamic += dramp.wr_coeff * XML->sys.mc.memory_writes;
+    power_t.readOp.dynamic += dramp.pre_coeff * XML->sys.mc.dram_pre;
+    rt_power = rt_power + power_t;
+  } else {
+    power.reset(); // not supporting TDP calculation for DRAM
+  }
+}
+
+void MCFrontEnd::computeEnergy(bool is_tdp) {
+  // Computes frontend power for the peak (is_tdp) or runtime case: loads
+  // access counts into each array's stats_t, multiplies them by the array's
+  // per-access read/write figures into power_t, then sums everything into
+  // `power` or `rt_power`.
+  if (is_tdp) {
+    power.reset();
+    // init stats for Peak
+    frontendBuffer->stats_t.readAc.access =
+        frontendBuffer->l_ip.num_search_ports;
+    frontendBuffer->stats_t.writeAc.access = frontendBuffer->l_ip.num_wr_ports;
+    frontendBuffer->tdp_stats = frontendBuffer->stats_t;
+
+    readBuffer->stats_t.readAc.access =
+        readBuffer->l_ip.num_rd_ports * mcp.frontend_duty_cycle;
+    readBuffer->stats_t.writeAc.access =
+        readBuffer->l_ip.num_wr_ports * mcp.frontend_duty_cycle;
+    readBuffer->tdp_stats = readBuffer->stats_t;
+
+    writeBuffer->stats_t.readAc.access =
+        writeBuffer->l_ip.num_rd_ports * mcp.frontend_duty_cycle;
+    writeBuffer->stats_t.writeAc.access =
+        writeBuffer->l_ip.num_wr_ports * mcp.frontend_duty_cycle;
+    writeBuffer->tdp_stats = writeBuffer->stats_t;
+
+    PRT->stats_t.readAc.access =
+        PRT->l_ip.num_rd_ports * mcp.frontend_duty_cycle;
+    PRT->stats_t.writeAc.access =
+        PRT->l_ip.num_wr_ports * mcp.frontend_duty_cycle;
+    PRT->tdp_stats = PRT->stats_t;
+
+    threadMasks->stats_t.readAc.access =
+        threadMasks->l_ip.num_rd_ports * mcp.frontend_duty_cycle;
+    threadMasks->stats_t.writeAc.access =
+        threadMasks->l_ip.num_wr_ports * mcp.frontend_duty_cycle;
+    threadMasks->tdp_stats = threadMasks->stats_t;
+    // PRC takes its port counts and recorded stats from its own array.
+    PRC->stats_t.readAc.access =
+        PRC->l_ip.num_rd_ports * mcp.frontend_duty_cycle;
+    PRC->stats_t.writeAc.access =
+        PRC->l_ip.num_wr_ports * mcp.frontend_duty_cycle;
+    PRC->tdp_stats = PRC->stats_t;
+
+  } else {
+    rt_power.reset(); // Jingwen
+    // init stats for runtime power (RTP)
+    frontendBuffer->stats_t.readAc.access =
+        XML->sys.mc.memory_reads * mcp.llcBlockSize * 8.0 / mcp.dataBusWidth *
+        mcp.dataBusWidth / 72;
+    // For each channel, each memory word need to check the address data to
+    // achieve best scheduling results. and this need to be done on all physical
+    // DIMMs in each logical memory DIMM *mcp.dataBusWidth/72
+    frontendBuffer->stats_t.writeAc.access =
+        XML->sys.mc.memory_writes * mcp.llcBlockSize * 8.0 / mcp.dataBusWidth *
+        mcp.dataBusWidth / 72;
+    frontendBuffer->rtp_stats = frontendBuffer->stats_t;
+
+    readBuffer->stats_t.readAc.access =
+        XML->sys.mc.memory_reads * mcp.llcBlockSize * 8.0 /
+        mcp.dataBusWidth; // support key word first
+    readBuffer->stats_t.writeAc.access =
+        XML->sys.mc.memory_reads * mcp.llcBlockSize * 8.0 /
+        mcp.dataBusWidth; // support key word first
+    readBuffer->rtp_stats = readBuffer->stats_t;
+
+    writeBuffer->stats_t.readAc.access =
+        XML->sys.mc.memory_writes * mcp.llcBlockSize * 8.0 / mcp.dataBusWidth;
+    writeBuffer->stats_t.writeAc.access =
+        XML->sys.mc.memory_writes * mcp.llcBlockSize * 8.0 / mcp.dataBusWidth;
+    writeBuffer->rtp_stats = writeBuffer->stats_t;
+
+    // Pending request table
+    // Co-alesce all misses in caches and add an entry for them in PRT
+    // TODO: Change 0 to ithCore and move to LSU (Syed)
+    // TODO: Do these accesses represent coalesced accesses?
+    PRT->stats_t.readAc.access = XML->sys.core[0].dcache.read_accesses +
+                                 XML->sys.core[0].ccache.read_accesses +
+                                 XML->sys.core[0].tcache.read_accesses;
+    PRT->stats_t.writeAc.access = XML->sys.core[0].dcache.write_accesses +
+                                  XML->sys.core[0].ccache.write_accesses +
+                                  XML->sys.core[0].tcache.write_accesses;
+    PRT->rtp_stats = PRT->stats_t;
+
+    threadMasks->stats_t.readAc.access = XML->sys.core[0].dcache.read_accesses +
+                                         XML->sys.core[0].ccache.read_accesses +
+                                         XML->sys.core[0].tcache.read_accesses;
+    threadMasks->stats_t.writeAc.access =
+        XML->sys.core[0].dcache.write_accesses +
+        XML->sys.core[0].ccache.write_accesses +
+        XML->sys.core[0].tcache.write_accesses;
+    threadMasks->rtp_stats = threadMasks->stats_t;
+
+    PRC->stats_t.readAc.access = XML->sys.core[0].dcache.read_accesses +
+                                 XML->sys.core[0].ccache.read_accesses +
+                                 XML->sys.core[0].tcache.read_accesses;
+    PRC->stats_t.writeAc.access = XML->sys.core[0].dcache.write_accesses +
+                                  XML->sys.core[0].ccache.write_accesses +
+                                  XML->sys.core[0].tcache.write_accesses;
+    PRC->rtp_stats = PRC->stats_t; // PRC records its own runtime stats
+  }
+
+  frontendBuffer->power_t.reset();
+  readBuffer->power_t.reset();
+  writeBuffer->power_t.reset();
+  threadMasks->power_t.reset();
+  PRT->power_t.reset();
+  PRC->power_t.reset();
+
+  // Reorder buffer: every access pays the search cost plus its read/write cost.
+  frontendBuffer->power_t.readOp.dynamic +=
+      (frontendBuffer->stats_t.readAc.access +
+       frontendBuffer->stats_t.writeAc.access) *
+          frontendBuffer->local_result.power.searchOp.dynamic +
+      frontendBuffer->stats_t.readAc.access *
+          frontendBuffer->local_result.power.readOp.dynamic +
+      frontendBuffer->stats_t.writeAc.access *
+          frontendBuffer->local_result.power.writeOp.dynamic;
+
+  readBuffer->power_t.readOp.dynamic +=
+      (readBuffer->stats_t.readAc.access *
+           readBuffer->local_result.power.readOp.dynamic +
+       readBuffer->stats_t.writeAc.access *
+           readBuffer->local_result.power.writeOp.dynamic);
+  writeBuffer->power_t.readOp.dynamic +=
+      (writeBuffer->stats_t.readAc.access *
+           writeBuffer->local_result.power.readOp.dynamic +
+       writeBuffer->stats_t.writeAc.access *
+           writeBuffer->local_result.power.writeOp.dynamic);
+
+  PRT->power_t.readOp.dynamic +=
+      (PRT->stats_t.readAc.access * PRT->local_result.power.readOp.dynamic +
+       PRT->stats_t.writeAc.access * PRT->local_result.power.writeOp.dynamic);
+
+  threadMasks->power_t.readOp.dynamic +=
+      (threadMasks->stats_t.readAc.access *
+           threadMasks->local_result.power.readOp.dynamic +
+       threadMasks->stats_t.writeAc.access *
+           threadMasks->local_result.power.writeOp.dynamic);
+
+  PRC->power_t.readOp.dynamic +=
+      (PRC->stats_t.readAc.access * PRC->local_result.power.readOp.dynamic +
+       PRC->stats_t.writeAc.access * PRC->local_result.power.writeOp.dynamic);
+
+  // Add coalescing logic power (Estimated from Verilog HDL description and
+  // Synopsys PowerCompiler)--Syed
+#define COALESCE_SCALE 1
+  double perAccessCoalescingEnergy =
+      coalesce_scale *
+      ((0.443e-3) * (0.5e-9) * g_tp.peri_global.Vdd * g_tp.peri_global.Vdd) /
+      (1 * 1);
+  threadMasks->power_t.readOp.dynamic += (threadMasks->stats_t.readAc.access +
+                                          threadMasks->stats_t.writeAc.access) *
+                                         perAccessCoalescingEnergy;
+
+  // Fold the per-array results into the component total. local_result.power
+  // scaled by pppm_lkg is added alongside the dynamic terms computed above.
+  // The runtime total additionally gets 0.1 * power.readOp.dynamic *
+  // clockRate * num_mcs * executionTime, so it depends on the `power` value
+  // left behind by the last TDP pass.
+  if (is_tdp) {
+    power =
+        power + frontendBuffer->power_t + readBuffer->power_t +
+        writeBuffer->power_t + PRT->power_t + threadMasks->power_t +
+        PRC->power_t +
+        (frontendBuffer->local_result.power + readBuffer->local_result.power +
+         writeBuffer->local_result.power + PRT->local_result.power +
+         threadMasks->local_result.power + PRC->local_result.power) *
+            pppm_lkg;
+
+  } else {
+    rt_power =
+        rt_power + frontendBuffer->power_t + readBuffer->power_t +
+        writeBuffer->power_t + PRT->power_t + threadMasks->power_t +
+        PRC->power_t +
+        (frontendBuffer->local_result.power + readBuffer->local_result.power +
+         writeBuffer->local_result.power + PRT->local_result.power +
+         threadMasks->local_result.power + PRC->local_result.power) *
+            pppm_lkg;
+    rt_power.readOp.dynamic =
+        rt_power.readOp.dynamic + power.readOp.dynamic * 0.1 * mcp.clockRate *
+                                      mcp.num_mcs * mcp.executionTime;
+  }
+}
+
+void MCFrontEnd::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  // Per-array area/power report; plevel is unused and PRC is not printed.
+  if (is_tdp) {
+    cout << indent_str << "Front End ROB:" << endl;
+    cout << indent_str_next
+         << "Area = " << frontendBuffer->area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << frontendBuffer->power.readOp.dynamic * mcp.clockRate << " W"
+         << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << frontendBuffer->power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << frontendBuffer->power.readOp.gate_leakage
+         << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << frontendBuffer->rt_power.readOp.dynamic / mcp.executionTime << " W"
+         << endl;
+    // Peak values are scaled by mcp.clockRate, runtime ones by 1/executionTime.
+    cout << endl;
+    cout << indent_str << "Read Buffer:" << endl;
+    cout << indent_str_next << "Area = " << readBuffer->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << readBuffer->power.readOp.dynamic * mcp.clockRate << " W" << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << readBuffer->power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << readBuffer->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << readBuffer->rt_power.readOp.dynamic / mcp.executionTime << " W"
+         << endl;
+    cout << endl;
+    cout << indent_str << "Write Buffer:" << endl;
+    cout << indent_str_next << "Area = " << writeBuffer->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << writeBuffer->power.readOp.dynamic * mcp.clockRate << " W" << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << writeBuffer->power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << writeBuffer->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << writeBuffer->rt_power.readOp.dynamic / mcp.executionTime << " W"
+         << endl;
+    cout << endl;
+    cout << indent_str << "PRT:" << endl;
+    cout << indent_str_next << "Area = " << PRT->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << PRT->power.readOp.dynamic * mcp.clockRate
+         << " W" << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << PRT->power.readOp.leakage << " W"
+         << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << PRT->power.readOp.gate_leakage << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << PRT->rt_power.readOp.dynamic / mcp.executionTime << " W" << endl;
+    cout << endl;
+    cout << indent_str << "Thread Masks and coalescing logic:" << endl;
+    cout << indent_str_next << "Area = " << threadMasks->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << threadMasks->power.readOp.dynamic * mcp.clockRate << " W" << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << threadMasks->power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << threadMasks->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << threadMasks->rt_power.readOp.dynamic / mcp.executionTime << " W"
+         << endl;
+    cout << endl;
+    // Non-TDP view: rt_power values under the same labels, no runtime row.
+  } else {
+    cout << indent_str << "Front End ROB:" << endl;
+    cout << indent_str_next
+         << "Area = " << frontendBuffer->area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << frontendBuffer->rt_power.readOp.dynamic * mcp.clockRate << " W"
+         << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << frontendBuffer->rt_power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << frontendBuffer->rt_power.readOp.gate_leakage
+         << " W" << endl;
+    cout << endl;
+    cout << indent_str << "Read Buffer:" << endl;
+    cout << indent_str_next << "Area = " << readBuffer->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << readBuffer->rt_power.readOp.dynamic * mcp.clockRate << " W" << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << readBuffer->rt_power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << readBuffer->rt_power.readOp.gate_leakage
+         << " W" << endl;
+    cout << endl;
+    cout << indent_str << "Write Buffer:" << endl;
+    cout << indent_str_next << "Area = " << writeBuffer->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << writeBuffer->rt_power.readOp.dynamic * mcp.clockRate << " W"
+         << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << writeBuffer->rt_power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << writeBuffer->rt_power.readOp.gate_leakage
+         << " W" << endl;
+    cout << endl;
+    cout << indent_str << "PRT:" << endl;
+    cout << indent_str_next << "Area = " << PRT->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << PRT->rt_power.readOp.dynamic * mcp.clockRate
+         << " W" << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << PRT->rt_power.readOp.leakage << " W"
+         << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << PRT->rt_power.readOp.gate_leakage << " W"
+         << endl;
+    cout << endl;
+    cout << indent_str << "Thread masks and coalescing logic:" << endl;
+    cout << indent_str_next << "Area = " << threadMasks->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << threadMasks->rt_power.readOp.dynamic * mcp.clockRate << " W"
+         << endl;
+    cout << indent_str_next
+         << "Subthreshold Leakage = " << threadMasks->rt_power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << threadMasks->rt_power.readOp.gate_leakage
+         << " W" << endl;
+  }
+}
+
+DRAM::DRAM(ParseXML *XML_interface, InputParameter *interface_ip_,
+           enum Dram_type dram_type_)
+    : XML(XML_interface), interface_ip(*interface_ip_),
+      dram_type(dram_type_) {
+  set_dram_param(); // parameter setup lives in set_dram_param (not shown here)
+}
+MemoryController::MemoryController(ParseXML *XML_interface,
+                                   InputParameter *interface_ip_,
+                                   enum MemoryCtrl_type mc_type_,
+                                   enum Dram_type dram_type_)
+    : XML(XML_interface), interface_ip(*interface_ip_), mc_type(mc_type_),
+      frontend(0), transecEngine(0), PHY(0), pipeLogic(0) {
+  /* Builds the frontend, DRAM, backend (transecEngine) and, when configured,
+   * PHY models and accumulates frontend/backend/PHY area into this component.
+   * All computations are for a single MC. */
+  interface_ip.wire_is_mat_type = 2;
+  interface_ip.wire_os_mat_type = 2;
+  interface_ip.wt = Global;
+  set_mc_param(); // called before mcp is handed to the sub-components below
+  frontend = new MCFrontEnd(XML, &interface_ip, mcp, mc_type);
+  dram = new DRAM(XML, &interface_ip, dram_type_); // no area is added for DRAM
+  area.set_area(area.get_area() + frontend->area.get_area());
+  transecEngine = new MCBackend(&interface_ip, mcp, mc_type);
+  area.set_area(area.get_area() + transecEngine->area.get_area());
+  if (mcp.type == 0 || (mcp.type == 1 && mcp.withPHY)) {
+    PHY = new MCPHY(&interface_ip, mcp, mc_type);
+    area.set_area(area.get_area() + PHY->area.get_area());
+  }
+  //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better numbers,
+  // Run the RTL code from OpenSparc.
+  //  transecEngine.initialize(&interface_ip);
+  //  transecEngine.peakDataTransferRate = XML->sys.mem.peak_transfer_rate;
+  //  transecEngine.memDataWidth = dataBusWidth;
+  //  transecEngine.memRank = XML->sys.mem.number_ranks;
+  //  //transecEngine.memAccesses=XML->sys.mc.memory_accesses;
+  //  //transecEngine.llcBlocksize=llcBlockSize;
+  //  transecEngine.compute();
+  //  transecEngine.area.set_area(XML->sys.mc.memory_channels_per_mc*transecEngine.area.get_area())
+  //  ; area.set_area(area.get_area()+ transecEngine.area.get_area());
+  //  ///cout<<"area="<<area<<endl;
+  ////
+  //  //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+  //  PHY.initialize(&interface_ip);
+  //  PHY.peakDataTransferRate = XML->sys.mem.peak_transfer_rate;
+  //  PHY.memDataWidth = dataBusWidth;
+  //  //PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+  //  //PHY.llcBlocksize=llcBlockSize;
+  //  PHY.compute();
+  //  PHY.area.set_area(XML->sys.mc.memory_channels_per_mc*PHY.area.get_area())
+  //  ; area.set_area(area.get_area()+ PHY.area.get_area());
+  /// cout<<"area="<<area<<endl;
+  //
+  //  interface_ip.pipeline_stages = 5;//normal memory controller has five
+  //  stages in the pipeline. interface_ip.per_stage_vector = addressBusWidth +
+  //  XML->sys.core[0].opcode_width + dataBusWidth; pipeLogic = new
+  //  pipeline(is_default, &interface_ip);
+  //  //pipeLogic.init_pipeline(is_default, &interface_ip);
+  //  pipeLogic->compute_pipeline();
+  //  area.set_area(area.get_area()+ pipeLogic->area.get_area()*1e-6);
+  //  area.set_area((area.get_area()+mc_area*1e-6)*1.1);//placement and routing
+  //  overhead
+  //
+  //
+  ////  //clock
+  ////  clockNetwork.init_wire_external(is_default, &interface_ip);
+  ////  clockNetwork.clk_area           =area*1.1;//10% of placement overhead.
+  /// rule of thumb /  clockNetwork.end_wiring_level   =5;//toplevel metal /
+  /// clockNetwork.start_wiring_level =5;//toplevel metal /
+  /// clockNetwork.num_regs = pipeLogic.tot_stage_vector; /
+  /// clockNetwork.optimize_wire();
+}
+void MemoryController::computeEnergy(bool is_tdp) {
+  // Drives the frontend, backend, DRAM and (if present) PHY models, then
+  // sums their results into `power` (TDP) or `rt_power` (runtime).
+  const bool has_phy = mcp.type == 0 || (mcp.type == 1 && mcp.withPHY);
+  // Execution time from the XML statistics; shared by every sub-model.
+  const double exec_time =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+
+  rt_power.reset(); // Jingwen
+  frontend->rt_power.reset();
+  transecEngine->rt_power.reset();
+  dram->rt_power.reset();
+  mcp.executionTime = exec_time;                // Jingwen
+  frontend->mcp.executionTime = exec_time;      // Jingwen
+  transecEngine->mcp.executionTime = exec_time; // Jingwen
+
+  /*Jingwen: give stats for backend and phy */
+  transecEngine->mcp.reads = XML->sys.mc.memory_reads;
+  transecEngine->mcp.writes = XML->sys.mc.memory_writes;
+
+  frontend->computeEnergy(is_tdp);
+  transecEngine->computeEnergy(is_tdp);
+  dram->computeEnergy(is_tdp);
+  if (has_phy) {
+    if (!is_tdp)
+      PHY->rt_power.reset(); // Jingwen
+    PHY->mcp.reads = XML->sys.mc.memory_reads;
+    PHY->mcp.writes = XML->sys.mc.memory_writes;
+    PHY->mcp.executionTime = exec_time; // Jingwen
+    PHY->computeEnergy(is_tdp);
+  }
+
+  if (!is_tdp) {
+    rt_power = rt_power + frontend->rt_power + transecEngine->rt_power +
+               dram->rt_power;
+    if (has_phy)
+      rt_power = rt_power + PHY->rt_power;
+    return;
+  }
+  // TDP: only frontend, backend and PHY contribute; DRAM power is not added.
+  power = power + frontend->power + transecEngine->power;
+  if (has_phy)
+    power = power + PHY->power;
+}
+
+void MemoryController::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device;
+  // TDP view: controller totals plus each sub-block; otherwise totals only.
+  if (is_tdp) {
+    cout << "Memory Controller:" << endl;
+    cout << indent_str << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str
+         << "Peak Dynamic = " << power.readOp.dynamic * mcp.clockRate << " W"
+         << endl;
+    cout << indent_str << "Subthreshold Leakage = "
+         << (long_channel ? power.readOp.longer_channel_leakage
+                          : power.readOp.leakage)
+         << " W" << endl;
+    // cout << indent_str<< "Subthreshold Leakage = " <<
+    // power.readOp.longer_channel_leakage <<" W" << endl;
+    cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str
+         << "Runtime Dynamic = " << rt_power.readOp.dynamic / mcp.executionTime
+         << " W" << endl;
+    cout << endl;
+    cout << indent_str << "Front End Engine:" << endl;
+    cout << indent_str_next << "Area = " << frontend->area.get_area() * 1e-6
+         << " mm^2" << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << frontend->power.readOp.dynamic * mcp.clockRate
+         << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? frontend->power.readOp.longer_channel_leakage
+                          : frontend->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << frontend->power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << frontend->rt_power.readOp.dynamic / mcp.executionTime << " W"
+         << endl;
+    cout << endl;
+    // Pass plevel explicitly: (indent, is_tdp) would bind is_tdp to plevel.
+    frontend->displayEnergy(indent + 4, plevel, is_tdp);
+    // (this call used to be gated by a now-disabled `if (plevel > 2)`)
+    cout << indent_str << "Transaction Engine:" << endl;
+    cout << indent_str_next
+         << "Area = " << transecEngine->area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str_next << "Peak Dynamic = "
+         << transecEngine->power.readOp.dynamic * mcp.clockRate << " W" << endl;
+    cout << indent_str_next << "Subthreshold Leakage = "
+         << (long_channel ? transecEngine->power.readOp.longer_channel_leakage
+                          : transecEngine->power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str_next
+         << "Gate Leakage = " << transecEngine->power.readOp.gate_leakage
+         << " W" << endl;
+    cout << indent_str_next << "Runtime Dynamic = "
+         << transecEngine->rt_power.readOp.dynamic / mcp.executionTime << " W"
+         << endl;
+    cout << endl;
+    if (mcp.type == 0 || (mcp.type == 1 && mcp.withPHY)) {
+      cout << indent_str << "PHY:" << endl;
+      cout << indent_str_next << "Area = " << PHY->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << PHY->power.readOp.dynamic * mcp.clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? PHY->power.readOp.longer_channel_leakage
+                            : PHY->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << PHY->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << PHY->rt_power.readOp.dynamic / mcp.executionTime << " W" << endl;
+      cout << endl;
+    }
+  } else {
+    cout << "Memory Controller:" << endl;
+    cout << indent_str_next << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str_next
+         << "Peak Dynamic = " << power.readOp.dynamic * mcp.clockRate << " W"
+         << endl;
+    cout << indent_str_next << "Subthreshold Leakage = " << power.readOp.leakage
+         << " W" << endl;
+    cout << indent_str_next << "Gate Leakage = " << power.readOp.gate_leakage
+         << " W" << endl;
+    cout << endl;
+  }
+}
+
+void DRAM::set_dram_param() { // copy the XML dram_*_coeff values into dramp
+  dramp.cmd_coeff = XML->sys.mc.dram_cmd_coeff; // was wrongly dram_rd_coeff
+  dramp.act_coeff = XML->sys.mc.dram_act_coeff;
+  dramp.nop_coeff = XML->sys.mc.dram_nop_coeff;
+  dramp.activity_coeff = XML->sys.mc.dram_activity_coeff;
+  dramp.pre_coeff = XML->sys.mc.dram_pre_coeff;
+  dramp.rd_coeff = XML->sys.mc.dram_rd_coeff;
+  dramp.wr_coeff = XML->sys.mc.dram_wr_coeff;
+  dramp.req_coeff = XML->sys.mc.dram_req_coeff;
+  dramp.const_coeff = XML->sys.mc.dram_const_coeff;
+}
+
+void MemoryController::set_mc_param() {
+  // Fills mcp from XML->sys.mc; only mc_type == MC is handled, else it exits.
+  if (mc_type == MC) {
+    mcp.clockRate = XML->sys.mc.mc_clock * 2; // DDR double pumped
+    mcp.clockRate *= 1e6;
+    mcp.executionTime =
+        XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+
+    mcp.llcBlockSize = int(ceil(XML->sys.mc.llc_line_length / 8.0)) +
+                       XML->sys.mc.llc_line_length; // ecc overhead
+    mcp.dataBusWidth =
+        int(ceil(XML->sys.mc.databus_width / 8.0)) + XML->sys.mc.databus_width;
+    mcp.addressBusWidth = int(
+        ceil(XML->sys.mc.addressbus_width)); // XML->sys.physical_address_width;
+    mcp.opcodeW = 16;
+    mcp.num_mcs = XML->sys.mc.number_mcs;
+    mcp.num_channels = XML->sys.mc.memory_channels_per_mc;
+    mcp.reads = XML->sys.mc.memory_reads;
+    mcp.writes = XML->sys.mc.memory_writes;
+    //+++++++++Transaction engine +++++++++++++++++ ////TODO needs better
+    // numbers, Run the RTL code from OpenSparc.
+    mcp.peakDataTransferRate = XML->sys.mc.peak_transfer_rate;
+    mcp.memRank = XML->sys.mc.number_ranks;
+    //++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs better numbers
+    // PHY.memAccesses=PHY.peakDataTransferRate;//this is the max power
+    // PHY.llcBlocksize=llcBlockSize;
+    mcp.frontend_duty_cycle = 0.5; // for max power, the actual off-chip links
+                                   // is bidirectional but time shared
+    mcp.LVDS = XML->sys.mc.LVDS;
+    mcp.type = XML->sys.mc.type;
+    mcp.withPHY = XML->sys.mc.withPHY;
+  }
+  //	else if (mc_type==FLASHC)
+  //	{
+  //		mcp.clockRate       =XML->sys.flashc.mc_clock*2;//DDR double
+  // pumped 		mcp.clockRate       *= 1e6; mcp.executionTime =
+  // XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);
+  //
+  //		mcp.llcBlockSize
+  //=int(ceil(XML->sys.flashc.llc_line_length/8.0))+XML->sys.flashc.llc_line_length;//ecc
+  // overhead 		mcp.dataBusWidth
+  // =int(ceil(XML->sys.flashc.databus_width/8.0)) +
+  // XML->sys.flashc.databus_width; 		mcp.addressBusWidth
+  //=int(ceil(XML->sys.flashc.addressbus_width));//XML->sys.physical_address_width;
+  //		mcp.opcodeW         =16;
+  //		mcp.num_mcs         = XML->sys.flashc.number_mcs;
+  //		mcp.num_channels    = XML->sys.flashc.memory_channels_per_mc;
+  //		mcp.reads  = XML->sys.flashc.memory_reads;
+  //		mcp.writes = XML->sys.flashc.memory_writes;
+  //		//+++++++++Transaction engine +++++++++++++++++ ////TODO needs
+  // better numbers, Run the RTL code from OpenSparc.
+  // mcp.peakDataTransferRate =
+  // XML->sys.flashc.peak_transfer_rate; 		mcp.memRank =
+  // XML->sys.flashc.number_ranks;
+  //		//++++++++++++++PHY ++++++++++++++++++++++++++ //TODO needs
+  // better numbers
+  //		//PHY.memAccesses=PHY.peakDataTransferRate;//this is the max
+  // power
+  //		//PHY.llcBlocksize=llcBlockSize;
+  //		mcp.frontend_duty_cycle = 0.5;//for max power, the actual
+  // off-chip links is bidirectional but time shared 		mcp.LVDS =
+  // XML->sys.flashc.LVDS; 		mcp.type = XML->sys.flashc.type;
+  //	}
+  else {
+    cout << "Unknown memory controller type: neither DRAM controller nor Flash "
+            "controller"
+         << endl;
+    exit(1); // configuration error: exit with a failure status, not 0
+  }
+}
+
+MCFrontEnd ::~MCFrontEnd() {
+  // Frees the arbiter and three buffers. NOTE(review): PRT, threadMasks, PRC?
+  if (MC_arb) {
+    delete MC_arb;
+    MC_arb = 0;
+  }
+  if (frontendBuffer) {
+    delete frontendBuffer;
+    frontendBuffer = 0;
+  }
+  if (readBuffer) {
+    delete readBuffer;
+    readBuffer = 0;
+  }
+  if (writeBuffer) {
+    delete writeBuffer;
+    writeBuffer = 0;
+  }
+}
+
+MemoryController ::~MemoryController() {
+  // Frees four sub-blocks. NOTE(review): dram is not deleted - confirm owner.
+  if (frontend) {
+    delete frontend;
+    frontend = 0;
+  }
+  if (transecEngine) {
+    delete transecEngine;
+    transecEngine = 0;
+  }
+  if (PHY) {
+    delete PHY;
+    PHY = 0;
+  }
+  if (pipeLogic) {
+    delete pipeLogic;
+    pipeLogic = 0;
+  }
+}
diff --git a/src/gpuwattch/memoryctrl.h b/src/gpuwattch/memoryctrl.h
new file mode 100644
index 000000000..587c4a51f
--- /dev/null
+++ b/src/gpuwattch/memoryctrl.h
@@ -0,0 +1,147 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ *        Jingwen Leng, University of Texas, Austin
+ *        Syed Gilani, University of Wisconsin–Madison
+ *        Tayler Hetherington, University of British Columbia
+ *        Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#ifndef MEMORYCTRL_H_
+#define MEMORYCTRL_H_
+
+#include "XML_Parse.h"
+#include "cacti/parameter.h"
+//#include "io.h"
+#include "array.h"
+//#include "Undifferentiated_Core_Area.h"
+#include "basic_components.h"
+#include <vector>
+
+class MCBackend : public Component { // type of MemoryController::transecEngine
+public:
+  InputParameter l_ip;
+  uca_org_t local_result;
+  enum MemoryCtrl_type mc_type;
+  MCParam mcp; // executionTime/reads/writes set by MemoryController
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  MCBackend(InputParameter *interface_ip_, const MCParam &mcp_,
+            enum MemoryCtrl_type mc_type_);
+  void compute();
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~MCBackend(){};
+};
+
+class MCPHY : public Component { // type of MemoryController::PHY
+public:
+  InputParameter l_ip;
+  uca_org_t local_result;
+  enum MemoryCtrl_type mc_type;
+  MCParam mcp; // executionTime/reads/writes set by MemoryController
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  MCPHY(InputParameter *interface_ip_, const MCParam &mcp_,
+        enum MemoryCtrl_type mc_type_);
+  void compute();
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~MCPHY(){};
+};
+
+class MCFrontEnd : public Component { // type of MemoryController::frontend
+public:
+  ParseXML *XML;
+  InputParameter interface_ip;
+  enum MemoryCtrl_type mc_type;
+  MCParam mcp;
+  selection_logic *MC_arb; // deleted in ~MCFrontEnd()
+  ArrayST *frontendBuffer; // deleted in ~MCFrontEnd()
+  ArrayST *readBuffer;     // deleted in ~MCFrontEnd()
+  ArrayST *writeBuffer;    // deleted in ~MCFrontEnd()
+  // NOTE(review): the three arrays below are not deleted by ~MCFrontEnd().
+  ArrayST *PRT;
+  ArrayST *threadMasks;
+  ArrayST *PRC;
+  double coalesce_scale;
+
+  MCFrontEnd(ParseXML *XML_interface, InputParameter *interface_ip_,
+             const MCParam &mcp_, enum MemoryCtrl_type mc_type_);
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~MCFrontEnd();
+};
+
+class DRAM : public Component { // type of MemoryController::dram
+public:
+  ParseXML *XML;
+  InputParameter interface_ip;
+  enum Dram_type dram_type;
+  DRAMParam dramp; // coefficients loaded from XML by set_dram_param()
+  powerDef power_t;
+  DRAM(ParseXML *XML_interface, InputParameter *interface_ip_,
+       enum Dram_type dram_type_);
+  void set_dram_param();
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~DRAM();
+};
+
+class MemoryController : public Component { // aggregates the MC sub-blocks
+public:
+  ParseXML *XML;
+  InputParameter interface_ip;
+  enum MemoryCtrl_type mc_type;
+  MCParam mcp; // filled by set_mc_param()
+  DRAM *dram;  // NOTE(review): not deleted in ~MemoryController()
+  MCFrontEnd *frontend;
+  MCBackend *transecEngine;
+  MCPHY *PHY; // used when mcp.type == 0, or mcp.type == 1 with mcp.withPHY
+  Pipeline *pipeLogic;
+
+  // Add coalescing logic related modules with each memory controller --Syed
+  // Gilani
+
+  // clock_network clockNetwork;
+  MemoryController(ParseXML *XML_interface, InputParameter *interface_ip_,
+                   enum MemoryCtrl_type mc_type_, enum Dram_type dram_type_);
+  void set_mc_param();
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  ~MemoryController();
+};
+#endif /* MEMORYCTRL_H_ */
diff --git a/src/gpuwattch/noc.cc b/src/gpuwattch/noc.cc
new file mode 100644
index 000000000..0845cda12
--- /dev/null
+++ b/src/gpuwattch/noc.cc
@@ -0,0 +1,452 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ *        Jingwen Leng, University of Texas, Austin
+ *        Syed Gilani, University of Wisconsin–Madison
+ *        Tayler Hetherington, University of British Columbia
+ *        Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#include "noc.h"
+#include "XML_Parse.h"
+#include "cacti/basic_circuit.h"
+#include "const.h"
+#include "io.h"
+#include "parameter.h"
+#include <algorithm>
+#include <assert.h>
+#include <cmath>
+#include <iostream>
+#include <string>
+
+NoC::NoC(ParseXML *XML_interface, int ithNoC_, InputParameter *interface_ip_,
+         double M_traffic_pattern_, double link_len_)
+    : XML(XML_interface), ithNoC(ithNoC_), interface_ip(*interface_ip_),
+      router(0), link_bus(0), link_bus_exist(false), router_exist(false),
+      M_traffic_pattern(M_traffic_pattern_) {
+  /*
+   * initialize, compute and optimize individual components.
+   */
+  // Wire type and material settings depend on whether the system is Embedded.
+  if (XML->sys.Embedded) {
+    interface_ip.wt = Global_30;
+    interface_ip.wire_is_mat_type = 0;
+    interface_ip.wire_os_mat_type = 1;
+  } else {
+    interface_ip.wt = Global;
+    interface_ip.wire_is_mat_type = 2;
+    interface_ip.wire_os_mat_type = 2;
+  }
+  set_noc_param();
+  local_result = init_interface(&interface_ip);
+  scktRatio = g_tp.sckt_co_eff;
+  // A nonzero type builds only the router here; type 0 builds only the bus.
+  if (nocdynp.type) { /*
+                       * if NOC compute router, router links must be computed
+                       * separately and called from external since total chip
+                       * area must be known first
+                       */
+    init_router();
+  } else {
+    init_link_bus(link_len_); // if bus compute bus
+  }
+
+  //  //clock power
+  //  clockNetwork.init_wire_external(is_default, &interface_ip);
+  //  clockNetwork.clk_area           =area*1.1;//10% of placement overhead.
+  //  rule of thumb clockNetwork.end_wiring_level   =5;//toplevel metal
+  //  clockNetwork.start_wiring_level =5;//toplevel metal
+  //  clockNetwork.num_regs           = corepipe.tot_stage_vector;
+  //  clockNetwork.optimize_wire();
+}
+
+void NoC::init_router() { // allocate the router and add its area per node
+  router = new MCPAT_Router(
+      nocdynp.flit_size,
+      nocdynp.virtual_channel_per_port * nocdynp.input_buffer_entries_per_vc,
+      nocdynp.virtual_channel_per_port, &(g_tp.peri_global),
+      nocdynp.input_ports, nocdynp.output_ports, M_traffic_pattern);
+  // router->print_router();
+  area.set_area(area.get_area() +
+                router->area.get_area() * nocdynp.total_nodes);
+  // Long-channel leakage = plain leakage scaled by one Uncore_device factor.
+  double long_channel_device_reduction =
+      longer_channel_device_reduction(Uncore_device);
+  router->power.readOp.longer_channel_leakage =
+      router->power.readOp.leakage * long_channel_device_reduction;
+  router->buffer.power.readOp.longer_channel_leakage =
+      router->buffer.power.readOp.leakage * long_channel_device_reduction;
+  router->crossbar.power.readOp.longer_channel_leakage =
+      router->crossbar.power.readOp.leakage * long_channel_device_reduction;
+  router->arbiter.power.readOp.longer_channel_leakage =
+      router->arbiter.power.readOp.leakage * long_channel_device_reduction;
+  router_exist = true;
+}
+
+void NoC ::init_link_bus(double link_len_) {
+  // Builds the link/bus interconnect and folds its area into this NoC's area.
+  //	if (nocdynp.min_ports==1 )
+  if (nocdynp.type)
+    link_name = "Links";
+  else
+    link_name = "Bus";
+
+  link_len = link_len_;
+  assert(link_len > 0);
+
+  interface_ip.throughput = nocdynp.link_throughput / nocdynp.clockRate;
+  interface_ip.latency = nocdynp.link_latency / nocdynp.clockRate;
+  // NOTE(review): if node counts are integers, (h + v) / 2 truncates - confirm.
+  link_len /= (nocdynp.horizontal_nodes + nocdynp.vertical_nodes) / 2;
+
+  if (nocdynp.total_nodes > 1)
+    link_len /= 2; // All links are shared by neighbors
+  link_bus = new interconnect(name, Uncore_device, 1, 1, nocdynp.flit_size,
+                              link_len, &interface_ip, 3, true /*pipelinable*/,
+                              nocdynp.route_over_perc);
+
+  link_bus_tot_per_Router.area.set_area(
+      link_bus_tot_per_Router.area.get_area() +
+      link_bus->area.get_area() * nocdynp.global_linked_ports);
+
+  area.set_area(area.get_area() +
+                link_bus_tot_per_Router.area.get_area() * nocdynp.total_nodes);
+  link_bus_exist = true;
+}
+void NoC::computeEnergy(bool is_tdp) {
+  // pppm_t: power_point_product_masks; is_tdp picks power vs. rt_power.
+  double pppm_t[4] = {1, 1, 1, 1};
+  double M = nocdynp.duty_cycle;
+  // nocdynp.executionTime=XML->sys.total_cycles/(XML->sys.target_core_clockrate*1e6);//Syed
+  // cout<<"NOC Total Cycles: "<<XML->sys.total_cycles<<endl;
+  // cout<<"NOC Clock Rate: "<<XML->sys.target_core_clockrate<<endl;
+  if (is_tdp) {
+    // init stats for TDP
+    stats_t.readAc.access = M;
+    tdp_stats = stats_t;
+    if (router_exist) {
+      set_pppm(pppm_t, 1 * M, 1, 1, 1); // reset traffic pattern
+      router->power = router->power * pppm_t;
+      set_pppm(pppm_t, nocdynp.total_nodes, nocdynp.total_nodes,
+               nocdynp.total_nodes, nocdynp.total_nodes);
+      power = power + router->power * pppm_t;
+    }
+    if (link_bus_exist) {
+      if (nocdynp.type)
+        set_pppm(pppm_t, 1 * M_traffic_pattern * M * (nocdynp.min_ports - 1),
+                 nocdynp.global_linked_ports, nocdynp.global_linked_ports,
+                 nocdynp.global_linked_ports);
+      // reset traffic pattern; local port do not have router links
+      else
+        set_pppm(pppm_t, 1 * M_traffic_pattern * M * (nocdynp.min_ports),
+                 nocdynp.global_linked_ports, nocdynp.global_linked_ports,
+                 nocdynp.global_linked_ports); // reset traffic pattern
+
+      link_bus_tot_per_Router.power = link_bus->power * pppm_t;
+
+      set_pppm(pppm_t, nocdynp.total_nodes, nocdynp.total_nodes,
+               nocdynp.total_nodes, nocdynp.total_nodes);
+      power = power + link_bus_tot_per_Router.power * pppm_t;
+    }
+  } else {
+    rt_power.reset();
+    // link_bus->rt_power.reset();
+    // init stats for runtime power (RTP)
+    stats_t.readAc.access = XML->sys.NoC[ithNoC].total_accesses;
+    // cout<<"NOC(computeEnergy) read accesses: "<< stats_t.readAc.access<<endl;
+    rtp_stats = stats_t;
+    set_pppm(pppm_t, 1, 0, 0, 0);
+    if (router_exist) {
+      // router stays null for bus-type NoCs; only dereference it in here.
+      router->buffer.rt_power.reset();
+      router->crossbar.rt_power.reset();
+      router->arbiter.rt_power.reset();
+      router->rt_power.reset();
+      router->buffer.rt_power.readOp.dynamic =
+          (router->buffer.power.readOp.dynamic +
+           router->buffer.power.writeOp.dynamic) *
+          rtp_stats.readAc.access;
+      router->crossbar.rt_power.readOp.dynamic =
+          router->crossbar.power.readOp.dynamic * rtp_stats.readAc.access;
+      router->arbiter.rt_power.readOp.dynamic =
+          router->arbiter.power.readOp.dynamic * rtp_stats.readAc.access;
+
+      router->rt_power =
+          router->rt_power +
+          (router->buffer.rt_power + router->crossbar.rt_power +
+           router->arbiter.rt_power) *
+              pppm_t +
+          router->power * pppm_lkg; // TDP power must be calculated first!
+      rt_power = rt_power + router->rt_power;
+    }
+    if (link_bus_exist) {
+      link_bus->rt_power.reset();
+      set_pppm(pppm_t, rtp_stats.readAc.access, 1, 1, rtp_stats.readAc.access);
+      link_bus->rt_power = link_bus->power * pppm_t;
+      rt_power = rt_power + link_bus->rt_power;
+    }
+  }
+}
+
+void NoC::displayEnergy(uint32_t indent, int plevel, bool is_tdp) {
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool long_channel = XML->sys.longer_channel_device;
+
+  double M = M_traffic_pattern * nocdynp.duty_cycle;
+  /*only router as a whole has been applied the M_traffic_pattern(0.6 by
+   * default) factor in router.cc; When power of crossbars, arbiters, etc need
+   * to be displayed, the M_traffic_pattern factor need to be applied together
+   * with McPAT's extra traffic pattern.
+   * */
+  if (is_tdp) {
+    cout << name << endl;
+    cout << indent_str << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str
+         << "Peak Dynamic = " << power.readOp.dynamic * nocdynp.clockRate
+         << " W" << endl;
+    cout << indent_str << "Subthreshold Leakage = "
+         << (long_channel ? power.readOp.longer_channel_leakage
+                          : power.readOp.leakage)
+         << " W" << endl;
+    cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str << "Runtime Dynamic = "
+         << rt_power.readOp.dynamic / nocdynp.executionTime << " W" << endl;
+    // cout << indent_str<< "Execution Time = " << nocdynp.executionTime << " s"
+    // << endl;
+    cout << endl;
+    // Router section; buffer/crossbar/arbiter detail only when plevel > 2.
+    if (router_exist) {
+      cout << indent_str << "Router: " << endl;
+      cout << indent_str_next << "Area = " << router->area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = "
+           << router->power.readOp.dynamic * nocdynp.clockRate << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? router->power.readOp.longer_channel_leakage
+                            : router->power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << router->power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << router->rt_power.readOp.dynamic / nocdynp.executionTime << " W"
+           << endl;
+      cout << endl;
+      if (plevel > 2) {
+        cout << indent_str << indent_str << "Virtual Channel Buffer:" << endl;
+        cout << indent_str << indent_str_next << "Area = "
+             << router->buffer.area.get_area() * 1e-6 * nocdynp.input_ports
+             << " mm^2" << endl;
+        cout << indent_str << indent_str_next << "Peak Dynamic = "
+             << (router->buffer.power.readOp.dynamic +
+                 router->buffer.power.writeOp.dynamic) *
+                    nocdynp.min_ports * M * nocdynp.clockRate
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel
+                     ? router->buffer.power.readOp.longer_channel_leakage *
+                           nocdynp.input_ports
+                     : router->buffer.power.readOp.leakage *
+                           nocdynp.input_ports)
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Gate Leakage = "
+             << router->buffer.power.readOp.gate_leakage * nocdynp.input_ports
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Runtime Dynamic = "
+             << router->buffer.rt_power.readOp.dynamic / nocdynp.executionTime
+             << " W" << endl;
+        cout << endl;
+        cout << indent_str << indent_str << "Crossbar:" << endl;
+        cout << indent_str << indent_str_next
+             << "Area = " << router->crossbar.area.get_area() * 1e-6 << " mm^2"
+             << endl;
+        cout << indent_str << indent_str_next << "Peak Dynamic = "
+             << router->crossbar.power.readOp.dynamic * nocdynp.clockRate *
+                    nocdynp.min_ports * M
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel
+                     ? router->crossbar.power.readOp.longer_channel_leakage
+                     : router->crossbar.power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str << indent_str_next
+             << "Gate Leakage = " << router->crossbar.power.readOp.gate_leakage
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Runtime Dynamic = "
+             << router->crossbar.rt_power.readOp.dynamic / nocdynp.executionTime
+             << " W" << endl;
+        cout << endl;
+        cout << indent_str << indent_str << "Arbiter:" << endl;
+        cout << indent_str << indent_str_next << "Peak Dynamic = "
+             << router->arbiter.power.readOp.dynamic * nocdynp.clockRate *
+                    nocdynp.min_ports * M
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel
+                     ? router->arbiter.power.readOp.longer_channel_leakage
+                     : router->arbiter.power.readOp.leakage)
+             << " W" << endl;
+        cout << indent_str << indent_str_next
+             << "Gate Leakage = " << router->arbiter.power.readOp.gate_leakage
+             << " W" << endl;
+        cout << indent_str << indent_str_next << "Runtime Dynamic = "
+             << router->arbiter.rt_power.readOp.dynamic / nocdynp.executionTime
+             << " W" << endl;
+        cout << endl;
+      }
+    }
+    if (link_bus_exist) {
+      cout << indent_str << (nocdynp.type ? "Per Router " : "") << link_name
+           << ": " << endl;
+      cout << indent_str_next
+           << "Area = " << link_bus_tot_per_Router.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = "
+           << link_bus_tot_per_Router.power.readOp.dynamic * nocdynp.clockRate
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel
+                   ? link_bus_tot_per_Router.power.readOp.longer_channel_leakage
+                   : link_bus_tot_per_Router.power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next << "Gate Leakage = "
+           << link_bus_tot_per_Router.power.readOp.gate_leakage << " W" << endl;
+      cout << indent_str_next << "Runtime Dynamic = "
+           << link_bus->rt_power.readOp.dynamic / nocdynp.executionTime << " W"
+           << endl;
+      cout << endl;
+    }
+  } else {
+    //		cout << indent_str_next << "Instruction Fetch Unit    Peak
+    //Dynamic
+    //=
+    //"
+    //<< ifu->rt_power.readOp.dynamic*clockRate << " W" << endl;
+    // cout
+    //<< indent_str_next << "Instruction Fetch Unit    Subthreshold Leakage = "
+    // << ifu->rt_power.readOp.leakage <<" W" << endl; 		cout <<
+    // indent_str_next << "Instruction Fetch Unit    Gate Leakage = " <<
+    // ifu->rt_power.readOp.gate_leakage << " W" << endl; 		cout <<
+    // indent_str_next
+    //<< "Load Store Unit   Peak Dynamic = " <<
+    // lsu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout
+    // << indent_str_next << "Load Store Unit   Subthreshold Leakage = " <<
+    // lsu->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next
+    // << "Load Store Unit   Gate Leakage = " <<
+    // lsu->rt_power.readOp.gate_leakage
+    //<< " W" << endl; 		cout << indent_str_next << "Memory Management
+    // Unit Peak Dynamic = " << mmu->rt_power.readOp.dynamic*clockRate  << " W"
+    // <<
+    // endl; 		cout << indent_str_next << "Memory Management Unit
+    // Subthreshold Leakage = " << mmu->rt_power.readOp.leakage  << " W" <<
+    // endl; 		cout
+    // << indent_str_next << "Memory Management Unit   Gate Leakage = " <<
+    // mmu->rt_power.readOp.gate_leakage  << " W" << endl; 		cout <<
+    // indent_str_next << "Execution Unit   Peak Dynamic = " <<
+    // exu->rt_power.readOp.dynamic*clockRate  << " W" << endl;
+    // cout
+    // << indent_str_next << "Execution Unit   Subthreshold Leakage = " <<
+    // exu->rt_power.readOp.leakage  << " W" << endl; 		cout <<
+    // indent_str_next
+    // << "Execution Unit   Gate Leakage = " <<
+    // exu->rt_power.readOp.gate_leakage
+    //<< " W" << endl;
+  }
+}
+
+void NoC::set_noc_param() {
+  // Copies XML->sys.NoC[ithNoC] settings into nocdynp and picks the name.
+  nocdynp.type = XML->sys.NoC[ithNoC].type;
+  nocdynp.clockRate = XML->sys.NoC[ithNoC].clockrate;
+  nocdynp.clockRate *= 1e6;
+  nocdynp.executionTime =
+      XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+
+  nocdynp.flit_size = XML->sys.NoC[ithNoC].flit_bits;
+  if (nocdynp.type) {
+    nocdynp.input_ports = XML->sys.NoC[ithNoC].input_ports;
+    nocdynp.output_ports = XML->sys.NoC[ithNoC].output_ports; // later minus 1
+    nocdynp.min_ports = min(nocdynp.input_ports, nocdynp.output_ports);
+    nocdynp.global_linked_ports =
+        (nocdynp.input_ports - 1) + (nocdynp.output_ports - 1);
+    /*
+     * 	Except local i/o ports, all ports needs links( global_linked_ports);
+     *  However only min_ports can be fully active simultaneously
+     *  since the fewer number of ports (input or output ) is the bottleneck.
+     */
+  } else {
+    nocdynp.input_ports = 1;
+    nocdynp.output_ports = 1;
+    nocdynp.min_ports = min(nocdynp.input_ports, nocdynp.output_ports);
+    nocdynp.global_linked_ports = 1;
+  }
+
+  nocdynp.virtual_channel_per_port =
+      XML->sys.NoC[ithNoC].virtual_channel_per_port;
+  nocdynp.input_buffer_entries_per_vc =
+      XML->sys.NoC[ithNoC].input_buffer_entries_per_vc;
+
+  nocdynp.horizontal_nodes = XML->sys.NoC[ithNoC].horizontal_nodes;
+  nocdynp.vertical_nodes = XML->sys.NoC[ithNoC].vertical_nodes;
+  nocdynp.total_nodes = nocdynp.horizontal_nodes * nocdynp.vertical_nodes;
+  nocdynp.duty_cycle = XML->sys.NoC[ithNoC].duty_cycle;
+  nocdynp.has_global_link = XML->sys.NoC[ithNoC].has_global_link;
+  nocdynp.link_throughput = XML->sys.NoC[ithNoC].link_throughput;
+  nocdynp.link_latency = XML->sys.NoC[ithNoC].link_latency;
+  nocdynp.chip_coverage = XML->sys.NoC[ithNoC].chip_coverage;
+  nocdynp.route_over_perc = XML->sys.NoC[ithNoC].route_over_perc;
+
+  assert(nocdynp.chip_coverage <= 1);
+  assert(nocdynp.route_over_perc <= 1);
+
+  if (nocdynp.type)
+    name = "NOC";
+  else
+    name = "BUSES";
+}
+
+NoC ::~NoC() {
+  // Frees whichever of router / link_bus was allocated and nulls the pointer.
+  if (router) {
+    delete router;
+    router = 0;
+  }
+  if (link_bus) {
+    delete link_bus;
+    link_bus = 0;
+  }
+}
diff --git a/src/gpuwattch/noc.h b/src/gpuwattch/noc.h
new file mode 100644
index 000000000..7075166d2
--- /dev/null
+++ b/src/gpuwattch/noc.h
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ * Jingwen Leng, University of Texas, Austin
+ * Syed Gilani, University of Wisconsin–Madison
+ * Tayler Hetherington, University of British Columbia
+ * Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+
+#ifndef NOC_H_
+#define NOC_H_
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "cacti/parameter.h"
+#include "cacti/router.h"
+#include "interconnect.h"
+#include "logic.h"
+
+class NoC : public Component { // model of one on-chip interconnect ("NOC" or "BUSES")
+public:
+  ParseXML *XML; // parsed configuration; set_noc_param() reads XML->sys.NoC[ithNoC]
+  int ithNoC; // index of this interconnect inside XML->sys.NoC[]
+  InputParameter interface_ip; // presumably a private copy of the caller's CACTI inputs - TODO confirm
+  double link_len; // presumably the link/bus length passed as link_len_ - TODO confirm
+  double executionTime;
+  double scktRatio, chip_PR_overhead, macro_PR_overhead;
+  MCPAT_Router *router; // deleted in ~NoC()
+  interconnect *link_bus; // deleted in ~NoC()
+  NoCParam nocdynp; // dynamic parameters filled in by set_noc_param()
+  uca_org_t local_result;
+  statsDef tdp_stats;
+  statsDef rtp_stats;
+  statsDef stats_t;
+  powerDef power_t;
+  Component link_bus_tot_per_Router; // Processor scales its area by nocdynp.total_nodes
+  bool link_bus_exist;
+  bool router_exist;
+  string name, link_name; // set_noc_param() sets name to "NOC" or "BUSES" from nocdynp.type
+  double M_traffic_pattern;
+  NoC(ParseXML *XML_interface, int ithNoC_, InputParameter *interface_ip_,
+      double M_traffic_pattern_ = 0.6, double link_len_ = 0);
+  void set_noc_param(); // copies the XML->sys.NoC[ithNoC] settings into nocdynp
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100, bool is_tdp = true);
+  void init_link_bus(double link_len_); // Processor calls this for NoCs that have global links
+  void init_router();
+  void computeEnergy_link_bus(bool is_tdp = true);
+  void displayEnergy_link_bus(uint32_t indent = 0, int plevel = 100,
+                              bool is_tdp = true);
+  ~NoC(); // deletes router and link_bus
+};
+
+#endif /* NOC_H_ */
diff --git a/src/gpuwattch/processor.cc b/src/gpuwattch/processor.cc
new file mode 100644
index 000000000..d9de4aa2f
--- /dev/null
+++ b/src/gpuwattch/processor.cc
@@ -0,0 +1,1205 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ * Jingwen Leng, University of Texas, Austin
+ * Syed Gilani, University of Wisconsin–Madison
+ * Tayler Hetherington, University of British Columbia
+ * Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+#include "processor.h"
+#include "XML_Parse.h"
+#include "array.h"
+#include "cacti/basic_circuit.h"
+#include "const.h"
+#include "parameter.h"
+#include "version.h"
+#include <algorithm>
+#include <assert.h>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <stdio.h>
+#include <string.h>
+
+Processor::Processor(ParseXML *XML_interface)
+    : XML(XML_interface), // TODO: using one global copy may have problems.
+      mc(0), niu(0), pcie(0), flashcontroller(0) {
+  /*
+   * Builds each component class, accumulating area/power/rt_power into the
+   * chip totals; a homogeneous class is modeled once and scaled by its count.
+   * McPAT only supports homogeneous memory controllers (a single `mc` model).
+   */
+  rt_power.reset();
+  int i;
+  double pppm_t[4] = {1, 1, 1, 1}; // scaling factors, rewritten by set_pppm()
+  l2_power = 0;
+  idle_core_power = 0;
+  set_proc_param();
+  if (procdynp.homoCore)
+    numCore = procdynp.numCore == 0 ? 0 : 1;
+  else
+    numCore = procdynp.numCore;
+
+  if (procdynp.homoL2)
+    numL2 = procdynp.numL2 == 0 ? 0 : 1;
+  else
+    numL2 = procdynp.numL2;
+
+  if (XML->sys.Private_L2 && numCore != numL2) {
+    cout << "Number of private L2 does not match number of cores" << endl;
+    exit(1); // configuration error: exit with a failure status
+  }
+
+  if (procdynp.homoL3)
+    numL3 = procdynp.numL3 == 0 ? 0 : 1;
+  else
+    numL3 = procdynp.numL3;
+
+  if (procdynp.homoNOC)
+    numNOC = procdynp.numNOC == 0 ? 0 : 1;
+  else
+    numNOC = procdynp.numNOC;
+
+  //  if (!procdynp.homoNOC)
+  //  {
+  //	  cout<<"Current McPAT does not support heterogeneous NOC"<<endl;
+  //      exit(0);
+  //  }
+
+  if (procdynp.homoL1Dir)
+    numL1Dir = procdynp.numL1Dir == 0 ? 0 : 1;
+  else
+    numL1Dir = procdynp.numL1Dir;
+
+  if (procdynp.homoL2Dir)
+    numL2Dir = procdynp.numL2Dir == 0 ? 0 : 1;
+  else
+    numL2Dir = procdynp.numL2Dir;
+
+  for (i = 0; i < numCore; i++) {
+    cores.push_back(new Core(XML, i, &interface_ip));
+    cores[i]->computeEnergy();
+    cores[i]->computeEnergy(false);
+    if (procdynp.homoCore) {
+      core.area.set_area(core.area.get_area() +
+                         cores[i]->area.get_area() * procdynp.numCore);
+      set_pppm(pppm_t, cores[i]->clockRate * procdynp.numCore, procdynp.numCore,
+               procdynp.numCore, procdynp.numCore);
+      // set the exClockRate
+      exClockRate = cores[0]->clockRate * 2; // TODO; get from XML file
+      // cout<<"****EX clock rate:"<<exClockRate<<endl;
+      core.power = core.power + cores[i]->power * pppm_t;
+      set_pppm(pppm_t, 1 / cores[i]->executionTime, procdynp.numCore,
+               procdynp.numCore, procdynp.numCore);
+      core.rt_power = core.rt_power + cores[i]->rt_power * pppm_t;
+      area.set_area(area.get_area() +
+                    core.area.get_area()); // placement and routing overhead is
+                                           // 10%, core scales worse than cache
+                                           // 40% is accumulated from 90 to 22nm
+      power = power + core.power;
+      rt_power = rt_power + core.rt_power;
+    } else {
+      core.area.set_area(core.area.get_area() + cores[i]->area.get_area());
+      area.set_area(
+          area.get_area() +
+          cores[i]->area.get_area()); // placement and routing overhead is 10%,
+                                      // core scales worse than cache 40% is
+                                      // accumulated from 90 to 22nm
+
+      set_pppm(pppm_t, cores[i]->clockRate, 1, 1, 1);
+      // set the exClockRate
+      exClockRate = cores[0]->clockRate; // TODO; get from XML file
+      // cout<<"****EX clock rate:"<<exClockRate<<endl;
+      core.power = core.power + cores[i]->power * pppm_t;
+      power = power + cores[i]->power * pppm_t;
+
+      set_pppm(pppm_t, 1 / cores[i]->executionTime, 1, 1, 1);
+      core.rt_power = core.rt_power + cores[i]->rt_power * pppm_t;
+      rt_power = rt_power + cores[i]->rt_power * pppm_t;
+    }
+  }
+
+  if (!XML->sys.Private_L2) {
+
+    if (numL2 > 0)
+      for (i = 0; i < numL2; i++) {
+        l2array.push_back(new SharedCache(XML, i, &interface_ip));
+
+        l2array[i]->computeEnergy();
+        l2array[i]->computeEnergy(false);
+        if (procdynp.homoL2) {
+          l2.area.set_area(l2.area.get_area() +
+                           l2array[i]->area.get_area() * procdynp.numL2);
+          set_pppm(pppm_t, l2array[i]->cachep.clockRate * procdynp.numL2,
+                   procdynp.numL2, procdynp.numL2, procdynp.numL2);
+          l2.power = l2.power + l2array[i]->power * pppm_t;
+          set_pppm(pppm_t, 1 / l2array[i]->cachep.executionTime, procdynp.numL2,
+                   procdynp.numL2, procdynp.numL2);
+          l2.rt_power = l2.rt_power + l2array[i]->rt_power * pppm_t;
+          area.set_area(
+              area.get_area() +
+              l2.area.get_area()); // placement and routing overhead is 10%, l2
+                                   // scales worse than cache 40% is accumulated
+                                   // from 90 to 22nm
+          power = power + l2.power;
+          rt_power = rt_power + l2.rt_power;
+        } else {
+          l2.area.set_area(l2.area.get_area() + l2array[i]->area.get_area());
+          area.set_area(
+              area.get_area() +
+              l2array[i]
+                  ->area.get_area()); // placement and routing overhead is
+                                      // 10%, l2 scales worse than cache
+                                      // 40% is accumulated from 90 to 22nm
+
+          set_pppm(pppm_t, l2array[i]->cachep.clockRate, 1, 1, 1);
+          l2.power = l2.power + l2array[i]->power * pppm_t;
+          power = power + l2array[i]->power * pppm_t;
+
+          set_pppm(pppm_t, 1 / l2array[i]->cachep.executionTime, 1, 1, 1);
+          l2.rt_power = l2.rt_power + l2array[i]->rt_power * pppm_t;
+          rt_power = rt_power + l2array[i]->rt_power * pppm_t;
+        }
+      }
+  }
+
+  if (numL3 > 0)
+    for (i = 0; i < numL3; i++) {
+      l3array.push_back(new SharedCache(XML, i, &interface_ip, L3));
+      l3array[i]->computeEnergy();
+      l3array[i]->computeEnergy(false);
+      if (procdynp.homoL3) {
+        l3.area.set_area(l3.area.get_area() +
+                         l3array[i]->area.get_area() * procdynp.numL3);
+        set_pppm(pppm_t, l3array[i]->cachep.clockRate * procdynp.numL3,
+                 procdynp.numL3, procdynp.numL3, procdynp.numL3);
+        l3.power = l3.power + l3array[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / l3array[i]->cachep.executionTime, procdynp.numL3,
+                 procdynp.numL3, procdynp.numL3);
+        l3.rt_power = l3.rt_power + l3array[i]->rt_power * pppm_t;
+        area.set_area(area.get_area() +
+                      l3.area.get_area()); // placement and routing overhead is
+                                           // 10%, l3 scales worse than cache
+                                           // 40% is accumulated from 90 to 22nm
+        power = power + l3.power;
+        rt_power = rt_power + l3.rt_power;
+
+      } else {
+        l3.area.set_area(l3.area.get_area() + l3array[i]->area.get_area());
+        area.set_area(
+            area.get_area() +
+            l3array[i]->area.get_area()); // placement and routing overhead is
+                                          // 10%, l3 scales worse than cache 40%
+                                          // is accumulated from 90 to 22nm
+        set_pppm(pppm_t, l3array[i]->cachep.clockRate, 1, 1, 1);
+        l3.power = l3.power + l3array[i]->power * pppm_t;
+        power = power + l3array[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / l3array[i]->cachep.executionTime, 1, 1, 1);
+        l3.rt_power = l3.rt_power + l3array[i]->rt_power * pppm_t;
+        rt_power = rt_power + l3array[i]->rt_power * pppm_t;
+      }
+    }
+  if (numL1Dir > 0)
+    for (i = 0; i < numL1Dir; i++) {
+      l1dirarray.push_back(new SharedCache(XML, i, &interface_ip, L1Directory));
+      l1dirarray[i]->computeEnergy();
+      l1dirarray[i]->computeEnergy(false);
+      if (procdynp.homoL1Dir) {
+        l1dir.area.set_area(l1dir.area.get_area() +
+                            l1dirarray[i]->area.get_area() * procdynp.numL1Dir);
+        set_pppm(pppm_t, l1dirarray[i]->cachep.clockRate * procdynp.numL1Dir,
+                 procdynp.numL1Dir, procdynp.numL1Dir, procdynp.numL1Dir);
+        l1dir.power = l1dir.power + l1dirarray[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / l1dirarray[i]->cachep.executionTime,
+                 procdynp.numL1Dir, procdynp.numL1Dir, procdynp.numL1Dir);
+        l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power * pppm_t;
+        area.set_area(
+            area.get_area() +
+            l1dir.area.get_area()); // placement and routing overhead is 10%,
+                                    // l1dir scales worse than cache 40% is
+                                    // accumulated from 90 to 22nm
+        power = power + l1dir.power;
+        rt_power = rt_power + l1dir.rt_power;
+
+      } else {
+        l1dir.area.set_area(l1dir.area.get_area() +
+                            l1dirarray[i]->area.get_area());
+        area.set_area(area.get_area() + l1dirarray[i]->area.get_area());
+        set_pppm(pppm_t, l1dirarray[i]->cachep.clockRate, 1, 1, 1);
+        l1dir.power = l1dir.power + l1dirarray[i]->power * pppm_t;
+        power = power + l1dirarray[i]->power * pppm_t; // same scaling as l1dir.power
+        set_pppm(pppm_t, 1 / l1dirarray[i]->cachep.executionTime, 1, 1, 1);
+        l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power * pppm_t;
+        rt_power = rt_power + l1dirarray[i]->rt_power * pppm_t; // as l1dir.rt_power
+      }
+    }
+
+  if (numL2Dir > 0)
+    for (i = 0; i < numL2Dir; i++) {
+      l2dirarray.push_back(new SharedCache(XML, i, &interface_ip, L2Directory));
+      l2dirarray[i]->computeEnergy();
+      l2dirarray[i]->computeEnergy(false);
+      if (procdynp.homoL2Dir) {
+        l2dir.area.set_area(l2dir.area.get_area() +
+                            l2dirarray[i]->area.get_area() * procdynp.numL2Dir);
+        set_pppm(pppm_t, l2dirarray[i]->cachep.clockRate * procdynp.numL2Dir,
+                 procdynp.numL2Dir, procdynp.numL2Dir, procdynp.numL2Dir);
+        l2dir.power = l2dir.power + l2dirarray[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / l2dirarray[i]->cachep.executionTime,
+                 procdynp.numL2Dir, procdynp.numL2Dir, procdynp.numL2Dir);
+        l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power * pppm_t;
+        area.set_area(
+            area.get_area() +
+            l2dir.area.get_area()); // placement and routing overhead is 10%,
+                                    // l2dir scales worse than cache 40% is
+                                    // accumulated from 90 to 22nm
+        power = power + l2dir.power;
+        rt_power = rt_power + l2dir.rt_power;
+
+      } else {
+        l2dir.area.set_area(l2dir.area.get_area() +
+                            l2dirarray[i]->area.get_area());
+        area.set_area(area.get_area() + l2dirarray[i]->area.get_area());
+        set_pppm(pppm_t, l2dirarray[i]->cachep.clockRate, 1, 1, 1);
+        l2dir.power = l2dir.power + l2dirarray[i]->power * pppm_t;
+        power = power + l2dirarray[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / l2dirarray[i]->cachep.executionTime, 1, 1, 1);
+        l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power * pppm_t;
+        rt_power = rt_power + l2dirarray[i]->rt_power * pppm_t;
+      }
+    }
+
+  if (XML->sys.mc.number_mcs > 0 && XML->sys.mc.memory_channels_per_mc > 0) {
+    if (XML->sys.architecture == 1) // 1 for fermi
+      mc = new MemoryController(XML, &interface_ip, MC, GDDR5);
+    else if (XML->sys.architecture == 2) // 2 for quadro
+      mc = new MemoryController(XML, &interface_ip, MC, GDDR3);
+    else {
+      printf("Architecture %d not defined!\n", XML->sys.architecture);
+      printf("use 1 for fermi and 2 for quadro!\n");
+      exit(1);
+    }
+    mc->computeEnergy();
+    mc->computeEnergy(false);
+    mcs.area.set_area(mcs.area.get_area() +
+                      mc->area.get_area() * XML->sys.mc.number_mcs);
+    area.set_area(area.get_area() +
+                  mc->area.get_area() * XML->sys.mc.number_mcs);
+    set_pppm(pppm_t, XML->sys.mc.number_mcs * mc->mcp.clockRate,
+             XML->sys.mc.number_mcs, XML->sys.mc.number_mcs,
+             XML->sys.mc.number_mcs);
+    mcs.power = mc->power * pppm_t;
+    power = power + mcs.power;
+    set_pppm(pppm_t, 1 / mc->mcp.executionTime, XML->sys.mc.number_mcs,
+             XML->sys.mc.number_mcs, XML->sys.mc.number_mcs);
+    mcs.rt_power = mc->rt_power * pppm_t;
+    rt_power = rt_power + mcs.rt_power;
+  }
+
+  if (XML->sys.flashc.number_mcs > 0) // flash controller
+  {
+    flashcontroller = new FlashController(XML, &interface_ip);
+    flashcontroller->computeEnergy();
+    flashcontroller->computeEnergy(false);
+    double number_fcs = flashcontroller->fcp.num_mcs;
+    flashcontrollers.area.set_area(flashcontrollers.area.get_area() +
+                                   flashcontroller->area.get_area() *
+                                       number_fcs);
+    area.set_area(area.get_area() + flashcontrollers.area.get_area());
+    set_pppm(pppm_t, number_fcs, number_fcs, number_fcs, number_fcs);
+    flashcontrollers.power = flashcontroller->power * pppm_t;
+    power = power + flashcontrollers.power;
+    set_pppm(pppm_t, number_fcs, number_fcs, number_fcs, number_fcs);
+    flashcontrollers.rt_power = flashcontroller->rt_power * pppm_t;
+    rt_power = rt_power + flashcontrollers.rt_power;
+  }
+
+  if (XML->sys.niu.number_units > 0) {
+    niu = new NIUController(XML, &interface_ip);
+    niu->computeEnergy();
+    niu->computeEnergy(false);
+    nius.area.set_area(nius.area.get_area() +
+                       niu->area.get_area() * XML->sys.niu.number_units);
+    area.set_area(area.get_area() +
+                  niu->area.get_area() * XML->sys.niu.number_units);
+    set_pppm(pppm_t, XML->sys.niu.number_units * niu->niup.clockRate,
+             XML->sys.niu.number_units, XML->sys.niu.number_units,
+             XML->sys.niu.number_units);
+    nius.power = niu->power * pppm_t;
+    power = power + nius.power;
+    set_pppm(pppm_t, XML->sys.niu.number_units * niu->niup.clockRate,
+             XML->sys.niu.number_units, XML->sys.niu.number_units,
+             XML->sys.niu.number_units);
+    nius.rt_power = niu->rt_power * pppm_t;
+    rt_power = rt_power + nius.rt_power;
+  }
+
+  if (XML->sys.pcie.number_units > 0 && XML->sys.pcie.num_channels > 0) {
+    pcie = new PCIeController(XML, &interface_ip);
+    pcie->computeEnergy();
+    pcie->computeEnergy(false);
+    pcies.area.set_area(pcies.area.get_area() +
+                        pcie->area.get_area() * XML->sys.pcie.number_units);
+    area.set_area(area.get_area() +
+                  pcie->area.get_area() * XML->sys.pcie.number_units);
+    set_pppm(pppm_t, XML->sys.pcie.number_units * pcie->pciep.clockRate,
+             XML->sys.pcie.number_units, XML->sys.pcie.number_units,
+             XML->sys.pcie.number_units);
+    pcies.power = pcie->power * pppm_t;
+    power = power + pcies.power;
+    set_pppm(pppm_t, XML->sys.pcie.number_units * pcie->pciep.clockRate,
+             XML->sys.pcie.number_units, XML->sys.pcie.number_units,
+             XML->sys.pcie.number_units);
+    pcies.rt_power = pcie->rt_power * pppm_t;
+    rt_power = rt_power + pcies.rt_power;
+  }
+
+  if (numNOC > 0) {
+    for (i = 0; i < numNOC; i++) {
+      if (XML->sys.NoC[i].type) { // First add up area of routers if NoC is used
+        nocs.push_back(new NoC(XML, i, &interface_ip, 1));
+
+        if (procdynp.homoNOC) {
+          noc.area.set_area(noc.area.get_area() +
+                            nocs[i]->area.get_area() * procdynp.numNOC);
+          area.set_area(area.get_area() + noc.area.get_area());
+        } else {
+          noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
+          area.set_area(area.get_area() + nocs[i]->area.get_area());
+        }
+      } else { // Bus based interconnect
+        nocs.push_back(
+            new NoC(XML, i, &interface_ip, 1,
+                    sqrt(area.get_area() * XML->sys.NoC[i].chip_coverage)));
+        if (procdynp.homoNOC) {
+          noc.area.set_area(noc.area.get_area() +
+                            nocs[i]->area.get_area() * procdynp.numNOC);
+          area.set_area(area.get_area() + noc.area.get_area());
+        } else {
+          noc.area.set_area(noc.area.get_area() + nocs[i]->area.get_area());
+          area.set_area(area.get_area() + nocs[i]->area.get_area());
+        }
+      }
+    }
+
+    /*
+     * Compute global links associated with each NOC, if any. This must be done
+     * at the end (even after the NOC router part) since the total chip area
+     * must be obtain to decide the link routing
+     */
+    for (i = 0; i < numNOC; i++) {
+      if (nocs[i]->nocdynp.has_global_link && XML->sys.NoC[i].type) {
+        nocs[i]->init_link_bus(
+            sqrt(area.get_area() *
+                 XML->sys.NoC[i].chip_coverage)); // compute global links
+        if (procdynp.homoNOC) {
+          noc.area.set_area(noc.area.get_area() +
+                            nocs[i]->link_bus_tot_per_Router.area.get_area() *
+                                nocs[i]->nocdynp.total_nodes * procdynp.numNOC);
+          area.set_area(area.get_area() +
+                        nocs[i]->link_bus_tot_per_Router.area.get_area() *
+                            nocs[i]->nocdynp.total_nodes * procdynp.numNOC);
+        } else {
+          noc.area.set_area(noc.area.get_area() +
+                            nocs[i]->link_bus_tot_per_Router.area.get_area() *
+                                nocs[i]->nocdynp.total_nodes);
+          area.set_area(area.get_area() +
+                        nocs[i]->link_bus_tot_per_Router.area.get_area() *
+                            nocs[i]->nocdynp.total_nodes);
+        }
+      }
+    }
+    // Compute energy of NoC (w or w/o links) or buses
+    for (i = 0; i < numNOC; i++) {
+      // cout<<"******************COMPUTE NOC ENERGY********************"<<endl;
+      nocs[i]->computeEnergy();
+      nocs[i]->computeEnergy(false);
+      if (procdynp.homoNOC) {
+
+        set_pppm(pppm_t, procdynp.numNOC * nocs[i]->nocdynp.clockRate,
+                 procdynp.numNOC, procdynp.numNOC, procdynp.numNOC);
+        noc.power = noc.power + nocs[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / nocs[i]->nocdynp.executionTime, procdynp.numNOC,
+                 procdynp.numNOC, procdynp.numNOC);
+        noc.rt_power = noc.rt_power + nocs[i]->rt_power * pppm_t;
+        power = power + noc.power;
+        rt_power = rt_power + noc.rt_power;
+      } else {
+        set_pppm(pppm_t, nocs[i]->nocdynp.clockRate, 1, 1, 1);
+        noc.power = noc.power + nocs[i]->power * pppm_t;
+        power = power + nocs[i]->power * pppm_t;
+        set_pppm(pppm_t, 1 / nocs[i]->nocdynp.executionTime, 1, 1, 1);
+        noc.rt_power = noc.rt_power + nocs[i]->rt_power * pppm_t;
+        rt_power = rt_power + nocs[i]->rt_power * pppm_t;
+      }
+    }
+  }
+
+  //  //clock power
+  //  globalClock.init_wire_external(is_default, &interface_ip);
+  //  globalClock.clk_area           =area*1e6; //change it from mm^2 to um^2
+  //  globalClock.end_wiring_level   =5;//toplevel metal
+  //  globalClock.start_wiring_level =5;//toplevel metal
+  //  globalClock.l_ip.with_clock_grid=false;//global clock does not drive local
+  //  final nodes globalClock.optimize_wire();
+}
+
+void Processor::compute() {
+  int i;
+  double pppm_t[4] = {1, 1, 1, 1};
+  // Recomputes only rt_power (chip and per-class); `power` and area are kept.
+  rt_power.reset();
+  // power.reset();
+  // core.power.reset();
+
+  core.rt_power.reset();
+  for (i = 0; i < numCore; i++) {
+    cores[i]->executionTime =
+        XML->sys.total_cycles / (XML->sys.core[i].clock_rate * 1e6);
+    cores[i]->rt_power.reset();
+    cores[i]->compute();
+    // cores[i]->computeEnergy(false);
+    if (procdynp.homoCore) {
+      set_pppm(pppm_t, 1 / cores[i]->executionTime, procdynp.numCore,
+               procdynp.numCore, procdynp.numCore);
+      core.rt_power = core.rt_power + cores[i]->rt_power * pppm_t;
+      rt_power = rt_power + core.rt_power;
+    } else {
+      set_pppm(pppm_t, 1 / cores[i]->executionTime, 1, 1, 1);
+      core.rt_power = core.rt_power + cores[i]->rt_power * pppm_t;
+      rt_power = rt_power + cores[i]->rt_power * pppm_t;
+    }
+  }
+
+  if (!XML->sys.Private_L2) {
+    if (numL2 > 0)
+      l2.rt_power.reset();
+    for (i = 0; i < numL2; i++) {
+      l2array[i]->rt_power.reset();
+      l2array[i]->cachep.executionTime =
+          XML->sys.total_cycles / (XML->sys.core[0].clock_rate * 1e6);
+      l2array[i]->computeEnergy(false);
+      if (procdynp.homoL2) {
+        set_pppm(pppm_t, 1 / l2array[i]->cachep.executionTime, procdynp.numL2,
+                 procdynp.numL2, procdynp.numL2);
+        l2.rt_power = l2.rt_power + l2array[i]->rt_power * pppm_t;
+        rt_power = rt_power + l2.rt_power;
+      } else {
+        set_pppm(pppm_t, 1 / l2array[i]->cachep.executionTime, 1, 1, 1);
+        l2.rt_power = l2.rt_power + l2array[i]->rt_power * pppm_t;
+        rt_power = rt_power + l2array[i]->rt_power * pppm_t;
+      }
+    }
+  }
+
+  l3.rt_power.reset();
+  if (numL3 > 0)
+    for (i = 0; i < numL3; i++) {
+      l3array[i]->rt_power.reset();
+      l3array[i]->computeEnergy(false);
+      if (procdynp.homoL3) {
+        set_pppm(pppm_t, 1 / l3array[i]->cachep.executionTime, procdynp.numL3,
+                 procdynp.numL3, procdynp.numL3);
+        l3.rt_power = l3.rt_power + l3array[i]->rt_power * pppm_t;
+        rt_power = rt_power + l3.rt_power;
+
+      } else {
+        set_pppm(pppm_t, 1 / l3array[i]->cachep.executionTime, 1, 1, 1);
+        l3.rt_power = l3.rt_power + l3array[i]->rt_power * pppm_t;
+        rt_power = rt_power + l3array[i]->rt_power * pppm_t;
+      }
+    }
+
+  l1dir.rt_power.reset();
+  if (numL1Dir > 0)
+    for (i = 0; i < numL1Dir; i++) {
+      l1dirarray[i]->rt_power.reset();
+      l1dirarray[i]->computeEnergy(false);
+      if (procdynp.homoL1Dir) {
+        set_pppm(pppm_t, 1 / l1dirarray[i]->cachep.executionTime,
+                 procdynp.numL1Dir, procdynp.numL1Dir, procdynp.numL1Dir);
+        l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power * pppm_t;
+        rt_power = rt_power + l1dir.rt_power;
+
+      } else {
+        set_pppm(pppm_t, 1 / l1dirarray[i]->cachep.executionTime, 1, 1, 1);
+        l1dir.rt_power = l1dir.rt_power + l1dirarray[i]->rt_power * pppm_t;
+        rt_power = rt_power + l1dirarray[i]->rt_power * pppm_t; // as l1dir.rt_power
+      }
+    }
+
+  l2dir.rt_power.reset();
+  if (numL2Dir > 0)
+    for (i = 0; i < numL2Dir; i++) {
+      l2dirarray[i]->rt_power.reset();
+      l2dirarray[i]->computeEnergy(false);
+      if (procdynp.homoL2Dir) {
+        set_pppm(pppm_t, 1 / l2dirarray[i]->cachep.executionTime,
+                 procdynp.numL2Dir, procdynp.numL2Dir, procdynp.numL2Dir);
+        l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power * pppm_t;
+        rt_power = rt_power + l2dir.rt_power;
+
+      } else {
+        set_pppm(pppm_t, 1 / l2dirarray[i]->cachep.executionTime, 1, 1, 1);
+        l2dir.rt_power = l2dir.rt_power + l2dirarray[i]->rt_power * pppm_t;
+        rt_power = rt_power + l2dirarray[i]->rt_power * pppm_t;
+      }
+    }
+
+  mcs.rt_power.reset();
+  if (XML->sys.mc.number_mcs > 0 && XML->sys.mc.memory_channels_per_mc > 0) {
+    mc->rt_power.reset();
+    mc->mcp.executionTime =
+        XML->sys.total_cycles / (XML->sys.core[0].clock_rate * 1e6); // Jingwen
+    mc->computeEnergy(false);
+    set_pppm(pppm_t, 1 / mc->mcp.executionTime, XML->sys.mc.number_mcs,
+             XML->sys.mc.number_mcs, XML->sys.mc.number_mcs);
+    mcs.rt_power = mc->rt_power * pppm_t;
+    rt_power = rt_power + mcs.rt_power;
+  }
+
+  /*
+    if (XML->sys.flashc.number_mcs >0 )//flash controller
+    {
+            flashcontrollers.rt_power.reset();
+            flashcontroller->computeEnergy(false);
+            double number_fcs = flashcontroller->fcp.num_mcs;
+            set_pppm(pppm_t,number_fcs , number_fcs ,number_fcs ,number_fcs );
+            flashcontrollers.rt_power = flashcontroller->rt_power*pppm_t;
+            rt_power = rt_power  + flashcontrollers.rt_power;
+
+    }
+
+    if (XML->sys.niu.number_units >0)
+    {
+            niu->computeEnergy(false);
+            nius.rt_power.reset();
+            set_pppm(pppm_t,XML->sys.niu.number_units*niu->niup.clockRate,
+    XML->sys.niu.number_units,XML->sys.niu.number_units,XML->sys.niu.number_units);
+            nius.rt_power = niu->rt_power*pppm_t;
+            rt_power = rt_power  + nius.rt_power;
+
+    }
+
+    if (XML->sys.pcie.number_units >0 && XML->sys.pcie.num_channels >0)
+    {
+            pcie->computeEnergy(false);
+            pcies.rt_power.reset();
+            set_pppm(pppm_t,XML->sys.pcie.number_units*pcie->pciep.clockRate,
+    XML->sys.pcie.number_units,XML->sys.pcie.number_units,XML->sys.pcie.number_units);
+            pcies.rt_power = pcie->rt_power*pppm_t;
+            rt_power = rt_power  + pcies.rt_power;
+
+    }
+
+
+             // * Compute global links associated with each NOC, if any. This
+    must be done at the end (even after the NOC router part) since the total
+    chip
+             // * area must be obtain to decide the link routing
+            */
+  // Compute energy of NoC (w or w/o links) or buses
+  noc.rt_power.reset();
+  for (i = 0; i < numNOC; i++) {
+    nocs[i]->nocdynp.executionTime =
+        XML->sys.total_cycles / (XML->sys.core[0].clock_rate * 1e6);
+    nocs[i]->computeEnergy(false);
+    if (procdynp.homoNOC) {
+      set_pppm(pppm_t, 1 / nocs[i]->nocdynp.executionTime, procdynp.numNOC,
+               procdynp.numNOC, procdynp.numNOC);
+      noc.rt_power = noc.rt_power + nocs[i]->rt_power * pppm_t;
+      rt_power = rt_power + noc.rt_power;
+    } else {
+      set_pppm(pppm_t, 1 / nocs[i]->nocdynp.executionTime, 1, 1, 1);
+      noc.rt_power = noc.rt_power + nocs[i]->rt_power * pppm_t;
+      rt_power = rt_power + nocs[i]->rt_power * pppm_t;
+    }
+  }
+
+  //  //clock power
+  //  globalClock.init_wire_external(is_default, &interface_ip);
+  //  globalClock.clk_area           =area*1e6; //change it from mm^2 to um^2
+  //  globalClock.end_wiring_level   =5;//toplevel metal
+  //  globalClock.start_wiring_level =5;//toplevel metal
+  //  globalClock.l_ip.with_clock_grid=false;//global clock does not drive local
+  //  final nodes globalClock.optimize_wire();
+}
+
+// Prints the "Device Type= ..." line that names a numeric device type code.
+//
+//   device_type_  code 0-4; each code selects one fixed label from the table
+//                 below.
+//   indent        number of spaces written in front of the line.
+//
+// Any other code prints "Unknown Device Type" and terminates the process
+// through exit(0).
+void Processor::displayDeviceType(int device_type_, uint32_t indent) {
+  // Labels indexed by device type code.
+  static const char *const kLabels[] = {
+      "ITRS high performance device type",
+      "ITRS low standby power device type",
+      "ITRS low operating power device type",
+      "LP-DRAM device type",
+      "COMM-DRAM device type",
+  };
+  const int label_count =
+      static_cast<int>(sizeof(kLabels) / sizeof(kLabels[0]));
+  const string pad(indent, ' ');
+
+  // A code without a label ends the run.
+  if (device_type_ < 0 || device_type_ >= label_count) {
+    cout << pad << "Unknown Device Type" << endl;
+    exit(0);
+  }
+
+  // Known code: emit its label on one line.
+  cout << pad << "Device Type= "
+       << kLabels[device_type_] << endl;
+}
+
+// Prints which interconnect technology projection is selected
+// (0 = aggressive, 1 = conservative). Any other value prints an error
+// line and terminates the process through exit(0).
+void Processor::displayInterconnectType(int interconnect_type_,
+                                        uint32_t indent) {
+  const string pad(indent, ' ');
+  const char *projection = 0;
+
+  if (interconnect_type_ == 0) {
+    projection = "aggressive interconnect technology projection";
+  } else if (interconnect_type_ == 1) {
+    projection = "conservative interconnect technology projection";
+  }
+
+  if (projection == 0) {
+    cout << pad << "Unknown Interconnect Projection Type" << endl;
+    exit(0);
+  }
+  cout << pad << "Interconnect metal projection= " << projection << endl;
+}
+
+// Prints the processor power/area report to stdout. Output is produced only
+// when is_tdp_parm is true; otherwise the call prints nothing.
+//   indent       leading spaces for the summary lines (sub-lines use +2).
+//   plevel       print level; per-instance component reports are added when
+//                it is greater than 1.
+//   is_tdp_parm  also forwarded to each component's displayEnergy().
+void Processor::displayEnergy(uint32_t indent, int plevel, bool is_tdp_parm) {
+  int i;
+  bool long_channel = XML->sys.longer_channel_device;
+  string indent_str(indent, ' ');
+  string indent_str_next(indent + 2, ' ');
+  bool is_tdp = is_tdp_parm;
+  if (is_tdp_parm) {
+    if (plevel < 5) {
+      cout
+          << "\nMcPAT (version " << VER_MAJOR << "." << VER_MINOR << " of "
+          << VER_UPDATE << ") results (current print level is " << plevel
+          << ", please increase print level to see the details in components): "
+          << endl;
+    } else {
+      cout << "\nMcPAT (version " << VER_MAJOR << "." << VER_MINOR << " of "
+           << VER_UPDATE << ") results  (current print level is 5)" << endl;
+    }
+    cout << "******************************************************************"
+            "***********************"
+         << endl;
+    cout << indent_str << "Technology " << XML->sys.core_tech_node << " nm"
+         << endl;
+    if (long_channel)
+      cout << indent_str << "Using Long Channel Devices When Appropriate"
+           << endl;
+    displayInterconnectType(XML->sys.interconnect_projection_type, indent);
+    cout << indent_str << "Core clock Rate(MHz) " << XML->sys.core[0].clock_rate
+         << endl;
+    cout << endl;
+    cout << "******************************************************************"
+            "***********************"
+         << endl;
+    cout << "Processor: " << endl;
+    cout << indent_str << "Area = " << area.get_area() * 1e-6 << " mm^2"
+         << endl;
+    cout << indent_str << "Peak Power = "
+         << power.readOp.dynamic +
+                (long_channel ? power.readOp.longer_channel_leakage
+                              : power.readOp.leakage) +
+                power.readOp.gate_leakage
+         << " W" << endl;
+    cout << indent_str << "Total Leakage = "
+         << (long_channel ? power.readOp.longer_channel_leakage
+                          : power.readOp.leakage) +
+                power.readOp.gate_leakage
+         << " W" << endl;
+    cout << indent_str << "Peak Dynamic = " << power.readOp.dynamic << " W"
+         << endl;
+    cout << indent_str << "Subthreshold Leakage = "
+         << (long_channel ? power.readOp.longer_channel_leakage
+                          : power.readOp.leakage)
+         << " W" << endl;
+    // cout << indent_str << "Subthreshold Leakage = " <<
+    // power.readOp.longer_channel_leakage <<" W" << endl;
+    cout << indent_str << "Gate Leakage = " << power.readOp.gate_leakage << " W"
+         << endl;
+    cout << indent_str << "Runtime Dynamic = " << rt_power.readOp.dynamic
+         << " W" << endl;
+    cout << endl;
+    if (numCore > 0) {
+      cout << indent_str << "Total Cores: " << XML->sys.number_of_cores
+           << " cores " << endl;
+      displayDeviceType(XML->sys.device_type, indent);
+      cout << indent_str_next << "Area = " << core.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << core.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? core.power.readOp.longer_channel_leakage
+                            : core.power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // core.power.readOp.longer_channel_leakage <<" W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << core.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << core.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (!XML->sys.Private_L2) {
+      if (numL2 > 0) {
+        cout << indent_str << "Total L2s: " << endl;
+        displayDeviceType(XML->sys.L2[0].device_type, indent);
+        cout << indent_str_next << "Area = " << l2.area.get_area() * 1e-6
+             << " mm^2" << endl;
+        cout << indent_str_next << "Peak Dynamic = " << l2.power.readOp.dynamic
+             << " W" << endl;
+        cout << indent_str_next << "Subthreshold Leakage = "
+             << (long_channel ? l2.power.readOp.longer_channel_leakage
+                              : l2.power.readOp.leakage)
+             << " W" << endl;
+        // cout << indent_str_next << "Subthreshold Leakage = " <<
+        // l2.power.readOp.longer_channel_leakage <<" W" << endl;
+        cout << indent_str_next
+             << "Gate Leakage = " << l2.power.readOp.gate_leakage << " W"
+             << endl;
+        cout << indent_str_next
+             << "Runtime Dynamic = " << l2.rt_power.readOp.dynamic << " W"
+             << endl;
+        cout << endl;
+      }
+    }
+    if (numL3 > 0) {
+      cout << indent_str << "Total L3s: " << endl;
+      displayDeviceType(XML->sys.L3[0].device_type, indent);
+      cout << indent_str_next << "Area = " << l3.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << l3.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? l3.power.readOp.longer_channel_leakage
+                            : l3.power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // l3.power.readOp.longer_channel_leakage <<" W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << l3.power.readOp.gate_leakage << " W" << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << l3.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (numL1Dir > 0) {
+      cout << indent_str << "Total First Level Directory: " << endl;
+      displayDeviceType(XML->sys.L1Directory[0].device_type, indent);
+      cout << indent_str_next << "Area = " << l1dir.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << l1dir.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? l1dir.power.readOp.longer_channel_leakage
+                            : l1dir.power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // l1dir.power.readOp.longer_channel_leakage <<" W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << l1dir.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << l1dir.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (numL2Dir > 0) {
+      cout << indent_str << "Total Second Level Directory: " << endl;
+      displayDeviceType(XML->sys.L2Directory[0].device_type, indent);
+      cout << indent_str_next << "Area = " << l2dir.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << l2dir.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? l2dir.power.readOp.longer_channel_leakage
+                            : l2dir.power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // l2dir.power.readOp.longer_channel_leakage <<" W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << l2dir.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << l2dir.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (numNOC > 0) {
+      cout << indent_str << "Total NoCs (Network/Bus): " << endl;
+      displayDeviceType(XML->sys.device_type, indent);
+      cout << indent_str_next << "Area = " << noc.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << noc.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? noc.power.readOp.longer_channel_leakage
+                            : noc.power.readOp.leakage)
+           << " W" << endl;
+      // cout << indent_str_next << "Subthreshold Leakage = " <<
+      // noc.power.readOp.longer_channel_leakage  <<" W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << noc.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << noc.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (XML->sys.mc.number_mcs > 0 && XML->sys.mc.memory_channels_per_mc > 0) {
+      cout << indent_str << "Total MCs: " << XML->sys.mc.number_mcs
+           << " Memory Controllers " << endl;
+      displayDeviceType(XML->sys.device_type, indent);
+      cout << indent_str_next << "Area = " << mcs.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << mcs.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? mcs.power.readOp.longer_channel_leakage
+                            : mcs.power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << mcs.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << mcs.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (XML->sys.flashc.number_mcs > 0) {
+      cout << indent_str
+           << "Total Flash/SSD Controllers: " << flashcontroller->fcp.num_mcs
+           << " Flash/SSD Controllers " << endl;
+      displayDeviceType(XML->sys.device_type, indent);
+      cout << indent_str_next
+           << "Area = " << flashcontrollers.area.get_area() * 1e-6 << " mm^2"
+           << endl;
+      cout << indent_str_next
+           << "Peak Dynamic = " << flashcontrollers.power.readOp.dynamic << " W"
+           << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel
+                   ? flashcontrollers.power.readOp.longer_channel_leakage
+                   : flashcontrollers.power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << flashcontrollers.power.readOp.gate_leakage
+           << " W" << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << flashcontrollers.rt_power.readOp.dynamic
+           << " W" << endl;
+      cout << endl;
+    }
+    if (XML->sys.niu.number_units > 0) {
+      cout << indent_str << "Total NIUs: " << niu->niup.num_units
+           << " Network Interface Units " << endl;
+      displayDeviceType(XML->sys.device_type, indent);
+      cout << indent_str_next << "Area = " << nius.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << nius.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? nius.power.readOp.longer_channel_leakage
+                            : nius.power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << nius.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << nius.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    if (XML->sys.pcie.number_units > 0 && XML->sys.pcie.num_channels > 0) {
+      cout << indent_str << "Total PCIes: " << pcie->pciep.num_units
+           << " PCIe Controllers " << endl;
+      displayDeviceType(XML->sys.device_type, indent);
+      cout << indent_str_next << "Area = " << pcies.area.get_area() * 1e-6
+           << " mm^2" << endl;
+      cout << indent_str_next << "Peak Dynamic = " << pcies.power.readOp.dynamic
+           << " W" << endl;
+      cout << indent_str_next << "Subthreshold Leakage = "
+           << (long_channel ? pcies.power.readOp.longer_channel_leakage
+                            : pcies.power.readOp.leakage)
+           << " W" << endl;
+      cout << indent_str_next
+           << "Gate Leakage = " << pcies.power.readOp.gate_leakage << " W"
+           << endl;
+      cout << indent_str_next
+           << "Runtime Dynamic = " << pcies.rt_power.readOp.dynamic << " W"
+           << endl;
+      cout << endl;
+    }
+    cout << "******************************************************************"
+            "***********************"
+         << endl;
+    if (plevel > 1) {
+      for (i = 0; i < numCore; i++) {
+        cores[i]->displayEnergy(indent + 4, plevel, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      if (!XML->sys.Private_L2) {
+        for (i = 0; i < numL2; i++) {
+          l2array[i]->displayEnergy(indent + 4, is_tdp);
+          cout << "************************************************************"
+                  "*****************************"
+               << endl;
+        }
+      }
+      for (i = 0; i < numL3; i++) {
+        l3array[i]->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      for (i = 0; i < numL1Dir; i++) {
+        l1dirarray[i]->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      for (i = 0; i < numL2Dir; i++) {
+        l2dirarray[i]->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      if (XML->sys.mc.number_mcs > 0 &&
+          XML->sys.mc.memory_channels_per_mc > 0) {
+        mc->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      if (XML->sys.flashc.number_mcs > 0 &&
+          XML->sys.flashc.memory_channels_per_mc > 0) {
+        flashcontroller->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      if (XML->sys.niu.number_units > 0) {
+        niu->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      if (XML->sys.pcie.number_units > 0 && XML->sys.pcie.num_channels > 0) {
+        pcie->displayEnergy(indent + 4, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+      for (i = 0; i < numNOC; i++) {
+        nocs[i]->displayEnergy(indent + 4, plevel, is_tdp);
+        cout << "**************************************************************"
+                "***************************"
+             << endl;
+      }
+    }
+  }
+}
+
+// Fills procdynp from the parsed XML description and seeds interface_ip with
+// baseline values. No argument and no return value: the method only reads
+// XML->sys and writes the two member structures. The placeholder group
+// further down exists so that every field holds an initial value; those
+// fields are overridden during each component's own initialization.
+void Processor::set_proc_param() {
+  // Hard-wired off. When true, the fixed fallback values below replace the
+  // XML inputs.
+  const bool debug = false;
+
+  // Homogeneity flags, one per component class.
+  procdynp.homoCore = debug || XML->sys.homogeneous_cores;
+  procdynp.homoL2 = debug || XML->sys.homogeneous_L2s;
+  procdynp.homoL3 = debug || XML->sys.homogeneous_L3s;
+  procdynp.homoNOC = debug || XML->sys.homogeneous_NoCs;
+  procdynp.homoL1Dir = debug || XML->sys.homogeneous_L1Directories;
+  procdynp.homoL2Dir = debug || XML->sys.homogeneous_L2Directories;
+
+  // Component counts.
+  procdynp.numCore = XML->sys.number_of_cores;
+  procdynp.numL2 = XML->sys.number_of_L2s;
+  procdynp.numL3 = XML->sys.number_of_L3s;
+  procdynp.numNOC = XML->sys.number_of_NoCs;
+  procdynp.numL1Dir = XML->sys.number_of_L1Directories;
+  procdynp.numL2Dir = XML->sys.number_of_L2Directories;
+  procdynp.numMC = XML->sys.mc.number_mcs;
+  procdynp.numMCChannel = XML->sys.mc.memory_channels_per_mc;
+
+  // The sanity checks that once followed here (at least one core on chip;
+  // one or two NoCs) are disabled, so the counts above are accepted as
+  // given.
+
+  // Device type for the data and tag arrays, and the wire projection type.
+  interface_ip.data_arr_ram_cell_tech_type = debug ? 0 : XML->sys.device_type;
+  interface_ip.data_arr_peri_global_tech_type =
+      debug ? 0 : XML->sys.device_type;
+  interface_ip.tag_arr_ram_cell_tech_type = debug ? 0 : XML->sys.device_type;
+  interface_ip.tag_arr_peri_global_tech_type = debug ? 0 : XML->sys.device_type;
+  interface_ip.ic_proj_type = debug ? 0 : XML->sys.interconnect_projection_type;
+
+  // Fixed optimization weights. The delay weight makes sure timing can be
+  // satisfied; the area and dynamic-power weights are used for the
+  // exhaustive search over individual components. Leakage and cycle time
+  // carry no weight.
+  interface_ip.delay_wt = 100;
+  interface_ip.area_wt = 0;
+  interface_ip.dynamic_power_wt = 100;
+  interface_ip.leakage_power_wt = 0;
+  interface_ip.cycle_time_wt = 0;
+
+  // Fixed *_dev values; the same remarks apply as for the weights.
+  interface_ip.delay_dev = 10000;
+  interface_ip.area_dev = 10000;
+  interface_ip.dynamic_power_dev = 10000;
+  interface_ip.leakage_power_dev = 10000;
+  interface_ip.cycle_time_dev = 10000;
+
+  interface_ip.ed = 2;
+  // Fixed for the processor section, since memory is processed separately.
+  interface_ip.burst_len = 1;
+  interface_ip.int_prefetch_w = 1;
+  interface_ip.page_sz_bits = 0;
+
+  // Temperature and feature size; F_sz_um is F_sz_nm divided by 1000.
+  interface_ip.temp = debug ? 360 : XML->sys.temperature;
+  interface_ip.F_sz_nm = debug ? 90 : XML->sys.core_tech_node;
+  interface_ip.F_sz_um = interface_ip.F_sz_nm / 1000;
+
+  // Placeholder values only: they give every field an initial value to
+  // prevent errors and are overridden during each component's
+  // initialization.
+  interface_ip.cache_sz = 64;
+  interface_ip.line_sz = 1;
+  interface_ip.assoc = 1;
+  interface_ip.nbanks = 1;
+  interface_ip.out_w = interface_ip.line_sz * 8;
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = 64;
+  interface_ip.access_mode = 2;
+
+  // Objective-function fields: only obj_func_cycle_t is set to 1.
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+
+  // Main-memory and H-tree settings.
+  interface_ip.is_main_mem = false;
+  interface_ip.rpters_in_htree = true;
+  interface_ip.ver_htree_wires_over_array = 0;
+  interface_ip.broadcast_addr_din_over_ver_htrees = 0;
+
+  // Port counts, NUCA fields and array-kind flags.
+  interface_ip.num_rw_ports = 1;
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  interface_ip.num_search_ports = 1;
+  interface_ip.nuca = 0;
+  interface_ip.nuca_bank_count = 0;
+  interface_ip.is_cache = true;
+  interface_ip.pure_ram = false;
+  interface_ip.pure_cam = false;
+  interface_ip.force_cache_config = false;
+
+  // Wire type and wire material codes depend on the Embedded flag.
+  const bool embedded = XML->sys.Embedded;
+  interface_ip.wt = embedded ? Global_30 : Global;
+  interface_ip.wire_is_mat_type = embedded ? 0 : 2;
+  interface_ip.wire_os_mat_type = embedded ? 0 : 2;
+
+  // Remaining fixed settings.
+  interface_ip.force_wiretype = false;
+  interface_ip.print_detail = 1;
+  interface_ip.add_ecc_b_ = true;
+}
+
+// Frees the component objects held in cores, l2array, l3array and nocs, and
+// the four controller objects (mc, niu, pcie, flashcontroller).
+Processor::~Processor() {
+  while (!cores.empty()) {
+    delete cores.back();
+    cores.pop_back();
+  }
+  while (!l2array.empty()) {
+    delete l2array.back();
+    l2array.pop_back();
+  }
+  while (!l3array.empty()) {
+    delete l3array.back();
+    l3array.pop_back();
+  }
+  while (!nocs.empty()) {
+    delete nocs.back();
+    nocs.pop_back();
+  }
+  // NOTE(review): l1dirarray and l2dirarray are not released here; confirm
+  // whether this object owns their elements before adding loops for them.
+  //
+  // `delete` on a null pointer is a no-op, so no guard is needed. A `!ptr`
+  // guard would run only for null pointers and leak every live controller.
+  // Assumes the constructor nulls any controller it does not create.
+  delete mc;
+  delete niu;
+  delete pcie;
+  delete flashcontroller;
+}
diff --git a/src/gpuwattch/processor.h b/src/gpuwattch/processor.h
new file mode 100644
index 000000000..6acdf6cc8
--- /dev/null
+++ b/src/gpuwattch/processor.h
@@ -0,0 +1,325 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+/********************************************************************
+ *      Modified by:
+ * Jingwen Leng, University of Texas, Austin
+ * Syed Gilani, University of Wisconsin–Madison
+ * Tayler Hetherington, University of British Columbia
+ * Ahmed ElTantawy, University of British Columbia
+ ********************************************************************/
+#ifndef PROCESSOR_H_
+#define PROCESSOR_H_
+
+#include "../gpgpu-sim/visualizer.h"
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "cacti/arbiter.h"
+#include "cacti/area.h"
+#include "cacti/decoder.h"
+#include "cacti/parameter.h"
+#include "cacti/router.h"
+#include "core.h"
+#include "iocontrollers.h"
+#include "memoryctrl.h"
+#include "noc.h"
+#include "sharedcache.h"
+#include <vector>
+
+class Processor : public Component {
+public:
+  ParseXML *XML;
+  vector<Core *> cores;
+  vector<SharedCache *> l2array;
+  vector<SharedCache *> l3array;
+  vector<SharedCache *> l1dirarray;
+  vector<SharedCache *> l2dirarray;
+  vector<NoC *> nocs;
+  MemoryController *mc;
+  NIUController *niu;
+  PCIeController *pcie;
+  FlashController *flashcontroller;
+  InputParameter interface_ip;
+  double exClockRate;
+  ProcParam procdynp;
+  // for debugging nonlinear model
+  double dyn_power_before_scaling;
+
+  // wire	globalInterconnect;
+  // clock_network globalClock;
+  Component core, l2, l3, l1dir, l2dir, noc, mcs, cc, nius, pcies,
+      flashcontrollers;
+  int numCore, numL2, numL3, numNOC, numL1Dir, numL2Dir;
+  Processor(ParseXML *XML_interface);
+  void compute();
+  void set_proc_param();
+  void visualizer_print(gzFile visualizer_file);
+  void displayEnergy(uint32_t indent = 0, int plevel = 100,
+                     bool is_tdp_parm = true);
+  void displayDeviceType(int device_type_, uint32_t indent = 0);
+  void displayInterconnectType(int interconnect_type_, uint32_t indent = 0);
+  double l2_power;
+  double idle_core_power;
+
+  // Adds six fixed terms: MC front end, engine, PHY, and three core 0 units.
+  double get_const_dynamic_power() {
+    const double terms[] = {
+        mc->frontend->power.readOp.dynamic * 0.1 *
+            mc->frontend->mcp.clockRate * mc->frontend->mcp.num_mcs *
+            mc->frontend->mcp.executionTime,
+        mc->transecEngine->power.readOp.dynamic * 0.1 *
+            mc->transecEngine->mcp.clockRate * mc->transecEngine->mcp.num_mcs *
+            mc->transecEngine->mcp.executionTime,
+        mc->PHY->power.readOp.dynamic * 0.1 * mc->PHY->mcp.clockRate *
+            mc->PHY->mcp.num_mcs * mc->PHY->mcp.executionTime,
+        (cores[0]->exu->exeu->base_energy / cores[0]->exu->exeu->clockRate) *
+            (cores[0]->exu->rf_fu_clockRate / cores[0]->exu->clockRate),
+        cores[0]->exu->mul->base_energy / cores[0]->exu->mul->clockRate,
+        cores[0]->exu->fp_u->base_energy / cores[0]->exu->fp_u->clockRate,
+    };
+    double total = 0;
+    for (unsigned k = 0; k < sizeof(terms) / sizeof(terms[0]); ++k)
+      total += terms[k];
+    return total;
+  }
+#define COALESCE_SCALE 1
+  // Read-coalescing coefficient: readOp.dynamic of the front end's PRT,
+  // threadMasks and PRC members plus one fixed per-access coalescing term.
+  double get_coefficient_readcoalescing() {
+    const double vdd = g_tp.peri_global.Vdd;
+    const double coalescing_energy =
+        COALESCE_SCALE * ((0.443e-3) * (0.5e-9) * vdd * vdd) / (1 * 1);
+    double coef = 0;
+    coef += mc->frontend->PRT->local_result.power.readOp.dynamic;
+    coef += mc->frontend->threadMasks->local_result.power.readOp.dynamic;
+    coef += mc->frontend->PRC->local_result.power.readOp.dynamic;
+    return coef + coalescing_energy;
+  }
+  // Write-coalescing coefficient: writeOp.dynamic of the front end's PRT,
+  // threadMasks and PRC members plus one fixed per-access coalescing term.
+  double get_coefficient_writecoalescing() {
+    const double vdd = g_tp.peri_global.Vdd;
+    const double coalescing_energy =
+        COALESCE_SCALE * ((0.443e-3) * (0.5e-9) * vdd * vdd) / (1 * 1);
+    double coef = 0;
+    coef += mc->frontend->PRT->local_result.power.writeOp.dynamic;
+    coef += mc->frontend->threadMasks->local_result.power.writeOp.dynamic;
+    coef += mc->frontend->PRC->local_result.power.writeOp.dynamic;
+    return coef + coalescing_energy;
+  }
+
+  // Per-access NoC coefficient from nocs[0]'s router. The 32/4 factor is
+  // applied to the NoC access counters (32/4 * L2 cache access), not here.
+  double get_coefficient_noc_accesses() {
+    double coef = 0;
+    coef += nocs[0]->router->buffer.power.readOp.dynamic;
+    coef += nocs[0]->router->buffer.power.writeOp.dynamic;
+    coef += nocs[0]->router->crossbar.power.readOp.dynamic;
+    return coef + nocs[0]->router->arbiter.power.readOp.dynamic;
+  }
+
+  // readOp.dynamic of l2array[0]'s cache array; 0 when no L2 is configured.
+  double get_coefficient_l2_read_hits() {
+    if (!(XML->sys.number_of_L2s > 0)) {
+      return 0;
+    }
+    return l2array[0]->unicache.caches->local_result.power.readOp.dynamic;
+  }
+
+  // readOp.dynamic of l2array[0]'s tag_array2; 0 when no L2 is configured.
+  double get_coefficient_l2_read_misses() {
+    if (!(XML->sys.number_of_L2s > 0)) {
+      return 0;
+    }
+    return l2array[0]
+        ->unicache.caches->local_result.tag_array2->power.readOp.dynamic;
+  }
+
+  // writeOp.dynamic of l2array[0]'s cache array; 0 when no L2 is configured.
+  double get_coefficient_l2_write_hits() {
+    if (!(XML->sys.number_of_L2s > 0)) {
+      return 0;
+    }
+    return l2array[0]->unicache.caches->local_result.power.writeOp.dynamic;
+  }
+  // Write-miss coefficient for l2array[0] (0 when no L2 is configured): tag
+  // array write, cache array write, and a search plus a write in each of the
+  // missb, ifb, prefetchb and wbb buffers.
+  double get_coefficient_l2_write_misses() {
+    if (!(XML->sys.number_of_L2s > 0)) {
+      return 0;
+    }
+    const double tag_write =
+        l2array[0]
+            ->unicache.caches->local_result.tag_array2->power.writeOp.dynamic;
+    // Jingwen removed the former *(32/4) scaling of the tag term: it is not
+    // used in the mcpat.
+    double coef = tag_write;
+    coef += l2array[0]->unicache.caches->local_result.power.writeOp.dynamic;
+    coef += l2array[0]->unicache.missb->local_result.power.searchOp.dynamic;
+    coef += l2array[0]->unicache.missb->local_result.power.writeOp.dynamic;
+    coef += l2array[0]->unicache.ifb->local_result.power.searchOp.dynamic;
+    coef += l2array[0]->unicache.ifb->local_result.power.writeOp.dynamic;
+    coef +=
+        l2array[0]->unicache.prefetchb->local_result.power.searchOp.dynamic;
+    coef +=
+        l2array[0]->unicache.prefetchb->local_result.power.writeOp.dynamic;
+    coef += l2array[0]->unicache.wbb->local_result.power.searchOp.dynamic;
+    coef += l2array[0]->unicache.wbb->local_result.power.writeOp.dynamic;
+
+    return coef;
+  }
+
+  // Per-read memory coefficient. Accumulates, in this order: frontend-buffer
+  // search and read terms, read-buffer read and write terms, the DRAM
+  // rd_coeff, the transaction-engine term and the PHY term.
+  // NOTE(review): no null checks are made on mc or its sub-components; units
+  // follow McPAT's "dynamic" fields and rd_coeff -- TODO confirm.
+  double get_coefficient_mem_reads() {
+    // Weight of the frontend-buffer terms. The bus width is divided out and
+    // multiplied back in before the division by 72; the expression is kept
+    // verbatim so the floating-point result does not change.
+    const double frontend_weight =
+        (mc->frontend->mcp.llcBlockSize * 8.0 / mc->frontend->mcp.dataBusWidth *
+         mc->frontend->mcp.dataBusWidth / 72);
+    // Weight of the read-buffer terms; note that it divides by
+    // mc->mcp.dataBusWidth rather than the frontend's mcp.dataBusWidth.
+    const double read_buffer_weight =
+        (mc->frontend->mcp.llcBlockSize * 8.0 / mc->mcp.dataBusWidth);
+
+    double coef = 0;
+    coef += frontend_weight *
+            (mc->frontend->frontendBuffer->local_result.power.searchOp.dynamic);
+    coef += frontend_weight *
+            (mc->frontend->frontendBuffer->local_result.power.readOp.dynamic);
+    // TODO (Jingwen): should this be computed only one time? A repeat of the
+    // frontend-buffer readOp term above is deliberately not included.
+
+    coef += read_buffer_weight *
+            (mc->frontend->readBuffer->local_result.power.readOp.dynamic);
+    coef += read_buffer_weight *
+            (mc->frontend->readBuffer->local_result.power.writeOp.dynamic);
+
+    coef += mc->dram->dramp.rd_coeff;
+    // Not included: readOp dynamic power of mc->frontend->PRT, threadMasks
+    // and PRC, plus perAccessCoalescingEnergy.
+
+    coef += (mc->transecEngine->mcp.llcBlockSize * 8.0 /
+             mc->transecEngine->mcp.dataBusWidth *
+             mc->transecEngine->power_t.readOp.dynamic);
+
+    // TODO: add an "if mcp.type == 1" check here.
+    // executionTime is divided out and multiplied back in; operand order is
+    // preserved so the result is unchanged.
+    coef += (mc->PHY->power_t.readOp.dynamic) * (mc->PHY->mcp.llcBlockSize) *
+            8 / 1e9 / mc->PHY->mcp.executionTime *
+            (mc->PHY->mcp.executionTime);
+
+    // TODO (Jingwen): NoC stats should not be here. Not included: the nocs[0]
+    // router buffer read/write, crossbar read and arbiter read terms, each
+    // times (32/4), and the alternative "return 0.4*value" scaling.
+    return coef;
+  }
+
+  // Per-write memory coefficient. Accumulates, in this order: frontend-buffer
+  // search and write terms, write-buffer read and write terms, the DRAM
+  // wr_coeff, the transaction-engine term and the PHY term.
+  // Returns the accumulated sum; the function only reads model state.
+  // NOTE(review): the transaction-engine and PHY terms read
+  // power_t.readOp, exactly as get_coefficient_mem_reads() does -- confirm
+  // that no writeOp counterpart is intended.
+  // NOTE(review): no null checks are made on mc or its sub-components.
+  double get_coefficient_mem_writes() {
+    // Weight of the frontend-buffer terms. The bus width is divided out and
+    // multiplied back in before the division by 72; the expression is kept
+    // verbatim so the floating-point result does not change.
+    const double frontend_weight =
+        (mc->frontend->mcp.llcBlockSize * 8.0 / mc->frontend->mcp.dataBusWidth *
+         mc->frontend->mcp.dataBusWidth / 72);
+    // Weight of the write-buffer terms (uses the frontend's bus width).
+    const double write_buffer_weight =
+        (mc->frontend->mcp.llcBlockSize * 8.0 /
+         mc->frontend->mcp.dataBusWidth);
+
+    double coef = 0;
+
+    coef += frontend_weight *
+            (mc->frontend->frontendBuffer->local_result.power.searchOp.dynamic);
+
+    coef += frontend_weight *
+            (mc->frontend->frontendBuffer->local_result.power.writeOp.dynamic);
+    // Not included: a repeat of the frontend-buffer writeOp term above.
+
+    coef += write_buffer_weight *
+            (mc->frontend->writeBuffer->local_result.power.readOp.dynamic);
+
+    coef += write_buffer_weight *
+            (mc->frontend->writeBuffer->local_result.power.writeOp.dynamic);
+
+    coef += mc->dram->dramp.wr_coeff;
+    // Not included: writeOp dynamic power of mc->frontend->PRT, threadMasks
+    // and PRC, plus perAccessCoalescingEnergy.
+
+    coef += (mc->transecEngine->mcp.llcBlockSize * 8.0 /
+             mc->transecEngine->mcp.dataBusWidth *
+             mc->transecEngine->power_t.readOp.dynamic);
+
+    // TODO: add an "if mcp.type == 1" check here.
+    // executionTime is divided out and multiplied back in; operand order is
+    // preserved so the result is unchanged.
+    coef += (mc->PHY->power_t.readOp.dynamic) * (mc->PHY->mcp.llcBlockSize) *
+            8 / 1e9 / mc->PHY->mcp.executionTime *
+            (mc->PHY->mcp.executionTime);
+
+    // TODO (Jingwen): NoC stats should not be here. Not included: the nocs[0]
+    // router buffer read/write, crossbar read and arbiter read terms, each
+    // times (32/4), and the alternative "return 0.4*value" scaling.
+
+    return coef;
+  }
+
+  // DRAM "pre" coefficient, read straight from mc->dram->dramp.pre_coeff.
+  // The alternative 0.4 scaling ("return 0.4*value") is not applied.
+  double get_coefficient_mem_pre() {
+    double pre_coef = 0;
+    return pre_coef += mc->dram->dramp.pre_coeff;
+  }
+
+  // Scaling/fitting helpers, declared only; see definitions for arguments.
+  void nonlinear_scale(int, double, int);
+  void coefficient_scale();
+  void iterative_lse(double *, double *);
+  // Destructor is declared here without a body.
+  ~Processor();
+};
+
+#endif /* PROCESSOR_H_ */
diff --git a/src/gpuwattch/quadro.xml b/src/gpuwattch/quadro.xml
new file mode 100644
index 000000000..934e027d6
--- /dev/null
+++ b/src/gpuwattch/quadro.xml
@@ -0,0 +1,497 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+	<component id="system" name="system">
+		<!--McPAT will skip the components if number is set to 0 -->
+		<param name="GPU_Architecture" value="0"/><!-- 0-G80; 1-Fermi; others not supported -->
+		<param name="number_of_cores" value="16"/>
+		<param name="architecture" value="2"/> <!-- fermi:1 quadro:2 other: undefined-->
+		<param name="number_of_L1Directories" value="0"/>
+		<param name="number_of_L2Directories" value="0"/>
+		<param name="number_of_L2s" value="0"/> <!-- Number of L2 clusters; within each cluster there can be multiple banks/ports -->
+		<param name="number_of_L3s" value="0"/> <!-- This number means how many L3 clusters -->
+		<param name="number_of_NoCs" value="1"/> 
+		<param name="homogeneous_cores" value="1"/><!--1 means homo -->
+		<param name="homogeneous_L2s" value="1"/>
+		<param name="homogeneous_L1Directorys" value="1"/>
+		<param name="homogeneous_L2Directorys" value="1"/>
+		<param name="homogeneous_L3s" value="1"/>
+		<param name="homogeneous_ccs" value="1"/><!--cache coherence hardware -->
+		<param name="homogeneous_NoCs" value="0"/>
+		<param name="core_tech_node" value="65"/><!-- nm -->
+		<param name="target_core_clockrate" value="650"/><!--MHz -->
+		<param name="temperature" value="400"/> <!-- Kelvin -->
+		<param name="number_cache_levels" value="2"/>
+		<param name="interconnect_projection_type" value="0"/><!--0: aggressive wire technology; 1: conservative wire technology -->
+		<param name="device_type" value="0"/><!--0: HP(High Performance Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power)  -->
+		<param name="longer_channel_device" value="1"/><!-- 0 no use; 1 use when possible -->
+		<param name="machine_bits" value="32"/>
+		<param name="virtual_address_width" value="32"/>
+		<param name="physical_address_width" value="32"/>
+		<param name="virtual_memory_page_size" value="4096"/>
+		<stat name="total_cycles" value="total_cycles_match_mcpat"/>
+		<stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
+		<stat name="busy_cycles"  value="busy_cycles_match_mcpat"/>
+			<!--This page size (B) is completely different from the page size in the main memory section. This page size is the size of 
+			virtual memory from the OS/architecture perspective; the page size in the main memory section is the actual physical line in a DRAM bank  -->
+		<!-- *********************** cores ******************* -->
+		<component id="system.core0" name="core0">
+			<!-- Core property -->
+			<param name="clock_rate" value="650"/>
+			<param name="instruction_length" value="32"/>
+			<param name="opcode_width" value="9"/>
+			<!-- address width determines the tag_width in Cache, LSQ and buffers in cache controller;
+			default value is machine_bits, if not set --> 
+			<param name="machine_type" value="1"/><!-- 1 inorder; 0 OOO-->
+			<!-- inorder/OoO -->
+			<param name="number_hardware_threads" value="32"/>
+			<!-- number_instruction_fetch_ports (icache ports) is always 1 in a single-thread processor;
+			it may only be more than one in SMT processors. The number of BTB ports always equals the number of fetch ports since 
+			branch information for consecutive branch instructions in the same fetch group can be read out from the BTB at once. --> 
+			<param name="fetch_width" value="1"/>
+			<!-- fetch_width determines the size of cachelines of L1 cache block -->
+			<param name="number_instruction_fetch_ports" value="1"/>
+			<param name="decode_width" value="1"/>
+			<!-- decode_width determines the number of ports of the 
+			renaming table (both RAM and CAM) scheme -->
+			<param name="issue_width" value="1"/>
+			<!-- issue_width determines the number of ports of the issue window and other logic, 
+			as in the complexity-effective processors paper; issue_width==dispatch_width -->
+			<param name="commit_width" value="1"/>
+			<!-- commit_width determines the number of ports of register files -->
+			<param name="fp_issue_width" value="1"/>
+			<param name="prediction_width" value="0"/> 
+			<!-- number of branch instructions that can be predicted simultaneously -->
+			<!-- The current version of McPAT does not distinguish int and floating point pipelines. 
+			These parameters are reserved for future use.--> 
+			<param name="pipelines_per_core" value="1,1"/>
+			<!--integer_pipeline and floating_pipelines, if the floating_pipelines is 0, then the pipeline is shared-->
+			<param name="pipeline_depth" value="8,8"/>
+			<!-- pipeline depth of int and fp, if pipeline is shared, the second number is the average cycles of fp ops -->
+			<!-- issue and exe unit-->
+			<param name="ALU_per_core" value="1"/>
+			<!-- contains an adder, a shifter, and a logical unit -->
+			<param name="MUL_per_core" value="1"/>
+			<!-- For MUL and Div -->
+			<param name="FPU_per_core" value="8"/>		
+			<!-- buffer between IF and ID stage -->
+			<param name="instruction_buffer_size" value="1"/>
+			<!-- buffer between ID and sche/exe stage -->
+			<param name="decoded_stream_buffer_size" value="1"/>
+			<param name="instruction_window_scheme" value="0"/><!-- 0 PHYREG based, 1 RSBASED-->
+			<!-- McPAT support 2 types of OoO cores, RS based and physical reg based-->
+			<param name="instruction_window_size" value="1"/>
+			<param name="fp_instruction_window_size" value="1"/>
+			<!-- the instruction issue Q as in Alpha 21264; The RS as in Intel P6 -->
+			<param name="ROB_size" value="0"/>
+			<!-- each in-flight instruction has an entry in ROB -->
+			<!-- registers -->
+
+			<!-- SM parameters  Added by Syed Gilani -->
+			<param name="rf_banks" value="16"/>
+			<param name="simd_width" value="8"/>
+			<param name="collector_units" value="16"/>
+			<param name="core_clock_ratio" value="2"/>
+			<param name="warp_size" value="32"/>
+
+			<param name="archi_Regs_IRF_size" value="8192"/>			
+			<param name="archi_Regs_FRF_size" value="32"/>
+			<!--  if OoO processor, phy_reg number is needed for renaming logic, 
+			renaming logic is for both integer and floating point insts.  -->
+			<param name="phy_Regs_IRF_size" value="32"/>
+			<param name="phy_Regs_FRF_size" value="32"/>
+			<!-- rename logic -->
+			<param name="rename_scheme" value="0"/>
+			<!-- can be RAM based(0) or CAM based(1) rename scheme 
+			RAM-based scheme will have free list, status table;
+			CAM-based scheme have the valid bit in the data field of the CAM 
+			both RAM and CAM need RAM-based checkpoint table, checkpoint_depth=# of in_flight instructions;
+			Detailed RAT Implementation see TR -->
+			<param name="register_windows_size" value="0"/>
+			<!-- how many windows in the windowed register file, sun processors;
+			no register windowing is used when this number is 0 -->
+			<!-- In OoO cores, loads and stores can be issued either in order (Pentium Pro) or out of order (Alpha);
+			they will always try to execute out of order though. -->
+			<param name="LSU_order" value="inorder"/>
+			<param name="store_buffer_size" value="32"/>
+			<!-- By default, in-order cores do not have load buffers -->
+			<param name="load_buffer_size" value="32"/>	
+			<!-- number of ports refer to sustainable concurrent memory accesses --> 
+			<param name="memory_ports" value="1"/>	
+			<!-- max_allowed_in_flight_memo_instructions determines the # of ports of the load and store buffers
+			as well as the ports of the Dcache which is connected to the LSU -->	
+			<!-- dual-pumped Dcache can be used to save the extra read/write ports -->
+			<param name="RAS_size" value="1"/>						
+			<!-- general stats, define simulation periods; require total, idle, and busy cycles for sanity check  -->
+			<!-- please note: if target architecture is X86, then all the instructions refer to (fused) micro-ops -->
+			<stat name="total_instructions" value="total_instructions_match_mcpat"/>
+			<stat name="int_instructions" value="int_instruction_match_mcpat"/>
+			<stat name="fp_instructions" value="flt_instruction_match_mcpat"/>
+			<stat name="branch_instructions" value="branch_instruction_match_mcpat"/>
+			<stat name="branch_mispredictions" value="0"/>
+			<stat name="load_instructions" value="load_instruction_match_mcpat"/>
+			<stat name="store_instructions" value="store_instruction_match_mcpat"/>
+			<stat name="committed_instructions" value="total_instructions_match_mcpat"/>
+			<stat name="committed_int_instructions" value="int_instruction_match_mcpat"/>
+			<stat name="committed_fp_instructions" value="flt_instruction_match_mcpat"/>
+			<stat name="pipeline_duty_cycle" value="0.6"/><!--<=1, runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+			<!-- the following cycle stats are used for heterogeneous cores only; 
+				please ignore them for homogeneous cores -->
+			<stat name="total_cycles" value="total_cycles_match_mcpat"/>
+		    <stat name="idle_cycles" value="idle_cycles_match_mcpat"/>
+		    <stat name="busy_cycles"  value="busy_cycles_match_mcpat"/>
+			<!-- instruction buffer stats -->
+			<!-- ROB stats: both RS-based and physical-register-based OoO cores have a ROB.
+			The performance simulator should capture the difference in accesses;
+			otherwise, McPAT has to guess based on the number of committed instructions. -->
+			<stat name="ROB_reads" value="263886"/>
+			<stat name="ROB_writes" value="263886"/>
+			<!-- RAT accesses -->
+			<stat name="rename_accesses" value="263886"/>
+			<stat name="fp_rename_accesses" value="263886"/>
+			<!-- decode and rename stage use this, should be total ic - nop -->
+			<!-- Inst window stats -->
+			<stat name="inst_window_reads" value="263886"/>
+			<stat name="inst_window_writes" value="263886"/>
+			<stat name="inst_window_wakeup_accesses" value="263886"/>
+			<stat name="fp_inst_window_reads" value="263886"/>
+			<stat name="fp_inst_window_writes" value="263886"/>
+			<stat name="fp_inst_window_wakeup_accesses" value="263886"/>
+			<!-- RF accesses: each placeholder names the counter matching its stat (int/float, read/write) -->
+			<stat name="int_regfile_reads" value="int_register_read_access_match_mcpat"/>
+			<stat name="float_regfile_reads" value="float_register_read_access_match_mcpat"/>
+			<stat name="int_regfile_writes" value="int_register_write_access_match_mcpat"/>
+			<stat name="float_regfile_writes" value="float_register_write_access_match_mcpat"/>
+			
+			<!-- The following stat is for operand collector power - Added by Syed -->
+			<stat name="non_rf_operands" value="0"/>
+
+			<!-- accesses to the working reg -->
+			<stat name="function_calls" value="0"/>
+			<stat name="context_switches" value="0"/> <!--not used in the McPAT -->
+			<!-- Number of window switches (number of function calls and returns)-->
+			<!-- Alu stats by default, the processor has one FPU that includes the divider and 
+			 multiplier. The fpu accesses should include accesses to multiplier and divider  -->
+			<stat name="ialu_accesses" value="ialu_accesses_match_mcpat"/>			
+			<stat name="fpu_accesses" value="fpu_accesses_match_mcpat"/>
+			<stat name="mul_accesses" value="mul_accesses_match_mcpat"/>
+			<stat name="cdb_alu_accesses" value="0"/>
+			<stat name="cdb_mul_accesses" value="0"/>
+			<stat name="cdb_fpu_accesses" value="0"/>
+			<!-- multiple cycle accesses should be counted multiple times, 
+			otherwise, McPAT can use internal counter for different floating point instructions 
+			to get final accesses. But that needs detailed info for floating point inst mix -->
+			<!--  Currently the performance simulator should 
+			make sure all the numbers are final numbers, 
+			including the explicit read/write accesses 
+			and the implicit accesses such as replacements, etc.
+			Future versions of McPAT may be able to infer the implicit accesses
+			based on the params and stats of the last-level cache.
+			The same rule applies to all cache access stats too!  -->
+			<!-- following is AF for max power computation. 
+				Do not change them, unless you understand them-->
+			<stat name="IFU_duty_cycle" value="0.25"/>			
+			<stat name="LSU_duty_cycle" value="0.25"/>
+			<stat name="MemManU_I_duty_cycle" value="1"/>
+			<stat name="MemManU_D_duty_cycle" value="0.25"/>
+			<stat name="ALU_duty_cycle" value="0.9"/>
+			<stat name="MUL_duty_cycle" value="0.5"/>
+			<stat name="FPU_duty_cycle" value="1"/><!-- FPU numbers are already average -->
+			<stat name="ALU_cdb_duty_cycle" value="0.9"/>
+			<stat name="MUL_cdb_duty_cycle" value="0.5"/>
+			<stat name="FPU_cdb_duty_cycle" value="0.4"/>
+			<stat name="num_idle_cores" value="0"/><!-- Average Number of idle cores during this period -->  
+			<component id="system.core0.predictor" name="PBT">
+				<!-- branch predictor; tournament predictor see Alpha implementation -->
+				<param name="local_predictor_size" value="10,3"/>
+				<param name="local_predictor_entries" value="1024"/>
+				<param name="global_predictor_entries" value="4096"/>
+				<param name="global_predictor_bits" value="2"/>
+				<param name="chooser_predictor_entries" value="4096"/>
+				<param name="chooser_predictor_bits" value="2"/>
+				<!-- These parameters can be combined like below in next version
+				<param name="load_predictor" value="10,3,1024"/>
+				<param name="global_predictor" value="4096,2"/>
+				<param name="predictor_chooser" value="4096,2"/>
+				-->
+			</component>
+			<component id="system.core0.itlb" name="itlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="1"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+				<!-- there is no write requests to itlb although writes happen to itlb after miss, 
+				which is actually a replacement -->
+			</component>
+			<component id="system.core0.icache" name="icache">
+				<!-- there are no write requests to the icache, although writes happen to it after a miss, 
+				which is actually a replacement -->
+				<param name="icache_config" value="16384,32,4,1,1,3,8,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<!-- cache_policy;//0 no write or write-though with non-write allocate;1 write-back with write-allocate -->
+				<param name="buffer_sizes" value="16, 16, 16,0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size--> 
+				<stat name="read_accesses" value="total_instructions_match_mcpat"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="conflicts" value="0"/>				
+			</component>
+			<component id="system.core0.dtlb" name="dtlb">
+				<param name="number_entries" value="1"/>
+				<stat name="total_accesses" value="1"/>
+				<stat name="total_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.ccache" name="ccache">
+			        <!-- all the buffer related are optional -->
+				<param name="ccache_config" value="65536,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="ccache_read_accesses_match_mcpat"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="ccache_read_misses_match_mcpat"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.tcache" name="tcache">
+			        <!-- all the buffer related are optional -->
+				<param name="tcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="tcache_read_accesses_match_mcpat"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="tcache_read_misses_match_mcpat"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+      <!--model the shared memory by mimicking dcache-->
+			<component id="system.core0.sharedmemory" name="sharedmemory">
+			        <!-- all the buffer related are optional -->
+				<param name="sharedmemory_config" value="16384,16,1,16,1,3,16,0"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="sharedmemory_read_access_match_mcpat"/>
+				<stat name="write_accesses" value="sharedmemory_write_access_match_mcpat"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.dcache" name="dcache">
+			        <!-- all the buffer related are optional -->
+				<param name="dcache_config" value="16384,32,4,1,1,3,8,0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 0"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="dcache_read_access_match_mcpat"/>
+				<stat name="write_accesses" value="dcache_write_access_match_mcpat"/>
+				<stat name="read_misses" value="dcache_read_miss_match_mcpat"/>
+				<stat name="write_misses" value="dcache_write_miss_match_mcpat"/>
+				<stat name="conflicts" value="0"/>	
+			</component>
+			<component id="system.core0.BTB" name="BTB">
+			        <!-- all the buffer related are optional -->
+				<param name="BTB_config" value="8192,4,2,1, 1,3"/>
+				<!-- the parameters are capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			</component>
+	</component>
+		<component id="system.L1Directory0" name="L1Directory0">
+				<param name="Directory_type" value="0"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="2048,1,0,1, 4, 4,8"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="800000"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="20"/>	
+				<stat name="duty_cycle" value="0.45"/>	
+		</component>
+		<component id="system.L2Directory0" name="L2Directory0">
+				<param name="Directory_type" value="1"/>
+			    <!--0 cam based shadowed tag. 1 directory cache -->	
+				<param name="Dir_config" value="1048576,16,16,1,2, 100"/>
+				<!-- the parameters are capacity,block_width, associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+			    <param name="buffer_sizes" value="8, 8, 8, 8"/>	
+				<!-- all the buffer related are optional -->
+			    <param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw search ports -->
+				<param name="device_type" value="0"/>
+				<!-- although there are multiple access types, 
+				the performance simulator needs to cast them into reads or writes,
+				e.g. the invalidates can be considered as writes -->
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="100"/>
+			    <stat name="duty_cycle" value="0.45"/>		
+		</component>
+		<component id="system.L20" name="L20">
+			<!-- all the buffer related are optional -->
+				<param name="L2_config" value="786432,64,16,1, 4,23, 64, 1"/>
+			    <!-- consider 4-way bank interleaving for Niagara 1 -->
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<param name="clockrate" value="1300"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<stat name="read_accesses" value="200000"/>
+				<stat name="write_accesses" value="0"/>
+				<stat name="read_misses" value="0"/>
+				<stat name="write_misses" value="0"/>
+				<stat name="conflicts" value="0"/>	
+			    <stat name="duty_cycle" value="0.5"/>	
+		</component>
+		
+<!--**********************************************************************-->
+<component id="system.L30" name="L30">
+				<param name="L3_config" value="1048576,64,16,1, 2,100, 64,1"/>
+				<!-- the parameters are capacity,block_width, associativity, bank, throughput w.r.t. core clock, latency w.r.t. core clock,output_width, cache policy -->
+				<param name="clockrate" value="3500"/>
+				<param name="ports" value="1,1,1"/>
+				<!-- number of r, w, and rw ports -->
+				<param name="device_type" value="0"/>
+				<param name="buffer_sizes" value="16, 16, 16, 16"/>
+				<!-- cache controller buffer sizes: miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->	
+				<stat name="read_accesses" value="58824"/>
+				<stat name="write_accesses" value="27276"/>
+				<stat name="read_misses" value="1632"/>
+				<stat name="write_misses" value="183"/>
+				<stat name="conflicts" value="0"/>	
+	            <stat name="duty_cycle" value="0.35"/>	
+		</component>
+
+
+<!--**********************************************************************-->
+		<component id="system.NoC0" name="noc0">
+			<param name="clockrate" value="650"/>
+			<param name="type" value="1"/>
+			<!-- 1 NoC, 0 bus -->
+			<param name="horizontal_nodes" value="4"/>
+			<param name="vertical_nodes" value="1"/>
+			<param name="has_global_link" value="0"/>
+			<!-- 1 has global link, 0 does not have global link -->
+			<param name="link_throughput" value="1"/><!--w.r.t clock -->
+			<param name="link_latency" value="1"/><!--w.r.t clock -->
+			<!-- throughput >= latency -->
+			<!-- Router architecture -->
+			<param name="input_ports" value="7"/>
+			<param name="output_ports" value="7"/>
+			<param name="virtual_channel_per_port" value="1"/>
+			<!-- input buffer; in classic routers only input ports need buffers -->
+			<param name="flit_bits" value="32"/>
+			<param name="input_buffer_entries_per_vc" value="1"/><!--VCs within the same ports share input buffers whose size is proportional to the number of VCs-->
+			<param name="chip_coverage" value="1"/>
+			<!-- When multiple NOC present, one NOC will cover part of the whole chip. chip_coverage <=1 -->
+			<stat name="total_accesses" value="360000"/>
+			<!-- This is the number of total accesses within the whole network not for each router -->
+			<stat name="duty_cycle" value="0.6"/>
+		</component>	
+<!--**********************************************************************-->
+<!--**********************************************************************-->
+
+		<component id="system.mem" name="mem">
+			<!-- Main memory property -->
+			<param name="mem_tech_node" value="65"/>
+			<param name="device_clock" value="200"/><!--MHz, this is clock rate of the actual memory device, not the FSB -->
+			<param name="peak_transfer_rate" value="3200"/><!--MB/S-->
+			<param name="internal_prefetch_of_DRAM_chip" value="4"/>
+			<!-- 2 for DDR, 4 for DDR2, 8 for DDR3...-->
+			<!-- the device clock, peak_transfer_rate, and the internal prefetch decide the DIMM property -->
+			<!-- above numbers can be easily found from Wikipedia -->
+			<param name="capacity_per_channel" value="4096"/> <!-- MB -->
+			<!-- capacity_per_Dram_chip=capacity_per_channel/number_of_dimms/number_ranks/Dram_chips_per_rank
+			Current McPAT assumes single DIMMs are used.--> 		
+			<param name="number_ranks" value="2"/>
+			<param name="num_banks_of_DRAM_chip" value="8"/>			
+			<param name="Block_width_of_DRAM_chip" value="64"/> <!-- B -->
+			<param name="output_width_of_DRAM_chip" value="8"/>
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<!--number of Dram_chips_per_rank=" 72/output_width_of_DRAM_chip-->
+			<param name="page_size_of_DRAM_chip" value="8"/> <!-- 8 or 16 -->
+			<param name="burstlength_of_DRAM_chip" value="8"/>
+			<stat name="memory_accesses" value="1052"/>
+			<stat name="memory_reads" value="1052"/>
+			<stat name="memory_writes" value="1052"/>									
+		</component>
+		<component id="system.mc" name="mc">
+			<!-- Memory controllers are for DDR(2,3...) DIMMs -->
+			<!-- The current version of McPAT uses published values for base parameters of the memory controller;
+			improvements on MC will be added in later versions. -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="mc_clock" value="800"/><!--DIMM IO bus clock rate MHz DDR2-400 for Niagara 1--> 
+			<param name="peak_transfer_rate" value="9600"/><!--MB/S  Syed: Quadro FX 5800 has 76.8GB/s mem transfer rate with 8 MCs (8 x 9600 MB/s) -->
+			<param name="block_size" value="64"/><!--B-->
+			<param name="number_mcs" value="8"/><!-- 8 GDDR5 memory controllers  -->
+			<!-- current McPAT only supports homogeneous memory controllers -->
+			<param name="memory_channels_per_mc" value="2"/>
+			<param name="number_ranks" value="1"/>
+			<param name="withPHY" value="0"/>
+			<!-- # of ranks of each channel-->
+			<param name="req_window_size_per_channel" value="16"/>
+			<param name="IO_buffer_size_per_channel" value="16"/>
+			<param name="databus_width" value="32"/>
+			<param name="addressbus_width" value="32"/>
+			<param name="PRT_entries" value="32"/>
+
+			<!-- McPAT will add the control bus width to the addressbus width automatically -->
+			<stat name="memory_accesses" value="memory_accesses_match_mcpat"/>
+			<stat name="memory_reads" value="memory_reads_match_mcpat"/>
+			<stat name="memory_writes" value="memory_writes_match_mcpat"/>
+			<!-- McPAT does not track individual MCs; instead, it takes the total accesses and calculates 
+			the average power per MC or per channel. This is sufficient for most applications. 
+			Further breakdown can easily be added in later versions. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.niu" name="niu">
+			<!-- On chip 10Gb Ethernet NIC, including XAUI Phy and MAC controller  -->
+			<!-- For a minimum IP packet size of 84B at 10Gb/s, a new packet arrives every 67.2ns. 
+				 the low bound of clock rate of a 10Gb MAC is 150Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/> <!-- unlike PCIe and memory controllers, each Ethernet controller only have one port -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- ratio of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual NICs; instead, it takes the total accesses and calculates 
+			the average power per NIC or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.pcie" name="pcie">
+			<!-- On chip PCIe controller, including Phy-->
+			<!-- For a minimum PCIe packet size of 84B at 8Gb/s per lane (PCIe 3.0), a new packet arrives every 84ns. 
+				 the low bound of clock rate of a PCIe per lane logic is 120Mhz -->
+			<param name="type" value="0"/> <!-- 1: low power; 0 high performance -->
+			<param name="withPHY" value="1"/>
+			<param name="clockrate" value="350"/>
+			<param name="number_units" value="0"/>
+			<param name="num_channels" value="8"/> <!-- 2 ,4 ,8 ,16 ,32 -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual PCIe controllers; instead, it takes the total accesses and calculates 
+			the average power per PCIe controller or per channel. This is sufficient for most applications. -->  			
+		</component>
+<!--**********************************************************************-->
+		<component id="system.flashc" name="flashc">
+		    <param name="number_flashcs" value="0"/>
+			<param name="type" value="1"/> <!-- 1: low power; 0 high performance -->
+            <param name="withPHY" value="1"/>
+			<param name="peak_transfer_rate" value="200"/><!--Per controller sustainable peak rate MB/S -->
+			<stat name="duty_cycle" value="1.0"/> <!-- achievable max load <= 1.0 -->
+			<stat name="total_load_perc" value="0.7"/> <!-- Percentage of total achieved load to total achievable bandwidth  -->
+			<!-- McPAT does not track individual flash controllers; instead, it takes the total accesses and calculates 
+			the average power per flash controller or per channel. This is sufficient for most applications -->  			
+		</component>
+<!--**********************************************************************-->
+
+		</component>
+</component>
diff --git a/src/gpuwattch/results/Alpha21364 b/src/gpuwattch/results/Alpha21364
new file mode 100644
index 000000000..1b3d9e4bd
--- /dev/null
+++ b/src/gpuwattch/results/Alpha21364
@@ -0,0 +1,441 @@
+McPAT (version 0.7 of May, 2010) is computing the target processor...
+ 
+Warning: icache array structure cannot satisfy throughput constraint.
+Warning: icache array structure cannot satisfy latency constraint.
+Warning: InstBuffer array structure cannot satisfy throughput constraint.
+Warning: InstBuffer array structure cannot satisfy latency constraint.
+Warning: Branch Target Buffer array structure cannot satisfy throughput constraint.
+Warning: Branch Target Buffer array structure cannot satisfy latency constraint.
+Warning: Global Predictor array structure cannot satisfy throughput constraint.
+Warning: Global Predictor array structure cannot satisfy latency constraint.
+Warning: L1 local Predictor array structure cannot satisfy throughput constraint.
+Warning: L1 local Predictor array structure cannot satisfy latency constraint.
+Warning: L2 local Predictor array structure cannot satisfy throughput constraint.
+Warning: L2 local Predictor array structure cannot satisfy latency constraint.
+Warning: Predictor Chooser array structure cannot satisfy throughput constraint.
+Warning: Predictor Chooser array structure cannot satisfy latency constraint.
+Warning: RAS array structure cannot satisfy throughput constraint.
+Warning: RAS array structure cannot satisfy latency constraint.
+Warning: dcache array structure cannot satisfy throughput constraint.
+Warning: dcache array structure cannot satisfy latency constraint.
+Warning: Integer Register File array structure cannot satisfy throughput constraint.
+Warning: Integer Register File array structure cannot satisfy latency constraint.
+Warning: Floating point Register File array structure cannot satisfy throughput constraint.
+Warning: Floating point Register File array structure cannot satisfy latency constraint.
+Warning: ReorderBuffer array structure cannot satisfy throughput constraint.
+Warning: ReorderBuffer array structure cannot satisfy latency constraint.
+Warning: Int RetireRAT array structure cannot satisfy throughput constraint.
+Warning: Int RetireRAT array structure cannot satisfy latency constraint.
+Warning: Int RetireRAT array structure cannot satisfy latency constraint.
+Warning: Int Free List array structure cannot satisfy throughput constraint.
+Warning: Int Free List array structure cannot satisfy latency constraint.
+Warning: Int Free List array structure cannot satisfy throughput constraint.
+Warning: Int Free List array structure cannot satisfy latency constraint.
+Warning: MC ReadBuffer array structure cannot satisfy throughput constraint.
+Warning: MC ReadBuffer array structure cannot satisfy latency constraint.
+Warning: MC writeBuffer array structure cannot satisfy throughput constraint.
+Warning: MC writeBuffer array structure cannot satisfy latency constraint.
+
+McPAT (version 0.7 of May, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 180 nm
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1200
+
+*****************************************************************************************
+Processor: 
+  Area = 323.859 mm^2
+  Peak Power = 90.0375 W
+  Total Leakage = 0.156795 W
+  Peak Dynamic = 89.8807 W
+  Subthreshold Leakage = 0.151936 W
+  Gate Leakage = 0.00485969 W
+  Runtime Dynamic = 85.2036 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 137.839 mm^2
+    Peak Dynamic = 60.6776 W
+    Subthreshold Leakage = 0.067186 W
+    Gate Leakage = 0.00428355 W
+    Runtime Dynamic = 73.9555 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 137.063 mm^2
+    Peak Dynamic = 3.55835 W
+    Subthreshold Leakage = 0.0778886 W
+    Gate Leakage = 0.00016078 W
+    Runtime Dynamic = 6.34872 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 1.59954 mm^2
+    Peak Dynamic = 0.805902 W
+    Subthreshold Leakage = 0.000311783 W
+    Gate Leakage = 2.63568e-05 W
+    Runtime Dynamic = 0.547665 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 29.1057 mm^2
+    Peak Dynamic = 16.5188 W
+    Subthreshold Leakage = 0.00292556 W
+    Gate Leakage = 0.000166293 W
+    Runtime Dynamic = 2.54446 W
+
+  Total MCs: 
+  Device Type= ITRS high performance device type
+    Area = 18.2519 mm^2
+    Peak Dynamic = 8.32001 W
+    Subthreshold Leakage = 0.00362353 W
+    Gate Leakage = 0.000222708 W
+    Runtime Dynamic = 1.80731 W
+
+*****************************************************************************************
+Core:
+      Area = 137.839 mm^2
+      Peak Dynamic = 60.6776 W
+      Subthreshold Leakage = 0.067186 W
+      Gate Leakage = 0.00428355 W
+      Runtime Dynamic = 73.9555 W
+
+      Instruction Fetch Unit:
+        Area = 27.6096 mm^2
+        Peak Dynamic = 9.86655 W
+        Subthreshold Leakage = 0.00622106 W
+        Gate Leakage = 0.000344671 W
+        Runtime Dynamic = 10.0567 W
+
+          Instruction Cache:
+            Area = 11.4511 mm^2
+            Peak Dynamic = 1.53259 W
+            Subthreshold Leakage = 0.00371341 W
+            Gate Leakage = 0.000171069 W
+            Runtime Dynamic = 2.13168 W
+
+          Branch Target Buffer:
+            Area = 13.3377 mm^2
+            Peak Dynamic = 0.56236 W
+            Subthreshold Leakage = 0.001581 W
+            Gate Leakage = 9.5198e-05 W
+            Runtime Dynamic = 2.24944 W
+
+          Branch Predictor:
+            Area = 2.1618 mm^2
+            Peak Dynamic = 0.234643 W
+            Subthreshold Leakage = 0.000469396 W
+            Gate Leakage = 2.01907e-05 W
+            Runtime Dynamic = 0.198646 W
+
+              Global Predictor:
+                Area = 0.893575 mm^2
+                Peak Dynamic = 0.0726984 W
+                Subthreshold Leakage = 0.000182866 W
+                Gate Leakage = 7.91951e-06 W
+                Runtime Dynamic = 0.0726984 W
+
+              Local Predictor:
+                Area = 0.420241 mm^2
+                Peak Dynamic = 0.0532456 W
+                Subthreshold Leakage = 9.20027e-05 W
+                Gate Leakage = 3.89162e-06 W
+                Runtime Dynamic = 0.0532456 W
+
+                Area = 0.291886 mm^2
+                Peak Dynamic = 0.0292091 W
+                Subthreshold Leakage = 5.262e-05 W
+                Gate Leakage = 2.51093e-06 W
+                Runtime Dynamic = 0.0292091 W
+
+              Chooser:
+                Area = 0.893575 mm^2
+                Peak Dynamic = 0.0726984 W
+                Subthreshold Leakage = 0.000182866 W
+                Gate Leakage = 7.91951e-06 W
+                Runtime Dynamic = 0.0726984 W
+
+              RAS:
+                Area = 0.0827607 mm^2
+                Peak Dynamic = 0.0360009 W
+                Subthreshold Leakage = 1.16623e-05 W
+                Gate Leakage = 4.60036e-07 W
+                Runtime Dynamic = 3.58028e-06 W
+
+          Instruction Buffer:
+            Area = 0.465385 mm^2
+            Peak Dynamic = 2.10455 W
+            Subthreshold Leakage = 6.13248e-05 W
+            Gate Leakage = 4.88113e-06 W
+            Runtime Dynamic = 1.40303 W
+
+          Instruction Decoder:
+            Area = 0.146031 mm^2
+            Peak Dynamic = 4.07384 W
+            Subthreshold Leakage = 7.07416e-05 W
+            Gate Leakage = 3.32268e-06 W
+            Runtime Dynamic = 4.07384 W
+
+      Renaming Unit:
+        Area = 11.7262 mm^2
+        Peak Dynamic = 12.5584 W
+        Subthreshold Leakage = 0.000886804 W
+        Gate Leakage = 9.92419e-05 W
+        Runtime Dynamic = 9.90647 W
+
+          Int Front End RAT:
+            Area = 8.24345 mm^2
+            Peak Dynamic = 8.04227 W
+            Subthreshold Leakage = 0.000376247 W
+            Gate Leakage = 3.40623e-05 W
+            Runtime Dynamic = 8.04227 W
+
+          FP Front End RAT:
+            Area = 2.549 mm^2
+            Peak Dynamic = 2.75082 W
+            Subthreshold Leakage = 0.000149367 W
+            Gate Leakage = 1.30084e-05 W
+            Runtime Dynamic = 1.37541 W
+
+          Free List:
+            Area = 0.446019 mm^2
+            Peak Dynamic = 0.156051 W
+            Subthreshold Leakage = 1.32133e-05 W
+            Gate Leakage = 7.4667e-07 W
+            Runtime Dynamic = 0.312102 W
+
+          Int Retire RAT: 
+            Area = 0.184445 mm^2
+            Peak Dynamic = 0.102656 W
+            Subthreshold Leakage = 8.50239e-06 W
+            Gate Leakage = 5.28869e-07 W
+            Runtime Dynamic = 0.102656 W
+
+          FP Retire RAT:
+            Area = 0.0567228 mm^2
+            Peak Dynamic = 0.0367258 W
+            Subthreshold Leakage = 5.67894e-06 W
+            Gate Leakage = 3.75578e-07 W
+            Runtime Dynamic = 0.0183629 W
+
+          FP Free List:
+            Area = 0.198929 mm^2
+            Peak Dynamic = 0.111293 W
+            Subthreshold Leakage = 8.61952e-06 W
+            Gate Leakage = 5.10875e-07 W
+            Runtime Dynamic = 0.0556467 W
+
+      Load Store Unit:
+        Area = 49.742 mm^2
+        Peak Dynamic = 11.7952 W
+        Subthreshold Leakage = 0.00715349 W
+        Gate Leakage = 0.00052778 W
+        Runtime Dynamic = 31.7658 W
+
+          Data Cache:
+            Area = 36.106 mm^2
+            Peak Dynamic = 9.28008 W
+            Subthreshold Leakage = 0.00663485 W
+            Gate Leakage = 0.000466572 W
+            Runtime Dynamic = 31.332 W
+
+          LoadQ:
+            Area = 2.60005 mm^2
+            Peak Dynamic = 0.578279 W
+            Subthreshold Leakage = 9.67302e-05 W
+            Gate Leakage = 5.59905e-06 W
+            Runtime Dynamic = 0.14457 W
+
+          StoreQ:
+            Area = 2.60005 mm^2
+            Peak Dynamic = 0.578279 W
+            Subthreshold Leakage = 9.67302e-05 W
+            Gate Leakage = 5.59905e-06 W
+            Runtime Dynamic = 0.289139 W
+
+      Memory Management Unit:
+        Area = 8.74543 mm^2
+        Peak Dynamic = 3.77198 W
+        Subthreshold Leakage = 0.00119904 W
+        Gate Leakage = 0.000127183 W
+        Runtime Dynamic = 4.82688 W
+
+          Itlb:
+            Area = 1.97969 mm^2
+            Peak Dynamic = 0.537563 W
+            Subthreshold Leakage = 0.000270576 W
+            Gate Leakage = 2.0845e-05 W
+            Runtime Dynamic = 1.07513 W
+
+          Dtlb:
+            Area = 6.71814 mm^2
+            Peak Dynamic = 1.87586 W
+            Subthreshold Leakage = 0.00060329 W
+            Gate Leakage = 5.63286e-05 W
+            Runtime Dynamic = 3.75174 W
+
+      Execution Unit:
+        Area = 31.4918 mm^2
+        Peak Dynamic = 22.6855 W
+        Subthreshold Leakage = 0.0320294 W
+        Gate Leakage = 0.00198102 W
+        Runtime Dynamic = 17.3997 W
+
+          Register Files:
+            Area = 9.9318 mm^2
+            Peak Dynamic = 3.92301 W
+            Subthreshold Leakage = 0.000295352 W
+            Gate Leakage = 1.33517e-05 W
+            Runtime Dynamic = 1.7929 W
+
+              Integer RF:
+                Area = 6.76678 mm^2
+                Peak Dynamic = 2.35597 W
+                Subthreshold Leakage = 0.000185762 W
+                Gate Leakage = 8.51701e-06 W
+                Runtime Dynamic = 1.60634 W
+
+              Floating Point RF:
+                Area = 3.16503 mm^2
+                Peak Dynamic = 1.56704 W
+                Subthreshold Leakage = 0.00010959 W
+                Gate Leakage = 4.83467e-06 W
+                Runtime Dynamic = 0.186553 W
+
+          Instruction Scheduler:
+            Area = 5.20691 mm^2
+            Peak Dynamic = 2.77224 W
+            Subthreshold Leakage = 0.000202187 W
+            Gate Leakage = 1.05832e-05 W
+            Runtime Dynamic = 3.11355 W
+
+              Instruction Window:
+                Area = 1.23862 mm^2
+                Peak Dynamic = 0.985117 W
+                Subthreshold Leakage = 5.55506e-05 W
+                Gate Leakage = 3.78978e-06 W
+                Runtime Dynamic = 1.23906 W
+
+              FP Instruction Window:
+                Area = 0.481718 mm^2
+                Peak Dynamic = 0.438839 W
+                Subthreshold Leakage = 2.5962e-05 W
+                Gate Leakage = 2.00351e-06 W
+                Runtime Dynamic = 0.526208 W
+
+              ROB:
+                Area = 3.48657 mm^2
+                Peak Dynamic = 1.34828 W
+                Subthreshold Leakage = 0.000120674 W
+                Gate Leakage = 4.78991e-06 W
+                Runtime Dynamic = 1.34828 W
+
+          Integer ALUs (Count: 4 ):
+            Area = 3.4944 mm^2
+            Peak Dynamic = 4.23312 W
+            Subthreshold Leakage = 0.016149 W
+            Gate Leakage = 0.000986885 W
+            Runtime Dynamic = 3.21343 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 12.705 mm^2
+            Peak Dynamic = 3.52215 W
+            Subthreshold Leakage = 0.0146787 W
+            Gate Leakage = 0.000897034 W
+            Runtime Dynamic = 3.52215 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.106062 mm^2
+            Peak Dynamic = 6.87645 W
+            Subthreshold Leakage = 0.000378957 W
+            Gate Leakage = 2.31585e-05 W
+            Runtime Dynamic = 5.75766 W
+
+*****************************************************************************************
+L2
+      Area = 137.063 mm^2
+      Peak Dynamic = 3.55835 W
+      Subthreshold Leakage = 0.0778886 W
+      Gate Leakage = 0.00016078 W
+      Runtime Dynamic = 6.34872 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 1.59954 mm^2
+      Peak Dynamic = 0.805902 W
+      Subthreshold Leakage = 0.000311783 W
+      Gate Leakage = 2.63568e-05 W
+      Runtime Dynamic = 0.547665 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 9.12595 mm^2
+      Peak Dynamic = 4.16 W
+      Subthreshold Leakage = 0.00181177 W
+      Gate Leakage = 0.000111354 W
+      Runtime Dynamic = 1.80731 W
+
+      Front End Engine:
+        Area = 5.49326 mm^2
+        Peak Dynamic = 1.42883 W
+        Subthreshold Leakage = 0.000132955 W
+        Gate Leakage = 8.76015e-06 W
+        Runtime Dynamic = 0.348049 W
+
+      Transaction Engine:
+        Area = 1.50616 mm^2
+        Peak Dynamic = 1.93117 W
+        Subthreshold Leakage = 0.000696058 W
+        Gate Leakage = 4.25369e-05 W
+        Runtime Dynamic = 0.579332 W
+
+      PHY:
+        Area = 2.12653 mm^2
+        Peak Dynamic = 0.8 W
+        Subthreshold Leakage = 0.000982753 W
+        Gate Leakage = 6.00571e-05 W
+        Runtime Dynamic = 0.879928 W
+
+*****************************************************************************************
+NOC
+      Area = 29.1057 mm^2
+      Peak Dynamic = 16.5188 W
+      Subthreshold Leakage = 0.00292556 W
+      Gate Leakage = 0.000166293 W
+      Runtime Dynamic = 2.54446 W
+
+      Router: 
+        Area = 28.4197 mm^2
+        Peak Dynamic = 8.76431 W
+        Subthreshold Leakage = 0.00199965 W
+        Gate Leakage = 0.000109709 W
+        Runtime Dynamic = 1.25204 W
+
+            Virtual Channel Buffer:
+              Area = 17.0424 mm^2
+              Peak Dynamic = 7.30291 W
+              Subthreshold Leakage = 0.00119658 W
+              Gate Leakage = 4.15511e-05 W
+              Runtime Dynamic = 1.04327 W
+
+            Crossbar:
+              Area = 0.357655 mm^2
+              Peak Dynamic = 1.27997 W
+              Subthreshold Leakage = 0.000801415 W
+              Gate Leakage = 6.80527e-05 W
+              Runtime Dynamic = 0.182853 W
+
+            Arbiter:
+              Peak Dynamic = 0.18143 W
+              Subthreshold Leakage = 1.65956e-06 W
+              Gate Leakage = 1.05559e-07 W
+              Runtime Dynamic = 0.0259186 W
+
+      Per Router : 
+        Area = 0.685989 mm^2
+        Peak Dynamic = 7.75447 W
+        Subthreshold Leakage = 0.000925911 W
+        Gate Leakage = 5.65834e-05 W
+        Runtime Dynamic = 1.29241 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/Alpha21364_90nm b/src/gpuwattch/results/Alpha21364_90nm
new file mode 100644
index 000000000..2a97d7732
--- /dev/null
+++ b/src/gpuwattch/results/Alpha21364_90nm
@@ -0,0 +1,408 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+Warning: icache array structure cannot satisfy latency constraint.
+Warning: dcache array structure cannot satisfy latency constraint.
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 90 nm
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1200
+
+*****************************************************************************************
+Processor: 
+  Area = 139.86 mm^2
+  Peak Power = 34.9936 W
+  Total Leakage = 4.16949 W
+  Peak Dynamic = 30.8241 W
+  Subthreshold Leakage = 3.86203 W
+  Gate Leakage = 0.307463 W
+  Runtime Dynamic = 34.0612 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 61.1957 mm^2
+    Peak Dynamic = 19.6269 W
+    Subthreshold Leakage = 2.04452 W
+    Gate Leakage = 0.277429 W
+    Runtime Dynamic = 29.5972 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 62.2653 mm^2
+    Peak Dynamic = 1.42987 W
+    Subthreshold Leakage = 1.65481 W
+    Gate Leakage = 0.00860545 W
+    Runtime Dynamic = 2.73329 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 0.533824 mm^2
+    Peak Dynamic = 0.275566 W
+    Subthreshold Leakage = 0.00929753 W
+    Gate Leakage = 0.00179126 W
+    Runtime Dynamic = 0.193681 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 8.77595 mm^2
+    Peak Dynamic = 6.17873 W
+    Subthreshold Leakage = 0.108357 W
+    Gate Leakage = 0.0139259 W
+    Runtime Dynamic = 0.963385 W
+
+  Total MCs: 
+  Device Type= ITRS high performance device type
+    Area = 7.08925 mm^2
+    Peak Dynamic = 3.3131 W
+    Subthreshold Leakage = 0.0450389 W
+    Gate Leakage = 0.00571171 W
+    Runtime Dynamic = 0.573656 W
+
+*****************************************************************************************
+Core:
+      Area = 61.1957 mm^2
+      Peak Dynamic = 19.6269 W
+      Subthreshold Leakage = 2.04452 W
+      Gate Leakage = 0.277429 W
+      Runtime Dynamic = 29.5972 W
+
+      Instruction Fetch Unit:
+        Area = 7.40352 mm^2
+        Peak Dynamic = 2.10646 W
+        Subthreshold Leakage = 0.126581 W
+        Gate Leakage = 0.0150397 W
+        Runtime Dynamic = 2.55478 W
+
+          Instruction Cache:
+            Area = 5.01657 mm^2
+            Peak Dynamic = 0.745807 W
+            Subthreshold Leakage = 0.0906167 W
+            Gate Leakage = 0.010922 W
+            Runtime Dynamic = 1.22193 W
+
+          Branch Target Buffer:
+            Area = 1.63475 mm^2
+            Peak Dynamic = 0.0974373 W
+            Subthreshold Leakage = 0.0188281 W
+            Gate Leakage = 0.00126965 W
+            Runtime Dynamic = 0.389749 W
+
+          Branch Predictor:
+            Area = 0.474272 mm^2
+            Peak Dynamic = 0.0682449 W
+            Subthreshold Leakage = 0.00901262 W
+            Gate Leakage = 0.00067136 W
+            Runtime Dynamic = 0.0636543 W
+
+              Global Predictor:
+                Area = 0.190297 mm^2
+                Peak Dynamic = 0.0224229 W
+                Subthreshold Leakage = 0.00351842 W
+                Gate Leakage = 0.000260107 W
+                Runtime Dynamic = 0.0239711 W
+
+              Local Predictor:
+                Area = 0.0959237 mm^2
+                Peak Dynamic = 0.0143301 W
+                Subthreshold Leakage = 0.00171829 W
+                Gate Leakage = 0.00012889 W
+                Runtime Dynamic = 0.015711 W
+
+                Area = 0.0484908 mm^2
+                Peak Dynamic = 0.0077514 W
+                Subthreshold Leakage = 0.000926283 W
+                Gate Leakage = 7.55051e-05 W
+                Runtime Dynamic = 0.00850163 W
+
+              Chooser:
+                Area = 0.190297 mm^2
+                Peak Dynamic = 0.0224229 W
+                Subthreshold Leakage = 0.00351842 W
+                Gate Leakage = 0.000260107 W
+                Runtime Dynamic = 0.0239711 W
+
+              RAS:
+                Area = 0.0451868 mm^2
+                Peak Dynamic = 0.00906891 W
+                Subthreshold Leakage = 0.00025749 W
+                Gate Leakage = 2.22565e-05 W
+                Runtime Dynamic = 1.06361e-06 W
+
+          Instruction Buffer:
+            Area = 0.11139 mm^2
+            Peak Dynamic = 0.30298 W
+            Subthreshold Leakage = 0.000556928 W
+            Gate Leakage = 4.34124e-05 W
+            Runtime Dynamic = 0.201987 W
+
+          Instruction Decoder:
+            Area = 0.0481902 mm^2
+            Peak Dynamic = 0.677465 W
+            Subthreshold Leakage = 0.00135195 W
+            Gate Leakage = 0.000132907 W
+            Runtime Dynamic = 0.677465 W
+
+      Renaming Unit:
+        Area = 4.5037 mm^2
+        Peak Dynamic = 4.11785 W
+        Subthreshold Leakage = 0.0296009 W
+        Gate Leakage = 0.00668098 W
+        Runtime Dynamic = 3.24944 W
+
+          Int Front End RAT:
+            Area = 2.76467 mm^2
+            Peak Dynamic = 2.43279 W
+            Subthreshold Leakage = 0.0129405 W
+            Gate Leakage = 0.00255854 W
+            Runtime Dynamic = 2.43279 W
+
+          FP Front End RAT:
+            Area = 1.39233 mm^2
+            Peak Dynamic = 1.35403 W
+            Subthreshold Leakage = 0.00981219 W
+            Gate Leakage = 0.00205621 W
+            Runtime Dynamic = 0.677017 W
+
+          Free List:
+            Area = 0.116928 mm^2
+            Peak Dynamic = 0.0436483 W
+            Subthreshold Leakage = 0.000259915 W
+            Gate Leakage = 2.53395e-05 W
+            Runtime Dynamic = 0.0872966 W
+
+          Int Retire RAT: 
+            Area = 0.0429772 mm^2
+            Peak Dynamic = 0.0318091 W
+            Subthreshold Leakage = 0.000152798 W
+            Gate Leakage = 1.86722e-05 W
+            Runtime Dynamic = 0.0318091 W
+
+          FP Retire RAT:
+            Area = 0.0153516 mm^2
+            Peak Dynamic = 0.00997874 W
+            Subthreshold Leakage = 8.06509e-05 W
+            Gate Leakage = 7.17049e-06 W
+            Runtime Dynamic = 0.00498937 W
+
+          FP Free List:
+            Area = 0.0530951 mm^2
+            Peak Dynamic = 0.0310624 W
+            Subthreshold Leakage = 0.000140326 W
+            Gate Leakage = 1.46766e-05 W
+            Runtime Dynamic = 0.0155312 W
+
+      Load Store Unit:
+        Area = 20.5622 mm^2
+        Peak Dynamic = 5.14439 W
+        Subthreshold Leakage = 0.207699 W
+        Gate Leakage = 0.0357344 W
+        Runtime Dynamic = 16.0217 W
+
+          Data Cache:
+            Area = 15.2468 mm^2
+            Peak Dynamic = 4.5468 W
+            Subthreshold Leakage = 0.19694 W
+            Gate Leakage = 0.0331746 W
+            Runtime Dynamic = 15.8781 W
+
+          LoadQ:
+            Area = 0.863734 mm^2
+            Peak Dynamic = 0.191536 W
+            Subthreshold Leakage = 0.00227213 W
+            Gate Leakage = 0.000279753 W
+            Runtime Dynamic = 0.047884 W
+
+          StoreQ:
+            Area = 0.863734 mm^2
+            Peak Dynamic = 0.191536 W
+            Subthreshold Leakage = 0.00227213 W
+            Gate Leakage = 0.000279753 W
+            Runtime Dynamic = 0.0957681 W
+
+      Memory Management Unit:
+        Area = 3.49533 mm^2
+        Peak Dynamic = 1.34391 W
+        Subthreshold Leakage = 0.0412098 W
+        Gate Leakage = 0.00931467 W
+        Runtime Dynamic = 2.25879 W
+
+          Itlb:
+            Area = 1.12903 mm^2
+            Peak Dynamic = 0.425717 W
+            Subthreshold Leakage = 0.0152632 W
+            Gate Leakage = 0.00308734 W
+            Runtime Dynamic = 0.851444 W
+
+          Dtlb:
+            Area = 2.24796 mm^2
+            Peak Dynamic = 0.703668 W
+            Subthreshold Leakage = 0.0197321 W
+            Gate Leakage = 0.00422696 W
+            Runtime Dynamic = 1.40735 W
+
+      Execution Unit:
+        Area = 18.9802 mm^2
+        Peak Dynamic = 6.91426 W
+        Subthreshold Leakage = 1.01207 W
+        Gate Leakage = 0.130415 W
+        Runtime Dynamic = 5.51245 W
+
+          Register Files:
+            Area = 4.63431 mm^2
+            Peak Dynamic = 1.07973 W
+            Subthreshold Leakage = 0.00557121 W
+            Gate Leakage = 0.000534421 W
+            Runtime Dynamic = 0.491409 W
+
+              Integer RF:
+                Area = 3.11444 mm^2
+                Peak Dynamic = 0.64479 W
+                Subthreshold Leakage = 0.00348926 W
+                Gate Leakage = 0.000338898 W
+                Runtime Dynamic = 0.43963 W
+
+              Floating Point RF:
+                Area = 1.51987 mm^2
+                Peak Dynamic = 0.434944 W
+                Subthreshold Leakage = 0.00208194 W
+                Gate Leakage = 0.000195523 W
+                Runtime Dynamic = 0.051779 W
+
+          Instruction Scheduler:
+            Area = 2.2958 mm^2
+            Peak Dynamic = 0.682653 W
+            Subthreshold Leakage = 0.0043779 W
+            Gate Leakage = 0.000496354 W
+            Runtime Dynamic = 0.783433 W
+
+              Instruction Window:
+                Area = 0.416485 mm^2
+                Peak Dynamic = 0.230852 W
+                Subthreshold Leakage = 0.001531 W
+                Gate Leakage = 0.000214549 W
+                Runtime Dynamic = 0.308242 W
+
+              FP Instruction Window:
+                Area = 0.160067 mm^2
+                Peak Dynamic = 0.0899719 W
+                Subthreshold Leakage = 0.000573841 W
+                Gate Leakage = 9.08104e-05 W
+                Runtime Dynamic = 0.113361 W
+
+              ROB:
+                Area = 1.71925 mm^2
+                Peak Dynamic = 0.361829 W
+                Subthreshold Leakage = 0.00227307 W
+                Gate Leakage = 0.000190995 W
+                Runtime Dynamic = 0.361829 W
+
+          Integer ALUs (Count: 4 ):
+            Area = 2.56256 mm^2
+            Peak Dynamic = 1.45952 W
+            Subthreshold Leakage = 0.514377 W
+            Gate Leakage = 0.0657924 W
+            Runtime Dynamic = 1.12031 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 9.317 mm^2
+            Peak Dynamic = 1.32571 W
+            Subthreshold Leakage = 0.467545 W
+            Gate Leakage = 0.0598023 W
+            Runtime Dynamic = 1.32571 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0521609 mm^2
+            Peak Dynamic = 2.15212 W
+            Subthreshold Leakage = 0.0139887 W
+            Gate Leakage = 0.00178925 W
+            Runtime Dynamic = 1.79159 W
+
+*****************************************************************************************
+L2
+      Area = 62.2653 mm^2
+      Peak Dynamic = 1.42987 W
+      Subthreshold Leakage = 1.65481 W
+      Gate Leakage = 0.00860545 W
+      Runtime Dynamic = 2.73329 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 0.533824 mm^2
+      Peak Dynamic = 0.275566 W
+      Subthreshold Leakage = 0.00929753 W
+      Gate Leakage = 0.00179126 W
+      Runtime Dynamic = 0.193681 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 3.54463 mm^2
+      Peak Dynamic = 1.65655 W
+      Subthreshold Leakage = 0.0225194 W
+      Gate Leakage = 0.00285586 W
+      Runtime Dynamic = 0.573656 W
+
+      Front End Engine:
+        Area = 1.72828 mm^2
+        Peak Dynamic = 0.389588 W
+        Subthreshold Leakage = 0.00246696 W
+        Gate Leakage = 0.000291005 W
+        Runtime Dynamic = 0.0911898 W
+
+      Transaction Engine:
+        Area = 0.75308 mm^2
+        Peak Dynamic = 1.13896 W
+        Subthreshold Leakage = 0.00831402 W
+        Gate Leakage = 0.00106342 W
+        Runtime Dynamic = 0.341678 W
+
+      PHY:
+        Area = 1.06326 mm^2
+        Peak Dynamic = 0.128 W
+        Subthreshold Leakage = 0.0117384 W
+        Gate Leakage = 0.00150143 W
+        Runtime Dynamic = 0.140788 W
+
+*****************************************************************************************
+NOC
+      Area = 8.77595 mm^2
+      Peak Dynamic = 6.17873 W
+      Subthreshold Leakage = 0.108357 W
+      Gate Leakage = 0.0139259 W
+      Runtime Dynamic = 0.963385 W
+
+      Router: 
+        Area = 8.3047 mm^2
+        Peak Dynamic = 2.78895 W
+        Subthreshold Leakage = 0.0606175 W
+        Gate Leakage = 0.00781974 W
+        Runtime Dynamic = 0.398421 W
+
+            Virtual Channel Buffer:
+              Area = 4.2978 mm^2
+              Peak Dynamic = 2.31409 W
+              Subthreshold Leakage = 0.028002 W
+              Gate Leakage = 0.00227471 W
+              Runtime Dynamic = 0.330584 W
+
+            Crossbar:
+              Area = 0.160538 mm^2
+              Peak Dynamic = 0.437862 W
+              Subthreshold Leakage = 0.0325996 W
+              Gate Leakage = 0.00554292 W
+              Runtime Dynamic = 0.0625517 W
+
+            Arbiter:
+              Peak Dynamic = 0.0370018 W
+              Subthreshold Leakage = 1.5858e-05 W
+              Gate Leakage = 2.11117e-06 W
+              Runtime Dynamic = 0.00528597 W
+
+      Per Router Links: 
+        Area = 0.471256 mm^2
+        Peak Dynamic = 3.38978 W
+        Subthreshold Leakage = 0.0477391 W
+        Gate Leakage = 0.00610616 W
+        Runtime Dynamic = 0.564963 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/Penryn b/src/gpuwattch/results/Penryn
new file mode 100644
index 000000000..af39390d1
--- /dev/null
+++ b/src/gpuwattch/results/Penryn
@@ -0,0 +1,315 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 45 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3700
+
+*****************************************************************************************
+Processor: 
+  Area = 92.2661 mm^2
+  Peak Power = 61.0228 W
+  Total Leakage = 10.8609 W
+  Peak Dynamic = 50.1619 W
+  Subthreshold Leakage = 10.2773 W
+  Gate Leakage = 0.583567 W
+  Runtime Dynamic = 69.6347 W
+
+  Total Cores: 2 cores 
+  Device Type= ITRS high performance device type
+    Area = 48.2438 mm^2
+    Peak Dynamic = 39.6676 W
+    Subthreshold Leakage = 6.96165 W
+    Gate Leakage = 0.541077 W
+    Runtime Dynamic = 51.4987 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 43.1009 mm^2
+    Peak Dynamic = 6.43272 W
+    Subthreshold Leakage = 3.28049 W
+    Gate Leakage = 0.0386655 W
+    Runtime Dynamic = 13.716 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 0.921404 mm^2
+    Peak Dynamic = 4.06164 W
+    Subthreshold Leakage = 0.035183 W
+    Gate Leakage = 0.00382481 W
+    Runtime Dynamic = 4.42002 W
+
+*****************************************************************************************
+Core:
+      Area = 24.1219 mm^2
+      Peak Dynamic = 19.8338 W
+      Subthreshold Leakage = 3.48083 W
+      Gate Leakage = 0.270538 W
+      Runtime Dynamic = 51.4987 W
+
+      Instruction Fetch Unit:
+        Area = 3.13582 mm^2
+        Peak Dynamic = 2.49774 W
+        Subthreshold Leakage = 0.421089 W
+        Gate Leakage = 0.0246791 W
+        Runtime Dynamic = 2.42869 W
+
+          Instruction Cache:
+            Area = 0.702441 mm^2
+            Peak Dynamic = 0.419702 W
+            Subthreshold Leakage = 0.0413175 W
+            Gate Leakage = 0.00175164 W
+            Runtime Dynamic = 0.487111 W
+
+          Branch Target Buffer:
+            Area = 0.349484 mm^2
+            Peak Dynamic = 0.0903353 W
+            Subthreshold Leakage = 0.0243658 W
+            Gate Leakage = 0.000966387 W
+            Runtime Dynamic = 0.361341 W
+
+          Branch Predictor:
+            Area = 0.153017 mm^2
+            Peak Dynamic = 0.0718712 W
+            Subthreshold Leakage = 0.0142615 W
+            Gate Leakage = 0.000619154 W
+            Runtime Dynamic = 0.0647272 W
+
+              Global Predictor:
+                Area = 0.0475693 mm^2
+                Peak Dynamic = 0.0231158 W
+                Subthreshold Leakage = 0.00544747 W
+                Gate Leakage = 0.000234591 W
+                Runtime Dynamic = 0.0245764 W
+
+              Local Predictor:
+              L1_Local Predictor:
+                Area = 0.0239764 mm^2
+                Peak Dynamic = 0.0142817 W
+                Subthreshold Leakage = 0.00265926 W
+                Gate Leakage = 0.00011608 W
+                Runtime Dynamic = 0.0155731 W
+
+              L2_Local Predictor:
+                Area = 0.012121 mm^2
+                Peak Dynamic = 0.00767395 W
+                Subthreshold Leakage = 0.00143248 W
+                Gate Leakage = 6.77717e-05 W
+                Runtime Dynamic = 0.00837399 W
+
+              Chooser:
+                Area = 0.0475693 mm^2
+                Peak Dynamic = 0.0231158 W
+                Subthreshold Leakage = 0.00544747 W
+                Gate Leakage = 0.000234591 W
+                Runtime Dynamic = 0.0245764 W
+
+              RAS:
+                Area = 0.0217815 mm^2
+                Peak Dynamic = 0.0113578 W
+                Subthreshold Leakage = 0.000707258 W
+                Gate Leakage = 3.38921e-05 W
+                Runtime Dynamic = 1.2459e-06 W
+
+          Instruction Buffer:
+            Area = 0.0278406 mm^2
+            Peak Dynamic = 0.282368 W
+            Subthreshold Leakage = 0.000861686 W
+            Gate Leakage = 3.91839e-05 W
+            Runtime Dynamic = 0.188245 W
+
+          Instruction Decoder:
+            Area = 1.85799 mm^2
+            Peak Dynamic = 1.32726 W
+            Subthreshold Leakage = 0.325606 W
+            Gate Leakage = 0.0185411 W
+            Runtime Dynamic = 1.32726 W
+
+      Renaming Unit:
+        Area = 1.02517 mm^2
+        Peak Dynamic = 2.25746 W
+        Subthreshold Leakage = 0.042129 W
+        Gate Leakage = 0.00480502 W
+        Runtime Dynamic = 1.55315 W
+
+          Int Front End RAT:
+            Area = 0.59725 mm^2
+            Peak Dynamic = 1.25286 W
+            Subthreshold Leakage = 0.0159587 W
+            Gate Leakage = 0.00122436 W
+            Runtime Dynamic = 1.11309 W
+
+          FP Front End RAT:
+            Area = 0.350662 mm^2
+            Peak Dynamic = 0.652971 W
+            Subthreshold Leakage = 0.0110219 W
+            Gate Leakage = 0.00079321 W
+            Runtime Dynamic = 0.326485 W
+
+          Free List:
+            Area = 0.0322035 mm^2
+            Peak Dynamic = 0.0454309 W
+            Subthreshold Leakage = 0.000471802 W
+            Gate Leakage = 2.57995e-05 W
+            Runtime Dynamic = 0.113577 W
+
+      Load Store Unit:
+        Area = 7.24152 mm^2
+        Peak Dynamic = 6.57278 W
+        Subthreshold Leakage = 0.310798 W
+        Gate Leakage = 0.0358085 W
+        Runtime Dynamic = 34.9208 W
+
+          Data Cache:
+            Area = 4.65034 mm^2
+            Peak Dynamic = 5.03369 W
+            Subthreshold Leakage = 0.237004 W
+            Gate Leakage = 0.0253255 W
+            Runtime Dynamic = 33.601 W
+
+          LoadQ:
+            Area = 0.260806 mm^2
+            Peak Dynamic = 0.132332 W
+            Subthreshold Leakage = 0.00523814 W
+            Gate Leakage = 0.000359005 W
+            Runtime Dynamic = 0.0661662 W
+
+          StoreQ:
+            Area = 1.06006 mm^2
+            Peak Dynamic = 1.25365 W
+            Subthreshold Leakage = 0.0538794 W
+            Gate Leakage = 0.00736236 W
+            Runtime Dynamic = 1.25365 W
+
+      Memory Management Unit:
+        Area = 0.363299 mm^2
+        Peak Dynamic = 0.610831 W
+        Subthreshold Leakage = 0.0388017 W
+        Gate Leakage = 0.00431691 W
+        Runtime Dynamic = 1.29234 W
+
+          Itlb:
+            Area = 0.0590462 mm^2
+            Peak Dynamic = 0.116192 W
+            Subthreshold Leakage = 0.00608044 W
+            Gate Leakage = 0.000398475 W
+            Runtime Dynamic = 0.232386 W
+
+          Dtlb:
+            Area = 0.259199 mm^2
+            Peak Dynamic = 0.264986 W
+            Subthreshold Leakage = 0.0180446 W
+            Gate Leakage = 0.00115678 W
+            Runtime Dynamic = 1.05995 W
+
+      Execution Unit:
+        Area = 7.9594 mm^2
+        Peak Dynamic = 7.89497 W
+        Subthreshold Leakage = 1.28761 W
+        Gate Leakage = 0.0977152 W
+        Runtime Dynamic = 11.3037 W
+
+          Register Files:
+            Area = 0.528076 mm^2
+            Peak Dynamic = 0.554172 W
+            Subthreshold Leakage = 0.00459231 W
+            Gate Leakage = 0.000305031 W
+            Runtime Dynamic = 0.283985 W
+
+              Integer RF:
+                Area = 0.336446 mm^2
+                Peak Dynamic = 0.461344 W
+                Subthreshold Leakage = 0.00257976 W
+                Gate Leakage = 0.00018025 W
+                Runtime Dynamic = 0.247149 W
+
+              Floating Point RF:
+                Area = 0.19163 mm^2
+                Peak Dynamic = 0.0928276 W
+                Subthreshold Leakage = 0.00201255 W
+                Gate Leakage = 0.000124781 W
+                Runtime Dynamic = 0.0368364 W
+
+          Instruction Scheduler:
+            Area = 1.97424 mm^2
+            Peak Dynamic = 1.76421 W
+            Subthreshold Leakage = 0.0212898 W
+            Gate Leakage = 0.0014052 W
+            Runtime Dynamic = 1.96388 W
+
+              Instruction Window:
+                Area = 0.889691 mm^2
+                Peak Dynamic = 0.468182 W
+                Subthreshold Leakage = 0.0081033 W
+                Gate Leakage = 0.000620258 W
+                Runtime Dynamic = 0.601258 W
+
+              FP Instruction Window:
+                Area = 0.347423 mm^2
+                Peak Dynamic = 0.230453 W
+                Subthreshold Leakage = 0.00381664 W
+                Gate Leakage = 0.000293336 W
+                Runtime Dynamic = 0.29704 W
+
+              ROB:
+                Area = 0.737129 mm^2
+                Peak Dynamic = 1.06558 W
+                Subthreshold Leakage = 0.00936988 W
+                Gate Leakage = 0.000491606 W
+                Runtime Dynamic = 1.06558 W
+
+          Integer ALUs (Count: 6 ):
+            Area = 0.47087 mm^2
+            Peak Dynamic = 2.2206 W
+            Subthreshold Leakage = 0.295671 W
+            Gate Leakage = 0.0221076 W
+            Runtime Dynamic = 1.14549 W
+
+          Floating Point Units (FPUs) (Count: 2 ):
+            Area = 4.6585 mm^2
+            Peak Dynamic = 0.708407 W
+            Subthreshold Leakage = 0.731296 W
+            Gate Leakage = 0.0546797 W
+            Runtime Dynamic = 1.28625 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.235435 mm^2
+            Peak Dynamic = 0.257249 W
+            Subthreshold Leakage = 0.147835 W
+            Gate Leakage = 0.0110538 W
+            Runtime Dynamic = 1.57424 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0472187 mm^2
+            Peak Dynamic = 2.08413 W
+            Subthreshold Leakage = 0.0722513 W
+            Gate Leakage = 0.00540229 W
+            Runtime Dynamic = 5.04986 W
+
+*****************************************************************************************
+L2
+      Area = 43.1009 mm^2
+      Peak Dynamic = 6.43272 W
+      Subthreshold Leakage = 3.28049 W
+      Gate Leakage = 0.0386655 W
+      Runtime Dynamic = 13.716 W
+
+*****************************************************************************************
+BUSES
+      Area = 0.921404 mm^2
+      Peak Dynamic = 4.06164 W
+      Subthreshold Leakage = 0.035183 W
+      Gate Leakage = 0.00382481 W
+      Runtime Dynamic = 4.42002 W
+
+      Bus: 
+        Area = 0.921404 mm^2
+        Peak Dynamic = 4.06164 W
+        Subthreshold Leakage = 0.035183 W
+        Gate Leakage = 0.00382481 W
+        Runtime Dynamic = 4.42002 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/T1 b/src/gpuwattch/results/T1
new file mode 100644
index 000000000..f63e51c81
--- /dev/null
+++ b/src/gpuwattch/results/T1
@@ -0,0 +1,296 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 90 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1200
+
+*****************************************************************************************
+Processor: 
+  Area = 283.287 mm^2
+  Peak Power = 55.0318 W
+  Total Leakage = 9.78078 W
+  Peak Dynamic = 45.2511 W
+  Subthreshold Leakage = 8.64906 W
+  Gate Leakage = 1.13172 W
+  Runtime Dynamic = 45.5013 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 117.887 mm^2
+    Peak Dynamic = 28.1307 W
+    Subthreshold Leakage = 5.19354 W
+    Gate Leakage = 0.730037 W
+    Runtime Dynamic = 18.917 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 116.308 mm^2
+    Peak Dynamic = 5.51367 W
+    Subthreshold Leakage = 2.41316 W
+    Gate Leakage = 0.242513 W
+    Runtime Dynamic = 4.00707 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 8.77473 mm^2
+    Peak Dynamic = 3.38588 W
+    Subthreshold Leakage = 0.224524 W
+    Gate Leakage = 0.0320801 W
+    Runtime Dynamic = 15.1158 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 8.87598 mm^2
+    Peak Dynamic = 3.67515 W
+    Subthreshold Leakage = 0.488892 W
+    Gate Leakage = 0.0852308 W
+    Runtime Dynamic = 2.20509 W
+
+  Total MCs: 
+  Device Type= ITRS high performance device type
+    Area = 31.441 mm^2
+    Peak Dynamic = 4.5457 W
+    Subthreshold Leakage = 0.328953 W
+    Gate Leakage = 0.0418558 W
+    Runtime Dynamic = 5.25637 W
+
+*****************************************************************************************
+Core:
+      Area = 14.7359 mm^2
+      Peak Dynamic = 3.51633 W
+      Subthreshold Leakage = 0.649192 W
+      Gate Leakage = 0.0912546 W
+      Runtime Dynamic = 18.917 W
+
+      Instruction Fetch Unit:
+        Area = 3.60967 mm^2
+        Peak Dynamic = 0.560912 W
+        Subthreshold Leakage = 0.0396492 W
+        Gate Leakage = 0.00709504 W
+        Runtime Dynamic = 3.76593 W
+
+          Instruction Cache:
+            Area = 3.41818 mm^2
+            Peak Dynamic = 0.308492 W
+            Subthreshold Leakage = 0.0286475 W
+            Gate Leakage = 0.00418329 W
+            Runtime Dynamic = 0.95332 W
+
+          Instruction Buffer:
+            Area = 0.0122742 mm^2
+            Peak Dynamic = 0.0121268 W
+            Subthreshold Leakage = 0.0002042 W
+            Gate Leakage = 1.78658e-05 W
+            Runtime Dynamic = 0.0970143 W
+
+          Instruction Decoder:
+            Area = 0.0229327 mm^2
+            Peak Dynamic = 0.169467 W
+            Subthreshold Leakage = 0.00259055 W
+            Gate Leakage = 0.000252139 W
+            Runtime Dynamic = 1.35574 W
+
+      Load Store Unit:
+        Area = 3.07616 mm^2
+        Peak Dynamic = 0.390349 W
+        Subthreshold Leakage = 0.0362126 W
+        Gate Leakage = 0.00713432 W
+        Runtime Dynamic = 3.85623 W
+
+          Data Cache:
+            Area = 1.47986 mm^2
+            Peak Dynamic = 0.191211 W
+            Subthreshold Leakage = 0.0157454 W
+            Gate Leakage = 0.00208738 W
+            Runtime Dynamic = 0.443377 W
+
+          Load/Store Queue:
+            Area = 1.17458 mm^2
+            Peak Dynamic = 0.128312 W
+            Subthreshold Leakage = 0.0122603 W
+            Gate Leakage = 0.0024052 W
+            Runtime Dynamic = 2.05299 W
+
+      Memory Management Unit:
+        Area = 1.27751 mm^2
+        Peak Dynamic = 0.324071 W
+        Subthreshold Leakage = 0.0192968 W
+        Gate Leakage = 0.0049902 W
+        Runtime Dynamic = 2.53591 W
+
+          Itlb:
+            Area = 0.560615 mm^2
+            Peak Dynamic = 0.117604 W
+            Subthreshold Leakage = 0.00554488 W
+            Gate Leakage = 0.00117423 W
+            Runtime Dynamic = 0.940838 W
+
+          Dtlb:
+            Area = 0.560615 mm^2
+            Peak Dynamic = 0.0294011 W
+            Subthreshold Leakage = 0.00554488 W
+            Gate Leakage = 0.00117423 W
+            Runtime Dynamic = 0.235211 W
+
+      Execution Unit:
+        Area = 3.47025 mm^2
+        Peak Dynamic = 2.241 W
+        Subthreshold Leakage = 0.222601 W
+        Gate Leakage = 0.0296426 W
+        Runtime Dynamic = 8.75894 W
+
+          Register Files:
+            Area = 1.38355 mm^2
+            Peak Dynamic = 0.0746572 W
+            Subthreshold Leakage = 0.00827136 W
+            Gate Leakage = 0.000628178 W
+            Runtime Dynamic = 0.320633 W
+
+              Integer RF:
+                Area = 0.592652 mm^2
+                Peak Dynamic = 0.0582404 W
+                Subthreshold Leakage = 0.00161128 W
+                Gate Leakage = 0.000148771 W
+                Runtime Dynamic = 0.312722 W
+
+              Floating Point RF:
+                Area = 0.592652 mm^2
+                Peak Dynamic = 0.0164168 W
+                Subthreshold Leakage = 0.00161128 W
+                Gate Leakage = 0.000148771 W
+                Runtime Dynamic = 0.00783962 W
+
+              Register Windows:
+                Area = 0.198243 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00504879 W
+                Gate Leakage = 0.000330636 W
+                Runtime Dynamic = 7.11291e-05 W
+
+          Instruction Scheduler:
+            Area = 0.04377 mm^2
+            Peak Dynamic = 0.0284368 W
+            Subthreshold Leakage = 0.000336066 W
+            Gate Leakage = 5.10703e-05 W
+            Runtime Dynamic = 0.244528 W
+
+              Instruction Window:
+                Area = 0.04377 mm^2
+                Peak Dynamic = 0.0284368 W
+                Subthreshold Leakage = 0.000336066 W
+                Gate Leakage = 5.10703e-05 W
+                Runtime Dynamic = 0.244528 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.16016 mm^2
+            Peak Dynamic = 0.305285 W
+            Subthreshold Leakage = 0.0321485 W
+            Gate Leakage = 0.00411202 W
+            Runtime Dynamic = 2.71365 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 1.16463 mm^2
+            Peak Dynamic = 0.0508808 W
+            Subthreshold Leakage = 0.0584431 W
+            Gate Leakage = 0.00747528 W
+            Runtime Dynamic = 0.101762 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.48048 mm^2
+            Peak Dynamic = 0.339206 W
+            Subthreshold Leakage = 0.0964456 W
+            Gate Leakage = 0.0123361 W
+            Runtime Dynamic = 0.678411 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0813807 mm^2
+            Peak Dynamic = 1.18756 W
+            Subthreshold Leakage = 0.0187498 W
+            Gate Leakage = 0.00239823 W
+            Runtime Dynamic = 3.3401 W
+
+*****************************************************************************************
+L2
+      Area = 29.0771 mm^2
+      Peak Dynamic = 1.37842 W
+      Subthreshold Leakage = 0.603289 W
+      Gate Leakage = 0.0606283 W
+      Runtime Dynamic = 4.00707 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 2.19368 mm^2
+      Peak Dynamic = 0.84647 W
+      Subthreshold Leakage = 0.0561311 W
+      Gate Leakage = 0.00802003 W
+      Runtime Dynamic = 15.1158 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 7.86025 mm^2
+      Peak Dynamic = 1.13642 W
+      Subthreshold Leakage = 0.0822383 W
+      Gate Leakage = 0.0104639 W
+      Runtime Dynamic = 5.25637 W
+
+      Front End Engine:
+        Area = 0.63078 mm^2
+        Peak Dynamic = 0.0549429 W
+        Subthreshold Leakage = 0.00242476 W
+        Gate Leakage = 0.00025524 W
+        Runtime Dynamic = 0.241753 W
+
+      Transaction Engine:
+        Area = 2.59502 mm^2
+        Peak Dynamic = 0.569482 W
+        Subthreshold Leakage = 0.0286491 W
+        Gate Leakage = 0.00366442 W
+        Runtime Dynamic = 2.50577 W
+
+      PHY:
+        Area = 4.63445 mm^2
+        Peak Dynamic = 0.512 W
+        Subthreshold Leakage = 0.0511644 W
+        Gate Leakage = 0.00654429 W
+        Runtime Dynamic = 2.50885 W
+
+*****************************************************************************************
+NOC
+      Area = 8.87598 mm^2
+      Peak Dynamic = 3.67515 W
+      Subthreshold Leakage = 0.488892 W
+      Gate Leakage = 0.0852308 W
+      Runtime Dynamic = 2.20509 W
+
+      Router: 
+        Area = 4.43799 mm^2
+        Peak Dynamic = 1.83757 W
+        Subthreshold Leakage = 0.244446 W
+        Gate Leakage = 0.0426154 W
+        Runtime Dynamic = 2.20509 W
+
+            Virtual Channel Buffer:
+              Area = 1.22928 mm^2
+              Peak Dynamic = 0.0508654 W
+              Subthreshold Leakage = 0.000485491 W
+              Gate Leakage = 7.24213e-05 W
+              Runtime Dynamic = 0.0610385 W
+
+            Crossbar:
+              Area = 1.35717 mm^2
+              Peak Dynamic = 1.77185 W
+              Subthreshold Leakage = 0.243949 W
+              Gate Leakage = 0.0425414 W
+              Runtime Dynamic = 2.12622 W
+
+            Arbiter:
+              Peak Dynamic = 0.0148566 W
+              Subthreshold Leakage = 1.15783e-05 W
+              Gate Leakage = 1.54103e-06 W
+              Runtime Dynamic = 0.0178279 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/T1_DC_64 b/src/gpuwattch/results/T1_DC_64
new file mode 100644
index 000000000..cdb0a1b3c
--- /dev/null
+++ b/src/gpuwattch/results/T1_DC_64
@@ -0,0 +1,270 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+line64
+size1.04858e+06
+line9
+size1.04858e+06
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 22 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3500
+
+*****************************************************************************************
+Processor: 
+  Area = 322.362 mm^2
+  Peak Power = 112.557 W
+  Total Leakage = 28.0714 W
+  Peak Dynamic = 84.4853 W
+  Subthreshold Leakage = 27.7571 W
+  Gate Leakage = 0.314289 W
+  Runtime Dynamic = 13.4278 W
+
+  Total Cores: 64 cores 
+  Device Type= ITRS high performance device type
+    Area = 87.1986 mm^2
+    Peak Dynamic = 42.426 W
+    Subthreshold Leakage = 7.80232 W
+    Gate Leakage = 0.0799149 W
+    Runtime Dynamic = 9.61388 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 161.532 mm^2
+    Peak Dynamic = 21.1059 W
+    Subthreshold Leakage = 8.9583 W
+    Gate Leakage = 0.100733 W
+    Runtime Dynamic = 1.14063 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 22.1741 mm^2
+    Peak Dynamic = 0.831407 W
+    Subthreshold Leakage = 1.57123 W
+    Gate Leakage = 0.0148674 W
+    Runtime Dynamic = 0.175856 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 51.4571 mm^2
+    Peak Dynamic = 20.122 W
+    Subthreshold Leakage = 9.42527 W
+    Gate Leakage = 0.118774 W
+    Runtime Dynamic = 2.49747 W
+
+*****************************************************************************************
+Core:
+      Area = 1.36248 mm^2
+      Peak Dynamic = 0.662906 W
+      Subthreshold Leakage = 0.121911 W
+      Gate Leakage = 0.00124867 W
+      Runtime Dynamic = 9.61388 W
+
+      Instruction Fetch Unit:
+        Area = 0.140786 mm^2
+        Peak Dynamic = 0.0863256 W
+        Subthreshold Leakage = 0.00636762 W
+        Gate Leakage = 7.4998e-05 W
+        Runtime Dynamic = 2.08883 W
+
+          Instruction Cache:
+            Area = 0.129377 mm^2
+            Peak Dynamic = 0.0476007 W
+            Subthreshold Leakage = 0.00381804 W
+            Gate Leakage = 2.35266e-05 W
+            Runtime Dynamic = 0.0698158 W
+
+          Instruction Buffer:
+            Area = 0.000754971 mm^2
+            Peak Dynamic = 0.00238165 W
+            Subthreshold Leakage = 4.99334e-05 W
+            Gate Leakage = 3.27157e-07 W
+            Runtime Dynamic = 0.0190532 W
+
+          Instruction Decoder:
+            Area = 0.00131543 mm^2
+            Peak Dynamic = 0.0246042 W
+            Subthreshold Leakage = 0.000538954 W
+            Gate Leakage = 3.91915e-06 W
+            Runtime Dynamic = 0.196833 W
+
+      Load Store Unit:
+        Area = 0.0977414 mm^2
+        Peak Dynamic = 0.0587123 W
+        Subthreshold Leakage = 0.00580883 W
+        Gate Leakage = 7.48788e-05 W
+        Runtime Dynamic = 2.07447 W
+
+          Data Cache:
+            Area = 0.0569223 mm^2
+            Peak Dynamic = 0.0329939 W
+            Subthreshold Leakage = 0.00249221 W
+            Gate Leakage = 1.63814e-05 W
+            Runtime Dynamic = 0.0476753 W
+
+          Load/Store Queue:
+            Area = 0.023444 mm^2
+            Peak Dynamic = 0.0139792 W
+            Subthreshold Leakage = 0.00135593 W
+            Gate Leakage = 1.12722e-05 W
+            Runtime Dynamic = 0.223667 W
+
+      Memory Management Unit:
+        Area = 0.0313997 mm^2
+        Peak Dynamic = 0.0446647 W
+        Subthreshold Leakage = 0.0029577 W
+        Gate Leakage = 5.57335e-05 W
+        Runtime Dynamic = 1.92566 W
+
+          Itlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.0122535 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0980282 W
+
+          Dtlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.00306337 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0245072 W
+
+      Execution Unit:
+        Area = 0.299667 mm^2
+        Peak Dynamic = 0.473204 W
+        Subthreshold Leakage = 0.0379242 W
+        Gate Leakage = 0.000384077 W
+        Runtime Dynamic = 3.52491 W
+
+          Register Files:
+            Area = 0.0598365 mm^2
+            Peak Dynamic = 0.0168768 W
+            Subthreshold Leakage = 0.0020814 W
+            Gate Leakage = 1.24237e-05 W
+            Runtime Dynamic = 0.072481 W
+
+              Integer RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.0131657 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0706931 W
+
+              Floating Point RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.00371113 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0017722 W
+
+              Register Windows:
+                Area = 0.0118221 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00118307 W
+                Gate Leakage = 5.76149e-06 W
+                Runtime Dynamic = 1.56951e-05 W
+
+          Instruction Scheduler:
+            Area = 0.00263062 mm^2
+            Peak Dynamic = 0.00540689 W
+            Subthreshold Leakage = 8.27524e-05 W
+            Gate Leakage = 9.38261e-07 W
+            Runtime Dynamic = 0.0464411 W
+
+              Instruction Window:
+                Area = 0.00263062 mm^2
+                Peak Dynamic = 0.00540689 W
+                Subthreshold Leakage = 8.27524e-05 W
+                Gate Leakage = 9.38261e-07 W
+                Runtime Dynamic = 0.0464411 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.0384544 mm^2
+            Peak Dynamic = 0.0946992 W
+            Subthreshold Leakage = 0.00667865 W
+            Gate Leakage = 6.39207e-05 W
+            Runtime Dynamic = 0.841771 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 0.0695899 mm^2
+            Peak Dynamic = 0.0157832 W
+            Subthreshold Leakage = 0.00302155 W
+            Gate Leakage = 2.89189e-05 W
+            Runtime Dynamic = 0.0315664 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.115363 mm^2
+            Peak Dynamic = 0.105221 W
+            Subthreshold Leakage = 0.020036 W
+            Gate Leakage = 0.000191762 W
+            Runtime Dynamic = 0.210443 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00445381 mm^2
+            Peak Dynamic = 0.192955 W
+            Subthreshold Leakage = 0.00406321 W
+            Gate Leakage = 3.88886e-05 W
+            Runtime Dynamic = 0.519078 W
+
+*****************************************************************************************
+L2
+      Area = 2.52394 mm^2
+      Peak Dynamic = 0.32978 W
+      Subthreshold Leakage = 0.139973 W
+      Gate Leakage = 0.00157395 W
+      Runtime Dynamic = 1.14063 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 2.77176 mm^2
+      Peak Dynamic = 0.103926 W
+      Subthreshold Leakage = 0.196403 W
+      Gate Leakage = 0.00185842 W
+      Runtime Dynamic = 0.175856 W
+
+*****************************************************************************************
+NOC
+      Area = 51.4571 mm^2
+      Peak Dynamic = 20.122 W
+      Subthreshold Leakage = 9.42527 W
+      Gate Leakage = 0.118774 W
+      Runtime Dynamic = 2.49747 W
+
+      Router: 
+        Area = 0.578434 mm^2
+        Peak Dynamic = 0.184548 W
+        Subthreshold Leakage = 0.125515 W
+        Gate Leakage = 0.0016409 W
+        Runtime Dynamic = 1.32875 W
+
+            Virtual Channel Buffer:
+              Area = 0.159162 mm^2
+              Peak Dynamic = 0.00394081 W
+              Subthreshold Leakage = 0.000194478 W
+              Gate Leakage = 1.84946e-06 W
+              Runtime Dynamic = 0.0283738 W
+
+            Crossbar:
+              Area = 0.160976 mm^2
+              Peak Dynamic = 0.179891 W
+              Subthreshold Leakage = 0.12532 W
+              Gate Leakage = 0.00163905 W
+              Runtime Dynamic = 1.29522 W
+
+            Arbiter:
+              Peak Dynamic = 0.000716053 W
+              Subthreshold Leakage = 3.67148e-07 W
+              Gate Leakage = 3.86991e-09 W
+              Runtime Dynamic = 0.00515558 W
+
+      Per Router Links: 
+        Area = 0.225583 mm^2
+        Peak Dynamic = 0.129858 W
+        Subthreshold Leakage = 0.0217549 W
+        Gate Leakage = 0.000214933 W
+        Runtime Dynamic = 1.16872 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/T1_SBT_64 b/src/gpuwattch/results/T1_SBT_64
new file mode 100644
index 000000000..ec8968a19
--- /dev/null
+++ b/src/gpuwattch/results/T1_SBT_64
@@ -0,0 +1,252 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+line72
+size1.17965e+06
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 22 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3500
+
+*****************************************************************************************
+Processor: 
+  Area = 321.412 mm^2
+  Peak Power = 114.076 W
+  Total Leakage = 27.4353 W
+  Peak Dynamic = 86.6406 W
+  Subthreshold Leakage = 27.1256 W
+  Gate Leakage = 0.309772 W
+  Runtime Dynamic = 13.4064 W
+
+  Total Cores: 64 cores 
+  Device Type= ITRS high performance device type
+    Area = 87.1986 mm^2
+    Peak Dynamic = 42.426 W
+    Subthreshold Leakage = 7.80232 W
+    Gate Leakage = 0.0799149 W
+    Runtime Dynamic = 9.61388 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 182.778 mm^2
+    Peak Dynamic = 24.1051 W
+    Subthreshold Leakage = 9.90006 W
+    Gate Leakage = 0.111104 W
+    Runtime Dynamic = 1.29686 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 51.4353 mm^2
+    Peak Dynamic = 20.1095 W
+    Subthreshold Leakage = 9.42317 W
+    Gate Leakage = 0.118753 W
+    Runtime Dynamic = 2.4957 W
+
+*****************************************************************************************
+Core:
+      Area = 1.36248 mm^2
+      Peak Dynamic = 0.662906 W
+      Subthreshold Leakage = 0.121911 W
+      Gate Leakage = 0.00124867 W
+      Runtime Dynamic = 9.61388 W
+
+      Instruction Fetch Unit:
+        Area = 0.140786 mm^2
+        Peak Dynamic = 0.0863256 W
+        Subthreshold Leakage = 0.00636762 W
+        Gate Leakage = 7.4998e-05 W
+        Runtime Dynamic = 2.08883 W
+
+          Instruction Cache:
+            Area = 0.129377 mm^2
+            Peak Dynamic = 0.0476007 W
+            Subthreshold Leakage = 0.00381804 W
+            Gate Leakage = 2.35266e-05 W
+            Runtime Dynamic = 0.0698158 W
+
+          Instruction Buffer:
+            Area = 0.000754971 mm^2
+            Peak Dynamic = 0.00238165 W
+            Subthreshold Leakage = 4.99334e-05 W
+            Gate Leakage = 3.27157e-07 W
+            Runtime Dynamic = 0.0190532 W
+
+          Instruction Decoder:
+            Area = 0.00131543 mm^2
+            Peak Dynamic = 0.0246042 W
+            Subthreshold Leakage = 0.000538954 W
+            Gate Leakage = 3.91915e-06 W
+            Runtime Dynamic = 0.196833 W
+
+      Load Store Unit:
+        Area = 0.0977414 mm^2
+        Peak Dynamic = 0.0587123 W
+        Subthreshold Leakage = 0.00580883 W
+        Gate Leakage = 7.48788e-05 W
+        Runtime Dynamic = 2.07447 W
+
+          Data Cache:
+            Area = 0.0569223 mm^2
+            Peak Dynamic = 0.0329939 W
+            Subthreshold Leakage = 0.00249221 W
+            Gate Leakage = 1.63814e-05 W
+            Runtime Dynamic = 0.0476753 W
+
+          Load/Store Queue:
+            Area = 0.023444 mm^2
+            Peak Dynamic = 0.0139792 W
+            Subthreshold Leakage = 0.00135593 W
+            Gate Leakage = 1.12722e-05 W
+            Runtime Dynamic = 0.223667 W
+
+      Memory Management Unit:
+        Area = 0.0313997 mm^2
+        Peak Dynamic = 0.0446647 W
+        Subthreshold Leakage = 0.0029577 W
+        Gate Leakage = 5.57335e-05 W
+        Runtime Dynamic = 1.92566 W
+
+          Itlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.0122535 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0980282 W
+
+          Dtlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.00306337 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0245072 W
+
+      Execution Unit:
+        Area = 0.299667 mm^2
+        Peak Dynamic = 0.473204 W
+        Subthreshold Leakage = 0.0379242 W
+        Gate Leakage = 0.000384077 W
+        Runtime Dynamic = 3.52491 W
+
+          Register Files:
+            Area = 0.0598365 mm^2
+            Peak Dynamic = 0.0168768 W
+            Subthreshold Leakage = 0.0020814 W
+            Gate Leakage = 1.24237e-05 W
+            Runtime Dynamic = 0.072481 W
+
+              Integer RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.0131657 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0706931 W
+
+              Floating Point RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.00371113 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0017722 W
+
+              Register Windows:
+                Area = 0.0118221 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00118307 W
+                Gate Leakage = 5.76149e-06 W
+                Runtime Dynamic = 1.56951e-05 W
+
+          Instruction Scheduler:
+            Area = 0.00263062 mm^2
+            Peak Dynamic = 0.00540689 W
+            Subthreshold Leakage = 8.27524e-05 W
+            Gate Leakage = 9.38261e-07 W
+            Runtime Dynamic = 0.0464411 W
+
+              Instruction Window:
+                Area = 0.00263062 mm^2
+                Peak Dynamic = 0.00540689 W
+                Subthreshold Leakage = 8.27524e-05 W
+                Gate Leakage = 9.38261e-07 W
+                Runtime Dynamic = 0.0464411 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.0384544 mm^2
+            Peak Dynamic = 0.0946992 W
+            Subthreshold Leakage = 0.00667865 W
+            Gate Leakage = 6.39207e-05 W
+            Runtime Dynamic = 0.841771 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 0.0695899 mm^2
+            Peak Dynamic = 0.0157832 W
+            Subthreshold Leakage = 0.00302155 W
+            Gate Leakage = 2.89189e-05 W
+            Runtime Dynamic = 0.0315664 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.115363 mm^2
+            Peak Dynamic = 0.105221 W
+            Subthreshold Leakage = 0.020036 W
+            Gate Leakage = 0.000191762 W
+            Runtime Dynamic = 0.210443 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00445381 mm^2
+            Peak Dynamic = 0.192955 W
+            Subthreshold Leakage = 0.00406321 W
+            Gate Leakage = 3.88886e-05 W
+            Runtime Dynamic = 0.519078 W
+
+*****************************************************************************************
+L2
+      Area = 2.85591 mm^2
+      Peak Dynamic = 0.376642 W
+      Subthreshold Leakage = 0.154688 W
+      Gate Leakage = 0.001736 W
+      Runtime Dynamic = 1.29686 W
+
+*****************************************************************************************
+NOC
+      Area = 51.4353 mm^2
+      Peak Dynamic = 20.1095 W
+      Subthreshold Leakage = 9.42317 W
+      Gate Leakage = 0.118753 W
+      Runtime Dynamic = 2.4957 W
+
+      Router: 
+        Area = 0.578434 mm^2
+        Peak Dynamic = 0.184548 W
+        Subthreshold Leakage = 0.125515 W
+        Gate Leakage = 0.0016409 W
+        Runtime Dynamic = 1.32875 W
+
+            Virtual Channel Buffer:
+              Area = 0.159162 mm^2
+              Peak Dynamic = 0.00394081 W
+              Subthreshold Leakage = 0.000194478 W
+              Gate Leakage = 1.84946e-06 W
+              Runtime Dynamic = 0.0283738 W
+
+            Crossbar:
+              Area = 0.160976 mm^2
+              Peak Dynamic = 0.179891 W
+              Subthreshold Leakage = 0.12532 W
+              Gate Leakage = 0.00163905 W
+              Runtime Dynamic = 1.29522 W
+
+            Arbiter:
+              Peak Dynamic = 0.000716053 W
+              Subthreshold Leakage = 3.67148e-07 W
+              Gate Leakage = 3.86991e-09 W
+              Runtime Dynamic = 0.00515558 W
+
+      Per Router Links: 
+        Area = 0.225243 mm^2
+        Peak Dynamic = 0.129662 W
+        Subthreshold Leakage = 0.0217221 W
+        Gate Leakage = 0.000214609 W
+        Runtime Dynamic = 1.16696 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/T1_ST_64 b/src/gpuwattch/results/T1_ST_64
new file mode 100644
index 000000000..f3d95b541
--- /dev/null
+++ b/src/gpuwattch/results/T1_ST_64
@@ -0,0 +1,270 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+line64
+size1.04858e+06
+line9
+size8.38861e+06
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 22 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3500
+
+*****************************************************************************************
+Processor: 
+  Area = 358.016 mm^2
+  Peak Power = 168.519 W
+  Total Leakage = 30.8855 W
+  Peak Dynamic = 137.634 W
+  Subthreshold Leakage = 30.5351 W
+  Gate Leakage = 0.350385 W
+  Runtime Dynamic = 84.2366 W
+
+  Total Cores: 64 cores 
+  Device Type= ITRS high performance device type
+    Area = 87.1986 mm^2
+    Peak Dynamic = 42.426 W
+    Subthreshold Leakage = 7.80232 W
+    Gate Leakage = 0.0799149 W
+    Runtime Dynamic = 9.61388 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 161.532 mm^2
+    Peak Dynamic = 21.1059 W
+    Subthreshold Leakage = 8.9583 W
+    Gate Leakage = 0.100733 W
+    Runtime Dynamic = 1.14063 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 57.033 mm^2
+    Peak Dynamic = 53.5219 W
+    Subthreshold Leakage = 4.27249 W
+    Gate Leakage = 0.050206 W
+    Runtime Dynamic = 70.9203 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 52.2524 mm^2
+    Peak Dynamic = 20.5798 W
+    Subthreshold Leakage = 9.50197 W
+    Gate Leakage = 0.119531 W
+    Runtime Dynamic = 2.56185 W
+
+*****************************************************************************************
+Core:
+      Area = 1.36248 mm^2
+      Peak Dynamic = 0.662906 W
+      Subthreshold Leakage = 0.121911 W
+      Gate Leakage = 0.00124867 W
+      Runtime Dynamic = 9.61388 W
+
+      Instruction Fetch Unit:
+        Area = 0.140786 mm^2
+        Peak Dynamic = 0.0863256 W
+        Subthreshold Leakage = 0.00636762 W
+        Gate Leakage = 7.4998e-05 W
+        Runtime Dynamic = 2.08883 W
+
+          Instruction Cache:
+            Area = 0.129377 mm^2
+            Peak Dynamic = 0.0476007 W
+            Subthreshold Leakage = 0.00381804 W
+            Gate Leakage = 2.35266e-05 W
+            Runtime Dynamic = 0.0698158 W
+
+          Instruction Buffer:
+            Area = 0.000754971 mm^2
+            Peak Dynamic = 0.00238165 W
+            Subthreshold Leakage = 4.99334e-05 W
+            Gate Leakage = 3.27157e-07 W
+            Runtime Dynamic = 0.0190532 W
+
+          Instruction Decoder:
+            Area = 0.00131543 mm^2
+            Peak Dynamic = 0.0246042 W
+            Subthreshold Leakage = 0.000538954 W
+            Gate Leakage = 3.91915e-06 W
+            Runtime Dynamic = 0.196833 W
+
+      Load Store Unit:
+        Area = 0.0977414 mm^2
+        Peak Dynamic = 0.0587123 W
+        Subthreshold Leakage = 0.00580883 W
+        Gate Leakage = 7.48788e-05 W
+        Runtime Dynamic = 2.07447 W
+
+          Data Cache:
+            Area = 0.0569223 mm^2
+            Peak Dynamic = 0.0329939 W
+            Subthreshold Leakage = 0.00249221 W
+            Gate Leakage = 1.63814e-05 W
+            Runtime Dynamic = 0.0476753 W
+
+          Load/Store Queue:
+            Area = 0.023444 mm^2
+            Peak Dynamic = 0.0139792 W
+            Subthreshold Leakage = 0.00135593 W
+            Gate Leakage = 1.12722e-05 W
+            Runtime Dynamic = 0.223667 W
+
+      Memory Management Unit:
+        Area = 0.0313997 mm^2
+        Peak Dynamic = 0.0446647 W
+        Subthreshold Leakage = 0.0029577 W
+        Gate Leakage = 5.57335e-05 W
+        Runtime Dynamic = 1.92566 W
+
+          Itlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.0122535 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0980282 W
+
+          Dtlb:
+            Area = 0.0110306 mm^2
+            Peak Dynamic = 0.00306337 W
+            Subthreshold Leakage = 0.000498504 W
+            Gate Leakage = 4.25417e-06 W
+            Runtime Dynamic = 0.0245072 W
+
+      Execution Unit:
+        Area = 0.299667 mm^2
+        Peak Dynamic = 0.473204 W
+        Subthreshold Leakage = 0.0379242 W
+        Gate Leakage = 0.000384077 W
+        Runtime Dynamic = 3.52491 W
+
+          Register Files:
+            Area = 0.0598365 mm^2
+            Peak Dynamic = 0.0168768 W
+            Subthreshold Leakage = 0.0020814 W
+            Gate Leakage = 1.24237e-05 W
+            Runtime Dynamic = 0.072481 W
+
+              Integer RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.0131657 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0706931 W
+
+              Floating Point RF:
+                Area = 0.0240072 mm^2
+                Peak Dynamic = 0.00371113 W
+                Subthreshold Leakage = 0.000449165 W
+                Gate Leakage = 3.33111e-06 W
+                Runtime Dynamic = 0.0017722 W
+
+              Register Windows:
+                Area = 0.0118221 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00118307 W
+                Gate Leakage = 5.76149e-06 W
+                Runtime Dynamic = 1.56951e-05 W
+
+          Instruction Scheduler:
+            Area = 0.00263062 mm^2
+            Peak Dynamic = 0.00540689 W
+            Subthreshold Leakage = 8.27524e-05 W
+            Gate Leakage = 9.38261e-07 W
+            Runtime Dynamic = 0.0464411 W
+
+              Instruction Window:
+                Area = 0.00263062 mm^2
+                Peak Dynamic = 0.00540689 W
+                Subthreshold Leakage = 8.27524e-05 W
+                Gate Leakage = 9.38261e-07 W
+                Runtime Dynamic = 0.0464411 W
+
+          Integer ALUs (Count: 1 ):
+            Area = 0.0384544 mm^2
+            Peak Dynamic = 0.0946992 W
+            Subthreshold Leakage = 0.00667865 W
+            Gate Leakage = 6.39207e-05 W
+            Runtime Dynamic = 0.841771 W
+
+          Floating Point Units (FPUs) (Count: 0.125 ):
+            Area = 0.0695899 mm^2
+            Peak Dynamic = 0.0157832 W
+            Subthreshold Leakage = 0.00302155 W
+            Gate Leakage = 2.89189e-05 W
+            Runtime Dynamic = 0.0315664 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.115363 mm^2
+            Peak Dynamic = 0.105221 W
+            Subthreshold Leakage = 0.020036 W
+            Gate Leakage = 0.000191762 W
+            Runtime Dynamic = 0.210443 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.00445381 mm^2
+            Peak Dynamic = 0.192955 W
+            Subthreshold Leakage = 0.00406321 W
+            Gate Leakage = 3.88886e-05 W
+            Runtime Dynamic = 0.519078 W
+
+*****************************************************************************************
+L2
+      Area = 2.52394 mm^2
+      Peak Dynamic = 0.32978 W
+      Subthreshold Leakage = 0.139973 W
+      Gate Leakage = 0.00157395 W
+      Runtime Dynamic = 1.14063 W
+
+*****************************************************************************************
+Second Level Directory
+      Area = 57.033 mm^2
+      Peak Dynamic = 53.5219 W
+      Subthreshold Leakage = 4.27249 W
+      Gate Leakage = 0.050206 W
+      Runtime Dynamic = 70.9203 W
+
+*****************************************************************************************
+NOC
+      Area = 52.2524 mm^2
+      Peak Dynamic = 20.5798 W
+      Subthreshold Leakage = 9.50197 W
+      Gate Leakage = 0.119531 W
+      Runtime Dynamic = 2.56185 W
+
+      Router: 
+        Area = 0.578434 mm^2
+        Peak Dynamic = 0.184548 W
+        Subthreshold Leakage = 0.125515 W
+        Gate Leakage = 0.0016409 W
+        Runtime Dynamic = 1.32875 W
+
+            Virtual Channel Buffer:
+              Area = 0.159162 mm^2
+              Peak Dynamic = 0.00394081 W
+              Subthreshold Leakage = 0.000194478 W
+              Gate Leakage = 1.84946e-06 W
+              Runtime Dynamic = 0.0283738 W
+
+            Crossbar:
+              Area = 0.160976 mm^2
+              Peak Dynamic = 0.179891 W
+              Subthreshold Leakage = 0.12532 W
+              Gate Leakage = 0.00163905 W
+              Runtime Dynamic = 1.29522 W
+
+            Arbiter:
+              Peak Dynamic = 0.000716053 W
+              Subthreshold Leakage = 3.67148e-07 W
+              Gate Leakage = 3.86991e-09 W
+              Runtime Dynamic = 0.00515558 W
+
+      Per Router Links: 
+        Area = 0.238009 mm^2
+        Peak Dynamic = 0.137011 W
+        Subthreshold Leakage = 0.0229533 W
+        Gate Leakage = 0.000226773 W
+        Runtime Dynamic = 1.2331 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/T2 b/src/gpuwattch/results/T2
new file mode 100644
index 000000000..e24701ab2
--- /dev/null
+++ b/src/gpuwattch/results/T2
@@ -0,0 +1,321 @@
+McPAT (version 0.8 of Aug, 2010) is computing the target processor...
+ 
+
+McPAT (version 0.8 of Aug, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 65 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 1400
+
+*****************************************************************************************
+Processor: 
+  Area = 277.068 mm^2
+  Peak Power = 71.8237 W
+  Total Leakage = 18.2234 W
+  Peak Dynamic = 53.6003 W
+  Subthreshold Leakage = 14.7124 W
+  Gate Leakage = 3.51096 W
+  Runtime Dynamic = 48.652 W
+
+  Total Cores: 8 cores 
+  Device Type= ITRS high performance device type
+    Area = 116.441 mm^2
+    Peak Dynamic = 28.0277 W
+    Subthreshold Leakage = 9.00023 W
+    Gate Leakage = 1.93139 W
+    Runtime Dynamic = 27.9237 W
+
+  Total L2s: 
+  Device Type= ITRS high performance device type
+    Area = 85.0391 mm^2
+    Peak Dynamic = 9.87481 W
+    Subthreshold Leakage = 2.71188 W
+    Gate Leakage = 0.684324 W
+    Runtime Dynamic = 3.97632 W
+
+  Total First Level Directory: 
+  Device Type= ITRS high performance device type
+    Area = 11.6417 mm^2
+    Peak Dynamic = 5.32369 W
+    Subthreshold Leakage = 0.249885 W
+    Gate Leakage = 0.107486 W
+    Runtime Dynamic = 5.38275 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 9.56584 mm^2
+    Peak Dynamic = 1.07754 W
+    Subthreshold Leakage = 1.61961 W
+    Gate Leakage = 0.389994 W
+    Runtime Dynamic = 1.07754 W
+
+  Total MCs: 4 Memory Controllers 
+  Device Type= ITRS high performance device type
+    Area = 32.2777 mm^2
+    Peak Dynamic = 5.92507 W
+    Subthreshold Leakage = 0.559071 W
+    Gate Leakage = 0.10416 W
+    Runtime Dynamic = 7.93157 W
+
+  Total NIUs: 2 Network Interface Units 
+  Device Type= ITRS high performance device type
+    Area = 15.8633 mm^2
+    Peak Dynamic = 1.86482 W
+    Subthreshold Leakage = 0.357626 W
+    Gate Leakage = 0.183662 W
+    Runtime Dynamic = 1.30537 W
+
+  Total PCIes: 1 PCIe Controllers 
+  Device Type= ITRS high performance device type
+    Area = 6.24 mm^2
+    Peak Dynamic = 1.5067 W
+    Subthreshold Leakage = 0.214091 W
+    Gate Leakage = 0.109948 W
+    Runtime Dynamic = 1.05469 W
+
+*****************************************************************************************
+Core:
+      Area = 14.5551 mm^2
+      Peak Dynamic = 3.50346 W
+      Subthreshold Leakage = 1.12503 W
+      Gate Leakage = 0.241423 W
+      Runtime Dynamic = 27.9237 W
+
+      Instruction Fetch Unit:
+        Area = 2.75911 mm^2
+        Peak Dynamic = 0.817936 W
+        Subthreshold Leakage = 0.0912466 W
+        Gate Leakage = 0.0284483 W
+        Runtime Dynamic = 4.81754 W
+
+          Instruction Cache:
+            Area = 2.51671 mm^2
+            Peak Dynamic = 0.513783 W
+            Subthreshold Leakage = 0.062355 W
+            Gate Leakage = 0.0164185 W
+            Runtime Dynamic = 1.59033 W
+
+          Instruction Buffer:
+            Area = 0.0130935 mm^2
+            Peak Dynamic = 0.0100268 W
+            Subthreshold Leakage = 0.000434992 W
+            Gate Leakage = 6.02581e-05 W
+            Runtime Dynamic = 0.160429 W
+
+          Instruction Decoder:
+            Area = 0.0119193 mm^2
+            Peak Dynamic = 0.0892213 W
+            Subthreshold Leakage = 0.00298091 W
+            Gate Leakage = 0.000408973 W
+            Runtime Dynamic = 1.42754 W
+
+      Load Store Unit:
+        Area = 2.14252 mm^2
+        Peak Dynamic = 0.487978 W
+        Subthreshold Leakage = 0.0802768 W
+        Gate Leakage = 0.0247378 W
+        Runtime Dynamic = 10.9331 W
+
+          Data Cache:
+            Area = 0.52868 mm^2
+            Peak Dynamic = 0.0991646 W
+            Subthreshold Leakage = 0.0119043 W
+            Gate Leakage = 0.00145618 W
+            Runtime Dynamic = 0.1303 W
+
+          Load/Store Queue:
+            Area = 1.22144 mm^2
+            Peak Dynamic = 0.286361 W
+            Subthreshold Leakage = 0.0428969 W
+            Gate Leakage = 0.011721 W
+            Runtime Dynamic = 9.16355 W
+
+      Memory Management Unit:
+        Area = 1.1006 mm^2
+        Peak Dynamic = 0.399121 W
+        Subthreshold Leakage = 0.0527367 W
+        Gate Leakage = 0.0195353 W
+        Runtime Dynamic = 2.78316 W
+
+          Itlb:
+            Area = 0.293144 mm^2
+            Peak Dynamic = 0.0743045 W
+            Subthreshold Leakage = 0.00720086 W
+            Gate Leakage = 0.00218791 W
+            Runtime Dynamic = 0.594438 W
+
+          Dtlb:
+            Area = 0.590071 mm^2
+            Peak Dynamic = 0.0686851 W
+            Subthreshold Leakage = 0.0200602 W
+            Gate Leakage = 0.00578676 W
+            Runtime Dynamic = 0.549486 W
+
+      Execution Unit:
+        Area = 6.79584 mm^2
+        Peak Dynamic = 1.79843 W
+        Subthreshold Leakage = 0.610924 W
+        Gate Leakage = 0.116437 W
+        Runtime Dynamic = 9.38994 W
+
+          Register Files:
+            Area = 1.18037 mm^2
+            Peak Dynamic = 0.0639548 W
+            Subthreshold Leakage = 0.00981018 W
+            Gate Leakage = 0.00106415 W
+            Runtime Dynamic = 0.401933 W
+
+              Integer RF:
+                Area = 0.648931 mm^2
+                Peak Dynamic = 0.0485174 W
+                Subthreshold Leakage = 0.00196627 W
+                Gate Leakage = 0.000259389 W
+                Runtime Dynamic = 0.392074 W
+
+              Floating Point RF:
+                Area = 0.324465 mm^2
+                Peak Dynamic = 0.0154374 W
+                Subthreshold Leakage = 0.00196627 W
+                Gate Leakage = 0.000259389 W
+                Runtime Dynamic = 0.0098154 W
+
+              Register Windows:
+                Area = 0.206972 mm^2
+                Peak Dynamic = 0 W
+                Subthreshold Leakage = 0.00587765 W
+                Gate Leakage = 0.000545372 W
+                Runtime Dynamic = 4.40062e-05 W
+
+          Instruction Scheduler:
+            Area = 0.0458096 mm^2
+            Peak Dynamic = 0.0333897 W
+            Subthreshold Leakage = 0.000402487 W
+            Gate Leakage = 8.61395e-05 W
+            Runtime Dynamic = 0.287483 W
+
+              Instruction Window:
+                Area = 0.0458096 mm^2
+                Peak Dynamic = 0.0333897 W
+                Subthreshold Leakage = 0.000402487 W
+                Gate Leakage = 8.61395e-05 W
+                Runtime Dynamic = 0.287483 W
+
+          Integer ALUs (Count: 2 ):
+            Area = 0.448448 mm^2
+            Peak Dynamic = 0.425547 W
+            Subthreshold Leakage = 0.147955 W
+            Gate Leakage = 0.0266792 W
+            Runtime Dynamic = 3.78264 W
+
+          Floating Point Units (FPUs) (Count: 1 ):
+            Area = 4.85979 mm^2
+            Peak Dynamic = 0.425547 W
+            Subthreshold Leakage = 0.400843 W
+            Gate Leakage = 0.07228 W
+            Runtime Dynamic = 0.0709246 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0440413 mm^2
+            Peak Dynamic = 0.481158 W
+            Subthreshold Leakage = 0.0264373 W
+            Gate Leakage = 0.00476717 W
+            Runtime Dynamic = 3.20772 W
+
+*****************************************************************************************
+L2
+      Area = 10.6299 mm^2
+      Peak Dynamic = 1.23435 W
+      Subthreshold Leakage = 0.338985 W
+      Gate Leakage = 0.0855405 W
+      Runtime Dynamic = 3.97632 W
+
+*****************************************************************************************
+First Level Directory
+      Area = 1.45521 mm^2
+      Peak Dynamic = 0.665462 W
+      Subthreshold Leakage = 0.0312356 W
+      Gate Leakage = 0.0134358 W
+      Runtime Dynamic = 5.38275 W
+
+*****************************************************************************************
+Memory Controller:
+      Area = 8.06942 mm^2
+      Peak Dynamic = 1.48127 W
+      Subthreshold Leakage = 0.139768 W
+      Gate Leakage = 0.0260401 W
+      Runtime Dynamic = 7.93157 W
+
+      Front End Engine:
+        Area = 0.250458 mm^2
+        Peak Dynamic = 0.05883 W
+        Subthreshold Leakage = 0.0029079 W
+        Gate Leakage = 0.000455875 W
+        Runtime Dynamic = 0.298069 W
+
+      Transaction Engine:
+        Area = 2.66058 mm^2
+        Peak Dynamic = 0.6912 W
+        Subthreshold Leakage = 0.0465697 W
+        Gate Leakage = 0.00870562 W
+        Runtime Dynamic = 3.50205 W
+
+      PHY:
+        Area = 5.15838 mm^2
+        Peak Dynamic = 0.731237 W
+        Subthreshold Leakage = 0.0902901 W
+        Gate Leakage = 0.0168786 W
+        Runtime Dynamic = 4.13145 W
+
+*****************************************************************************************
+NIU:
+      Area = 7.93167 mm^2
+      Peak Dynamic = 0.93241 W
+      Subthreshold Leakage = 0.178813 W
+      Gate Leakage = 0.0918312 W
+      Runtime Dynamic = 0.652687 W
+
+*****************************************************************************************
+PCIe:
+      Area = 6.24 mm^2
+      Peak Dynamic = 1.5067 W
+      Subthreshold Leakage = 0.214091 W
+      Gate Leakage = 0.109948 W
+      Runtime Dynamic = 1.05469 W
+
+*****************************************************************************************
+NOC
+      Area = 9.56584 mm^2
+      Peak Dynamic = 1.07754 W
+      Subthreshold Leakage = 1.61961 W
+      Gate Leakage = 0.389994 W
+      Runtime Dynamic = 1.07754 W
+
+      Router: 
+        Area = 4.78292 mm^2
+        Peak Dynamic = 0.538772 W
+        Subthreshold Leakage = 0.809805 W
+        Gate Leakage = 0.194997 W
+        Runtime Dynamic = 1.07754 W
+
+            Virtual Channel Buffer:
+              Area = 0.827721 mm^2
+              Peak Dynamic = 0.0223838 W
+              Subthreshold Leakage = 0.00314985 W
+              Gate Leakage = 0.000413272 W
+              Runtime Dynamic = 0.0447677 W
+
+            Crossbar:
+              Area = 1.69589 mm^2
+              Peak Dynamic = 0.511174 W
+              Subthreshold Leakage = 0.806641 W
+              Gate Leakage = 0.194581 W
+              Runtime Dynamic = 1.02235 W
+
+            Arbiter:
+              Peak Dynamic = 0.00521447 W
+              Subthreshold Leakage = 1.42757e-05 W
+              Gate Leakage = 2.78294e-06 W
+              Runtime Dynamic = 0.0104289 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/Xeon_core b/src/gpuwattch/results/Xeon_core
new file mode 100644
index 000000000..0cc9ae648
--- /dev/null
+++ b/src/gpuwattch/results/Xeon_core
@@ -0,0 +1,341 @@
+McPAT (version 0.7 of May, 2010) is computing the target processor...
+ 
+
+McPAT (version 0.7 of May, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 65 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3400
+
+*****************************************************************************************
+Processor: 
+  Area = 417.445 mm^2
+  Peak Power = 142.148 W
+  Total Leakage = 55.8021 W
+  Peak Dynamic = 86.3458 W
+  Subthreshold Leakage = 52.785 W
+  Gate Leakage = 3.01712 W
+  Runtime Dynamic = 63.1851 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 133.278 mm^2
+    Peak Dynamic = 63.8414 W
+    Subthreshold Leakage = 32.4393 W
+    Gate Leakage = 2.72517 W
+    Runtime Dynamic = 41.616 W
+
+  Total L3s: 
+  Device Type= ITRS high performance device type
+    Area = 278.612 mm^2
+    Peak Dynamic = 6.11346 W
+    Subthreshold Leakage = 20.1995 W
+    Gate Leakage = 0.267752 W
+    Runtime Dynamic = 5.1782 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 5.5548 mm^2
+    Peak Dynamic = 16.3909 W
+    Subthreshold Leakage = 0.146229 W
+    Gate Leakage = 0.0241913 W
+    Runtime Dynamic = 16.3909 W
+
+*****************************************************************************************
+Core:
+      Area = 66.6389 mm^2
+      Peak Dynamic = 31.9207 W
+      Subthreshold Leakage = 16.2197 W
+      Gate Leakage = 1.36259 W
+      Runtime Dynamic = 41.616 W
+
+      Instruction Fetch Unit:
+        Area = 7.41271 mm^2
+        Peak Dynamic = 5.04492 W
+        Subthreshold Leakage = 1.26751 W
+        Gate Leakage = 0.09429 W
+        Runtime Dynamic = 5.39803 W
+
+          Instruction Cache:
+            Area = 2.44324 mm^2
+            Peak Dynamic = 1.42048 W
+            Subthreshold Leakage = 0.359444 W
+            Gate Leakage = 0.0187045 W
+            Runtime Dynamic = 2.13804 W
+
+          Branch Target Buffer:
+            Area = 0.729086 mm^2
+            Peak Dynamic = 0.161698 W
+            Subthreshold Leakage = 0.0616324 W
+            Gate Leakage = 0.00336254 W
+            Runtime Dynamic = 0.646794 W
+
+          Branch Predictor:
+            Area = 0.430961 mm^2
+            Peak Dynamic = 0.188469 W
+            Subthreshold Leakage = 0.0698834 W
+            Gate Leakage = 0.00415943 W
+            Runtime Dynamic = 0.166045 W
+
+              Global Predictor:
+                Area = 0.174771 mm^2
+                Peak Dynamic = 0.0633335 W
+                Subthreshold Leakage = 0.0274086 W
+                Gate Leakage = 0.00158249 W
+                Runtime Dynamic = 0.0633335 W
+
+              Local Predictor:
+                Area = 0.0735854 mm^2
+                Peak Dynamic = 0.0393754 W
+                Subthreshold Leakage = 0.0111166 W
+                Gate Leakage = 0.000721196 W
+                Runtime Dynamic = 0.0393754 W
+
+                Area = 0.0507308 mm^2
+                Peak Dynamic = 0.0258383 W
+                Subthreshold Leakage = 0.00749994 W
+                Gate Leakage = 0.000498805 W
+                Runtime Dynamic = 0.0258383 W
+
+              Chooser:
+                Area = 0.174771 mm^2
+                Peak Dynamic = 0.0633335 W
+                Subthreshold Leakage = 0.0274086 W
+                Gate Leakage = 0.00158249 W
+                Runtime Dynamic = 0.0633335 W
+
+              RAS:
+                Area = 0.0613744 mm^2
+                Peak Dynamic = 0.0224266 W
+                Subthreshold Leakage = 0.00394955 W
+                Gate Leakage = 0.000273252 W
+                Runtime Dynamic = 2.51602e-06 W
+
+          Instruction Buffer:
+            Area = 0.0684348 mm^2
+            Peak Dynamic = 0.704461 W
+            Subthreshold Leakage = 0.00411741 W
+            Gate Leakage = 0.000240288 W
+            Runtime Dynamic = 0.46964 W
+
+          Instruction Decoder:
+            Area = 3.73007 mm^2
+            Peak Dynamic = 1.97751 W
+            Subthreshold Leakage = 0.733056 W
+            Gate Leakage = 0.0575912 W
+            Runtime Dynamic = 1.97751 W
+
+      Renaming Unit:
+        Area = 1.82421 mm^2
+        Peak Dynamic = 2.76284 W
+        Subthreshold Leakage = 0.0765654 W
+        Gate Leakage = 0.0125478 W
+        Runtime Dynamic = 1.94438 W
+
+          Int Front End RAT:
+            Area = 0.875874 mm^2
+            Peak Dynamic = 1.249 W
+            Subthreshold Leakage = 0.0113878 W
+            Gate Leakage = 0.000693471 W
+            Runtime Dynamic = 1.249 W
+
+          FP Front End RAT:
+            Area = 0.405459 mm^2
+            Peak Dynamic = 0.610062 W
+            Subthreshold Leakage = 0.0144803 W
+            Gate Leakage = 0.000906674 W
+            Runtime Dynamic = 0.305031 W
+
+          Free List:
+            Area = 0.297629 mm^2
+            Peak Dynamic = 0.137664 W
+            Subthreshold Leakage = 0.0054316 W
+            Gate Leakage = 0.000326171 W
+            Runtime Dynamic = 0.275328 W
+
+          Int Retire RAT: 
+            Area = 0.0530903 mm^2
+            Peak Dynamic = 0.056222 W
+            Subthreshold Leakage = 0.00135314 W
+            Gate Leakage = 0.00011607 W
+            Runtime Dynamic = 0.056222 W
+
+          FP Retire RAT:
+            Area = 0.018828 mm^2
+            Peak Dynamic = 0.0186388 W
+            Subthreshold Leakage = 0.000788229 W
+            Gate Leakage = 6.41952e-05 W
+            Runtime Dynamic = 0.00931941 W
+
+          FP Free List:
+            Area = 0.162422 mm^2
+            Peak Dynamic = 0.0989385 W
+            Subthreshold Leakage = 0.00375181 W
+            Gate Leakage = 0.000209083 W
+            Runtime Dynamic = 0.0494693 W
+
+      Load Store Unit:
+        Area = 4.35998 mm^2
+        Peak Dynamic = 2.94939 W
+        Subthreshold Leakage = 0.208781 W
+        Gate Leakage = 0.0232213 W
+        Runtime Dynamic = 3.60184 W
+
+          Data Cache:
+            Area = 2.2051 mm^2
+            Peak Dynamic = 1.08067 W
+            Subthreshold Leakage = 0.0877157 W
+            Gate Leakage = 0.00573003 W
+            Runtime Dynamic = 2.30478 W
+
+          LoadQ:
+            Area = 0.637121 mm^2
+            Peak Dynamic = 0.551016 W
+            Subthreshold Leakage = 0.0283256 W
+            Gate Leakage = 0.00254841 W
+            Runtime Dynamic = 0.275508 W
+
+          StoreQ:
+            Area = 0.809965 mm^2
+            Peak Dynamic = 1.02155 W
+            Subthreshold Leakage = 0.053367 W
+            Gate Leakage = 0.00471074 W
+            Runtime Dynamic = 1.02155 W
+
+      Memory Management Unit:
+        Area = 0.517456 mm^2
+        Peak Dynamic = 0.979218 W
+        Subthreshold Leakage = 0.0808171 W
+        Gate Leakage = 0.0139952 W
+        Runtime Dynamic = 1.66678 W
+
+          Itlb:
+            Area = 0.127123 mm^2
+            Peak Dynamic = 0.236587 W
+            Subthreshold Leakage = 0.0160962 W
+            Gate Leakage = 0.00146431 W
+            Runtime Dynamic = 0.473177 W
+
+          Dtlb:
+            Area = 0.379422 mm^2
+            Peak Dynamic = 0.298399 W
+            Subthreshold Leakage = 0.0253484 W
+            Gate Leakage = 0.00229878 W
+            Runtime Dynamic = 1.1936 W
+
+      Execution Unit:
+        Area = 27.5381 mm^2
+        Peak Dynamic = 16.9637 W
+        Subthreshold Leakage = 7.08185 W
+        Gate Leakage = 0.73316 W
+        Runtime Dynamic = 22.7198 W
+
+          Register Files:
+            Area = 11.2548 mm^2
+            Peak Dynamic = 3.2925 W
+            Subthreshold Leakage = 0.11111 W
+            Gate Leakage = 0.00754256 W
+            Runtime Dynamic = 1.69823 W
+
+              Integer RF:
+                Area = 7.55916 mm^2
+                Peak Dynamic = 2.82012 W
+                Subthreshold Leakage = 0.0664048 W
+                Gate Leakage = 0.00458288 W
+                Runtime Dynamic = 1.51078 W
+
+              Floating Point RF:
+                Area = 3.69565 mm^2
+                Peak Dynamic = 0.472385 W
+                Subthreshold Leakage = 0.0447053 W
+                Gate Leakage = 0.00295968 W
+                Runtime Dynamic = 0.187454 W
+
+          Instruction Scheduler:
+            Area = 2.08681 mm^2
+            Peak Dynamic = 2.1684 W
+            Subthreshold Leakage = 0.0325294 W
+            Gate Leakage = 0.00296372 W
+            Runtime Dynamic = 2.59089 W
+
+              Instruction Window:
+                Area = 0.287309 mm^2
+                Peak Dynamic = 0.929972 W
+                Subthreshold Leakage = 0.0127376 W
+                Gate Leakage = 0.00137073 W
+                Runtime Dynamic = 1.2089 W
+
+              FP Instruction Window:
+                Area = 0.128977 mm^2
+                Peak Dynamic = 0.478661 W
+                Subthreshold Leakage = 0.00802287 W
+                Gate Leakage = 0.000873414 W
+                Runtime Dynamic = 0.622222 W
+
+              ROB:
+                Area = 1.67052 mm^2
+                Peak Dynamic = 0.759764 W
+                Subthreshold Leakage = 0.0117689 W
+                Gate Leakage = 0.000719579 W
+                Runtime Dynamic = 0.759764 W
+
+          Integer ALUs (Count: 6 ):
+            Area = 4.03603 mm^2
+            Peak Dynamic = 4.55818 W
+            Subthreshold Leakage = 3.9898 W
+            Gate Leakage = 0.412015 W
+            Runtime Dynamic = 2.33394 W
+
+          Floating Point Units (FPUs) (Count: 2 ):
+            Area = 9.71959 mm^2
+            Peak Dynamic = 1.43327 W
+            Subthreshold Leakage = 2.40207 W
+            Gate Leakage = 0.248054 W
+            Runtime Dynamic = 2.55333 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.336336 mm^2
+            Peak Dynamic = 0.510666 W
+            Subthreshold Leakage = 0.332484 W
+            Gate Leakage = 0.0343346 W
+            Runtime Dynamic = 3.18505 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0936618 mm^2
+            Peak Dynamic = 4.4084 W
+            Subthreshold Leakage = 0.174486 W
+            Gate Leakage = 0.0180186 W
+            Runtime Dynamic = 10.3584 W
+
+    L2
+    Area = 15.914 mm^2
+    Peak Dynamic = 3.22061 W
+    Subthreshold Leakage = 3.01991 W
+    Gate Leakage = 0.0223008 W
+    Runtime Dynamic = 6.28514 W
+
+*****************************************************************************************
+      L3
+      Area = 278.612 mm^2
+      Peak Dynamic = 6.11346 W
+      Subthreshold Leakage = 20.1995 W
+      Gate Leakage = 0.267752 W
+      Runtime Dynamic = 5.1782 W
+
+*****************************************************************************************
+BUSES
+      Area = 5.5548 mm^2
+      Peak Dynamic = 16.3909 W
+      Subthreshold Leakage = 0.146229 W
+      Gate Leakage = 0.0241913 W
+      Runtime Dynamic = 16.3909 W
+
+      Bus: 
+        Area = 5.5548 mm^2
+        Peak Dynamic = 16.3909 W
+        Subthreshold Leakage = 0.146229 W
+        Gate Leakage = 0.0241913 W
+        Runtime Dynamic = 16.3909 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/results/Xeon_uncore b/src/gpuwattch/results/Xeon_uncore
new file mode 100644
index 000000000..558331c25
--- /dev/null
+++ b/src/gpuwattch/results/Xeon_uncore
@@ -0,0 +1,341 @@
+McPAT (version 0.7 of May, 2010) is computing the target processor...
+ 
+
+McPAT (version 0.7 of May, 2010) results  (current print level is 5)
+*****************************************************************************************
+  Technology 65 nm
+  Using Long Channel Devices When Appropriate
+  Interconnect metal projection= aggressive interconnect technology projection
+  Core clock Rate(MHz) 3400
+
+*****************************************************************************************
+Processor: 
+  Area = 418.629 mm^2
+  Peak Power = 96.2032 W
+  Total Leakage = 27.5568 W
+  Peak Dynamic = 68.6463 W
+  Subthreshold Leakage = 25.8287 W
+  Gate Leakage = 1.72809 W
+  Runtime Dynamic = 50.332 W
+
+  Total Cores: 
+  Device Type= ITRS high performance device type
+    Area = 134.217 mm^2
+    Peak Dynamic = 50.8677 W
+    Subthreshold Leakage = 15.0187 W
+    Gate Leakage = 1.57092 W
+    Runtime Dynamic = 33.3003 W
+
+  Total L3s: 
+  Device Type= ITRS high performance device type
+    Area = 278.843 mm^2
+    Peak Dynamic = 4.84476 W
+    Subthreshold Leakage = 10.7416 W
+    Gate Leakage = 0.144361 W
+    Runtime Dynamic = 4.09781 W
+
+  Total NoCs (Network/Bus): 
+  Device Type= ITRS high performance device type
+    Area = 5.56828 mm^2
+    Peak Dynamic = 12.9339 W
+    Subthreshold Leakage = 0.0684953 W
+    Gate Leakage = 0.0128043 W
+    Runtime Dynamic = 12.9339 W
+
+*****************************************************************************************
+Core:
+      Area = 67.1085 mm^2
+      Peak Dynamic = 25.4338 W
+      Subthreshold Leakage = 7.50933 W
+      Gate Leakage = 0.78546 W
+      Runtime Dynamic = 33.3003 W
+
+      Instruction Fetch Unit:
+        Area = 7.56843 mm^2
+        Peak Dynamic = 4.27305 W
+        Subthreshold Leakage = 0.571346 W
+        Gate Leakage = 0.0523885 W
+        Runtime Dynamic = 4.67953 W
+
+          Instruction Cache:
+            Area = 2.44678 mm^2
+            Peak Dynamic = 1.1785 W
+            Subthreshold Leakage = 0.151766 W
+            Gate Leakage = 0.009764 W
+            Runtime Dynamic = 1.7926 W
+
+          Branch Target Buffer:
+            Area = 0.718635 mm^2
+            Peak Dynamic = 0.151619 W
+            Subthreshold Leakage = 0.0238082 W
+            Gate Leakage = 0.0015503 W
+            Runtime Dynamic = 0.606475 W
+
+          Branch Predictor:
+            Area = 0.446844 mm^2
+            Peak Dynamic = 0.158508 W
+            Subthreshold Leakage = 0.0293041 W
+            Gate Leakage = 0.0021362 W
+            Runtime Dynamic = 0.14087 W
+
+              Global Predictor:
+                Area = 0.174801 mm^2
+                Peak Dynamic = 0.0543932 W
+                Subthreshold Leakage = 0.0116121 W
+                Gate Leakage = 0.000827171 W
+                Runtime Dynamic = 0.0543932 W
+
+              Local Predictor:
+                Area = 0.0788692 mm^2
+                Peak Dynamic = 0.0320817 W
+                Subthreshold Leakage = 0.00452837 W
+                Gate Leakage = 0.000354718 W
+                Runtime Dynamic = 0.0320817 W
+
+                Area = 0.050748 mm^2
+                Peak Dynamic = 0.0218669 W
+                Subthreshold Leakage = 0.00318852 W
+                Gate Leakage = 0.000264126 W
+                Runtime Dynamic = 0.0218669 W
+
+              Chooser:
+                Area = 0.174801 mm^2
+                Peak Dynamic = 0.0543932 W
+                Subthreshold Leakage = 0.0116121 W
+                Gate Leakage = 0.000827171 W
+                Runtime Dynamic = 0.0543932 W
+
+              RAS:
+                Area = 0.0929863 mm^2
+                Peak Dynamic = 0.0176394 W
+                Subthreshold Leakage = 0.00155163 W
+                Gate Leakage = 0.00012714 W
+                Runtime Dynamic = 1.96119e-06 W
+
+          Instruction Buffer:
+            Area = 0.0687233 mm^2
+            Peak Dynamic = 0.579633 W
+            Subthreshold Leakage = 0.00177049 W
+            Gate Leakage = 0.000129185 W
+            Runtime Dynamic = 0.386422 W
+
+          Instruction Decoder:
+            Area = 3.87654 mm^2
+            Peak Dynamic = 1.75316 W
+            Subthreshold Leakage = 0.348225 W
+            Gate Leakage = 0.0335628 W
+            Runtime Dynamic = 1.75316 W
+
+      Renaming Unit:
+        Area = 1.83366 mm^2
+        Peak Dynamic = 2.16025 W
+        Subthreshold Leakage = 0.0324638 W
+        Gate Leakage = 0.00648876 W
+        Runtime Dynamic = 1.53428 W
+
+          Int Front End RAT:
+            Area = 0.879521 mm^2
+            Peak Dynamic = 0.975897 W
+            Subthreshold Leakage = 0.00490782 W
+            Gate Leakage = 0.000372282 W
+            Runtime Dynamic = 0.975897 W
+
+          FP Front End RAT:
+            Area = 0.407642 mm^2
+            Peak Dynamic = 0.477469 W
+            Subthreshold Leakage = 0.00619591 W
+            Gate Leakage = 0.000483134 W
+            Runtime Dynamic = 0.238735 W
+
+          Free List:
+            Area = 0.300513 mm^2
+            Peak Dynamic = 0.112906 W
+            Subthreshold Leakage = 0.00233243 W
+            Gate Leakage = 0.000174984 W
+            Runtime Dynamic = 0.225813 W
+
+          Int Retire RAT: 
+            Area = 0.0534147 mm^2
+            Peak Dynamic = 0.0453154 W
+            Subthreshold Leakage = 0.00058142 W
+            Gate Leakage = 6.26682e-05 W
+            Runtime Dynamic = 0.0453154 W
+
+          FP Retire RAT:
+            Area = 0.018897 mm^2
+            Peak Dynamic = 0.0151716 W
+            Subthreshold Leakage = 0.000337803 W
+            Gate Leakage = 3.45545e-05 W
+            Runtime Dynamic = 0.00758578 W
+
+          FP Free List:
+            Area = 0.162758 mm^2
+            Peak Dynamic = 0.081858 W
+            Subthreshold Leakage = 0.00163685 W
+            Gate Leakage = 0.000115075 W
+            Runtime Dynamic = 0.040929 W
+
+      Load Store Unit:
+        Area = 4.4281 mm^2
+        Peak Dynamic = 2.34722 W
+        Subthreshold Leakage = 0.0896936 W
+        Gate Leakage = 0.0121845 W
+        Runtime Dynamic = 2.89901 W
+
+          Data Cache:
+            Area = 2.25853 mm^2
+            Peak Dynamic = 0.888323 W
+            Subthreshold Leakage = 0.0382167 W
+            Gate Leakage = 0.00311455 W
+            Runtime Dynamic = 1.88387 W
+
+          LoadQ:
+            Area = 0.638298 mm^2
+            Peak Dynamic = 0.435889 W
+            Subthreshold Leakage = 0.0121526 W
+            Gate Leakage = 0.00134375 W
+            Runtime Dynamic = 0.217944 W
+
+          StoreQ:
+            Area = 0.811765 mm^2
+            Peak Dynamic = 0.79719 W
+            Subthreshold Leakage = 0.0228527 W
+            Gate Leakage = 0.00248017 W
+            Runtime Dynamic = 0.79719 W
+
+      Memory Management Unit:
+        Area = 0.518866 mm^2
+        Peak Dynamic = 0.760463 W
+        Subthreshold Leakage = 0.0342246 W
+        Gate Leakage = 0.00722713 W
+        Runtime Dynamic = 1.31193 W
+
+          Itlb:
+            Area = 0.12744 mm^2
+            Peak Dynamic = 0.187517 W
+            Subthreshold Leakage = 0.00686539 W
+            Gate Leakage = 0.000767441 W
+            Runtime Dynamic = 0.375037 W
+
+          Dtlb:
+            Area = 0.380515 mm^2
+            Peak Dynamic = 0.234221 W
+            Subthreshold Leakage = 0.0108877 W
+            Gate Leakage = 0.00121362 W
+            Runtime Dynamic = 0.936886 W
+
+      Execution Unit:
+        Area = 27.5564 mm^2
+        Peak Dynamic = 13.34 W
+        Subthreshold Leakage = 3.35055 W
+        Gate Leakage = 0.425 W
+        Runtime Dynamic = 17.8618 W
+
+          Register Files:
+            Area = 11.2668 mm^2
+            Peak Dynamic = 2.65925 W
+            Subthreshold Leakage = 0.0472795 W
+            Gate Leakage = 0.00398463 W
+            Runtime Dynamic = 1.37147 W
+
+              Integer RF:
+                Area = 7.56635 mm^2
+                Peak Dynamic = 2.27672 W
+                Subthreshold Leakage = 0.0282472 W
+                Gate Leakage = 0.00241709 W
+                Runtime Dynamic = 1.21967 W
+
+              Floating Point RF:
+                Area = 3.70048 mm^2
+                Peak Dynamic = 0.382527 W
+                Subthreshold Leakage = 0.0190323 W
+                Gate Leakage = 0.00156754 W
+                Runtime Dynamic = 0.151797 W
+
+          Instruction Scheduler:
+            Area = 2.09118 mm^2
+            Peak Dynamic = 1.7092 W
+            Subthreshold Leakage = 0.0139125 W
+            Gate Leakage = 0.00156067 W
+            Runtime Dynamic = 2.04197 W
+
+              Instruction Window:
+                Area = 0.287606 mm^2
+                Peak Dynamic = 0.721714 W
+                Subthreshold Leakage = 0.00547415 W
+                Gate Leakage = 0.000721338 W
+                Runtime Dynamic = 0.940723 W
+
+              FP Instruction Window:
+                Area = 0.129287 mm^2
+                Peak Dynamic = 0.372875 W
+                Subthreshold Leakage = 0.0034355 W
+                Gate Leakage = 0.00045775 W
+                Runtime Dynamic = 0.486639 W
+
+              ROB:
+                Area = 1.67428 mm^2
+                Peak Dynamic = 0.61461 W
+                Subthreshold Leakage = 0.00500288 W
+                Gate Leakage = 0.00038158 W
+                Runtime Dynamic = 0.61461 W
+
+          Integer ALUs (Count: 6 ):
+            Area = 4.03603 mm^2
+            Peak Dynamic = 3.52986 W
+            Subthreshold Leakage = 1.89726 W
+            Gate Leakage = 0.240113 W
+            Runtime Dynamic = 1.8074 W
+
+          Floating Point Units (FPUs) (Count: 2 ):
+            Area = 9.71959 mm^2
+            Peak Dynamic = 1.10993 W
+            Subthreshold Leakage = 1.14225 W
+            Gate Leakage = 0.14456 W
+            Runtime Dynamic = 1.9773 W
+
+          Complex ALUs (Mul/Div) (Count: 1 ):
+            Area = 0.336336 mm^2
+            Peak Dynamic = 0.405148 W
+            Subthreshold Leakage = 0.158105 W
+            Gate Leakage = 0.0200094 W
+            Runtime Dynamic = 2.4988 W
+
+          Results Broadcast Bus:
+            Area Overhead = 0.0954831 mm^2
+            Peak Dynamic = 3.47499 W
+            Subthreshold Leakage = 0.0752739 W
+            Gate Leakage = 0.00952648 W
+            Runtime Dynamic = 8.1649 W
+
+    L2
+    Area = 16.1307 mm^2
+    Peak Dynamic = 2.55285 W
+    Subthreshold Leakage = 1.29868 W
+    Gate Leakage = 0.012304 W
+    Runtime Dynamic = 5.01368 W
+
+*****************************************************************************************
+      L3
+      Area = 278.843 mm^2
+      Peak Dynamic = 4.84476 W
+      Subthreshold Leakage = 10.7416 W
+      Gate Leakage = 0.144361 W
+      Runtime Dynamic = 4.09781 W
+
+*****************************************************************************************
+BUSES
+      Area = 5.56828 mm^2
+      Peak Dynamic = 12.9339 W
+      Subthreshold Leakage = 0.0684953 W
+      Gate Leakage = 0.0128043 W
+      Runtime Dynamic = 12.9339 W
+
+      Bus: 
+        Area = 5.56828 mm^2
+        Peak Dynamic = 12.9339 W
+        Subthreshold Leakage = 0.0684953 W
+        Gate Leakage = 0.0128043 W
+        Runtime Dynamic = 12.9339 W
+
+*****************************************************************************************
diff --git a/src/gpuwattch/sharedcache.cc b/src/gpuwattch/sharedcache.cc
new file mode 100644
index 000000000..31465a0c3
--- /dev/null
+++ b/src/gpuwattch/sharedcache.cc
@@ -0,0 +1,1350 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "sharedcache.h"
+#include "XML_Parse.h"
+#include "array.h"
+#include "cacti/arbiter.h"
+#include "cacti/basic_circuit.h"
+#include "cacti/parameter.h"
+#include "const.h"
+#include "io.h"
+#include "logic.h"
+#include <algorithm>
+#include <assert.h>
+#include <cmath>
+#include <iostream>
+#include <string.h>
+
+SharedCache::SharedCache(ParseXML *XML_interface, int ithCache_,
+                         InputParameter *interface_ip_,
+                         enum cache_level cacheL_)
+    : XML(XML_interface), ithCache(ithCache_), interface_ip(*interface_ip_),
+      cacheL(cacheL_), dir_overhead(0) {
+  int idx;
+  int tag, data;
+  bool debug;
+  enum Device_ty device_t;
+  enum Core_type core_t;
+  double size, line, assoc, banks;
+  if (cacheL == L2 && XML->sys.Private_L2) {
+    device_t = Core_device;
+    core_t = (enum Core_type)XML->sys.core[ithCache].machine_type;
+  } else {
+    device_t = LLC_device;
+    core_t = Inorder;
+  }
+
+  debug = false;
+  if (XML->sys.Embedded) {
+    interface_ip.wt = Global_30;
+    interface_ip.wire_is_mat_type = 0;
+    interface_ip.wire_os_mat_type = 1;
+  } else {
+    interface_ip.wt = Global;
+    interface_ip.wire_is_mat_type = 2;
+    interface_ip.wire_os_mat_type = 2;
+  }
+  set_cache_param();
+
+  // All lower-level caches are physically indexed and tagged.
+  size = cachep.capacity;
+  line = cachep.blockW;
+  assoc = cachep.assoc;
+  banks = cachep.nbanks;
+  if ((cachep.dir_ty == ST && cacheL == L1Directory) ||
+      (cachep.dir_ty == ST && cacheL == L2Directory)) {
+    assoc = 0;
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    interface_ip.num_search_ports = 1;
+  } else {
+    idx = debug ? 9 : int(ceil(log2(size / line / assoc)));
+    tag = debug ? 51
+                : XML->sys.physical_address_width - idx -
+                      int(ceil(log2(line))) + EXTRA_TAG_BITS;
+    interface_ip.num_search_ports = 0;
+    if (cachep.dir_ty == SBT) {
+      dir_overhead =
+          ceil(XML->sys.number_of_cores / 8.0) * 8 / (cachep.blockW * 8);
+      line = cachep.blockW * (1 + dir_overhead);
+      size = cachep.capacity * (1 + dir_overhead);
+    }
+  }
+  //  if (XML->sys.first_level_dir==2)
+  //	  tag += int(XML->sys.domain_size + 5);
+  interface_ip.specific_tag = 1;
+  interface_ip.tag_w = tag;
+  interface_ip.cache_sz = (int)size;
+  interface_ip.line_sz = (int)line;
+  interface_ip.assoc = (int)assoc;
+  interface_ip.nbanks = (int)banks;
+  interface_ip.out_w = interface_ip.line_sz * 8 / 2;
+  interface_ip.access_mode = 1;
+  interface_ip.throughput = cachep.throughput;
+  interface_ip.latency = cachep.latency;
+  interface_ip.is_cache = true;
+  interface_ip.pure_ram = false;
+  interface_ip.pure_cam = false;
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t = 1;
+  interface_ip.num_rw_ports = 1; // lower level cache usually has one port.
+  interface_ip.num_rd_ports = 0;
+  interface_ip.num_wr_ports = 0;
+  interface_ip.num_se_rd_ports = 0;
+  //  interface_ip.force_cache_config  =true;
+  //  interface_ip.ndwl = 4;
+  //  interface_ip.ndbl = 8;
+  //  interface_ip.nspd = 1;
+  //  interface_ip.ndcm =1 ;
+  //  interface_ip.ndsam1 =1;
+  //  interface_ip.ndsam2 =1;
+  unicache.caches =
+      new ArrayST(&interface_ip, cachep.name + "cache", device_t, true, core_t);
+  unicache.area.set_area(unicache.area.get_area() +
+                         unicache.caches->local_result.area);
+  area.set_area(area.get_area() + unicache.caches->local_result.area);
+  interface_ip.force_cache_config = false;
+
+  if (!((cachep.dir_ty == ST && cacheL == L1Directory) ||
+        (cachep.dir_ty == ST && cacheL == L2Directory))) {
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = (XML->sys.physical_address_width) + int(ceil(log2(size / line))) +
+           unicache.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz =
+        int(ceil(data / 8.0)); // int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+    interface_ip.cache_sz = cachep.missb_size * interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.is_cache = true;
+    interface_ip.pure_ram = false;
+    interface_ip.pure_cam = false;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8 / 2;
+    interface_ip.access_mode = 0;
+    interface_ip.throughput = cachep.throughput; // means cycle time
+    interface_ip.latency = cachep.latency;       // means access time
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 1;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    interface_ip.num_search_ports = 1;
+    unicache.missb = new ArrayST(&interface_ip, cachep.name + "MissB", device_t,
+                                 true, core_t);
+    unicache.area.set_area(unicache.area.get_area() +
+                           unicache.missb->local_result.area);
+    area.set_area(area.get_area() + unicache.missb->local_result.area);
+    // fill buffer
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = unicache.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+    interface_ip.cache_sz = data * cachep.fu_size;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8 / 2;
+    interface_ip.access_mode = 0;
+    interface_ip.throughput = cachep.throughput;
+    interface_ip.latency = cachep.latency;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 1;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    unicache.ifb = new ArrayST(&interface_ip, cachep.name + "FillB", device_t,
+                               true, core_t);
+    unicache.area.set_area(unicache.area.get_area() +
+                           unicache.ifb->local_result.area);
+    area.set_area(area.get_area() + unicache.ifb->local_result.area);
+    // prefetch buffer
+    tag = XML->sys.physical_address_width +
+          EXTRA_TAG_BITS; // check with previous entries to decide whether to
+                          // merge.
+    data = unicache.caches->l_ip
+               .line_sz; // separate queue to prevent cache pollution.
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz = data; // int(pow(2.0,ceil(log2(data))));
+    interface_ip.cache_sz = cachep.prefetchb_size * interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8 / 2;
+    interface_ip.access_mode = 0;
+    interface_ip.throughput = cachep.throughput;
+    interface_ip.latency = cachep.latency;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 1;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    unicache.prefetchb = new ArrayST(&interface_ip, cachep.name + "PrefetchB",
+                                     device_t, true, core_t);
+    unicache.area.set_area(unicache.area.get_area() +
+                           unicache.prefetchb->local_result.area);
+    area.set_area(area.get_area() + unicache.prefetchb->local_result.area);
+    // WBB
+    tag = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+    data = unicache.caches->l_ip.line_sz;
+    interface_ip.specific_tag = 1;
+    interface_ip.tag_w = tag;
+    interface_ip.line_sz = data;
+    interface_ip.cache_sz = cachep.wbb_size * interface_ip.line_sz;
+    interface_ip.assoc = 0;
+    interface_ip.nbanks = 1;
+    interface_ip.out_w = interface_ip.line_sz * 8 / 2;
+    interface_ip.access_mode = 0;
+    interface_ip.throughput = cachep.throughput;
+    interface_ip.latency = cachep.latency;
+    interface_ip.obj_func_dyn_energy = 0;
+    interface_ip.obj_func_dyn_power = 0;
+    interface_ip.obj_func_leak_power = 0;
+    interface_ip.obj_func_cycle_t = 1;
+    interface_ip.num_rw_ports = 1;
+    interface_ip.num_rd_ports = 0;
+    interface_ip.num_wr_ports = 0;
+    interface_ip.num_se_rd_ports = 0;
+    unicache.wbb =
+        new ArrayST(&interface_ip, cachep.name + "WBB", device_t, true, core_t);
+    unicache.area.set_area(unicache.area.get_area() +
+                           unicache.wbb->local_result.area);
+    area.set_area(area.get_area() + unicache.wbb->local_result.area);
+  }
+  //  //pipeline
+  //  interface_ip.pipeline_stages =
+  //  int(ceil(llCache.caches.local_result.access_time/llCache.caches.local_result.cycle_time));
+  //  interface_ip.per_stage_vector = llCache.caches.l_ip.out_w +
+  //  llCache.caches.l_ip.tag_w ; pipeLogicCache.init_pipeline(is_default,
+  //  &interface_ip); pipeLogicCache.compute_pipeline();
+
+  /*
+  if (!((XML->sys.number_of_dir_levels==1 && XML->sys.first_level_dir ==1)
+                  ||(XML->sys.number_of_dir_levels==1 &&
+  XML->sys.first_level_dir ==2)))//not single level IC and DIC
+  {
+  //directory Now assuming one directory per bank, TODO:should change it later
+  size                             = XML->sys.L2directory.L2Dir_config[0];
+  line                             = XML->sys.L2directory.L2Dir_config[1];
+  assoc                            = XML->sys.L2directory.L2Dir_config[2];
+  banks                            = XML->sys.L2directory.L2Dir_config[3];
+  tag							   =
+  debug?51:XML->sys.physical_address_width + EXTRA_TAG_BITS;//TODO: a little bit
+  over estimate interface_ip.specific_tag        = 0; interface_ip.tag_w = tag;
+  interface_ip.cache_sz            = XML->sys.L2directory.L2Dir_config[0];
+  interface_ip.line_sz             = XML->sys.L2directory.L2Dir_config[1];
+  interface_ip.assoc               = XML->sys.L2directory.L2Dir_config[2];
+  interface_ip.nbanks              = XML->sys.L2directory.L2Dir_config[3];
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         =
+  0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5];
+  interface_ip.throughput          =
+  XML->sys.L2directory.L2Dir_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2directory.L2Dir_config[5]/clockRate; interface_ip.is_cache
+  = true; interface_ip.obj_func_dyn_energy = 0; interface_ip.obj_func_dyn_power
+  = 0; interface_ip.obj_func_leak_power = 0; interface_ip.obj_func_cycle_t    =
+  1; interface_ip.num_rw_ports    = 1;//lower level cache usually has one port.
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+
+  strcpy(directory.caches.name,"L2 Directory");
+  directory.caches.init_cache(&interface_ip);
+  directory.caches.optimize_array();
+  directory.area += directory.caches.local_result.area;
+  //output_data_csv(directory.caches.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //miss buffer Each MSHR contains enough state to handle one or more accesses
+  of any type to a single memory line.
+  //Due to the generality of the MSHR mechanism, the amount of state involved is
+  non-trivial,
+  //including the address, pointers to the cache entry and destination register,
+  written data, and various other pieces of state. tag
+  = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data							   =
+  (XML->sys.physical_address_width) + int(ceil(log2(size/line))) +
+  directory.caches.l_ip.line_sz; interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             =
+  int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz            =
+  XML->sys.L2[ithCache].buffer_sizes[0]*interface_ip.line_sz; interface_ip.assoc
+  = 0; interface_ip.nbanks              = 1; interface_ip.out_w               =
+  interface_ip.line_sz*8; interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate;//means cycle time
+  interface_ip.latency             =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate;//means access time
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.missb.name,"directoryMissB");
+  directory.missb.init_cache(&interface_ip);
+  directory.missb.optimize_array();
+  directory.area += directory.missb.local_result.area;
+  //output_data_csv(directory.missb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //fill buffer
+  tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS; data
+  = directory.caches.l_ip.line_sz; interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            = data*XML->sys.L2[ithCache].buffer_sizes[1];
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.ifb.name,"directoryFillB");
+  directory.ifb.init_cache(&interface_ip);
+  directory.ifb.optimize_array();
+  directory.area += directory.ifb.local_result.area;
+  //output_data_csv(directory.ifb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //prefetch buffer
+  tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries
+  to decide wthether to merge.
+  data							   =
+  directory.caches.l_ip.line_sz;//separate queue to prevent from cache polution.
+  interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            =
+  XML->sys.L2[ithCache].buffer_sizes[2]*interface_ip.line_sz; interface_ip.assoc
+  = 0; interface_ip.nbanks              = 1; interface_ip.out_w               =
+  interface_ip.line_sz*8; interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.prefetchb.name,"directoryPrefetchB");
+  directory.prefetchb.init_cache(&interface_ip);
+  directory.prefetchb.optimize_array();
+  directory.area += directory.prefetchb.local_result.area;
+  //output_data_csv(directory.prefetchb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //WBB
+  tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS; data
+  = directory.caches.l_ip.line_sz; interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;
+  interface_ip.cache_sz            =
+  XML->sys.L2[ithCache].buffer_sizes[3]*interface_ip.line_sz; interface_ip.assoc
+  = 0; interface_ip.nbanks              = 1; interface_ip.out_w               =
+  interface_ip.line_sz*8; interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory.wbb.name,"directoryWBB");
+  directory.wbb.init_cache(&interface_ip);
+  directory.wbb.optimize_array();
+  directory.area += directory.wbb.local_result.area;
+  }
+
+  if (XML->sys.number_of_dir_levels ==2 && XML->sys.first_level_dir==0)
+  {
+  //first level directory
+  size                             =
+  XML->sys.L2directory.L2Dir_config[0]*XML->sys.domain_size/128; line =
+  int(ceil(XML->sys.domain_size/8.0)); assoc                            =
+  XML->sys.L2directory.L2Dir_config[2]; banks                            =
+  XML->sys.L2directory.L2Dir_config[3]; tag
+  = debug?51:XML->sys.physical_address_width + EXTRA_TAG_BITS;//TODO: a little
+  bit over estimate interface_ip.specific_tag        = 1; interface_ip.tag_w =
+  tag; interface_ip.cache_sz            = XML->sys.L2directory.L2Dir_config[0];
+  interface_ip.line_sz             = XML->sys.L2directory.L2Dir_config[1];
+  interface_ip.assoc               = XML->sys.L2directory.L2Dir_config[2];
+  interface_ip.nbanks              = XML->sys.L2directory.L2Dir_config[3];
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         =
+  0;//debug?0:XML->sys.core[ithCore].icache.icache_config[5];
+  interface_ip.throughput          =
+  XML->sys.L2directory.L2Dir_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2directory.L2Dir_config[5]/clockRate; interface_ip.is_cache
+  = true; interface_ip.obj_func_dyn_energy = 0; interface_ip.obj_func_dyn_power
+  = 0; interface_ip.obj_func_leak_power = 0; interface_ip.obj_func_cycle_t    =
+  1; interface_ip.num_rw_ports    = 1;//lower level cache usually has one port.
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+
+  strcpy(directory1.caches.name,"first level Directory");
+  directory1.caches.init_cache(&interface_ip);
+  directory1.caches.optimize_array();
+  directory1.area += directory1.caches.local_result.area;
+  //output_data_csv(directory.caches.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //miss buffer Each MSHR contains enough state to handle one or more accesses
+  of any type to a single memory line.
+  //Due to the generality of the MSHR mechanism, the amount of state involved is
+  non-trivial,
+  //including the address, pointers to the cache entry and destination register,
+  written data, and various other pieces of state. tag
+  = XML->sys.physical_address_width + EXTRA_TAG_BITS;
+  data							   =
+  (XML->sys.physical_address_width) + int(ceil(log2(size/line))) +
+  directory1.caches.l_ip.line_sz; interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             =
+  int(ceil(data/8.0));//int(ceil(pow(2.0,ceil(log2(data)))/8.0));
+  interface_ip.cache_sz            =
+  XML->sys.L2[ithCache].buffer_sizes[0]*interface_ip.line_sz; interface_ip.assoc
+  = 0; interface_ip.nbanks              = 1; interface_ip.out_w               =
+  interface_ip.line_sz*8; interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate;//means cycle time
+  interface_ip.latency             =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate;//means access time
+  interface_ip.obj_func_dyn_energy = 0;
+  interface_ip.obj_func_dyn_power  = 0;
+  interface_ip.obj_func_leak_power = 0;
+  interface_ip.obj_func_cycle_t    = 1;
+  interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.missb.name,"directory1MissB");
+  directory1.missb.init_cache(&interface_ip);
+  directory1.missb.optimize_array();
+  directory1.area += directory1.missb.local_result.area;
+  //output_data_csv(directory.missb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //fill buffer
+  tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS; data
+  = directory1.caches.l_ip.line_sz; interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            = data*XML->sys.L2[ithCache].buffer_sizes[1];
+  interface_ip.assoc               = 0;
+  interface_ip.nbanks              = 1;
+  interface_ip.out_w               = interface_ip.line_sz*8;
+  interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.ifb.name,"directory1FillB");
+  directory1.ifb.init_cache(&interface_ip);
+  directory1.ifb.optimize_array();
+  directory1.area += directory1.ifb.local_result.area;
+  //output_data_csv(directory.ifb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //prefetch buffer
+  tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS;//check with previous entries
+  to decide wthether to merge.
+  data							   =
+  directory1.caches.l_ip.line_sz;//separate queue to prevent from cache
+  polution. interface_ip.specific_tag        = 1; interface_ip.tag_w = tag;
+  interface_ip.line_sz             = data;//int(pow(2.0,ceil(log2(data))));
+  interface_ip.cache_sz            =
+  XML->sys.L2[ithCache].buffer_sizes[2]*interface_ip.line_sz; interface_ip.assoc
+  = 0; interface_ip.nbanks              = 1; interface_ip.out_w               =
+  interface_ip.line_sz*8; interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.prefetchb.name,"directory1PrefetchB");
+  directory1.prefetchb.init_cache(&interface_ip);
+  directory1.prefetchb.optimize_array();
+  directory1.area += directory1.prefetchb.local_result.area;
+  //output_data_csv(directory.prefetchb.local_result);
+  ///cout<<"area="<<area<<endl;
+
+  //WBB
+  tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS; data
+  = directory1.caches.l_ip.line_sz; interface_ip.specific_tag        = 1;
+  interface_ip.tag_w               = tag;
+  interface_ip.line_sz             = data;
+  interface_ip.cache_sz            =
+  XML->sys.L2[ithCache].buffer_sizes[3]*interface_ip.line_sz; interface_ip.assoc
+  = 0; interface_ip.nbanks              = 1; interface_ip.out_w               =
+  interface_ip.line_sz*8; interface_ip.access_mode         = 0;
+  interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+  interface_ip.num_rd_ports    = 0;
+  interface_ip.num_wr_ports    = 0;
+  interface_ip.num_se_rd_ports = 0;
+  strcpy(directory1.wbb.name,"directoryWBB");
+  directory1.wbb.init_cache(&interface_ip);
+  directory1.wbb.optimize_array();
+  directory1.area += directory1.wbb.local_result.area;
+  }
+
+  if (XML->sys.first_level_dir==1)//IC
+  {
+          tag							   =
+  XML->sys.physical_address_width + EXTRA_TAG_BITS; data
+  = int(ceil(XML->sys.domain_size/8.0)); interface_ip.specific_tag        = 1;
+          interface_ip.tag_w               = tag;
+          interface_ip.line_sz             = data;
+          interface_ip.cache_sz            =
+  XML->sys.domain_size*data*XML->sys.L2[ithCache].L2_config[0]/XML->sys.L2[ithCache].L2_config[1];
+          interface_ip.assoc               = 0;
+          interface_ip.nbanks              = 1024;
+          interface_ip.out_w               = interface_ip.line_sz*8;
+          interface_ip.access_mode         = 0;
+          interface_ip.throughput          =
+  XML->sys.L2[ithCache].L2_config[4]/clockRate; interface_ip.latency =
+  XML->sys.L2[ithCache].L2_config[5]/clockRate; interface_ip.obj_func_dyn_energy
+  = 0; interface_ip.obj_func_dyn_power  = 0; interface_ip.obj_func_leak_power =
+  0; interface_ip.obj_func_cycle_t    = 1; interface_ip.num_rw_ports    = 1;
+          interface_ip.num_rd_ports    = 0;
+          interface_ip.num_wr_ports    = 0;
+          interface_ip.num_se_rd_ports = 0;
+          strcpy(inv_dir.caches.name,"inv_dir");
+          inv_dir.caches.init_cache(&interface_ip);
+          inv_dir.caches.optimize_array();
+          inv_dir.area = inv_dir.caches.local_result.area;
+
+  }
+*/
+  //  //pipeline
+  //  interface_ip.pipeline_stages =
+  //  int(ceil(directory.caches.local_result.access_time/directory.caches.local_result.cycle_time));
+  //  interface_ip.per_stage_vector = directory.caches.l_ip.out_w +
+  //  directory.caches.l_ip.tag_w ; pipeLogicDirectory.init_pipeline(is_default,
+  //  &interface_ip); pipeLogicDirectory.compute_pipeline();
+  //
+  //  //clock power
+  //  clockNetwork.init_wire_external(is_default, &interface_ip);
+  //  clockNetwork.clk_area           =area*1.1;//10% of placement overhead.
+  //  rule of thumb clockNetwork.end_wiring_level   =5;//toplevel metal
+  //  clockNetwork.start_wiring_level =5;//toplevel metal
+  //  clockNetwork.num_regs           = pipeLogicCache.tot_stage_vector +
+  //  pipeLogicDirectory.tot_stage_vector; clockNetwork.optimize_wire();
+}
+
+void SharedCache::computeEnergy(bool is_tdp) {
+  double homenode_data_access = (cachep.dir_ty == SBT) ? 0.9 : 1.0;
+  if (is_tdp) {
+    if (!((cachep.dir_ty == ST && cacheL == L1Directory) ||
+          (cachep.dir_ty == ST && cacheL == L2Directory))) {
+      // init stats for Peak
+      unicache.caches->stats_t.readAc.access =
+          .67 * unicache.caches->l_ip.num_rw_ports * cachep.duty_cycle *
+          homenode_data_access;
+      unicache.caches->stats_t.readAc.miss = 0;
+      unicache.caches->stats_t.readAc.hit =
+          unicache.caches->stats_t.readAc.access -
+          unicache.caches->stats_t.readAc.miss;
+      unicache.caches->stats_t.writeAc.access =
+          .33 * unicache.caches->l_ip.num_rw_ports * cachep.duty_cycle *
+          homenode_data_access;
+      unicache.caches->stats_t.writeAc.miss = 0;
+      unicache.caches->stats_t.writeAc.hit =
+          unicache.caches->stats_t.writeAc.access -
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.caches->tdp_stats = unicache.caches->stats_t;
+
+      if (cachep.dir_ty == SBT) {
+        homenode_stats_t.readAc.access =
+            .67 * unicache.caches->l_ip.num_rw_ports * cachep.dir_duty_cycle *
+            (1 - homenode_data_access);
+        homenode_stats_t.readAc.miss = 0;
+        homenode_stats_t.readAc.hit =
+            homenode_stats_t.readAc.access - homenode_stats_t.readAc.miss;
+        homenode_stats_t.writeAc.access =
+            .67 * unicache.caches->l_ip.num_rw_ports * cachep.dir_duty_cycle *
+            (1 - homenode_data_access);
+        homenode_stats_t.writeAc.miss = 0;
+        homenode_stats_t.writeAc.hit =
+            homenode_stats_t.writeAc.access - homenode_stats_t.writeAc.miss;
+        homenode_tdp_stats = homenode_stats_t;
+      }
+
+      unicache.missb->stats_t.readAc.access =
+          unicache.missb->l_ip.num_search_ports;
+      unicache.missb->stats_t.writeAc.access =
+          unicache.missb->l_ip.num_search_ports;
+      unicache.missb->tdp_stats = unicache.missb->stats_t;
+
+      unicache.ifb->stats_t.readAc.access = unicache.ifb->l_ip.num_search_ports;
+      unicache.ifb->stats_t.writeAc.access =
+          unicache.ifb->l_ip.num_search_ports;
+      unicache.ifb->tdp_stats = unicache.ifb->stats_t;
+
+      unicache.prefetchb->stats_t.readAc.access =
+          unicache.prefetchb->l_ip.num_search_ports;
+      unicache.prefetchb->stats_t.writeAc.access =
+          unicache.ifb->l_ip.num_search_ports;
+      unicache.prefetchb->tdp_stats = unicache.prefetchb->stats_t;
+
+      unicache.wbb->stats_t.readAc.access = unicache.wbb->l_ip.num_search_ports;
+      unicache.wbb->stats_t.writeAc.access =
+          unicache.wbb->l_ip.num_search_ports;
+      unicache.wbb->tdp_stats = unicache.wbb->stats_t;
+    } else {
+      unicache.caches->stats_t.readAc.access =
+          unicache.caches->l_ip.num_search_ports * cachep.duty_cycle;
+      unicache.caches->stats_t.readAc.miss = 0;
+      unicache.caches->stats_t.readAc.hit =
+          unicache.caches->stats_t.readAc.access -
+          unicache.caches->stats_t.readAc.miss;
+      unicache.caches->stats_t.writeAc.access = 0;
+      unicache.caches->stats_t.writeAc.miss = 0;
+      unicache.caches->stats_t.writeAc.hit =
+          unicache.caches->stats_t.writeAc.access -
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.caches->tdp_stats = unicache.caches->stats_t;
+    }
+
+  } else {
+    // init stats for runtime power (RTP)
+    if (cacheL == L2) {
+      // Copy stats from XML->sys.l2 into XML->sys.L2[ithCache]
+      XML->sys.L2[ithCache].total_accesses = XML->sys.l2.total_accesses;
+      XML->sys.L2[ithCache].read_accesses = XML->sys.l2.read_accesses;
+      XML->sys.L2[ithCache].write_accesses = XML->sys.l2.write_accesses;
+      XML->sys.L2[ithCache].read_hits = XML->sys.l2.read_hits;
+      XML->sys.L2[ithCache].read_misses = XML->sys.l2.read_misses;
+      XML->sys.L2[ithCache].write_hits = XML->sys.l2.write_hits;
+      XML->sys.L2[ithCache].write_misses = XML->sys.l2.write_misses;
+
+      unicache.caches->stats_t.readAc.access =
+          XML->sys.L2[ithCache].read_accesses;
+      unicache.caches->stats_t.readAc.miss = XML->sys.L2[ithCache].read_misses;
+      unicache.caches->stats_t.readAc.hit =
+          unicache.caches->stats_t.readAc.access -
+          unicache.caches->stats_t.readAc.miss;
+      unicache.caches->stats_t.writeAc.access =
+          XML->sys.L2[ithCache].write_accesses;
+      unicache.caches->stats_t.writeAc.miss =
+          XML->sys.L2[ithCache].write_misses;
+      unicache.caches->stats_t.writeAc.hit =
+          unicache.caches->stats_t.writeAc.access -
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.caches->rtp_stats = unicache.caches->stats_t;
+
+      if (cachep.dir_ty == SBT) {
+        homenode_rtp_stats.readAc.access =
+            XML->sys.L2[ithCache].homenode_read_accesses;
+        homenode_rtp_stats.readAc.miss =
+            XML->sys.L2[ithCache].homenode_read_misses;
+        homenode_rtp_stats.readAc.hit =
+            homenode_rtp_stats.readAc.access - homenode_rtp_stats.readAc.miss;
+        homenode_rtp_stats.writeAc.access =
+            XML->sys.L2[ithCache].homenode_write_accesses;
+        homenode_rtp_stats.writeAc.miss =
+            XML->sys.L2[ithCache].homenode_write_misses;
+        homenode_rtp_stats.writeAc.hit =
+            homenode_rtp_stats.writeAc.access - homenode_rtp_stats.writeAc.miss;
+      }
+    } else if (cacheL == L3) {
+      unicache.caches->stats_t.readAc.access =
+          XML->sys.L3[ithCache].read_accesses;
+      unicache.caches->stats_t.readAc.miss = XML->sys.L3[ithCache].read_misses;
+      unicache.caches->stats_t.readAc.hit =
+          unicache.caches->stats_t.readAc.access -
+          unicache.caches->stats_t.readAc.miss;
+      unicache.caches->stats_t.writeAc.access =
+          XML->sys.L3[ithCache].write_accesses;
+      unicache.caches->stats_t.writeAc.miss =
+          XML->sys.L3[ithCache].write_misses;
+      unicache.caches->stats_t.writeAc.hit =
+          unicache.caches->stats_t.writeAc.access -
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.caches->rtp_stats = unicache.caches->stats_t;
+
+      if (cachep.dir_ty == SBT) {
+        homenode_rtp_stats.readAc.access =
+            XML->sys.L3[ithCache].homenode_read_accesses;
+        homenode_rtp_stats.readAc.miss =
+            XML->sys.L3[ithCache].homenode_read_misses;
+        homenode_rtp_stats.readAc.hit =
+            homenode_rtp_stats.readAc.access - homenode_rtp_stats.readAc.miss;
+        homenode_rtp_stats.writeAc.access =
+            XML->sys.L3[ithCache].homenode_write_accesses;
+        homenode_rtp_stats.writeAc.miss =
+            XML->sys.L3[ithCache].homenode_write_misses;
+        homenode_rtp_stats.writeAc.hit =
+            homenode_rtp_stats.writeAc.access - homenode_rtp_stats.writeAc.miss;
+      }
+    } else if (cacheL == L1Directory) {
+      unicache.caches->stats_t.readAc.access =
+          XML->sys.L1Directory[ithCache].read_accesses;
+      unicache.caches->stats_t.readAc.miss =
+          XML->sys.L1Directory[ithCache].read_misses;
+      unicache.caches->stats_t.readAc.hit =
+          unicache.caches->stats_t.readAc.access -
+          unicache.caches->stats_t.readAc.miss;
+      unicache.caches->stats_t.writeAc.access =
+          XML->sys.L1Directory[ithCache].write_accesses;
+      unicache.caches->stats_t.writeAc.miss =
+          XML->sys.L1Directory[ithCache].write_misses;
+      unicache.caches->stats_t.writeAc.hit =
+          unicache.caches->stats_t.writeAc.access -
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.caches->rtp_stats = unicache.caches->stats_t;
+    } else if (cacheL == L2Directory) { // cout<<"L2 directory"<<endl;
+      // Disabled: copy stats from XML->sys.l2 into XML->sys.L2[ithCache]
+      // XML->sys.L2[ithCache].total_accesses=XML->sys.l2.total_accesses;
+      // XML->sys.L2[ithCache].read_accesses=XML->sys.l2.read_accesses;
+      // XML->sys.L2[ithCache].write_accesses=XML->sys.l2.write_accesses;
+      // XML->sys.L2[ithCache].read_hits=XML->sys.l2.read_hits;
+      // XML->sys.L2[ithCache].read_misses=XML->sys.l2.read_misses;
+      // XML->sys.L2[ithCache].write_hits=XML->sys.l2.write_hits;
+      // XML->sys.L2[ithCache].write_misses=XML->sys.l2.write_misses;
+      unicache.caches->stats_t.readAc.access =
+          XML->sys.L2Directory[ithCache].read_accesses;
+      unicache.caches->stats_t.readAc.miss =
+          XML->sys.L2Directory[ithCache].read_misses;
+      unicache.caches->stats_t.readAc.hit =
+          unicache.caches->stats_t.readAc.access -
+          unicache.caches->stats_t.readAc.miss;
+      unicache.caches->stats_t.writeAc.access =
+          XML->sys.L2Directory[ithCache].write_accesses;
+      unicache.caches->stats_t.writeAc.miss =
+          XML->sys.L2Directory[ithCache].write_misses;
+      unicache.caches->stats_t.writeAc.hit =
+          unicache.caches->stats_t.writeAc.access -
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.caches->rtp_stats = unicache.caches->stats_t;
+    }
+    if (!((cachep.dir_ty == ST && cacheL == L1Directory) ||
+          (cachep.dir_ty == ST &&
+           cacheL ==
+               L2Directory))) { // Assuming write back and write-allocate cache
+
+      unicache.missb->stats_t.readAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.missb->stats_t.writeAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.missb->rtp_stats = unicache.missb->stats_t;
+
+      unicache.ifb->stats_t.readAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.ifb->stats_t.writeAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.ifb->rtp_stats = unicache.ifb->stats_t;
+
+      unicache.prefetchb->stats_t.readAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.prefetchb->stats_t.writeAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.prefetchb->rtp_stats = unicache.prefetchb->stats_t;
+
+      unicache.wbb->stats_t.readAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      unicache.wbb->stats_t.writeAc.access =
+          unicache.caches->stats_t.writeAc.miss;
+      if (cachep.dir_ty == SBT) {
+        unicache.missb->stats_t.readAc.access +=
+            homenode_rtp_stats.writeAc.miss;
+        unicache.missb->stats_t.writeAc.access +=
+            homenode_rtp_stats.writeAc.miss;
+        unicache.missb->rtp_stats = unicache.missb->stats_t;
+
+        unicache.missb->stats_t.readAc.access +=
+            homenode_rtp_stats.writeAc.miss;
+        unicache.missb->stats_t.writeAc.access +=
+            homenode_rtp_stats.writeAc.miss;
+        unicache.missb->rtp_stats = unicache.missb->stats_t;
+
+        unicache.ifb->stats_t.readAc.access += homenode_rtp_stats.writeAc.miss;
+        unicache.ifb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+        unicache.ifb->rtp_stats = unicache.ifb->stats_t;
+
+        unicache.prefetchb->stats_t.readAc.access +=
+            homenode_rtp_stats.writeAc.miss;
+        unicache.prefetchb->stats_t.writeAc.access +=
+            homenode_rtp_stats.writeAc.miss;
+        unicache.prefetchb->rtp_stats = unicache.prefetchb->stats_t;
+
+        unicache.wbb->stats_t.readAc.access += homenode_rtp_stats.writeAc.miss;
+        unicache.wbb->stats_t.writeAc.access += homenode_rtp_stats.writeAc.miss;
+      }
+      unicache.wbb->rtp_stats = unicache.wbb->stats_t;
+    }
+  }
+
+  unicache.power_t.reset();
+  unicache.rt_power.reset();
+  if (!((cachep.dir_ty == ST && cacheL == L1Directory) ||
+        (cachep.dir_ty == ST && cacheL == L2Directory))) {
+
+    unicache.power_t.readOp.dynamic +=
+        (unicache.caches->stats_t.readAc.hit *
+             unicache.caches->local_result.power.readOp.dynamic +
+         unicache.caches->stats_t.readAc.miss *
+             unicache.caches->local_result.tag_array2->power.readOp.dynamic +
+         unicache.caches->stats_t.writeAc.miss *
+             unicache.caches->local_result.tag_array2->power.writeOp.dynamic +
+         unicache.caches->stats_t.writeAc.access *
+             unicache.caches->local_result.power.writeOp
+                 .dynamic); // write miss will also generate a write later
+
+    if (cachep.dir_ty == SBT) {
+      unicache.power_t.readOp.dynamic +=
+          homenode_stats_t.readAc.hit *
+              (unicache.caches->local_result.data_array2->power.readOp.dynamic *
+                   dir_overhead +
+               unicache.caches->local_result.tag_array2->power.readOp.dynamic) +
+          homenode_stats_t.readAc.miss *
+              unicache.caches->local_result.tag_array2->power.readOp.dynamic +
+          homenode_stats_t.writeAc.miss *
+              unicache.caches->local_result.tag_array2->power.readOp.dynamic +
+          homenode_stats_t.writeAc.hit *
+              (unicache.caches->local_result.data_array2->power.writeOp
+                       .dynamic *
+                   dir_overhead +
+               unicache.caches->local_result.tag_array2->power.readOp.dynamic +
+               homenode_stats_t.writeAc.miss *
+                   unicache.caches->local_result.power.writeOp
+                       .dynamic); // write miss on dynamic home node will
+                                  // generate a replacement write on whole cache
+                                  // block
+    }
+
+    unicache.power_t.readOp.dynamic +=
+        unicache.missb->stats_t.readAc.access *
+            unicache.missb->local_result.power.searchOp.dynamic +
+        unicache.missb->stats_t.writeAc.access *
+            unicache.missb->local_result.power.writeOp
+                .dynamic; // each access to missb involves a CAM and a write
+    unicache.power_t.readOp.dynamic +=
+        unicache.ifb->stats_t.readAc.access *
+            unicache.ifb->local_result.power.searchOp.dynamic +
+        unicache.ifb->stats_t.writeAc.access *
+            unicache.ifb->local_result.power.writeOp.dynamic;
+    unicache.power_t.readOp.dynamic +=
+        unicache.prefetchb->stats_t.readAc.access *
+            unicache.prefetchb->local_result.power.searchOp.dynamic +
+        unicache.prefetchb->stats_t.writeAc.access *
+            unicache.prefetchb->local_result.power.writeOp.dynamic;
+    unicache.power_t.readOp.dynamic +=
+        unicache.wbb->stats_t.readAc.access *
+            unicache.wbb->local_result.power.searchOp.dynamic +
+        unicache.wbb->stats_t.writeAc.access *
+            unicache.wbb->local_result.power.writeOp.dynamic;
+  } else {
+    unicache.power_t.readOp.dynamic +=
+        (unicache.caches->stats_t.readAc.access *
+             unicache.caches->local_result.power.searchOp.dynamic +
+         unicache.caches->stats_t.writeAc.access *
+             unicache.caches->local_result.power.writeOp.dynamic);
+  }
+
+  if (is_tdp) {
+    unicache.power =
+        unicache.power_t + (unicache.caches->local_result.power) * pppm_lkg;
+    if (!((cachep.dir_ty == ST && cacheL == L1Directory) ||
+          (cachep.dir_ty == ST && cacheL == L2Directory))) {
+      unicache.power =
+          unicache.power + (unicache.missb->local_result.power +
+                            unicache.ifb->local_result.power +
+                            unicache.prefetchb->local_result.power +
+                            unicache.wbb->local_result.power) *
+                               pppm_lkg;
+    }
+    power = power + unicache.power;
+    //		cout<<"unicache.caches->local_result.power.readOp.dynamic"<<unicache.caches->local_result.power.readOp.dynamic<<endl;
+    //		cout<<"unicache.caches->local_result.power.writeOp.dynamic"<<unicache.caches->local_result.power.writeOp.dynamic<<endl;
+  } else {
+
+    unicache.rt_power =
+        unicache.power_t + (unicache.caches->local_result.power) * pppm_lkg;
+    if (!((cachep.dir_ty == ST && cacheL == L1Directory) ||
+          (cachep.dir_ty == ST && cacheL == L2Directory))) {
+      unicache.rt_power =
+          unicache.rt_power + (unicache.missb->local_result.power +
+                               unicache.ifb->local_result.power +
+                               unicache.prefetchb->local_result.power +
+                               unicache.wbb->local_result.power) *
+                                  pppm_lkg;
+    }
+
+    rt_power = rt_power + unicache.rt_power;
+  }
+}
+
+void SharedCache::displayEnergy(uint32_t indent, bool is_tdp) {
+  // Nothing is reported unless the TDP summary is requested.
+  if (!is_tdp) {
+    return;
+  }
+
+  const string pad(indent, ' ');
+  const bool use_long_channel = XML->sys.longer_channel_device;
+
+  // The cache name is indented only when XML->sys.Private_L2 is set.
+  cout << (XML->sys.Private_L2 ? pad : "") << cachep.name << endl;
+  cout << pad << "Area = " << area.get_area() * 1e-6 << " mm^2" << endl;
+  // Peak dynamic is power.readOp.dynamic scaled by the cache clock rate.
+  cout << pad << "Peak Dynamic = " << power.readOp.dynamic * cachep.clockRate
+       << " W" << endl;
+  // Subthreshold leakage comes from the long-channel figure when enabled.
+  cout << pad << "Subthreshold Leakage = "
+       << (use_long_channel ? power.readOp.longer_channel_leakage
+                            : power.readOp.leakage)
+       << " W" << endl;
+  cout << pad << "Gate Leakage = " << power.readOp.gate_leakage << " W" << endl;
+  // Runtime dynamic is rt_power.readOp.dynamic over cachep.executionTime.
+  cout << pad << "Runtime Dynamic = "
+       << rt_power.readOp.dynamic / cachep.executionTime << " W" << endl;
+  cout << endl;
+}
+
+// void SharedCache::computeMaxPower()
+//{
+//  //Compute maximum power and runtime power.
+//  //When computing runtime power, McPAT gets or reasons out the statistics
+//  based on XML input. maxPower		= 0.0;
+//  //llCache,itlb
+//  llCache.maxPower   = 0.0;
+//  llCache.maxPower	+=
+//  (llCache.caches.l_ip.num_rw_ports*(0.67*llCache.caches.local_result.power.readOp.dynamic+0.33*llCache.caches.local_result.power.writeOp.dynamic)
+//                        +llCache.caches.l_ip.num_rd_ports*llCache.caches.local_result.power.readOp.dynamic+llCache.caches.l_ip.num_wr_ports*llCache.caches.local_result.power.writeOp.dynamic
+//                        +llCache.caches.l_ip.num_se_rd_ports*llCache.caches.local_result.power.readOp.dynamic)*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower	+=
+//  llCache.missb.l_ip.num_search_ports*llCache.missb.local_result.power.searchOp.dynamic*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower	+=
+//  llCache.ifb.l_ip.num_search_ports*llCache.ifb.local_result.power.searchOp.dynamic*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower	+=
+//  llCache.prefetchb.l_ip.num_search_ports*llCache.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//  llCache.maxPower	+=
+//  llCache.wbb.l_ip.num_search_ports*llCache.wbb.local_result.power.searchOp.dynamic*clockRate;
+//  //llCache.maxPower *=  scktRatio; //TODO: this calculation should be
+//  self-contained
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+////  directory_power =
+///(directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic)
+////
+///+directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic
+////
+///+directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate;
+//
+//  L2Tot.power.readOp.dynamic = llCache.maxPower;
+//  L2Tot.power.readOp.leakage =
+//  llCache.caches.local_result.power.readOp.leakage +
+//                               llCache.missb.local_result.power.readOp.leakage
+//                               + llCache.ifb.local_result.power.readOp.leakage
+//                               +
+//                               llCache.prefetchb.local_result.power.readOp.leakage
+//                               +
+//                               llCache.wbb.local_result.power.readOp.leakage;
+//
+//  L2Tot.area.set_area(llCache.area*1.1*1e-6);//placement and routing overhead
+//
+//  if (XML->sys.number_of_dir_levels==1)
+//  {
+//	  if (XML->sys.first_level_dir==0)
+//	  {
+//		  directory.maxPower   = 0.0;
+//		  directory.maxPower	+=
+//(directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic)
+//		                        +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic
+//		                        +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate;
+//		  ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//		  directory.maxPower	+=
+// directory.missb.l_ip.num_search_ports*directory.missb.local_result.power.searchOp.dynamic*clockRate;
+//		  ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//		  directory.maxPower	+=
+// directory.ifb.l_ip.num_search_ports*directory.ifb.local_result.power.searchOp.dynamic*clockRate;
+//		  ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//		  directory.maxPower	+=
+// directory.prefetchb.l_ip.num_search_ports*directory.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//		  ///cout<<"directory.maxPower=" <<directory.maxPower<<endl;
+//
+//		  directory.maxPower	+=
+// directory.wbb.l_ip.num_search_ports*directory.wbb.local_result.power.searchOp.dynamic*clockRate;
+//
+//		  cc.power.readOp.dynamic = directory.maxPower*scktRatio*8;//8
+// is the memory controller counts 		  cc.power.readOp.leakage =
+// directory.caches.local_result.power.readOp.leakage +
+//                                     directory.missb.local_result.power.readOp.leakage
+//                                     +
+//                                     directory.ifb.local_result.power.readOp.leakage
+//                                     +
+//                                     directory.prefetchb.local_result.power.readOp.leakage
+//                                     +
+//                                     directory.wbb.local_result.power.readOp.leakage;
+//
+//		  cc.power.readOp.leakage *=8;
+//
+//		  cc.area.set_area(directory.area*8);
+//		  cout<<"CC area="<<cc.area.get_area()*1e-6<<endl;
+//		  cout<<"CC Power="<<cc.power.readOp.dynamic<<endl;
+//		  ccTot.area.set_area(cc.area.get_area()*1e-6);
+//		  ccTot.power = cc.power;
+//		  cout<<"DC energy per access" <<
+// cc.power.readOp.dynamic/clockRate/8;
+//	  }
+//	  else if (XML->sys.first_level_dir==1)
+//	  {
+//		  inv_dir.maxPower =
+// inv_dir.caches.local_result.power.searchOp.dynamic*clockRate*XML->sys.domain_size;
+//		  cc.power.readOp.dynamic  =
+// inv_dir.maxPower*scktRatio*64/XML->sys.domain_size;
+// cc.power.readOp.leakage  =
+// inv_dir.caches.local_result.power.readOp.leakage*inv_dir.caches.l_ip.nbanks*64/XML->sys.domain_size;
+//
+//		  cc.area.set_area(inv_dir.area*64/XML->sys.domain_size);
+//		  cout<<"CC area="<<cc.area.get_area()*1e-6<<endl;
+//		  cout<<"CC Power="<<cc.power.readOp.dynamic<<endl;
+//		  ccTot.area.set_area(cc.area.get_area()*1e-6);
+//		  cout<<"DC energy per access" <<
+// cc.power.readOp.dynamic/clockRate/8; 		  ccTot.power =
+// cc.power;
+//	  }
+//  }
+//
+//  else if (XML->sys.number_of_dir_levels==2)
+//  {
+//
+//	  		  directory.maxPower   = 0.0;
+//	  		  directory.maxPower	+=
+//(directory.caches.l_ip.num_rw_ports*(0.67*directory.caches.local_result.power.readOp.dynamic+0.33*directory.caches.local_result.power.writeOp.dynamic)
+//	  		                        +directory.caches.l_ip.num_rd_ports*directory.caches.local_result.power.readOp.dynamic+directory.caches.l_ip.num_wr_ports*directory.caches.local_result.power.writeOp.dynamic
+//	  		                        +directory.caches.l_ip.num_se_rd_ports*directory.caches.local_result.power.readOp.dynamic)*clockRate;
+//	  		  ///cout<<"directory.maxPower="
+//<<directory.maxPower<<endl;
+//
+//	  		  directory.maxPower	+=
+// directory.missb.l_ip.num_search_ports*directory.missb.local_result.power.searchOp.dynamic*clockRate;
+//	  		  ///cout<<"directory.maxPower="
+//<<directory.maxPower<<endl;
+//
+//	  		  directory.maxPower	+=
+// directory.ifb.l_ip.num_search_ports*directory.ifb.local_result.power.searchOp.dynamic*clockRate;
+//	  		  ///cout<<"directory.maxPower="
+//<<directory.maxPower<<endl;
+//
+//	  		  directory.maxPower	+=
+// directory.prefetchb.l_ip.num_search_ports*directory.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//	  		  ///cout<<"directory.maxPower="
+//<<directory.maxPower<<endl;
+//
+//	  		  directory.maxPower	+=
+// directory.wbb.l_ip.num_search_ports*directory.wbb.local_result.power.searchOp.dynamic*clockRate;
+//
+//	  		  cc.power.readOp.dynamic =
+// directory.maxPower*scktRatio*8;//8 is the memory controller counts
+//			  cc.power.readOp.leakage =
+// directory.caches.local_result.power.readOp.leakage +
+//	                                     directory.missb.local_result.power.readOp.leakage
+//+ directory.ifb.local_result.power.readOp.leakage +
+//	                                     directory.prefetchb.local_result.power.readOp.leakage
+//+ directory.wbb.local_result.power.readOp.leakage;
+// cc.power.readOp.leakage
+//*=8; 	  		  cc.area.set_area(directory.area*8);
+//
+//	  		if (XML->sys.first_level_dir==0)
+//	  		{
+//	  		  directory1.maxPower   = 0.0;
+//	  		  directory1.maxPower	+=
+//(directory1.caches.l_ip.num_rw_ports*(0.67*directory1.caches.local_result.power.readOp.dynamic+0.33*directory1.caches.local_result.power.writeOp.dynamic)
+//	  				  +directory1.caches.l_ip.num_rd_ports*directory1.caches.local_result.power.readOp.dynamic+directory1.caches.l_ip.num_wr_ports*directory1.caches.local_result.power.writeOp.dynamic
+//	  				  +directory1.caches.l_ip.num_se_rd_ports*directory1.caches.local_result.power.readOp.dynamic)*clockRate;
+//	  		  ///cout<<"directory1.maxPower="
+//<<directory1.maxPower<<endl;
+//
+//	  		  directory1.maxPower	+=
+// directory1.missb.l_ip.num_search_ports*directory1.missb.local_result.power.searchOp.dynamic*clockRate;
+//	  		  ///cout<<"directory1.maxPower="
+//<<directory1.maxPower<<endl;
+//
+//	  		  directory1.maxPower	+=
+// directory1.ifb.l_ip.num_search_ports*directory1.ifb.local_result.power.searchOp.dynamic*clockRate;
+//	  		  ///cout<<"directory1.maxPower="
+//<<directory1.maxPower<<endl;
+//
+//	  		  directory1.maxPower	+=
+// directory1.prefetchb.l_ip.num_search_ports*directory1.prefetchb.local_result.power.searchOp.dynamic*clockRate;
+//	  		  ///cout<<"directory1.maxPower="
+//<<directory1.maxPower<<endl;
+//
+//	  		  directory1.maxPower	+=
+// directory1.wbb.l_ip.num_search_ports*directory1.wbb.local_result.power.searchOp.dynamic*clockRate;
+//
+//	  		  cc1.power.readOp.dynamic =
+// directory1.maxPower*scktRatio*64/XML->sys.domain_size;
+//			  cc1.power.readOp.leakage =
+// directory1.caches.local_result.power.readOp.leakage +
+//	                                     directory1.missb.local_result.power.readOp.leakage
+//+ directory1.ifb.local_result.power.readOp.leakage +
+//	                                     directory1.prefetchb.local_result.power.readOp.leakage
+//+ directory1.wbb.local_result.power.readOp.leakage;
+// cc1.power.readOp.leakage
+//*= 64/XML->sys.domain_size;
+//	  		  cc1.area.set_area(directory1.area*64/XML->sys.domain_size);
+//
+//	  		  cout<<"CC
+// area="<<(cc.area.get_area()+cc1.area.get_area())*1e-6<<endl;
+// cout<<"CC Power="<<cc.power.readOp.dynamic + cc1.power.readOp.dynamic <<endl;
+//			  ccTot.area.set_area((cc.area.get_area()+cc1.area.get_area())*1e-6);
+//			  ccTot.power = cc.power + cc1.power;
+//	  	  }
+//	  	  else if (XML->sys.first_level_dir==1)
+//	  	  {
+//	  		  inv_dir.maxPower =
+// inv_dir.caches.local_result.power.searchOp.dynamic*clockRate*XML->sys.domain_size;
+//	  		  cc1.power.readOp.dynamic =
+// inv_dir.maxPower*scktRatio*(64/XML->sys.domain_size);
+// cc1.power.readOp.leakage
+//=
+// inv_dir.caches.local_result.power.readOp.leakage*inv_dir.caches.l_ip.nbanks*XML->sys.domain_size;
+//
+//	  		  cc1.area.set_area(inv_dir.area*64/XML->sys.domain_size);
+//			  cout<<"CC
+// area="<<(cc.area.get_area()+cc1.area.get_area())*1e-6<<endl;
+// cout<<"CC Power="<<cc.power.readOp.dynamic + cc1.power.readOp.dynamic <<endl;
+//			  ccTot.area.set_area((cc.area.get_area()+cc1.area.get_area())*1e-6);
+//			  ccTot.power = cc.power + cc1.power;
+//
+//	  	  }
+//	  	  else if (XML->sys.first_level_dir==2)
+//	  	  {
+//			  cout<<"CC area="<<cc.area.get_area()*1e-6<<endl;
+//			  cout<<"CC Power="<<cc.power.readOp.dynamic<<endl;
+//			  ccTot.area.set_area(cc.area.get_area()*1e-6);
+//			  ccTot.power = cc.power;
+//	  	  }
+//  }
+//
+// cout<<"L2cache size="<<L2Tot.area.get_area()*1e-6<<endl;
+// cout<<"L2cache dynamic power="<<L2Tot.power.readOp.dynamic<<endl;
+// cout<<"L2cache laeakge power="<<L2Tot.power.readOp.leakage<<endl;
+//
+//  ///cout<<"llCache.maxPower=" <<llCache.maxPower<<endl;
+//
+//
+//  maxPower          +=  llCache.maxPower;
+//  ///cout<<"maxpower=" <<maxPower<<endl;
+//
+////  maxPower	  +=  pipeLogicCache.power.readOp.dynamic*clockRate;
+////
+//////cout<<"pipeLogic.power="<<pipeLogicCache.power.readOp.dynamic*clockRate<<endl;
+////  ///cout<<"maxpower=" <<maxPower<<endl;
+////
+////  maxPower	  +=  pipeLogicDirectory.power.readOp.dynamic*clockRate;
+////
+//////cout<<"pipeLogic.power="<<pipeLogicDirectory.power.readOp.dynamic*clockRate<<endl;
+////  ///cout<<"maxpower=" <<maxPower<<endl;
+////
+////  //clock power
+////  maxPower += clockNetwork.total_power.readOp.dynamic*clockRate;
+////
+//////cout<<"clockNetwork.total_power="<<clockNetwork.total_power.readOp.dynamic*clockRate<<endl;
+////  ///cout<<"maxpower=" <<maxPower<<endl;
+//
+//}
+
+// Copies the static configuration for this cache level (L2, L3, or a first/
+// second level directory) from the parsed XML into cachep and interface_ip.
+// Any other cacheL value leaves both untouched.
+void SharedCache::set_cache_param() {
+  if (cacheL == L2) {
+    cachep.name = "L2";
+    cachep.clockRate = XML->sys.L2[ithCache].clockrate;
+    cachep.clockRate *= 1e6; // presumably MHz -> Hz; confirm with XML schema
+    cachep.executionTime =
+        XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+    interface_ip.data_arr_ram_cell_tech_type =
+        XML->sys.L2[ithCache].device_type; // long channel device LSTP
+    interface_ip.data_arr_peri_global_tech_type =
+        XML->sys.L2[ithCache].device_type;
+    interface_ip.tag_arr_ram_cell_tech_type = XML->sys.L2[ithCache].device_type;
+    interface_ip.tag_arr_peri_global_tech_type =
+        XML->sys.L2[ithCache].device_type;
+    cachep.capacity = XML->sys.L2[ithCache].L2_config[0];
+    cachep.blockW = XML->sys.L2[ithCache].L2_config[1];
+    cachep.assoc = XML->sys.L2[ithCache].L2_config[2];
+    cachep.nbanks = XML->sys.L2[ithCache].L2_config[3];
+    cachep.throughput = XML->sys.L2[ithCache].L2_config[4] / cachep.clockRate;
+    cachep.latency = XML->sys.L2[ithCache].L2_config[5] / cachep.clockRate;
+    cachep.missb_size = XML->sys.L2[ithCache].buffer_sizes[0];
+    cachep.fu_size = XML->sys.L2[ithCache].buffer_sizes[1];
+    cachep.prefetchb_size = XML->sys.L2[ithCache].buffer_sizes[2];
+    cachep.wbb_size = XML->sys.L2[ithCache].buffer_sizes[3];
+    cachep.duty_cycle = XML->sys.L2[ithCache].duty_cycle;
+    cachep.dir_ty = XML->sys.L2[ithCache].merged_dir ? SBT : NonDir;
+    if (cachep.dir_ty == SBT) {
+      cachep.dir_duty_cycle = XML->sys.L2[ithCache].dir_duty_cycle;
+    }
+  } else if (cacheL == L3) {
+    cachep.name = "L3";
+    cachep.clockRate = XML->sys.L3[ithCache].clockrate;
+    cachep.clockRate *= 1e6;
+    cachep.executionTime =
+        XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+    interface_ip.data_arr_ram_cell_tech_type =
+        XML->sys.L3[ithCache].device_type; // long channel device LSTP
+    interface_ip.data_arr_peri_global_tech_type =
+        XML->sys.L3[ithCache].device_type;
+    interface_ip.tag_arr_ram_cell_tech_type = XML->sys.L3[ithCache].device_type;
+    interface_ip.tag_arr_peri_global_tech_type =
+        XML->sys.L3[ithCache].device_type;
+    cachep.capacity = XML->sys.L3[ithCache].L3_config[0];
+    cachep.blockW = XML->sys.L3[ithCache].L3_config[1];
+    cachep.assoc = XML->sys.L3[ithCache].L3_config[2];
+    cachep.nbanks = XML->sys.L3[ithCache].L3_config[3];
+    cachep.throughput = XML->sys.L3[ithCache].L3_config[4] / cachep.clockRate;
+    cachep.latency = XML->sys.L3[ithCache].L3_config[5] / cachep.clockRate;
+    cachep.missb_size = XML->sys.L3[ithCache].buffer_sizes[0];
+    cachep.fu_size = XML->sys.L3[ithCache].buffer_sizes[1];
+    cachep.prefetchb_size = XML->sys.L3[ithCache].buffer_sizes[2];
+    cachep.wbb_size = XML->sys.L3[ithCache].buffer_sizes[3];
+    cachep.duty_cycle = XML->sys.L3[ithCache].duty_cycle;
+    // Read the directory settings from this L3 entry, not from L2[ithCache].
+    cachep.dir_ty = XML->sys.L3[ithCache].merged_dir ? SBT : NonDir;
+    if (cachep.dir_ty == SBT) {
+      cachep.dir_duty_cycle = XML->sys.L3[ithCache].dir_duty_cycle;
+    }
+  } else if (cacheL == L1Directory) {
+    cachep.name = "First Level Directory";
+    cachep.dir_ty =
+        (enum Dir_type)XML->sys.L1Directory[ithCache].Directory_type;
+    cachep.clockRate = XML->sys.L1Directory[ithCache].clockrate;
+    cachep.clockRate *= 1e6;
+    cachep.executionTime =
+        XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+    interface_ip.data_arr_ram_cell_tech_type =
+        XML->sys.L1Directory[ithCache].device_type; // long channel device LSTP
+    interface_ip.data_arr_peri_global_tech_type =
+        XML->sys.L1Directory[ithCache].device_type;
+    interface_ip.tag_arr_ram_cell_tech_type =
+        XML->sys.L1Directory[ithCache].device_type;
+    interface_ip.tag_arr_peri_global_tech_type =
+        XML->sys.L1Directory[ithCache].device_type;
+    cachep.capacity = XML->sys.L1Directory[ithCache].Dir_config[0];
+    cachep.blockW = XML->sys.L1Directory[ithCache].Dir_config[1];
+    cachep.assoc = XML->sys.L1Directory[ithCache].Dir_config[2];
+    cachep.nbanks = XML->sys.L1Directory[ithCache].Dir_config[3];
+    cachep.throughput =
+        XML->sys.L1Directory[ithCache].Dir_config[4] / cachep.clockRate;
+    cachep.latency =
+        XML->sys.L1Directory[ithCache].Dir_config[5] / cachep.clockRate;
+    cachep.missb_size = XML->sys.L1Directory[ithCache].buffer_sizes[0];
+    cachep.fu_size = XML->sys.L1Directory[ithCache].buffer_sizes[1];
+    cachep.prefetchb_size = XML->sys.L1Directory[ithCache].buffer_sizes[2];
+    cachep.wbb_size = XML->sys.L1Directory[ithCache].buffer_sizes[3];
+    cachep.duty_cycle = XML->sys.L1Directory[ithCache].duty_cycle;
+  } else if (cacheL == L2Directory) {
+    cachep.name = "Second Level Directory";
+    cachep.dir_ty =
+        (enum Dir_type)XML->sys.L2Directory[ithCache].Directory_type;
+    cachep.clockRate = XML->sys.L2Directory[ithCache].clockrate;
+    cachep.clockRate *= 1e6;
+    cachep.executionTime =
+        XML->sys.total_cycles / (XML->sys.target_core_clockrate * 1e6);
+    interface_ip.data_arr_ram_cell_tech_type =
+        XML->sys.L2Directory[ithCache].device_type; // long channel device LSTP
+    interface_ip.data_arr_peri_global_tech_type =
+        XML->sys.L2Directory[ithCache].device_type;
+    interface_ip.tag_arr_ram_cell_tech_type =
+        XML->sys.L2Directory[ithCache].device_type;
+    interface_ip.tag_arr_peri_global_tech_type =
+        XML->sys.L2Directory[ithCache].device_type;
+    cachep.capacity = XML->sys.L2Directory[ithCache].Dir_config[0];
+    cachep.blockW = XML->sys.L2Directory[ithCache].Dir_config[1];
+    cachep.assoc = XML->sys.L2Directory[ithCache].Dir_config[2];
+    cachep.nbanks = XML->sys.L2Directory[ithCache].Dir_config[3];
+    cachep.throughput =
+        XML->sys.L2Directory[ithCache].Dir_config[4] / cachep.clockRate;
+    cachep.latency =
+        XML->sys.L2Directory[ithCache].Dir_config[5] / cachep.clockRate;
+    cachep.missb_size = XML->sys.L2Directory[ithCache].buffer_sizes[0];
+    cachep.fu_size = XML->sys.L2Directory[ithCache].buffer_sizes[1];
+    cachep.prefetchb_size = XML->sys.L2Directory[ithCache].buffer_sizes[2];
+    cachep.wbb_size = XML->sys.L2Directory[ithCache].buffer_sizes[3];
+    cachep.duty_cycle = XML->sys.L2Directory[ithCache].duty_cycle;
+  }
+  // cachep.cache_duty_cycle=cachep.dir_duty_cycle = 0.35;
+}
diff --git a/src/gpuwattch/sharedcache.h b/src/gpuwattch/sharedcache.h
new file mode 100644
index 000000000..a151cfde2
--- /dev/null
+++ b/src/gpuwattch/sharedcache.h
@@ -0,0 +1,89 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef SHAREDCACHE_H_
+#define SHAREDCACHE_H_
+#include "XML_Parse.h"
+#include "array.h"
+#include "basic_components.h"
+#include "cacti/area.h"
+#include "cacti/parameter.h"
+#include "logic.h"
+#include <vector>
+
+class SharedCache : public Component { // energy model: L2, L3 or a directory
+public:
+  ParseXML *XML; // parsed XML input read by set_cache_param()
+  int ithCache; // index into the XML->sys.L2/L3/L1Directory/L2Directory arrays
+  InputParameter interface_ip; // device types are filled by set_cache_param()
+  enum cache_level cacheL; // selects which XML section configures this cache
+  DataCache unicache; // Shared cache: caches plus missb/ifb/prefetchb/wbb
+  CacheDynParam cachep; // name, clock, geometry, buffer sizes, duty cycles
+  statsDef homenode_tdp_stats; // NOTE(review): presumably TDP home-node counts
+  statsDef homenode_rtp_stats; // write misses are added to buffers when SBT
+  statsDef homenode_stats_t; // home-node counts used for SBT dynamic energy
+  double dir_overhead; // scales data_array2 energy in the SBT path
+  //	cache_processor llCache,directory, directory1, inv_dir;
+
+  // pipeline pipeLogicCache, pipeLogicDirectory;
+  // clock_network				clockNetwork;
+  double scktRatio, executionTime; // NOTE(review): unused in the visible code
+  //   Component L2Tot, cc, cc1, ccTot;
+
+  SharedCache(ParseXML *XML_interface, int ithCache_,
+              InputParameter *interface_ip_, enum cache_level cacheL_ = L2);
+  void set_cache_param(); // fills cachep and interface_ip from XML for cacheL
+  void computeEnergy(bool is_tdp = true); // adds to power (TDP) or rt_power
+  void displayEnergy(uint32_t indent = 0, bool is_tdp = true); // TDP-only print
+  ~SharedCache(){};
+};
+
+class CCdir : public Component { // NOTE(review): implementation not shown here
+public:
+  ParseXML *XML; // presumably the parsed XML input, as in SharedCache
+  int ithCache; // presumably this instance's index in the XML arrays
+  InputParameter interface_ip;
+  DataCache dc; // Shared cache
+  ArrayST *shadow_dir; // raw pointer; ownership not visible, check ~CCdir()
+  //	cache_processor llCache,directory, directory1, inv_dir;
+
+  // pipeline pipeLogicCache, pipeLogicDirectory;
+  // clock_network				clockNetwork;
+  double scktRatio, clockRate, executionTime;
+  Component L2Tot, cc, cc1, ccTot; // NOTE(review): roles not visible here
+
+  CCdir(ParseXML *XML_interface, int ithCache_, InputParameter *interface_ip_);
+  void computeEnergy(bool is_tdp = true);
+  void displayEnergy(uint32_t indent = 0, bool is_tdp = true);
+  ~CCdir(); // declared only; the definition is outside this header
+};
+
+#endif /* SHAREDCACHE_H_ */
diff --git a/src/gpuwattch/technology_xeon_core.cc b/src/gpuwattch/technology_xeon_core.cc
new file mode 100644
index 000000000..fe619825b
--- /dev/null
+++ b/src/gpuwattch/technology_xeon_core.cc
@@ -0,0 +1,2964 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.”
+ *
+ ***************************************************************************/
+
+#include "basic_circuit.h"
+
+#include "parameter.h"
+
+// alpha_scatter * resistivity over the cross-section left after barrier/dishing.
+double wire_resistance(double resistivity, double wire_width,
+                       double wire_thickness, double barrier_thickness,
+                       double dishing_thickness, double alpha_scatter) {
+  const double conducting_height =
+      wire_thickness - barrier_thickness - dishing_thickness;
+  const double conducting_width = wire_width - 2 * barrier_thickness;
+  return alpha_scatter * resistivity / (conducting_height * conducting_width);
+}
+
+// Sum of a vertical (ILD) term, a Miller-scaled sidewall term and fringe_cap.
+double wire_capacitance(double wire_width, double wire_thickness,
+                        double wire_spacing, double ild_thickness,
+                        double miller_value, double horiz_dielectric_constant,
+                        double vert_dielectric_constant, double fringe_cap) {
+  const double eps_x2 = 2 * PERMITTIVITY_FREE_SPACE; // shared by both terms
+  const double vert = eps_x2 * vert_dielectric_constant * wire_width /
+                      ild_thickness;
+  const double side = eps_x2 * miller_value * horiz_dielectric_constant *
+                      wire_thickness / wire_spacing;
+  return vert + side + fringe_cap;
+}
+
+void init_tech_params(double technology, bool is_tag) {
+  int iter, tech, tech_lo, tech_hi;
+  double curr_alpha, curr_vpp;
+  double wire_width, wire_thickness, wire_spacing, fringe_cap,
+      pmos_to_nmos_sizing_r;
+  //  double aspect_ratio,ild_thickness, miller_value = 1.5,
+  //  horiz_dielectric_constant, vert_dielectric_constant;
+  double barrier_thickness, dishing_thickness, alpha_scatter;
+  double curr_vdd_dram_cell, curr_v_th_dram_access_transistor,
+      curr_I_on_dram_cell, curr_c_dram_cell;
+
+  uint32_t ram_cell_tech_type = (is_tag) ? g_ip->tag_arr_ram_cell_tech_type
+                                         : g_ip->data_arr_ram_cell_tech_type;
+  uint32_t peri_global_tech_type = (is_tag)
+                                       ? g_ip->tag_arr_peri_global_tech_type
+                                       : g_ip->data_arr_peri_global_tech_type;
+
+  technology = technology * 1000.0; // in the unit of nm
+
+  // initialize parameters
+  g_tp.reset();
+  double gmp_to_gmn_multiplier_periph_global = 0;
+
+  double curr_Wmemcella_dram, curr_Wmemcellpmos_dram, curr_Wmemcellnmos_dram,
+      curr_area_cell_dram, curr_asp_ratio_cell_dram, curr_Wmemcella_sram,
+      curr_Wmemcellpmos_sram, curr_Wmemcellnmos_sram, curr_area_cell_sram,
+      curr_asp_ratio_cell_sram, curr_I_off_dram_cell_worst_case_length_temp;
+  double curr_Wmemcella_cam, curr_Wmemcellpmos_cam, curr_Wmemcellnmos_cam,
+      curr_area_cell_cam, // Sheng: CAM data
+      curr_asp_ratio_cell_cam;
+  double SENSE_AMP_D, SENSE_AMP_P; // J
+  double area_cell_dram = 0;
+  double asp_ratio_cell_dram = 0;
+  double area_cell_sram = 0;
+  double asp_ratio_cell_sram = 0;
+  double area_cell_cam = 0;
+  double asp_ratio_cell_cam = 0;
+  double mobility_eff_periph_global = 0;
+  double Vdsat_periph_global = 0;
+  double nmos_effective_resistance_multiplier;
+  double width_dram_access_transistor;
+
+  double curr_logic_scaling_co_eff =
+      0; // This is based on the reported numbers of Intel Merom 65nm,
+         // Penryn45nm and IBM cell 90/65/45 date
+  double curr_core_tx_density =
+      0; // this is density per um^2; 90, ...22nm based on Intel Penryn
+  double curr_chip_layout_overhead = 0;
+  double curr_macro_layout_overhead = 0;
+  double curr_sckt_co_eff = 0;
+
+  if (technology < 91 && technology > 89) {
+    tech_lo = 90;
+    tech_hi = 90;
+  } else if (technology < 66 && technology > 64) {
+    tech_lo = 65;
+    tech_hi = 65;
+  } else if (technology < 46 && technology > 44) {
+    tech_lo = 45;
+    tech_hi = 45;
+  } else if (technology < 33 && technology > 31) {
+    tech_lo = 32;
+    tech_hi = 32;
+  } else if (technology < 23 && technology > 21) {
+    tech_lo = 22;
+    tech_hi = 22;
+    if (ram_cell_tech_type == 3) {
+      cout << "current version does not support eDRAM technologies at 22nm"
+           << endl;
+      exit(0);
+    }
+  }
+  //  else if (technology < 17 && technology > 15)
+  //  {
+  //    tech_lo = 16;
+  //    tech_hi = 16;
+  //  }
+  else if (technology < 90 && technology > 65) {
+    tech_lo = 90;
+    tech_hi = 65;
+  } else if (technology < 65 && technology > 45) {
+    tech_lo = 65;
+    tech_hi = 45;
+  } else if (technology < 45 && technology > 32) {
+    tech_lo = 45;
+    tech_hi = 32;
+  } else if (technology < 32 && technology > 22) {
+    tech_lo = 32;
+    tech_hi = 22;
+  }
+  //  else if (technology < 22 && technology > 16)
+  //    {
+  //      tech_lo = 22;
+  //      tech_hi = 16;
+  //    }
+  else {
+    cout << "Invalid technology nodes" << endl;
+    exit(0);
+  }
+
+  double vdd[NUMBER_TECH_FLAVORS];
+  double Lphy[NUMBER_TECH_FLAVORS];
+  double Lelec[NUMBER_TECH_FLAVORS];
+  double t_ox[NUMBER_TECH_FLAVORS];
+  double v_th[NUMBER_TECH_FLAVORS];
+  double c_ox[NUMBER_TECH_FLAVORS];
+  double mobility_eff[NUMBER_TECH_FLAVORS];
+  double Vdsat[NUMBER_TECH_FLAVORS];
+  double c_g_ideal[NUMBER_TECH_FLAVORS];
+  double c_fringe[NUMBER_TECH_FLAVORS];
+  double c_junc[NUMBER_TECH_FLAVORS];
+  double I_on_n[NUMBER_TECH_FLAVORS];
+  double I_on_p[NUMBER_TECH_FLAVORS];
+  double Rnchannelon[NUMBER_TECH_FLAVORS];
+  double Rpchannelon[NUMBER_TECH_FLAVORS];
+  double n_to_p_eff_curr_drv_ratio[NUMBER_TECH_FLAVORS];
+  double I_off_n[NUMBER_TECH_FLAVORS][101];
+  double I_g_on_n[NUMBER_TECH_FLAVORS][101];
+  // double I_off_p[NUMBER_TECH_FLAVORS][101];
+  double gmp_to_gmn_multiplier[NUMBER_TECH_FLAVORS];
+  // double curr_sckt_co_eff[NUMBER_TECH_FLAVORS];
+  double long_channel_leakage_reduction[NUMBER_TECH_FLAVORS];
+
+  for (iter = 0; iter <= 1; ++iter) {
+    // linear interpolation
+    if (iter == 0) {
+      tech = tech_lo;
+      if (tech_lo == tech_hi) {
+        curr_alpha = 1;
+      } else {
+        curr_alpha = (technology - tech_hi) / (tech_lo - tech_hi);
+      }
+    } else {
+      tech = tech_hi;
+      if (tech_lo == tech_hi) {
+        break;
+      } else {
+        curr_alpha = (tech_lo - technology) / (tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 90) {
+      SENSE_AMP_D = .28e-9;   // s
+      SENSE_AMP_P = 14.7e-15; // J
+      // 90nm technology-node. Corresponds to year 2004 in ITRS
+      // ITRS HP device type
+      vdd[0] = 1.2;
+      Lphy[0] = 0.037;    // Lphy is the physical gate-length. micron
+      Lelec[0] = 0.0266;  // Lelec is the electrical gate-length. micron
+      t_ox[0] = 1.2e-3;   // micron
+      v_th[0] = 0.23707;  // V
+      c_ox[0] = 1.79e-14; // F/micron2
+      mobility_eff[0] = 342.16 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+      Vdsat[0] = 0.128;                                     // V
+      c_g_ideal[0] = 6.64e-16;                              // F/micron
+      c_fringe[0] = 0.08e-15;                               // F/micron
+      c_junc[0] = 1e-15;                                    // F/micron2
+      I_on_n[0] = 1076.9e-6;                                // A/micron
+      I_on_p[0] = 712.6e-6;                                 // A/micron
+      // Note that nmos_effective_resistance_multiplier,
+      // n_to_p_eff_curr_drv_ratio and gmp_to_gmn_multiplier values are
+      // calculated offline
+      nmos_effective_resistance_multiplier = 1.54;
+      n_to_p_eff_curr_drv_ratio[0] = 2.45;
+      gmp_to_gmn_multiplier[0] = 1.22;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] /
+                       I_on_n[0]; // ohm-micron
+      Rpchannelon[0] =
+          n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; // ohm-micron
+      long_channel_leakage_reduction[0] = 1;
+      I_off_n[0][0] = 3.24e-8; // A/micron
+      I_off_n[0][10] = 4.01e-8;
+      I_off_n[0][20] = 4.90e-8;
+      I_off_n[0][30] = 5.92e-8;
+      I_off_n[0][40] = 7.08e-8;
+      I_off_n[0][50] = 8.38e-8;
+      I_off_n[0][60] = 9.82e-8;
+      I_off_n[0][70] = 1.14e-7;
+      I_off_n[0][80] = 1.29e-7;
+      I_off_n[0][90] = 1.43e-7;
+      I_off_n[0][100] = 1.54e-7;
+
+      I_g_on_n[0][0] = 1.65e-8; // A/micron
+      I_g_on_n[0][10] = 1.65e-8;
+      I_g_on_n[0][20] = 1.65e-8;
+      I_g_on_n[0][30] = 1.65e-8;
+      I_g_on_n[0][40] = 1.65e-8;
+      I_g_on_n[0][50] = 1.65e-8;
+      I_g_on_n[0][60] = 1.65e-8;
+      I_g_on_n[0][70] = 1.65e-8;
+      I_g_on_n[0][80] = 1.65e-8;
+      I_g_on_n[0][90] = 1.65e-8;
+      I_g_on_n[0][100] = 1.65e-8;
+
+      // ITRS LSTP device type
+      vdd[1] = 1.3;
+      Lphy[1] = 0.075;
+      Lelec[1] = 0.0486;
+      t_ox[1] = 2.2e-3;
+      v_th[1] = 0.48203;
+      c_ox[1] = 1.22e-14;
+      mobility_eff[1] = 356.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.373;
+      c_g_ideal[1] = 9.15e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 503.6e-6;
+      I_on_p[1] = 235.1e-6;
+      nmos_effective_resistance_multiplier = 1.92;
+      n_to_p_eff_curr_drv_ratio[1] = 2.44;
+      gmp_to_gmn_multiplier[1] = 0.88;
+      Rnchannelon[1] =
+          nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1;
+      I_off_n[1][0] = 2.81e-12;
+      I_off_n[1][10] = 4.76e-12;
+      I_off_n[1][20] = 7.82e-12;
+      I_off_n[1][30] = 1.25e-11;
+      I_off_n[1][40] = 1.94e-11;
+      I_off_n[1][50] = 2.94e-11;
+      I_off_n[1][60] = 4.36e-11;
+      I_off_n[1][70] = 6.32e-11;
+      I_off_n[1][80] = 8.95e-11;
+      I_off_n[1][90] = 1.25e-10;
+      I_off_n[1][100] = 1.7e-10;
+
+      I_g_on_n[1][0] = 3.87e-11; // A/micron
+      I_g_on_n[1][10] = 3.87e-11;
+      I_g_on_n[1][20] = 3.87e-11;
+      I_g_on_n[1][30] = 3.87e-11;
+      I_g_on_n[1][40] = 3.87e-11;
+      I_g_on_n[1][50] = 3.87e-11;
+      I_g_on_n[1][60] = 3.87e-11;
+      I_g_on_n[1][70] = 3.87e-11;
+      I_g_on_n[1][80] = 3.87e-11;
+      I_g_on_n[1][90] = 3.87e-11;
+      I_g_on_n[1][100] = 3.87e-11;
+
+      // ITRS LOP device type
+      vdd[2] = 0.9;
+      Lphy[2] = 0.053;
+      Lelec[2] = 0.0354;
+      t_ox[2] = 1.5e-3;
+      v_th[2] = 0.30764;
+      c_ox[2] = 1.59e-14;
+      mobility_eff[2] = 460.39 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.113;
+      c_g_ideal[2] = 8.45e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 386.6e-6;
+      I_on_p[2] = 209.7e-6;
+      nmos_effective_resistance_multiplier = 1.77;
+      n_to_p_eff_curr_drv_ratio[2] = 2.54;
+      gmp_to_gmn_multiplier[2] = 0.98;
+      Rnchannelon[2] =
+          nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1;
+      I_off_n[2][0] = 2.14e-9;
+      I_off_n[2][10] = 2.9e-9;
+      I_off_n[2][20] = 3.87e-9;
+      I_off_n[2][30] = 5.07e-9;
+      I_off_n[2][40] = 6.54e-9;
+      I_off_n[2][50] = 8.27e-8;
+      I_off_n[2][60] = 1.02e-7;
+      I_off_n[2][70] = 1.20e-7;
+      I_off_n[2][80] = 1.36e-8;
+      I_off_n[2][90] = 1.52e-8;
+      I_off_n[2][100] = 1.73e-8;
+
+      I_g_on_n[2][0] = 4.31e-8; // A/micron
+      I_g_on_n[2][10] = 4.31e-8;
+      I_g_on_n[2][20] = 4.31e-8;
+      I_g_on_n[2][30] = 4.31e-8;
+      I_g_on_n[2][40] = 4.31e-8;
+      I_g_on_n[2][50] = 4.31e-8;
+      I_g_on_n[2][60] = 4.31e-8;
+      I_g_on_n[2][70] = 4.31e-8;
+      I_g_on_n[2][80] = 4.31e-8;
+      I_g_on_n[2][90] = 4.31e-8;
+      I_g_on_n[2][100] = 4.31e-8;
+
+      if (ram_cell_tech_type == lp_dram) {
+        // LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.4545;
+        width_dram_access_transistor = 0.14;
+        curr_I_on_dram_cell = 45e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 21.1e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.168;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        // LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.4545;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] = 323.95 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.3;
+        c_g_ideal[3] = 1.47e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 321.6e-6;
+        I_on_p[3] = 203.3e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.42e-11;
+        I_off_n[3][10] = 2.25e-11;
+        I_off_n[3][20] = 3.46e-11;
+        I_off_n[3][30] = 5.18e-11;
+        I_off_n[3][40] = 7.58e-11;
+        I_off_n[3][50] = 1.08e-10;
+        I_off_n[3][60] = 1.51e-10;
+        I_off_n[3][70] = 2.02e-10;
+        I_off_n[3][80] = 2.57e-10;
+        I_off_n[3][90] = 3.14e-10;
+        I_off_n[3][100] = 3.85e-10;
+      } else if (ram_cell_tech_type == comm_dram) {
+        // COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.6;
+        Lphy[3] = 0.09;
+        Lelec[3] = 0.0576;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6 * 0.09 * 0.09;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        // COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.7;
+        t_ox[3] = 5.5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 5.65e-15;
+        mobility_eff[3] = 302.2 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.32;
+        c_g_ideal[3] = 5.08e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1094.3e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.62;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 5.80e-15;
+        I_off_n[3][10] = 1.21e-14;
+        I_off_n[3][20] = 2.42e-14;
+        I_off_n[3][30] = 4.65e-14;
+        I_off_n[3][40] = 8.60e-14;
+        I_off_n[3][50] = 1.54e-13;
+        I_off_n[3][60] = 2.66e-13;
+        I_off_n[3][70] = 4.45e-13;
+        I_off_n[3][80] = 7.17e-13;
+        I_off_n[3][90] = 1.11e-12;
+        I_off_n[3][100] = 1.67e-12;
+      }
+
+      // SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      // CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um; // 360
+      curr_asp_ratio_cell_cam = 2.92;                           // 2.5
+      // Empirical undifferetiated core/FU coefficient
+      curr_logic_scaling_co_eff = 1;
+      curr_core_tx_density = 1.25 * 0.7 * 0.7;
+      curr_sckt_co_eff = 1.1539;
+      curr_chip_layout_overhead =
+          1.2; // die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead =
+          1.1; // EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 65) { // 65nm technology-node. Corresponds to year 2007 in ITRS
+                      // ITRS HP device type
+      //      SENSE_AMP_D = .2e-9; // s
+      //      SENSE_AMP_P = 5.7e-15; // J
+      //      vdd[0] = 1.1;
+      //      Lphy[0] = 0.025;
+      //      Lelec[0] = 0.019;
+      //      t_ox[0] = 1.1e-3;
+      //      v_th[0] = .19491;
+      //      c_ox[0] = 1.88e-14;
+      //      mobility_eff[0] = 436.24 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      //      Vdsat[0] = 7.71e-2;
+      //      c_g_ideal[0] = 4.69e-16;
+      //      c_fringe[0] = 0.077e-15;
+      //      c_junc[0] = 1e-15;
+      //      I_on_n[0] = 1197.2e-6;
+      //      I_on_p[0] = 870.8e-6;
+      //      nmos_effective_resistance_multiplier = 1.50;
+      //      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      //      gmp_to_gmn_multiplier[0] = 1.38;
+      //      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] /
+      //      I_on_n[0]; Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] *
+      //      Rnchannelon[0]; long_channel_leakage_reduction[0] = 1/3.74;
+      //      //Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or
+      //      Lgate increase by 10%, whichever comes first
+      //      //Ioff(Lgate normal)/Ioff(Lgate long)= 3.74.
+      //      I_off_n[0][0] = 1.96e-7;
+      //      I_off_n[0][10] = 2.29e-7;
+      //      I_off_n[0][20] = 2.66e-7;
+      //      I_off_n[0][30] = 3.05e-7;
+      //      I_off_n[0][40] = 3.49e-7;
+      //      I_off_n[0][50] = 3.95e-7;
+      //      I_off_n[0][60] = 4.45e-7;
+      //      I_off_n[0][70] = 4.97e-7;
+      //      I_off_n[0][80] = 5.48e-7;
+      //      I_off_n[0][90] = 5.94e-7;
+      //      I_off_n[0][100] = 6.3e-7;
+      //      I_g_on_n[0][0]  = 4.09e-8;//A/micron
+      //      I_g_on_n[0][10] = 4.09e-8;
+      //      I_g_on_n[0][20] = 4.09e-8;
+      //      I_g_on_n[0][30] = 4.09e-8;
+      //      I_g_on_n[0][40] = 4.09e-8;
+      //      I_g_on_n[0][50] = 4.09e-8;
+      //      I_g_on_n[0][60] = 4.09e-8;
+      //      I_g_on_n[0][70] = 4.09e-8;
+      //      I_g_on_n[0][80] = 4.09e-8;
+      //      I_g_on_n[0][90] = 4.09e-8;
+      //      I_g_on_n[0][100] = 4.09e-8;
+
+      SENSE_AMP_D = .2e-9;   // s
+      SENSE_AMP_P = 5.7e-15; // J
+      vdd[0] = 1.25;
+      Lphy[0] = 0.025;
+      Lelec[0] = 0.019;
+      t_ox[0] = 1.1e-3;
+      v_th[0] = .12491;
+      c_ox[0] = 1.88e-14;
+      mobility_eff[0] = 409.31 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 9.08e-2;
+      c_g_ideal[0] = 4.72e-16;
+      c_fringe[0] = 0.08e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 1486.4e-6;
+      I_on_p[0] = 1131.5e-6;
+      nmos_effective_resistance_multiplier = 1.57;
+      n_to_p_eff_curr_drv_ratio[0] = 2;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] =
+          nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] = 1.0 / 4.97;
+      // Using MASTAR, @380K, increase Lgate until Ion reduces to 90% or Lgate
+      // increase by 10%, whichever comes first Ioff(Lgate normal)/Ioff(Lgate
+      // long)= 4.97@Vdd=1.25; (3.74@Vdd=1.1), however, Intel paper suggest the
+      // reduction factor is 3.
+      I_off_n[0][0] = 8.62e-7;
+      I_off_n[0][10] = 9.08e-7;
+      I_off_n[0][20] = 9.55e-7;
+      I_off_n[0][30] = 1.00e-6;
+      I_off_n[0][40] = 1.05e-6;
+      I_off_n[0][50] = 1.09e-6;
+      I_off_n[0][60] = 1.14e-6;
+      I_off_n[0][70] = 1.18e-6;
+      I_off_n[0][80] = 1.23e-6;
+      I_off_n[0][90] = 1.27e-6;
+      I_off_n[0][100] = 1.31e-6;
+
+      I_g_on_n[0][0] = 7.02e-8; // A/micron
+      I_g_on_n[0][10] = 7.02e-8;
+      I_g_on_n[0][20] = 7.02e-8;
+      I_g_on_n[0][30] = 7.02e-8;
+      I_g_on_n[0][40] = 7.02e-8;
+      I_g_on_n[0][50] = 7.02e-8;
+      I_g_on_n[0][60] = 7.02e-8;
+      I_g_on_n[0][70] = 7.02e-8;
+      I_g_on_n[0][80] = 7.02e-8;
+      I_g_on_n[0][90] = 7.02e-8;
+      I_g_on_n[0][100] = 7.02e-8;
+
+      // ITRS LSTP device type
+      vdd[1] = 1.2;
+      Lphy[1] = 0.045;
+      Lelec[1] = 0.0298;
+      t_ox[1] = 1.9e-3;
+      v_th[1] = 0.52354;
+      c_ox[1] = 1.36e-14;
+      mobility_eff[1] = 341.21 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 0.128;
+      c_g_ideal[1] = 6.14e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 519.2e-6;
+      I_on_p[1] = 266e-6;
+      nmos_effective_resistance_multiplier = 1.96;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] =
+          nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1 / 2.82;
+      I_off_n[1][0] = 9.12e-12;
+      I_off_n[1][10] = 1.49e-11;
+      I_off_n[1][20] = 2.36e-11;
+      I_off_n[1][30] = 3.64e-11;
+      I_off_n[1][40] = 5.48e-11;
+      I_off_n[1][50] = 8.05e-11;
+      I_off_n[1][60] = 1.15e-10;
+      I_off_n[1][70] = 1.59e-10;
+      I_off_n[1][80] = 2.1e-10;
+      I_off_n[1][90] = 2.62e-10;
+      I_off_n[1][100] = 3.21e-10;
+
+      I_g_on_n[1][0] = 1.09e-10; // A/micron
+      I_g_on_n[1][10] = 1.09e-10;
+      I_g_on_n[1][20] = 1.09e-10;
+      I_g_on_n[1][30] = 1.09e-10;
+      I_g_on_n[1][40] = 1.09e-10;
+      I_g_on_n[1][50] = 1.09e-10;
+      I_g_on_n[1][60] = 1.09e-10;
+      I_g_on_n[1][70] = 1.09e-10;
+      I_g_on_n[1][80] = 1.09e-10;
+      I_g_on_n[1][90] = 1.09e-10;
+      I_g_on_n[1][100] = 1.09e-10;
+
+      // ITRS LOP device type
+      vdd[2] = 0.8;
+      Lphy[2] = 0.032;
+      Lelec[2] = 0.0216;
+      t_ox[2] = 1.2e-3;
+      v_th[2] = 0.28512;
+      c_ox[2] = 1.87e-14;
+      mobility_eff[2] = 495.19 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 0.292;
+      c_g_ideal[2] = 6e-16;
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 573.1e-6;
+      I_on_p[2] = 340.6e-6;
+      nmos_effective_resistance_multiplier = 1.82;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] =
+          nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1 / 2.05;
+      I_off_n[2][0] = 4.9e-9;
+      I_off_n[2][10] = 6.49e-9;
+      I_off_n[2][20] = 8.45e-9;
+      I_off_n[2][30] = 1.08e-8;
+      I_off_n[2][40] = 1.37e-8;
+      I_off_n[2][50] = 1.71e-8;
+      I_off_n[2][60] = 2.09e-8;
+      I_off_n[2][70] = 2.48e-8;
+      I_off_n[2][80] = 2.84e-8;
+      I_off_n[2][90] = 3.13e-8;
+      I_off_n[2][100] = 3.42e-8;
+
+      I_g_on_n[2][0] = 9.61e-9; // A/micron
+      I_g_on_n[2][10] = 9.61e-9;
+      I_g_on_n[2][20] = 9.61e-9;
+      I_g_on_n[2][30] = 9.61e-9;
+      I_g_on_n[2][40] = 9.61e-9;
+      I_g_on_n[2][50] = 9.61e-9;
+      I_g_on_n[2][60] = 9.61e-9;
+      I_g_on_n[2][70] = 9.61e-9;
+      I_g_on_n[2][80] = 9.61e-9;
+      I_g_on_n[2][90] = 9.61e-9;
+      I_g_on_n[2][100] = 9.61e-9;
+
+      if (ram_cell_tech_type == lp_dram) {
+        // LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.2;
+        Lphy[3] = 0.12;
+        Lelec[3] = 0.0756;
+        curr_v_th_dram_access_transistor = 0.43806;
+        width_dram_access_transistor = 0.09;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 19.6e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 0.11;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        // LP-DRAM wordline transistor parameters
+        curr_vpp = 1.6;
+        t_ox[3] = 2.2e-3;
+        v_th[3] = 0.43806;
+        c_ox[3] = 1.22e-14;
+        mobility_eff[3] = 328.32 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.43806;
+        c_g_ideal[3] = 1.46e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 399.8e-6;
+        I_on_p[3] = 243.4e-6;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 2.23e-11;
+        I_off_n[3][10] = 3.46e-11;
+        I_off_n[3][20] = 5.24e-11;
+        I_off_n[3][30] = 7.75e-11;
+        I_off_n[3][40] = 1.12e-10;
+        I_off_n[3][50] = 1.58e-10;
+        I_off_n[3][60] = 2.18e-10;
+        I_off_n[3][70] = 2.88e-10;
+        I_off_n[3][80] = 3.63e-10;
+        I_off_n[3][90] = 4.41e-10;
+        I_off_n[3][100] = 5.36e-10;
+      } else if (ram_cell_tech_type == comm_dram) {
+        // COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.3;
+        Lphy[3] = 0.065;
+        Lelec[3] = 0.0426;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.065;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6 * 0.065 * 0.065;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        // COMM-DRAM wordline transistor parameters
+        curr_vpp = 3.3;
+        t_ox[3] = 5e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 6.16e-15;
+        mobility_eff[3] = 303.44 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.385;
+        c_g_ideal[3] = 4e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1031e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 2.39;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.80e-14;
+        I_off_n[3][10] = 3.64e-14;
+        I_off_n[3][20] = 7.03e-14;
+        I_off_n[3][30] = 1.31e-13;
+        I_off_n[3][40] = 2.35e-13;
+        I_off_n[3][50] = 4.09e-13;
+        I_off_n[3][60] = 6.89e-13;
+        I_off_n[3][70] = 1.13e-12;
+        I_off_n[3][80] = 1.78e-12;
+        I_off_n[3][90] = 2.71e-12;
+        I_off_n[3][100] = 3.99e-12;
+      }
+
+      // SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      // CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      // Empirical undifferetiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7;
+      curr_core_tx_density = 1.25 * 0.7;
+      curr_sckt_co_eff = 1.1359;
+      curr_chip_layout_overhead =
+          1.2; // die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead =
+          1.1; // EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 45) { // 45nm technology-node. Corresponds to year 2010 in ITRS
+      // ITRS HP device type
+      SENSE_AMP_D = .04e-9;  // s
+      SENSE_AMP_P = 2.7e-15; // J
+      vdd[0] = 1.0;
+      Lphy[0] = 0.018;
+      Lelec[0] = 0.01345;
+      t_ox[0] = 0.65e-3;
+      v_th[0] = .18035;
+      c_ox[0] = 3.77e-14;
+      mobility_eff[0] = 266.68 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 9.38E-2;
+      c_g_ideal[0] = 6.78e-16;
+      c_fringe[0] = 0.05e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 2046.6e-6;
+      // There are certain problems with the ITRS PMOS numbers in MASTAR for
+      // 45nm. So we are using 65nm values of n_to_p_eff_curr_drv_ratio and
+      // gmp_to_gmn_multiplier for 45nm
+      I_on_p[0] = I_on_n[0] / 2; // This value is fixed arbitrarily but I_on_p
+                                 // is not being used in CACTI
+      nmos_effective_resistance_multiplier = 1.51;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] =
+          nmos_effective_resistance_multiplier * vdd[0] / I_on_n[0];
+      Rpchannelon[0] = n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0];
+      long_channel_leakage_reduction[0] =
+          1 / 3.546; // Using MASTAR, @380K, increase Lgate until Ion reduces to
+                     // 90%, Ioff(Lgate normal)/Ioff(Lgate long)= 3.74
+      I_off_n[0][0] = 2.8e-7;
+      I_off_n[0][10] = 3.28e-7;
+      I_off_n[0][20] = 3.81e-7;
+      I_off_n[0][30] = 4.39e-7;
+      I_off_n[0][40] = 5.02e-7;
+      I_off_n[0][50] = 5.69e-7;
+      I_off_n[0][60] = 6.42e-7;
+      I_off_n[0][70] = 7.2e-7;
+      I_off_n[0][80] = 8.03e-7;
+      I_off_n[0][90] = 8.91e-7;
+      I_off_n[0][100] = 9.84e-7;
+
+      I_g_on_n[0][0] = 3.59e-8; // A/micron
+      I_g_on_n[0][10] = 3.59e-8;
+      I_g_on_n[0][20] = 3.59e-8;
+      I_g_on_n[0][30] = 3.59e-8;
+      I_g_on_n[0][40] = 3.59e-8;
+      I_g_on_n[0][50] = 3.59e-8;
+      I_g_on_n[0][60] = 3.59e-8;
+      I_g_on_n[0][70] = 3.59e-8;
+      I_g_on_n[0][80] = 3.59e-8;
+      I_g_on_n[0][90] = 3.59e-8;
+      I_g_on_n[0][100] = 3.59e-8;
+
+      // ITRS LSTP device type
+      vdd[1] = 1.1;
+      Lphy[1] = 0.028;
+      Lelec[1] = 0.0212;
+      t_ox[1] = 1.4e-3;
+      v_th[1] = 0.50245;
+      c_ox[1] = 2.01e-14;
+      mobility_eff[1] = 363.96 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 9.12e-2;
+      c_g_ideal[1] = 5.18e-16;
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 666.2e-6;
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] =
+          nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1 / 2.08;
+      I_off_n[1][0] = 1.01e-11;
+      I_off_n[1][10] = 1.65e-11;
+      I_off_n[1][20] = 2.62e-11;
+      I_off_n[1][30] = 4.06e-11;
+      I_off_n[1][40] = 6.12e-11;
+      I_off_n[1][50] = 9.02e-11;
+      I_off_n[1][60] = 1.3e-10;
+      I_off_n[1][70] = 1.83e-10;
+      I_off_n[1][80] = 2.51e-10;
+      I_off_n[1][90] = 3.29e-10;
+      I_off_n[1][100] = 4.1e-10;
+
+      I_g_on_n[1][0] = 9.47e-12; // A/micron
+      I_g_on_n[1][10] = 9.47e-12;
+      I_g_on_n[1][20] = 9.47e-12;
+      I_g_on_n[1][30] = 9.47e-12;
+      I_g_on_n[1][40] = 9.47e-12;
+      I_g_on_n[1][50] = 9.47e-12;
+      I_g_on_n[1][60] = 9.47e-12;
+      I_g_on_n[1][70] = 9.47e-12;
+      I_g_on_n[1][80] = 9.47e-12;
+      I_g_on_n[1][90] = 9.47e-12;
+      I_g_on_n[1][100] = 9.47e-12;
+
+      // ITRS LOP device type
+      vdd[2] = 0.7;
+      Lphy[2] = 0.022;
+      Lelec[2] = 0.016;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.22599;
+      c_ox[2] = 2.82e-14; // F/micron2
+      mobility_eff[2] = 508.9 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 5.71e-2;
+      c_g_ideal[2] = 6.2e-16;
+      c_fringe[2] = 0.073e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 748.9e-6;
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.76;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] =
+          nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1 / 1.92;
+      I_off_n[2][0] = 4.03e-9;
+      I_off_n[2][10] = 5.02e-9;
+      I_off_n[2][20] = 6.18e-9;
+      I_off_n[2][30] = 7.51e-9;
+      I_off_n[2][40] = 9.04e-9;
+      I_off_n[2][50] = 1.08e-8;
+      I_off_n[2][60] = 1.27e-8;
+      I_off_n[2][70] = 1.47e-8;
+      I_off_n[2][80] = 1.66e-8;
+      I_off_n[2][90] = 1.84e-8;
+      I_off_n[2][100] = 2.03e-8;
+
+      I_g_on_n[2][0] = 3.24e-8; // A/micron
+      I_g_on_n[2][10] = 4.01e-8;
+      I_g_on_n[2][20] = 4.90e-8;
+      I_g_on_n[2][30] = 5.92e-8;
+      I_g_on_n[2][40] = 7.08e-8;
+      I_g_on_n[2][50] = 8.38e-8;
+      I_g_on_n[2][60] = 9.82e-8;
+      I_g_on_n[2][70] = 1.14e-7;
+      I_g_on_n[2][80] = 1.29e-7;
+      I_g_on_n[2][90] = 1.43e-7;
+      I_g_on_n[2][100] = 1.54e-7;
+
+      if (ram_cell_tech_type == lp_dram) {
+        // LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.078;
+        Lelec[3] = 0.0504; // Assume Lelec is 30% lesser than Lphy for DRAM
+                           // access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44559;
+        width_dram_access_transistor = 0.079;
+        curr_I_on_dram_cell = 36e-6; // A
+        curr_I_off_dram_cell_worst_case_length_temp = 19.5e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        // LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2.1e-3;
+        v_th[3] = 0.44559;
+        c_ox[3] = 1.41e-14;
+        mobility_eff[3] = 426.30 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.181;
+        c_g_ideal[3] = 1.10e-15;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 456e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 2.54e-11;
+        I_off_n[3][10] = 3.94e-11;
+        I_off_n[3][20] = 5.95e-11;
+        I_off_n[3][30] = 8.79e-11;
+        I_off_n[3][40] = 1.27e-10;
+        I_off_n[3][50] = 1.79e-10;
+        I_off_n[3][60] = 2.47e-10;
+        I_off_n[3][70] = 3.31e-10;
+        I_off_n[3][80] = 4.26e-10;
+        I_off_n[3][90] = 5.27e-10;
+        I_off_n[3][100] = 6.46e-10;
+      } else if (ram_cell_tech_type == comm_dram) {
+        // COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.1;
+        Lphy[3] = 0.045;
+        Lelec[3] = 0.0298;
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.045;
+        curr_I_on_dram_cell = 20e-6; // A
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6 * 0.045 * 0.045;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        // COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.7;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.98e-15;
+        mobility_eff[3] = 368.58 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.147;
+        c_g_ideal[3] = 3.59e-16;
+        c_fringe[3] = 0.08e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 999.4e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.31e-14;
+        I_off_n[3][10] = 2.68e-14;
+        I_off_n[3][20] = 5.25e-14;
+        I_off_n[3][30] = 9.88e-14;
+        I_off_n[3][40] = 1.79e-13;
+        I_off_n[3][50] = 3.15e-13;
+        I_off_n[3][60] = 5.36e-13;
+        I_off_n[3][70] = 8.86e-13;
+        I_off_n[3][80] = 1.42e-12;
+        I_off_n[3][90] = 2.20e-12;
+        I_off_n[3][100] = 3.29e-12;
+      }
+
+      // SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      // CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      // Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7 * 0.7;
+      curr_core_tx_density = 1.25;
+      curr_sckt_co_eff = 1.1387;
+      curr_chip_layout_overhead =
+          1.2; // die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead =
+          1.1; // EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 32) {
+      SENSE_AMP_D = .03e-9;   // s
+      SENSE_AMP_P = 2.16e-15; // J
+      // For 2013, MPU/ASIC stagger-contacted M1 half-pitch is 32 nm (so this is
+      // 32 nm technology i.e. FEATURESIZE = 0.032). Using the SOI process
+      // numbers for HP and LSTP.
+      vdd[0] = 0.9;
+      Lphy[0] = 0.013;
+      Lelec[0] = 0.01013;
+      t_ox[0] = 0.5e-3;
+      v_th[0] = 0.21835;
+      c_ox[0] = 4.11e-14;
+      mobility_eff[0] = 361.84 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[0] = 5.09E-2;
+      c_g_ideal[0] = 5.34e-16;
+      c_fringe[0] = 0.04e-15;
+      c_junc[0] = 1e-15;
+      I_on_n[0] = 2211.7e-6;
+      I_on_p[0] = I_on_n[0] / 2;
+      nmos_effective_resistance_multiplier = 1.49;
+      n_to_p_eff_curr_drv_ratio[0] = 2.41;
+      gmp_to_gmn_multiplier[0] = 1.38;
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] /
+                       I_on_n[0]; // ohm-micron
+      Rpchannelon[0] =
+          n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; // ohm-micron
+      long_channel_leakage_reduction[0] = 1 / 3.706;
+      // Using MASTAR, @300K (380K does not work in MASTAR), increase Lgate
+      // until Ion reduces to 95% or Lgate increase by 5% (DG device can only
+      // increase by 5%), whichever comes first
+      I_off_n[0][0] = 1.52e-7;
+      I_off_n[0][10] = 1.55e-7;
+      I_off_n[0][20] = 1.59e-7;
+      I_off_n[0][30] = 1.68e-7;
+      I_off_n[0][40] = 1.90e-7;
+      I_off_n[0][50] = 2.69e-7;
+      I_off_n[0][60] = 5.32e-7;
+      I_off_n[0][70] = 1.02e-6;
+      I_off_n[0][80] = 1.62e-6;
+      I_off_n[0][90] = 2.73e-6;
+      I_off_n[0][100] = 6.1e-6;
+
+      I_g_on_n[0][0] = 6.55e-8; // A/micron
+      I_g_on_n[0][10] = 6.55e-8;
+      I_g_on_n[0][20] = 6.55e-8;
+      I_g_on_n[0][30] = 6.55e-8;
+      I_g_on_n[0][40] = 6.55e-8;
+      I_g_on_n[0][50] = 6.55e-8;
+      I_g_on_n[0][60] = 6.55e-8;
+      I_g_on_n[0][70] = 6.55e-8;
+      I_g_on_n[0][80] = 6.55e-8;
+      I_g_on_n[0][90] = 6.55e-8;
+      I_g_on_n[0][100] = 6.55e-8;
+
+      //      32 DG
+      //      I_g_on_n[0][0]  = 2.71e-9;//A/micron
+      //      I_g_on_n[0][10] = 2.71e-9;
+      //      I_g_on_n[0][20] = 2.71e-9;
+      //      I_g_on_n[0][30] = 2.71e-9;
+      //      I_g_on_n[0][40] = 2.71e-9;
+      //      I_g_on_n[0][50] = 2.71e-9;
+      //      I_g_on_n[0][60] = 2.71e-9;
+      //      I_g_on_n[0][70] = 2.71e-9;
+      //      I_g_on_n[0][80] = 2.71e-9;
+      //      I_g_on_n[0][90] = 2.71e-9;
+      //      I_g_on_n[0][100] = 2.71e-9;
+
+      // LSTP device type
+      vdd[1] = 1;
+      Lphy[1] = 0.020;
+      Lelec[1] = 0.0173;
+      t_ox[1] = 1.2e-3;
+      v_th[1] = 0.513;
+      c_ox[1] = 2.29e-14;
+      mobility_eff[1] = 347.46 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[1] = 8.64e-2;
+      c_g_ideal[1] = 4.58e-16;
+      c_fringe[1] = 0.053e-15;
+      c_junc[1] = 1e-15;
+      I_on_n[1] = 683.6e-6;
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2.23;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] =
+          nmos_effective_resistance_multiplier * vdd[1] / I_on_n[1];
+      Rpchannelon[1] = n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1];
+      long_channel_leakage_reduction[1] = 1 / 1.93;
+      I_off_n[1][0] = 2.06e-11;
+      I_off_n[1][10] = 3.30e-11;
+      I_off_n[1][20] = 5.15e-11;
+      I_off_n[1][30] = 7.83e-11;
+      I_off_n[1][40] = 1.16e-10;
+      I_off_n[1][50] = 1.69e-10;
+      I_off_n[1][60] = 2.40e-10;
+      I_off_n[1][70] = 3.34e-10;
+      I_off_n[1][80] = 4.54e-10;
+      I_off_n[1][90] = 5.96e-10;
+      I_off_n[1][100] = 7.44e-10;
+
+      I_g_on_n[1][0] = 3.73e-11; // A/micron
+      I_g_on_n[1][10] = 3.73e-11;
+      I_g_on_n[1][20] = 3.73e-11;
+      I_g_on_n[1][30] = 3.73e-11;
+      I_g_on_n[1][40] = 3.73e-11;
+      I_g_on_n[1][50] = 3.73e-11;
+      I_g_on_n[1][60] = 3.73e-11;
+      I_g_on_n[1][70] = 3.73e-11;
+      I_g_on_n[1][80] = 3.73e-11;
+      I_g_on_n[1][90] = 3.73e-11;
+      I_g_on_n[1][100] = 3.73e-11;
+
+      // LOP device type
+      vdd[2] = 0.6;
+      Lphy[2] = 0.016;
+      Lelec[2] = 0.01232;
+      t_ox[2] = 0.9e-3;
+      v_th[2] = 0.24227;
+      c_ox[2] = 2.84e-14;
+      mobility_eff[2] = 513.52 * (1e-2 * 1e6 * 1e-2 * 1e6);
+      Vdsat[2] = 4.64e-2;
+      c_g_ideal[2] = 4.54e-16;
+      c_fringe[2] = 0.057e-15;
+      c_junc[2] = 1e-15;
+      I_on_n[2] = 827.8e-6;
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.73;
+      n_to_p_eff_curr_drv_ratio[2] = 2.28;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] =
+          nmos_effective_resistance_multiplier * vdd[2] / I_on_n[2];
+      Rpchannelon[2] = n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2];
+      long_channel_leakage_reduction[2] = 1 / 1.89;
+      I_off_n[2][0] = 5.94e-8;
+      I_off_n[2][10] = 7.23e-8;
+      I_off_n[2][20] = 8.7e-8;
+      I_off_n[2][30] = 1.04e-7;
+      I_off_n[2][40] = 1.22e-7;
+      I_off_n[2][50] = 1.43e-7;
+      I_off_n[2][60] = 1.65e-7;
+      I_off_n[2][70] = 1.90e-7;
+      I_off_n[2][80] = 2.15e-7;
+      I_off_n[2][90] = 2.39e-7;
+      I_off_n[2][100] = 2.63e-7;
+
+      I_g_on_n[2][0] = 2.93e-9; // A/micron
+      I_g_on_n[2][10] = 2.93e-9;
+      I_g_on_n[2][20] = 2.93e-9;
+      I_g_on_n[2][30] = 2.93e-9;
+      I_g_on_n[2][40] = 2.93e-9;
+      I_g_on_n[2][50] = 2.93e-9;
+      I_g_on_n[2][60] = 2.93e-9;
+      I_g_on_n[2][70] = 2.93e-9;
+      I_g_on_n[2][80] = 2.93e-9;
+      I_g_on_n[2][90] = 2.93e-9;
+      I_g_on_n[2][100] = 2.93e-9;
+
+      if (ram_cell_tech_type == lp_dram) {
+        // LP-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.056;
+        Lelec[3] = 0.0419; // Assume Lelec is 30% lesser than Lphy for DRAM
+                           // access and wordline transistors.
+        curr_v_th_dram_access_transistor = 0.44129;
+        width_dram_access_transistor = 0.056;
+        curr_I_on_dram_cell = 36e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 18.9e-12;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = width_dram_access_transistor * Lphy[3] * 10.0;
+        curr_asp_ratio_cell_dram = 1.46;
+        curr_c_dram_cell = 20e-15;
+
+        // LP-DRAM wordline transistor parameters
+        curr_vpp = 1.5;
+        t_ox[3] = 2e-3;
+        v_th[3] = 0.44467;
+        c_ox[3] = 1.48e-14;
+        mobility_eff[3] = 408.12 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.174;
+        c_g_ideal[3] = 7.45e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1055.4e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.65;
+        n_to_p_eff_curr_drv_ratio[3] = 2.05;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 3.57e-11;
+        I_off_n[3][10] = 5.51e-11;
+        I_off_n[3][20] = 8.27e-11;
+        I_off_n[3][30] = 1.21e-10;
+        I_off_n[3][40] = 1.74e-10;
+        I_off_n[3][50] = 2.45e-10;
+        I_off_n[3][60] = 3.38e-10;
+        I_off_n[3][70] = 4.53e-10;
+        I_off_n[3][80] = 5.87e-10;
+        I_off_n[3][90] = 7.29e-10;
+        I_off_n[3][100] = 8.87e-10;
+      } else if (ram_cell_tech_type == comm_dram) {
+        // COMM-DRAM cell access transistor technology parameters
+        curr_vdd_dram_cell = 1.0;
+        Lphy[3] = 0.032;
+        Lelec[3] = 0.0205; // Assume Lelec is 30% lesser than Lphy for DRAM
+                           // access and wordline transistors.
+        curr_v_th_dram_access_transistor = 1;
+        width_dram_access_transistor = 0.032;
+        curr_I_on_dram_cell = 20e-6;
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15;
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6 * 0.032 * 0.032;
+        curr_asp_ratio_cell_dram = 1.5;
+        curr_c_dram_cell = 30e-15;
+
+        // COMM-DRAM wordline transistor parameters
+        curr_vpp = 2.6;
+        t_ox[3] = 4e-3;
+        v_th[3] = 1.0;
+        c_ox[3] = 7.99e-15;
+        mobility_eff[3] = 380.76 * (1e-2 * 1e6 * 1e-2 * 1e6);
+        Vdsat[3] = 0.129;
+        c_g_ideal[3] = 2.56e-16;
+        c_fringe[3] = 0.053e-15;
+        c_junc[3] = 1e-15;
+        I_on_n[3] = 1024.5e-6;
+        I_on_p[3] = I_on_n[3] / 2;
+        nmos_effective_resistance_multiplier = 1.69;
+        n_to_p_eff_curr_drv_ratio[3] = 1.95;
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] =
+            nmos_effective_resistance_multiplier * curr_vpp / I_on_n[3];
+        Rpchannelon[3] = n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3];
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 3.63e-14;
+        I_off_n[3][10] = 7.18e-14;
+        I_off_n[3][20] = 1.36e-13;
+        I_off_n[3][30] = 2.49e-13;
+        I_off_n[3][40] = 4.41e-13;
+        I_off_n[3][50] = 7.55e-13;
+        I_off_n[3][60] = 1.26e-12;
+        I_off_n[3][70] = 2.03e-12;
+        I_off_n[3][80] = 3.19e-12;
+        I_off_n[3][90] = 4.87e-12;
+        I_off_n[3][100] = 7.16e-12;
+      }
+
+      // SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      // CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      // Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7 * 0.7 * 0.7;
+      curr_core_tx_density = 1.25 / 0.7;
+      curr_sckt_co_eff = 1.1111;
+      curr_chip_layout_overhead =
+          1.2; // die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead =
+          1.1; // EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 22) {
+      // For 2016, MPU/ASIC stagger-contacted M1 half-pitch is 22 nm (so this is
+      // 22 nm technology i.e. FEATURESIZE = 0.022). Using the DG process
+      // numbers for HP. 22 nm HP
+      vdd[0] = 0.8;
+      Lphy[0] = 0.009;    // Lphy is the physical gate-length.
+      Lelec[0] = 0.00468; // Lelec is the electrical gate-length.
+      t_ox[0] = 0.55e-3;  // micron
+      v_th[0] = 0.1395;   // V
+      c_ox[0] = 3.63e-14; // F/micron2
+      mobility_eff[0] = 426.07 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+      Vdsat[0] = 2.33e-2;                                   // V/micron
+      c_g_ideal[0] = 3.27e-16;                              // F/micron
+      c_fringe[0] = 0.06e-15;                               // F/micron
+      c_junc[0] = 0;                                        // F/micron2
+      I_on_n[0] = 2626.4e-6;                                // A/micron
+      I_on_p[0] =
+          I_on_n[0] / 2; // A/micron //This value for I_on_p is not really used.
+      nmos_effective_resistance_multiplier = 1.45;
+      n_to_p_eff_curr_drv_ratio[0] =
+          2; // Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+      //"Dynamic" tab of Device workspace.
+      gmp_to_gmn_multiplier[0] = 1.38; // Just using the 32nm SOI value.
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] /
+                       I_on_n[0]; // ohm-micron
+      Rpchannelon[0] =
+          n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; // ohm-micron
+      long_channel_leakage_reduction[0] = 1 / 3.274;
+      I_off_n[0][0] =
+          1.52e-7 / 1.5 *
+          1.2; // From 22nm, leakage current are directly from ITRS report
+               // rather than MASTAR, since MASTAR has serious bugs there.
+      I_off_n[0][10] = 1.55e-7 / 1.5 * 1.2;
+      I_off_n[0][20] = 1.59e-7 / 1.5 * 1.2;
+      I_off_n[0][30] = 1.68e-7 / 1.5 * 1.2;
+      I_off_n[0][40] = 1.90e-7 / 1.5 * 1.2;
+      I_off_n[0][50] = 2.69e-7 / 1.5 * 1.2;
+      I_off_n[0][60] = 5.32e-7 / 1.5 * 1.2;
+      I_off_n[0][70] = 1.02e-6 / 1.5 * 1.2;
+      I_off_n[0][80] = 1.62e-6 / 1.5 * 1.2;
+      I_off_n[0][90] = 2.73e-6 / 1.5 * 1.2;
+      I_off_n[0][100] = 6.1e-6 / 1.5 * 1.2;
+      // for 22nm DG HP
+      I_g_on_n[0][0] = 1.81e-9; // A/micron
+      I_g_on_n[0][10] = 1.81e-9;
+      I_g_on_n[0][20] = 1.81e-9;
+      I_g_on_n[0][30] = 1.81e-9;
+      I_g_on_n[0][40] = 1.81e-9;
+      I_g_on_n[0][50] = 1.81e-9;
+      I_g_on_n[0][60] = 1.81e-9;
+      I_g_on_n[0][70] = 1.81e-9;
+      I_g_on_n[0][80] = 1.81e-9;
+      I_g_on_n[0][90] = 1.81e-9;
+      I_g_on_n[0][100] = 1.81e-9;
+
+      // 22 nm LSTP DG
+      vdd[1] = 0.8;
+      Lphy[1] = 0.014;
+      Lelec[1] = 0.008;   // Lelec is the electrical gate-length.
+      t_ox[1] = 1.1e-3;   // micron
+      v_th[1] = 0.40126;  // V
+      c_ox[1] = 2.30e-14; // F/micron2
+      mobility_eff[1] = 738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+      Vdsat[1] = 6.64e-2;                                   // V/micron
+      c_g_ideal[1] = 3.22e-16;                              // F/micron
+      c_fringe[1] = 0.08e-15;
+      c_junc[1] = 0;        // F/micron2
+      I_on_n[1] = 727.6e-6; // A/micron
+      I_on_p[1] = I_on_n[1] / 2;
+      nmos_effective_resistance_multiplier = 1.99;
+      n_to_p_eff_curr_drv_ratio[1] = 2;
+      gmp_to_gmn_multiplier[1] = 0.99;
+      Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] /
+                       I_on_n[1]; // ohm-micron
+      Rpchannelon[1] =
+          n_to_p_eff_curr_drv_ratio[1] * Rnchannelon[1]; // ohm-micron
+      long_channel_leakage_reduction[1] = 1 / 1.89;
+      I_off_n[1][0] = 2.43e-11; // A/micron
+      I_off_n[1][10] = 4.85e-11;
+      I_off_n[1][20] = 9.68e-11;
+      I_off_n[1][30] = 1.94e-10;
+      I_off_n[1][40] = 3.87e-10;
+      I_off_n[1][50] = 7.73e-10;
+      I_off_n[1][60] = 1.55e-9; // keeps the table's ~2x-per-step growth
+      I_off_n[1][70] = 3.09e-9;
+      I_off_n[1][80] = 6.19e-9;
+      I_off_n[1][90] = 1.24e-8;
+      I_off_n[1][100] = 2.48e-8;
+
+      I_g_on_n[1][0] = 4.51e-10; // A/micron
+      I_g_on_n[1][10] = 4.51e-10;
+      I_g_on_n[1][20] = 4.51e-10;
+      I_g_on_n[1][30] = 4.51e-10;
+      I_g_on_n[1][40] = 4.51e-10;
+      I_g_on_n[1][50] = 4.51e-10;
+      I_g_on_n[1][60] = 4.51e-10;
+      I_g_on_n[1][70] = 4.51e-10;
+      I_g_on_n[1][80] = 4.51e-10;
+      I_g_on_n[1][90] = 4.51e-10;
+      I_g_on_n[1][100] = 4.51e-10;
+
+      // 22 nm LOP
+      vdd[2] = 0.6;
+      Lphy[2] = 0.011;
+      Lelec[2] = 0.00604; // Lelec is the electrical gate-length.
+      t_ox[2] = 0.8e-3;   // micron
+      v_th[2] = 0.2315;   // V
+      c_ox[2] = 2.87e-14; // F/micron2
+      mobility_eff[2] = 698.37 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+      Vdsat[2] = 1.81e-2;                                   // V/micron
+      c_g_ideal[2] = 3.16e-16;                              // F/micron
+      c_fringe[2] = 0.08e-15;
+      c_junc[2] =
+          0; // F/micron2 This is Cj0 not Cjunc in MASTAR results->Dynamic Tab
+      I_on_n[2] = 916.1e-6; // A/micron
+      I_on_p[2] = I_on_n[2] / 2;
+      nmos_effective_resistance_multiplier = 1.73;
+      n_to_p_eff_curr_drv_ratio[2] = 2;
+      gmp_to_gmn_multiplier[2] = 1.11;
+      Rnchannelon[2] = nmos_effective_resistance_multiplier * vdd[2] /
+                       I_on_n[2]; // ohm-micron
+      Rpchannelon[2] =
+          n_to_p_eff_curr_drv_ratio[2] * Rnchannelon[2]; // ohm-micron
+      long_channel_leakage_reduction[2] = 1 / 2.38;
+
+      I_off_n[2][0] = 1.31e-8;
+      I_off_n[2][10] = 2.60e-8;
+      I_off_n[2][20] = 5.14e-8;
+      I_off_n[2][30] = 1.02e-7;
+      I_off_n[2][40] = 2.02e-7;
+      I_off_n[2][50] = 3.99e-7;
+      I_off_n[2][60] = 7.91e-7;
+      I_off_n[2][70] = 1.09e-6;
+      I_off_n[2][80] = 2.09e-6;
+      I_off_n[2][90] = 4.04e-6;
+      I_off_n[2][100] = 4.48e-6;
+
+      I_g_on_n[2][0] = 2.74e-9; // A/micron
+      I_g_on_n[2][10] = 2.74e-9;
+      I_g_on_n[2][20] = 2.74e-9;
+      I_g_on_n[2][30] = 2.74e-9;
+      I_g_on_n[2][40] = 2.74e-9;
+      I_g_on_n[2][50] = 2.74e-9;
+      I_g_on_n[2][60] = 2.74e-9;
+      I_g_on_n[2][70] = 2.74e-9;
+      I_g_on_n[2][80] = 2.74e-9;
+      I_g_on_n[2][90] = 2.74e-9;
+      I_g_on_n[2][100] = 2.74e-9;
+
+      if (ram_cell_tech_type == 3) {
+      } else if (ram_cell_tech_type == 4) {
+        // 22 nm commodity DRAM cell access transistor technology
+        // parameters.
+        curr_vdd_dram_cell = 0.9; // 0.45;//This value has reduced greatly in
+                                  // 2007 ITRS for all technology nodes. In
+        // 2005 ITRS, the value was about twice the value in 2007 ITRS
+        Lphy[3] = 0.022;                      // micron
+        Lelec[3] = 0.0181;                    // micron.
+        curr_v_th_dram_access_transistor = 1; // V
+        width_dram_access_transistor = 0.022; // micron
+        curr_I_on_dram_cell =
+            20e-6; // This is a typical value that I have always
+        // kept constant. In reality this could perhaps be lower
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15; // A
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6 * 0.022 * 0.022; // micron2.
+        curr_asp_ratio_cell_dram = 0.667;
+        curr_c_dram_cell = 30e-15; // This is a typical value that I have always
+        // kept constant.
+
+        // 22 nm commodity DRAM wordline transistor parameters obtained using
+        // MASTAR.
+        curr_vpp = 2.3;                                       // vpp. V
+        t_ox[3] = 3.5e-3;                                     // micron
+        v_th[3] = 1.0;                                        // V
+        c_ox[3] = 9.06e-15;                                   // F/micron2
+        mobility_eff[3] = 367.29 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+        Vdsat[3] = 0.0972;                                    // V/micron
+        c_g_ideal[3] = 1.99e-16;                              // F/micron
+        c_fringe[3] = 0.053e-15;                              // F/micron
+        c_junc[3] = 1e-15;                                    // F/micron2
+        I_on_n[3] = 910.5e-6;                                 // A/micron
+        I_on_p[3] = I_on_n[3] / 2; // This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier =
+            1.69; // Using the value from 32nm.
+        //
+        n_to_p_eff_curr_drv_ratio[3] = 1.95; // Using the value from 32nm
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp /
+                         I_on_n[3]; // ohm-micron
+        Rpchannelon[3] =
+            n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; // ohm-micron
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.1e-13; // A/micron
+        I_off_n[3][10] = 2.11e-13;
+        I_off_n[3][20] = 3.88e-13;
+        I_off_n[3][30] = 6.9e-13;
+        I_off_n[3][40] = 1.19e-12;
+        I_off_n[3][50] = 1.98e-12;
+        I_off_n[3][60] = 3.22e-12;
+        I_off_n[3][70] = 5.09e-12;
+        I_off_n[3][80] = 7.85e-12;
+        I_off_n[3][90] = 1.18e-11;
+        I_off_n[3][100] = 1.72e-11;
+
+      } else {
+        // some error handler
+      }
+
+      // SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      // CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      // Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7 * 0.7 * 0.7 * 0.7;
+      curr_core_tx_density = 1.25 / 0.7 / 0.7;
+      curr_sckt_co_eff = 1.1296;
+      curr_chip_layout_overhead =
+          1.2; // die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead =
+          1.1; // EDA placement and routing tool rule of thumb
+    }
+
+    if (tech == 16) {
+      // For 2019, MPU/ASIC stagger-contacted M1 half-pitch is 16 nm (so this is
+      // 16 nm technology i.e. FEATURESIZE = 0.016). Using the DG process
+      // numbers for HP. 16 nm HP
+      vdd[0] = 0.7;
+      Lphy[0] = 0.006;    // Lphy is the physical gate-length.
+      Lelec[0] = 0.00315; // Lelec is the electrical gate-length.
+      t_ox[0] = 0.5e-3;   // micron
+      v_th[0] = 0.1489;   // V
+      c_ox[0] = 3.83e-14; // F/micron2 Cox_elec in MASTAR
+      mobility_eff[0] = 476.15 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+      Vdsat[0] = 1.42e-2;      // V/micron calculated in spreadsheet
+      c_g_ideal[0] = 2.30e-16; // F/micron
+      c_fringe[0] = 0.06e-15;  // F/micron MASTAR inputdynamic/3
+      c_junc[0] = 0;           // F/micron2 MASTAR result dynamic
+      I_on_n[0] = 2768.4e-6;   // A/micron
+      I_on_p[0] =
+          I_on_n[0] / 2; // A/micron //This value for I_on_p is not really used.
+      nmos_effective_resistance_multiplier =
+          1.48; // nmos_effective_resistance_multiplier  is the ratio of Ieff to
+                // Idsat where Ieff is the effective NMOS current and Idsat is
+                // the saturation current.
+      n_to_p_eff_curr_drv_ratio[0] =
+          2; // Wpmos/Wnmos = 2 in 2007 MASTAR. Look in
+      //"Dynamic" tab of Device workspace.
+      gmp_to_gmn_multiplier[0] = 1.38; // Just using the 32nm SOI value.
+      Rnchannelon[0] = nmos_effective_resistance_multiplier * vdd[0] /
+                       I_on_n[0]; // ohm-micron
+      Rpchannelon[0] =
+          n_to_p_eff_curr_drv_ratio[0] * Rnchannelon[0]; // ohm-micron
+      long_channel_leakage_reduction[0] = 1 / 2.655;
+      I_off_n[0][0] = 1.52e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][10] = 1.55e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][20] = 1.59e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][30] = 1.68e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][40] = 1.90e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][50] = 2.69e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][60] = 5.32e-7 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][70] = 1.02e-6 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][80] = 1.62e-6 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][90] = 2.73e-6 / 1.5 * 1.2 * 1.07;
+      I_off_n[0][100] = 6.1e-6 / 1.5 * 1.2 * 1.07;
+      // for 16nm DG HP
+      I_g_on_n[0][0] = 1.07e-9; // A/micron
+      I_g_on_n[0][10] = 1.07e-9;
+      I_g_on_n[0][20] = 1.07e-9;
+      I_g_on_n[0][30] = 1.07e-9;
+      I_g_on_n[0][40] = 1.07e-9;
+      I_g_on_n[0][50] = 1.07e-9;
+      I_g_on_n[0][60] = 1.07e-9;
+      I_g_on_n[0][70] = 1.07e-9;
+      I_g_on_n[0][80] = 1.07e-9;
+      I_g_on_n[0][90] = 1.07e-9;
+      I_g_on_n[0][100] = 1.07e-9;
+
+      //    	//16 nm LSTP DG
+      //    	vdd[1] = 0.8;
+      //    	Lphy[1] = 0.014;
+      //    	Lelec[1] = 0.008;//Lelec is the electrical gate-length.
+      //    	t_ox[1] = 1.1e-3;//micron
+      //    	v_th[1] = 0.40126;//V
+      //    	c_ox[1] = 2.30e-14;//F/micron2
+      //    	mobility_eff[1] =  738.09 * (1e-2 * 1e6 * 1e-2 * 1e6); //micron2
+      //    / Vs 	Vdsat[1] = 6.64e-2; //V/micron 	c_g_ideal[1]
+      //    = 3.22e-16;//F/micron 	c_fringe[1] = 0.008e-15; c_junc[1]
+      //    =
+      //    0;//F/micron2 	I_on_n[1] = 727.6e-6;//A/micron I_on_p[1] =
+      //    I_on_n[1] / 2; 	nmos_effective_resistance_multiplier = 1.99;
+      //    	n_to_p_eff_curr_drv_ratio[1] = 2;
+      //    	gmp_to_gmn_multiplier[1] = 0.99;
+      //    	Rnchannelon[1] = nmos_effective_resistance_multiplier * vdd[1] /
+      //    I_on_n[1];//ohm-micron 	Rpchannelon[1] =
+      //    n_to_p_eff_curr_drv_ratio[1]
+      //    * Rnchannelon[1];//ohm-micron 	I_off_n[1][0] = 2.43e-11;
+      //    	I_off_n[1][10] = 4.85e-11;
+      //    	I_off_n[1][20] = 9.68e-11;
+      //    	I_off_n[1][30] = 1.94e-10;
+      //    	I_off_n[1][40] = 3.87e-10;
+      //    	I_off_n[1][50] = 7.73e-10;
+      //    	I_off_n[1][60] = 3.55e-10;
+      //    	I_off_n[1][70] = 3.09e-9;
+      //    	I_off_n[1][80] = 6.19e-9;
+      //    	I_off_n[1][90] = 1.24e-8;
+      //    	I_off_n[1][100]= 2.48e-8;
+      //
+      //    	//    for 22nm LSTP HP
+      //    	I_g_on_n[1][0]  = 4.51e-10;//A/micron
+      //    	I_g_on_n[1][10] = 4.51e-10;
+      //    	I_g_on_n[1][20] = 4.51e-10;
+      //    	I_g_on_n[1][30] = 4.51e-10;
+      //    	I_g_on_n[1][40] = 4.51e-10;
+      //    	I_g_on_n[1][50] = 4.51e-10;
+      //    	I_g_on_n[1][60] = 4.51e-10;
+      //    	I_g_on_n[1][70] = 4.51e-10;
+      //    	I_g_on_n[1][80] = 4.51e-10;
+      //    	I_g_on_n[1][90] = 4.51e-10;
+      //    	I_g_on_n[1][100] = 4.51e-10;
+
+      if (ram_cell_tech_type == 3) {
+      } else if (ram_cell_tech_type == 4) {
+        // 22 nm commodity DRAM cell access transistor technology parameters.
+        // parameters
+        curr_vdd_dram_cell = 0.9; // 0.45;//This value has reduced greatly in
+                                  // 2007 ITRS for all technology nodes. In
+        // 2005 ITRS, the value was about twice the value in 2007 ITRS
+        Lphy[3] = 0.022;                      // micron
+        Lelec[3] = 0.0181;                    // micron.
+        curr_v_th_dram_access_transistor = 1; // V
+        width_dram_access_transistor = 0.022; // micron
+        curr_I_on_dram_cell =
+            20e-6; // This is a typical value that I have always
+        // kept constant. In reality this could perhaps be lower
+        curr_I_off_dram_cell_worst_case_length_temp = 1e-15; // A
+        curr_Wmemcella_dram = width_dram_access_transistor;
+        curr_Wmemcellpmos_dram = 0;
+        curr_Wmemcellnmos_dram = 0;
+        curr_area_cell_dram = 6 * 0.022 * 0.022; // micron2.
+        curr_asp_ratio_cell_dram = 0.667;
+        curr_c_dram_cell = 30e-15; // This is a typical value that I have alwaus
+        // kept constant.
+
+        // 22 nm commodity DRAM wordline transistor parameters obtained using
+        // MASTAR.
+        curr_vpp = 2.3;                                       // vpp. V
+        t_ox[3] = 3.5e-3;                                     // micron
+        v_th[3] = 1.0;                                        // V
+        c_ox[3] = 9.06e-15;                                   // F/micron2
+        mobility_eff[3] = 367.29 * (1e-2 * 1e6 * 1e-2 * 1e6); // micron2 / Vs
+        Vdsat[3] = 0.0972;                                    // V/micron
+        c_g_ideal[3] = 1.99e-16;                              // F/micron
+        c_fringe[3] = 0.053e-15;                              // F/micron
+        c_junc[3] = 1e-15;                                    // F/micron2
+        I_on_n[3] = 910.5e-6;                                 // A/micron
+        I_on_p[3] = I_on_n[3] / 2; // This value for I_on_p is not really used.
+        nmos_effective_resistance_multiplier =
+            1.69; // Using the value from 32nm.
+        //
+        n_to_p_eff_curr_drv_ratio[3] = 1.95; // Using the value from 32nm
+        gmp_to_gmn_multiplier[3] = 0.90;
+        Rnchannelon[3] = nmos_effective_resistance_multiplier * curr_vpp /
+                         I_on_n[3]; // ohm-micron
+        Rpchannelon[3] =
+            n_to_p_eff_curr_drv_ratio[3] * Rnchannelon[3]; // ohm-micron
+        long_channel_leakage_reduction[3] = 1;
+        I_off_n[3][0] = 1.1e-13; // A/micron
+        I_off_n[3][10] = 2.11e-13;
+        I_off_n[3][20] = 3.88e-13;
+        I_off_n[3][30] = 6.9e-13;
+        I_off_n[3][40] = 1.19e-12;
+        I_off_n[3][50] = 1.98e-12;
+        I_off_n[3][60] = 3.22e-12;
+        I_off_n[3][70] = 5.09e-12;
+        I_off_n[3][80] = 7.85e-12;
+        I_off_n[3][90] = 1.18e-11;
+        I_off_n[3][100] = 1.72e-11;
+
+      } else {
+        // some error handler
+      }
+
+      // SRAM cell properties
+      curr_Wmemcella_sram = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_sram = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_sram = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_sram = 146 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_sram = 1.46;
+      // CAM cell properties //TODO: data need to be revisited
+      curr_Wmemcella_cam = 1.31 * g_ip->F_sz_um;
+      curr_Wmemcellpmos_cam = 1.23 * g_ip->F_sz_um;
+      curr_Wmemcellnmos_cam = 2.08 * g_ip->F_sz_um;
+      curr_area_cell_cam = 292 * g_ip->F_sz_um * g_ip->F_sz_um;
+      curr_asp_ratio_cell_cam = 2.92;
+      // Empirical undifferentiated core/FU coefficient
+      curr_logic_scaling_co_eff = 0.7 * 0.7 * 0.7 * 0.7 * 0.7;
+      curr_core_tx_density = 1.25 / 0.7 / 0.7 / 0.7;
+      curr_sckt_co_eff = 1.1296;
+      curr_chip_layout_overhead =
+          1.2; // die measurement results based on Niagara 1 and 2
+      curr_macro_layout_overhead =
+          1.1; // EDA placement and routing tool rule of thumb
+    }
+
+    g_tp.peri_global.Vdd += curr_alpha * vdd[peri_global_tech_type];
+    g_tp.peri_global.t_ox += curr_alpha * t_ox[peri_global_tech_type];
+    g_tp.peri_global.Vth += curr_alpha * v_th[peri_global_tech_type];
+    g_tp.peri_global.C_ox += curr_alpha * c_ox[peri_global_tech_type];
+    g_tp.peri_global.C_g_ideal += curr_alpha * c_g_ideal[peri_global_tech_type];
+    g_tp.peri_global.C_fringe += curr_alpha * c_fringe[peri_global_tech_type];
+    g_tp.peri_global.C_junc += curr_alpha * c_junc[peri_global_tech_type];
+    g_tp.peri_global.C_junc_sidewall = 0.25e-15; // F/micron
+    g_tp.peri_global.l_phy += curr_alpha * Lphy[peri_global_tech_type];
+    g_tp.peri_global.l_elec += curr_alpha * Lelec[peri_global_tech_type];
+    g_tp.peri_global.I_on_n += curr_alpha * I_on_n[peri_global_tech_type];
+    g_tp.peri_global.R_nch_on +=
+        curr_alpha * Rnchannelon[peri_global_tech_type];
+    g_tp.peri_global.R_pch_on +=
+        curr_alpha * Rpchannelon[peri_global_tech_type];
+    g_tp.peri_global.n_to_p_eff_curr_drv_ratio +=
+        curr_alpha * n_to_p_eff_curr_drv_ratio[peri_global_tech_type];
+    g_tp.peri_global.long_channel_leakage_reduction +=
+        curr_alpha * long_channel_leakage_reduction[peri_global_tech_type];
+    g_tp.peri_global.I_off_n +=
+        curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_off_p +=
+        curr_alpha * I_off_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_n +=
+        curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    g_tp.peri_global.I_g_on_p +=
+        curr_alpha * I_g_on_n[peri_global_tech_type][g_ip->temp - 300];
+    gmp_to_gmn_multiplier_periph_global +=
+        curr_alpha * gmp_to_gmn_multiplier[peri_global_tech_type];
+
+    g_tp.sram_cell.Vdd += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.sram_cell.l_phy += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.sram_cell.l_elec += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.sram_cell.t_ox += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.sram_cell.Vth += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.sram_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.sram_cell.C_fringe += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.sram_cell.C_junc_sidewall = 0.25e-15; // F/micron
+    g_tp.sram_cell.I_on_n += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.sram_cell.R_nch_on += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.R_pch_on += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.sram_cell.n_to_p_eff_curr_drv_ratio +=
+        curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.sram_cell.long_channel_leakage_reduction +=
+        curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.sram_cell.I_off_n +=
+        curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_off_p +=
+        curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_n +=
+        curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.sram_cell.I_g_on_p +=
+        curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram_cell_Vdd += curr_alpha * curr_vdd_dram_cell;
+    g_tp.dram_acc.Vth += curr_alpha * curr_v_th_dram_access_transistor;
+    g_tp.dram_acc.l_phy += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_acc.l_elec += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_fringe += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_acc.C_junc_sidewall = 0.25e-15; // F/micron
+    g_tp.dram_cell_I_on += curr_alpha * curr_I_on_dram_cell;
+    g_tp.dram_cell_I_off_worst_case_len_temp +=
+        curr_alpha * curr_I_off_dram_cell_worst_case_length_temp;
+    g_tp.dram_acc.I_on_n += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_cell_C += curr_alpha * curr_c_dram_cell;
+    g_tp.vpp += curr_alpha * curr_vpp;
+    g_tp.dram_wl.l_phy += curr_alpha * Lphy[dram_cell_tech_flavor];
+    g_tp.dram_wl.l_elec += curr_alpha * Lelec[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_g_ideal += curr_alpha * c_g_ideal[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_fringe += curr_alpha * c_fringe[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc += curr_alpha * c_junc[dram_cell_tech_flavor];
+    g_tp.dram_wl.C_junc_sidewall = 0.25e-15; // F/micron
+    g_tp.dram_wl.I_on_n += curr_alpha * I_on_n[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_nch_on += curr_alpha * Rnchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.R_pch_on += curr_alpha * Rpchannelon[dram_cell_tech_flavor];
+    g_tp.dram_wl.n_to_p_eff_curr_drv_ratio +=
+        curr_alpha * n_to_p_eff_curr_drv_ratio[dram_cell_tech_flavor];
+    g_tp.dram_wl.long_channel_leakage_reduction +=
+        curr_alpha * long_channel_leakage_reduction[dram_cell_tech_flavor];
+    g_tp.dram_wl.I_off_n +=
+        curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+    g_tp.dram_wl.I_off_p +=
+        curr_alpha * I_off_n[dram_cell_tech_flavor][g_ip->temp - 300];
+
+    g_tp.cam_cell.Vdd += curr_alpha * vdd[ram_cell_tech_type];
+    g_tp.cam_cell.l_phy += curr_alpha * Lphy[ram_cell_tech_type];
+    g_tp.cam_cell.l_elec += curr_alpha * Lelec[ram_cell_tech_type];
+    g_tp.cam_cell.t_ox += curr_alpha * t_ox[ram_cell_tech_type];
+    g_tp.cam_cell.Vth += curr_alpha * v_th[ram_cell_tech_type];
+    g_tp.cam_cell.C_g_ideal += curr_alpha * c_g_ideal[ram_cell_tech_type];
+    g_tp.cam_cell.C_fringe += curr_alpha * c_fringe[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc += curr_alpha * c_junc[ram_cell_tech_type];
+    g_tp.cam_cell.C_junc_sidewall = 0.25e-15; // F/micron
+    g_tp.cam_cell.I_on_n += curr_alpha * I_on_n[ram_cell_tech_type];
+    g_tp.cam_cell.R_nch_on += curr_alpha * Rnchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.R_pch_on += curr_alpha * Rpchannelon[ram_cell_tech_type];
+    g_tp.cam_cell.n_to_p_eff_curr_drv_ratio +=
+        curr_alpha * n_to_p_eff_curr_drv_ratio[ram_cell_tech_type];
+    g_tp.cam_cell.long_channel_leakage_reduction +=
+        curr_alpha * long_channel_leakage_reduction[ram_cell_tech_type];
+    g_tp.cam_cell.I_off_n +=
+        curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_off_p +=
+        curr_alpha * I_off_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_n +=
+        curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+    g_tp.cam_cell.I_g_on_p +=
+        curr_alpha * I_g_on_n[ram_cell_tech_type][g_ip->temp - 300];
+
+    g_tp.dram.cell_a_w += curr_alpha * curr_Wmemcella_dram;
+    g_tp.dram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_dram;
+    g_tp.dram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_dram;
+    area_cell_dram += curr_alpha * curr_area_cell_dram;
+    asp_ratio_cell_dram += curr_alpha * curr_asp_ratio_cell_dram;
+
+    g_tp.sram.cell_a_w += curr_alpha * curr_Wmemcella_sram;
+    g_tp.sram.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_sram;
+    g_tp.sram.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_sram;
+    area_cell_sram += curr_alpha * curr_area_cell_sram;
+    asp_ratio_cell_sram += curr_alpha * curr_asp_ratio_cell_sram;
+
+    g_tp.cam.cell_a_w += curr_alpha * curr_Wmemcella_cam; // sheng
+    g_tp.cam.cell_pmos_w += curr_alpha * curr_Wmemcellpmos_cam;
+    g_tp.cam.cell_nmos_w += curr_alpha * curr_Wmemcellnmos_cam;
+    area_cell_cam += curr_alpha * curr_area_cell_cam;
+    asp_ratio_cell_cam += curr_alpha * curr_asp_ratio_cell_cam;
+
+    // Sense amplifier latch Gm calculation
+    mobility_eff_periph_global +=
+        curr_alpha * mobility_eff[peri_global_tech_type];
+    Vdsat_periph_global += curr_alpha * Vdsat[peri_global_tech_type];
+
+    // Empirical undifferentiated core/FU coefficient
+    g_tp.scaling_factor.logic_scaling_co_eff +=
+        curr_alpha * curr_logic_scaling_co_eff;
+    g_tp.scaling_factor.core_tx_density += curr_alpha * curr_core_tx_density;
+    g_tp.chip_layout_overhead += curr_alpha * curr_chip_layout_overhead;
+    g_tp.macro_layout_overhead += curr_alpha * curr_macro_layout_overhead;
+    g_tp.sckt_co_eff += curr_alpha * curr_sckt_co_eff;
+  }
+
+  // Currently we are not modeling the resistance/capacitance of poly anywhere.
+  // Continuous functions (or data that have already been processed) do not
+  // need linear interpolation.
+  g_tp.w_comp_inv_p1 =
+      12.5 * g_ip->F_sz_um; // this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n1 =
+      7.5 * g_ip->F_sz_um; // this was  6 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p2 =
+      25 * g_ip->F_sz_um; // this was 20 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n2 =
+      15 * g_ip->F_sz_um; // this was 12 micron for the 0.8 micron process
+  g_tp.w_comp_inv_p3 =
+      50 * g_ip->F_sz_um; // this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_inv_n3 =
+      30 * g_ip->F_sz_um; // this was 24 micron for the 0.8 micron process
+  g_tp.w_eval_inv_p =
+      100 * g_ip->F_sz_um; // this was 80 micron for the 0.8 micron process
+  g_tp.w_eval_inv_n =
+      50 * g_ip->F_sz_um; // this was 40 micron for the 0.8 micron process
+  g_tp.w_comp_n =
+      12.5 * g_ip->F_sz_um; // this was 10 micron for the 0.8 micron process
+  g_tp.w_comp_p =
+      37.5 * g_ip->F_sz_um; // this was 30 micron for the 0.8 micron process
+
+  g_tp.MIN_GAP_BET_P_AND_N_DIFFS = 5 * g_ip->F_sz_um;
+  g_tp.MIN_GAP_BET_SAME_TYPE_DIFFS = 1.5 * g_ip->F_sz_um;
+  g_tp.HPOWERRAIL = 2 * g_ip->F_sz_um;
+  g_tp.cell_h_def = 50 * g_ip->F_sz_um;
+  g_tp.w_poly_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_contact = g_ip->F_sz_um;
+  g_tp.spacing_poly_to_poly = 1.5 * g_ip->F_sz_um;
+  g_tp.ram_wl_stitching_overhead_ = 7.5 * g_ip->F_sz_um;
+
+  g_tp.min_w_nmos_ = 3 * g_ip->F_sz_um / 2;
+  g_tp.max_w_nmos_ = 100 * g_ip->F_sz_um;
+  g_tp.w_iso = 12.5 * g_ip->F_sz_um; // was 10 micron for the 0.8 micron process
+  g_tp.w_sense_n = 3.75 * g_ip->F_sz_um; // sense amplifier N-trans; was 3
+                                         // micron for the 0.8 micron process
+  g_tp.w_sense_p = 7.5 * g_ip->F_sz_um; // sense amplifier P-trans; was 6 micron
+                                        // for the 0.8 micron process
+  g_tp.w_sense_en =
+      5 * g_ip->F_sz_um; // Sense enable transistor of the sense amplifier; was
+                         // 4 micron for the 0.8 micron process
+  g_tp.w_nmos_b_mux = 6 * g_tp.min_w_nmos_;
+  g_tp.w_nmos_sa_mux = 6 * g_tp.min_w_nmos_;
+
+  if (ram_cell_tech_type == comm_dram) {
+    g_tp.max_w_nmos_dec = 8 * g_ip->F_sz_um;
+    g_tp.h_dec = 8; // in the unit of memory cell height
+  } else {
+    g_tp.max_w_nmos_dec = g_tp.max_w_nmos_;
+    g_tp.h_dec = 4; // in the unit of memory cell height
+  }
+
+  g_tp.peri_global.C_overlap = 0.2 * g_tp.peri_global.C_g_ideal;
+  g_tp.sram_cell.C_overlap = 0.2 * g_tp.sram_cell.C_g_ideal;
+  g_tp.cam_cell.C_overlap = 0.2 * g_tp.cam_cell.C_g_ideal;
+
+  g_tp.dram_acc.C_overlap = 0.2 * g_tp.dram_acc.C_g_ideal;
+  g_tp.dram_acc.R_nch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_n;
+  // g_tp.dram_acc.R_pch_on = g_tp.dram_cell_Vdd / g_tp.dram_acc.I_on_p;
+
+  g_tp.dram_wl.C_overlap = 0.2 * g_tp.dram_wl.C_g_ideal;
+
+  double gmn_sense_amp_latch =
+      (mobility_eff_periph_global / 2) * g_tp.peri_global.C_ox *
+      (g_tp.w_sense_n / g_tp.peri_global.l_elec) * Vdsat_periph_global;
+  double gmp_sense_amp_latch =
+      gmp_to_gmn_multiplier_periph_global * gmn_sense_amp_latch;
+  g_tp.gm_sense_amp_latch = gmn_sense_amp_latch + gmp_sense_amp_latch;
+
+  g_tp.dram.b_w = sqrt(area_cell_dram / (asp_ratio_cell_dram));
+  g_tp.dram.b_h = asp_ratio_cell_dram * g_tp.dram.b_w;
+  g_tp.sram.b_w = sqrt(area_cell_sram / (asp_ratio_cell_sram));
+  g_tp.sram.b_h = asp_ratio_cell_sram * g_tp.sram.b_w;
+  g_tp.cam.b_w = sqrt(area_cell_cam / (asp_ratio_cell_cam)); // Sheng
+  g_tp.cam.b_h = asp_ratio_cell_cam * g_tp.cam.b_w;
+
+  g_tp.dram.Vbitpre = g_tp.dram_cell_Vdd;
+  g_tp.sram.Vbitpre = vdd[ram_cell_tech_type];
+  g_tp.cam.Vbitpre = vdd[ram_cell_tech_type]; // Sheng
+  pmos_to_nmos_sizing_r = pmos_to_nmos_sz_ratio();
+  g_tp.w_pmos_bl_precharge = 6 * pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+  g_tp.w_pmos_bl_eq = pmos_to_nmos_sizing_r * g_tp.min_w_nmos_;
+
+  double wire_pitch[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+      wire_r_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES]
+                       [NUMBER_WIRE_TYPES],
+      wire_c_per_micron[NUMBER_INTERCONNECT_PROJECTION_TYPES]
+                       [NUMBER_WIRE_TYPES],
+      horiz_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES]
+                               [NUMBER_WIRE_TYPES],
+      vert_dielectric_constant[NUMBER_INTERCONNECT_PROJECTION_TYPES]
+                              [NUMBER_WIRE_TYPES],
+      aspect_ratio[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+      miller_value[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES],
+      ild_thickness[NUMBER_INTERCONNECT_PROJECTION_TYPES][NUMBER_WIRE_TYPES];
+
+  for (iter = 0; iter <= 1; ++iter) {
+    // linear interpolation
+    if (iter == 0) {
+      tech = tech_lo;
+      if (tech_lo == tech_hi) {
+        curr_alpha = 1;
+      } else {
+        curr_alpha = (technology - tech_hi) / (tech_lo - tech_hi);
+      }
+    } else {
+      tech = tech_hi;
+      if (tech_lo == tech_hi) {
+        break;
+      } else {
+        curr_alpha = (tech_lo - technology) / (tech_lo - tech_hi);
+      }
+    }
+
+    if (tech == 90) {
+      // Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; // micron
+      aspect_ratio[0][0] = 2.4;
+      wire_width = wire_pitch[0][0] / 2;                // micron
+      wire_thickness = aspect_ratio[0][0] * wire_width; // micron
+      wire_spacing = wire_pitch[0][0] - wire_width;     // micron
+      barrier_thickness = 0.01;                         // micron
+      dishing_thickness = 0;                            // micron
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] = wire_resistance(
+          CU_RESISTIVITY, wire_width, wire_thickness, barrier_thickness,
+          dishing_thickness, alpha_scatter); // ohm/micron
+      ild_thickness[0][0] = 0.48;            // micron
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 2.709;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15; // F/micron
+      wire_c_per_micron[0][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][0],
+          miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0],
+          fringe_cap); // F/micron.
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 2.4;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.48; // micron
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 2.709;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][1],
+          miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1], fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.7;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.96;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 2.709;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][2],
+          miller_value[0][2], horiz_dielectric_constant[0][2],
+          vert_dielectric_constant[0][2], fringe_cap);
+
+      // Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.008;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.48;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 3.038;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][0],
+          miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0], fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.48;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 3.038;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][1],
+          miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1], fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 1.1;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 3.038;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][2],
+          miller_value[1][2], horiz_dielectric_constant[1][2],
+          vert_dielectric_constant[1][2], fringe_cap);
+      // Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.09;
+      wire_c_per_micron[1][3] = 60e-15 / (256 * 2 * 0.09);
+      wire_r_per_micron[1][3] = 12 / 0.09;
+    } else if (tech == 65) {
+      // Aggressive projections
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0] = 2.7;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.405;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 2.303;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][0],
+          miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0], fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 2.7;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.405;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 2.303;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][1],
+          miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1], fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 2.8;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.81;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 2.303;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][2],
+          miller_value[0][2], horiz_dielectric_constant[0][2],
+          vert_dielectric_constant[0][2], fringe_cap);
+
+      // Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.006;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.405;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.734;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][0],
+          miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0], fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.405;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.734;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][1],
+          miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1], fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.77;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.734;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][2],
+          miller_value[1][2], horiz_dielectric_constant[1][2],
+          vert_dielectric_constant[1][2], fringe_cap);
+      // Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.065;
+      wire_c_per_micron[1][3] = 52.5e-15 / (256 * 2 * 0.065);
+      wire_r_per_micron[1][3] = 12 / 0.065;
+    } else if (tech == 45) {
+      // Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.315;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.958;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][0],
+          miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0], fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.315;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.958;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][1],
+          miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1], fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.63;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.958;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][2],
+          miller_value[0][2], horiz_dielectric_constant[0][2],
+          vert_dielectric_constant[0][2], fringe_cap);
+
+      // Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.004;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.315;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.46;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][0],
+          miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0], fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.315;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.46;
+      vert_dielectric_constant[1][1] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][1],
+          miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1], fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.55;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.46;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][2],
+          miller_value[1][2], horiz_dielectric_constant[1][2],
+          vert_dielectric_constant[1][2], fringe_cap);
+      // Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.045;
+      wire_c_per_micron[1][3] = 37.5e-15 / (256 * 2 * 0.045);
+      wire_r_per_micron[1][3] = 12 / 0.045;
+    } else if (tech == 32) {
+      // Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.21;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.664;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][0],
+          miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0], fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.21;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.664;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][1],
+          miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1], fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.42;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.664;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][2],
+          miller_value[0][2], horiz_dielectric_constant[0][2],
+          vert_dielectric_constant[0][2], fringe_cap);
+
+      // Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.003;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[1][0] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.21;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.214;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][0],
+          miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0], fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      aspect_ratio[1][1] = 2.0;
+      wire_width = wire_pitch[1][1] / 2;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.21;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.214;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][1],
+          miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1], fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.385;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.214;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][2],
+          miller_value[1][2], horiz_dielectric_constant[1][2],
+          vert_dielectric_constant[1][2], fringe_cap);
+      // Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.032;                         // micron
+      wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.032); // F/micron
+      wire_r_per_micron[1][3] = 12 / 0.032;                 // ohm/micron
+    } else if (tech == 22) {
+      // Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; // local
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.15;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.414;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][0],
+          miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0], fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um; // semi-global
+      wire_width = wire_pitch[0][1] / 2;
+      aspect_ratio[0][1] = 3.0;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.15;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.414;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][1],
+          miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1], fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um; // global
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.3;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.414;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][2],
+          miller_value[0][2], horiz_dielectric_constant[0][2],
+          vert_dielectric_constant[0][2], fringe_cap);
+
+      //          //*************************
+      //          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+      //          aspect_ratio = 3.0;
+      //          wire_width = wire_pitch[0][4] / 2;
+      //          wire_thickness = aspect_ratio * wire_width;
+      //          wire_spacing = wire_pitch[0][4] - wire_width;
+      //          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY,
+      //          wire_width,
+      //        		  wire_thickness, barrier_thickness,
+      //        dishing_thickness, alpha_scatter);
+      //          ild_thickness = 0.3;
+      //          wire_c_per_micron[0][4] = wire_capacitance(wire_width,
+      //          wire_thickness, wire_spacing,
+      //        		  ild_thickness, miller_value,
+      //        horiz_dielectric_constant, vert_dielectric_constant,
+      //        		  fringe_cap);
+      //
+      //          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+      //          aspect_ratio = 3.0;
+      //          wire_width = wire_pitch[0][5] / 2;
+      //          wire_thickness = aspect_ratio * wire_width;
+      //          wire_spacing = wire_pitch[0][5] - wire_width;
+      //          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY,
+      //          wire_width,
+      //        		  wire_thickness, barrier_thickness,
+      //        dishing_thickness, alpha_scatter);
+      //          ild_thickness = 0.3;
+      //          wire_c_per_micron[0][5] = wire_capacitance(wire_width,
+      //          wire_thickness, wire_spacing,
+      //        		  ild_thickness, miller_value,
+      //        horiz_dielectric_constant, vert_dielectric_constant,
+      //        		  fringe_cap);
+      //
+      //          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+      //          aspect_ratio = 3.0;
+      //          wire_width = wire_pitch[0][6] / 2;
+      //          wire_thickness = aspect_ratio * wire_width;
+      //          wire_spacing = wire_pitch[0][6] - wire_width;
+      //          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY,
+      //          wire_width,
+      //        		  wire_thickness, barrier_thickness,
+      //        dishing_thickness, alpha_scatter);
+      //          ild_thickness = 0.3;
+      //          wire_c_per_micron[0][6] = wire_capacitance(wire_width,
+      //          wire_thickness, wire_spacing,
+      //        		  ild_thickness, miller_value,
+      //        horiz_dielectric_constant, vert_dielectric_constant,
+      //        		  fringe_cap);
+      //*************************
+
+      // Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.003;
+      dishing_thickness = 0;
+      alpha_scatter = 1.05;
+      wire_r_per_micron[1][0] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.15;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 2.104;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][0],
+          miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0], fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.15;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 2.104;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][1],
+          miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1], fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.275;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 2.104;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][2],
+          miller_value[1][2], horiz_dielectric_constant[1][2],
+          vert_dielectric_constant[1][2], fringe_cap);
+      // Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.022;                         // micron
+      wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.022); // F/micron
+      wire_r_per_micron[1][3] = 12 / 0.022;                 // ohm/micron
+
+      //******************
+      //            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+      //            aspect_ratio = 2.2;
+      //            wire_width = wire_pitch[1][4] / 2;
+      //            wire_thickness = aspect_ratio * wire_width;
+      //            wire_spacing = wire_pitch[1][4] - wire_width;
+      //            dishing_thickness = 0.1 *  wire_thickness;
+      //            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY,
+      //            wire_width, 		wire_thickness,
+      //            barrier_thickness, dishing_thickness, alpha_scatter);
+      //            ild_thickness = 0.275; wire_c_per_micron[1][4] =
+      //            wire_capacitance(wire_width, wire_thickness, wire_spacing,
+      //            ild_thickness, miller_value, horiz_dielectric_constant,
+      //            vert_dielectric_constant, 		fringe_cap);
+      //
+      //            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+      //            aspect_ratio = 2.2;
+      //            wire_width = wire_pitch[1][5] / 2;
+      //            wire_thickness = aspect_ratio * wire_width;
+      //            wire_spacing = wire_pitch[1][5] - wire_width;
+      //            dishing_thickness = 0.1 *  wire_thickness;
+      //            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY,
+      //            wire_width, 		wire_thickness,
+      //            barrier_thickness, dishing_thickness, alpha_scatter);
+      //            ild_thickness = 0.275; wire_c_per_micron[1][5] =
+      //            wire_capacitance(wire_width, wire_thickness, wire_spacing,
+      //            ild_thickness, miller_value, horiz_dielectric_constant,
+      //            vert_dielectric_constant, 		fringe_cap);
+      //
+      //            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+      //            aspect_ratio = 2.2;
+      //            wire_width = wire_pitch[1][6] / 2;
+      //            wire_thickness = aspect_ratio * wire_width;
+      //            wire_spacing = wire_pitch[1][6] - wire_width;
+      //            dishing_thickness = 0.1 *  wire_thickness;
+      //            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY,
+      //            wire_width, 		wire_thickness,
+      //            barrier_thickness, dishing_thickness, alpha_scatter);
+      //            ild_thickness = 0.275; wire_c_per_micron[1][6] =
+      //            wire_capacitance(wire_width, wire_thickness, wire_spacing,
+      //            ild_thickness, miller_value, horiz_dielectric_constant,
+      //            vert_dielectric_constant, 		fringe_cap);
+    }
+
+    else if (tech == 16) {
+      // Aggressive projections.
+      wire_pitch[0][0] = 2.5 * g_ip->F_sz_um; // local
+      aspect_ratio[0][0] = 3.0;
+      wire_width = wire_pitch[0][0] / 2;
+      wire_thickness = aspect_ratio[0][0] * wire_width;
+      wire_spacing = wire_pitch[0][0] - wire_width;
+      barrier_thickness = 0;
+      dishing_thickness = 0;
+      alpha_scatter = 1;
+      wire_r_per_micron[0][0] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][0] = 0.108;
+      miller_value[0][0] = 1.5;
+      horiz_dielectric_constant[0][0] = 1.202;
+      vert_dielectric_constant[0][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[0][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][0],
+          miller_value[0][0], horiz_dielectric_constant[0][0],
+          vert_dielectric_constant[0][0], fringe_cap);
+
+      wire_pitch[0][1] = 4 * g_ip->F_sz_um; // semi-global
+      aspect_ratio[0][1] = 3.0;
+      wire_width = wire_pitch[0][1] / 2;
+      wire_thickness = aspect_ratio[0][1] * wire_width;
+      wire_spacing = wire_pitch[0][1] - wire_width;
+      wire_r_per_micron[0][1] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][1] = 0.108;
+      miller_value[0][1] = 1.5;
+      horiz_dielectric_constant[0][1] = 1.202;
+      vert_dielectric_constant[0][1] = 3.9;
+      wire_c_per_micron[0][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][1],
+          miller_value[0][1], horiz_dielectric_constant[0][1],
+          vert_dielectric_constant[0][1], fringe_cap);
+
+      wire_pitch[0][2] = 8 * g_ip->F_sz_um; // global
+      aspect_ratio[0][2] = 3.0;
+      wire_width = wire_pitch[0][2] / 2;
+      wire_thickness = aspect_ratio[0][2] * wire_width;
+      wire_spacing = wire_pitch[0][2] - wire_width;
+      wire_r_per_micron[0][2] =
+          wire_resistance(BULK_CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[0][2] = 0.216;
+      miller_value[0][2] = 1.5;
+      horiz_dielectric_constant[0][2] = 1.202;
+      vert_dielectric_constant[0][2] = 3.9;
+      wire_c_per_micron[0][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[0][2],
+          miller_value[0][2], horiz_dielectric_constant[0][2],
+          vert_dielectric_constant[0][2], fringe_cap);
+
+      //          //*************************
+      //          wire_pitch[0][4] = 16 * g_ip.F_sz_um;//global
+      //          aspect_ratio = 3.0;
+      //          wire_width = wire_pitch[0][4] / 2;
+      //          wire_thickness = aspect_ratio * wire_width;
+      //          wire_spacing = wire_pitch[0][4] - wire_width;
+      //          wire_r_per_micron[0][4] = wire_resistance(BULK_CU_RESISTIVITY,
+      //          wire_width,
+      //        		  wire_thickness, barrier_thickness,
+      //        dishing_thickness, alpha_scatter);
+      //          ild_thickness = 0.3;
+      //          wire_c_per_micron[0][4] = wire_capacitance(wire_width,
+      //          wire_thickness, wire_spacing,
+      //        		  ild_thickness, miller_value,
+      //        horiz_dielectric_constant, vert_dielectric_constant,
+      //        		  fringe_cap);
+      //
+      //          wire_pitch[0][5] = 24 * g_ip.F_sz_um;//global
+      //          aspect_ratio = 3.0;
+      //          wire_width = wire_pitch[0][5] / 2;
+      //          wire_thickness = aspect_ratio * wire_width;
+      //          wire_spacing = wire_pitch[0][5] - wire_width;
+      //          wire_r_per_micron[0][5] = wire_resistance(BULK_CU_RESISTIVITY,
+      //          wire_width,
+      //        		  wire_thickness, barrier_thickness,
+      //        dishing_thickness, alpha_scatter);
+      //          ild_thickness = 0.3;
+      //          wire_c_per_micron[0][5] = wire_capacitance(wire_width,
+      //          wire_thickness, wire_spacing,
+      //        		  ild_thickness, miller_value,
+      //        horiz_dielectric_constant, vert_dielectric_constant,
+      //        		  fringe_cap);
+      //
+      //          wire_pitch[0][6] = 32 * g_ip.F_sz_um;//global
+      //          aspect_ratio = 3.0;
+      //          wire_width = wire_pitch[0][6] / 2;
+      //          wire_thickness = aspect_ratio * wire_width;
+      //          wire_spacing = wire_pitch[0][6] - wire_width;
+      //          wire_r_per_micron[0][6] = wire_resistance(BULK_CU_RESISTIVITY,
+      //          wire_width,
+      //        		  wire_thickness, barrier_thickness,
+      //        dishing_thickness, alpha_scatter);
+      //          ild_thickness = 0.3;
+      //          wire_c_per_micron[0][6] = wire_capacitance(wire_width,
+      //          wire_thickness, wire_spacing,
+      //        		  ild_thickness, miller_value,
+      //        horiz_dielectric_constant, vert_dielectric_constant,
+      //        		  fringe_cap);
+      //*************************
+
+      // Conservative projections
+      wire_pitch[1][0] = 2.5 * g_ip->F_sz_um;
+      aspect_ratio[1][0] = 2.0;
+      wire_width = wire_pitch[1][0] / 2;
+      wire_thickness = aspect_ratio[1][0] * wire_width;
+      wire_spacing = wire_pitch[1][0] - wire_width;
+      barrier_thickness = 0.002;
+      dishing_thickness = 0;
+      alpha_scatter = 1.05;
+      wire_r_per_micron[1][0] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][0] = 0.108;
+      miller_value[1][0] = 1.5;
+      horiz_dielectric_constant[1][0] = 1.998;
+      vert_dielectric_constant[1][0] = 3.9;
+      fringe_cap = 0.115e-15;
+      wire_c_per_micron[1][0] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][0],
+          miller_value[1][0], horiz_dielectric_constant[1][0],
+          vert_dielectric_constant[1][0], fringe_cap);
+
+      wire_pitch[1][1] = 4 * g_ip->F_sz_um;
+      wire_width = wire_pitch[1][1] / 2;
+      aspect_ratio[1][1] = 2.0;
+      wire_thickness = aspect_ratio[1][1] * wire_width;
+      wire_spacing = wire_pitch[1][1] - wire_width;
+      wire_r_per_micron[1][1] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][1] = 0.108;
+      miller_value[1][1] = 1.5;
+      horiz_dielectric_constant[1][1] = 1.998;
+      vert_dielectric_constant[1][1] = 3.9;
+      wire_c_per_micron[1][1] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][1],
+          miller_value[1][1], horiz_dielectric_constant[1][1],
+          vert_dielectric_constant[1][1], fringe_cap);
+
+      wire_pitch[1][2] = 8 * g_ip->F_sz_um;
+      aspect_ratio[1][2] = 2.2;
+      wire_width = wire_pitch[1][2] / 2;
+      wire_thickness = aspect_ratio[1][2] * wire_width;
+      wire_spacing = wire_pitch[1][2] - wire_width;
+      dishing_thickness = 0.1 * wire_thickness;
+      wire_r_per_micron[1][2] =
+          wire_resistance(CU_RESISTIVITY, wire_width, wire_thickness,
+                          barrier_thickness, dishing_thickness, alpha_scatter);
+      ild_thickness[1][2] = 0.198;
+      miller_value[1][2] = 1.5;
+      horiz_dielectric_constant[1][2] = 1.998;
+      vert_dielectric_constant[1][2] = 3.9;
+      wire_c_per_micron[1][2] = wire_capacitance(
+          wire_width, wire_thickness, wire_spacing, ild_thickness[1][2],
+          miller_value[1][2], horiz_dielectric_constant[1][2],
+          vert_dielectric_constant[1][2], fringe_cap);
+      // Nominal projections for commodity DRAM wordline/bitline
+      wire_pitch[1][3] = 2 * 0.016;                         // micron
+      wire_c_per_micron[1][3] = 31e-15 / (256 * 2 * 0.016); // F/micron
+      wire_r_per_micron[1][3] = 12 / 0.016;                 // ohm/micron
+
+      //******************
+      //            wire_pitch[1][4] = 16 * g_ip.F_sz_um;
+      //            aspect_ratio = 2.2;
+      //            wire_width = wire_pitch[1][4] / 2;
+      //            wire_thickness = aspect_ratio * wire_width;
+      //            wire_spacing = wire_pitch[1][4] - wire_width;
+      //            dishing_thickness = 0.1 *  wire_thickness;
+      //            wire_r_per_micron[1][4] = wire_resistance(CU_RESISTIVITY,
+      //            wire_width, 		wire_thickness,
+      //            barrier_thickness, dishing_thickness, alpha_scatter);
+      //            ild_thickness = 0.275; wire_c_per_micron[1][4] =
+      //            wire_capacitance(wire_width, wire_thickness, wire_spacing,
+      //            ild_thickness, miller_value, horiz_dielectric_constant,
+      //            vert_dielectric_constant, 		fringe_cap);
+      //
+      //            wire_pitch[1][5] = 24 * g_ip.F_sz_um;
+      //            aspect_ratio = 2.2;
+      //            wire_width = wire_pitch[1][5] / 2;
+      //            wire_thickness = aspect_ratio * wire_width;
+      //            wire_spacing = wire_pitch[1][5] - wire_width;
+      //            dishing_thickness = 0.1 *  wire_thickness;
+      //            wire_r_per_micron[1][5] = wire_resistance(CU_RESISTIVITY,
+      //            wire_width, 		wire_thickness,
+      //            barrier_thickness, dishing_thickness, alpha_scatter);
+      //            ild_thickness = 0.275; wire_c_per_micron[1][5] =
+      //            wire_capacitance(wire_width, wire_thickness, wire_spacing,
+      //            ild_thickness, miller_value, horiz_dielectric_constant,
+      //            vert_dielectric_constant, 		fringe_cap);
+      //
+      //            wire_pitch[1][6] = 32 * g_ip.F_sz_um;
+      //            aspect_ratio = 2.2;
+      //            wire_width = wire_pitch[1][6] / 2;
+      //            wire_thickness = aspect_ratio * wire_width;
+      //            wire_spacing = wire_pitch[1][6] - wire_width;
+      //            dishing_thickness = 0.1 *  wire_thickness;
+      //            wire_r_per_micron[1][6] = wire_resistance(CU_RESISTIVITY,
+      //            wire_width, 		wire_thickness,
+      //            barrier_thickness, dishing_thickness, alpha_scatter);
+      //            ild_thickness = 0.275; wire_c_per_micron[1][6] =
+      //            wire_capacitance(wire_width, wire_thickness, wire_spacing,
+      //            ild_thickness, miller_value, horiz_dielectric_constant,
+      //            vert_dielectric_constant, 		fringe_cap);
+    }
+    g_tp.wire_local.pitch +=
+        curr_alpha * wire_pitch[g_ip->ic_proj_type]
+                               [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.R_per_um +=
+        curr_alpha *
+        wire_r_per_micron[g_ip->ic_proj_type]
+                         [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.C_per_um +=
+        curr_alpha *
+        wire_c_per_micron[g_ip->ic_proj_type]
+                         [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.aspect_ratio +=
+        curr_alpha * aspect_ratio[g_ip->ic_proj_type]
+                                 [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.ild_thickness +=
+        curr_alpha * ild_thickness[g_ip->ic_proj_type]
+                                  [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.miller_value +=
+        curr_alpha * miller_value[g_ip->ic_proj_type]
+                                 [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.horiz_dielectric_constant +=
+        curr_alpha *
+        horiz_dielectric_constant[g_ip->ic_proj_type]
+                                 [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+    g_tp.wire_local.vert_dielectric_constant +=
+        curr_alpha *
+        vert_dielectric_constant[g_ip->ic_proj_type]
+                                [(ram_cell_tech_type == comm_dram) ? 3 : 0];
+
+    g_tp.wire_inside_mat.pitch +=
+        curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.R_per_um +=
+        curr_alpha *
+        wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.C_per_um +=
+        curr_alpha *
+        wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.aspect_ratio +=
+        curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.ild_thickness +=
+        curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.miller_value +=
+        curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.horiz_dielectric_constant +=
+        curr_alpha *
+        horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+    g_tp.wire_inside_mat.vert_dielectric_constant +=
+        curr_alpha *
+        vert_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_is_mat_type];
+
+    g_tp.wire_outside_mat.pitch +=
+        curr_alpha * wire_pitch[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.R_per_um +=
+        curr_alpha *
+        wire_r_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.C_per_um +=
+        curr_alpha *
+        wire_c_per_micron[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.aspect_ratio +=
+        curr_alpha * aspect_ratio[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.ild_thickness +=
+        curr_alpha * ild_thickness[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.miller_value +=
+        curr_alpha * miller_value[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.horiz_dielectric_constant +=
+        curr_alpha *
+        horiz_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+    g_tp.wire_outside_mat.vert_dielectric_constant +=
+        curr_alpha *
+        vert_dielectric_constant[g_ip->ic_proj_type][g_ip->wire_os_mat_type];
+
+    g_tp.unit_len_wire_del =
+        g_tp.wire_inside_mat.R_per_um * g_tp.wire_inside_mat.C_per_um / 2;
+
+    g_tp.sense_delay += curr_alpha * SENSE_AMP_D;
+    g_tp.sense_dy_power += curr_alpha * SENSE_AMP_P;
+    //    g_tp.horiz_dielectric_constant += horiz_dielectric_constant;
+    //    g_tp.vert_dielectric_constant  += vert_dielectric_constant;
+    //    g_tp.aspect_ratio              += aspect_ratio;
+    //    g_tp.miller_value              += miller_value;
+    //    g_tp.ild_thickness             += ild_thickness;
+  }
+  g_tp.fringe_cap = fringe_cap;
+
+  double rd = tr_R_on(g_tp.min_w_nmos_, NCH, 1);
+  double p_to_n_sizing_r = pmos_to_nmos_sz_ratio();
+  double c_load = gate_C(g_tp.min_w_nmos_ * (1 + p_to_n_sizing_r), 0.0);
+  double tf = rd * c_load;
+  g_tp.kinv = horowitz(0, tf, 0.5, 0.5, RISE);
+  double KLOAD = 1;
+  c_load = KLOAD * (drain_C_(g_tp.min_w_nmos_, NCH, 1, 1, g_tp.cell_h_def) +
+                    drain_C_(g_tp.min_w_nmos_ * p_to_n_sizing_r, PCH, 1, 1,
+                             g_tp.cell_h_def) +
+                    gate_C(g_tp.min_w_nmos_ * 4 * (1 + p_to_n_sizing_r), 0.0));
+  tf = rd * c_load;
+  g_tp.FO4 = horowitz(0, tf, 0.5, 0.5, RISE);
+}
diff --git a/src/gpuwattch/version.h b/src/gpuwattch/version.h
new file mode 100644
index 000000000..41bd5b9ba
--- /dev/null
+++ b/src/gpuwattch/version.h
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ *                                McPAT
+ *                      SOFTWARE LICENSE AGREEMENT
+ *            Copyright 2012 Hewlett-Packard Development Company, L.P.
+ *                          All Rights Reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ***************************************************************************/
+
+#ifndef VERSION_H_
+#define VERSION_H_
+
+#define VER_MAJOR 0 /* major version number; 0 marks a beta release */
+#define VER_MINOR 8 /* minor version number */
+
+#define VER_UPDATE "Aug, 2010" /* presumably the date of this version -- confirm */
+
+#endif /* VERSION_H_ */
diff --git a/src/gpuwattch/xmlParser.cc b/src/gpuwattch/xmlParser.cc
new file mode 100644
index 000000000..e402fccf8
--- /dev/null
+++ b/src/gpuwattch/xmlParser.cc
@@ -0,0 +1,3575 @@
+/**
+ ****************************************************************************
+ * <P> XML.c - implementation file for basic XML parser written in ANSI C++
+ * for portability. It works by using recursion and a node tree for breaking
+ * down the elements of an XML document.  </P>
+ *
+ * @version     V2.41
+ * @author      Frank Vanden Berghen
+ *
+ * NOTE:
+ *
+ *   If you add "#define STRICT_PARSING", on the first line of this file
+ *   the parser will see the following XML-stream:
+ *      <a><b>some text</b><b>other text    </a>
+ *   as an error. Otherwise, this string will be equivalent to:
+ *      <a><b>some text</b><b>other text</b></a>
+ *
+ * NOTE:
+ *
+ *   If you add "#define APPROXIMATE_PARSING" on the first line of this file
+ *   the parser will see the following XML-stream:
+ *     <data name="n1">
+ *     <data name="n2">
+ *     <data name="n3" />
+ *   as equivalent to the following XML-stream:
+ *     <data name="n1" />
+ *     <data name="n2" />
+ *     <data name="n3" />
+ *   This can be useful for badly-formed XML-streams but prevent the use
+ *   of the following XML-stream (problem is: tags at contiguous levels
+ *   have the same names):
+ *     <data name="n1">
+ *        <data name="n2">
+ *            <data name="n3" />
+ *        </data>
+ *     </data>
+ *
+ * NOTE:
+ *
+ *   If you add "#define _XMLPARSER_NO_MESSAGEBOX_" on the first line of this
+ *file the "openFileHelper" function will always display error messages inside
+ *the console instead of inside a message-box-window. Message-box-windows are
+ *   available on windows 9x/NT/2000/XP/Vista only.
+ *
+ * The following license terms for the "XMLParser library from Business-Insight"
+ *apply to projects that are in some way related to the "mcpat project",
+ *including applications using "mcpat project" and tools developed for enhancing
+ *"mcpat project". All other projects (not related to "mcpat project") have to
+ *use the "XMLParser library from Business-Insight" code under the Aladdin Free
+ *Public License (AFPL) See the file "AFPL-license.txt" for more informations
+ *about the AFPL license. (see http://www.artifex.com/downloads/doc/Public.htm
+ *for detailed AFPL terms)
+ *
+ * Redistribution and use of the "XMLParser library from Business-Insight" in
+ *source and binary forms, with or without modification, are permitted provided
+ *that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Frank Vanden Berghen nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Business-Insight ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Business-Insight BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Copyright (c) 2002, Business-Insight
+ * <a href="http://www.Business-Insight.com">Business-Insight</a>
+ * All rights reserved.
+ *
+ ****************************************************************************
+ */
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#include "xmlParser.h"
+#ifdef _XMLWINDOWS
+//#ifdef _DEBUG
+//#define _CRTDBG_MAP_ALLOC
+//#include <crtdbg.h>
+//#endif
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h> // to have IsTextUnicode, MultiByteToWideChar, WideCharToMultiByte to handle unicode files
+// to have "MessageBoxA" to display error messages for openFileHelper
+#endif
+
+#include <assert.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+XMLCSTR XMLNode::getVersion() { return _CXML("v2.39"); } // NOTE(review): the file header says V2.41 -- confirm which one is current
+void freeXMLString(XMLSTR t) { // release a string buffer with free()
+  if (t) // a null pointer is simply ignored
+    free(t);
+}
+
+static XMLNode::XMLCharEncoding characterEncoding = XMLNode::char_encoding_UTF8; // file-scope parser options and their initial values
+static char guessWideCharChars = 1, dropWhiteSpace = 1,
+            removeCommentsInMiddleOfText = 1; // openFileHelper() passes these three to setGlobalOptions()
+
+inline int mmin(const int t1, const int t2) { return t1 < t2 ? t1 : t2; } // smaller of the two ints
+
+// You can modify the initialization of the variable "XMLClearTags" below
+// to change the clearTags that are currently recognized by the library.
+// The number in the second column is the length of the string inside the
+// first column. The "<!DOCTYPE" declaration must be the second in the list.
+// The "<!--" declaration must be the third in the list.
+typedef struct {
+  XMLCSTR lpszOpen;  // opening marker of the clear tag
+  int openTagLen;    // length of lpszOpen, in characters
+  XMLCSTR lpszClose; // closing marker of the clear tag
+} ALLXMLClearTag;
+static ALLXMLClearTag XMLClearTags[] = {
+    {_CXML("<![CDATA["), 9, _CXML("]]>")},
+    {_CXML("<!DOCTYPE"), 9, _CXML(">")},
+    {_CXML("<!--"), 4, _CXML("-->")},
+    {_CXML("<PRE>"), 5, _CXML("</PRE>")},
+    //  {    _CXML("<Script>") ,8,  _CXML("</Script>")},
+    {NULL, 0, NULL}}; // all-NULL entry; presumably the end-of-list sentinel
+
+// You can modify the initialization of the variable "XMLEntities" below
+// to change the character entities that are currently recognized by the
+// library. The number in the second column is the length of the string inside
+// the first column. Additionally, the syntaxes "&#xA0;" and "&#160;" are
+// recognized.
+typedef struct {
+  XMLCSTR s; // entity text, e.g. "&amp;"
+  int l;     // length of s, in characters
+  XMLCHAR c; // the single character the entity stands for
+} XMLCharacterEntity;
+static XMLCharacterEntity XMLEntities[] = {
+    {_CXML("&amp;"), 5, _CXML('&')},   {_CXML("&lt;"), 4, _CXML('<')},
+    {_CXML("&gt;"), 4, _CXML('>')},    {_CXML("&quot;"), 6, _CXML('\"')},
+    {_CXML("&apos;"), 6, _CXML('\'')}, {NULL, 0, '\0'}};
+
+// When rendering the XMLNode to a string (using the "createXMLString"
+// function), you can ask for a beautiful formatting. This formatting is using
+// the following indentation character:
+#define INDENTCHAR _CXML('\t')
+
+// Maps an XMLError code to a fixed, human-readable English message.
+// Codes that no case below handles yield "Unknown". Edit the strings to
+// change the output language of the library.
+XMLCSTR XMLNode::getError(XMLError xerror) {
+  switch (xerror) {
+  case eXMLErrorNone:
+    return _CXML("No error");
+  case eXMLErrorMissingEndTag:
+    return _CXML("Warning: Unmatched end tag"); // NOTE(review): same wording as eXMLErrorUnmatchedEndTag below -- confirm intended
+  case eXMLErrorNoXMLTagFound:
+    return _CXML("Warning: No XML tag found");
+  case eXMLErrorEmpty:
+    return _CXML("Error: No XML data");
+  case eXMLErrorMissingTagName:
+    return _CXML("Error: Missing start tag name");
+  case eXMLErrorMissingEndTagName:
+    return _CXML("Error: Missing end tag name");
+  case eXMLErrorUnmatchedEndTag:
+    return _CXML("Error: Unmatched end tag");
+  case eXMLErrorUnmatchedEndClearTag:
+    return _CXML("Error: Unmatched clear tag end");
+  case eXMLErrorUnexpectedToken:
+    return _CXML("Error: Unexpected token found");
+  case eXMLErrorNoElements:
+    return _CXML("Error: No elements found");
+  case eXMLErrorFileNotFound:
+    return _CXML("Error: File not found");
+  case eXMLErrorFirstTagNotFound:
+    return _CXML("Error: First Tag not found");
+  case eXMLErrorUnknownCharacterEntity:
+    return _CXML("Error: Unknown character entity");
+  case eXMLErrorCharacterCodeAbove255:
+    return _CXML(
+        "Error: Character code above 255 is forbidden in MultiByte char mode.");
+  case eXMLErrorCharConversionError:
+    return _CXML(
+        "Error: unable to convert between WideChar and MultiByte chars");
+  case eXMLErrorCannotOpenWriteFile:
+    return _CXML("Error: unable to open file for writing");
+  case eXMLErrorCannotWriteFile:
+    return _CXML("Error: cannot write into file");
+
+  case eXMLErrorBase64DataSizeIsNotMultipleOf4:
+    return _CXML("Warning: Base64-string length is not a multiple of 4");
+  case eXMLErrorBase64DecodeTruncatedData:
+    return _CXML("Warning: Base64-string is truncated");
+  case eXMLErrorBase64DecodeIllegalCharacter:
+    return _CXML("Error: Base64-string contains an illegal character");
+  case eXMLErrorBase64DecodeBufferTooSmall:
+    return _CXML("Error: Base64 decode output buffer is too small");
+  };
+  return _CXML("Unknown"); // reached only when no case above matched
+}
+
+/////////////////////////////////////////////////////////////////////////
+//      Here starts the abstraction layer to be OS-independent         //
+/////////////////////////////////////////////////////////////////////////
+
+// Here is an abstraction layer to access some common string manipulation
+// functions. The abstraction layer is currently working for gcc, Microsoft
+// Visual Studio 6.0, Microsoft Visual Studio .NET, CC (sun compiler) and
+// Borland C++. If you plan to "port" the library to a new system/compiler, all
+// you have to do is to edit the following lines.
+#ifdef XML_NO_WIDE_CHAR
+char myIsTextWideChar(const void *b, int len) { return FALSE; } // wide-char support compiled out: always FALSE
+#else
+#if defined(UNDER_CE) || !defined(_XMLWINDOWS)
+char myIsTextWideChar(const void *b,
+                      int len) // inspired by the Wine API: RtlIsTextUnicode
+{
+#ifdef sun
+  // for SPARC processors: wchar_t* buffers must always be aligned, otherwise
+  // it's a char* buffer.
+  if ((((unsigned long)b) % sizeof(wchar_t)) != 0)
+    return FALSE;
+#endif
+  const wchar_t *s = (const wchar_t *)b;
+
+  // buffer too small:
+  if (len < (int)sizeof(wchar_t))
+    return FALSE;
+
+  // odd length test
+  if (len & 1)
+    return FALSE;
+
+  /* only checks the first 256 characters */
+  len = mmin(256, len / sizeof(wchar_t)); // from here on len counts wchar_t elements, not bytes
+
+  // Check for the special byte order:
+  if (*((unsigned short *)s) == 0xFFFE)
+    return TRUE; // IS_TEXT_UNICODE_REVERSE_SIGNATURE;
+  if (*((unsigned short *)s) == 0xFEFF)
+    return TRUE; // IS_TEXT_UNICODE_SIGNATURE
+
+  // checks for ASCII characters in the UNICODE stream
+  int i, stats = 0;
+  for (i = 0; i < len; i++)
+    if (s[i] <= (unsigned short)255)
+      stats++;
+  if (stats > len / 2) // more than half of the inspected elements are <= 255
+    return TRUE;
+
+  // Check for UNICODE NULL chars
+  for (i = 0; i < len; i++)
+    if (!s[i])
+      return TRUE;
+
+  return FALSE;
+}
+#else
+char myIsTextWideChar(const void *b, int l) { // Windows build: defers to the Win32 IsTextUnicode()
+  return (char)IsTextUnicode((CONST LPVOID)b, l, NULL);
+};
+#endif
+#endif
+
+#ifdef _XMLWINDOWS
+// for Microsoft Visual Studio 6.0 and Microsoft Visual Studio .NET and Borland
+// C++ Builder 6.0
+#ifdef _XMLWIDECHAR
+wchar_t *myMultiByteToWideChar(const char *s, XMLNode::XMLCharEncoding ce) { // returns a malloc'ed wide-char copy of s; the caller frees it
+  int i;
+  if (ce == XMLNode::char_encoding_UTF8)
+    i = (int)MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0); // first call (no output buffer): length query
+  else
+    i = (int)MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, s, -1, NULL, 0);
+  if (i < 0)
+    return NULL;
+  wchar_t *d = (wchar_t *)malloc((i + 1) * sizeof(XMLCHAR)); // NOTE(review): malloc result is not checked
+  if (ce == XMLNode::char_encoding_UTF8)
+    i = (int)MultiByteToWideChar(CP_UTF8, 0, s, -1, d, i); // second call: the actual conversion into d
+  else
+    i = (int)MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, s, -1, d, i);
+  d[i] = 0;
+  return d;
+}
+static inline FILE *xfopen(XMLCSTR filename, XMLCSTR mode) { // Windows wide-char build: open via _wfopen
+  return _wfopen(filename, mode);
+}
+static inline int xstrlen(XMLCSTR c) { return (int)wcslen(c); } // string length, narrowed to int
+static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { // forwards to _wcsnicmp
+  return _wcsnicmp(c1, c2, l);
+}
+static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { // forwards to wcsncmp
+  return wcsncmp(c1, c2, l);
+}
+static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return _wcsicmp(c1, c2); } // forwards to _wcsicmp
+static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { // forwards to wcsstr; the result is cast to a non-const XMLSTR
+  return (XMLSTR)wcsstr(c1, c2);
+}
+static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { // forwards to wcscpy
+  return (XMLSTR)wcscpy(c1, c2);
+}
+#else
+char *myWideCharToMultiByte(const wchar_t *s) { // returns a malloc'ed multi-byte copy of s; the caller frees it
+  UINT codePage = CP_ACP;
+  if (characterEncoding == XMLNode::char_encoding_UTF8) // follow the file-scope encoding setting
+    codePage = CP_UTF8;
+  int i = (int)WideCharToMultiByte(codePage, // code page
+                                   0,        // performance and mapping flags
+                                   s,        // wide-character string
+                                   -1,       // number of chars in string
+                                   NULL,     // buffer for new string
+                                   0,        // size of buffer
+                                   NULL,     // default for unmappable chars
+                                   NULL      // set when default char used
+  );
+  if (i < 0)
+    return NULL;
+  char *d = (char *)malloc(i + 1); // NOTE(review): malloc result is not checked
+  WideCharToMultiByte(codePage, // code page
+                      0,        // performance and mapping flags
+                      s,        // wide-character string
+                      -1,       // number of chars in string
+                      d,        // buffer for new string
+                      i,        // size of buffer
+                      NULL,     // default for unmappable chars
+                      NULL      // set when default char used
+  );
+  d[i] = 0;
+  return d;
+}
+static inline FILE *xfopen(XMLCSTR filename, XMLCSTR mode) { // Windows narrow-char build: plain fopen
+  return fopen(filename, mode);
+}
+static inline int xstrlen(XMLCSTR c) { return (int)strlen(c); } // string length, narrowed to int
+#ifdef __BORLANDC__
+static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { // Borland build: strnicmp
+  return strnicmp(c1, c2, l);
+}
+static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return stricmp(c1, c2); } // Borland build: stricmp
+#else
+static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { // other Windows compilers: _strnicmp
+  return _strnicmp(c1, c2, l);
+}
+static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return _stricmp(c1, c2); } // other Windows compilers: _stricmp
+#endif
+static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { // forwards to strncmp
+  return strncmp(c1, c2, l);
+}
+static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { // forwards to strstr; the result is cast to a non-const XMLSTR
+  return (XMLSTR)strstr(c1, c2);
+}
+static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { // forwards to strcpy
+  return (XMLSTR)strcpy(c1, c2);
+}
+#endif
+#else
+// for gcc and CC
+#ifdef XML_NO_WIDE_CHAR
+char *myWideCharToMultiByte(const wchar_t *s) { return NULL; } // wide-char support compiled out: always NULL
+#else
+char *myWideCharToMultiByte(const wchar_t *s) { // returns a malloc'ed multi-byte copy of s; the caller frees it
+  const wchar_t *ss = s;
+  int i = (int)wcsrtombs(NULL, &ss, 0, NULL); // length query on a scratch pointer (ss), so s is left untouched
+  if (i < 0) // the (size_t)-1 error return is expected to show up here as a negative int
+    return NULL;
+  char *d = (char *)malloc(i + 1); // NOTE(review): malloc result is not checked
+  wcsrtombs(d, &s, i, NULL);
+  d[i] = 0;
+  return d;
+}
+#endif
+#ifdef _XMLWIDECHAR
+wchar_t *myMultiByteToWideChar(const char *s, XMLNode::XMLCharEncoding ce) { // returns a malloc'ed wide-char copy of s; ce is not used in this build
+  const char *ss = s;
+  int i = (int)mbsrtowcs(NULL, &ss, 0, NULL); // length query on a scratch pointer (ss), so s is left untouched
+  if (i < 0) // the (size_t)-1 error return is expected to show up here as a negative int
+    return NULL;
+  wchar_t *d = (wchar_t *)malloc((i + 1) * sizeof(wchar_t)); // NOTE(review): malloc result is not checked
+  mbsrtowcs(d, &s, i, NULL);
+  d[i] = 0;
+  return d;
+}
+int xstrlen(XMLCSTR c) { return wcslen(c); } // NOTE(review): unlike its siblings this one is not static inline -- confirm intended
+#ifdef sun
+// for CC
+#include <widec.h>
+static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { // sun CC: wsncasecmp
+  return wsncasecmp(c1, c2, l);
+}
+static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { // sun CC: wsncmp
+  return wsncmp(c1, c2, l);
+}
+static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { return wscasecmp(c1, c2); } // sun CC: wscasecmp
+#else
+// for gcc
+static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { // gcc: wcsncasecmp
+  return wcsncasecmp(c1, c2, l);
+}
+static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { // gcc: wcsncmp
+  return wcsncmp(c1, c2, l);
+}
+static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { // gcc: wcscasecmp
+  return wcscasecmp(c1, c2);
+}
+#endif
+static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { // forwards to wcsstr; the result is cast to a non-const XMLSTR
+  return (XMLSTR)wcsstr(c1, c2);
+}
+static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { // forwards to wcscpy
+  return (XMLSTR)wcscpy(c1, c2);
+}
+static inline FILE *xfopen(XMLCSTR filename, XMLCSTR mode) {
+  // Only mode[0] is honoured: 'r' opens "rb", anything else opens "wb".
+  char *filenameAscii = myWideCharToMultiByte(filename);
+  if (!filenameAscii) // name could not be converted: fail like fopen() does
+    return NULL;
+  FILE *f = fopen(filenameAscii,
+                  (mode[0] == _CXML('r')) ? "rb" : "wb");
+  free(filenameAscii);
+  return f;
+}
+#else
+static inline FILE *xfopen(XMLCSTR filename, XMLCSTR mode) { // gcc/CC narrow-char build: plain fopen
+  return fopen(filename, mode);
+}
+static inline int xstrlen(XMLCSTR c) { return strlen(c); } // string length, converted to int
+static inline int xstrnicmp(XMLCSTR c1, XMLCSTR c2, int l) { // forwards to strncasecmp
+  return strncasecmp(c1, c2, l);
+}
+static inline int xstrncmp(XMLCSTR c1, XMLCSTR c2, int l) { // forwards to strncmp
+  return strncmp(c1, c2, l);
+}
+static inline int xstricmp(XMLCSTR c1, XMLCSTR c2) { // forwards to strcasecmp
+  return strcasecmp(c1, c2);
+}
+static inline XMLSTR xstrstr(XMLCSTR c1, XMLCSTR c2) { // forwards to strstr; the result is cast to a non-const XMLSTR
+  return (XMLSTR)strstr(c1, c2);
+}
+static inline XMLSTR xstrcpy(XMLSTR c1, XMLCSTR c2) { // forwards to strcpy
+  return (XMLSTR)strcpy(c1, c2);
+}
+#endif
+static inline int _strnicmp(const char *c1, const char *c2, int l) { // non-Windows stand-in for _strnicmp
+  return strncasecmp(c1, c2, l);
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+//            the "xmltoc,xmltob,xmltoi,xmltol,xmltof,xmltoa" functions      //
+///////////////////////////////////////////////////////////////////////////////
+// These 6 functions are not used inside the XMLparser.
+// They are only here as "convenience" functions for the user.
+// If you don't need them, you can delete them without any trouble.
+#ifdef _XMLWIDECHAR
+#ifdef _XMLWINDOWS
+// for Microsoft Visual Studio 6.0 and Microsoft Visual Studio .NET and Borland
+// C++ Builder 6.0
+char xmltob(XMLCSTR t, int v) { // NOTE(review): the narrow-char build takes `char v` -- confirm against the header
+  if (t && (*t)) // NULL or empty input falls through to the default v
+    return (char)_wtoi(t);
+  return v;
+}
+int xmltoi(XMLCSTR t, int v) { // t converted with _wtoi, or v when t is NULL or empty
+  if (t && (*t))
+    return _wtoi(t);
+  return v;
+}
+long xmltol(XMLCSTR t, long v) { // t converted with _wtol, or v when t is NULL or empty
+  if (t && (*t))
+    return _wtol(t);
+  return v;
+}
+double xmltof(XMLCSTR t, double v) { // t parsed as a double; v is returned when t is NULL, empty or not a number
+  if (t && (*t))
+    swscanf(t, L"%lf", &v); // scan the string itself; wscanf(t, ...) read stdin using t as the format
+  return v;
+}
+#else
+#ifdef sun
+// for CC
+#include <widec.h>
+char xmltob(XMLCSTR t, int v) { // NOTE(review): the narrow-char build takes `char v` -- confirm against the header
+  if (t && (*t)) // NULL or empty input yields v, as in the Windows and narrow-char builds
+    return (char)wstol(t, NULL, 10);
+  return v;
+}
+int xmltoi(XMLCSTR t, int v) { // t parsed as a base-10 integer, or v when t is NULL or empty
+  if (t && (*t))
+    return (int)wstol(t, NULL, 10);
+  return v;
+}
+long xmltol(XMLCSTR t, long v) { // t parsed as a base-10 long, or v when t is NULL or empty
+  if (t && (*t))
+    return wstol(t, NULL, 10);
+  return v;
+}
+#else
+// for gcc
+char xmltob(XMLCSTR t, int v) { // NOTE(review): the narrow-char build takes `char v` -- confirm against the header
+  if (t && (*t)) // NULL or empty input yields v, as in the Windows and narrow-char builds
+    return (char)wcstol(t, NULL, 10);
+  return v;
+}
+int xmltoi(XMLCSTR t, int v) { // t parsed as a base-10 integer, or v when t is NULL or empty
+  if (t && (*t))
+    return (int)wcstol(t, NULL, 10);
+  return v;
+}
+long xmltol(XMLCSTR t, long v) { // t parsed as a base-10 long, or v when t is NULL or empty
+  if (t && (*t))
+    return wcstol(t, NULL, 10);
+  return v;
+}
+#endif
+double xmltof(XMLCSTR t, double v) { // t parsed as a double; v is returned when t is NULL, empty or not a number
+  if (t && (*t))
+    swscanf(t, L"%lf", &v); // scan the string itself; wscanf(t, ...) read stdin using t as the format
+  return v;
+}
+#endif
+#else
+char xmltob(XMLCSTR t, char v) { // atoi(t) narrowed to char, or v when t is NULL or empty
+  if (t && (*t))
+    return (char)atoi(t);
+  return v;
+}
+int xmltoi(XMLCSTR t, int v) { // atoi(t), or v when t is NULL or empty
+  if (t && (*t))
+    return atoi(t);
+  return v;
+}
+long xmltol(XMLCSTR t, long v) { // atol(t), or v when t is NULL or empty
+  if (t && (*t))
+    return atol(t);
+  return v;
+}
+double xmltof(XMLCSTR t, double v) { // atof(t), or v when t is NULL or empty
+  if (t && (*t))
+    return atof(t);
+  return v;
+}
+#endif
+XMLCSTR xmltoa(XMLCSTR t, XMLCSTR v) { // t itself when non-NULL (an empty string counts), otherwise v
+  if (t)
+    return t;
+  return v;
+}
+XMLCHAR xmltoc(XMLCSTR t, XMLCHAR v) { // first character of t, or v when t is NULL or empty
+  if (t && (*t))
+    return *t;
+  return v;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//                    the "openFileHelper" function                    //
+/////////////////////////////////////////////////////////////////////////
+
+// Parses `filename` (expecting first tag `tag`) and returns the resulting node.
+// On any parsing error it reports a message and ends the process with
+// exit(255); rewrite this function to get error reporting tailored to your needs.
+XMLNode XMLNode::openFileHelper(XMLCSTR filename, XMLCSTR tag) {
+  // guess the value of the global parameter "characterEncoding"
+  // (the guess is based on the first 200 bytes of the file).
+  FILE *f = xfopen(filename, _CXML("rb"));
+  if (f) {
+    char bb[205];
+    int l = (int)fread(bb, 1, 200, f);
+    setGlobalOptions(guessCharEncoding(bb, l), guessWideCharChars,
+                     dropWhiteSpace, removeCommentsInMiddleOfText);
+    fclose(f);
+  }
+
+  // parse the file
+  XMLResults pResults;
+  XMLNode xnode = XMLNode::parseFile(filename, tag, &pResults);
+
+  // display error message (if any)
+  if (pResults.error != eXMLErrorNone) {
+    // create message (bounded: a long file name or tag must not overrun it)
+    char message[2000], *s1 = (char *)"", *s3 = (char *)"";
+    XMLCSTR s2 = _CXML("");
+    if (pResults.error == eXMLErrorFirstTagNotFound) {
+      s1 = (char *)"First Tag should be '";
+      s2 = tag;
+      s3 = (char *)"'.\n";
+    }
+    snprintf(message, sizeof(message),
+#ifdef _XMLWIDECHAR
+            "XML Parsing error inside file '%S'.\n%S\nAt line %i, column "
+            "%i.\n%s%S%s"
+#else
+            "XML Parsing error inside file '%s'.\n%s\nAt line %i, column "
+            "%i.\n%s%s%s"
+#endif
+            ,
+            filename, XMLNode::getError(pResults.error), pResults.nLine,
+            pResults.nColumn, s1, s2, s3);
+
+    // display message
+#if defined(_XMLWINDOWS) && !defined(UNDER_CE) &&                              \
+    !defined(_XMLPARSER_NO_MESSAGEBOX_)
+    MessageBoxA(NULL, message, "XML Parsing error",
+                MB_OK | MB_ICONERROR | MB_TOPMOST);
+#else
+    printf("%s", message);
+#endif
+    exit(255);
+  }
+  return xnode;
+}
+
+/////////////////////////////////////////////////////////////////////////
+//      Here starts the core implementation of the XMLParser library   //
+/////////////////////////////////////////////////////////////////////////
+
+// You should normally not change anything below this point.
+
+#ifndef _XMLWIDECHAR
+// If "characterEncoding=ascii" then we assume that all characters have the same
+// length of 1 byte. If "characterEncoding=UTF8" then the characters have
+// different lengths (from 1 byte to 4 bytes). If "characterEncoding=ShiftJIS"
+// then the characters have different lengths (from 1 byte to 2 bytes). This
+// table is used as lookup-table to know the length of a character (in byte)
+// based on the content of the first byte of the character. (note: if you modify
+// this, you must always have XML_utf8ByteTable[0]=0 ).
+static const char XML_utf8ByteTable[256] = { // UTF-8: character length in bytes, indexed by the first byte
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x00
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x10
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x20
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x30
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x40
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x50
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x60
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0x90
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
+    1, 1, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
+    2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
+    3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
+    4, 4, 4, 4, 4, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
+};
+static const char XML_legacyByteTable[256] = { // every byte value maps to length 1, except index 0 which maps to 0
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+static const char XML_sjisByteTable[256] = { // Shift-JIS: character length in bytes, indexed by the first byte
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x80 0x81 to 0x9F 2 bytes
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x90
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 0xe0 to 0xef 2 bytes
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1  // 0xf0
+};
+static const char XML_gb2312ByteTable[256] = { // GB2312: character length in bytes, indexed by the first byte
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xa0 0xa1 to 0xf7 2 bytes
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xb0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0
+    2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1  // 0xf0
+};
+// Character-length table shared by GBK and Big5 text: indexed by a byte value,
+// it gives the number of bytes of the character that starts with that byte.
+// Lead bytes 0x81-0xfe are marked as 2-byte characters, every other non-zero
+// byte (including 0xff) as 1. Entry 0x00 (the NUL terminator) is 0.
+static const char XML_gbk_big5_ByteTable[256] = {
+    //  0 1 2 3 4 5 6 7 8 9 a b c d e f
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x80 0x81 to 0xfe 2 bytes
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x90
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xa0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xb0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1  // 0xf0
+};
+// Currently active character-length table. The string scanners in this file
+// index it with the first byte of a character to learn how many bytes to
+// consume.
+static const char *XML_ByteTable = (const char *)
+    XML_utf8ByteTable; // the default is
+                       // "characterEncoding=XMLNode::encoding_UTF8"
+#endif
+
+// Shared "empty" sentinels. The add*_priv helpers in this file return them
+// (by value for the node, by address for the clear/attribute) when they have
+// nothing to add.
+XMLNode XMLNode::emptyXMLNode;
+XMLClear XMLNode::emptyXMLClear = {NULL, NULL, NULL};
+XMLAttribute XMLNode::emptyXMLAttribute = {NULL, NULL};
+
+// Enumeration used to decipher what type a token is
+typedef enum XMLTokenTypeTag {
+  eTokenText = 0,       /* unquoted run of characters */
+  eTokenQuotedText,     /* text enclosed in matching ' or " quotes */
+  eTokenTagStart,       /* "<"            */
+  eTokenTagEnd,         /* "</"           */
+  eTokenCloseTag,       /* ">"            */
+  eTokenEquals,         /* "="            */
+  eTokenDeclaration,    /* "<?"           */
+  eTokenShortHandClose, /* "/>"           */
+  eTokenClear,          /* opening marker of an entry of XMLClearTags */
+  eTokenError           /* no further character could be read */
+} XMLTokenType;
+
+// Main structure used for parsing XML
+typedef struct XML {
+  XMLCSTR lpXML;    // the text being parsed
+  XMLCSTR lpszText; // start of not-yet-stored node text, or NULL when none
+  // nIndex: current read offset into lpXML.
+  // nIndexMissigEndTag: value of nIndex recorded when an end tag did not match
+  // the element being closed.
+  int nIndex, nIndexMissigEndTag;
+  enum XMLError error; // last error recorded by the parsing helpers
+  XMLCSTR lpEndTag;    // name of an end tag that still has to be matched ...
+  int cbEndTag;        // ... and its length (0 when there is none)
+  XMLCSTR lpNewElement; // name of an element the caller must create ...
+  int cbNewElement;     // ... and its length (0 when there is none)
+  int nFirst; // non-zero until ParseXMLElement has been entered once
+} XML;
+
+// Result of GetNextToken.
+typedef struct {
+  ALLXMLClearTag *pClr; // matching clear-tag entry; set for eTokenClear tokens
+  XMLCSTR pStr;         // first character of the token inside the parsed text
+                        // (NULL for eTokenError)
+} NextToken;
+
+// Enumeration used when parsing attributes
+// (which part of a `name = value` attribute the parser expects next)
+typedef enum Attrib { eAttribName = 0, eAttribEquals, eAttribValue } Attrib;
+
+// Enumeration used when parsing elements to dictate whether we are currently
+// inside a tag
+// (eInsideTag: between '<name' and its closing '>'; eOutsideTag: otherwise)
+typedef enum Status { eInsideTag = 0, eOutsideTag } Status;
+
+// Render this node with createXMLString() and write it to `filename`.
+//
+// filename: file to create/overwrite (opened in binary mode).
+// encoding: name placed in the generated XML declaration (non-wide build
+//           only). It is overridden with "utf-8" / "SHIFT-JIS" when
+//           characterEncoding says so, and defaults to "ISO-8859-1" when NULL.
+// nFormat:  forwarded unchanged to createXMLString().
+//
+// An XML declaration is generated unless this node is itself a declaration,
+// or it is an unnamed node whose getChildNode() is a declaration. In the
+// non-wide build a UTF-8 byte-order mark is written first when
+// characterEncoding is char_encoding_UTF8; the wide build always starts with
+// the bytes FF FE.
+//
+// Returns eXMLErrorNone on success or when the node is empty,
+// eXMLErrorCannotOpenWriteFile when the file cannot be opened, and
+// eXMLErrorCannotWriteFile when a write or the final close fails. The file
+// handle and the rendered string are released on every path.
+XMLError XMLNode::writeToFile(XMLCSTR filename, const char *encoding,
+                              char nFormat) const {
+  if (!d)
+    return eXMLErrorNone;
+  FILE *f = xfopen(filename, _CXML("wb"));
+  if (!f)
+    return eXMLErrorCannotOpenWriteFile;
+#ifdef _XMLWIDECHAR
+  unsigned char h[2] = {0xFF, 0xFE};
+  if (!fwrite(h, 2, 1, f)) {
+    fclose(f);
+    return eXMLErrorCannotWriteFile;
+  }
+  if ((!isDeclaration()) &&
+      ((d->lpszName) || (!getChildNode().isDeclaration()))) {
+    // The literal below is exactly 40 wide characters long.
+    if (!fwrite(L"<?xml version=\"1.0\" encoding=\"utf-16\"?>\n",
+                sizeof(wchar_t) * 40, 1, f)) {
+      fclose(f);
+      return eXMLErrorCannotWriteFile;
+    }
+  }
+#else
+  if ((!isDeclaration()) &&
+      ((d->lpszName) || (!getChildNode().isDeclaration()))) {
+    if (characterEncoding == char_encoding_UTF8) {
+      // header so that windows recognize the file as UTF-8:
+      unsigned char h[3] = {0xEF, 0xBB, 0xBF};
+      if (!fwrite(h, 3, 1, f)) {
+        fclose(f);
+        return eXMLErrorCannotWriteFile;
+      }
+      encoding = "utf-8";
+    } else if (characterEncoding == char_encoding_ShiftJIS)
+      encoding = "SHIFT-JIS";
+
+    if (!encoding)
+      encoding = "ISO-8859-1";
+    if (fprintf(f, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", encoding) <
+        0) {
+      fclose(f);
+      return eXMLErrorCannotWriteFile;
+    }
+  } else {
+    if (characterEncoding == char_encoding_UTF8) {
+      unsigned char h[3] = {0xEF, 0xBB, 0xBF};
+      if (!fwrite(h, 3, 1, f)) {
+        fclose(f);
+        return eXMLErrorCannotWriteFile;
+      }
+    }
+  }
+#endif
+  int i = 0;
+  XMLSTR t = createXMLString(nFormat, &i);
+  if (!t) {
+    // Nothing could be rendered; report it the same way as a failed write.
+    fclose(f);
+    return eXMLErrorCannotWriteFile;
+  }
+  XMLError result = eXMLErrorNone;
+  if (!fwrite(t, sizeof(XMLCHAR) * i, 1, f))
+    result = eXMLErrorCannotWriteFile;
+  free(t);
+  // Always close, even after a failed write, so the handle is not leaked.
+  if (fclose(f) != 0)
+    result = eXMLErrorCannotWriteFile;
+  return result;
+}
+
+// Duplicate a given string.
+// lpszData: string to copy; NULL yields NULL.
+// cbData:   number of characters to copy, or -1 to copy up to the terminator.
+//           Any other negative value yields NULL.
+// Returns a malloc'ed, zero-terminated copy that the caller must free(), or
+// NULL when the allocation fails.
+XMLSTR stringDup(XMLCSTR lpszData, int cbData) {
+  if (lpszData == NULL)
+    return NULL;
+
+  if (cbData == -1)
+    cbData = (int)xstrlen(lpszData);
+  // Reject other negative lengths explicitly instead of relying on the size
+  // computation wrapping around and malloc failing.
+  if (cbData < 0)
+    return NULL;
+
+  XMLSTR lpszNew = (XMLSTR)malloc((cbData + 1) * sizeof(XMLCHAR));
+  if (lpszNew) {
+    memcpy(lpszNew, lpszData, (cbData) * sizeof(XMLCHAR));
+    // Plain zero: NULL is a pointer constant and must not be cast to a
+    // character type.
+    lpszNew[cbData] = 0;
+  }
+  return lpszNew;
+}
+
+// Copy `source` into `dest`, replacing every character listed in XMLEntities
+// by its escape sequence.
+//
+// dest:   caller-provided buffer; no bounds check is done here ("UnSafe"), so
+//         it must be large enough for the escaped text plus the terminator.
+// source: zero-terminated input.
+// Returns `dest` (the start of the escaped, zero-terminated string).
+//
+// In the non-wide build characters are copied in units of
+// XML_ByteTable[first byte] bytes. The copy stops early at a NUL so that a
+// truncated multi-byte sequence at the end of `source` can never make the
+// scan run past the terminator.
+XMLSTR ToXMLStringTool::toXMLUnSafe(XMLSTR dest, XMLCSTR source) {
+  XMLSTR dd = dest;
+  XMLCHAR ch;
+  while ((ch = *source)) {
+    // Is this one of the characters that must be escaped?
+    XMLCharacterEntity *entity = XMLEntities;
+    bool escaped = false;
+    do {
+      if (ch == entity->c) {
+        xstrcpy(dest, entity->s);
+        dest += entity->l;
+        source++;
+        escaped = true;
+        break;
+      }
+      entity++;
+    } while (entity->s);
+    if (escaped)
+      continue;
+#ifdef _XMLWIDECHAR
+    *(dest++) = *(source++);
+#else
+    int n = XML_ByteTable[(unsigned char)ch];
+    if (n < 1)
+      n = 1; // always make progress, whatever the table says
+    for (; (n > 0) && (*source); n--)
+      *(dest++) = *(source++);
+#endif
+  }
+  *dest = 0;
+  return dd;
+}
+
+// private (used while rendering):
+// Number of characters (terminator excluded) that `source` occupies once the
+// characters listed in XMLEntities are replaced by their escape sequences.
+// NOTE(review): in the non-wide build the scan advances by
+// XML_ByteTable[first byte] without looking at the skipped bytes, so a
+// truncated multi-byte sequence at the end of `source` steps over the
+// terminator.
+int ToXMLStringTool::lengthXMLString(XMLCSTR source) {
+  int total = 0;
+  XMLCHAR current;
+  while ((current = *source)) {
+    // Look the character up in the escape table.
+    XMLCharacterEntity *candidate = XMLEntities;
+    bool needsEscape = false;
+    do {
+      if (current == candidate->c) {
+        needsEscape = true;
+        break;
+      }
+      candidate++;
+    } while (candidate->s);
+
+    if (needsEscape) {
+      total += candidate->l;
+      source++;
+      continue;
+    }
+#ifdef _XMLWIDECHAR
+    total++;
+    source++;
+#else
+    current = XML_ByteTable[(unsigned char)current];
+    total += current;
+    source += current;
+#endif
+  }
+  return total;
+}
+
+// Releases the conversion buffer through freeBuffer().
+ToXMLStringTool::~ToXMLStringTool() { freeBuffer(); }
+// Drop the internal conversion buffer and reset its recorded size to 0.
+void ToXMLStringTool::freeBuffer() {
+  if (buf != NULL) {
+    free(buf);
+    buf = NULL;
+  }
+  buflen = 0;
+}
+// Escape `source` into the tool's internal buffer, growing it when needed.
+//
+// Returns the internal buffer holding the escaped, zero-terminated string;
+// it stays owned by this object and is overwritten by the next call.
+// Returns NULL when the buffer cannot be grown; the previous buffer and its
+// recorded size are then left untouched.
+XMLSTR ToXMLStringTool::toXML(XMLCSTR source) {
+  int l = lengthXMLString(source) + 1;
+  if (l > buflen) {
+    // Keep the old pointer until realloc has succeeded, otherwise a failure
+    // would both leak it and leave buflen describing memory we do not have.
+    XMLSTR grown = (XMLSTR)realloc(buf, l * sizeof(XMLCHAR));
+    if (!grown)
+      return NULL;
+    buf = grown;
+    buflen = l;
+  }
+  return toXMLUnSafe(buf, source);
+}
+
+// private:
+XMLSTR fromXMLString(XMLCSTR s, int lo, XML *pXML) {
+  // This function is the opposite of the function "toXMLString". It decodes the
+  // escape sequences &amp;, &quot;, &apos;, &lt;, &gt; and replace them by the
+  // characters
+  // &,",',<,>. This function is used internally by the XML Parser. All the
+  // calls to the XML library will always gives you back "decoded" strings.
+  //
+  // in: string (s) and length (lo) of string
+  // out:  new allocated string converted from xml
+  //
+  // Numeric references (&#NNN; and &#xHHH;) are decoded as well. On a
+  // malformed or unknown reference the function stores an error code in
+  // pXML->error and returns NULL. The result is obtained with malloc().
+  //
+  // The work is done in two passes over the input: the first one validates
+  // the references and computes the decoded length (ll), the second one
+  // writes the decoded characters.
+  if (!s)
+    return NULL;
+
+  int ll = 0, j;
+  XMLSTR d;
+  XMLCSTR ss = s;
+  XMLCharacterEntity *entity;
+  // Pass 1: measure. Each reference counts as one output character.
+  while ((lo > 0) && (*s)) {
+    if (*s == _CXML('&')) {
+      if ((lo > 2) && (s[1] == _CXML('#'))) {
+        // Numeric reference: skip "&#", an optional x/X, then everything up
+        // to the terminating ';'. The digits are validated in pass 2.
+        s += 2;
+        lo -= 2;
+        if ((*s == _CXML('X')) || (*s == _CXML('x'))) {
+          s++;
+          lo--;
+        }
+        while ((*s) && (*s != _CXML(';')) && ((lo--) > 0))
+          s++;
+        if (*s != _CXML(';')) {
+          pXML->error = eXMLErrorUnknownCharacterEntity;
+          return NULL;
+        }
+        s++;
+        lo--;
+      } else {
+        // Named reference: must match one entry of XMLEntities
+        // (case-insensitive comparison).
+        entity = XMLEntities;
+        do {
+          if ((lo >= entity->l) && (xstrnicmp(s, entity->s, entity->l) == 0)) {
+            s += entity->l;
+            lo -= entity->l;
+            break;
+          }
+          entity++;
+        } while (entity->s);
+        if (!entity->s) {
+          pXML->error = eXMLErrorUnknownCharacterEntity;
+          return NULL;
+        }
+      }
+    } else {
+#ifdef _XMLWIDECHAR
+      s++;
+      lo--;
+#else
+      // A multi-byte character contributes all of its bytes to the length:
+      // j - 1 here plus the common ll++ below.
+      j = XML_ByteTable[(unsigned char)*s];
+      s += j;
+      lo -= j;
+      ll += j - 1;
+#endif
+    }
+    ll++;
+  }
+
+  // NOTE(review): the result of malloc is not checked before it is written to
+  // in pass 2.
+  d = (XMLSTR)malloc((ll + 1) * sizeof(XMLCHAR));
+  // From here on `s` no longer walks the input: it remembers the start of the
+  // output buffer (for the return value and for free() on error) while `d`
+  // is the write cursor and `ss` the read cursor.
+  s = d;
+  // Pass 2: decode.
+  while (ll-- > 0) {
+    if (*ss == _CXML('&')) {
+      if (ss[1] == _CXML('#')) {
+        ss += 2;
+        j = 0;
+        if ((*ss == _CXML('X')) || (*ss == _CXML('x'))) {
+          // Hexadecimal reference.
+          ss++;
+          while (*ss != _CXML(';')) {
+            if ((*ss >= _CXML('0')) && (*ss <= _CXML('9')))
+              j = (j << 4) + *ss - _CXML('0');
+            else if ((*ss >= _CXML('A')) && (*ss <= _CXML('F')))
+              j = (j << 4) + *ss - _CXML('A') + 10;
+            else if ((*ss >= _CXML('a')) && (*ss <= _CXML('f')))
+              j = (j << 4) + *ss - _CXML('a') + 10;
+            else {
+              free((void *)s);
+              pXML->error = eXMLErrorUnknownCharacterEntity;
+              return NULL;
+            }
+            ss++;
+          }
+        } else {
+          // Decimal reference.
+          while (*ss != _CXML(';')) {
+            if ((*ss >= _CXML('0')) && (*ss <= _CXML('9')))
+              j = (j * 10) + *ss - _CXML('0');
+            else {
+              free((void *)s);
+              pXML->error = eXMLErrorUnknownCharacterEntity;
+              return NULL;
+            }
+            ss++;
+          }
+        }
+#ifndef _XMLWIDECHAR
+        // A single output char cannot hold a code point above 255.
+        if (j > 255) {
+          free((void *)s);
+          pXML->error = eXMLErrorCharacterCodeAbove255;
+          return NULL;
+        }
+#endif
+        (*d++) = (XMLCHAR)j;
+        ss++; // skip the ';'
+      } else {
+        // Named reference; pass 1 already made sure one entry matches.
+        entity = XMLEntities;
+        do {
+          if (xstrnicmp(ss, entity->s, entity->l) == 0) {
+            *(d++) = entity->c;
+            ss += entity->l;
+            break;
+          }
+          entity++;
+        } while (entity->s);
+      }
+    } else {
+#ifdef _XMLWIDECHAR
+      *(d++) = *(ss++);
+#else
+      // Copy one (possibly multi-byte) character. The cases fall through on
+      // purpose; the extra ll-- account for the additional bytes written.
+      switch (XML_ByteTable[(unsigned char)*ss]) {
+      case 4:
+        *(d++) = *(ss++);
+        ll--;
+      case 3:
+        *(d++) = *(ss++);
+        ll--;
+      case 2:
+        *(d++) = *(ss++);
+        ll--;
+      case 1:
+        *(d++) = *(ss++);
+      }
+#endif
+    }
+  }
+  *d = 0;
+  return (XMLSTR)s;
+}
+
+// True when `ch` is a newline, a space, a tab or a carriage return.
+// The expansion is one fully parenthesized expression; the tokenizer relies on
+// that when it writes `while XML_isSPACECHAR(ch);` and `if XML_isSPACECHAR(ch)`
+// without parentheses of its own. `ch` may be evaluated up to four times, so
+// only pass expressions without side effects.
+#define XML_isSPACECHAR(ch)                                                    \
+  ((ch == _CXML('\n')) || (ch == _CXML(' ')) || (ch == _CXML('\t')) ||         \
+   (ch == _CXML('\r')))
+
+// private:
+char myTagCompare(XMLCSTR cclose, XMLCSTR copen)
+// WARNING, the return convention is the one of strcmp, not of a predicate:
+//   0 when the tag name `cclose` is the name found at the start of `copen`
+//     (compared without regard to case),
+//   1 when the names differ or when `cclose` is NULL.
+{
+  if (cclose == NULL)
+    return 1;
+
+  const int nameLen = (int)xstrlen(cclose);
+  if (xstrnicmp(cclose, copen, nameLen))
+    return 1;
+
+  // A common prefix is not enough: the name inside `copen` has to end right
+  // after it, i.e. be followed by white space or by one of / < > =
+  const XMLCHAR follower = copen[nameLen];
+  const bool nameEndsHere =
+      XML_isSPACECHAR(follower) || (follower == _CXML('/')) ||
+      (follower == _CXML('<')) || (follower == _CXML('>')) ||
+      (follower == _CXML('='));
+  return nameEndsHere ? 0 : 1;
+}
+
+// Obtain the next character from the string.
+// Returns the character at the current read offset and moves the offset past
+// it. In the non-wide build the step is the character's length according to
+// XML_ByteTable; in the wide build the offset is left alone once the
+// terminating zero has been reached.
+static inline XMLCHAR getNextChar(XML *pXML) {
+  const XMLCHAR current = pXML->lpXML[pXML->nIndex];
+#ifdef _XMLWIDECHAR
+  if (current != 0)
+    pXML->nIndex++;
+#else
+  pXML->nIndex += XML_ByteTable[(unsigned char)current];
+#endif
+  return current;
+}
+
+// Find the next token in a string.
+// pcbToken contains the number of characters that have been read.
+// pXML:     parser state; pXML->nIndex is advanced past the token.
+// pcbToken: receives the token length (0 for eTokenError). It is NOT written
+//           when a clear tag is recognised (eTokenClear), because that path
+//           returns early.
+// pType:    receives the kind of token found.
+// Returns the token start (pStr) and, for eTokenClear, the matching entry of
+// XMLClearTags (pClr).
+static NextToken GetNextToken(XML *pXML, int *pcbToken,
+                              enum XMLTokenTypeTag *pType) {
+  NextToken result;
+  XMLCHAR ch;
+  XMLCHAR chTemp;
+  int indexStart, nFoundMatch, nIsText = FALSE;
+  result.pClr = NULL; // prevent warning
+
+  // Find next non-white space character
+  do {
+    indexStart = pXML->nIndex;
+    ch = getNextChar(pXML);
+  } while XML_isSPACECHAR(ch);
+
+  if (ch) {
+    // Cache the current string pointer
+    result.pStr = &pXML->lpXML[indexStart];
+
+    // First check whether the token is in the clear tag list (meaning it
+    // does not need formatting).
+    ALLXMLClearTag *ctag = XMLClearTags;
+    do {
+      if (xstrncmp(ctag->lpszOpen, result.pStr, ctag->openTagLen) == 0) {
+        result.pClr = ctag;
+        // One character of the opening marker was already consumed above.
+        pXML->nIndex += ctag->openTagLen - 1;
+        *pType = eTokenClear;
+        return result;
+      }
+      ctag++;
+    } while (ctag->lpszOpen);
+
+    // If we didn't find a clear tag then check for standard tokens
+    switch (ch) {
+    // Check for quotes
+    case _CXML('\''):
+    case _CXML('\"'):
+      // Type of token
+      *pType = eTokenQuotedText;
+      chTemp = ch;
+
+      // Set the size
+      nFoundMatch = FALSE;
+
+      // Search through the string to find a matching quote
+      // (the search is abandoned at the first '<' or at the end of input)
+      while ((ch = getNextChar(pXML))) {
+        if (ch == chTemp) {
+          nFoundMatch = TRUE;
+          break;
+        }
+        if (ch == _CXML('<'))
+          break;
+      }
+
+      // If we failed to find a matching quote
+      // then rewind to just after the quote and treat it as plain text
+      if (nFoundMatch == FALSE) {
+        pXML->nIndex = indexStart + 1;
+        nIsText = TRUE;
+        break;
+      }
+
+      //  4.02.2002
+      //            if (FindNonWhiteSpace(pXML)) pXML->nIndex--;
+
+      break;
+
+    // Equals (used with attribute values)
+    case _CXML('='):
+      *pType = eTokenEquals;
+      break;
+
+    // Close tag
+    case _CXML('>'):
+      *pType = eTokenCloseTag;
+      break;
+
+    // Check for tag start and tag end
+    case _CXML('<'):
+
+      // Peek at the next character to see if we have an end tag '</',
+      // or an xml declaration '<?'
+      chTemp = pXML->lpXML[pXML->nIndex];
+
+      // If we have a tag end...
+      if (chTemp == _CXML('/')) {
+        // Set the type and ensure we point at the next character
+        getNextChar(pXML);
+        *pType = eTokenTagEnd;
+      }
+
+      // If we have an XML declaration tag
+      else if (chTemp == _CXML('?')) {
+
+        // Set the type and ensure we point at the next character
+        getNextChar(pXML);
+        *pType = eTokenDeclaration;
+      }
+
+      // Otherwise we must have a start tag
+      else {
+        *pType = eTokenTagStart;
+      }
+      break;
+
+    // Check to see if we have a short hand type end tag ('/>').
+    case _CXML('/'):
+
+      // Peek at the next character to see if we have a short end tag '/>'
+      chTemp = pXML->lpXML[pXML->nIndex];
+
+      // If we have a short hand end tag...
+      if (chTemp == _CXML('>')) {
+        // Set the type and ensure we point at the next character
+        getNextChar(pXML);
+        *pType = eTokenShortHandClose;
+        break;
+      }
+
+      // If we haven't found a short hand closing tag then drop into the
+      // text process
+      // (intentional fall-through into `default`)
+
+    // Other characters
+    default:
+      nIsText = TRUE;
+    }
+
+    // If this is a TEXT node
+    if (nIsText) {
+      // Indicate we are dealing with text
+      *pType = eTokenText;
+      while ((ch = getNextChar(pXML))) {
+        if
+          XML_isSPACECHAR(ch) {
+            // The white space was consumed; bumping indexStart keeps it out
+            // of the length computed below.
+            indexStart++;
+            break;
+          }
+        else if (ch == _CXML('/')) {
+          // If we find a slash then this maybe text or a short hand end tag
+          // Peek at the next character to see it we have short hand end tag
+          ch = pXML->lpXML[pXML->nIndex];
+          // If we found a short hand end tag then we need to exit the loop
+          if (ch == _CXML('>')) {
+            // Un-read the '/' so that the next call sees "/>".
+            pXML->nIndex--;
+            break;
+          }
+
+        } else if ((ch == _CXML('<')) || (ch == _CXML('>')) ||
+                   (ch == _CXML('='))) {
+          // Un-read the delimiter; it is a token of its own.
+          pXML->nIndex--;
+          break;
+        }
+      }
+    }
+    *pcbToken = pXML->nIndex - indexStart;
+  } else {
+    // If we failed to obtain a valid character
+    *pcbToken = 0;
+    *pType = eTokenError;
+    result.pStr = NULL;
+  }
+
+  return result;
+}
+
+// Replace the node's name by `lpszName` without duplicating the string: the
+// node takes the pointer as-is. A previous, different name is freed. On an
+// empty node (no data block) `lpszName` itself is freed and NULL is returned;
+// otherwise the new name is returned.
+XMLCSTR XMLNode::updateName_WOSD(XMLSTR lpszName) {
+  if (d == NULL) {
+    free(lpszName);
+    return NULL;
+  }
+
+  XMLCSTR previous = d->lpszName;
+  const bool mustRelease = (previous != NULL) && (lpszName != previous);
+  if (mustRelease)
+    free((void *)previous);
+
+  d->lpszName = lpszName;
+  return lpszName;
+}
+
+// private:
+// Wrap an existing data block: the new handle shares `p` and adds one
+// reference to it. `p` must not be NULL.
+XMLNode::XMLNode(struct XMLNodeDataTag *p) : d(p) { ++(d->ref_count); }
+// Create a fresh node with an empty content and a reference count of 1.
+//
+// pParent:       data block of the parent node (NULL for a top node).
+// lpszName:      name of the node; handed to updateName_WOSD(), i.e. taken
+//                over as-is, not copied.
+// isDeclaration: stored unchanged in the data block.
+//
+// If the data block cannot be allocated the node is left without data
+// (d == NULL) instead of writing through a null pointer; updateName_WOSD()
+// then frees `lpszName`.
+XMLNode::XMLNode(XMLNodeData *pParent, XMLSTR lpszName, char isDeclaration) {
+  d = (XMLNodeData *)malloc(sizeof(XMLNodeData));
+  if (d) {
+    d->ref_count = 1;
+
+    d->lpszName = NULL;
+    d->nChild = 0;
+    d->nText = 0;
+    d->nClear = 0;
+    d->nAttribute = 0;
+
+    d->isDeclaration = isDeclaration;
+
+    d->pParent = pParent;
+    d->pChild = NULL;
+    d->pText = NULL;
+    d->pClear = NULL;
+    d->pAttribute = NULL;
+    d->pOrder = NULL;
+  }
+
+  updateName_WOSD(lpszName);
+}
+
+// Create a parentless node. `lpszName` is passed straight to the XMLNode
+// constructor, so no copy of the string is made here.
+XMLNode XMLNode::createXMLTopNode_WOSD(XMLSTR lpszName, char isDeclaration) {
+  return XMLNode(NULL, lpszName, isDeclaration);
+}
+// Create a parentless node named after a copy (stringDup) of `lpszName`; the
+// caller keeps ownership of its own string.
+XMLNode XMLNode::createXMLTopNode(XMLCSTR lpszName, char isDeclaration) {
+  return XMLNode(NULL, stringDup(lpszName), isDeclaration);
+}
+
+#define MEMORYINCREASE 50
+
+// free() that is explicitly skipped for a NULL pointer.
+static inline void myFree(void *p) {
+  if (p == NULL)
+    return;
+  free(p);
+}
+// Chunked growth helper for the arrays of a node.
+//
+// p:          current array, or NULL when nothing is allocated yet.
+// newsize:    number of elements the array must hold after the call.
+// memInc:     growth step in elements; 0 means "reallocate on every call".
+// sizeofElem: size of one element in bytes.
+//
+// A NULL `p` yields a first block of memInc elements (one element when memInc
+// is 0). Otherwise the block is only reallocated, to newsize + memInc
+// elements, when memInc is 0 or newsize is a multiple of memInc; in every
+// other case `p` is returned untouched. Allocation failures are not reported:
+// the NULL coming back from malloc/realloc is returned as-is.
+static inline void *myRealloc(void *p, int newsize, int memInc,
+                              int sizeofElem) {
+  if (!p) {
+    const int firstCount = memInc ? memInc : 1;
+    return malloc(firstCount * sizeofElem);
+  }
+
+  const bool chunkExhausted = (memInc == 0) || ((newsize % memInc) == 0);
+  if (!chunkExhausted)
+    return p;
+
+  return realloc(p, (newsize + memInc) * sizeofElem);
+}
+
+// private:
+// Position, inside d->pOrder, of the entry that stands for element number
+// `index` of kind `xxtype` (entries are encoded as (index << 2) + kind).
+// Returns -1 for a negative index. The scan has no upper bound: the entry
+// must be present in the order array.
+XMLElementPosition XMLNode::findPosition(XMLNodeData *d, int index,
+                                         XMLElementType xxtype) {
+  if (index < 0)
+    return -1;
+
+  const int wanted = (int)((index << 2) + xxtype);
+  const int *order = d->pOrder;
+  int slot = 0;
+  for (; order[slot] != wanted; ++slot) {
+  }
+  return slot;
+}
+
+// private:
+// update "order" information when deleting a content of a XMLNode
+// d:     data block whose order array is updated.
+// t:     kind of the removed element.
+// index: index of the removed element inside its own array (pChild, pText or
+//        pClear).
+// The entry is located with findPosition(), the entries behind it are moved
+// one slot down, and every later entry of the same kind gets its index
+// lowered by one (entries are (index << 2) + kind, hence "-= 4").
+// Returns the value of the loop counter after that last loop.
+// NOTE(review): n is computed from the current counters and the memmove reads
+// entries up to o[n]; this is only in bounds if callers decrement the
+// relevant counter BEFORE calling this function - confirm against the
+// delete* callers.
+int XMLNode::removeOrderElement(XMLNodeData *d, XMLElementType t, int index) {
+  int n = d->nChild + d->nText + d->nClear, *o = d->pOrder,
+      i = findPosition(d, index, t);
+  memmove(o + i, o + i + 1, (n - i) * sizeof(int));
+  for (; i < n; i++)
+    if ((o[i] & 3) == (int)t)
+      o[i] -= 4;
+  // We should normally do:
+  // d->pOrder=(int)realloc(d->pOrder,n*sizeof(int));
+  // but we skip reallocation because it's too time consuming.
+  // Anyway, at the end, it will be free'd completely at once.
+  return i;
+}
+
+// Make room for one more element in one of the node's arrays and register it
+// in the order array d->pOrder.
+//
+// memoryIncrease: growth step handed to myRealloc (three times that step is
+//                 used for the order array).
+// _pos:           see the in/out description below.
+// nc:             current number of elements in `p`.
+// p:              the array receiving the element (pChild, pText or pClear).
+// size:           size in bytes of one element of `p`.
+// xtype:          kind of the element; stored in the two low bits of the
+//                 order entry, the index inside `p` in the remaining bits.
+// Returns the (possibly moved) array `p`. The caller stores the element at
+// index *_pos and increments its own counter.
+void *XMLNode::addToOrder(int memoryIncrease, int *_pos, int nc, void *p,
+                          int size, XMLElementType xtype) {
+  //  in: *_pos is the position inside d->pOrder ("-1" means "EndOf")
+  // out: *_pos is the index inside p
+  p = myRealloc(p, (nc + 1), memoryIncrease, size);
+  int n = d->nChild + d->nText + d->nClear;
+  d->pOrder =
+      (int *)myRealloc(d->pOrder, n + 1, memoryIncrease * 3, sizeof(int));
+  int pos = *_pos, *o = d->pOrder;
+
+  // Append: the new element becomes the last one of its kind and the last
+  // entry of the order array.
+  if ((pos < 0) || (pos >= n)) {
+    *_pos = nc;
+    o[n] = (int)((nc << 2) + xtype);
+    return p;
+  }
+
+  // Insert: open a slot at the requested order position.
+  int i = pos;
+  memmove(o + i + 1, o + i, (n - i) * sizeof(int));
+
+  // Look for the first entry of the same kind from the requested position on;
+  // the new element takes over that entry's index.
+  // NOTE(review): this search runs on the already shifted array and stops at
+  // n - 1, so the former last entry (now at o[n]) is never examined, and the
+  // "not found" branch below overwrites o[n] - verify positional inserts
+  // whose only same-kind successor is the last entry.
+  while ((pos < n) && ((o[pos] & 3) != (int)xtype))
+    pos++;
+  if (pos == n) {
+    *_pos = nc;
+    o[n] = (int)((nc << 2) + xtype);
+    return p;
+  }
+
+  o[i] = o[pos];
+  // Entries of the same kind located after the match move one index up.
+  // NOTE(review): the loop starts at pos + 1, so when the match was found at
+  // pos > i the entry o[pos] itself keeps the index just copied into o[i] -
+  // confirm this is intended.
+  for (i = pos + 1; i <= n; i++)
+    if ((o[i] & 3) == (int)xtype)
+      o[i] += 4;
+
+  // Open the matching slot inside the element array itself.
+  *_pos = pos = o[pos] >> 2;
+  memmove(((char *)p) + (pos + 1) * size, ((char *)p) + pos * size,
+          (nc - pos) * size);
+
+  return p;
+}
+
+// Add a child node to the given element.
+// memoryIncrease: growth step for the child and order arrays.
+// lpszName:       name of the new child; ownership passes to the child (the
+//                 string is not copied). NULL yields emptyXMLNode.
+// isDeclaration:  forwarded to the XMLNode constructor.
+// pos:            position inside the order array, -1 to append.
+// Returns the new child, or emptyXMLNode when nothing was added. Like the
+// other add*_priv helpers, an empty parent (no data block) releases the
+// string it was given instead of dereferencing a null pointer.
+XMLNode XMLNode::addChild_priv(int memoryIncrease, XMLSTR lpszName,
+                               char isDeclaration, int pos) {
+  if (!lpszName)
+    return emptyXMLNode;
+  if (!d) {
+    myFree(lpszName);
+    return emptyXMLNode;
+  }
+  d->pChild = (XMLNode *)addToOrder(memoryIncrease, &pos, d->nChild, d->pChild,
+                                    sizeof(XMLNode), eNodeChild);
+  // The slot is raw memory: clear its handle before assigning so that the
+  // assignment does not try to release whatever bytes were there.
+  d->pChild[pos].d = NULL;
+  d->pChild[pos] = XMLNode(d, lpszName, isDeclaration);
+  d->nChild++;
+  return d->pChild[pos];
+}
+
+// Add an attribute to an element.
+// Append the attribute (lpszName, lpszValuev) to this node. Both strings are
+// stored as-is, not copied. Returns the stored attribute, or
+// &emptyXMLAttribute when the name is NULL or the node has no data block (in
+// the latter case both strings are freed).
+XMLAttribute *XMLNode::addAttribute_priv(int memoryIncrease, XMLSTR lpszName,
+                                         XMLSTR lpszValuev) {
+  if (lpszName == NULL)
+    return &emptyXMLAttribute;
+  if (d == NULL) {
+    myFree(lpszName);
+    myFree(lpszValuev);
+    return &emptyXMLAttribute;
+  }
+
+  const int slot = d->nAttribute;
+  d->pAttribute = (XMLAttribute *)myRealloc(d->pAttribute, slot + 1,
+                                            memoryIncrease,
+                                            sizeof(XMLAttribute));
+  XMLAttribute &added = d->pAttribute[slot];
+  added.lpszName = lpszName;
+  added.lpszValue = lpszValuev;
+  ++d->nAttribute;
+  return &added;
+}
+
+// Add text to the element.
+// Store `lpszValue` (as-is, not copied) as a text of this node at order
+// position `pos` (-1 appends). Returns the stored string, or NULL when the
+// value is NULL or the node has no data block (the value is then freed).
+XMLCSTR XMLNode::addText_priv(int memoryIncrease, XMLSTR lpszValue, int pos) {
+  if (lpszValue == NULL)
+    return NULL;
+  if (d == NULL) {
+    myFree(lpszValue);
+    return NULL;
+  }
+
+  // addToOrder turns `pos` from an order position into an index of pText.
+  XMLCSTR *texts = (XMLCSTR *)addToOrder(memoryIncrease, &pos, d->nText,
+                                         d->pText, sizeof(XMLSTR), eNodeText);
+  d->pText = texts;
+  texts[pos] = lpszValue;
+  d->nText += 1;
+  return lpszValue;
+}
+
+// Add clear (unformatted) text to the element.
+// Store `lpszValue` (as-is, not copied) as a clear section of this node at
+// order position `pos` (-1 appends). A NULL lpszOpen / lpszClose is replaced
+// by the corresponding marker of the first XMLClearTags entry. Returns the
+// stored section, or &emptyXMLClear when the value is NULL or the node has no
+// data block (the value is then freed).
+XMLClear *XMLNode::addClear_priv(int memoryIncrease, XMLSTR lpszValue,
+                                 XMLCSTR lpszOpen, XMLCSTR lpszClose, int pos) {
+  if (lpszValue == NULL)
+    return &emptyXMLClear;
+  if (d == NULL) {
+    myFree(lpszValue);
+    return &emptyXMLClear;
+  }
+
+  // addToOrder turns `pos` from an order position into an index of pClear.
+  d->pClear = (XMLClear *)addToOrder(memoryIncrease, &pos, d->nClear, d->pClear,
+                                     sizeof(XMLClear), eNodeClear);
+  XMLClear &section = d->pClear[pos];
+  section.lpszValue = lpszValue;
+  section.lpszOpenTag = lpszOpen ? lpszOpen : XMLClearTags[0].lpszOpen;
+  section.lpszCloseTag = lpszClose ? lpszClose : XMLClearTags[0].lpszClose;
+  ++d->nClear;
+  return &section;
+}
+
+// private:
+// Parse a clear (unformatted) type node.
+// px:      the parser state (an XML*); nIndex must sit just behind the
+//          opening marker of the clear section.
+// _pClear: the XMLClearTags entry (an ALLXMLClearTag*) whose opening marker
+//          was recognised.
+// Returns 0 when the closing marker was found (the text in between is added
+// to this node and nIndex is moved behind the closing marker), 1 otherwise
+// (pXML->error is then eXMLErrorUnmatchedEndClearTag).
+char XMLNode::parseClearTag(void *px, void *_pClear) {
+  XML *pXML = (XML *)px;
+  // Work on a copy: the closing marker may be replaced below.
+  ALLXMLClearTag pClear = *((ALLXMLClearTag *)_pClear);
+  int cbTemp = 0;
+  XMLCSTR lpszTemp = NULL;
+  XMLCSTR lpXML = &pXML->lpXML[pXML->nIndex];
+  static XMLCSTR docTypeEnd = _CXML("]>");
+
+  // Find the closing tag
+  // Seems the <!DOCTYPE need a better treatment so lets handle it
+  if (pClear.lpszOpen == XMLClearTags[1].lpszOpen) {
+    // Scan forward: a '<' seen before any '>' switches the closing marker to
+    // "]>"; otherwise the section ends at the first '>'.
+    XMLCSTR pCh = lpXML;
+    while (*pCh) {
+      if (*pCh == _CXML('<')) {
+        pClear.lpszClose = docTypeEnd;
+        lpszTemp = xstrstr(lpXML, docTypeEnd);
+        break;
+      } else if (*pCh == _CXML('>')) {
+        lpszTemp = pCh;
+        break;
+      }
+#ifdef _XMLWIDECHAR
+      pCh++;
+#else
+      pCh += XML_ByteTable[(unsigned char)(*pCh)];
+#endif
+    }
+  } else
+    lpszTemp = xstrstr(lpXML, pClear.lpszClose);
+
+  if (lpszTemp) {
+    // Cache the size and increment the index
+    cbTemp = (int)(lpszTemp - lpXML);
+
+    pXML->nIndex += cbTemp + (int)xstrlen(pClear.lpszClose);
+
+    // Add the clear node to the current element
+    addClear_priv(MEMORYINCREASE, stringDup(lpXML, cbTemp), pClear.lpszOpen,
+                  pClear.lpszClose, -1);
+    return 0;
+  }
+
+  // If we failed to find the end tag
+  pXML->error = eXMLErrorUnmatchedEndClearTag;
+  return 1;
+}
+
+// Shrink every allocated array of `d` to the number of elements it currently
+// holds, giving back the slack left by the chunked growth. Arrays that are
+// NULL are left alone; the result of realloc is stored without being checked.
+void XMLNode::exactMemory(XMLNodeData *d) {
+  if (d->pOrder) {
+    const int orderCount = d->nChild + d->nText + d->nClear;
+    d->pOrder = (int *)realloc(d->pOrder, orderCount * sizeof(int));
+  }
+  if (d->pChild) {
+    d->pChild = (XMLNode *)realloc(d->pChild, d->nChild * sizeof(XMLNode));
+  }
+  if (d->pAttribute) {
+    d->pAttribute = (XMLAttribute *)realloc(
+        d->pAttribute, d->nAttribute * sizeof(XMLAttribute));
+  }
+  if (d->pText) {
+    d->pText = (XMLCSTR *)realloc(d->pText, d->nText * sizeof(XMLSTR));
+  }
+  if (d->pClear) {
+    d->pClear = (XMLClear *)realloc(d->pClear, d->nClear * sizeof(XMLClear));
+  }
+}
+
+// Store the text collected since pXML->lpszText, if any, as a text of this
+// node.
+//
+// pa:        the parser state (an XML*).
+// tokenPStr: start of the token that ends the text; the text is the range
+//            [pXML->lpszText, tokenPStr).
+//
+// With dropWhiteSpace set, leading and trailing white space is trimmed and
+// an all-white-space text is ignored. The text is decoded with
+// fromXMLString(). With removeCommentsInMiddleOfText set, a comment that
+// directly follows a text is deleted and the new text is appended to that
+// previous text instead of becoming a text of its own.
+//
+// Returns 0 on success (including "nothing to add") and 1 on failure
+// (decoding error or out of memory). On an out-of-memory failure the
+// previous text is kept intact and the decoded string is released.
+char XMLNode::maybeAddTxT(void *pa, XMLCSTR tokenPStr) {
+  XML *pXML = (XML *)pa;
+  XMLCSTR lpszText = pXML->lpszText;
+  if (!lpszText)
+    return 0;
+  if (dropWhiteSpace)
+    while (XML_isSPACECHAR(*lpszText) && (lpszText != tokenPStr))
+      lpszText++;
+  int cbText = (int)(tokenPStr - lpszText);
+  if (!cbText) {
+    pXML->lpszText = NULL;
+    return 0;
+  }
+  if (dropWhiteSpace) {
+    // Trim trailing white space: step back to the last non-space character,
+    // then convert that index back into a length.
+    cbText--;
+    while ((cbText) && XML_isSPACECHAR(lpszText[cbText]))
+      cbText--;
+    cbText++;
+  }
+  if (!cbText) {
+    pXML->lpszText = NULL;
+    return 0;
+  }
+  XMLSTR lpt = fromXMLString(lpszText, cbText, pXML);
+  if (!lpt)
+    return 1;
+  pXML->lpszText = NULL;
+  if (removeCommentsInMiddleOfText && d->nText && d->nClear) {
+    // if the previous insertion was a comment (<!-- -->) AND
+    // if the previous previous insertion was a text then, delete the comment
+    // and append the text
+    int n = d->nChild + d->nText + d->nClear - 1, *o = d->pOrder;
+    if (((o[n] & 3) == eNodeClear) && ((o[n - 1] & 3) == eNodeText)) {
+      int i = o[n] >> 2;
+      if (d->pClear[i].lpszOpenTag == XMLClearTags[2].lpszOpen) {
+        deleteClear(i);
+        i = o[n - 1] >> 2;
+        n = (int)xstrlen(d->pText[i]);
+        int n2 = (int)xstrlen(lpt) + 1;
+        // Grow through a temporary so that a failed realloc neither loses the
+        // existing text nor leaks the freshly decoded one.
+        XMLSTR grown =
+            (XMLSTR)realloc((void *)d->pText[i], (n + n2) * sizeof(XMLCHAR));
+        if (!grown) {
+          free(lpt);
+          return 1;
+        }
+        memcpy((void *)(grown + n), lpt, n2 * sizeof(XMLCHAR));
+        d->pText[i] = grown;
+        free(lpt);
+        return 0;
+      }
+    }
+  }
+  addText_priv(MEMORYINCREASE, lpt, -1);
+  return 0;
+}
+// private:
+// Recursively parse an XML element.
+int XMLNode::ParseXMLElement(void *pa) {
+  XML *pXML = (XML *)pa;
+  int cbToken;
+  enum XMLTokenTypeTag xtype;
+  NextToken token;
+  XMLCSTR lpszTemp = NULL;
+  int cbTemp = 0;
+  char nDeclaration;
+  XMLNode pNew;
+  enum Status status; // inside or outside a tag
+  enum Attrib attrib = eAttribName;
+
+  assert(pXML);
+
+  // If this is the first call to the function
+  if (pXML->nFirst) {
+    // Assume we are outside of a tag definition
+    pXML->nFirst = FALSE;
+    status = eOutsideTag;
+  } else {
+    // If this is not the first call then we should only be called when inside a
+    // tag.
+    status = eInsideTag;
+  }
+
+  // Iterate through the tokens in the document
+  for (;;) {
+    // Obtain the next token
+    token = GetNextToken(pXML, &cbToken, &xtype);
+
+    if (xtype != eTokenError) {
+      // Check the current status
+      switch (status) {
+
+      // If we are outside of a tag definition
+      case eOutsideTag:
+
+        // Check what type of token we obtained
+        switch (xtype) {
+        // If we have found text or quoted text
+        case eTokenText:
+        case eTokenCloseTag:       /* '>'         */
+        case eTokenShortHandClose: /* '/>'        */
+        case eTokenQuotedText:
+        case eTokenEquals:
+          break;
+
+        // If we found a start tag '<' and declarations '<?'
+        case eTokenTagStart:
+        case eTokenDeclaration:
+
+          // Cache whether this new element is a declaration or not
+          nDeclaration = (xtype == eTokenDeclaration);
+
+          // If we have node text then add this to the element
+          if (maybeAddTxT(pXML, token.pStr))
+            return FALSE;
+
+          // Find the name of the tag
+          token = GetNextToken(pXML, &cbToken, &xtype);
+
+          // Return an error if we couldn't obtain the next token or
+          // it wasnt text
+          if (xtype != eTokenText) {
+            pXML->error = eXMLErrorMissingTagName;
+            return FALSE;
+          }
+
+          // If we found a new element which is the same as this
+          // element then we need to pass this back to the caller..
+
+#ifdef APPROXIMATE_PARSING
+          if (d->lpszName && myTagCompare(d->lpszName, token.pStr) == 0) {
+            // Indicate to the caller that it needs to create a
+            // new element.
+            pXML->lpNewElement = token.pStr;
+            pXML->cbNewElement = cbToken;
+            return TRUE;
+          } else
+#endif
+          {
+            // If the name of the new element differs from the name of
+            // the current element we need to add the new element to
+            // the current one and recurse
+            pNew = addChild_priv(MEMORYINCREASE, stringDup(token.pStr, cbToken),
+                                 nDeclaration, -1);
+
+            while (!pNew.isEmpty()) {
+              // Callself to process the new node.  If we return
+              // FALSE this means we dont have any more
+              // processing to do...
+
+              if (!pNew.ParseXMLElement(pXML))
+                return FALSE;
+              else {
+                // If the call to recurse this function
+                // evented in a end tag specified in XML then
+                // we need to unwind the calls to this
+                // function until we find the appropriate node
+                // (the element name and end tag name must
+                // match)
+                if (pXML->cbEndTag) {
+                  // If we are back at the root node then we
+                  // have an unmatched end tag
+                  if (!d->lpszName) {
+                    pXML->error = eXMLErrorUnmatchedEndTag;
+                    return FALSE;
+                  }
+
+                  // If the end tag matches the name of this
+                  // element then we only need to unwind
+                  // once more...
+
+                  if (myTagCompare(d->lpszName, pXML->lpEndTag) == 0) {
+                    pXML->cbEndTag = 0;
+                  }
+
+                  return TRUE;
+                } else if (pXML->cbNewElement) {
+                  // If the call indicated a new element is to
+                  // be created on THIS element.
+
+                  // If the name of this element matches the
+                  // name of the element we need to create
+                  // then we need to return to the caller
+                  // and let it process the element.
+
+                  if (myTagCompare(d->lpszName, pXML->lpNewElement) == 0) {
+                    return TRUE;
+                  }
+
+                  // Add the new element and recurse
+                  pNew = addChild_priv(
+                      MEMORYINCREASE,
+                      stringDup(pXML->lpNewElement, pXML->cbNewElement), 0, -1);
+                  pXML->cbNewElement = 0;
+                } else {
+                  // If we didn't have a new element to create
+                  pNew = emptyXMLNode;
+                }
+              }
+            }
+          }
+          break;
+
+        // If we found an end tag
+        case eTokenTagEnd:
+
+          // If we have node text then add this to the element
+          if (maybeAddTxT(pXML, token.pStr))
+            return FALSE;
+
+          // Find the name of the end tag
+          token = GetNextToken(pXML, &cbTemp, &xtype);
+
+          // The end tag should be text
+          if (xtype != eTokenText) {
+            pXML->error = eXMLErrorMissingEndTagName;
+            return FALSE;
+          }
+          lpszTemp = token.pStr;
+
+          // After the end tag we should find a closing tag
+          token = GetNextToken(pXML, &cbToken, &xtype);
+          if (xtype != eTokenCloseTag) {
+            pXML->error = eXMLErrorMissingEndTagName;
+            return FALSE;
+          }
+          pXML->lpszText = pXML->lpXML + pXML->nIndex;
+
+          // We need to return to the previous caller.  If the name
+          // of the tag cannot be found we need to keep returning to
+          // caller until we find a match
+          if (myTagCompare(d->lpszName, lpszTemp) != 0)
+#ifdef STRICT_PARSING
+          {
+            pXML->error = eXMLErrorUnmatchedEndTag;
+            pXML->nIndexMissigEndTag = pXML->nIndex;
+            return FALSE;
+          }
+#else
+          {
+            pXML->error = eXMLErrorMissingEndTag;
+            pXML->nIndexMissigEndTag = pXML->nIndex;
+            pXML->lpEndTag = lpszTemp;
+            pXML->cbEndTag = cbTemp;
+          }
+#endif
+
+          // Return to the caller
+          exactMemory(d);
+          return TRUE;
+
+        // If we found a clear (unformatted) token
+        case eTokenClear:
+          // If we have node text then add this to the element
+          if (maybeAddTxT(pXML, token.pStr))
+            return FALSE;
+          if (parseClearTag(pXML, token.pClr))
+            return FALSE;
+          pXML->lpszText = pXML->lpXML + pXML->nIndex;
+          break;
+
+        default:
+          break;
+        }
+        break;
+
+      // If we are inside a tag definition we need to search for attributes
+      case eInsideTag:
+
+        // Check what part of the attribute (name, equals, value) we
+        // are looking for.
+        switch (attrib) {
+        // If we are looking for a new attribute
+        case eAttribName:
+
+          // Check what the current token type is
+          switch (xtype) {
+          // If the current type is text...
+          // Eg.  'attribute'
+          case eTokenText:
+            // Cache the token then indicate that we are next to
+            // look for the equals
+            lpszTemp = token.pStr;
+            cbTemp = cbToken;
+            attrib = eAttribEquals;
+            break;
+
+          // If we found a closing tag...
+          // Eg.  '>'
+          case eTokenCloseTag:
+            // We are now outside the tag
+            status = eOutsideTag;
+            pXML->lpszText = pXML->lpXML + pXML->nIndex;
+            break;
+
+          // If we found a short hand '/>' closing tag then we can
+          // return to the caller
+          case eTokenShortHandClose:
+            exactMemory(d);
+            pXML->lpszText = pXML->lpXML + pXML->nIndex;
+            return TRUE;
+
+          // Errors...
+          case eTokenQuotedText:  /* '"SomeText"'   */
+          case eTokenTagStart:    /* '<'            */
+          case eTokenTagEnd:      /* '</'           */
+          case eTokenEquals:      /* '='            */
+          case eTokenDeclaration: /* '<?'           */
+          case eTokenClear:
+            pXML->error = eXMLErrorUnexpectedToken;
+            return FALSE;
+          default:
+            break;
+          }
+          break;
+
+        // If we are looking for an equals
+        case eAttribEquals:
+          // Check what the current token type is
+          switch (xtype) {
+          // If the current type is text...
+          // Eg.  'Attribute AnotherAttribute'
+          case eTokenText:
+            // Add the unvalued attribute to the list
+            addAttribute_priv(MEMORYINCREASE, stringDup(lpszTemp, cbTemp),
+                              NULL);
+            // Cache the token then indicate.  We are next to
+            // look for the equals attribute
+            lpszTemp = token.pStr;
+            cbTemp = cbToken;
+            break;
+
+          // If we found a closing tag 'Attribute >' or a short hand
+          // closing tag 'Attribute />'
+          case eTokenShortHandClose:
+          case eTokenCloseTag:
+            // If we are a declaration element '<?' then we need
+            // to remove extra closing '?' if it exists
+            pXML->lpszText = pXML->lpXML + pXML->nIndex;
+
+            if (d->isDeclaration && (lpszTemp[cbTemp - 1]) == _CXML('?')) {
+              cbTemp--;
+              if (d->pParent && d->pParent->pParent)
+                xtype = eTokenShortHandClose;
+            }
+
+            if (cbTemp) {
+              // Add the unvalued attribute to the list
+              addAttribute_priv(MEMORYINCREASE, stringDup(lpszTemp, cbTemp),
+                                NULL);
+            }
+
+            // If this is the end of the tag then return to the caller
+            if (xtype == eTokenShortHandClose) {
+              exactMemory(d);
+              return TRUE;
+            }
+
+            // We are now outside the tag
+            status = eOutsideTag;
+            break;
+
+          // If we found the equals token...
+          // Eg.  'Attribute ='
+          case eTokenEquals:
+            // Indicate that we next need to search for the value
+            // for the attribute
+            attrib = eAttribValue;
+            break;
+
+          // Errors...
+          case eTokenQuotedText:  /* 'Attribute "InvalidAttr"'*/
+          case eTokenTagStart:    /* 'Attribute <'            */
+          case eTokenTagEnd:      /* 'Attribute </'           */
+          case eTokenDeclaration: /* 'Attribute <?'           */
+          case eTokenClear:
+            pXML->error = eXMLErrorUnexpectedToken;
+            return FALSE;
+          default:
+            break;
+          }
+          break;
+
+        // If we are looking for an attribute value
+        case eAttribValue:
+          // Check what the current token type is
+          switch (xtype) {
+          // If the current type is text or quoted text...
+          // Eg.  'Attribute = "Value"' or 'Attribute = Value' or
+          // 'Attribute = 'Value''.
+          case eTokenText:
+          case eTokenQuotedText:
+            // If we are a declaration element '<?' then we need
+            // to remove extra closing '?' if it exists
+            if (d->isDeclaration && (token.pStr[cbToken - 1]) == _CXML('?')) {
+              cbToken--;
+            }
+
+            if (cbTemp) {
+              // Add the valued attribute to the list
+              if (xtype == eTokenQuotedText) {
+                token.pStr++;
+                cbToken -= 2;
+              }
+              XMLSTR attrVal = (XMLSTR)token.pStr;
+              if (attrVal) {
+                attrVal = fromXMLString(attrVal, cbToken, pXML);
+                if (!attrVal)
+                  return FALSE;
+              }
+              addAttribute_priv(MEMORYINCREASE, stringDup(lpszTemp, cbTemp),
+                                attrVal);
+            }
+
+            // Indicate we are searching for a new attribute
+            attrib = eAttribName;
+            break;
+
+          // Errors...
+          case eTokenTagStart:       /* 'Attr = <'          */
+          case eTokenTagEnd:         /* 'Attr = </'         */
+          case eTokenCloseTag:       /* 'Attr = >'          */
+          case eTokenShortHandClose: /* "Attr = />"         */
+          case eTokenEquals:         /* 'Attr = ='          */
+          case eTokenDeclaration:    /* 'Attr = <?'         */
+          case eTokenClear:
+            pXML->error = eXMLErrorUnexpectedToken;
+            return FALSE;
+            break;
+          default:
+            break;
+          }
+        }
+      }
+    }
+    // If we failed to obtain the next token
+    else {
+      if ((!d->isDeclaration) && (d->pParent)) {
+#ifdef STRICT_PARSING
+        pXML->error = eXMLErrorUnmatchedEndTag;
+#else
+        pXML->error = eXMLErrorMissingEndTag;
+#endif
+        pXML->nIndexMissigEndTag = pXML->nIndex;
+      }
+      maybeAddTxT(pXML, pXML->lpXML + pXML->nIndex);
+      return FALSE;
+    }
+  }
+}
+
+// Computes the line and column reached after reading lpXML up to index nUpto.
+// Both counters start at 1; every '\n' returned by getNextChar() starts a new
+// line and resets the column, any other character advances the column.
+static void CountLinesAndColumns(XMLCSTR lpXML, int nUpto,
+                                 XMLResults *pResults) {
+  assert(lpXML);
+  assert(pResults);
+
+  struct XML cursor = {lpXML, lpXML, 0, 0, eXMLErrorNone, NULL, 0, NULL, 0, TRUE};
+
+  pResults->nLine = 1;
+  pResults->nColumn = 1;
+  // The loop condition relies on getNextChar() moving cursor.nIndex forward.
+  while (cursor.nIndex < nUpto) {
+    const XMLCHAR current = getNextChar(&cursor);
+    if (current == _CXML('\n')) {
+      pResults->nLine++;
+      pResults->nColumn = 1;
+    } else {
+      pResults->nColumn++;
+    }
+  }
+}
+
+// Parses the XML text in lpszXML and returns the resulting node.
+//
+// lpszXML  - text to parse; NULL yields an empty node and eXMLErrorNoElements.
+// tag      - optional name; when it is non-empty and the parsed root has a
+//            different name, the child called `tag` is returned instead (or an
+//            empty node with eXMLErrorFirstTagNotFound if there is none).
+// pResults - optional; receives the error code and, for errors, the line and
+//            column computed by CountLinesAndColumns().
+XMLNode XMLNode::parseString(XMLCSTR lpszXML, XMLCSTR tag,
+                             XMLResults *pResults) {
+  if (lpszXML == NULL) {
+    if (pResults != NULL) {
+      pResults->error = eXMLErrorNoElements;
+      pResults->nLine = 0;
+      pResults->nColumn = 0;
+    }
+    return emptyXMLNode;
+  }
+
+  XMLNode root(NULL, NULL, FALSE);
+  struct XML state = {lpszXML, lpszXML, 0,    0, eXMLErrorNone,
+                      NULL,    0,       NULL, 0, TRUE};
+
+  // Parse everything underneath a nameless header node.
+  root.ParseXMLElement(&state);
+  enum XMLError status = state.error;
+  if (root.nChildNode() == 0)
+    status = eXMLErrorNoXMLTagFound;
+  // A header holding exactly one element, which is a child node, is replaced
+  // by that child.
+  if ((root.nChildNode() == 1) && (root.nElement() == 1))
+    root = root.getChildNode();
+
+  // These three outcomes still hand a tree back to the caller.
+  const bool usable = (status == eXMLErrorNone) ||
+                      (status == eXMLErrorMissingEndTag) ||
+                      (status == eXMLErrorNoXMLTagFound);
+  if (!usable) {
+    // Cleanup: dropping the last handle destroys all the nodes.
+    root = emptyXMLNode;
+  } else {
+    XMLCSTR rootName = root.getName();
+    const bool tagRequested = tag && (*tag);
+    if (tagRequested && ((!rootName) || (xstricmp(rootName, tag)))) {
+      root = root.getChildNode(tag);
+      if (root.isEmpty()) {
+        if (pResults != NULL) {
+          pResults->error = eXMLErrorFirstTagNotFound;
+          pResults->nLine = 0;
+          pResults->nColumn = 0;
+        }
+        return emptyXMLNode;
+      }
+    }
+  }
+
+  // Report the outcome when the caller asked for it.
+  if (pResults != NULL) {
+    pResults->error = status;
+
+    if (status != eXMLErrorNone) {
+      // For a missing end tag, report the index recorded for that error.
+      if (status == eXMLErrorMissingEndTag)
+        state.nIndex = state.nIndexMissigEndTag;
+      CountLinesAndColumns(state.lpXML, state.nIndex, pResults);
+    }
+  }
+  return root;
+}
+
+// Loads the file `filename` into memory, converts it between wide and
+// multi-byte text when guessWideCharChars is set and the detected encoding
+// does not match the build, skips a leading byte-order mark, and hands the
+// text to parseString().
+//
+// filename - path passed to xfopen() in binary read mode.
+// tag      - forwarded unchanged to parseString().
+// pResults - optional; nLine/nColumn are reset to 0 and `error` is set to
+//            eXMLErrorFileNotFound (open failed), eXMLErrorEmpty (no readable
+//            size), eXMLErrorCharConversionError (no buffer available), or
+//            whatever parseString() reports.
+//
+// Returns the parsed node, or emptyXMLNode on any of the failures above.
+XMLNode XMLNode::parseFile(XMLCSTR filename, XMLCSTR tag,
+                           XMLResults *pResults) {
+  if (pResults) {
+    pResults->nLine = 0;
+    pResults->nColumn = 0;
+  }
+  FILE *f = xfopen(filename, _CXML("rb"));
+  if (f == NULL) {
+    if (pResults)
+      pResults->error = eXMLErrorFileNotFound;
+    return emptyXMLNode;
+  }
+  fseek(f, 0, SEEK_END);
+  int l = ftell(f), headerSz = 0;
+  // ftell() reports failure as -1; treat that like an empty file instead of
+  // letting a negative size reach malloc() and fread().
+  if (l <= 0) {
+    if (pResults)
+      pResults->error = eXMLErrorEmpty;
+    fclose(f);
+    return emptyXMLNode;
+  }
+  fseek(f, 0, SEEK_SET);
+  // Four extra bytes hold the zero terminator written below.
+  unsigned char *buf = (unsigned char *)malloc(l + 4);
+  if (!buf) {
+    // Out of memory: report the same code this function already uses when it
+    // ends up without a buffer.
+    fclose(f);
+    if (pResults)
+      pResults->error = eXMLErrorCharConversionError;
+    return emptyXMLNode;
+  }
+  l = fread(buf, 1, l, f);
+  fclose(f);
+  buf[l] = 0;
+  buf[l + 1] = 0;
+  buf[l + 2] = 0;
+  buf[l + 3] = 0;
+#ifdef _XMLWIDECHAR
+  if (guessWideCharChars) {
+    if (!myIsTextWideChar(buf, l)) {
+      // Multi-byte input in a wide-char build: convert it, skipping a UTF-8
+      // byte-order mark when present.
+      XMLNode::XMLCharEncoding ce = XMLNode::char_encoding_legacy;
+      if ((buf[0] == 0xef) && (buf[1] == 0xbb) && (buf[2] == 0xbf)) {
+        headerSz = 3;
+        ce = XMLNode::char_encoding_UTF8;
+      }
+      XMLSTR b2 = myMultiByteToWideChar((const char *)(buf + headerSz), ce);
+      free(buf);
+      buf = (unsigned char *)b2;
+      headerSz = 0;
+    } else {
+      // NOTE(review): 0xef 0xff is kept as in the original; confirm whether
+      // 0xfe 0xff was intended.
+      if ((buf[0] == 0xef) && (buf[1] == 0xff))
+        headerSz = 2;
+      if ((buf[0] == 0xff) && (buf[1] == 0xfe))
+        headerSz = 2;
+    }
+  }
+#else
+  if (guessWideCharChars) {
+    if (myIsTextWideChar(buf, l)) {
+      // Wide-char input in a multi-byte build: skip the two-byte mark and
+      // convert the remainder.
+      // NOTE(review): 0xef 0xff is kept as in the original; confirm whether
+      // 0xfe 0xff was intended.
+      if ((buf[0] == 0xef) && (buf[1] == 0xff))
+        headerSz = 2;
+      if ((buf[0] == 0xff) && (buf[1] == 0xfe))
+        headerSz = 2;
+      char *b2 = myWideCharToMultiByte((const wchar_t *)(buf + headerSz));
+      free(buf);
+      buf = (unsigned char *)b2;
+      headerSz = 0;
+    } else {
+      // Skip a UTF-8 byte-order mark.
+      if ((buf[0] == 0xef) && (buf[1] == 0xbb) && (buf[2] == 0xbf))
+        headerSz = 3;
+    }
+  }
+#endif
+
+  // The conversion helpers may have replaced buf with NULL.
+  if (!buf) {
+    if (pResults)
+      pResults->error = eXMLErrorCharConversionError;
+    return emptyXMLNode;
+  }
+  XMLNode x = parseString((XMLSTR)(buf + headerSz), tag, pResults);
+  free(buf);
+  return x;
+}
+
+// Stores the character c into l consecutive positions starting at dest.
+static inline void charmemset(XMLSTR dest, XMLCHAR c, int l) {
+  for (; l != 0; --l)
+    *dest++ = c;
+}
+// private:
+// Serializes pEntry and, recursively, all of its sub-nodes as XML text.
+//
+// The function runs in two modes: when lpszMarker is NULL nothing is written
+// and only the required length is computed; otherwise the text is written
+// starting at lpszMarker. Both modes must add up to the same length, because
+// createXMLString() sizes its buffer with the first mode and fills it with
+// the second.
+//
+// nFormat == -1 produces unformatted output; nFormat >= 0 is the indentation
+// depth of this element (INDENTCHAR repeated) and adds line breaks. A
+// formatted, non-declaration element whose only content is a single text is
+// switched to the internal value -2, which keeps that text on the element's
+// own line.
+//
+// Returns the number of characters produced for this subtree; the
+// terminating NUL is not counted.
+int XMLNode::CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker,
+                              int nFormat) {
+  // Validate the node before any of its fields is read.
+  assert(pEntry);
+
+  int nResult = 0;
+  int cb = nFormat < 0 ? 0 : nFormat;
+  int cbElement;
+  int nChildFormat = -1;
+  int nElementI = pEntry->nChild + pEntry->nText + pEntry->nClear;
+  int i, j;
+  // Single-text element: use the compact one-line layout.
+  if ((nFormat >= 0) && (nElementI == 1) && (pEntry->nText == 1) &&
+      (!pEntry->isDeclaration))
+    nFormat = -2;
+
+#define LENSTR(lpsz) (lpsz ? xstrlen(lpsz) : 0)
+
+  // If the element has no name then assume this is the head node.
+  cbElement = (int)LENSTR(pEntry->lpszName);
+
+  if (cbElement) {
+    // "<elementname "
+    if (lpszMarker) {
+      if (cb)
+        charmemset(lpszMarker, INDENTCHAR, cb);
+      nResult = cb;
+      lpszMarker[nResult++] = _CXML('<');
+      if (pEntry->isDeclaration)
+        lpszMarker[nResult++] = _CXML('?');
+      xstrcpy(&lpszMarker[nResult], pEntry->lpszName);
+      nResult += cbElement;
+      lpszMarker[nResult++] = _CXML(' ');
+
+    } else {
+      // Length only: indentation + '<' + name + ' ' (+ '?' for declarations).
+      nResult += cbElement + 2 + cb;
+      if (pEntry->isDeclaration)
+        nResult++;
+    }
+
+    // Enumerate attributes and add them to the string
+    XMLAttribute *pAttr = pEntry->pAttribute;
+    for (i = 0; i < pEntry->nAttribute; i++) {
+      // "Attrib
+      cb = (int)LENSTR(pAttr->lpszName);
+      if (cb) {
+        if (lpszMarker)
+          xstrcpy(&lpszMarker[nResult], pAttr->lpszName);
+        nResult += cb;
+        // "Attrib=Value "
+        if (pAttr->lpszValue) {
+          cb = (int)ToXMLStringTool::lengthXMLString(pAttr->lpszValue);
+          if (lpszMarker) {
+            lpszMarker[nResult] = _CXML('=');
+            lpszMarker[nResult + 1] = _CXML('"');
+            if (cb)
+              ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult + 2],
+                                           pAttr->lpszValue);
+            lpszMarker[nResult + cb + 2] = _CXML('"');
+          }
+          // '=' + opening quote + value + closing quote
+          nResult += cb + 3;
+        }
+        // Every attribute is followed by one separator character.
+        if (lpszMarker)
+          lpszMarker[nResult] = _CXML(' ');
+        nResult++;
+      }
+      pAttr++;
+    }
+
+    if (pEntry->isDeclaration) {
+      // Replace the trailing separator with "?>".
+      if (lpszMarker) {
+        lpszMarker[nResult - 1] = _CXML('?');
+        lpszMarker[nResult] = _CXML('>');
+      }
+      nResult++;
+      if (nFormat != -1) {
+        if (lpszMarker)
+          lpszMarker[nResult] = _CXML('\n');
+        nResult++;
+      }
+    } else if (nElementI) {
+      // If there are child nodes we need to terminate the start tag: the
+      // trailing separator becomes '>'.
+      if (lpszMarker)
+        lpszMarker[nResult - 1] = _CXML('>');
+      if (nFormat >= 0) {
+        if (lpszMarker)
+          lpszMarker[nResult] = _CXML('\n');
+        nResult++;
+      }
+    } else {
+      // No content: drop the trailing separator; "/>" is appended further
+      // down.
+      nResult--;
+    }
+  }
+
+  // Calculate the child format for when we recurse.  This is used to
+  // determine the number of spaces used for prefixes.
+  if (nFormat != -1) {
+    if (cbElement && (!pEntry->isDeclaration))
+      nChildFormat = nFormat + 1;
+    else
+      nChildFormat = nFormat;
+  }
+
+  // Enumerate through remaining children. Each pOrder entry carries the
+  // element type in its two low bits and the index into the matching array in
+  // the remaining bits.
+  for (i = 0; i < nElementI; i++) {
+    j = pEntry->pOrder[i];
+    switch ((XMLElementType)(j & 3)) {
+    // Text nodes
+    case eNodeText: {
+      // "Text"
+      XMLCSTR pChild = pEntry->pText[j >> 2];
+      cb = (int)ToXMLStringTool::lengthXMLString(pChild);
+      if (cb) {
+        if (nFormat >= 0) {
+          // Formatted: indentation, text, line break.
+          if (lpszMarker) {
+            charmemset(&lpszMarker[nResult], INDENTCHAR, nFormat + 1);
+            ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult + nFormat + 1],
+                                         pChild);
+            lpszMarker[nResult + nFormat + 1 + cb] = _CXML('\n');
+          }
+          nResult += cb + nFormat + 2;
+        } else {
+          if (lpszMarker)
+            ToXMLStringTool::toXMLUnSafe(&lpszMarker[nResult], pChild);
+          nResult += cb;
+        }
+      }
+      break;
+    }
+
+    // Clear type nodes
+    case eNodeClear: {
+      XMLClear *pChild = pEntry->pClear + (j >> 2);
+      // "OpenTag"
+      cb = (int)LENSTR(pChild->lpszOpenTag);
+      if (cb) {
+        if (nFormat != -1) {
+          if (lpszMarker) {
+            charmemset(&lpszMarker[nResult], INDENTCHAR, nFormat + 1);
+            xstrcpy(&lpszMarker[nResult + nFormat + 1], pChild->lpszOpenTag);
+          }
+          nResult += cb + nFormat + 1;
+        } else {
+          if (lpszMarker)
+            xstrcpy(&lpszMarker[nResult], pChild->lpszOpenTag);
+          nResult += cb;
+        }
+      }
+
+      // "OpenTag Value"
+      cb = (int)LENSTR(pChild->lpszValue);
+      if (cb) {
+        if (lpszMarker)
+          xstrcpy(&lpszMarker[nResult], pChild->lpszValue);
+        nResult += cb;
+      }
+
+      // "OpenTag Value CloseTag"
+      cb = (int)LENSTR(pChild->lpszCloseTag);
+      if (cb) {
+        if (lpszMarker)
+          xstrcpy(&lpszMarker[nResult], pChild->lpszCloseTag);
+        nResult += cb;
+      }
+
+      if (nFormat != -1) {
+        if (lpszMarker)
+          lpszMarker[nResult] = _CXML('\n');
+        nResult++;
+      }
+      break;
+    }
+
+    // Element nodes
+    case eNodeChild: {
+      // Recursively add child nodes, writing at the current offset when a
+      // buffer is present.
+      nResult +=
+          CreateXMLStringR(pEntry->pChild[j >> 2].d,
+                           lpszMarker ? lpszMarker + nResult : 0, nChildFormat);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+
+  if ((cbElement) && (!pEntry->isDeclaration)) {
+    // If we have child entries we need to use long XML notation for
+    // closing the element - "<elementname>blah blah blah</elementname>"
+    if (nElementI) {
+      // "</elementname>\0"
+      if (lpszMarker) {
+        if (nFormat >= 0) {
+          charmemset(&lpszMarker[nResult], INDENTCHAR, nFormat);
+          nResult += nFormat;
+        }
+
+        lpszMarker[nResult] = _CXML('<');
+        lpszMarker[nResult + 1] = _CXML('/');
+        nResult += 2;
+        xstrcpy(&lpszMarker[nResult], pEntry->lpszName);
+        nResult += cbElement;
+
+        lpszMarker[nResult] = _CXML('>');
+        if (nFormat == -1)
+          nResult++;
+        else {
+          lpszMarker[nResult + 1] = _CXML('\n');
+          nResult += 2;
+        }
+      } else {
+        // Length only; each case mirrors what the writing branch above emits.
+        if (nFormat >= 0)
+          nResult += cbElement + 4 + nFormat;
+        else if (nFormat == -1)
+          nResult += cbElement + 3;
+        else
+          nResult += cbElement + 4;
+      }
+    } else {
+      // If there are no children we can use shorthand XML notation -
+      // "<elementname/>"
+      // "/>\0"
+      if (lpszMarker) {
+        lpszMarker[nResult] = _CXML('/');
+        lpszMarker[nResult + 1] = _CXML('>');
+        if (nFormat != -1)
+          lpszMarker[nResult + 2] = _CXML('\n');
+      }
+      nResult += nFormat == -1 ? 2 : 3;
+    }
+  }
+
+  return nResult;
+}
+
+#undef LENSTR
+
+// Create an XML string
+// @param       int nFormat             - 0 if no formatting is required
+//                                        otherwise nonzero for formatted text
+//                                        with carriage returns and indentation.
+//                                        Formatting is also disabled whenever
+//                                        dropWhiteSpace is off.
+// @param       int *pnSize             - [out] pointer to the size of the
+//                                        returned string not including the
+//                                        NULL terminator; set to 0 when NULL
+//                                        is returned.
+// @return      XMLSTR                  - Allocated XML string, you must free
+//                                        this with free(). NULL for an empty
+//                                        node or when the allocation fails.
+XMLSTR XMLNode::createXMLString(int nFormat, int *pnSize) const {
+  if (!d) {
+    if (pnSize)
+      *pnSize = 0;
+    return NULL;
+  }
+
+  XMLSTR lpszResult = NULL;
+  int cbStr;
+
+  // Translate the caller's flag into the internal convention:
+  // 0 = formatted at depth 0, -1 = unformatted.
+  if (!dropWhiteSpace)
+    nFormat = 0;
+  nFormat = nFormat ? 0 : -1;
+  // Recursively calculate the size of the XML string
+  cbStr = CreateXMLStringR(d, 0, nFormat);
+  // Allocate memory for the XML string + the NULL terminator and
+  // create the XML string recursively.
+  lpszResult = (XMLSTR)malloc((cbStr + 1) * sizeof(XMLCHAR));
+  if (!lpszResult) {
+    // Allocation failed: report "no string" instead of writing through NULL.
+    if (pnSize)
+      *pnSize = 0;
+    return NULL;
+  }
+  CreateXMLStringR(d, lpszResult, nFormat);
+  lpszResult[cbStr] = _CXML('\0');
+  if (pnSize)
+    *pnSize = cbStr;
+  return lpszResult;
+}
+
+// Removes the node data `d` from its parent's child array and from the
+// parent's order list. The parent pointer stored in `d` is left untouched.
+// Returns the value produced by removeOrderElement().
+int XMLNode::detachFromParent(XMLNodeData *d) {
+  XMLNodeData *parent = d->pParent;
+  XMLNode *siblings = parent->pChild;
+
+  // Locate the slot that refers to `d`; it is assumed to be present.
+  int slot = 0;
+  while (((void *)(siblings[slot].d)) != ((void *)d))
+    ++slot;
+
+  parent->nChild--;
+  if (parent->nChild == 0) {
+    // That was the last child: release the array itself.
+    free(siblings);
+    parent->pChild = NULL;
+  } else {
+    // Close the gap left by the removed slot.
+    memmove(siblings + slot, siblings + slot + 1,
+            (parent->nChild - slot) * sizeof(XMLNode));
+  }
+  return removeOrderElement(parent, eNodeChild, slot);
+}
+
+// Drops this handle's reference and lets emptyTheNode() release the data when
+// it was the last one. An empty handle needs no work.
+XMLNode::~XMLNode() {
+  if (d) {
+    d->ref_count--;
+    emptyTheNode(0);
+  }
+}
+// Detaches this node from its parent (giving up the reference that went with
+// the parent link) and then force-empties it.
+void XMLNode::deleteNodeContent() {
+  if (d == NULL)
+    return;
+  if (d->pParent != NULL) {
+    detachFromParent(d);
+    d->pParent = NULL;
+    d->ref_count--;
+  }
+  emptyTheNode(1);
+}
+void XMLNode::emptyTheNode(char force) { // Releases this node's contents when it is unreferenced or `force` is nonzero; frees the data block itself only once ref_count is 0.
+  XMLNodeData *dd = d; // warning: must stay this way! All work below uses this cached pointer; presumably `d` can change while the node is being detached - confirm.
+  if ((dd->ref_count == 0) || force) { // tear down the contents when nobody references the data, or unconditionally when forced
+    if (d->pParent)
+      detachFromParent(d); // take this node out of its parent's child array first
+    int i;
+    XMLNode *pc;
+    for (i = 0; i < dd->nChild; i++) { // orphan each child, drop the reference held through the parent link, then recurse
+      pc = dd->pChild + i;
+      pc->d->pParent = NULL; // cleared first so the recursive call skips its own detachFromParent()
+      pc->d->ref_count--;
+      pc->emptyTheNode(force);
+    }
+    myFree(dd->pChild);
+    for (i = 0; i < dd->nText; i++)
+      free((void *)dd->pText[i]);
+    myFree(dd->pText);
+    for (i = 0; i < dd->nClear; i++)
+      free((void *)dd->pClear[i].lpszValue); // only the value is freed here; the open/close tag pointers are not
+    myFree(dd->pClear);
+    for (i = 0; i < dd->nAttribute; i++) {
+      free((void *)dd->pAttribute[i].lpszName);
+      if (dd->pAttribute[i].lpszValue)
+        free((void *)dd->pAttribute[i].lpszValue);
+    }
+    myFree(dd->pAttribute);
+    myFree(dd->pOrder);
+    myFree((void *)dd->lpszName);
+    dd->nChild = 0; // reset every field so a data block that survives (forced, still referenced) reads as empty
+    dd->nText = 0;
+    dd->nClear = 0;
+    dd->nAttribute = 0;
+    dd->pChild = NULL;
+    dd->pText = NULL;
+    dd->pClear = NULL;
+    dd->pAttribute = NULL;
+    dd->pOrder = NULL;
+    dd->lpszName = NULL;
+    dd->pParent = NULL;
+  }
+  if (dd->ref_count == 0) { // last reference gone: free the data block and mark this handle empty
+    free(dd);
+    d = NULL;
+  }
+}
+
+// Shallow assignment: releases the reference currently held (possibly
+// freeing that data) and then shares A's data, bumping its reference count.
+XMLNode &XMLNode::operator=(const XMLNode &A) {
+  if (this == &A)
+    return *this;
+
+  if (d) {
+    d->ref_count--;
+    emptyTheNode(0);
+  }
+  d = A.d;
+  if (d)
+    (d->ref_count)++;
+  return *this;
+}
+
+// Shallow copy: the new handle shares A's data and, when there is any, adds
+// one reference to it.
+XMLNode::XMLNode(const XMLNode &A) : d(A.d) {
+  if (d)
+    (d->ref_count)++;
+}
+
+// Builds an independent copy of this node: name, attributes, order table,
+// texts, clear sections and (recursively) child nodes are duplicated. The
+// open/close tag pointers of clear sections are copied as pointers, not
+// duplicated. An empty node yields emptyXMLNode.
+XMLNode XMLNode::deepCopy() const {
+  if (!d)
+    return XMLNode::emptyXMLNode;
+
+  XMLNode copy(NULL, stringDup(d->lpszName), d->isDeclaration);
+  XMLNodeData *dst = copy.d;
+
+  // Attributes
+  const int attrCount = d->nAttribute;
+  if (attrCount) {
+    dst->nAttribute = attrCount;
+    dst->pAttribute = (XMLAttribute *)malloc(attrCount * sizeof(XMLAttribute));
+    for (int k = attrCount - 1; k >= 0; --k) {
+      dst->pAttribute[k].lpszName = stringDup(d->pAttribute[k].lpszName);
+      dst->pAttribute[k].lpszValue = stringDup(d->pAttribute[k].lpszValue);
+    }
+  }
+
+  // Order table: one int per child, text and clear entry.
+  if (d->pOrder) {
+    const int orderBytes = (d->nChild + d->nText + d->nClear) * sizeof(int);
+    dst->pOrder = (int *)malloc(orderBytes);
+    memcpy(dst->pOrder, d->pOrder, orderBytes);
+  }
+
+  // Texts
+  const int textCount = d->nText;
+  if (textCount) {
+    dst->nText = textCount;
+    dst->pText = (XMLCSTR *)malloc(textCount * sizeof(XMLCSTR));
+    for (int k = textCount - 1; k >= 0; --k)
+      dst->pText[k] = stringDup(d->pText[k]);
+  }
+
+  // Clear sections
+  const int clearCount = d->nClear;
+  if (clearCount) {
+    dst->nClear = clearCount;
+    dst->pClear = (XMLClear *)malloc(clearCount * sizeof(XMLClear));
+    for (int k = clearCount - 1; k >= 0; --k) {
+      dst->pClear[k].lpszCloseTag = d->pClear[k].lpszCloseTag;
+      dst->pClear[k].lpszOpenTag = d->pClear[k].lpszOpenTag;
+      dst->pClear[k].lpszValue = stringDup(d->pClear[k].lpszValue);
+    }
+  }
+
+  // Child nodes
+  const int childCount = d->nChild;
+  if (childCount) {
+    dst->nChild = childCount;
+    dst->pChild = (XMLNode *)malloc(childCount * sizeof(XMLNode));
+    for (int k = childCount - 1; k >= 0; --k) {
+      // The slot is raw malloc'ed memory: mark it empty before assigning so
+      // operator= does not try to release garbage.
+      dst->pChild[k].d = NULL;
+      dst->pChild[k] = d->pChild[k].deepCopy();
+      dst->pChild[k].d->pParent = dst;
+    }
+  }
+  return copy;
+}
+
+XMLNode XMLNode::addChild(XMLNode childNode, int pos) { // Attaches childNode (or, for a nameless node, each of its children) under this node at position pos; returns childNode.
+  XMLNodeData *dc = childNode.d;
+  if ((!dc) || (!d))
+    return childNode; // nothing to do when either handle is empty
+  if (!dc->lpszName) {
+    // A node without a name is treated as a root container whose children are moved across one by one (original note: "todo: correct fix").
+    int j = pos;
+    while (dc->nChild) { // each call is expected to detach pChild[0] from dc, shrinking nChild until the loop ends
+      addChild(dc->pChild[0], j);
+      if (pos >= 0)
+        j++; // with an explicit position, the next child goes one slot further
+    }
+    return childNode;
+  }
+  if (dc->pParent) {
+    if ((detachFromParent(dc) <= pos) && (dc->pParent == d)) // detachFromParent() returns removeOrderElement()'s result
+      pos--; // moving within the same parent: presumably compensates for the entry just removed - confirm
+  } else
+    dc->ref_count++; // a node without a parent gains the reference held through the new parent link
+  dc->pParent = d;
+  // The child array is grown through addToOrder(); pos is passed by address and
+  // is used as the insertion index below, so the helper presumably finalizes it - confirm.
+  d->pChild = (XMLNode *)addToOrder(0, &pos, d->nChild, d->pChild,
+                                    sizeof(XMLNode), eNodeChild);
+  d->pChild[pos].d = dc; // written directly, so no constructor or assignment operator runs on the raw slot
+  d->nChild++;
+  return childNode;
+}
+
+// Removes the attribute at index i, freeing its name and value. Out-of-range
+// indices and empty nodes are ignored.
+void XMLNode::deleteAttribute(int i) {
+  if (!d)
+    return;
+  if ((i < 0) || (i >= d->nAttribute))
+    return;
+
+  XMLAttribute *victim = d->pAttribute + i;
+  free((void *)victim->lpszName);
+  if (victim->lpszValue)
+    free((void *)victim->lpszValue);
+
+  d->nAttribute--;
+  if (d->nAttribute == 0) {
+    // Last attribute gone; here i is 0, so victim is the array itself.
+    free(victim);
+    d->pAttribute = NULL;
+    return;
+  }
+  // Shift the attributes that followed the removed one down by one.
+  memmove(victim, victim + 1, (d->nAttribute - i) * sizeof(XMLAttribute));
+}
+
+// Removes the attribute that carries the same name as `a`; NULL is ignored.
+void XMLNode::deleteAttribute(XMLAttribute *a) {
+  if (!a)
+    return;
+  deleteAttribute(a->lpszName);
+}
+// Removes the attribute found by getAttribute() for lpszName. The index
+// reported by getAttribute() is used minus one; a value of 0 is treated as
+// "not found".
+void XMLNode::deleteAttribute(XMLCSTR lpszName) {
+  int found = 0;
+  getAttribute(lpszName, &found);
+  if (found != 0)
+    deleteAttribute(found - 1);
+}
+
+// Replaces the value (and optionally the name) of the attribute at index i.
+// The strings are adopted as-is, without duplication: on every path they are
+// either stored in the node or freed here.
+//
+// lpszNewValue - new value; may be NULL for an attribute without a value.
+// lpszNewName  - new name, or NULL to keep the current name.
+// i            - attribute index. When it is outside the existing range the
+//                attribute is appended instead, provided a name was given.
+//
+// Returns the updated or added attribute, or NULL when the node is empty or
+// when the index is out of range and no name was supplied.
+XMLAttribute *XMLNode::updateAttribute_WOSD(XMLSTR lpszNewValue,
+                                            XMLSTR lpszNewName, int i) {
+  if (!d) {
+    if (lpszNewValue)
+      free(lpszNewValue);
+    if (lpszNewName)
+      free(lpszNewName);
+    return NULL;
+  }
+  // A negative index is handled like any other out-of-range index so that it
+  // never reaches the pointer arithmetic below.
+  if ((i < 0) || (i >= d->nAttribute)) {
+    if (lpszNewName)
+      return addAttribute_WOSD(lpszNewName, lpszNewValue);
+    // Nothing adopts the value on this path; free it like the empty-node
+    // branch does instead of leaking it.
+    if (lpszNewValue)
+      free(lpszNewValue);
+    return NULL;
+  }
+  XMLAttribute *p = d->pAttribute + i;
+  // Skip the free when the caller hands back the pointer already stored.
+  if (p->lpszValue && p->lpszValue != lpszNewValue)
+    free((void *)p->lpszValue);
+  p->lpszValue = lpszNewValue;
+  if (lpszNewName && p->lpszName != lpszNewName) {
+    free((void *)p->lpszName);
+    p->lpszName = lpszNewName;
+  }
+  return p;
+}
+
+// Updates the attribute named like oldAttribute with newAttribute's name and
+// value; without an old attribute the new one is simply added. The strings of
+// newAttribute are passed on without duplication.
+XMLAttribute *XMLNode::updateAttribute_WOSD(XMLAttribute *newAttribute,
+                                            XMLAttribute *oldAttribute) {
+  if (!oldAttribute)
+    return addAttribute_WOSD((XMLSTR)newAttribute->lpszName,
+                             (XMLSTR)newAttribute->lpszValue);
+  return updateAttribute_WOSD((XMLSTR)newAttribute->lpszValue,
+                              (XMLSTR)newAttribute->lpszName,
+                              oldAttribute->lpszName);
+}
+
+// Updates the attribute that getAttribute() finds for lpszOldName. When no
+// such attribute exists a new one is added, named lpszNewName or, if that is
+// NULL, a duplicate of lpszOldName.
+XMLAttribute *XMLNode::updateAttribute_WOSD(XMLSTR lpszNewValue,
+                                            XMLSTR lpszNewName,
+                                            XMLCSTR lpszOldName) {
+  int found = 0;
+  getAttribute(lpszOldName, &found);
+  if (found)
+    return updateAttribute_WOSD(lpszNewValue, lpszNewName, found - 1);
+
+  XMLSTR nameToAdd = lpszNewName ? lpszNewName : stringDup(lpszOldName);
+  return addAttribute_WOSD(nameToAdd, lpszNewValue);
+}
+
+// Returns the index of the text whose stored pointer equals lpszValue (the
+// comparison is by pointer, not by content), or -1 when there is none. A NULL
+// lpszValue selects the first text if any exists.
+int XMLNode::indexText(XMLCSTR lpszValue) const {
+  if (!d)
+    return -1;
+
+  const int count = d->nText;
+  if (!lpszValue)
+    return count ? 0 : -1;
+
+  XMLCSTR *texts = d->pText;
+  for (int k = 0; k < count; ++k) {
+    if (texts[k] == lpszValue)
+      return k;
+  }
+  return -1;
+}
+
+// Removes and frees the text at index i and drops its entry from the order
+// list. Out-of-range indices and empty nodes are ignored.
+void XMLNode::deleteText(int i) {
+  if (!d)
+    return;
+  if ((i < 0) || (i >= d->nText))
+    return;
+
+  XMLCSTR *victim = d->pText + i;
+  free((void *)*victim);
+
+  d->nText--;
+  if (d->nText == 0) {
+    // Last text gone; here i is 0, so victim is the array itself.
+    free(victim);
+    d->pText = NULL;
+  } else {
+    memmove(victim, victim + 1, (d->nText - i) * sizeof(XMLCSTR));
+  }
+  removeOrderElement(d, eNodeText, i);
+}
+
+// Removes the text located by indexText(); a failed lookup (-1) is rejected
+// by the index overload.
+void XMLNode::deleteText(XMLCSTR lpszValue) {
+  const int idx = indexText(lpszValue);
+  deleteText(idx);
+}
+
+// Replaces the text at index i with lpszNewValue (adopted without
+// duplication). An index at or past the end appends the text instead; an
+// empty node frees the string and returns NULL.
+XMLCSTR XMLNode::updateText_WOSD(XMLSTR lpszNewValue, int i) {
+  if (!d) {
+    if (lpszNewValue)
+      free(lpszNewValue);
+    return NULL;
+  }
+  if (i >= d->nText)
+    return addText_WOSD(lpszNewValue);
+
+  XMLCSTR *slot = d->pText + i;
+  // Storing the pointer that is already there must not free it.
+  if (*slot == lpszNewValue)
+    return lpszNewValue;
+  free((void *)*slot);
+  *slot = lpszNewValue;
+  return lpszNewValue;
+}
+
+// Replaces the text located by indexText(lpszOldValue) with lpszNewValue, or
+// appends lpszNewValue when no such text exists. An empty node frees the
+// string and returns NULL.
+XMLCSTR XMLNode::updateText_WOSD(XMLSTR lpszNewValue, XMLCSTR lpszOldValue) {
+  if (!d) {
+    if (lpszNewValue)
+      free(lpszNewValue);
+    return NULL;
+  }
+  int idx = indexText(lpszOldValue);
+  if (idx < 0)
+    return addText_WOSD(lpszNewValue);
+  return updateText_WOSD(lpszNewValue, idx);
+}
+
+// Removes the clear section at index i, freeing its value, and drops its
+// entry from the order list. Out-of-range indices and empty nodes are
+// ignored.
+void XMLNode::deleteClear(int i) {
+  if (!d)
+    return;
+  if ((i < 0) || (i >= d->nClear))
+    return;
+
+  XMLClear *victim = d->pClear + i;
+  free((void *)victim->lpszValue);
+
+  d->nClear--;
+  if (d->nClear == 0) {
+    // Last clear section gone; here i is 0, so victim is the array itself.
+    free(victim);
+    d->pClear = NULL;
+  } else {
+    memmove(victim, victim + 1, (d->nClear - i) * sizeof(XMLClear));
+  }
+  removeOrderElement(d, eNodeClear, i);
+}
+
+// Returns the index of the clear section whose stored value pointer equals
+// lpszValue (compared by pointer, not by content), or -1 when there is none.
+// A NULL lpszValue selects the first clear section if any exists.
+int XMLNode::indexClear(XMLCSTR lpszValue) const {
+  if (!d)
+    return -1;
+
+  const int count = d->nClear;
+  if (!lpszValue)
+    return count ? 0 : -1;
+
+  XMLClear *sections = d->pClear;
+  for (int k = 0; k < count; ++k) {
+    if (sections[k].lpszValue == lpszValue)
+      return k;
+  }
+  return -1;
+}
+
+// Removes the clear section located by indexClear(); a failed lookup (-1) is
+// rejected by the index overload.
+void XMLNode::deleteClear(XMLCSTR lpszValue) {
+  const int idx = indexClear(lpszValue);
+  deleteClear(idx);
+}
+// Removes the clear section whose value pointer matches a->lpszValue; NULL is
+// ignored.
+void XMLNode::deleteClear(XMLClear *a) {
+  if (!a)
+    return;
+  deleteClear(a->lpszValue);
+}
+
+// Replaces the value of the clear section at index i with lpszNewContent
+// (adopted without duplication). An index at or past the end adds a new clear
+// section instead; an empty node frees the string and returns NULL.
+XMLClear *XMLNode::updateClear_WOSD(XMLSTR lpszNewContent, int i) {
+  if (!d) {
+    if (lpszNewContent)
+      free(lpszNewContent);
+    return NULL;
+  }
+  if (i >= d->nClear)
+    return addClear_WOSD(lpszNewContent);
+
+  XMLClear *section = d->pClear + i;
+  // Storing the pointer that is already there must not free it.
+  if (section->lpszValue == lpszNewContent)
+    return section;
+  free((void *)section->lpszValue);
+  section->lpszValue = lpszNewContent;
+  return section;
+}
+
+// Replaces the value of the clear section located by
+// indexClear(lpszOldValue), or adds a new clear section when the lookup
+// fails. An empty node frees the string and returns NULL.
+XMLClear *XMLNode::updateClear_WOSD(XMLSTR lpszNewContent,
+                                    XMLCSTR lpszOldValue) {
+  if (!d) {
+    if (lpszNewContent)
+      free(lpszNewContent);
+    return NULL;
+  }
+  int idx = indexClear(lpszOldValue);
+  if (idx < 0)
+    return addClear_WOSD(lpszNewContent);
+  return updateClear_WOSD(lpszNewContent, idx);
+}
+
+// Replaces the value of the clear section identified by oldP with newP's
+// value. Without an old section nothing happens and NULL is returned.
+XMLClear *XMLNode::updateClear_WOSD(XMLClear *newP, XMLClear *oldP) {
+  if (!oldP)
+    return NULL;
+  return updateClear_WOSD((XMLSTR)newP->lpszValue, (XMLSTR)oldP->lpszValue);
+}
+
+// Counts the child nodes whose name matches `name` according to xstricmp().
+// An empty node has no children.
+int XMLNode::nChildNode(XMLCSTR name) const {
+  if (!d)
+    return 0;
+
+  const int total = d->nChild;
+  int matches = 0;
+  for (int k = 0; k < total; ++k) {
+    if (xstricmp(d->pChild[k].d->lpszName, name) == 0)
+      ++matches;
+  }
+  return matches;
+}
+
+// Returns the first child named `name` (per xstricmp()), searching from index
+// *j, or from 0 when j is NULL. On success *j is advanced to the index after
+// the match so the search can be resumed; otherwise emptyXMLNode is returned
+// and *j is left untouched.
+XMLNode XMLNode::getChildNode(XMLCSTR name, int *j) const {
+  if (!d)
+    return emptyXMLNode;
+
+  const int total = d->nChild;
+  for (int k = (j ? *j : 0); k < total; ++k) {
+    XMLNode *candidate = d->pChild + k;
+    if (xstricmp(candidate->d->lpszName, name) == 0) {
+      if (j)
+        *j = k + 1;
+      return *candidate;
+    }
+  }
+  return emptyXMLNode;
+}
+
+// Returns the j-th (0-based) child named `name`. A negative j selects the
+// last child with that name. emptyXMLNode is returned when nothing matches.
+XMLNode XMLNode::getChildNode(XMLCSTR name, int j) const {
+  if (!d)
+    return emptyXMLNode;
+
+  if (j < 0) {
+    // Scan backwards for the last matching child.
+    for (int k = d->nChild - 1; k >= 0; --k) {
+      if (!xstricmp(name, d->pChild[k].d->lpszName))
+        return getChildNode(k);
+    }
+    return emptyXMLNode;
+  }
+
+  // Skip j matches with the resumable overload, then return the next one.
+  int cursor = 0;
+  for (; j > 0; --j)
+    getChildNode(name, &cursor);
+  return getChildNode(name, &cursor);
+}
+
+// Resolves _path through getChildNodeByPathNonConst() on a private copy of
+// the string, because that helper temporarily modifies the path it walks.
+XMLNode XMLNode::getChildNodeByPath(XMLCSTR _path, char createMissing,
+                                    XMLCHAR sep) {
+  XMLSTR scratch = stringDup(_path);
+  XMLNode found = getChildNodeByPathNonConst(scratch, createMissing, sep);
+  if (scratch != NULL)
+    free(scratch);
+  return found;
+}
+
+// Walks `path`, a list of child names separated by `sep`, starting at this
+// node. Each separator is overwritten with a terminator while its component
+// is looked up and restored afterwards, so the string ends up unchanged.
+// Missing components are created with addChild() when createIfMissing is
+// nonzero; otherwise emptyXMLNode is returned. A NULL or empty path yields
+// this node.
+XMLNode XMLNode::getChildNodeByPathNonConst(XMLSTR path, char createIfMissing,
+                                            XMLCHAR sep) {
+  if ((!path) || (!(*path)))
+    return *this;
+
+  XMLNode next, current = *this;
+  XMLCHAR sepString[2];
+  sepString[0] = sep;
+  sepString[1] = 0;
+
+  for (XMLCHAR *cut = xstrstr(path, sepString); cut;
+       cut = xstrstr(path, sepString)) {
+    // Isolate the current component.
+    *cut = 0;
+    next = current.getChildNode(path);
+    const bool missing = next.isEmpty();
+    if (missing && createIfMissing)
+      next = current.addChild(path);
+    *cut = sep;
+    if (missing && !createIfMissing)
+      return XMLNode::emptyXMLNode;
+
+    current = next;
+    path = cut + 1;
+  }
+
+  // Last component (no separator follows it).
+  next = current.getChildNode(path);
+  if (next.isEmpty() && createIfMissing)
+    next = current.addChild(path);
+  return next;
+}
+
+// Position of the i-th text item; an index past the end is clamped to the
+// last text. Returns -1 on an empty node (previously `d` was dereferenced
+// without a check).
+XMLElementPosition XMLNode::positionOfText(int i) const {
+  if (!d)
+    return -1;
+  if (i >= d->nText)
+    i = d->nText - 1;
+  return findPosition(d, i, eNodeText);
+}
+// Position of the i-th clear section; an index past the end is clamped to the
+// last clear section. Returns -1 on an empty node (previously `d` was
+// dereferenced without a check).
+XMLElementPosition XMLNode::positionOfClear(int i) const {
+  if (!d)
+    return -1;
+  if (i >= d->nClear)
+    i = d->nClear - 1;
+  return findPosition(d, i, eNodeClear);
+}
+// Position of the i-th child node; an index past the end is clamped to the
+// last child. Returns -1 on an empty node (previously `d` was dereferenced
+// without a check).
+XMLElementPosition XMLNode::positionOfChildNode(int i) const {
+  if (!d)
+    return -1;
+  if (i >= d->nChild)
+    i = d->nChild - 1;
+  return findPosition(d, i, eNodeChild);
+}
+// Value-based position lookups: resolve the value to an index with
+// indexText() / indexClear() and delegate to the index-based overloads.
+XMLElementPosition XMLNode::positionOfText(XMLCSTR lpszValue) const {
+  const int textIndex = indexText(lpszValue);
+  return positionOfText(textIndex);
+}
+XMLElementPosition XMLNode::positionOfClear(XMLCSTR lpszValue) const {
+  const int clearIndex = indexClear(lpszValue);
+  return positionOfClear(clearIndex);
+}
+// A NULL `a` falls back to positionOfClear() called with no arguments.
+XMLElementPosition XMLNode::positionOfClear(XMLClear *a) const {
+  return a ? positionOfClear(a->lpszValue) : positionOfClear();
+}
+// Position of the child that shares its data pointer with `x`. The children
+// are scanned from last to first; -1 is returned when either node is empty or
+// `x` is not a direct child.
+XMLElementPosition XMLNode::positionOfChildNode(XMLNode x) const {
+  if ((!d) || (!x.d))
+    return -1;
+  const XMLNodeData *wanted = x.d;
+  for (int idx = d->nChild - 1; idx >= 0; --idx) {
+    if (d->pChild[idx].d == wanted)
+      return findPosition(d, idx, eNodeChild);
+  }
+  return -1;
+}
+// Position of the count-th (0-based) child named `name`. A NULL name
+// delegates to the index-based overload. Returns -1 on an empty node or when
+// fewer than count+1 children carry that name.
+//
+// The previous version tested `j < 0` to detect a miss, but getChildNode()
+// leaves j untouched when nothing is found, so the test never fired and the
+// position of a stale match was returned. The emptiness of the returned node
+// is checked instead; this also ends the loop for a negative count.
+XMLElementPosition XMLNode::positionOfChildNode(XMLCSTR name, int count) const {
+  if (!d)
+    return -1;
+  if (!name)
+    return positionOfChildNode(count);
+  int j = 0;
+  do {
+    if (getChildNode(name, &j).isEmpty())
+      return -1;
+  } while (count--);
+  // j is one past the index of the last match found.
+  return findPosition(d, j - 1, eNodeChild);
+}
+
+// Find the next child named `name` that carries attribute `attributeName`.
+// With a non-NULL attributeValue the attribute must also compare equal to it
+// under xstricmp(); with a NULL attributeValue the attribute only has to be
+// set. The search starts at child index *k when k is given (0 otherwise) and,
+// on a hit, *k receives the cursor to resume from. Returns emptyXMLNode when
+// no child qualifies.
+XMLNode XMLNode::getChildNodeWithAttribute(XMLCSTR name, XMLCSTR attributeName,
+                                           XMLCSTR attributeValue,
+                                           int *k) const {
+  int cursor = k ? *k : 0;
+  for (;;) {
+    XMLNode candidate = getChildNode(name, &cursor);
+    if (candidate.isEmpty())
+      break;
+    if (!attributeValue) {
+      if (!candidate.isAttributeSet(attributeName))
+        continue;
+      if (k)
+        *k = cursor;
+      return candidate;
+    }
+    // Walk every attribute of that name until getAttribute() returns NULL.
+    int attrCursor = 0;
+    while (XMLCSTR value = candidate.getAttribute(attributeName, &attrCursor)) {
+      if (xstricmp(attributeValue, value) == 0) {
+        if (k)
+          *k = cursor;
+        return candidate;
+      }
+    }
+  }
+  return emptyXMLNode;
+}
+
+// Find an attribute on a node.
+// Returns the value of the first attribute named lpszAttrib (xstricmp
+// comparison), scanning from index *j when j is given and from 0 otherwise.
+// On a hit *j is set to one past the hit; on a miss *j is left unchanged and
+// NULL is returned.
+XMLCSTR XMLNode::getAttribute(XMLCSTR lpszAttrib, int *j) const {
+  if (!d)
+    return NULL;
+  const int total = d->nAttribute;
+  for (int idx = (j ? *j : 0); idx < total; ++idx) {
+    const XMLAttribute &attr = d->pAttribute[idx];
+    if (xstricmp(attr.lpszName, lpszAttrib) != 0)
+      continue;
+    if (j)
+      *j = idx + 1;
+    return attr.lpszValue;
+  }
+  return NULL;
+}
+
+// TRUE when the node has at least one attribute named lpszAttrib (xstricmp
+// comparison); FALSE otherwise, including for an empty node.
+char XMLNode::isAttributeSet(XMLCSTR lpszAttrib) const {
+  if (d) {
+    const int total = d->nAttribute;
+    for (int idx = 0; idx < total; ++idx)
+      if (xstricmp(d->pAttribute[idx].lpszName, lpszAttrib) == 0)
+        return TRUE;
+  }
+  return FALSE;
+}
+
+// Value of the j-th (0-based) attribute named `name`: the first j matches are
+// skipped with the cursor-based overload, then the next match is returned.
+// A j of 0 or less returns the first match. NULL on an empty node.
+XMLCSTR XMLNode::getAttribute(XMLCSTR name, int j) const {
+  if (!d)
+    return NULL;
+  int cursor = 0;
+  for (; j > 0; --j)
+    getAttribute(name, &cursor);
+  return getAttribute(name, &cursor);
+}
+
+// Return the i-th content item of the node. Indices [0, nAttribute) address
+// the attributes; the remaining indices address children, texts and clear
+// sections through pOrder, whose entries pack the element type in the low two
+// bits and the per-type index in the remaining bits.
+// An empty node, a negative index, or an index at or beyond
+// nAttribute + nChild + nText + nClear yields etype == eNodeNULL (previously
+// such an index read outside pAttribute / pOrder).
+XMLNodeContents XMLNode::enumContents(int i) const {
+  XMLNodeContents c;
+  if ((!d) || (i < 0) ||
+      (i >= d->nAttribute + d->nChild + d->nText + d->nClear)) {
+    c.etype = eNodeNULL;
+    return c;
+  }
+  if (i < d->nAttribute) {
+    c.etype = eNodeAttribute;
+    c.attrib = d->pAttribute[i];
+    return c;
+  }
+  i -= d->nAttribute;
+  c.etype = (XMLElementType)(d->pOrder[i] & 3);
+  i = (d->pOrder[i]) >> 2;
+  switch (c.etype) {
+  case eNodeChild:
+    c.child = d->pChild[i];
+    break;
+  case eNodeText:
+    c.text = d->pText[i];
+    break;
+  case eNodeClear:
+    c.clear = d->pClear[i];
+    break;
+  default:
+    break;
+  }
+  return c;
+}
+
+// Name and count accessors. Each one returns NULL / 0 for an empty node.
+XMLCSTR XMLNode::getName() const { return d ? d->lpszName : NULL; }
+int XMLNode::nText() const { return d ? d->nText : 0; }
+int XMLNode::nChildNode() const { return d ? d->nChild : 0; }
+int XMLNode::nAttribute() const { return d ? d->nAttribute : 0; }
+int XMLNode::nClear() const { return d ? d->nClear : 0; }
+// Total number of attributes, children, texts and clear sections.
+int XMLNode::nElement() const {
+  return d ? (d->nAttribute + d->nChild + d->nText + d->nClear) : 0;
+}
+// Index-based accessors. Each returns its "empty" value (emptyXMLClear,
+// emptyXMLAttribute, NULL or emptyXMLNode) for an empty node or an
+// out-of-range index. Negative indices are rejected as well; previously only
+// the upper bound was checked, so a negative index read before the array.
+XMLClear XMLNode::getClear(int i) const {
+  if ((!d) || (i < 0) || (i >= d->nClear))
+    return emptyXMLClear;
+  return d->pClear[i];
+}
+XMLAttribute XMLNode::getAttribute(int i) const {
+  if ((!d) || (i < 0) || (i >= d->nAttribute))
+    return emptyXMLAttribute;
+  return d->pAttribute[i];
+}
+XMLCSTR XMLNode::getAttributeName(int i) const {
+  if ((!d) || (i < 0) || (i >= d->nAttribute))
+    return NULL;
+  return d->pAttribute[i].lpszName;
+}
+XMLCSTR XMLNode::getAttributeValue(int i) const {
+  if ((!d) || (i < 0) || (i >= d->nAttribute))
+    return NULL;
+  return d->pAttribute[i].lpszValue;
+}
+XMLCSTR XMLNode::getText(int i) const {
+  if ((!d) || (i < 0) || (i >= d->nText))
+    return NULL;
+  return d->pText[i];
+}
+XMLNode XMLNode::getChildNode(int i) const {
+  if ((!d) || (i < 0) || (i >= d->nChild))
+    return emptyXMLNode;
+  return d->pChild[i];
+}
+// Parent of this node, or emptyXMLNode when the node is empty or has no
+// parent.
+XMLNode XMLNode::getParentNode() const {
+  if (d && d->pParent)
+    return XMLNode(d->pParent);
+  return emptyXMLNode;
+}
+// Declaration flag stored in the node data; 0 for an empty node.
+char XMLNode::isDeclaration() const { return d ? d->isDeclaration : 0; }
+// Non-zero when the node holds no data.
+char XMLNode::isEmpty() const { return !d; }
+// Copy of the shared empty node.
+XMLNode XMLNode::emptyNode() { return emptyXMLNode; }
+
+// Public "add" entry points. Each plain variant duplicates its string
+// arguments with stringDup() before delegating to the matching *_priv()
+// worker; each *_WOSD variant forwards the caller's buffers unchanged.
+// lpszOpen / lpszClose of the clear variants are never duplicated.
+XMLNode XMLNode::addChild(XMLCSTR lpszName, char isDeclaration,
+                          XMLElementPosition pos) {
+  XMLSTR nameCopy = stringDup(lpszName);
+  return addChild_priv(0, nameCopy, isDeclaration, pos);
+}
+XMLNode XMLNode::addChild_WOSD(XMLSTR lpszName, char isDeclaration,
+                               XMLElementPosition pos) {
+  return addChild_priv(0, lpszName, isDeclaration, pos);
+}
+XMLAttribute *XMLNode::addAttribute(XMLCSTR lpszName, XMLCSTR lpszValue) {
+  XMLSTR nameCopy = stringDup(lpszName);
+  XMLSTR valueCopy = stringDup(lpszValue);
+  return addAttribute_priv(0, nameCopy, valueCopy);
+}
+XMLAttribute *XMLNode::addAttribute_WOSD(XMLSTR lpszName, XMLSTR lpszValuev) {
+  return addAttribute_priv(0, lpszName, lpszValuev);
+}
+XMLCSTR XMLNode::addText(XMLCSTR lpszValue, XMLElementPosition pos) {
+  XMLSTR valueCopy = stringDup(lpszValue);
+  return addText_priv(0, valueCopy, pos);
+}
+XMLCSTR XMLNode::addText_WOSD(XMLSTR lpszValue, XMLElementPosition pos) {
+  return addText_priv(0, lpszValue, pos);
+}
+XMLClear *XMLNode::addClear(XMLCSTR lpszValue, XMLCSTR lpszOpen,
+                            XMLCSTR lpszClose, XMLElementPosition pos) {
+  XMLSTR valueCopy = stringDup(lpszValue);
+  return addClear_priv(0, valueCopy, lpszOpen, lpszClose, pos);
+}
+XMLClear *XMLNode::addClear_WOSD(XMLSTR lpszValue, XMLCSTR lpszOpen,
+                                 XMLCSTR lpszClose, XMLElementPosition pos) {
+  return addClear_priv(0, lpszValue, lpszOpen, lpszClose, pos);
+}
+// Public "update" entry points: every new string is duplicated with
+// stringDup() and the copy is passed to the matching *_WOSD() overload. The
+// "old" name/value and index arguments are forwarded unchanged.
+XMLCSTR XMLNode::updateName(XMLCSTR lpszName) {
+  XMLSTR nameCopy = stringDup(lpszName);
+  return updateName_WOSD(nameCopy);
+}
+XMLAttribute *XMLNode::updateAttribute(XMLAttribute *newAttribute,
+                                       XMLAttribute *oldAttribute) {
+  XMLSTR valueCopy = stringDup(newAttribute->lpszValue);
+  XMLSTR nameCopy = stringDup(newAttribute->lpszName);
+  return updateAttribute_WOSD(valueCopy, nameCopy, oldAttribute->lpszName);
+}
+XMLAttribute *XMLNode::updateAttribute(XMLCSTR lpszNewValue,
+                                       XMLCSTR lpszNewName, int i) {
+  XMLSTR valueCopy = stringDup(lpszNewValue);
+  XMLSTR nameCopy = stringDup(lpszNewName);
+  return updateAttribute_WOSD(valueCopy, nameCopy, i);
+}
+XMLAttribute *XMLNode::updateAttribute(XMLCSTR lpszNewValue,
+                                       XMLCSTR lpszNewName,
+                                       XMLCSTR lpszOldName) {
+  XMLSTR valueCopy = stringDup(lpszNewValue);
+  XMLSTR nameCopy = stringDup(lpszNewName);
+  return updateAttribute_WOSD(valueCopy, nameCopy, lpszOldName);
+}
+XMLCSTR XMLNode::updateText(XMLCSTR lpszNewValue, int i) {
+  XMLSTR valueCopy = stringDup(lpszNewValue);
+  return updateText_WOSD(valueCopy, i);
+}
+XMLCSTR XMLNode::updateText(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue) {
+  XMLSTR valueCopy = stringDup(lpszNewValue);
+  return updateText_WOSD(valueCopy, lpszOldValue);
+}
+XMLClear *XMLNode::updateClear(XMLCSTR lpszNewContent, int i) {
+  XMLSTR contentCopy = stringDup(lpszNewContent);
+  return updateClear_WOSD(contentCopy, i);
+}
+XMLClear *XMLNode::updateClear(XMLCSTR lpszNewValue, XMLCSTR lpszOldValue) {
+  XMLSTR valueCopy = stringDup(lpszNewValue);
+  return updateClear_WOSD(valueCopy, lpszOldValue);
+}
+// Replace the clear section identified by oldP->lpszValue with a stringDup()
+// copy of newP->lpszValue. Returns NULL when either pointer is NULL, matching
+// the NULL result of updateClear_WOSD(XMLClear *, XMLClear *) for a NULL oldP
+// (previously both pointers were dereferenced unchecked).
+XMLClear *XMLNode::updateClear(XMLClear *newP, XMLClear *oldP) {
+  if ((!newP) || (!oldP))
+    return NULL;
+  return updateClear_WOSD(stringDup(newP->lpszValue), oldP->lpszValue);
+}
+
+// Store the library-wide parsing options. The three flags are always stored.
+// Wide-char build: the encoding is stored only when it is non-zero, and 0 is
+// always returned.
+// Char build: a recognised encoding is stored together with its byte table
+// and 0 is returned; an unrecognised encoding returns 1 and leaves the
+// encoding and byte table untouched.
+char XMLNode::setGlobalOptions(XMLCharEncoding _characterEncoding,
+                               char _guessWideCharChars, char _dropWhiteSpace,
+                               char _removeCommentsInMiddleOfText) {
+  guessWideCharChars = _guessWideCharChars;
+  dropWhiteSpace = _dropWhiteSpace;
+  removeCommentsInMiddleOfText = _removeCommentsInMiddleOfText;
+#ifdef _XMLWIDECHAR
+  if (_characterEncoding)
+    characterEncoding = _characterEncoding;
+#else
+  // Pick the byte table first; bail out before touching any encoding state
+  // when the encoding is not one we know.
+  switch (_characterEncoding) {
+  case char_encoding_UTF8:
+    XML_ByteTable = XML_utf8ByteTable;
+    break;
+  case char_encoding_legacy:
+    XML_ByteTable = XML_legacyByteTable;
+    break;
+  case char_encoding_ShiftJIS:
+    XML_ByteTable = XML_sjisByteTable;
+    break;
+  case char_encoding_GB2312:
+    XML_ByteTable = XML_gb2312ByteTable;
+    break;
+  case char_encoding_Big5:
+  case char_encoding_GBK:
+    XML_ByteTable = XML_gbk_big5_ByteTable;
+    break;
+  default:
+    return 1;
+  }
+  characterEncoding = _characterEncoding;
+#endif
+  return 0;
+}
+
+// Guess the character encoding of the first `l` bytes of `buf`.
+//
+// Returns (XMLCharEncoding)0 in wide-char builds, when fewer than 25 bytes
+// are supplied, or when guessWideCharChars is set and myIsTextWideChar()
+// accepts the buffer. A leading EF BB BF byte sequence returns
+// char_encoding_UTF8 at once. Otherwise the bytes are checked against the
+// UTF-8 multi-byte pattern, and, when useXMLEncodingAttribute is non-zero, an
+// `encoding=` declaration within the first 200 bytes refines the result.
+// char_encoding_error is returned when the declaration says UTF-8 but the
+// bytes did not match the UTF-8 pattern.
+XMLNode::XMLCharEncoding
+XMLNode::guessCharEncoding(void *buf, int l, char useXMLEncodingAttribute) {
+#ifdef _XMLWIDECHAR
+  return (XMLCharEncoding)0;
+#else
+  if (l < 25)
+    return (XMLCharEncoding)0;
+  if (guessWideCharChars && (myIsTextWideChar(buf, l)))
+    return (XMLCharEncoding)0;
+  unsigned char *b = (unsigned char *)buf;
+  if ((b[0] == 0xef) && (b[1] == 0xbb) && (b[2] == 0xbf))
+    return char_encoding_UTF8;
+
+  // Match utf-8 model ?
+  // XML_utf8ByteTable[lead byte] is used as the length of the sequence. The
+  // cases below fall through on purpose so that a sequence of length N checks
+  // N-1 continuation bytes. A continuation byte that is not 10bbbbbb switches
+  // the guess to legacy and sets i = l to end the scan.
+  XMLCharEncoding bestGuess = char_encoding_UTF8;
+  int i = 0;
+  while (i < l)
+    switch (XML_utf8ByteTable[b[i]]) {
+    case 4:
+      i++;
+      if ((i < l) && (b[i] & 0xC0) != 0x80) {
+        bestGuess = char_encoding_legacy;
+        i = l;
+      } // 10bbbbbb ?
+    case 3:
+      i++;
+      if ((i < l) && (b[i] & 0xC0) != 0x80) {
+        bestGuess = char_encoding_legacy;
+        i = l;
+      } // 10bbbbbb ?
+    case 2:
+      i++;
+      if ((i < l) && (b[i] & 0xC0) != 0x80) {
+        bestGuess = char_encoding_legacy;
+        i = l;
+      } // 10bbbbbb ?
+    case 1:
+      i++;
+      break;
+    case 0:
+      // A table value of 0 ends the scan.
+      i = l;
+    }
+  if (!useXMLEncodingAttribute)
+    return bestGuess;
+  // if encoding is specified and different from utf-8 than it's non-utf8
+  // otherwise it's utf-8
+  char bb[201];
+  l = mmin(l, 200);
+  memcpy(bb, buf, l); // copy buf into bb to be able to do "bb[l]=0"
+  bb[l] = 0;
+  b = (unsigned char *)strstr(bb, "encoding");
+  if (!b)
+    return bestGuess;
+  b += 8; // step over the 8 characters of "encoding"
+  // Expect: optional spaces, '=', optional spaces, an opening quote, optional
+  // spaces, then the encoding name. Anything else keeps the byte-based guess.
+  while
+    XML_isSPACECHAR(*b) b++;
+  if (*b != '=')
+    return bestGuess;
+  b++;
+  while
+    XML_isSPACECHAR(*b) b++;
+  if ((*b != '\'') && (*b != '"'))
+    return bestGuess;
+  b++;
+  while
+    XML_isSPACECHAR(*b) b++;
+
+  // The name is matched by prefix with xstrnicmp().
+  if ((xstrnicmp((char *)b, "utf-8", 5) == 0) ||
+      (xstrnicmp((char *)b, "utf8", 4) == 0)) {
+    if (bestGuess == char_encoding_legacy)
+      return char_encoding_error;
+    return char_encoding_UTF8;
+  }
+
+  if ((xstrnicmp((char *)b, "shiftjis", 8) == 0) ||
+      (xstrnicmp((char *)b, "shift-jis", 9) == 0) ||
+      (xstrnicmp((char *)b, "sjis", 4) == 0))
+    return char_encoding_ShiftJIS;
+
+  if (xstrnicmp((char *)b, "GB2312", 6) == 0)
+    return char_encoding_GB2312;
+  if (xstrnicmp((char *)b, "Big5", 4) == 0)
+    return char_encoding_Big5;
+  if (xstrnicmp((char *)b, "GBK", 3) == 0)
+    return char_encoding_GBK;
+
+  // Any other declared encoding is reported as legacy.
+  return char_encoding_legacy;
+#endif
+}
+#undef XML_isSPACECHAR
+
+//////////////////////////////////////////////////////////
+//      Here starts the base64 conversion functions.    //
+//////////////////////////////////////////////////////////
+
+// Padding character written by encode() when the input length is not a
+// multiple of 3.
+static const char base64Fillchar =
+    _CXML('='); // used to mark partial words at the end
+
+// this lookup table defines the base64 encoding
+// (index = 6-bit value, entry = output character)
+XMLCSTR base64EncodeTable =
+    _CXML("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
+
+// The decode table is indexed by character code (0-255). For a valid base64
+// character the entry is its index (0-63) in the base64 encode table. The
+// other entries are markers:
+//   96: '='  -  97: space char  -  98: illegal char  -  99: end of string
+const unsigned char base64DecodeTable[] = {
+    99, 98, 98, 98, 98, 98, 98, 98, 98, 97, 97, 98, 98, 97, 98, 98,
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, // 00 -29
+    98, 98, 97, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 62, 98, 98,
+    98, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 98, 98, // 30 -59
+    98, 96, 98, 98, 98, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
+    11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 60 -89
+    25, 98, 98, 98, 98, 98, 98, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+    35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, // 90 -119
+    49, 50, 51, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, // 120 -149
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, // 150 -179
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, // 180 -209
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,        // 210 -239
+    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98 // 240 -255
+};
+
+// The destructor releases the internal work buffer.
+XMLParserBase64Tool::~XMLParserBase64Tool() { freeBuffer(); }
+
+// Release the internal work buffer (if any) and reset the bookkeeping so the
+// tool can be reused.
+void XMLParserBase64Tool::freeBuffer() {
+  if (buf != NULL) {
+    free(buf);
+    buf = NULL;
+  }
+  buflen = 0;
+}
+
+// Number of characters encode() needs for `inlen` input bytes: four per
+// started group of three bytes, one for the terminating zero, and, when
+// `formatted` is set, one more per 54 input bytes for the line breaks that
+// encode() inserts.
+int XMLParserBase64Tool::encodeLength(int inlen, char formatted) {
+  const unsigned int base = ((inlen - 1) / 3 * 4 + 4 + 1);
+  const unsigned int lineBreaks = formatted ? inlen / 54 : 0;
+  return base + lineBreaks;
+}
+
+// Base64-encode `inlen` bytes from `inbuf` into the tool's internal buffer
+// and return that buffer as a zero-terminated string. The buffer is sized
+// with encodeLength() and (re)allocated through alloc(). When `formatted` is
+// set, a '\n' is written after every 18th complete 3-byte group.
+XMLSTR XMLParserBase64Tool::encode(unsigned char *inbuf, unsigned int inlen,
+                                   char formatted) {
+  const int fullGroups = inlen / 3;
+  alloc(encodeLength(inlen, formatted) * sizeof(XMLCHAR));
+  XMLSTR out = (XMLSTR)buf;
+  const unsigned char *src = inbuf;
+  int untilBreak = 17; // complete groups still to emit before the next '\n'
+  for (int g = 0; g < fullGroups; ++g, src += 3) {
+    // Pack three input bytes into the low 24 bits, then emit four 6-bit
+    // digits, most significant first.
+    const int word = (src[0] << 16) | (src[1] << 8) | src[2];
+    *out++ = base64EncodeTable[word >> 18];
+    *out++ = base64EncodeTable[(word >> 12) & 0x3f];
+    *out++ = base64EncodeTable[(word >> 6) & 0x3f];
+    *out++ = base64EncodeTable[word & 0x3f];
+    if (formatted && (untilBreak-- == 0)) {
+      *out++ = _CXML('\n');
+      untilBreak = 17;
+    }
+  }
+  // 0, 1 or 2 bytes are left over; pad the last quad with the fill character.
+  const int tail = inlen - fullGroups * 3;
+  if (tail == 1) {
+    *out++ = base64EncodeTable[src[0] >> 2];
+    *out++ = base64EncodeTable[(src[0] << 4) & 0x3F];
+    *out++ = base64Fillchar;
+    *out++ = base64Fillchar;
+  } else if (tail == 2) {
+    const int word = (src[0] << 8) | src[1];
+    *out++ = base64EncodeTable[word >> 10];
+    *out++ = base64EncodeTable[(word >> 4) & 0x3f];
+    *out++ = base64EncodeTable[(word << 2) & 0x3f];
+    *out++ = base64Fillchar;
+  }
+  *out = 0;
+  return (XMLSTR)buf;
+}
+
+// Compute how many bytes decoding `data` yields, without decoding it.
+//
+// Whitespace (table value 97) is ignored. An illegal character (table value
+// 98, or a wide character above 255) sets *xe to
+// eXMLErrorBase64DecodeIllegalCharacter and returns 0. If the number of
+// payload plus '=' characters is not a multiple of 4, *xe is set to
+// eXMLErrorBase64DataSizeIsNotMultipleOf4 but a size is still returned.
+// `xe` may be NULL.
+//
+// Payload and '=' characters are counted separately. The previous version
+// counted both together and then stepped backwards from the end of the string
+// to discount trailing '='; that read before the start of `data` for input
+// made only of '=', and over-reported the size when whitespace followed the
+// padding.
+unsigned int XMLParserBase64Tool::decodeSize(XMLCSTR data, XMLError *xe) {
+  if (xe)
+    *xe = eXMLErrorNone;
+  int nData = 0; // characters carrying 6 payload bits each
+  int nPad = 0;  // '=' fill characters
+  for (; *data; data++) {
+#ifdef _XMLWIDECHAR
+    if (*data > 255) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeIllegalCharacter;
+      return 0;
+    }
+#endif
+    const unsigned char c = base64DecodeTable[(unsigned char)(*data)];
+    if (c < 96)
+      nData++;
+    else if (c == 96)
+      nPad++;
+    else if (c == 98) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeIllegalCharacter;
+      return 0;
+    }
+    // c == 97: skip extra characters (e.g. newlines or spaces)
+  }
+  if (xe && ((nData + nPad) % 4 != 0))
+    *xe = eXMLErrorBase64DataSizeIsNotMultipleOf4;
+  // Every 4 payload characters carry 3 bytes.
+  return (unsigned int)((nData * 3) / 4);
+}
+
+// Decode the base64 string `data` into `buf`, which can hold `len` bytes.
+//
+// Return value:
+//   2 - the end of the string was reached at a 4-character boundary, or
+//       padding was found once exactly `len` bytes had been written;
+//   1 - the data is truncated (*xe = eXMLErrorBase64DecodeTruncatedData);
+//   0 - an illegal character was found
+//       (*xe = eXMLErrorBase64DecodeIllegalCharacter) or more than `len`
+//       bytes would be produced (*xe = eXMLErrorBase64DecodeBufferTooSmall).
+// `xe` may be NULL; when given it is reset to eXMLErrorNone first.
+unsigned char XMLParserBase64Tool::decode(XMLCSTR data, unsigned char *buf,
+                                          int len, XMLError *xe) {
+  if (xe)
+    *xe = eXMLErrorNone;
+  // i indexes the input characters, p the output bytes.
+  int i = 0, p = 0;
+  unsigned char d, c;
+  // Each pass of the loop consumes four base64 characters and writes up to
+  // three bytes.
+  for (;;) {
+
+// BASE64DECODE_READ_NEXT_CHAR(c): load into c the decode-table value of the
+// next input character, skipping whitespace (97). An illegal character (98)
+// makes the enclosing function return 0.
+#ifdef _XMLWIDECHAR
+#define BASE64DECODE_READ_NEXT_CHAR(c)                                         \
+  do {                                                                         \
+    if (data[i] > 255) {                                                       \
+      c = 98;                                                                  \
+      break;                                                                   \
+    }                                                                          \
+    c = base64DecodeTable[(unsigned char)data[i++]];                           \
+  } while (c == 97);                                                           \
+  if (c == 98) {                                                               \
+    if (xe)                                                                    \
+      *xe = eXMLErrorBase64DecodeIllegalCharacter;                             \
+    return 0;                                                                  \
+  }
+#else
+#define BASE64DECODE_READ_NEXT_CHAR(c)                                         \
+  do {                                                                         \
+    c = base64DecodeTable[(unsigned char)data[i++]];                           \
+  } while (c == 97);                                                           \
+  if (c == 98) {                                                               \
+    if (xe)                                                                    \
+      *xe = eXMLErrorBase64DecodeIllegalCharacter;                             \
+    return 0;                                                                  \
+  }
+#endif
+
+    // 1st character of the quad: end of string here is a clean finish.
+    BASE64DECODE_READ_NEXT_CHAR(c)
+    if (c == 99) {
+      return 2;
+    }
+    if (c == 96) {
+      if (p == (int)len)
+        return 2;
+      if (xe)
+        *xe = eXMLErrorBase64DecodeTruncatedData;
+      return 1;
+    }
+
+    // 2nd character: together with the 1st it yields output byte 1.
+    BASE64DECODE_READ_NEXT_CHAR(d)
+    if ((d == 99) || (d == 96)) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeTruncatedData;
+      return 1;
+    }
+    if (p == (int)len) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeBufferTooSmall;
+      return 0;
+    }
+    buf[p++] = (unsigned char)((c << 2) | ((d >> 4) & 0x3));
+
+    // 3rd character: together with the 2nd it yields output byte 2. Padding
+    // is accepted here only when the output buffer is exactly full.
+    BASE64DECODE_READ_NEXT_CHAR(c)
+    if (c == 99) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeTruncatedData;
+      return 1;
+    }
+    if (p == (int)len) {
+      if (c == 96)
+        return 2;
+      if (xe)
+        *xe = eXMLErrorBase64DecodeBufferTooSmall;
+      return 0;
+    }
+    if (c == 96) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeTruncatedData;
+      return 1;
+    }
+    buf[p++] = (unsigned char)(((d << 4) & 0xf0) | ((c >> 2) & 0xf));
+
+    // 4th character: together with the 3rd it yields output byte 3, under the
+    // same padding rule as above.
+    BASE64DECODE_READ_NEXT_CHAR(d)
+    if (d == 99) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeTruncatedData;
+      return 1;
+    }
+    if (p == (int)len) {
+      if (d == 96)
+        return 2;
+      if (xe)
+        *xe = eXMLErrorBase64DecodeBufferTooSmall;
+      return 0;
+    }
+    if (d == 96) {
+      if (xe)
+        *xe = eXMLErrorBase64DecodeTruncatedData;
+      return 1;
+    }
+    buf[p++] = (unsigned char)(((c << 6) & 0xc0) | d);
+  }
+}
+#undef BASE64DECODE_READ_NEXT_CHAR
+
+// Make sure the internal buffer holds at least `newsize` bytes. A missing
+// buffer is created with malloc(); an existing one is grown with realloc()
+// only when `newsize` exceeds the recorded size. The buffer never shrinks.
+void XMLParserBase64Tool::alloc(int newsize) {
+  if ((buf == NULL) && (newsize != 0))
+    buf = malloc(newsize);
+  else if (newsize > buflen)
+    buf = realloc(buf, newsize);
+  else
+    return; // already large enough (or nothing requested)
+  buflen = newsize;
+}
+
+// Decode `data` into the tool's internal buffer and return that buffer.
+// *outlen (when given) receives the size computed by decodeSize(). Returns
+// NULL when that size is 0 or when the low-level decode() returns 0; `xe`
+// (when given) carries the error code set by those calls.
+unsigned char *XMLParserBase64Tool::decode(XMLCSTR data, int *outlen,
+                                           XMLError *xe) {
+  if (xe)
+    *xe = eXMLErrorNone;
+  const unsigned int needed = decodeSize(data, xe);
+  if (outlen)
+    *outlen = needed;
+  if (needed == 0)
+    return NULL;
+  alloc(needed + 1);
+  unsigned char *out = (unsigned char *)buf;
+  return decode(data, out, needed, xe) ? out : NULL;
+}
diff --git a/src/gpuwattch/xmlParser.h b/src/gpuwattch/xmlParser.h
new file mode 100644
index 000000000..fe875facb
--- /dev/null
+++ b/src/gpuwattch/xmlParser.h
@@ -0,0 +1,971 @@
+/****************************************************************************/
+/*! \mainpage XMLParser library
+ * \section intro_sec Introduction
+ *
+ * This is a basic XML parser written in ANSI C++ for portability.
+ * It works by using recursion and a node tree for breaking
+ * down the elements of an XML document.
+ *
+ * @version     V2.41
+ * @author      Frank Vanden Berghen
+ *
+ * The following license terms for the "XMLParser library from Business-Insight"
+ *apply to projects that are in some way related to the "mcpat project",
+ *including applications using "mcpat project" and tools developed for enhancing
+ *"mcpat project". All other projects (not related to "mcpat project") have to
+ *use the "XMLParser library from Business-Insight" code under the Aladdin Free
+ *Public License (AFPL). See the file "AFPL-license.txt" for more information
+ *about the AFPL license. (see http://www.artifex.com/downloads/doc/Public.htm
+ *for detailed AFPL terms)
+ *
+ * Redistribution and use of the "XMLParser library from Business-Insight" in
+ *source and binary forms, with or without modification, are permitted provided
+ *that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Frank Vanden Berghen nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Business-Insight ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Business-Insight BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Copyright (c) 2002, Business-Insight
+ * <a href="http://www.Business-Insight.com">Business-Insight</a>
+ * All rights reserved.
+ *
+ * \section tutorial First Tutorial
+ * You can follow a simple <a href="../../xmlParser.html">Tutorial</a> to know
+ *the basics...
+ *
+ * \section usage General usage: How to include the XMLParser library inside
+ *your project.
+ *
+ * The library is composed of two files: <a
+ *href="../../xmlParser.cpp">xmlParser.cpp</a> and <a
+ *href="../../xmlParser.h">xmlParser.h</a>. These are the ONLY 2 files that you
+ *need when using the library inside your own projects.
+ *
+ * All the functions of the library are documented inside the comments of the
+ *file <a href="../../xmlParser.h">xmlParser.h</a>. These comments can be
+ *transformed in full-fledged HTML documentation using the DOXYGEN software:
+ *simply type: "doxygen doxy.cfg"
+ *
+ * By default, the XMLParser library uses (char*) for string representation. To
+ *use the (wchar_t*) version of the library, you need to define the "_UNICODE"
+ *preprocessor definition variable (this is usually done inside your project
+ *definition file) (This is done automatically for you when using Visual
+ *Studio).
+ *
+ * \section example Advanced Tutorial and Many Examples of usage.
+ *
+ * Some very small introductory examples are described inside the Tutorial file
+ * <a href="../../xmlParser.html">xmlParser.html</a>
+ *
+ * Some additional small examples are also inside the file <a
+ *href="../../xmlTest.cpp">xmlTest.cpp</a> (for the "char*" version of the
+ *library) and inside the file <a
+ *href="../../xmlTestUnicode.cpp">xmlTestUnicode.cpp</a> (for the "wchar_t*"
+ * version of the library). If you have a question, please review these
+ *additional examples before sending an e-mail to the author.
+ *
+ * To build the examples:
+ * - linux/unix: type "make"
+ * - solaris: type "make -f makefile.solaris"
+ * - windows: Visual Studio: double-click on xmlParser.dsw
+ *   (under Visual Studio .NET, the .dsp and .dsw files will be automatically
+ *converted to .vcproj and .sln files)
+ *
+ * In order to build the examples you need some additional files:
+ * - linux/unix: makefile
+ * - solaris: makefile.solaris
+ * - windows: Visual Studio: *.dsp, xmlParser.dsw and also xmlParser.lib and
+ *xmlParser.dll
+ *
+ * \section debugging Debugging with the XMLParser library
+ *
+ * \subsection debugwin Debugging under WINDOWS
+ *
+ * 	Inside Visual C++, the "debug versions" of the memory allocation
+ *functions are very slow: Do not forget to compile in "release mode" to get
+ *maximum speed. When I had to debug a software that was using the XMLParser
+ *Library, it was usually a nightmare because the library was sooOOOoooo slow in
+ *debug mode (because of the slow memory allocations in Debug mode). To solve
+ *this problem, during all the debugging session, I am now using a very fast DLL
+ *version of the XMLParser Library (the DLL is compiled in release mode). Using
+ *the DLL version of the XMLParser Library allows me to have lightning XML
+ *parsing speed even in debug! Other than that, the DLL version is useless: In
+ *the release version of my tool, I always use the normal, ".cpp"-based,
+ *XMLParser Library (I simply include the <a
+ *href="../../xmlParser.cpp">xmlParser.cpp</a> and <a
+ *href="../../xmlParser.h">xmlParser.h</a> files into the project).
+ *
+ * 	The file <a href="../../XMLNodeAutoexp.txt">XMLNodeAutoexp.txt</a>
+ *contains some "tweaks" that substantially improve the display of the content
+ *of the XMLNode objects inside the Visual Studio Debugger. Believe me, once you
+ *have seen inside the debugger the "smooth" display of the XMLNode objects, you
+ *cannot live without it anymore!
+ *
+ * \subsection debuglinux Debugging under LINUX/UNIX
+ *
+ * 	The speed of the debug version of the XMLParser library is tolerable so
+ *no extra work has been done.
+ *
+ ****************************************************************************/
+
+#ifndef __INCLUDE_XML_NODE__
+#define __INCLUDE_XML_NODE__
+
+#include <stdlib.h>
+
+#ifdef _UNICODE
+// If you comment the next "define" line then the library will never "switch to"
+// _UNICODE (wchar_t*) mode (16/32 bits per characters). This is useful when you
+// get error messages like:
+//    'XMLNode::openFileHelper' : cannot convert parameter 2 from 'const char
+//    [5]' to 'const wchar_t *'
+// The _XMLWIDECHAR preprocessor variable forces the XMLParser library into
+// either utf16/32-mode (the preprocessor variable must be defined) or
+// utf8-mode (the preprocessor variable must be undefined).
+#define _XMLWIDECHAR
+#endif
+
+#if defined(WIN32) || defined(UNDER_CE) || defined(_WIN32) ||                  \
+    defined(WIN64) || defined(__BORLANDC__)
+// comment the next line if you are under windows and the compiler is not
+// Microsoft Visual Studio (6.0 or .NET) or Borland
+#define _XMLWINDOWS
+#endif
+
+#ifdef XMLDLLENTRY
+#undef XMLDLLENTRY
+#endif
+#ifdef _USE_XMLPARSER_DLL
+#ifdef _DLL_EXPORTS_
+#define XMLDLLENTRY __declspec(dllexport)
+#else
+#define XMLDLLENTRY __declspec(dllimport)
+#endif
+#else
+#define XMLDLLENTRY
+#endif
+
+// uncomment the next line if you want no support for wchar_t* (no need for the
+// <wchar.h> or <tchar.h> libraries anymore to compile)
+//#define XML_NO_WIDE_CHAR
+
+#ifdef XML_NO_WIDE_CHAR
+#undef _XMLWINDOWS
+#undef _XMLWIDECHAR
+#endif
+
+#ifdef _XMLWINDOWS
+#include <tchar.h>
+#else
+#define XMLDLLENTRY
+#ifndef XML_NO_WIDE_CHAR
+#include <wchar.h> // to have 'wcsrtombs' for ANSI version
+                   // to have 'mbsrtowcs' for WIDECHAR version
+#endif
+#endif
+
+// Some common types for char set portable code
+// _CXML(c) wraps a character or string literal; XMLCSTR, XMLSTR and XMLCHAR
+// are the const-string, mutable-string and character types used throughout
+// the library. With _XMLWIDECHAR defined they are based on wchar_t (and
+// _CXML adds the L prefix), otherwise on char.
+#ifdef _XMLWIDECHAR
+#define _CXML(c) L##c
+#define XMLCSTR const wchar_t *
+#define XMLSTR wchar_t *
+#define XMLCHAR wchar_t
+#else
+#define _CXML(c) c
+#define XMLCSTR const char *
+#define XMLSTR char *
+#define XMLCHAR char
+#endif
+// FALSE / TRUE are defined here only if nothing has defined them already.
+#ifndef FALSE
+#define FALSE 0
+#endif /* FALSE */
+#ifndef TRUE
+#define TRUE 1
+#endif /* TRUE */
+
+/// Enumeration for XML parse errors.
+/// eXMLErrorNone (0) means "no error". The last four values are reported by
+/// the base64 helpers of XMLParserBase64Tool.
+typedef enum XMLError {
+  eXMLErrorNone = 0,
+  eXMLErrorMissingEndTag,
+  eXMLErrorNoXMLTagFound,
+  eXMLErrorEmpty,
+  eXMLErrorMissingTagName,
+  eXMLErrorMissingEndTagName,
+  eXMLErrorUnmatchedEndTag,
+  eXMLErrorUnmatchedEndClearTag,
+  eXMLErrorUnexpectedToken,
+  eXMLErrorNoElements,
+  eXMLErrorFileNotFound,
+  eXMLErrorFirstTagNotFound,
+  eXMLErrorUnknownCharacterEntity,
+  eXMLErrorCharacterCodeAbove255,
+  eXMLErrorCharConversionError,
+  eXMLErrorCannotOpenWriteFile,
+  eXMLErrorCannotWriteFile,
+
+  /// Set by decodeSize() when the number of base64 characters (padding
+  /// included) is not a multiple of 4.
+  eXMLErrorBase64DataSizeIsNotMultipleOf4,
+  /// Set when the input contains a character marked illegal in the base64
+  /// decode table.
+  eXMLErrorBase64DecodeIllegalCharacter,
+  /// Set by decode() when the input ends, or padding appears, in the middle
+  /// of a 4-character group.
+  eXMLErrorBase64DecodeTruncatedData,
+  /// Set by decode() when the output buffer is full but more data follows.
+  eXMLErrorBase64DecodeBufferTooSmall
+} XMLError;
+
+/// Enumeration used to manage type of data. Use in conjunction with structure
+/// XMLNodeContents
+/// \note XMLNode::enumContents() recovers the element type from the low two
+/// bits of an ordering entry (`pOrder[i] & 3`), so the values 0-3 below must
+/// keep their numbering. eNodeNULL is what enumContents() reports for an
+/// empty node.
+typedef enum XMLElementType {
+  eNodeChild = 0,
+  eNodeAttribute = 1,
+  eNodeText = 2,
+  eNodeClear = 3,
+  eNodeNULL = 4
+} XMLElementType;
+
+/// Structure used to obtain error details if the parse fails.
+typedef struct XMLResults {
+  enum XMLError error; ///< Error code (eXMLErrorNone when there is no error)
+  int nLine, nColumn;  ///< Presumably the line and column in the parsed
+                       ///< input where the error was detected - TODO confirm
+                       ///< against the parser
+} XMLResults;
+
+/// Structure for XML clear (unformatted) node (usually comments)
+typedef struct XMLClear {
+  XMLCSTR lpszValue;    ///< Content of the clear section; the value that
+                        ///< XMLNode::updateClear() replaces
+  XMLCSTR lpszOpenTag;  ///< Presumably the opening delimiter of the section
+                        ///< (see the lpszOpen argument of XMLNode::addClear)
+  XMLCSTR lpszCloseTag; ///< Presumably the closing delimiter of the section
+                        ///< (see the lpszClose argument of XMLNode::addClear)
+} XMLClear;
+
+/// Structure for XML attribute.
+typedef struct XMLAttribute {
+  XMLCSTR lpszName;  ///< Attribute name (returned by XMLNode::getAttributeName)
+  XMLCSTR lpszValue; ///< Attribute value (returned by
+                     ///< XMLNode::getAttributeValue)
+} XMLAttribute;
+
+/// XMLElementPosition are not interchangeable with simple indexes
+typedef int XMLElementPosition;
+
+struct XMLNodeContents;
+
+/** @defgroup XMLParserGeneral The XML parser */
+
+/// Main Class representing a XML node
+/**
+ * All operations are performed using this class.
+ * \note The constructors of the XMLNode class are protected, so use instead one
+ * of these four methods to get your first instance of XMLNode: <ul> <li>
+ * XMLNode::parseString </li> <li> XMLNode::parseFile </li> <li>
+ * XMLNode::openFileHelper </li> <li> XMLNode::createXMLTopNode (or
+ * XMLNode::createXMLTopNode_WOSD)</li>
+ * </ul> */
+typedef struct XMLDLLENTRY XMLNode {
+private:
+  struct XMLNodeDataTag;
+
+  /// Constructors are protected, so use instead one of: XMLNode::parseString,
+  /// XMLNode::parseFile, XMLNode::openFileHelper, XMLNode::createXMLTopNode
+  XMLNode(struct XMLNodeDataTag *pParent, XMLSTR lpszName, char isDeclaration);
+  /// Constructors are protected, so use instead one of: XMLNode::parseString,
+  /// XMLNode::parseFile, XMLNode::openFileHelper, XMLNode::createXMLTopNode
+  XMLNode(struct XMLNodeDataTag *p);
+
+public:
+  static XMLCSTR getVersion(); ///< Return the XMLParser library version number
+
+  /** @defgroup conversions Parsing XML files/strings to an XMLNode structure
+   * and Rendering XMLNode's to files/string.
+   * @ingroup XMLParserGeneral
+   * @{ */
+
+  /// Parse an XML string and return the root of a XMLNode tree representing the
+  /// string.
+  static XMLNode parseString(XMLCSTR lpXMLString, XMLCSTR tag = NULL,
+                             XMLResults *pResults = NULL);
+  /**< The "parseString" function parses an XML string and returns the root of a
+   * XMLNode tree. The "opposite" of this function is the function
+   * "createXMLString" that re-creates an XML string from an XMLNode tree. If
+   * the XML document is corrupted, the "parseString" method will initialize the
+   * "pResults" variable with some information that can be used to trace the
+   * error. If you still want to parse the file, you can use the
+   * APPROXIMATE_PARSING option as explained inside the note at the beginning of
+   * the "xmlParser.cpp" file.
+   *
+   * @param lpXMLString the XML string to parse
+   * @param tag  the name of the first tag inside the XML file. If the tag
+   * parameter is omitted, this function returns a node that represents the head
+   * of the xml document including the declaration term (<? ... ?>).
+   * @param pResults a pointer to a XMLResults variable that will contain some
+   * information that can be used to trace the XML parsing error. You can have a
+   * user-friendly explanation of the parsing error with the "getError"
+   * function.
+   */
+
+  /// Parse an XML file and return the root of a XMLNode tree representing the
+  /// file.
+  static XMLNode parseFile(XMLCSTR filename, XMLCSTR tag = NULL,
+                           XMLResults *pResults = NULL);
+  /**< The "parseFile" function parses an XML file and returns the root of a
+   * XMLNode tree. The "opposite" of this function is the function "writeToFile"
+   * that re-creates an XML file from an XMLNode tree. If the XML document is
+   * corrupted, the "parseFile" method will initialize the "pResults" variable
+   * with some information that can be used to trace the error. If you still
+   * want to parse the file, you can use the APPROXIMATE_PARSING option as
+   * explained inside the note at the beginning of the "xmlParser.cpp" file.
+   *
+   * @param filename the path to the XML file to parse
+   * @param tag the name of the first tag inside the XML file. If the tag
+   * parameter is omitted, this function returns a node that represents the head
+   * of the xml document including the declaration term (<? ... ?>).
+   * @param pResults a pointer to a XMLResults variable that will contain some
+   * information that can be used to trace the XML parsing error. You can have a
+   * user-friendly explanation of the parsing error with the "getError"
+   * function.
+   */
+
+  /// Parse an XML file and return the root of a XMLNode tree representing the
+  /// file. A very crude error checking is made. An attempt to guess the Char
+  /// Encoding used in the file is made.
+  static XMLNode openFileHelper(XMLCSTR filename, XMLCSTR tag = NULL);
+  /**< The "openFileHelper" function reports to the screen all the warnings and
+   * errors that occurred during parsing of the XML file. This function also
+   * tries to guess char Encoding (UTF-8, ASCII or SHIFT-JIS) based on the first
+   * 200 bytes of the file. Since each application has its own way to report and
+   * deal with errors, you should rather use the "parseFile" function to parse
+   * XML files and program yourself thereafter an "error reporting" tailored for
+   * your needs (instead of using the very crude "error reporting" mechanism
+   * included inside the "openFileHelper" function).
+   *
+   * If the XML document is corrupted, the "openFileHelper" method will:
+   *         - display an error message on the console (or inside a messageBox
+   * for windows).
+   *         - stop execution (exit).
+   *
+   * I strongly suggest that you write your own "openFileHelper" method tailored
+   * to your needs. If you still want to parse the file, you can use the
+   * APPROXIMATE_PARSING option as explained inside the note at the beginning of
+   * the "xmlParser.cpp" file.
+   *
+   * @param filename the path of the XML file to parse.
+   * @param tag the name of the first tag inside the XML file. If the tag
+   * parameter is omitted, this function returns a node that represents the head
+   * of the xml document including the declaration term (<? ... ?>).
+   */
+
+  static XMLCSTR getError(XMLError error); ///< this gives you a user-friendly
+                                           ///< explanation of the parsing error
+
+  /// Create an XML string starting from the current XMLNode.
+  XMLSTR createXMLString(int nFormat = 1, int *pnSize = NULL) const;
+  /**< The returned string should be free'd using the "freeXMLString" function.
+   *
+   *   If nFormat==0, no formatting is required otherwise this returns a
+   * user-friendly XML string from a given element
+   *   with appropriate white spaces and carriage returns. If pnSize is given it
+   * returns the size in characters of the string. */
+
+  /// Save the content of an xmlNode inside a file
+  XMLError writeToFile(XMLCSTR filename, const char *encoding = NULL,
+                       char nFormat = 1) const;
+  /**< If nFormat==0, no formatting is required otherwise this returns a
+   * user-friendly XML string from a given element with appropriate white
+   * spaces and carriage returns. If the global parameter
+   * "characterEncoding==char_encoding_UTF8", then the "encoding" parameter is
+   * ignored and always set to "utf-8". If the global parameter
+   * "characterEncoding==char_encoding_ShiftJIS", then the "encoding" parameter
+   * is ignored and always set to "SHIFT-JIS". If "_XMLWIDECHAR=1", then the
+   * "encoding" parameter is ignored and always set to "utf-16". If no
+   * "encoding" parameter is given the "ISO-8859-1" encoding is used. */
+  /** @} */
+
+  /** @defgroup navigate Navigate the XMLNode structure
+   * @ingroup XMLParserGeneral
+   * @{ */
+  XMLCSTR getName() const;               ///< name of the node
+  XMLCSTR getText(int i = 0) const;      ///< return ith text field
+  int nText() const;                     ///< nbr of text field
+  XMLNode getParentNode() const;         ///< return the parent node
+  XMLNode getChildNode(int i = 0) const; ///< return ith child node
+  XMLNode getChildNode(XMLCSTR name, int i)
+      const; ///< return ith child node with specific name (return an empty node
+             ///< if failing). If i==-1, this returns the last XMLNode with the
+             ///< given name.
+  XMLNode
+  getChildNode(XMLCSTR name,
+               int *i = NULL) const; ///< return next child node with specific
+                                     ///< name (return an empty node if failing)
+  XMLNode getChildNodeWithAttribute(
+      XMLCSTR tagName, XMLCSTR attributeName, XMLCSTR attributeValue = NULL,
+      int *i = NULL) const; ///< return child node with specific name/attribute
+                            ///< (return an empty node if failing)
+  XMLNode getChildNodeByPath(XMLCSTR path, char createNodeIfMissing = 0,
+                             XMLCHAR sep = '/');
+  ///< return the first child node with specific path
+  XMLNode getChildNodeByPathNonConst(XMLSTR path, char createNodeIfMissing = 0,
+                                     XMLCHAR sep = '/');
+  ///< return the first child node with specific path.
+
+  int nChildNode(XMLCSTR name)
+      const; ///< return the number of child node with specific name
+  int nChildNode() const;                     ///< nbr of child node
+  XMLAttribute getAttribute(int i = 0) const; ///< return ith attribute
+  XMLCSTR getAttributeName(int i = 0) const;  ///< return ith attribute name
+  XMLCSTR getAttributeValue(int i = 0) const; ///< return ith attribute value
+  char isAttributeSet(XMLCSTR name)
+      const; ///< test if an attribute with a specific name is given
+  XMLCSTR
+  getAttribute(XMLCSTR name,
+               int i) const; ///< return ith attribute content with specific
+                             ///< name (return a NULL if failing)
+  XMLCSTR getAttribute(XMLCSTR name, int *i = NULL)
+      const; ///< return next attribute content with specific name (return a
+             ///< NULL if failing)
+  int nAttribute() const;             ///< nbr of attribute
+  XMLClear getClear(int i = 0) const; ///< return ith clear field (comments)
+  int nClear() const;                 ///< nbr of clear field
+  XMLNodeContents enumContents(XMLElementPosition i)
+      const; ///< enumerate all the different contents (attribute,child,text,
+             ///< clear) of the current XMLNode. The order is reflecting the
+             ///< order of the original file/string. NOTE: 0 <= i < nElement();
+  int nElement() const;       ///< nbr of different contents for current node
+  char isEmpty() const;       ///< is this node Empty?
+  char isDeclaration() const; ///< is this node a declaration <? .... ?>
+  XMLNode deepCopy() const;   ///< deep copy (duplicate/clone) a XMLNode
+  static XMLNode emptyNode(); ///< return XMLNode::emptyXMLNode;
+  /** @} */
+
+  ~XMLNode();
+  XMLNode(const XMLNode &A);            ///< to allow shallow/fast copy:
+  XMLNode &operator=(const XMLNode &A); ///< to allow shallow/fast copy:
+
+  XMLNode() : d(NULL){};
+  static XMLNode emptyXMLNode;
+  static XMLClear emptyXMLClear;
+  static XMLAttribute emptyXMLAttribute;
+
+  /** @defgroup xmlModify Create or Update the XMLNode structure
+   * @ingroup XMLParserGeneral
+   *  The functions in this group allows you to create from scratch (or update)
+   * a XMLNode structure. Start by creating your top node with the
+   * "createXMLTopNode" function and then add new nodes with the "addChild"
+   * function. The parameter 'pos' gives the position where the childNode, the
+   * text or the XMLClearTag will be inserted. The default value (pos=-1)
+   * inserts at the end. The value (pos=0) insert at the beginning (Insertion at
+   * the beginning is slower than at the end). <br>
+   *
+   *  REMARK: 0 <= pos < nChild()+nText()+nClear() <br>
+   */
+
+  /** @defgroup creation Creating from scratch a XMLNode structure
+   * @ingroup xmlModify
+   * @{ */
+  static XMLNode
+  createXMLTopNode(XMLCSTR lpszName,
+                   char isDeclaration =
+                       FALSE); ///< Create the top node of an XMLNode structure
+  XMLNode addChild(XMLCSTR lpszName, char isDeclaration = FALSE,
+                   XMLElementPosition pos = -1); ///< Add a new child node
+  XMLNode addChild(XMLNode nodeToAdd,
+                   XMLElementPosition pos =
+                       -1); ///< If the "nodeToAdd" has some parents, it will be
+                            ///< detached from its parents before being
+                            ///< attached to the current XMLNode
+  XMLAttribute *addAttribute(XMLCSTR lpszName,
+                             XMLCSTR lpszValuev); ///< Add a new attribute
+  XMLCSTR addText(XMLCSTR lpszValue,
+                  XMLElementPosition pos = -1); ///< Add a new text content
+  XMLClear *addClear(XMLCSTR lpszValue, XMLCSTR lpszOpen = NULL,
+                     XMLCSTR lpszClose = NULL, XMLElementPosition pos = -1);
+  /**< Add a new clear tag
+   * @param lpszOpen default value "<![CDATA["
+   * @param lpszClose default value "]]>"
+   */
+  /** @} */
+
+  /** @defgroup xmlUpdate Updating Nodes
+   * @ingroup xmlModify
+   * Some update functions:
+   * @{
+   */
+  XMLCSTR updateName(XMLCSTR lpszName); ///< change node's name
+  XMLAttribute *updateAttribute(
+      XMLAttribute *newAttribute,
+      XMLAttribute *oldAttribute); ///< if the attribute to update is missing, a
+                                   ///< new one will be added
+  XMLAttribute *
+  updateAttribute(XMLCSTR lpszNewValue, XMLCSTR lpszNewName = NULL,
+                  int i = 0); ///< if the attribute to update is missing, a new
+                              ///< one will be added
+  XMLAttribute *updateAttribute(
+      XMLCSTR lpszNewValue, XMLCSTR lpszNewName,
+      XMLCSTR
+          lpszOldName); ///< set lpszNewName=NULL if you don't want to change
+                        ///< the name of the attribute if the attribute to
+                        ///< update is missing, a new one will be added
+  XMLCSTR updateText(
+      XMLCSTR lpszNewValue,
+      int i = 0); ///< if the text to update is missing, a new one will be added
+  XMLCSTR
+  updateText(XMLCSTR lpszNewValue,
+             XMLCSTR lpszOldValue); ///< if the text to update is missing, a new
+                                    ///< one will be added
+  XMLClear *updateClear(XMLCSTR lpszNewContent,
+                        int i = 0); ///< if the clearTag to update is missing, a
+                                    ///< new one will be added
+  XMLClear *updateClear(XMLClear *newP,
+                        XMLClear *oldP); ///< if the clearTag to update is
+                                         ///< missing, a new one will be added
+  XMLClear *
+  updateClear(XMLCSTR lpszNewValue,
+              XMLCSTR lpszOldValue); ///< if the clearTag to update is missing,
+                                     ///< a new one will be added
+  /** @} */
+
+  /** @defgroup xmlDelete Deleting Nodes or Attributes
+   * @ingroup xmlModify
+   * Some deletion functions:
+   * @{
+   */
+  /// The "deleteNodeContent" function forces the deletion of the content of
+  /// this XMLNode and the subtree.
+  void deleteNodeContent();
+  /**< \note The XMLNode instances that are referring to the part of the subtree
+   * that has been deleted CANNOT be used anymore!!. Unexpected results will
+   * occur if you continue using them. */
+  void deleteAttribute(
+      int i = 0); ///< Delete the ith attribute of the current XMLNode
+  void
+  deleteAttribute(XMLCSTR lpszName); ///< Delete the attribute with the given
+                                     ///< name (the "strcmp" function is used to
+                                     ///< find the right attribute)
+  void deleteAttribute(
+      XMLAttribute
+          *anAttribute); ///< Delete the attribute with the name
+                         ///< "anAttribute->lpszName" (the "strcmp" function is
+                         ///< used to find the right attribute)
+  void
+  deleteText(int i = 0); ///< Delete the Ith text content of the current XMLNode
+  void deleteText(
+      XMLCSTR lpszValue); ///< Delete the text content "lpszValue" inside the
+                          ///< current XMLNode (direct "pointer-to-pointer"
+                          ///< comparison is used to find the right text)
+  void deleteClear(
+      int i = 0); ///< Delete the Ith clear tag inside the current XMLNode
+  void deleteClear(
+      XMLCSTR lpszValue); ///< Delete the clear tag "lpszValue" inside the
+                          ///< current XMLNode (direct "pointer-to-pointer"
+                          ///< comparison is used to find the clear tag)
+  void deleteClear(
+      XMLClear *p); ///< Delete the clear tag "p" inside the current XMLNode
+                    ///< (direct "pointer-to-pointer" comparison on the lpszName
+                    ///< of the clear tag is used to find the clear tag)
+  /** @} */
+
+  /** @defgroup xmlWOSD ???_WOSD functions.
+   * @ingroup xmlModify
+   *  The strings given as parameters for the "add" and "update" methods that
+   * have a name with the postfix "_WOSD" (that means "WithOut String
+   * Duplication")(for example "addText_WOSD") will be free'd by the XMLNode
+   * class. For example, it means that this is incorrect: \code
+   *     xNode.addText_WOSD("foo");
+   *     xNode.updateAttribute_WOSD("#newcolor" ,NULL,"color");
+   *  \endcode
+   *  In opposition, this is correct:
+   *  \code
+   *     xNode.addText("foo");
+   *     xNode.addText_WOSD(stringDup("foo"));
+   *     xNode.updateAttribute("#newcolor" ,NULL,"color");
+   *     xNode.updateAttribute_WOSD(stringDup("#newcolor"),NULL,"color");
+   *  \endcode
+   *  Typically, you will never do:
+   *  \code
+   *     char *b=(char*)malloc(...);
+   *     xNode.addText(b);
+   *     free(b);
+   *  \endcode
+   *  ... but rather:
+   *  \code
+   *     char *b=(char*)malloc(...);
+   *     xNode.addText_WOSD(b);
+   *  \endcode
+   *  ('free(b)' is performed by the XMLNode class)
+   * @{ */
+  static XMLNode createXMLTopNode_WOSD(
+      XMLSTR lpszName,
+      char isDeclaration =
+          FALSE); ///< Create the top node of an XMLNode structure
+  XMLNode addChild_WOSD(XMLSTR lpszName, char isDeclaration = FALSE,
+                        XMLElementPosition pos = -1); ///< Add a new child node
+  XMLAttribute *addAttribute_WOSD(XMLSTR lpszName,
+                                  XMLSTR lpszValue); ///< Add a new attribute
+  XMLCSTR addText_WOSD(XMLSTR lpszValue,
+                       XMLElementPosition pos = -1); ///< Add a new text content
+  XMLClear *addClear_WOSD(XMLSTR lpszValue, XMLCSTR lpszOpen = NULL,
+                          XMLCSTR lpszClose = NULL,
+                          XMLElementPosition pos = -1); ///< Add a new clear Tag
+
+  XMLCSTR updateName_WOSD(XMLSTR lpszName); ///< change node's name
+  XMLAttribute *updateAttribute_WOSD(
+      XMLAttribute *newAttribute,
+      XMLAttribute *oldAttribute); ///< if the attribute to update is missing, a
+                                   ///< new one will be added
+  XMLAttribute *
+  updateAttribute_WOSD(XMLSTR lpszNewValue, XMLSTR lpszNewName = NULL,
+                       int i = 0); ///< if the attribute to update is missing, a
+                                   ///< new one will be added
+  XMLAttribute *updateAttribute_WOSD(
+      XMLSTR lpszNewValue, XMLSTR lpszNewName,
+      XMLCSTR
+          lpszOldName); ///< set lpszNewName=NULL if you don't want to change
+                        ///< the name of the attribute if the attribute to
+                        ///< update is missing, a new one will be added
+  XMLCSTR updateText_WOSD(
+      XMLSTR lpszNewValue,
+      int i = 0); ///< if the text to update is missing, a new one will be added
+  XMLCSTR
+  updateText_WOSD(XMLSTR lpszNewValue,
+                  XMLCSTR lpszOldValue); ///< if the text to update is missing,
+                                         ///< a new one will be added
+  XMLClear *updateClear_WOSD(XMLSTR lpszNewContent,
+                             int i = 0); ///< if the clearTag to update is
+                                         ///< missing, a new one will be added
+  XMLClear *
+  updateClear_WOSD(XMLClear *newP,
+                   XMLClear *oldP); ///< if the clearTag to update is missing, a
+                                    ///< new one will be added
+  XMLClear *
+  updateClear_WOSD(XMLSTR lpszNewValue,
+                   XMLCSTR lpszOldValue); ///< if the clearTag to update is
+                                          ///< missing, a new one will be added
+  /** @} */
+
+  /** @defgroup xmlPosition Position helper functions (use in conjunction with
+   * the update&add functions)
+   * @ingroup xmlModify
+   * These are some useful functions when you want to insert a childNode, a text
+   * or a XMLClearTag in the middle (at a specified position) of a XMLNode tree
+   * already constructed. The value returned by these methods is to be used as
+   * last parameter (parameter 'pos') of addChild, addText or addClear.
+   * @{ */
+  XMLElementPosition positionOfText(int i = 0) const;
+  XMLElementPosition positionOfText(XMLCSTR lpszValue) const;
+  XMLElementPosition positionOfClear(int i = 0) const;
+  XMLElementPosition positionOfClear(XMLCSTR lpszValue) const;
+  XMLElementPosition positionOfClear(XMLClear *a) const;
+  XMLElementPosition positionOfChildNode(int i = 0) const;
+  XMLElementPosition positionOfChildNode(XMLNode x) const;
+  XMLElementPosition positionOfChildNode(XMLCSTR name, int i = 0)
+      const; ///< return the position of the ith childNode with the specified
+             ///< name if (name==NULL) return the position of the ith childNode
+  /** @} */
+
+  /// Enumeration for XML character encoding.
+  typedef enum XMLCharEncoding {
+    char_encoding_error = 0,
+    char_encoding_UTF8 = 1,
+    char_encoding_legacy = 2,
+    char_encoding_ShiftJIS = 3,
+    char_encoding_GB2312 = 4,
+    char_encoding_Big5 = 5,
+    char_encoding_GBK = 6 // this is actually the same as Big5
+  } XMLCharEncoding;
+
+  /** \addtogroup conversions
+   * @{ */
+
+  /// Sets the global options for the conversions
+  static char setGlobalOptions(
+      XMLCharEncoding characterEncoding = XMLNode::char_encoding_UTF8,
+      char guessWideCharChars = 1, char dropWhiteSpace = 1,
+      char removeCommentsInMiddleOfText = 1);
+  /**< The "setGlobalOptions" function allows you to change four global
+   * parameters that affect string & file parsing. First of all, you
+   * most-probably will never have to change these four global parameters.
+   *
+   * @param guessWideCharChars If "guessWideCharChars"=1 and if this library is
+   * compiled in WideChar mode, then the XMLNode::parseFile and
+   * XMLNode::openFileHelper functions will test if the file contains ASCII
+   *     characters. If this is the case, then the file will be loaded and
+   * converted in memory to WideChar before being parsed. If 0, no conversion
+   * will be performed.
+   *
+   * @param guessWideCharChars If "guessWideCharChars"=1 and if this library is
+   * compiled in ASCII/UTF8/char* mode, then the XMLNode::parseFile and
+   * XMLNode::openFileHelper functions will test if the file contains WideChar
+   *     characters. If this is the case, then the file will be loaded and
+   * converted in memory to ASCII/UTF8/char* before being parsed. If 0, no
+   * conversion will be performed.
+   *
+   * @param characterEncoding This parameter is only meaningful when compiling
+   * in char* mode (multibyte character mode). In wchar_t* (wide char mode),
+   * this parameter is ignored. This parameter should be one of the
+   * XMLNode::XMLCharEncoding values, e.g. XMLNode::char_encoding_UTF8,
+   * XMLNode::char_encoding_legacy or XMLNode::char_encoding_ShiftJIS.
+   *
+   * @param dropWhiteSpace In most situations, text fields containing only white
+   * spaces (and carriage returns) are useless. Even more, these "empty" text
+   * fields are annoying because they increase the complexity of the user's code
+   * for parsing. So, 99% of the time, it's better to drop the "empty" text
+   * fields. However The XML specification indicates that no white spaces should
+   * be lost when parsing the file. So to be perfectly XML-compliant, you should
+   * set dropWhiteSpace=0. A note of caution: if you set "dropWhiteSpace=0", the
+   * parser will be slower and your code will be more complex.
+   *
+   * @param removeCommentsInMiddleOfText To explain this parameter, let's
+   * consider this code: \code XMLNode x=XMLNode::parseString("<a>foo<!-- hello
+   * -->bar<!DOCTYPE world >chu</a>","a"); \endcode If
+   * removeCommentsInMiddleOfText=0, then we will have: \code x.getText(0) ->
+   * "foo" x.getText(1) -> "bar" x.getText(2) -> "chu" x.getClear(0) --> "<!--
+   * hello -->" x.getClear(1) --> "<!DOCTYPE world >" \endcode If
+   * removeCommentsInMiddleOfText=1, then we will have: \code x.getText(0) ->
+   * "foobar" x.getText(1) -> "chu" x.getClear(0) --> "<!DOCTYPE world >"
+   * \endcode
+   *
+   * \return "0" when there are no errors. If you try to set an unrecognized
+   * encoding then the return value will be "1" to signal an error.
+   *
+   * \note Sometime, it's useful to set "guessWideCharChars=0" to disable any
+   * conversion because the test to detect the file-type (ASCII/UTF8/char* or
+   * WideChar) may fail (rarely). */
+
+  /// Guess the character encoding of the string (ascii, utf8 or shift-JIS)
+  static XMLCharEncoding guessCharEncoding(void *buffer, int bufLen,
+                                           char useXMLEncodingAttribute = 1);
+  /**< The "guessCharEncoding" function tries to guess the character encoding.
+   * You most-probably will never have to use this function. It then returns
+   * the appropriate value of the global parameter "characterEncoding" described
+   * in the XMLNode::setGlobalOptions. The guess is based on the content of a
+   * buffer of length "bufLen" bytes that contains the first bytes (minimum 25
+   * bytes; 200 bytes is a good value) of the file to be parsed. The
+   * XMLNode::openFileHelper function is using this function to automatically
+   * compute the value of the "characterEncoding" global parameter. There are
+   * several heuristics used to do the guess. One of the heuristics is based on
+   * the "encoding" attribute. The original XML specification forbids using
+   * this attribute to do the guess but you can still use it if you set
+   * "useXMLEncodingAttribute" to 1 (this is the default behavior and the
+   * behavior of most parsers).
+   * If an inconsistency in the encoding is detected, then the return value is
+   * "0". */
+  /** @} */
+
+private:
+  // these are functions and structures used internally by the XMLNode class
+  // (don't bother about them):
+
+  typedef struct XMLNodeDataTag // to allow shallow copy and "intelligent/smart"
+                                // pointers (automatic delete):
+  {
+    XMLCSTR lpszName;   // Element name (=NULL if root)
+    int nChild,         // Number of child nodes
+        nText,          // Number of text fields
+        nClear,         // Number of Clear fields (comments)
+        nAttribute;     // Number of attributes
+    char isDeclaration; // Whether node is an XML declaration - '<?xml ?>'
+    struct XMLNodeDataTag *pParent; // Pointer to parent element (=NULL if root)
+    XMLNode *pChild;                // Array of child nodes
+    XMLCSTR *pText;                 // Array of text fields
+    XMLClear *pClear;               // Array of clear fields
+    XMLAttribute *pAttribute;       // Array of attributes
+    int *pOrder;   // order of the child_nodes,text_fields,clear_fields
+    int ref_count; // for garbage collection (smart pointers)
+  } XMLNodeData;
+  XMLNodeData *d;
+
+  char parseClearTag(void *px, void *pa);
+  char maybeAddTxT(void *pa, XMLCSTR tokenPStr);
+  int ParseXMLElement(void *pXML);
+  void *addToOrder(int memInc, int *_pos, int nc, void *p, int size,
+                   XMLElementType xtype);
+  int indexText(XMLCSTR lpszValue) const;
+  int indexClear(XMLCSTR lpszValue) const;
+  XMLNode addChild_priv(int, XMLSTR, char, int);
+  XMLAttribute *addAttribute_priv(int, XMLSTR, XMLSTR);
+  XMLCSTR addText_priv(int, XMLSTR, int);
+  XMLClear *addClear_priv(int, XMLSTR, XMLCSTR, XMLCSTR, int);
+  void emptyTheNode(char force);
+  static inline XMLElementPosition findPosition(XMLNodeData *d, int index,
+                                                XMLElementType xtype);
+  static int CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker,
+                              int nFormat);
+  static int removeOrderElement(XMLNodeData *d, XMLElementType t, int index);
+  static void exactMemory(XMLNodeData *d);
+  static int detachFromParent(XMLNodeData *d);
+} XMLNode;
+
+/// This structure is given by the function XMLNode::enumContents.
+typedef struct XMLNodeContents {
+  /// This dictates what's the content of the XMLNodeContent
+  enum XMLElementType etype;
+  /**< should be a union to access the appropriate data. Compiler does not
+   * allow union of object with constructor... too bad. */
+  XMLNode child;
+  XMLAttribute attrib;
+  XMLCSTR text;
+  XMLClear clear;
+
+} XMLNodeContents;
+
+/** @defgroup StringAlloc String Allocation/Free functions
+ * @ingroup xmlModify
+ * @{ */
+/// Duplicate (copy in a new allocated buffer) the source string.
+XMLDLLENTRY XMLSTR stringDup(XMLCSTR source, int cbData = -1);
+/**< This is
+ * a very handy function when used with all the "XMLNode::*_WOSD" functions
+ * (\link xmlWOSD \endlink).
+ * @param cbData If !=0 then cbData is the number of chars to duplicate. New
+ * strings allocated with this function should be free'd using the
+ * "freeXMLString" function. */
+
+/// to free the string allocated inside the "stringDup" function or the
+/// "createXMLString" function.
+XMLDLLENTRY void freeXMLString(XMLSTR t); // {free(t);}
+/** @} */
+
+/** @defgroup atoX ato? like functions
+ * @ingroup XMLParserGeneral
+ * The "xmlto?" functions are equivalents to the atoi, atol, atof functions.
+ * The only difference is: If the variable "xmlString" is NULL, then the return
+ * value is "defautValue". These 6 functions are only here as "convenience"
+ * functions for the user (they are not used inside the XMLparser). If you don't
+ * need them, you can delete them without any trouble.
+ *
+ * @{ */
+XMLDLLENTRY char xmltob(XMLCSTR xmlString, char defautValue = 0);
+XMLDLLENTRY int xmltoi(XMLCSTR xmlString, int defautValue = 0);
+XMLDLLENTRY long xmltol(XMLCSTR xmlString, long defautValue = 0);
+XMLDLLENTRY double xmltof(XMLCSTR xmlString, double defautValue = .0);
+XMLDLLENTRY XMLCSTR xmltoa(XMLCSTR xmlString, XMLCSTR defautValue = _CXML(""));
+XMLDLLENTRY XMLCHAR xmltoc(XMLCSTR xmlString,
+                           XMLCHAR defautValue = _CXML('\0'));
+/** @} */
+
+/** @defgroup ToXMLStringTool Helper class to create XML files using "printf",
+ * "fprintf", "cout",... functions.
+ * @ingroup XMLParserGeneral
+ * @{ */
+/// Helper class to create XML files using "printf", "fprintf", "cout",...
+/// functions.
+/** The ToXMLStringTool class helps you creating XML files using "printf",
+ * "fprintf", "cout",... functions. The "ToXMLStringTool" class is processing
+ * strings so that all the characters
+ * &,",',<,> are replaced by their XML equivalent:
+ * \verbatim &amp;, &quot;, &apos;, &lt;, &gt; \endverbatim
+ * Using the "ToXMLStringTool class" and the "fprintf function" is THE most
+ * efficient way to produce VERY large XML documents VERY fast. \note If you are
+ * creating from scratch an XML file using the provided XMLNode class you must
+ * not use the "ToXMLStringTool" class (because the "XMLNode" class does the
+ * processing job for you during rendering).*/
+typedef struct XMLDLLENTRY ToXMLStringTool {
+public:
+  ToXMLStringTool() : buf(NULL), buflen(0) {}
+  ~ToXMLStringTool();
+  void freeBuffer(); ///< call this function when you have finished using this
+                     ///< object to release memory used by the internal buffer.
+
+  XMLSTR toXML(XMLCSTR source); ///< returns a pointer to an internal buffer
+                                ///< that contains a XML-encoded string based on
+                                ///< the "source" parameter.
+
+  /** The "toXMLUnSafe" function is deprecated because there is a possibility of
+   * "destination-buffer-overflow". It converts the string
+   * "source" to the string "dest". */
+  static XMLSTR
+  toXMLUnSafe(XMLSTR dest, XMLCSTR source); ///< deprecated: use "toXML" instead
+  static int
+  lengthXMLString(XMLCSTR source); ///< deprecated: use "toXML" instead
+
+private:
+  XMLSTR buf;
+  int buflen;
+} ToXMLStringTool;
+/** @} */
+
+/** @defgroup XMLParserBase64Tool Helper class to include binary data inside XML
+ * strings using "Base64 encoding".
+ * @ingroup XMLParserGeneral
+ * @{ */
+/// Helper class to include binary data inside XML strings using "Base64
+/// encoding".
+/** The "XMLParserBase64Tool" class allows you to include any binary data
+ * (images, sounds,...) into an XML document using "Base64 encoding". This class
+ * is completely separated from the rest of the xmlParser library and can be
+ * removed without any problem. To include some binary data into an XML file,
+ * you must convert the binary data into standard text (using "encode"). To
+ * retrieve the original binary data from the b64-encoded text included inside
+ * the XML file, use "decode". Alternatively, these functions can also be used
+ * to "encrypt/decrypt" some critical data contained inside
+ * the XML (it's not a strong encryption at all, but sometimes it can be
+ * useful). */
+typedef struct XMLDLLENTRY XMLParserBase64Tool {
+public:
+  XMLParserBase64Tool() : buf(NULL), buflen(0) {}
+  ~XMLParserBase64Tool();
+  void freeBuffer(); ///< Call this function when you have finished using this
+                     ///< object to release memory used by the internal buffer.
+
+  /**
+   * @param formatted If "formatted"=true, some space will be reserved for a
+   * carriage-return every 72 chars. */
+  static int encodeLength(
+      int inBufLen,
+      char formatted = 0); ///< return the length of the base64 string that
+                           ///< encodes a data buffer of size inBufLen bytes.
+
+  /**
+   * The "encode" function returns a string containing the base64 encoding
+   * of "inByteLen" bytes from "inByteBuf". If "formatted" parameter is true,
+   * then there will be a carriage-return every 72 chars. The string will be
+   * free'd when the XMLParserBase64Tool object is deleted.
+   * All returned strings are sharing the same memory space. */
+  XMLSTR encode(
+      unsigned char *inByteBuf, unsigned int inByteLen,
+      char formatted =
+          0); ///< returns a pointer to an internal buffer containing the base64
+              ///< string containing the binary data encoded from "inByteBuf"
+
+  /// returns the number of bytes which will be decoded from "inString".
+  static unsigned int decodeSize(XMLCSTR inString, XMLError *xe = NULL);
+
+  /**
+   * The "decode" function returns a pointer to a buffer containing the binary
+   * data decoded from "inString". The output buffer will be free'd when the
+   * XMLParserBase64Tool object is deleted. All output buffers are sharing the
+   * same memory space.
+   * @param inString If "inString" is malformed, NULL will be returned */
+  unsigned char *
+  decode(XMLCSTR inString, int *outByteLen = NULL,
+         XMLError *xe =
+             NULL); ///< returns a pointer to an internal buffer containing the
+                    ///< binary data decoded from "inString"
+
+  /**
+   * decodes data from "inString" to "outByteBuf". You need to provide the size
+   * (in byte) of "outByteBuf" in "inMaxByteOutBuflen". If "outByteBuf" is not
+   * large enough or if data is malformed, then "FALSE" will be returned;
+   * otherwise "TRUE". */
+  static unsigned char decode(XMLCSTR inString, unsigned char *outByteBuf,
+                              int inMaxByteOutBuflen,
+                              XMLError *xe = NULL); ///< deprecated.
+
+private:
+  void *buf;
+  int buflen;
+  void alloc(int newsize);
+} XMLParserBase64Tool;
+/** @} */
+
+#undef XMLDLLENTRY
+
+#endif
diff --git a/src/stream_manager.cc b/src/stream_manager.cc
index d43964a8f..9b0562482 100644
--- a/src/stream_manager.cc
+++ b/src/stream_manager.cc
@@ -130,16 +130,45 @@ void CUstream_st::print(FILE *fp) {
   pthread_mutex_unlock(&m_lock);
 }
 
+void stream_manager::register_prefetch(size_t m_device_addr,
+                                       size_t m_device_allocation_ptr,
+                                       size_t m_cnt,
+                                       struct CUstream_st *m_stream) {
+  m_gpu->getGmmu()->register_prefetch(m_device_addr, m_device_allocation_ptr,
+                                      m_cnt, m_stream);  // forwarded unmodified
+}  // Delegates the whole request to the object returned by m_gpu->getGmmu().
+
 bool stream_operation::do_operation(gpgpu_sim *gpu) {
   if (is_noop()) return true;
 
   assert(!m_done && m_stream);
+
+  unsigned long long cur_cycle = gpu->gpu_sim_cycle + gpu->gpu_tot_sim_cycle;
+
   if (g_debug_execution >= 3)
     printf("GPGPU-Sim API: stream %u performing ", m_stream->get_uid());
   switch (m_type) {
+    case stream_prefetch_host_to_device:
+      gpu->getGmmu()->activate_prefetch(m_device_address_dst, m_cnt, m_stream);
+      if (sim_prof_enable) {
+        event_stats *cp_pref =
+            new memory_stats(prefetch, cur_cycle, m_device_address_dst, m_cnt,
+                            m_stream->get_uid());
+        sim_prof[cur_cycle].push_back(cp_pref);
+      }
+      break;
     case stream_memcpy_host_to_device:
       if (g_debug_execution >= 3) printf("memcpy host-to-device\n");
       gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt);
+      if (sim_prof_enable) {
+        unsigned long long transfer_time =
+            gpu->getGmmu()->calculate_transfer_time(m_cnt);
+        gpu->gpu_tot_sim_cycle += transfer_time;
+        event_stats *cp_h2d =
+            new memory_stats(memcpy_h2d, cur_cycle, cur_cycle + transfer_time,
+                            m_device_address_dst, m_cnt, m_stream->get_uid());
+        sim_prof[cur_cycle].push_back(cp_h2d);
+      }
       m_stream->record_next_done();
       if (gpu->is_SST_mode()) {
         SST_callback_memcpy_H2D_done((uint64_t) m_device_address_dst, (uint64_t) m_host_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
@@ -148,12 +177,27 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
     case stream_memcpy_device_to_host:
       if (g_debug_execution >= 3) printf("memcpy device-to-host\n");
       gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt);
+      if (sim_prof_enable) {
+        unsigned long long transfer_time =
+            gpu->getGmmu()->calculate_transfer_time(m_cnt);
+        gpu->gpu_tot_sim_cycle += transfer_time;
+        event_stats *cp_d2h =
+            new memory_stats(memcpy_d2h, cur_cycle, cur_cycle + transfer_time,
+                            m_device_address_src, m_cnt, m_stream->get_uid());
+        sim_prof[cur_cycle].push_back(cp_d2h);
+      }
       m_stream->record_next_done();
       if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done((uint64_t) m_host_address_dst, (uint64_t) m_device_address_src, m_cnt, m_stream->is_stream_zero_stream() ? 0 : m_stream);
       break;
     case stream_memcpy_device_to_device:
       if (g_debug_execution >= 3) printf("memcpy device-to-device\n");
       gpu->memcpy_gpu_to_gpu(m_device_address_dst, m_device_address_src, m_cnt);
+      if (sim_prof_enable) {
+        event_stats *cp_d2d =
+            new memory_stats(memcpy_d2d, cur_cycle, m_device_address_dst, m_cnt,
+                            m_stream->get_uid());
+        sim_prof[cur_cycle].push_back(cp_d2d);
+      }
       m_stream->record_next_done();
       break;
     case stream_memcpy_to_symbol:
@@ -188,6 +232,15 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) {
           }
           gpu->set_cache_config(m_kernel->name());
           gpu->launch(m_kernel);
+
+          gpu->getGmmu()->log_kernel_info(
+              m_kernel->get_uid(), gpu->gpu_sim_cycle + gpu->gpu_tot_sim_cycle, false);
+
+          if (sim_prof_enable) {
+            kernel_stats *k_s = new kernel_stats(cur_cycle, m_stream->get_uid(),
+                                                m_kernel->get_uid());
+            sim_prof[cur_cycle].push_back(k_s);
+          }
         } else {
           if (m_kernel->m_launch_latency) m_kernel->m_launch_latency--;
           if (g_debug_execution >= 3)
@@ -323,6 +376,13 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) {
       //            printf("kernel %d finishes, retires from stream %d\n",
       //            grid_uid, stream->get_uid()); kernel_stat.flush();
       //            kernel_stat.close();
+      if (sim_prof_enable) {
+        update_sim_prof_kernel(kernel->get_uid(),
+                               m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle);
+        m_gpu->getGmmu()->log_kernel_info(
+            kernel->get_uid(), m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, true);
+      }
+
       stream->record_next_done();
       // Callback to notify a kernel is done for SST's stream
       // manager to support with nonblocking + blocking kernel launch
@@ -511,7 +571,7 @@ void stream_manager::push(stream_operation op) {
   }
   if (g_debug_execution >= 3) print_impl(stdout);
   pthread_mutex_unlock(&m_lock);
-  if (!m_gpu->is_SST_mode() && (m_cuda_launch_blocking || stream == NULL)) {
+  if (!m_gpu->is_SST_mode() && (m_cuda_launch_blocking || stream == NULL || op.is_kernel())) {
     unsigned int wait_amount = 100;
     unsigned int wait_cap = 100000;  // 100ms
     while (!empty()) {
diff --git a/src/stream_manager.h b/src/stream_manager.h
index 55cfb8d28..bd7d27641 100644
--- a/src/stream_manager.h
+++ b/src/stream_manager.h
@@ -96,6 +96,7 @@ enum stream_operation_type {
   stream_memcpy_device_to_device,
   stream_memcpy_to_symbol,
   stream_memcpy_from_symbol,
+  stream_prefetch_host_to_device,
   stream_kernel_launch,
   stream_event,
   stream_wait_event
@@ -195,6 +196,20 @@ class stream_operation {
     m_done = false;
   }
 
+  stream_operation(size_t device_address_dst, size_t cnt,
+                   struct CUstream_st *stream) {  // prefetch-op constructor
+    m_kernel = NULL;                          // no kernel is attached
+    m_type = stream_prefetch_host_to_device;  // case label in do_operation
+    m_device_address_src = 0;                 // only a destination is given
+    m_device_address_dst = device_address_dst;
+    m_host_address_src = NULL;  // host pointers are left unset by this ctor
+    m_host_address_dst = NULL;
+    m_cnt = cnt;  // presumably a byte count -- TODO confirm against callers
+    m_stream = stream;
+    m_sim_mode = false;
+    m_done = false;  // do_operation asserts this is still false on entry
+  }
+
   bool is_kernel() const { return m_type == stream_kernel_launch; }
   bool is_mem() const {
     return m_type == stream_memcpy_host_to_device ||
@@ -283,7 +298,10 @@ class stream_manager {
   void stop_all_running_kernels();
   unsigned size() { return m_streams.size(); };
   bool is_blocking() { return m_cuda_launch_blocking; };
-  CUstream_st *get_stream_zero() { return &m_stream_zero; };
+  // Passes all four arguments on to m_gpu->getGmmu()->register_prefetch().
+  void register_prefetch(size_t m_device_addr, size_t m_device_allocation_ptr,
+                         size_t m_cnt, struct CUstream_st *m_stream);
+  CUstream_st *get_stream_zero() { return &m_stream_zero; }  // member's address
   std::list<CUstream_st *>& get_concurrent_streams() { return m_streams; };
 
  private: