SESC配置文件解析及测试

Posted by yuchen on April 16, 2016
SESC配置文件解析及测试

SESC配置文件及测试


1.SESC配置文件解析

  1. # 注释
  2. procsPerNode = 4 # 4 核
  3. cacheLineSize = 64 # cache块大小
  4. issue = 2 # 每个周期最多发射2条指令
  5. ##################################################################
  6. # cpucore 指定机器拥有四个相同的核,编号范围从0到3(上述procsPerNode定义为4),
  7. # 每个核的配置被描述在 [issueX]段。这些核可以通过片上(on-chip)mesh网络相互通信。
  8. ##################################################################
  9. cpucore[0:$(procsPerNode)-1] = 'issueX'
  10. ##############################
  11. # 处理器的配置 #
  12. ##############################
  13. [issueX]
  14. frequency = 1e9 #1GHz
  15. ##################################################################
  16. # inorder设为false说明这是一个 out-of-order核,并且fetches, issues, and
  17. # retires up to 2 instructions per cycle (参数“issue” 在前面已经设为2).
  18. ##################################################################
  19. inorder = false
  20. fetchWidth = $(issue)
  21. issueWidth = $(issue)
  22. retireWidth = $(issue)
  23. bb4Cycle = 1 # 一个周期内可以取得基本块的数量,即一个周期内基本上可以获取多少个分支
  24. maxIRequests = 4 # Max number of outstanding instruction requests
  25. interClusterLat = 2
  26. intraClusterLat = 1
  27. cluster[0] = 'FXClusterIssueX' # 配置整数运算指令
  28. cluster[1] = 'FPClusterIssueX' # 配置浮点数运算指令
  29. robSize = 64
  30. intRegs = 64
  31. fpRegs = 64
  32. bpred = 'BPredIssueX' # 配置分支预测器
  33. enableICache = true
  34. dtlb = 'FXDTLB'
  35. itlb = 'FXITLB'
  36. dataSource = "DMemory DL1"
  37. instrSource = "IMemory IL1"
  38. OSType = 'dummy' # Memory OS, either Dummy or std
  39. # 整数功能单元
  40. [FXClusterIssueX]
  41. winSize = 12*$(issue)+32 # number of entries in window
  42. recycleAt = 'Execute'
  43. schedNumPorts = 4
  44. schedPortOccp = 1
  45. wakeUpNumPorts= 4
  46. wakeUpPortOccp= 1
  47. wakeupDelay = 2
  48. schedDelay = 1 # Minimum latency like a intraClusterLat
  49. iStoreLat = 1
  50. iStoreUnit = 'LDSTIssueX'
  51. iLoadLat = 1
  52. iLoadUnit = 'LDSTIssueX'
  53. iALULat = 1
  54. iALUUnit = 'ALUIssueX'
  55. iBJLat = 1
  56. iBJUnit = 'ALUIssueX'
  57. iDivLat = 12
  58. iDivUnit = 'ALUIssueX'
  59. iMultLat = 4
  60. iMultUnit = 'ALUIssueX'
  61. [LDSTIssueX]
  62. Num = $(issue)/3+1
  63. Occ = 1
  64. [ALUIssueX]
  65. Num = $(issue)/3+1
  66. Occ = 1
  67. # 浮点数功能单元
  68. [FPClusterIssueX]
  69. winSize = 8*$(issue)
  70. recycleAt = 'Execute'
  71. schedNumPorts = 4
  72. schedPortOccp = 1
  73. wakeUpNumPorts= 4
  74. wakeUpPortOccp= 1
  75. wakeupDelay = 2
  76. schedDelay = 1 # Minimum latency like a intraClusterLat
  77. fpALULat = 1
  78. fpALUUnit = 'FPIssueX'
  79. fpMultLat = 8
  80. fpMultUnit = 'FPIssueX'
  81. fpDivLat = 20
  82. fpDivUnit = 'FPIssueX'
  83. [FPIssueX]
  84. Num = $(issue)/2+1
  85. Occ = 1
  86. # 分支预测器配置
  87. [BPredIssueX]
  88. ##################################################################
  89. #"hybrid" 实际上是一个联合的预测器, 配有一个元预测器(有2048个entries),
  90. # 每一个entry是一个饱和计数器。
  91. ##################################################################
  92. type = "oracle" # hybrid-->taken-->oracle
  93. BTACDelay = 0 #Branch Taken ACcess Delay, 0表示执行时无障碍,非零表示一个分支常数延迟
  94. #下面的数据为不同的分支预测器所用
  95. l1size = 1
  96. l2size = 2*1024
  97. l2Bits = 1
  98. historySize = 8
  99. Metasize = 2*1024
  100. MetaBits = 2
  101. localSize = 2*1024
  102. localBits = 2
  103. btbSize = 256
  104. btbBsize = 1
  105. btbAssoc = 2
  106. btbReplPolicy = 'LRU'
  107. btbHistory = 0
  108. rasSize = 32
  109. # memory translation mechanism
  110. [FXDTLB]
  111. size = 64*8
  112. assoc = 4
  113. bsize = 8
  114. numPorts = 2
  115. replPolicy = 'LRU'
  116. deviceType = 'cache' # libcore/GMemorySystem, options: dummy, cache, icache, smpcache
  117. [FXITLB]
  118. size = 64*8
  119. assoc = 4
  120. bsize = 8
  121. numPorts = 2
  122. replPolicy = 'LRU'
  123. deviceType = 'cache'
  124. ##############################
  125. # MEMORY SUBSYSTEM #
  126. ##############################
  127. # instruction source--IL1
  128. [IMemory]
  129. deviceType = 'icache'
  130. size = 32*1024
  131. assoc = 4
  132. bsize = $(cacheLineSize)
  133. writePolicy = 'WB'
  134. replPolicy = 'LRU'
  135. protocol = 'DMESI'
  136. numPorts = 1
  137. portOccp = 1
  138. hitDelay = 1
  139. missDelay = 1 # this number is added to the hitDelay
  140. #displNotify = false
  141. MSHR = "iMSHR"
  142. lowerLevel = "Router RTR sharedBy 1" # Format: [Type] [UserDefinedName] [ShareOption]
  143. sideLowerLevel = "" # Another lower level
  144. [iMSHR]
  145. type = 'single'
  146. size = 32
  147. bsize = $(cacheLineSize)
  148. # data source -- DL1
  149. [DMemory]
  150. deviceType = 'smpcache' # SMP类型的cache
  151. size = 16*1024 # 可以存储16KBytes的数据(第4节Cache测试中由32KB改为16KB)
  152. assoc = 4 # 4路组相联
  153. # 64-byte block/line size (cacheLineSize在前面被定义为 64)
  154. bsize = $(cacheLineSize)
  155. writePolicy = 'WB' # a write-back cache (写策略)
  156. replPolicy = 'LRU' # 使用 LRU 置换策略
  157. protocol = 'DMESI'
  158. numPorts = 2 # 有两个端口,故一个周期可以处理2次访问
  159. portOccp = 1 # Number of occupancy per port. 0: UnlimitedPort, 1:FullyPipelinedPort, other value: PortPipe
  160. hitDelay = 1 #命中时间需要一个周期
  161. missDelay = 1 # 未命中检测需要1个周期
  162. # 如果出现一个miss,处理器会使用DMSHR (data miss handling registers)结构跟踪这个miss,
  163. # DMSHR被描述在[DMSHR]段,其拥有一个64-entry结构,且每个entry可以跟踪整个64字节块的一个miss。
  164. # 在一个miss上,L1 cache或者向本地核的L2 cache slice请求数据,或者通过on-chip路由器向与其相连的
  165. # 其他核上的L2 slice请求数据。
  166. MSHR = "DMSHR"
  167. lowerLevel = "Router RTR sharedBy 1"
  168. sideLowerLevel = "L2Slice L2S" # Another lower level
  169. [DMSHR]
  170. type = 'single' # Options: none, nodeps, full, single, banked Check libsuc/MSHR
  171. size = 64
  172. bsize = $(cacheLineSize)
  173. [Router]
  174. deviceType = 'router'
  175. delay = 1
  176. numPorts = 2 # read / write
  177. portOccp = 1
  178. dimX = $(NOCdim) # This needs to be the same with NOC dimension
  179. dimY = $(NOCdim) # This needs to be the same with NOC dimension
  180. lowerLevel ="NOC NOC shared"
  181. [NOC]
  182. deviceType = 'booksim'
  183. booksim_config = 'mesh22.booksim'
  184. booksim_output = 'booksim.log'
  185. booksim_sample = 1000000
  186. lowerLevel = "MemoryCtrl MemCtrl shared"
  187. [L2Slice] # L2 Cache
  188. deviceType = 'slicecache'
  189. inclusive = false
  190. size = 1*1024*1024 # 一个slice大小为 1 MB (所以在此, L2 cache 总大小是4MB),
  191. assoc = 16 # 16路组相联
  192. bsize = $(cacheLineSize) # 64字节块大小
  193. writePolicy = 'WB' # 回写策略
  194. replPolicy = 'LRU' # LRU置换策略
  195. numPorts = 2 # 2 ports,one for L1, one for snooping
  196. portOccp = 1 # 一个cache的吞吐量
  197. hitDelay = 12 # 命中时间需要12 cycles
  198. missDelay = 12 # 检测一个miss需要12 cycles
  199. numPortsDir = 1
  200. portOccpDir = 1
  201. hitDelayDir = 1
  202. MSHR = 'L2MSHR' # 使用一个 64-entry MSHR 去跟踪misses
  203. # 当出现一个miss时,该miss将被本地on-chip路由器处理,本地on-chip路由器使用片上网络 (NOC)
  204. # 传递消息给内存控制器。接着,内存控制器使用 off-chip 处理器内存总线访问主存。主存被配置在
  205. # [Memory]段,其被模型化为一个拥有200 cycle延迟的极大的cache。
  206. lowerLevel = "Router RTR sharedBy 1"
  207. [L2MSHR]
  208. size = 64
  209. type = 'single'
  210. bsize = $(cacheLineSize)
  211. [MemoryCtrl]
  212. deviceType = 'memoryController'
  213. numPorts = 8 # 8 channel
  214. portOccp = 1
  215. delay = 1
  216. lowerLevel = "MemoryBus MemoryBus"
  217. [MemoryBus]
  218. deviceType = 'bus'
  219. numPorts = 8
  220. portOccp = $(cacheLineSize) / 8 # assuming 8*8 Gbyte/s
  221. delay = 5
  222. lowerLevel = "Memory Memory"
  223. [Memory]
  224. deviceType = 'niceCache'
  225. size = 64
  226. assoc = 1
  227. bsize = 64
  228. writePolicy = 'WB'
  229. replPolicy = 'LRU'
  230. numPorts = 1
  231. portOccp = 1
  232. hitDelay = 200
  233. missDelay = 10000
  234. MSHR = NoMSHR
  235. lowerLevel = 'voidDevice'
  236. [NoMSHR]
  237. type = 'none'
  238. size = 128
  239. bsize = 64
  240. [voidDevice]
  241. deviceType = 'void'
  242. ############################
  243. # BEGIN MIPSEMUL #
  244. ############################
  245. [FileSys]
  246. mount="/bin=/mipsroot/tools/bin:/lib=/mipsroot/tools/lib:/tools=/mipsroot/tools"

2.分支预测器的类型测试(hybrid-->taken-->oracle)

  1. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -olu.out -elr.err lu.mipseb -n32 -p 1
  2. # File : sesc_lu.mipseb.b4QrwJ : Sat Apr 16 03:33:27 2016
  3. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  4. 389.963 KIPS 0.5564 MHz 0.830 secs 0.462 msec
  5. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  6. 0 55.215 hybrid 91.05% ( 99.89% of 11.34%) 89.92% ( 95.81% of 57.74%) 0.00%
  7. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  8. 0 323669 9.49% 18.01% 9.25% 55.68% 7.57% : 12.66% 302 inst/repl : ALUIssueX 2.80
  9. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  10. 0 0.70 461822 35.0 0.0 0.0 0.1 18.0 0.0 0.0 0.0 0.0 45.2 0.1 1.6
  11. ################################################################################
  12. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  13. 0 DL1 0.0 0.42% ( 0.1%, 0.3%) 92.46% 0.05GB/s : MemoryBus 0 MB/s :
  14. ################################################################################
  15. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  16. 0 IL1 0.0 0.45% ( 0.5%, 0.0%) 192.05% 0.11GB/s : MemoryBus 0 MB/s :
  17. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -olu.out -elr.err lu.mipseb -n32 -p 1
  18. # File : sesc_lu.mipseb.ZOV04r : Sat Apr 16 03:40:03 2016
  19. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  20. 363.673 KIPS 0.5692 MHz 0.890 secs 0.507 msec
  21. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  22. 0 26.849 taken 67.56% ( 99.89% of 11.34%) 63.42% ( 94.16% of 59.71%) 0.00%
  23. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  24. 0 323669 9.49% 18.01% 9.25% 55.68% 7.57% : 5.48% 306 inst/repl : ALUIssueX 2.18
  25. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  26. 0 0.64 506617 31.9 0.0 0.0 0.1 4.1 0.0 0.0 0.0 0.0 62.8 0.0 1.1
  27. ################################################################################
  28. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  29. 0 DL1 0.0 0.39% ( 0.1%, 0.3%) 98.40% 0.04GB/s : MemoryBus 0 MB/s :
  30. ################################################################################
  31. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  32. 0 IL1 0.0 0.45% ( 0.4%, 0.0%) 193.55% 0.10GB/s : MemoryBus 0 MB/s :
  33. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -olu.out -elr.err lu.mipseb -n32 -p 1
  34. # File : sesc_lu.mipseb.9o5lQb : Sat Apr 16 03:42:42 2016
  35. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  36. 404.586 KIPS 0.5474 MHz 0.800 secs 0.438 msec
  37. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  38. 0 98.266 oracle 96.50% ( 99.89% of 11.34%) 96.07% ( 94.16% of 59.71%) 0.00%
  39. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  40. 0 323669 9.49% 18.01% 9.25% 55.68% 7.57% : 12.80% 300 inst/repl : ALUIssueX 3.15
  41. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  42. 0 0.74 437914 37.0 0.0 0.0 0.1 21.2 0.0 0.0 0.0 0.0 39.6 0.1 2.1
  43. ################################################################################
  44. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  45. 0 DL1 0.0 0.42% ( 0.1%, 0.3%) 92.41% 0.05GB/s : MemoryBus 0 MB/s :
  46. ################################################################################
  47. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  48. 0 IL1 0.0 0.45% ( 0.5%, 0.0%) 192.02% 0.11GB/s : MemoryBus 0 MB/s :

3.指令发射宽度测试(issue:2-->4)

  1. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -olu.out -elr.err lu.mipseb -n32 -p 1
  2. # File : sesc_lu.mipseb.W213wq : Sat Apr 16 04:01:07 2016
  3. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  4. 449.540 KIPS 0.5527 MHz 0.720 secs 0.398 msec
  5. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  6. 0 67.584 hybrid 91.05% ( 99.89% of 11.34%) 89.92% ( 95.81% of 57.74%) 0.00%
  7. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  8. 0 323669 9.49% 18.01% 9.25% 55.68% 7.57% : 12.68% 302 inst/repl : ALUIssueX 0.25
  9. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  10. 0 0.81 397915 20.3 0.0 0.0 0.0 29.1 0.0 0.0 0.0 0.0 48.9 0.1 1.6
  11. ################################################################################
  12. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  13. 0 DL1 0.0 0.42% ( 0.1%, 0.3%) 92.59% 0.05GB/s : MemoryBus 0 MB/s :
  14. ################################################################################
  15. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  16. 0 IL1 0.0 0.85% ( 0.9%, 0.0%) 100.64% 0.12GB/s : MemoryBus 0 MB/s :
  17. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -olu.out -elr.err lu.mipseb -n32 -p 1
  18. # File : sesc_lu.mipseb.hwtxXj : Sat Apr 16 04:01:27 2016
  19. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  20. 431.559 KIPS 0.5780 MHz 0.750 secs 0.433 msec
  21. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  22. 0 30.247 taken 67.56% ( 99.89% of 11.34%) 63.42% ( 94.16% of 59.71%) 0.00%
  23. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  24. 0 323669 9.49% 18.01% 9.25% 55.68% 7.57% : 5.50% 305 inst/repl : ALUIssueX 0.22
  25. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  26. 0 0.75 433486 18.7 0.0 0.0 0.0 11.4 0.0 0.0 0.0 0.0 68.6 0.0 1.3
  27. ################################################################################
  28. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  29. 0 DL1 0.0 0.40% ( 0.1%, 0.3%) 97.37% 0.05GB/s : MemoryBus 0 MB/s :
  30. ################################################################################
  31. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  32. 0 IL1 0.0 0.83% ( 0.8%, 0.0%) 102.55% 0.11GB/s : MemoryBus 0 MB/s :
  33. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -olu.out -elr.err lu.mipseb -n32 -p 1
  34. # File : sesc_lu.mipseb.Y8YKpM : Sat Apr 16 04:01:40 2016
  35. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  36. 469.086 KIPS 0.5336 MHz 0.690 secs 0.368 msec
  37. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  38. 0 126.080 oracle 96.50% ( 99.89% of 11.34%) 96.07% ( 94.16% of 59.71%) 0.00%
  39. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  40. 0 323669 9.49% 18.01% 9.25% 55.68% 7.57% : 12.93% 301 inst/repl : ALUIssueX 0.25
  41. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  42. 0 0.88 368151 22.0 0.0 0.0 0.0 33.4 0.0 0.0 0.0 0.0 42.6 0.1 1.9
  43. ################################################################################
  44. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  45. 0 DL1 0.0 0.42% ( 0.1%, 0.3%) 92.49% 0.06GB/s : MemoryBus 0 MB/s :
  46. ################################################################################
  47. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  48. 0 IL1 0.0 0.85% ( 0.9%, 0.0%) 100.45% 0.13GB/s : MemoryBus 0 MB/s :

4. Cache测试(size:32KB-->16KB,numPorts:2-->4)

  1. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -oocean.out -eocean.err ocean.mipseb -n 258 -p 1
  2. # File : sesc_ocean.mipseb.KYbtOE : Sat Apr 16 05:31:48 2016
  3. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  4. 250.623 KIPS 0.4201 MHz 1956.200 secs 821.856 msec
  5. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  6. 0 98.401 oracle 99.94% ( 99.99% of 0.32%) 99.94% ( 99.93% of 88.33%) 0.00%
  7. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  8. 0 490268438 4.61% 22.88% 6.18% 31.74% 34.59% : 0.80% 582959 inst/repl : LDSTIssueX 0.23
  9. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  10. 0 0.60 821855801 14.9 0.0 0.0 0.3 84.5 0.0 0.0 0.0 0.0 0.0 0.2 0.0
  11. ################################################################################
  12. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  13. 0 DL1 0.2 5.76% ( 4.7%, 1.1%) 124.25% 0.79GB/s : MemoryBus 0 MB/s :
  14. ################################################################################
  15. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  16. 0 IL1 0.0 0.00% ( 0.0%, 0.0%) 92.02% 0.00GB/s : MemoryBus 0 MB/s :
  17. # Bench : sesc.opt -c /home/ud233user/sesc/confs/cmp4-noc.conf -oocean.out -eocean.err ocean.mipseb -n 258 -p 1
  18. # File : sesc_ocean.mipseb.KAxWgJ : Sat Apr 16 07:19:24 2016
  19. Exe Speed Exe MHz Exe Time Sim Time (1000MHz)
  20. 228.784 KIPS 0.4045 MHz 2142.930 secs 866.814 msec
  21. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  22. 0 98.629 oracle 99.94% ( 99.99% of 0.32%) 99.94% ( 99.93% of 88.33%) 0.00%
  23. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  24. 0 490268438 4.61% 22.88% 6.18% 31.74% 34.59% : 0.80% 576786 inst/repl : LDSTIssueX 0.23
  25. Proc IPC Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  26. 0 0.57 866813880 14.1 0.0 0.0 0.3 85.4 0.0 0.0 0.0 0.0 0.0 0.2 0.0
  27. ################################################################################
  28. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  29. 0 DL1 0.0 7.03% ( 5.8%, 1.2%) 125.16% 0.93GB/s : MemoryBus 0 MB/s :
  30. ################################################################################
  31. Proc Cache Occ MissRate (RD, WR) %DMemAcc MB/s : ...
  32. 0 IL1 0.0 0.00% ( 0.0%, 0.0%) 92.02% 0.00GB/s : MemoryBus 0 MB/s :

5.利用异构多核运行程序

SESC本来大多用来研究SMP、CMP问题,很少用来研究AMP问题。但是徐友军师兄对其配置文件及源码进行了一些更改,使得其可以用于研究AMP问题。
我们运行模拟器时需要使用配置文件(如前解析),使用功耗功能时需要依次执行
make sesc.conf 和 make power.conf 两条命令:

  • make sesc.conf命令将源码中的模板配置文件sesc.conf和shared.conf复制到当前文件夹(sesc.conf会将shared.conf包含进来);
  • make power.conf命令将生成wattchify和 cactify文件。
    • wattchify利用sesc.conf生成tmp.conf;
    • cactify 利用tmp.conf生成power.conf。
    • 最后删除tmp.conf。

正如前面所述,'issueX'段对核进行了配置,我们只需要先生成一种核的power.conf文件(文件1),然后在另外一个文件夹中将issueX更改为其他标志(如issueY),并修改里面相应段的配置,利用修改后的文件再生成另外一种核的power.conf文件(文件2),最后合并文件1和文件2,修改相同下标处(使得下标连贯)即可生成AMP的配置文件。

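  • 这里使用的配置文件如下(8核,四大核四小核;注意:下方清单中 cpucore[0:4]、cpucore[5:8] 及 nCPUs=9 实际描述的是9个核,即5个小核加4个大核,若要严格得到8核,应改为 cpucore[0:3]、cpucore[4:7] 且 nCPUs=8)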
  1. #BEGIN Configuration used. Extracted from "S.conf":
  2. procsPerNode=1
  3. thermal ='SescTherm'
  4. gNetwork ='m3tnetwork'
  5. L2ll ="AdvMem MemBus shared"
  6. depth_S =2
  7. pageSize =4096
  8. memSizing_S=1
  9. STUnits =1
  10. technology='techParam'
  11. wattchDataCacheEnergy=1.065153e+00
  12. cpucore[0:4]='issueX_S'
  13. cpucore[5:8]='issueX_L'
  14. LDUnits =1
  15. nCPUs =9
  16. NoMigration=false
  17. UseTLS =0
  18. traceMode ='qemusparc'
  19. AdvMemMap ='M3TMemMap'
  20. thermSpot ='SescSpot'
  21. issue_S =2
  22. floorplan ='layoutDescr'
  23. [LDIssueX_S]
  24. Num =1
  25. Occ =1
  26. [techParam]
  27. numberOfFanouts=1
  28. padCapacitance=1
  29. tech =70
  30. microstripLength=10
  31. clockTreeStyle='htree'
  32. skewBudget=20
  33. optimalNumberOfBuffer=3
  34. loadInClockNode=20
  35. randomLogicStyle=1
  36. loadCapacitance=1
  37. areaOfChip=200
  38. numberOfClusters=1
  39. numberOfioBufferStage=5
  40. numberOfFunctions=4
  41. numberofGates=30000
  42. numberOfFanins=4
  43. frequency =2.400000e+09
  44. [DataL1_S]
  45. portOccp =1
  46. blockName ='Dcache'
  47. missDelay =1
  48. RdHitEnergy=1.065153e+00
  49. WrHitEnergy=1.065153e+00
  50. size =32768
  51. assoc =4
  52. WrMissEnergy=2.130306e+00
  53. writePolicy='WB'
  54. hitDelay =2
  55. bsize =64
  56. lowerLevel="CommonBus Bus shared"
  57. MSHR ='DL1MSHR_S'
  58. replPolicy='RANDOM'
  59. skew =false
  60. numPorts =1
  61. RdMissEnergy=2.130306e+00
  62. deviceType='cache'
  63. [issueX_S]
  64. windowCheckEnergy=4.349804e-02
  65. instQueueSize=12
  66. maxStores =38
  67. btbEnergy =2.156033e-01
  68. intRegs =64
  69. iALUEnergy=3.417957e-01
  70. LSQBanks =1
  71. ldqCheckEnergy=1.044431e-01
  72. renameEnergy=1.732633e-01
  73. clockEnergy=3.142553e+00
  74. windowRdWrEnergy=2.224685e-01
  75. windowSelEnergy=5.741436e-03
  76. inorder =true
  77. resultBusEnergy=4.949979e-02
  78. wrRegEnergy=1.833822e-01
  79. dataSource="DataL1_S DL1_S"
  80. areaFactor=2.562500e-01
  81. bb4Cycle =1
  82. robSize =100
  83. maxBranches=16
  84. totEnergy =1.306076e+01
  85. interClusterLat=2
  86. cluster ='FXClusterIssueX_S'
  87. cluster[1:1]='FPClusterIssueX_S'
  88. bpredEnergy=8.987931e-02
  89. forwardBusEnergy=4.949979e-02
  90. dtlb ='FXDTLB_S'
  91. issueWrongPath=true
  92. archBits =32
  93. decodeDelay=3
  94. rasEnergy =0.000000e+00
  95. fpRegs =64
  96. minTLBMissDelay=16
  97. itlb ='FXITLB_S'
  98. stqRdWrEnergy=1.462314e+00
  99. fpALUEnergy=1.047439e+00
  100. instrSource="InstL1_S IL1_S"
  101. bpred ='BPredIssueX_S'
  102. fetchWidth=6
  103. maxLoads =42
  104. issueWidth=2
  105. rdRegEnergy=1.833822e-01
  106. robEnergy =6.256428e-02
  107. stqCheckEnergy=9.549810e-02
  108. retireWidth=2
  109. renameDelay=3
  110. stForwardDelay=1
  111. regFileDelay=3
  112. OSType ='std'
  113. maxIRequests=3
  114. ldqRdWrEnergy=1.429250e+00
  115. enableICache=true
  116. bpredDelay=1
  117. [L2Cache]
  118. bsize =64
  119. writePolicy='WB'
  120. blockName ='L2'
  121. hitDelay =10
  122. RdMissEnergy=7.798407e+00
  123. missDelay =4
  124. WrHitEnergy=3.899203e+00
  125. numPorts =1
  126. portOccp =1
  127. size =524288
  128. replPolicy='LRU'
  129. assoc =8
  130. WrMissEnergy=7.798407e+00
  131. RdHitEnergy=3.899203e+00
  132. MSHR ='MSHRL2_S'
  133. lowerLevel="AdvMem MemBus shared"
  134. deviceType='cache'
  135. [BestBPred]
  136. BTACDelay =0
  137. btbReplPolicy='LRU'
  138. btbAssoc =2
  139. tbits =5
  140. rasSize =0
  141. btbSize =2048
  142. type ='ogehl'
  143. tcbits =7
  144. tsize =2048
  145. mtables =6
  146. btbBsize =1
  147. [PBuff]
  148. hitDelay =3
  149. missDelay =2
  150. buffCache ='PBuffBuff_S'
  151. learnHitDelay=4
  152. streamCache='PBuffStream_S'
  153. maxStride =512
  154. deviceType='prefbuff'
  155. depth =1
  156. learnMissDelay=6
  157. lowerLevel="AdvMem MemBus shared"
  158. missWindow=16
  159. [SimParams]
  160. hotspotLogFile='scooreX.out'
  161. setBinaryLog=1
  162. sampleRate=10
  163. dtmUsed =0
  164. floorPlanFile='scooreX.flp'
  165. omitLateralR=0
  166. [TaskScalar]
  167. SyncOnRestart=3
  168. VersionSize=32
  169. bsize =64
  170. MFThreshold=4
  171. IDP ='IntPred1_S'
  172. MLThreshold=32
  173. [InstL1_S]
  174. hitDelay =2
  175. WrHitEnergy=1.135176e+00
  176. size =32768
  177. portOccp =1
  178. deviceType='icache'
  179. WrMissEnergy=2.270351e+00
  180. numPorts =2
  181. MSHR ='InstL1MSHR_S'
  182. RdHitEnergy=1.135176e+00
  183. lowerLevel="L2Cache L2 shared"
  184. assoc =2
  185. writePolicy='WB'
  186. RdMissEnergy=2.270351e+00
  187. replPolicy='LRU'
  188. bsize =64
  189. blockName ='Icache'
  190. missDelay =0
  191. [STIssueX_S]
  192. Occ =1
  193. Num =1
  194. [IntPred1_S]
  195. RdMissEnergy=1.303039e+00
  196. WrMissEnergy=1.303039e+00
  197. RdHitEnergy=6.515196e-01
  198. ReplPolicy='LRU'
  199. Assoc =32
  200. deviceType='cache'
  201. IDPnChildMax=4
  202. WrHitEnergy=6.515196e-01
  203. portOccp =1
  204. bSize =8
  205. size =256
  206. numPorts =1
  207. [BPredIssueX_S]
  208. btbAssoc =2
  209. btbBsize =1
  210. historySize=11
  211. btbReplPolicy='LRU'
  212. localSize =16384
  213. Metasize =16384
  214. BTACDelay =0
  215. localBits =2
  216. l2Bits =1
  217. btbSize =2048
  218. type ='hybrid'
  219. MetaBits =2
  220. rasSize =0
  221. bpred4Cycle=1
  222. l2size =16384
  223. l1size =1
  224. [FXDTLB_S]
  225. deviceType='tlb'
  226. numPorts =1
  227. replPolicy='LRU'
  228. RdMissEnergy=2.762933e+00
  229. WrHitEnergy=1.381467e+00
  230. RdHitEnergy=1.381467e+00
  231. bsize =8
  232. assoc =64
  233. size =512
  234. WrMissEnergy=2.762933e+00
  235. [PBuffStream_S]
  236. portOccp =3
  237. RdHitEnergy=1.052776e+00
  238. WrHitEnergy=1.052776e+00
  239. ReplPolicy='LRU'
  240. WrMissEnergy=2.105553e+00
  241. RdMissEnergy=2.105553e+00
  242. Assoc =16
  243. numPorts =2
  244. Size =128
  245. BSize =8
  246. deviceType='cache'
  247. [SescSpot]
  248. InterfaceMaterialThickness=7.500000e-05
  249. SpreaderThickness=1.000000e-03
  250. ConvectionCapacitance=1.404000e+02
  251. SpreaderLength=3.000000e-02
  252. DTMUsed =false
  253. ChipThickness=5.000000e-04
  254. HeatsinkThinkness=6.900000e-03
  255. ConvectionResistance=1.000000e-01
  256. HeatsinkLength=6.000000e-02
  257. DTMTempThreshhold=1.118000e+02
  258. [AdvMem]
  259. numPorts =1
  260. busWidth =64
  261. delay =1
  262. deviceType='bus'
  263. lowerLevel='BigMem_S'
  264. iopins =true
  265. portOccp =32
  266. [BigMem_S]
  267. WrMissEnergy=0.000000e+00
  268. hitDelay =490
  269. MSHR ='BigMemMSHR_S'
  270. numPorts =1
  271. size =1024
  272. assoc =1
  273. deviceType='niceCache'
  274. bsize =64
  275. writePolicy='WB'
  276. RdMissEnergy=0.000000e+00
  277. WrHitEnergy=0.000000e+00
  278. RdHitEnergy=0.000000e+00
  279. lowerLevel='voidDevice'
  280. replPolicy='LRU'
  281. missDelay =1600000
  282. portOccp =1
  283. [layer1]
  284. thickness =1.000000e-03
  285. material ='Silicon'
  286. [AmbientTemperature]
  287. initialTemp=60
  288. offsetConstant=2.731500e+02
  289. ambientTemp=40
  290. [Virtual]
  291. specHeat =0
  292. conductance=0
  293. density =0
  294. alpha =0
  295. [FXClusterIssueX_S]
  296. iDivUnit ='ALUIssueX_S'
  297. schedNumPorts=4
  298. schedPortOccp=1
  299. recycleAt ='Execute'
  300. iLoadUnit ='LDIssueX_S'
  301. iMultUnit ='ALUIssueX_S'
  302. winSize =56
  303. iDivLat =207
  304. iMultLat =10
  305. iStoreUnit='STIssueX_S'
  306. schedDelay=1
  307. iLoadLat =1
  308. blockName ='IntWin'
  309. windowRdWrEnergy=5.560074e-01
  310. iALULat =1
  311. wakeUpNumPorts=4
  312. iBJUnit ='ALUIssueX_S'
  313. iALUUnit ='ALUIssueX_S'
  314. iBJLat =1
  315. wakeupDelay=3
  316. wakeUpPortOccp=1
  317. iStoreLat =1
  318. [FXITLB_S]
  319. size =256
  320. RdHitEnergy=6.696175e-01
  321. RdMissEnergy=1.339235e+00
  322. assoc =32
  323. deviceType='tlb'
  324. bsize =8
  325. WrHitEnergy=6.696175e-01
  326. numPorts =1
  327. replPolicy='LRU'
  328. WrMissEnergy=1.339235e+00
  329. [CommonBus]
  330. busWidth =32
  331. buffWCReqs=1
  332. deviceType='bus'
  333. lowerLevel="L2Cache L2 shared"
  334. delay =3
  335. portOccp =1
  336. numPorts =2
  337. busLength =7500
  338. [layer5]
  339. heat_sink_width=1.000000e-01
  340. heat_sink_resistance=1.300000e-01
  341. heat_sink_fins=5
  342. thickness =1.000000e-01
  343. heat_sink_height=1.000000e-01
  344. heat_sink =true
  345. material ='Copper'
  346. [ucoolConf]
  347. current =4.000000e-01
  348. seebeck =2.400000e-04
  349. coupledDevices=12
  350. Resistivity=2.857000e-06
  351. Height =1.000000e-04
  352. crossSection=5.000000e-02
  353. Width =1.000000e-04
  354. conductivity=3.000000e+00
  355. [PBuffBuff_S]
  356. ReplPolicy='LRU'
  357. RdMissEnergy=3.573363e+00
  358. deviceType='cache'
  359. numPorts =2
  360. Size =16384
  361. RdHitEnergy=1.786681e+00
  362. Assoc =4
  363. WrMissEnergy=3.573363e+00
  364. BSize =32
  365. WrHitEnergy=1.786681e+00
  366. portOccp =3
  367. [FPClusterIssueX_S]
  368. schedPortOccp=1
  369. blockName ='FPWin'
  370. schedNumPorts=4
  371. schedDelay=1
  372. fpDivUnit ='FP0IssueX_S'
  373. wakeUpPortOccp=1
  374. recycleAt ='Execute'
  375. fpMultUnit='FP0IssueX_S'
  376. fpDivLat =60
  377. windowRdWrEnergy=4.776964e-01
  378. fpMultLat =4
  379. wakeupDelay=3
  380. winSize =24
  381. fpALUUnit ='FP0IssueX_S'
  382. fpALULat =5
  383. wakeUpNumPorts=4
  384. [SescTherm]
  385. CyclesPerSample=20000
  386. TimeIncrement=2.500000e-01
  387. ucool ='ucoolConf'
  388. MeshResolutionSpreader=1.000000e-01
  389. MeshResolutionChip=3.000000e-03
  390. initialTemp=2.500000e+01
  391. MeshResolutionSink=5.000000e-01
  392. FanVelocity=2.000000e+00
  393. material[2:2]='Virtual'
  394. material[1:1]='Copper'
  395. material ='Silicon'
  396. ambientTemp=40
  397. layer[5:5]='layer5'
  398. layer[4:4]='layer4'
  399. layer[3:3]='layer3'
  400. layer[2:2]='layer2'
  401. layer[1:1]='layer1'
  402. [HeatSink]
  403. convecR =1.000000e-01
  404. convecC =1.404000e+02
  405. heatsinkThick=6.900000e-02
  406. heatsinkSide=6.000000e-02
  407. [MSHRL2_S]
  408. type ='full'
  409. bsize =64
  410. size =32
  411. [layer3]
  412. thickness =2.000000e-03
  413. material ='Copper'
  414. [HeatSpreader]
  415. spreaderThick=1.000000e-03
  416. spreaderSide=3.000000e-02
  417. [Copper]
  418. alpha =1.170000e-04
  419. density =8933
  420. conductance=401
  421. specHeat =385
  422. [BPredTaken]
  423. btbSize =1
  424. btbBsize =1
  425. rasSize =1
  426. btbReplPolicy='LRU'
  427. btbAssoc =1
  428. type ='Static'
  429. [Silicon]
  430. alpha =8.920000e-05
  431. density =2330
  432. specHeat =712
  433. conductance=148
  434. [FileSys]
  435. mount =''
  436. [ALUIssueX_S]
  437. Occ =1
  438. Num =2
  439. [BigMemMSHR_S]
  440. bsize =64
  441. size =32
  442. type ='none'
  443. [InterfaceMaterial]
  444. interfaceThick=7.500000e-05
  445. [FP0IssueX_S]
  446. Occ =1
  447. Num =1
  448. [InstL1MSHR_S]
  449. type ='full'
  450. size =4
  451. bsize =64
  452. [miscEnergy]
  453. combWriteEnergy=3.952196e-02
  454. [DL1MSHR_S]
  455. type ='full'
  456. size =32
  457. bsize =64
  458. [ChipSpecs]
  459. tempThreshold=1.118000e+02
  460. chipThickness=5.000000e-04
  461. [voidDevice]
  462. deviceType='void'
  463. #END Configuration used. Extracted from "S.conf":
  464. #BEGIN Configuration used. Extracted from "L.conf":
  465. procsPerNode=1
  466. thermal ='SescTherm'
  467. gNetwork ='m3tnetwork'
  468. L2ll ="AdvMem MemBus shared"
  469. pageSize =4096
  470. STUnits =3
  471. memSizing_L=1
  472. wattchDataCacheEnergy=1.065153e+00
  473. LDUnits =3
  474. nCPUs =4
  475. NoMigration=false
  476. UseTLS =0
  477. traceMode ='qemusparc'
  478. AdvMemMap ='M3TMemMap'
  479. issue_L =3
  480. depth_L =3
  481. thermSpot ='SescSpot'
  482. floorplan ='layoutDescr'
  483. [FXDTLB_L]
  484. WrHitEnergy=1.246505e+00
  485. bsize =8
  486. assoc =64
  487. replPolicy='LRU'
  488. WrMissEnergy=2.493009e+00
  489. deviceType='tlb'
  490. numPorts =1
  491. RdHitEnergy=1.246505e+00
  492. RdMissEnergy=2.493009e+00
  493. size =512
  494. [layoutDescr]
  495. blockDescr="FPWin 0.0025 0.001 0.000 0.000"
  496. blockDescr[1:1]="FPReg 0.0025 0.001 0.0025 0.000"
  497. blockDescr[3:3]="IntWin 0.003 0.0015 0.0035 0.001"
  498. blockDescr[4:4]="IntReg 0.002 0.0015 0.0065 0.001"
  499. blockDescr[6:6]="FPRAT 0.0035 0.0005 0.000 0.0010"
  500. blockDescr[7:7]="IntRAT 0.0035 0.0010 0.000 0.0015"
  501. blockDescr[10:10]="ROB 0.0025 0.0015 0.0035 0.0025"
  502. blockDescr[11:11]="Icache 0.0045 0.0015 0.000 0.004"
  503. blockDescr[12:12]="Dcache 0.005 0.0015 0.005 0.004"
  504. blockDescr[13:13]="MSHR 0.0005 0.0015 0.0045 0.004"
  505. blockDescr[14:14]="L2 0.0100 0.0055 0.000 0.0055"
  506. blockDescr[2:2]="FPUnitMult 0.0015 0.001 0.0085 0.000"
  507. blockDescr[18:18]="FPUnitAlu 0.0020 0.001 0.0065 0.000"
  508. blockDescr[19:19]="FPUnitDiv 0.0015 0.001 0.005 0.000"
  509. blockDescr[5:5]="IntUnitMult 0.0015 0.0005 0.0085 0.0020"
  510. blockDescr[20:20]="IntUnitAlu 0.0015 0.0005 0.0085 0.0015"
  511. blockDescr[21:21]="IntUnitDiv 0.0015 0.0005 0.0085 0.001"
  512. blockDescr[9:9]="LDQ 0.002 0.0015 0.006 0.0025"
  513. blockDescr[17:17]="STQ 0.002 0.0015 0.008 0.0025"
  514. blockDescr[8:8]="RAS 0.0005 0.00075 0.003 0.00325"
  515. blockDescr[16:16]="Bpred 0.0030 0.00075 0.000 0.00325"
  516. blockDescr[15:15]="BTB 0.0035 0.00075 0.000 0.0025"
  517. blockMatch='Proc(0)_FPClusterIssueX'
  518. blockMatch[1:1]="Proc(0):rdFPRegEnergy Proc(0):wrFPRegEnergy"
  519. blockMatch[3:3]='Proc(0)_FXClusterIssueX*'
  520. blockMatch[4:4]="Proc(0):rdIRegEnergy Proc(0):wrIRegEnergy"
  521. blockMatch[6:6]='Proc(0):renameEnergy'
  522. blockMatch[7:7]='Proc(0):renameEnergy'
  523. blockMatch[10:10]='Proc(0):robEnergy'
  524. blockMatch[11:11]="P(0)_IL1* P(0)_ITLB*"
  525. blockMatch[12:12]='P(0)_DTLB*'
  526. blockMatch[13:13]="P(0)_DL1_MSHR P(0)_PBuff"
  527. blockMatch[14:14]="L2 niceCache"
  528. blockMatch[2:2]='Cluster(0):fpMult'
  529. blockMatch[18:18]='Cluster(0):fpALU'
  530. blockMatch[19:19]='Cluster(0):fpDiv'
  531. blockMatch[5:5]='Cluster(0):iMult'
  532. blockMatch[20:20]='Cluster(0):iALU'
  533. blockMatch[21:21]='Cluster(0):iDiv'
  534. blockMatch[9:9]="FULoad(0) FUMemory(0)"
  535. blockMatch[17:17]="FUStore(0) FUMemory(0)"
  536. blockMatch[8:8]='BPred(0)_RAS'
  537. blockMatch[16:16]='BPred(0)_hybrid'
  538. blockMatch[15:15]='BPred(0)_BTB'
  539. [layer4]
  540. heat_spreader_height=2.000000e-02
  541. material ='Copper'
  542. thickness =2.000000e-02
  543. heat_spreader_width=2.000000e-02
  544. heat_spreader=true
  545. [issueX_L]
  546. retireWidth=3
  547. renameDelay=3
  548. OSType ='std'
  549. dataSource="DataL1_L DL1_L"
  550. resultBusEnergy=1.528119e-01
  551. windowSelEnergy=1.107277e-02
  552. bpredDelay=1
  553. issueWrongPath=true
  554. renameEnergy=2.000427e-01
  555. minTLBMissDelay=16
  556. interClusterLat=2
  557. stForwardDelay=1
  558. fpRegs =80
  559. forwardBusEnergy=1.528119e-01
  560. bb4Cycle =1
  561. robEnergy =7.539995e-01
  562. wrRegEnergy=3.438024e-01
  563. maxIRequests=3
  564. areaFactor=5.687500e-01
  565. robSize =128
  566. maxLoads =48
  567. decodeDelay=3
  568. itlb ='FXITLB_L'
  569. inorder =false
  570. intRegs =96
  571. instrSource="InstL1_L IL1_L"
  572. regFileDelay=3
  573. stqCheckEnergy=1.044431e-01
  574. cluster ='FXClusterIssueX_L'
  575. cluster[1:1]='FPClusterIssueX_L'
  576. dtlb ='FXDTLB_L'
  577. windowCheckEnergy=8.454319e-02
  578. ldqCheckEnergy=1.178606e-01
  579. enableICache=true
  580. bpred ='BPredIssueX_L'
  581. rasEnergy =0.000000e+00
  582. stqRdWrEnergy=1.507999e+00
  583. ldqRdWrEnergy=1.474934e+00
  584. totEnergy =1.464537e+01
  585. instQueueSize=12
  586. rdRegEnergy=3.438024e-01
  587. fpALUEnergy=1.047439e+00
  588. maxStores =42
  589. bpredEnergy=8.987931e-02
  590. btbEnergy =2.156033e-01
  591. issueWidth=3
  592. iALUEnergy=3.417957e-01
  593. fetchWidth=6
  594. clockEnergy=3.823849e+00
  595. maxBranches=22
  596. windowRdWrEnergy=3.958168e-01
  597. archBits =32
  598. LSQBanks =1
  599. [IntPred1_L]
  600. numPorts =1
  601. WrHitEnergy=5.346558e-01
  602. WrMissEnergy=1.069312e+00
  603. Assoc =32
  604. size =256
  605. RdHitEnergy=5.346558e-01
  606. IDPnChildMax=4
  607. bSize =8
  608. ReplPolicy='LRU'
  609. portOccp =1
  610. RdMissEnergy=1.069312e+00
  611. deviceType='cache'
  612. [FXClusterIssueX_L]
  613. iMultUnit ='ALUIssueX_L'
  614. iMultLat =8
  615. wakeUpPortOccp=1
  616. recycleAt ='Execute'
  617. wakeupDelay=3
  618. schedNumPorts=4
  619. iLoadLat =1
  620. schedPortOccp=1
  621. iStoreLat =1
  622. iBJUnit ='ALUIssueX_L'
  623. iLoadUnit ='LDIssueX_L'
  624. iDivLat =23
  625. iDivUnit ='ALUIssueX_L'
  626. iALUUnit ='ALUIssueX_L'
  627. blockName ='IntWin'
  628. winSize =68
  629. iALULat =1
  630. schedDelay=1
  631. iBJLat =1
  632. iStoreUnit='STIssueX_L'
  633. windowRdWrEnergy=6.876807e-01
  634. wakeUpNumPorts=4
  635. [FPClusterIssueX_L]
  636. winSize =32
  637. recycleAt ='Execute'
  638. fpDivLat =21
  639. schedPortOccp=1
  640. wakeUpNumPorts=4
  641. fpMultUnit='FP0IssueX_L'
  642. fpALULat =3
  643. schedNumPorts=4
  644. blockName ='FPWin'
  645. fpALUUnit ='FP0IssueX_L'
  646. fpDivUnit ='FP0IssueX_L'
  647. windowRdWrEnergy=5.641608e-01
  648. fpMultLat =5
  649. schedDelay=1
  650. wakeUpPortOccp=1
  651. wakeupDelay=3
  652. [DataL1_L]
  653. bsize =64
  654. replPolicy='RANDOM'
  655. portOccp =1
  656. MSHR ='DL1MSHR_L'
  657. numPorts =1
  658. size =32768
  659. RdMissEnergy=2.130306e+00
  660. WrHitEnergy=1.065153e+00
  661. assoc =4
  662. hitDelay =2
  663. missDelay =1
  664. WrMissEnergy=2.130306e+00
  665. lowerLevel="CommonBus Bus shared"
  666. blockName ='Dcache'
  667. RdHitEnergy=1.065153e+00
  668. deviceType='cache'
  669. skew =false
  670. writePolicy='WB'
  671. [FXITLB_L]
  672. WrHitEnergy=6.515196e-01
  673. bsize =8
  674. RdMissEnergy=1.303039e+00
  675. numPorts =1
  676. deviceType='tlb'
  677. RdHitEnergy=6.515196e-01
  678. assoc =32
  679. size =256
  680. replPolicy='LRU'
  681. WrMissEnergy=1.303039e+00
  682. [BPredIssueX_L]
  683. l2Bits =1
  684. btbReplPolicy='LRU'
  685. localBits =2
  686. BTACDelay =0
  687. l1size =1
  688. localSize =16384
  689. btbSize =2048
  690. rasSize =0
  691. btbBsize =1
  692. MetaBits =2
  693. bpred4Cycle=1
  694. Metasize =16384
  695. historySize=11
  696. l2size =16384
  697. type ='hybrid'
  698. btbAssoc =2
  699. [DL1MSHR_L]
  700. size =32
  701. type ='full'
  702. bsize =64
  703. [BigMemMSHR_L]
  704. type ='none'
  705. size =32
  706. bsize =64
  707. [InstL1_L]
  708. blockName ='Icache'
  709. missDelay =0
  710. lowerLevel="L2Cache L2 shared"
  711. RdHitEnergy=1.135176e+00
  712. portOccp =1
  713. MSHR ='InstL1MSHR_L'
  714. hitDelay =2
  715. assoc =2
  716. RdMissEnergy=2.270351e+00
  717. deviceType='icache'
  718. size =32768
  719. bsize =64
  720. WrMissEnergy=2.270351e+00
  721. writePolicy='WB'
  722. WrHitEnergy=1.135176e+00
  723. replPolicy='LRU'
  724. numPorts =2
  725. [BigMem_L]
  726. RdMissEnergy=0.000000e+00
  727. assoc =1
  728. WrMissEnergy=0.000000e+00
  729. bsize =64
  730. portOccp =1
  731. RdHitEnergy=0.000000e+00
  732. hitDelay =490
  733. numPorts =1
  734. MSHR ='BigMemMSHR_L'
  735. replPolicy='LRU'
  736. lowerLevel='voidDevice'
  737. missDelay =1600000
  738. WrHitEnergy=0.000000e+00
  739. deviceType='niceCache'
  740. size =1024
  741. writePolicy='WB'
  742. [PBuffStream_L]
  743. BSize =8
  744. RdMissEnergy=2.105553e+00
  745. numPorts =2
  746. ReplPolicy='LRU'
  747. deviceType='cache'
  748. WrHitEnergy=1.052776e+00
  749. Assoc =16
  750. WrMissEnergy=2.105553e+00
  751. RdHitEnergy=1.052776e+00
  752. portOccp =3
  753. Size =128
  754. [PBuffBuff_L]
  755. Assoc =4
  756. WrMissEnergy=3.573363e+00
  757. WrHitEnergy=1.786681e+00
  758. RdMissEnergy=3.573363e+00
  759. ReplPolicy='LRU'
  760. deviceType='cache'
  761. BSize =32
  762. portOccp =3
  763. Size =16384
  764. numPorts =2
  765. RdHitEnergy=1.786681e+00
  766. [MSHRL2_L]
  767. bsize =64
  768. type ='full'
  769. size =32
  770. [STIssueX_L]
  771. Occ = 1
  772. Num = 3
  773. [LDIssueX_L]
  774. Occ =1
  775. Num =3
  776. [InstL1MSHR_L]
  777. size =4
  778. type ='full'
  779. bsize =64
  780. [layer2]
  781. material ='Virtual'
  782. thickness =0
  783. [ALUIssueX_L]
  784. Num = 8
  785. Occ = 1
  786. [FP0IssueX_L]
  787. Num =1
  788. Occ =1
  • 这里使用的基于SESC API的多进程程序如下:
  1. #include <unistd.h>
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <string.h>
  5. #include"sescapi.h"
  6. struct thread_data
  7. {
  8. int argc;
  9. char **argv;
  10. int done;
  11. };
  12. int basicmath_main(void *);
  13. int bitcount_main(void *);
  14. int cjpeg_main(void *);
  15. int djpeg_main(void *);
  16. int dijkstra_large_main(void *);
  17. int patricia_main(void *);
  18. int stringsearch_large_main(void *);
  19. int sha_main(void *);
  20. int rawdaudio_main(void *);
  21. int rawcaudio_main(void *);
  22. int CRC32_main(void *);
  23. int fft_main(void *);
  24. int fft_main(void *);
  25. #define numP 9
  26. int main(int argc, char *argv[]) {
  27. int i,j,k;
  28. struct thread_data ** thread_args = (struct thread_data**)malloc(numP * sizeof(struct thread_data*));
  29. for(j = 0; j < numP; j++){
  30. thread_args[j] = (struct thread_data *) malloc(sizeof(struct thread_data));
  31. }
  32. i = 0 ;
  33. thread_args[i]->argc = 8;
  34. thread_args[i]->argv = (char **)malloc(8*sizeof(char *));
  35. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  36. memcpy(thread_args[i]->argv[0],"cjpeg_mian",20) ;
  37. thread_args[i]->argv[1] = (char *)malloc(20 *sizeof(char));
  38. memcpy(thread_args[i]->argv[1],"-dct",20);
  39. thread_args[i]->argv[2] = (char *)malloc(20 *sizeof(char));
  40. memcpy(thread_args[i]->argv[2],"int",20);
  41. thread_args[i]->argv[3] = (char *)malloc(20 *sizeof(char));
  42. memcpy(thread_args[i]->argv[3],"-progressive",20);
  43. thread_args[i]->argv[4] = (char *)malloc(20 *sizeof(char));
  44. memcpy(thread_args[i]->argv[4],"-opt",20);
  45. thread_args[i]->argv[5] = (char *)malloc(20 *sizeof(char));
  46. memcpy(thread_args[i]->argv[5],"-outfile",20);
  47. thread_args[i]->argv[6] = (char *)malloc(30 *sizeof(char));
  48. memcpy(thread_args[i]->argv[6],"output_large_encode.jpeg",30);
  49. thread_args[i]->argv[7] = (char *)malloc(20 *sizeof(char));
  50. memcpy(thread_args[i]->argv[7],"input_large.ppm",20);
  51. i = 1 ;
  52. thread_args[i]->argc = 2;
  53. thread_args[i]->argv = (char **)malloc(2*sizeof(char *));
  54. thread_args[i]->argv[0] = (char *)malloc(30 *sizeof(char));
  55. memcpy(thread_args[i]->argv[0],"dijkstra_large_mian",30);
  56. thread_args[i]->argv[1] = (char *)malloc(20 *sizeof(char));
  57. memcpy(thread_args[i]->argv[1],"input.dat",20);
  58. i = 2 ;
  59. thread_args[i]->argc = 2;
  60. thread_args[i]->argv = (char **)malloc(2*sizeof(char *));
  61. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  62. memcpy(thread_args[i]->argv[0],"patricia_mian",20);
  63. thread_args[i]->argv[1] = (char *)malloc(20 *sizeof(char));
  64. memcpy(thread_args[i]->argv[1],"large.udp",20);
  65. i = 3 ;
  66. thread_args[i]->argc = 1;
  67. thread_args[i]->argv = (char **)malloc(sizeof(char *));
  68. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  69. memcpy(thread_args[i]->argv[0],"search_large_mian",20);
  70. i = 4 ;
  71. thread_args[i]->argc = 2;
  72. thread_args[i]->argv = (char **)malloc(2*sizeof(char *));
  73. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  74. memcpy(thread_args[i]->argv[0],"sha_mian",20);
  75. thread_args[i]->argv[1] = (char *)malloc(20 *sizeof(char));
  76. memcpy(thread_args[i]->argv[1],"input_large.asc",20);
  77. i = 5 ;
  78. thread_args[i]->argc = 1;
  79. thread_args[i]->argv = (char **)malloc(2*sizeof(char *));
  80. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  81. memcpy(thread_args[i]->argv[0],"rawcaudio_mian",20);
  82. i = 6 ;
  83. thread_args[i]->argc = 3;
  84. thread_args[i]->argv = (char **)malloc(3*sizeof(char *));
  85. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  86. memcpy(thread_args[i]->argv[0],"fft_mian",20);
  87. thread_args[i]->argv[1] = (char *)malloc(20 *sizeof(char));
  88. memcpy(thread_args[i]->argv[1],"8",20);
  89. thread_args[i]->argv[2] = (char *)malloc(20 *sizeof(char));
  90. memcpy(thread_args[i]->argv[2],"32768",20);
  91. i = 7 ;
  92. thread_args[i]->argc = 4;
  93. thread_args[i]->argv = (char **)malloc(4*sizeof(char *));
  94. thread_args[i]->argv[0] = (char *)malloc(20 *sizeof(char));
  95. memcpy(thread_args[i]->argv[0],"fft_mian",20);
  96. thread_args[i]->argv[1] = (char *)malloc(20 *sizeof(char));
  97. memcpy(thread_args[i]->argv[1],"8",20);
  98. thread_args[i]->argv[2] = (char *)malloc(20 *sizeof(char));
  99. memcpy(thread_args[i]->argv[2],"32768",20);
  100. thread_args[i]->argv[3] = (char *)malloc(20 *sizeof(char));
  101. memcpy(thread_args[i]->argv[3],"-i",20);
  102. i = 0 ;
  103. sesc_spawn((void*)cjpeg_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  104. i = 1 ;
  105. sesc_spawn((void*)dijkstra_large_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  106. i = 2 ;
  107. sesc_spawn((void*)patricia_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  108. i = 3 ;
  109. sesc_spawn((void*)stringsearch_large_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  110. i = 4 ;
  111. sesc_spawn((void*)sha_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  112. i = 5 ;
  113. sesc_spawn((void*)rawcaudio_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  114. i = 6 ;
  115. sesc_spawn((void*)fft_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  116. i = 7 ;
  117. sesc_spawn((void*)fft_main,(void *)thread_args[i],SESC_FLAG_MAP| i + 1);
  118. }
  • 上述程序运行结果如下:
  1. # Bench : ./sesc.mem -ccombina.conf -w100000 combina_1212
  2. # File : sesc_combina_1212.v6kaqg : Sun Apr 17 10:27:59 2016
  3. Exe Speed Exe MHz Exe Time Sim Time (2400MHz)
  4. 1369.351 KIPS 0.7969 MHz 549.180 secs 182.350 msec (rabbit)
  5. Proc Avg.Time BPType Total RAS BPred BTB BTAC
  6. 1 156.985 hybrid 59.80% (100.00% of 9.04%) 55.80% ( 59.87% of 58.62%) 0.00%
  7. 2 230.504 hybrid 99.18% (100.00% of 0.77%) 99.18% ( 98.60% of 36.16%) 0.00%
  8. 3 236.339 hybrid 47.88% (100.00% of 8.02%) 43.33% ( 41.61% of 32.31%) 0.00%
  9. 4 123.601 hybrid 92.16% (100.00% of 3.23%) 91.89% ( 88.71% of 42.08%) 0.00%
  10. 5 209.454 hybrid 54.61% (100.00% of 7.59%) 50.88% ( 45.81% of 32.37%) 0.00%
  11. 6 976.000 hybrid 25.00% ( 0.00% of 0.00%) 25.00% ( 0.00% of 12.50%) 0.00%
  12. 7 35.086 hybrid 86.45% (100.00% of 8.55%) 85.18% ( 78.66% of 51.11%) 0.00%
  13. 8 35.870 hybrid 87.05% (100.00% of 8.92%) 85.78% ( 79.32% of 47.09%) 0.00%
  14. nInst BJ Load Store INT FP : LD Forward , Replay : Worst Unit (clk)
  15. 1 11096 16.05% 18.56% 15.97% 49.41% 0.01% : 0.24% ???? inst/repl : LDIssueX_S 0.45
  16. 2 67271313 10.80% 20.26% 4.67% 64.28% 0.00% : 0.10% ???? inst/repl : LDIssueX_S 0.02
  17. 3 2063 20.55% 19.24% 15.08% 45.08% 0.05% : 2.27% ???? inst/repl : LDIssueX_S 0.39
  18. 4 9735226 15.35% 22.02% 11.69% 50.93% 0.00% : 0.68% ???? inst/repl : LDIssueX_S 0.14
  19. 5 2645 20.91% 20.79% 14.44% 43.82% 0.04% : 13.82% 1322 inst/repl : STIssueX_L 0.04
  20. 6 56 14.29% 5.36% 25.00% 53.57% 1.79% : 66.67% ???? inst/repl : STIssueX_L 0.04
  21. 7 408228696 15.31% 24.60% 12.35% 41.18% 6.56% : 33.62% 88 inst/repl : FP0IssueX_L 0.25
  22. 8 266768970 14.17% 24.68% 11.59% 39.45% 10.11% : 29.07% 150 inst/repl : FP0IssueX_L 0.25
  23. Proc IPC Active Cycles Busy LDQ STQ IWin ROB Regs Ports TLB maxBr MisBr Br4Clk Other
  24. 1 0.08 0.03 133191 4.2 0.0 0.0 56.1 0.0 0.0 0.0 4.5 0.0 35.1 0.0 0.1
  25. 2 0.16 93.22 407946995 8.2 0.0 0.0 91.4 0.0 0.0 0.0 0.0 0.0 0.3 0.0 0.0
  26. 3 0.03 0.01 59650 1.7 0.0 0.0 29.3 0.0 0.0 0.0 8.0 0.0 60.9 0.0 0.0
  27. 4 0.18 12.43 54394504 8.9 0.0 0.0 87.5 0.0 0.0 0.0 0.0 0.0 3.5 0.0 0.0
  28. 5 0.05 0.01 58207 1.5 0.0 0.0 0.1 0.0 0.0 0.0 8.1 0.0 90.2 0.0 0.1
  29. 6 0.01 0.00 6935 0.3 0.0 0.0 0.0 0.0 0.0 0.0 52.3 0.0 47.4 0.0 0.0
  30. 7 0.93 100.00 437640780 31.1 2.9 0.0 1.9 6.5 0.3 0.0 0.2 0.0 54.3 0.1 2.6
  31. 8 0.97 62.79 274810653 32.4 4.8 0.0 3.1 3.8 0.0 0.0 0.4 0.0 53.0 0.1 2.4
  32. ################################################################################
  33. Proc CacheName LVID revLVID Energy : ...
  34. ################################################################################
  35. Proc Fetch Issue Mem Exec Clock Total (watts)
  36. 0 0.001 0.000 0.087 0.000 1.582 1.670
  37. 1 0.001 0.000 0.001 0.000 1.571 1.573
  38. 2 0.086 0.393 0.273 0.524 1.725 3.000
  39. 3 0.001 0.000 0.001 0.000 1.571 1.573
  40. 4 0.014 0.057 0.051 0.094 1.597 1.813
  41. 5 0.001 0.000 0.001 0.000 1.912 1.914
  42. 6 0.001 0.000 0.001 0.000 1.912 1.914
  43. 7 0.616 7.048 1.799 5.174 3.823 18.458
  44. 8 0.390 4.609 1.186 3.377 3.160 12.722
  45. Total 1.109 12.106 3.399 9.169 18.854 44.636

6.下周任务

  • 对SESC的使用进一步学习;
  • 进一步了解如何在SESC上进行AMP实验;
  • 向师兄请教SESC支持AMP需要修改哪些文件,初步涉猎源码;
  • 师兄所给资料信息量巨大,需要经自己整理慢慢消化。

本文总阅读量