@@ -65,7 +65,7 @@ def main():
6565 st .markdown ("---" )
6666
6767 results_dir = st .text_input (
68- "测试结果目录" , value = ".. /output" , help = "包含 JSON/CSV 测试结果的目录"
68+ "测试结果目录" , value = "./output" , help = "包含 JSON/CSV 测试结果的目录"
6969 )
7070
7171 if not use_mongodb and results_dir != str (
@@ -122,20 +122,23 @@ def render_dashboard(run_id_filter: str):
122122 <div style="
123123 margin-top: 0.5em;
124124 margin-bottom: 1.5em;
125- max-width: 1100px ;
126- font-size: 1.05em ;
125+ max-width: 80% ;
126+ font-size: 1.3em ;
127127 line-height: 1.6;
128128 ">
129129 <strong>InfiniMetrics Dashboard</strong> 用于统一展示
130130 <strong>通信(NCCL / 集合通信)</strong>、
131+ <strong>训练(Training / 分布式训练)</strong>、
131132 <strong>推理(直接推理 / 服务性能)</strong>、
132133 <strong>算子(核心算子性能)</strong>、
133134 <strong>硬件(内存带宽 / 缓存性能)</strong>
134135 等 AI 加速卡性能测试结果。
135136 <br/>
136137 测试框架输出 <code>JSON</code>(环境 / 配置 / 标量指标) +
137138 <code>CSV</code>(曲线 / 时序数据),
138- Dashboard 自动加载并支持多次运行的对比分析与可视化。
139+ Dashboard 自动加载并支持多次运行的
140+ <strong>性能对比</strong>、<strong>趋势分析</strong> 与
141+ <strong>可视化展示</strong>。
139142 </div>
140143 """ ,
141144 unsafe_allow_html = True ,
@@ -177,6 +180,7 @@ def _parse_time(t):
177180 # ========== Categorize runs ==========
178181 comm_runs = [r for r in runs if r .get ("testcase" , "" ).startswith ("comm" )]
179182 infer_runs = [r for r in runs if r .get ("testcase" , "" ).startswith ("infer" )]
183+ train_runs = [r for r in runs if r .get ("testcase" , "" ).startswith ("train" )]
180184
181185 ops_runs , hw_runs = [], []
182186 for r in runs :
@@ -188,13 +192,14 @@ def _parse_time(t):
188192 hw_runs .append (r )
189193
190194 # ========== KPI ==========
191- c1 , c2 , c3 , c4 , c5 , c6 = st .columns (6 )
195+ c1 , c2 , c3 , c4 , c5 , c6 , c7 = st .columns (7 )
192196 c1 .metric ("总测试数" , total )
193197 c2 .metric ("成功率" , f"{ (success / total * 100 ):.1f} %" )
194198 c3 .metric ("通信测试" , len (comm_runs ))
195199 c4 .metric ("推理测试" , len (infer_runs ))
196- c5 .metric ("算子测试" , len (ops_runs ))
197- c6 .metric ("硬件检测" , len (hw_runs ))
200+ c5 .metric ("训练测试" , len (train_runs ))
201+ c6 .metric ("算子测试" , len (ops_runs ))
202+ c7 .metric ("硬件检测" , len (hw_runs ))
198203
199204 st .caption (f"失败测试数:{ fail } " )
200205 st .caption (f"当前筛选:加速卡={ ',' .join (selected_accs ) or '全部' } " )
@@ -208,8 +213,9 @@ def _latest(lst):
208213 latest_comm = _latest (comm_runs )
209214 latest_infer = _latest (infer_runs )
210215 latest_ops = _latest (ops_runs )
216+ latest_train = _latest (train_runs )
211217
212- colA , colB , colC = st .columns (3 )
218+ colA , colB , colC , colD = st .columns (4 )
213219
214220 with colA :
215221 st .markdown ("#### 🔗 通信(最新)" )
@@ -238,6 +244,17 @@ def _latest(lst):
238244 st .write (f"- time: { latest_ops .get ('time' ,'' )} " )
239245 st .write (f"- status: { '✅' if latest_ops .get ('success' ) else '❌' } " )
240246
247+ with colD :
248+ st .markdown ("#### 🏋️ 训练(最新)" )
249+ if not latest_train :
250+ st .info ("暂无训练结果" )
251+ else :
252+ framework = latest_train .get ("config" , {}).get ("framework" , "unknown" )
253+ model = latest_train .get ("config" , {}).get ("model" , "unknown" )
254+ st .write (f"- 框架/模型: `{ framework } /{ model } `" )
255+ st .write (f"- time: { latest_train .get ('time' ,'' )} " )
256+ st .write (f"- status: { '✅' if latest_train .get ('success' ) else '❌' } " )
257+
241258 st .divider ()
242259
243260 # ========== Recent runs table ==========
@@ -294,13 +311,15 @@ def _latest(lst):
294311 st .markdown ("---" )
295312 st .markdown ("### 🚀 快速导航" )
296313
297- col1 , col2 , col3 = st .columns (3 )
314+ col1 , col2 , col3 , col4 = st .columns (4 )
298315 if col1 .button ("🔗 通信测试分析" , use_container_width = True ):
299316 st .switch_page ("pages/communication.py" )
300317 if col2 .button ("⚡ 算子测试分析" , use_container_width = True ):
301318 st .switch_page ("pages/operator.py" )
302- if col3 .button ("🤖 推理测试分析" , use_container_width = True ):
319+ if col3 .button ("🚀 推理测试分析" , use_container_width = True ):
303320 st .switch_page ("pages/inference.py" )
321+ if col4 .button ("🏋️ 训练测试分析" , use_container_width = True ):
322+ st .switch_page ("pages/training.py" )
304323
305324 except Exception as e :
306325 st .error (f"Dashboard 加载失败: { e } " )
0 commit comments