2424)
2525
2626# Initialize session state
27- if "data_loader " not in st .session_state :
27+ if "data_loader " not in st .session_state :
2828 st .session_state .data_loader = InfiniMetricsDataLoader ()
2929if "selected_accelerators" not in st .session_state :
3030 st .session_state .selected_accelerators = []
@@ -96,19 +96,22 @@ def render_dashboard(run_id_filter: str):
9696 <div style="
9797 margin-top: 0.5em;
9898 margin-bottom: 1.5em;
99- max-width: 1100px ;
100- font-size: 1.05em ;
99+ max-width: 80% ;
100+ font-size: 1.3em ;
101101 line-height: 1.6;
102102 ">
103103 <strong>InfiniMetrics Dashboard</strong> 用于统一展示
104104 <strong>通信(NCCL / 集合通信)</strong>、
105- <strong>推理(Direct / Service)</strong>、
105+ <strong>训练(Training / 分布式训练)</strong>、
106+ <strong>推理(Direct / Service 推理)</strong>、
106107 <strong>算子(核心算子性能)</strong>
107108 等 AI 加速卡性能测试结果。
108109 <br/>
109110 测试框架输出 <code>JSON</code>(环境 / 配置 / 标量指标) +
110111 <code>CSV</code>(曲线 / 时序数据),
111- Dashboard 自动加载并支持多次运行的对比分析与可视化。
112+ Dashboard 自动加载并支持多次运行的
113+ <strong>性能对比</strong>、<strong>趋势分析</strong> 与
114+ <strong>可视化展示</strong>。
112115 </div>
113116 """ ,
114117 unsafe_allow_html = True ,
@@ -150,6 +153,7 @@ def _parse_time(t):
150153 # ========== Categorize runs ==========
151154 comm_runs = [r for r in runs if r .get ("testcase" , "" ).startswith ("comm" )]
152155 infer_runs = [r for r in runs if r .get ("testcase" , "" ).startswith ("infer" )]
156+ train_runs = [r for r in runs if r .get ("testcase" , "" ).startswith ("train" )]
153157
154158 ops_runs , hw_runs = [], []
155159 for r in runs :
@@ -161,13 +165,14 @@ def _parse_time(t):
161165 hw_runs .append (r )
162166
163167 # ========== KPI ==========
164- c1 , c2 , c3 , c4 , c5 , c6 = st .columns (6 )
168+ c1 , c2 , c3 , c4 , c5 , c6 , c7 = st .columns (7 )
165169 c1 .metric ("总测试数" , total )
166170 c2 .metric ("成功率" , f"{ (success / total * 100 ):.1f} %" )
167171 c3 .metric ("通信测试" , len (comm_runs ))
168172 c4 .metric ("推理测试" , len (infer_runs ))
169- c5 .metric ("算子测试" , len (ops_runs ))
170- c6 .metric ("硬件检测" , len (hw_runs ))
173+ c5 .metric ("训练测试" , len (train_runs ))
174+ c6 .metric ("算子测试" , len (ops_runs ))
175+ c7 .metric ("硬件检测" , len (hw_runs ))
171176
172177 st .caption (f"失败测试数:{ fail } " )
173178 st .caption (f"当前筛选:加速卡={ ',' .join (selected_accs ) or '全部' } " )
@@ -181,8 +186,9 @@ def _latest(lst):
181186 latest_comm = _latest (comm_runs )
182187 latest_infer = _latest (infer_runs )
183188 latest_ops = _latest (ops_runs )
189+ latest_train = _latest (train_runs )
184190
185- colA , colB , colC = st .columns (3 )
191+ colA , colB , colC , colD = st .columns (4 )
186192
187193 with colA :
188194 st .markdown ("#### 🔗 通信(最新)" )
@@ -211,6 +217,17 @@ def _latest(lst):
211217 st .write (f"- time: { latest_ops .get ('time' ,'' )} " )
212218 st .write (f"- status: { '✅' if latest_ops .get ('success' ) else '❌' } " )
213219
220+ with colD :
221+ st .markdown ("#### 🏋️ 训练(最新)" )
222+ if not latest_train :
223+ st .info ("暂无训练结果" )
224+ else :
225+ framework = latest_train .get ("config" , {}).get ("framework" , "unknown" )
226+ model = latest_train .get ("config" , {}).get ("model" , "unknown" )
227+ st .write (f"- 框架/模型: `{ framework } /{ model } `" )
228+ st .write (f"- time: { latest_train .get ('time' ,'' )} " )
229+ st .write (f"- status: { '✅' if latest_train .get ('success' ) else '❌' } " )
230+
214231 st .divider ()
215232
216233 # ========== Recent runs table ==========
@@ -267,13 +284,15 @@ def _latest(lst):
267284 st .markdown ("---" )
268285 st .markdown ("### 🚀 快速导航" )
269286
270- col1 , col2 , col3 = st .columns (3 )
287+ col1 , col2 , col3 , col4 = st .columns (4 )
271288 if col1 .button ("🔗 通信测试分析" , use_container_width = True ):
272289 st .switch_page ("pages/communication.py" )
273290 if col2 .button ("⚡ 算子测试分析" , use_container_width = True ):
274291 st .switch_page ("pages/operator.py" )
275- if col3 .button ("🤖 推理测试分析" , use_container_width = True ):
292+ if col3 .button ("🚀 推理测试分析" , use_container_width = True ):
276293 st .switch_page ("pages/inference.py" )
294+ if col4 .button ("🏋️ 训练测试分析" , use_container_width = True ):
295+ st .switch_page ("pages/training.py" )
277296
278297 except Exception as e :
279298 st .error (f"Dashboard 加载失败: { e } " )
0 commit comments