Skip to content

Commit 54e5bbf

Browse files
committed
feat(dashboard): add training analysis page and improve communication/inference tables
1 parent 69fee5f commit 54e5bbf

File tree

5 files changed

+622
-58
lines changed

5 files changed

+622
-58
lines changed

dashboard/app.py

Lines changed: 30 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
)
2525

2626
# Initialize session state
27-
if "data_loader" not in st.session_state:
27+
if "ni" not in st.session_state:
2828
st.session_state.data_loader = InfiniMetricsDataLoader()
2929
if "selected_accelerators" not in st.session_state:
3030
st.session_state.selected_accelerators = []
@@ -96,19 +96,22 @@ def render_dashboard(run_id_filter: str):
9696
<div style="
9797
margin-top: 0.5em;
9898
margin-bottom: 1.5em;
99-
max-width: 1100px;
100-
font-size: 1.05em;
99+
max-width: 80%;
100+
font-size: 1.3em;
101101
line-height: 1.6;
102102
">
103103
<strong>InfiniMetrics Dashboard</strong> 用于统一展示
104104
<strong>通信(NCCL / 集合通信)</strong>、
105-
<strong>推理(Direct / Service)</strong>、
105+
<strong>训练(Training / 分布式训练)</strong>、
106+
<strong>推理(Direct / Service 推理)</strong>、
106107
<strong>算子(核心算子性能)</strong>
107108
等 AI 加速卡性能测试结果。
108109
<br/>
109110
测试框架输出 <code>JSON</code>(环境 / 配置 / 标量指标) +
110111
<code>CSV</code>(曲线 / 时序数据),
111-
Dashboard 自动加载并支持多次运行的对比分析与可视化。
112+
Dashboard 自动加载并支持多次运行的
113+
<strong>性能对比</strong>、<strong>趋势分析</strong> 与
114+
<strong>可视化展示</strong>。
112115
</div>
113116
""",
114117
unsafe_allow_html=True,
@@ -150,6 +153,7 @@ def _parse_time(t):
150153
# ========== Categorize runs ==========
151154
comm_runs = [r for r in runs if r.get("testcase", "").startswith("comm")]
152155
infer_runs = [r for r in runs if r.get("testcase", "").startswith("infer")]
156+
train_runs = [r for r in runs if r.get("testcase", "").startswith("train")]
153157

154158
ops_runs, hw_runs = [], []
155159
for r in runs:
@@ -161,13 +165,14 @@ def _parse_time(t):
161165
hw_runs.append(r)
162166

163167
# ========== KPI ==========
164-
c1, c2, c3, c4, c5, c6 = st.columns(6)
168+
c1, c2, c3, c4, c5, c6, c7 = st.columns(7)
165169
c1.metric("总测试数", total)
166170
c2.metric("成功率", f"{(success/total*100):.1f}%")
167171
c3.metric("通信测试", len(comm_runs))
168172
c4.metric("推理测试", len(infer_runs))
169-
c5.metric("算子测试", len(ops_runs))
170-
c6.metric("硬件检测", len(hw_runs))
173+
c5.metric("训练测试", len(train_runs))
174+
c6.metric("算子测试", len(ops_runs))
175+
c7.metric("硬件检测", len(hw_runs))
171176

172177
st.caption(f"失败测试数:{fail}")
173178
st.caption(f"当前筛选:加速卡={','.join(selected_accs) or '全部'}")
@@ -181,8 +186,9 @@ def _latest(lst):
181186
latest_comm = _latest(comm_runs)
182187
latest_infer = _latest(infer_runs)
183188
latest_ops = _latest(ops_runs)
189+
latest_train = _latest(train_runs)
184190

185-
colA, colB, colC = st.columns(3)
191+
colA, colB, colC, colD = st.columns(4)
186192

187193
with colA:
188194
st.markdown("#### 🔗 通信(最新)")
@@ -211,6 +217,17 @@ def _latest(lst):
211217
st.write(f"- time: {latest_ops.get('time','')}")
212218
st.write(f"- status: {'✅' if latest_ops.get('success') else '❌'}")
213219

220+
with colD:
221+
st.markdown("#### 🏋️ 训练(最新)")
222+
if not latest_train:
223+
st.info("暂无训练结果")
224+
else:
225+
framework = latest_train.get("config", {}).get("framework", "unknown")
226+
model = latest_train.get("config", {}).get("model", "unknown")
227+
st.write(f"- 框架/模型: `{framework}/{model}`")
228+
st.write(f"- time: {latest_train.get('time','')}")
229+
st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}")
230+
214231
st.divider()
215232

216233
# ========== Recent runs table ==========
@@ -267,13 +284,15 @@ def _latest(lst):
267284
st.markdown("---")
268285
st.markdown("### 🚀 快速导航")
269286

270-
col1, col2, col3 = st.columns(3)
287+
col1, col2, col3, col4 = st.columns(4)
271288
if col1.button("🔗 通信测试分析", use_container_width=True):
272289
st.switch_page("pages/communication.py")
273290
if col2.button("⚡ 算子测试分析", use_container_width=True):
274291
st.switch_page("pages/operator.py")
275-
if col3.button("🤖 推理测试分析", use_container_width=True):
292+
if col3.button("🚀 推理测试分析", use_container_width=True):
276293
st.switch_page("pages/inference.py")
294+
if col4.button("🏋️ 训练测试分析", use_container_width=True):
295+
st.switch_page("pages/training.py")
277296

278297
except Exception as e:
279298
st.error(f"Dashboard 加载失败: {e}")

dashboard/pages/communication.py

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
create_summary_table_infer,
1818
)
1919

20-
init_page("推理测试分析 | InfiniMetrics", "🔗")
20+
init_page("通信测试分析 | InfiniMetrics", "🔗")
2121

2222

2323
def main():
@@ -58,7 +58,7 @@ def main():
5858
# Status filter
5959
show_success = st.checkbox("仅显示成功测试", value=True)
6060

61-
# Apply filters
61+
# Apply filter
6262
filtered_runs = [
6363
r
6464
for r in comm_runs
@@ -119,6 +119,7 @@ def main():
119119
run_info = filtered_runs[idx]
120120
result = st.session_state.data_loader.load_test_result(run_info["path"])
121121
run_info["data"] = result
122+
122123
selected_runs.append(run_info)
123124

124125
# Tabs for different views
@@ -181,26 +182,53 @@ def main():
181182
if len(selected_runs) == 1:
182183
st.markdown("#### 📌 核心指标(最新)")
183184
run = selected_runs[0]
184-
core = extract_core_metrics(run)
185185

186+
max_bw = None
187+
avg_lat = None
188+
duration = None
189+
190+
for metric in run.get("data", {}).get("metrics", []):
191+
metric_name = metric.get("name", "")
192+
193+
# bandwidth
194+
if (
195+
metric_name == "comm.bandwidth"
196+
and metric.get("data") is not None
197+
):
198+
df = metric["data"]
199+
if "bandwidth_gbs" in df.columns:
200+
max_bw = df["bandwidth_gbs"].max()
201+
202+
# latency
203+
elif (
204+
metric_name == "comm.latency" and metric.get("data") is not None
205+
):
206+
df = metric["data"]
207+
if "latency_us" in df.columns:
208+
avg_lat = df["latency_us"].mean()
209+
210+
# duration
211+
elif metric_name == "comm.duration":
212+
duration = metric.get("value")
186213
c1, c2, c3 = st.columns(3)
187214

188-
c1.metric(
189-
"峰值带宽",
190-
(
191-
f"{core['bandwidth_gbps']:.2f} GB/s"
192-
if core["bandwidth_gbps"]
193-
else "-"
194-
),
195-
)
196-
c2.metric(
197-
"平均延迟",
198-
f"{core['latency_us']:.2f} μs" if core["latency_us"] else "-",
199-
)
200-
c3.metric(
201-
"测试耗时",
202-
f"{core['duration_ms']:.2f} ms" if core["duration_ms"] else "-",
203-
)
215+
with c1:
216+
if max_bw is not None and max_bw > 0:
217+
st.metric("峰值带宽", f"{max_bw:.2f} GB/s")
218+
else:
219+
st.metric("峰值带宽", "-")
220+
221+
with c2:
222+
if avg_lat is not None and avg_lat > 0:
223+
st.metric("平均延迟", f"{avg_lat:.2f} μs")
224+
else:
225+
st.metric("平均延迟", "-")
226+
227+
with c3:
228+
if duration is not None and duration > 0:
229+
st.metric("测试耗时", f"{duration:.2f} ms")
230+
else:
231+
st.metric("测试耗时", "-")
204232
# Gauge charts for key metrics
205233
if len(selected_runs) == 1:
206234
st.markdown("#### 关键指标")
@@ -221,7 +249,7 @@ def main():
221249
max_bw = df["bandwidth_gbs"].max()
222250
fig = create_gauge_chart(
223251
max_bw,
224-
300, # Theoretical max for A100 NVLink
252+
300,
225253
"峰值带宽",
226254
"blue",
227255
"GB/s",
@@ -242,7 +270,7 @@ def main():
242270
avg_lat = df["latency_us"].mean()
243271
fig = create_gauge_chart(
244272
avg_lat,
245-
1000, # Reference: 1000 µs
273+
1000,
246274
"平均延迟",
247275
"red",
248276
"µs",
@@ -261,7 +289,7 @@ def main():
261289
if duration > 0:
262290
fig = create_gauge_chart(
263291
duration,
264-
duration * 2, # Scale to show progress
292+
duration * 2,
265293
"测试耗时",
266294
"green",
267295
"ms",

dashboard/pages/inference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
create_summary_table_infer,
1313
)
1414

15-
init_page("推理测试分析 | InfiniMetrics", "🤖")
15+
init_page("推理测试分析 | InfiniMetrics", "🚀")
1616

1717

1818
def main():
@@ -178,7 +178,7 @@ def _plot_metric(metric_name_contains: str, container):
178178

179179
_plot_metric("infer.compute_latency", c1)
180180
_plot_metric("infer.ttft", c2)
181-
_plot_metric("infer.direct_throughput", c3)
181+
_plot_metric("infer.direct_throughput_tps", c3)
182182

183183
# ---------- Tables ----------
184184
with tab2:

0 commit comments

Comments
 (0)