Merge pull request #31 from InfiniTensor/feat/dashboard-streamlit

Chamberlain0w0 · web-flow · commit c509fd5a3841 · 2026-03-17T14:46:12.000+08:00
feat(dashboard): add training analysis page and improve communication…
diff --git a/dashboard/app.py b/dashboard/app.py
@@ -65,7 +65,7 @@ def main():
         st.markdown("---")
 
         results_dir = st.text_input(
-            "测试结果目录", value="../output", help="包含 JSON/CSV 测试结果的目录"
+            "测试结果目录", value="./output", help="包含 JSON/CSV 测试结果的目录"
         )
 
         if not use_mongodb and results_dir != str(
@@ -122,20 +122,23 @@ def render_dashboard(run_id_filter: str):
         <div style="
             margin-top: 0.5em;
             margin-bottom: 1.5em;
-            max-width: 1100px;
-            font-size: 1.05em;
+            max-width: 80%;
+            font-size: 1.3em;
             line-height: 1.6;
         ">
             <strong>InfiniMetrics Dashboard</strong> 用于统一展示
             <strong>通信（NCCL / 集合通信）</strong>、
+            <strong>训练（Training / 分布式训练）</strong>、
             <strong>推理（直接推理 / 服务性能）</strong>、
             <strong>算子（核心算子性能）</strong>、
             <strong>硬件（内存带宽 / 缓存性能）</strong>
             等 AI 加速卡性能测试结果。
             <br/>
             测试框架输出 <code>JSON</code>（环境 / 配置 / 标量指标） +
             <code>CSV</code>（曲线 / 时序数据），
-            Dashboard 自动加载并支持多次运行的对比分析与可视化。
+            Dashboard 自动加载并支持多次运行的
+            <strong>性能对比</strong>、<strong>趋势分析</strong> 与
+            <strong>可视化展示</strong>。
         </div>
         """,
         unsafe_allow_html=True,
@@ -177,6 +180,7 @@ def _parse_time(t):
         # ========== Categorize runs ==========
         comm_runs = [r for r in runs if r.get("testcase", "").startswith("comm")]
         infer_runs = [r for r in runs if r.get("testcase", "").startswith("infer")]
+        train_runs = [r for r in runs if r.get("testcase", "").startswith("train")]
 
         ops_runs, hw_runs = [], []
         for r in runs:
@@ -188,13 +192,14 @@ def _parse_time(t):
                 hw_runs.append(r)
 
         # ========== KPI ==========
-        c1, c2, c3, c4, c5, c6 = st.columns(6)
+        c1, c2, c3, c4, c5, c6, c7 = st.columns(7)
         c1.metric("总测试数", total)
         c2.metric("成功率", f"{(success/total*100):.1f}%")
         c3.metric("通信测试", len(comm_runs))
         c4.metric("推理测试", len(infer_runs))
-        c5.metric("算子测试", len(ops_runs))
-        c6.metric("硬件检测", len(hw_runs))
+        c5.metric("训练测试", len(train_runs))
+        c6.metric("算子测试", len(ops_runs))
+        c7.metric("硬件检测", len(hw_runs))
 
         st.caption(f"失败测试数：{fail}")
         st.caption(f"当前筛选：加速卡={','.join(selected_accs) or '全部'}")
@@ -208,8 +213,9 @@ def _latest(lst):
         latest_comm = _latest(comm_runs)
         latest_infer = _latest(infer_runs)
         latest_ops = _latest(ops_runs)
+        latest_train = _latest(train_runs)
 
-        colA, colB, colC = st.columns(3)
+        colA, colB, colC, colD = st.columns(4)
 
         with colA:
             st.markdown("#### 🔗 通信（最新）")
@@ -238,6 +244,17 @@ def _latest(lst):
                 st.write(f"- time: {latest_ops.get('time','')}")
                 st.write(f"- status: {'✅' if latest_ops.get('success') else '❌'}")
 
+        with colD:
+            st.markdown("#### 🏋️ 训练（最新）")
+            if not latest_train:
+                st.info("暂无训练结果")
+            else:
+                framework = latest_train.get("config", {}).get("framework", "unknown")
+                model = latest_train.get("config", {}).get("model", "unknown")
+                st.write(f"- 框架/模型: `{framework}/{model}`")
+                st.write(f"- time: {latest_train.get('time','')}")
+                st.write(f"- status: {'✅' if latest_train.get('success') else '❌'}")
+
         st.divider()
 
         # ========== Recent runs table ==========
@@ -294,13 +311,15 @@ def _latest(lst):
         st.markdown("---")
         st.markdown("### 🚀 快速导航")
 
-        col1, col2, col3 = st.columns(3)
+        col1, col2, col3, col4 = st.columns(4)
         if col1.button("🔗 通信测试分析", use_container_width=True):
             st.switch_page("pages/communication.py")
         if col2.button("⚡ 算子测试分析", use_container_width=True):
             st.switch_page("pages/operator.py")
-        if col3.button("🤖 推理测试分析", use_container_width=True):
+        if col3.button("🚀 推理测试分析", use_container_width=True):
             st.switch_page("pages/inference.py")
+        if col4.button("🏋️ 训练测试分析", use_container_width=True):
+            st.switch_page("pages/training.py")
 
     except Exception as e:
         st.error(f"Dashboard 加载失败: {e}")
diff --git a/dashboard/pages/communication.py b/dashboard/pages/communication.py
@@ -60,7 +60,7 @@ def main():
             # Status filter
             show_success = st.checkbox("仅显示成功测试", value=True)
 
-            # Apply filters
+            # Apply filter
             filtered_runs = [
                 r
                 for r in comm_runs
@@ -123,6 +123,7 @@ def main():
             identifier = run_info.get("path") or run_info.get("run_id")
             result = st.session_state.data_loader.load_test_result(identifier)
             run_info["data"] = result
+
             selected_runs.append(run_info)
 
         # Tabs for different views
@@ -183,36 +184,30 @@ def main():
                     st.plotly_chart(fig, use_container_width=True)
 
             if len(selected_runs) == 1:
-                st.markdown("#### 📌 核心指标（最新）")
+                st.markdown("#### 关键指标")
                 run = selected_runs[0]
                 core = extract_core_metrics(run)
 
-                c1, c2, c3 = st.columns(3)
-
-                c1.metric(
+                # First row: numeric indicators
+                cols = st.columns(3)
+                cols[0].metric(
                     "峰值带宽",
-                    (
-                        f"{core['bandwidth_gbps']:.2f} GB/s"
-                        if core["bandwidth_gbps"]
-                        else "-"
-                    ),
+                    f"{core['bandwidth_gbps']:.2f} GB/s"
+                    if core["bandwidth_gbps"]
+                    else "-",
                 )
-                c2.metric(
+                cols[1].metric(
                     "平均延迟",
                     f"{core['latency_us']:.2f} μs" if core["latency_us"] else "-",
                 )
-                c3.metric(
+                cols[2].metric(
                     "测试耗时",
                     f"{core['duration_ms']:.2f} ms" if core["duration_ms"] else "-",
                 )
-            # Gauge charts for key metrics
-            if len(selected_runs) == 1:
-                st.markdown("#### 关键指标")
-                run = selected_runs[0]
 
-                col1, col2, col3 = st.columns(3)
+                cols = st.columns(3)
 
-                with col1:
+                with cols[0]:
                     # Find max bandwidth
                     max_bw = 0
                     for metric in run.get("data", {}).get("metrics", []):
@@ -233,7 +228,7 @@ def main():
                                 st.plotly_chart(fig, use_container_width=True)
                                 break
 
-                with col2:
+                with cols[1]:
                     # Find average latency
                     avg_lat = 0
                     for metric in run.get("data", {}).get("metrics", []):
@@ -254,7 +249,7 @@ def main():
                                 st.plotly_chart(fig, use_container_width=True)
                                 break
 
-                with col3:
+                with cols[2]:
                     # Extract duration
                     duration = 0
                     for metric in run.get("data", {}).get("metrics", []):
diff --git a/dashboard/pages/inference.py b/dashboard/pages/inference.py
@@ -11,7 +11,7 @@
     create_summary_table_infer,
 )
 
-init_page("推理测试分析 | InfiniMetrics", "🤖")
+init_page("推理测试分析 | InfiniMetrics", "🚀")
 
 
 def main():
@@ -180,7 +180,7 @@ def _plot_metric(metric_name_contains: str, container):
 
         _plot_metric("infer.compute_latency", c1)
         _plot_metric("infer.ttft", c2)
-        _plot_metric("infer.direct_throughput", c3)
+        _plot_metric("infer.direct_throughput_tps", c3)
 
     # ---------- Tables ----------
     with tab2:
diff --git a/dashboard/pages/training.py b/dashboard/pages/training.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Training tests analysis page."""
+
+import streamlit as st
+
+from common import init_page
+from components.header import render_header
+from utils.training_utils import (
+    load_training_runs,
+    filter_runs,
+    create_run_options,
+    load_selected_runs,
+    create_training_summary,
+)
+from utils.training_plots import (
+    render_performance_curves,
+    render_throughput_comparison,
+    render_data_tables,
+    render_config_details,
+)
+
+init_page("训练测试分析 | InfiniMetrics", "🏋️")
+
+
+def main():
+    """Render the training analysis page.
+
+    Loads training runs through the session's data loader, narrows them with
+    the sidebar filters, lets the user pick runs to compare, and renders the
+    selected runs in four tabs (curves, throughput, tables, configuration).
+    """
+    render_header()
+    st.markdown("## 🏋️ 训练性能测试分析")
+
+    dl = st.session_state.data_loader
+    runs = load_training_runs(dl)
+
+    if not runs:
+        # Markdown needs two trailing spaces before the newline for a hard line
+        # break; a bare "\n" is collapsed and both sentences share one line.
+        st.info("未找到训练测试结果  \n请将训练测试结果放在 output/train/ 或 output/training/ 目录下")
+        return
+
+    # Sidebar Filters
+    with st.sidebar:
+        st.markdown("### 🔍 筛选条件")
+
+        # Filter choices are the distinct values present in the loaded runs.
+        frameworks = sorted(
+            {r.get("config", {}).get("framework", "unknown") for r in runs}
+        )
+        models = sorted({r.get("config", {}).get("model", "unknown") for r in runs})
+        device_counts = sorted({r.get("device_used", 1) for r in runs})
+
+        selected_fw = st.multiselect("框架", frameworks, default=frameworks)
+        selected_models = st.multiselect("模型", models, default=models)
+        selected_dev = st.multiselect("设备数", device_counts, default=device_counts)
+        only_success = st.checkbox("仅显示成功测试", value=True)
+
+        st.markdown("---")
+        st.markdown("### 📈 图表选项")
+        y_log = st.checkbox("Y轴对数刻度", value=False)
+        smoothing = st.slider("平滑窗口", 1, 50, 5, help="对曲线进行移动平均平滑")
+
+    # Apply filters
+    filtered = filter_runs(
+        runs, selected_fw, selected_models, selected_dev, only_success
+    )
+    st.caption(f"找到 {len(filtered)} 个训练测试")
+
+    if not filtered:
+        st.warning("没有符合条件的测试结果")
+        return
+
+    # Run Selection
+    options = create_run_options(filtered)
+    run_labels = list(options.keys())
+    selected = st.multiselect(
+        "选择要分析的测试运行（可多选对比）",
+        run_labels,
+        # Slicing clamps to the list length, so no explicit min() is needed.
+        default=run_labels[:3],
+    )
+
+    if not selected:
+        # Explain why no charts are shown rather than stopping silently.
+        st.info("请至少选择一个测试运行")
+        return
+
+    # Load selected runs
+    selected_runs = load_selected_runs(dl, filtered, options, selected)
+
+    # Tabs
+    tab1, tab2, tab3, tab4 = st.tabs(["📈 性能曲线", "📊 吞吐量对比", "📋 数据表格", "🔍 详细配置"])
+
+    with tab1:
+        render_performance_curves(selected_runs, smoothing, y_log)
+    with tab2:
+        render_throughput_comparison(selected_runs)
+    with tab3:
+        render_data_tables(selected_runs)
+    with tab4:
+        render_config_details(selected_runs, create_training_summary)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/dashboard/utils/data_loader.py b/dashboard/utils/data_loader.py
@@ -28,7 +28,7 @@ class InfiniMetricsDataLoader:
 
     def __init__(
         self,
-        results_dir: str = "../output",
+        results_dir: str = "./output",
         use_mongodb: bool = False,
         mongo_config=None,
         fallback_to_files: bool = True,
diff --git a/dashboard/utils/data_sources.py b/dashboard/utils/data_sources.py
@@ -51,7 +51,7 @@ def source_type(self) -> str:
 class FileDataSource(DataSource):
     """File-based data source (reads from JSON/CSV files)."""
 
-    def __init__(self, results_dir: str = "../output"):
+    def __init__(self, results_dir: str = "./output"):
         self.results_dir = Path(results_dir)
 
     @property
diff --git a/dashboard/utils/training_plots.py b/dashboard/utils/training_plots.py
diff --git a/dashboard/utils/training_utils.py b/dashboard/utils/training_utils.py
diff --git a/dashboard/utils/visualizations.py b/dashboard/utils/visualizations.py