FreedomIntelligence
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example‎
Lines changed: 60 additions & 0 deletions b/‎configs/runner_configs/runner_moorethreads_vllm_musa_f2f6f965.yaml.example‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json‎
Lines changed: 8 additions & 0 deletions b/‎results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/accuracy/accuracy.json‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json‎
Lines changed: 48 additions & 0 deletions b/‎results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/env_info.json‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json‎
Lines changed: 164 additions & 0 deletions b/‎results/community/mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0/offline/result.json‎
Lines changed: 164 additions & 0 deletions
@@ -93,6 +93,7 @@ Reference runners live under `runners/` (see each folder’s `meta.json`). The t
 | Huawei Ascend NPU | `ascend_vllm_ascend_d4aa9fda` | vllm-ascend | ✓ | ✓ | ✓ | ✓ | ✓ | — | — |
 | Apple Silicon | `apple_mlx_lm_9546b8b5` | mlx-lm | ⋯ | — | — | ⋯ | — | ⋯ | — |
 | Google TPU | `google_vllm_tpu_68cc9ffa` | vllm-tpu | ✓ | — | — | ✓ | — | ✓ | — |
+| Moore Threads GPU | `moorethreads_vllm_musa_f2f6f965` | vllm-musa | ✓ | ⋯ | ⋯ | ⋯ | ⋯ | ✓ | — |
 
 _Legend: ✓ validated · ⋯ author-declared (not smoke-tested in this repo yet) · — unsupported._
 <!-- platforms-matrix:end -->
 
@@ -0,0 +1,60 @@
+# AccelMark runner config — moorethreads_vllm_musa_f2f6f965 (vllm-musa on Moore Threads)
+#
+# Copy this file to runner_moorethreads_vllm_musa_f2f6f965.yaml (remove
+# .example suffix) and edit as needed for your hardware. The actual .yaml
+# is gitignored.
+#
+# These settings adapt the runner to your hardware environment. They are
+# recorded in result.json task.extra_config for transparency but are NOT
+# part of the benchmark identity (not hashed into run_id).
+#
+# Merge priority: CLI flags > suite-specific > global defaults > runner defaults
+
+# ── Global defaults (apply to all suites) ─────────────────────────────────────
+
+# Tensor parallel size — number of Moore Threads GPUs to use (default: 1).
+# For multi-card runs make sure to export VLLM_WORKER_MULTIPROC_METHOD=spawn.
+tensor_parallel_size: 1
+
+# Set to true to force eager execution, which disables CUDA-graph capture and
+# Triton compilation. Try this if you hit Triton kernel errors on the first
+# request (most common on S3000 / S80 paths).
+enforce_eager: false
+
+# Maximum number of sequences in a batch (default: 256).
+# Reduce on lower-memory cards: 128 on 24 GB cards, 64 on 16 GB cards.
+max_num_seqs: 256
+
+# Fraction of device memory the vLLM instance may use in total — model
+# weights, activations and KV cache (default: 0.85). Reduce if you hit OOM;
+# the vLLM flag is named gpu_memory_utilization but applies to MUSA device
+# memory via torchada.
+gpu_memory_utilization: 0.85
+
+# Pass-through kwargs forwarded directly to vLLM LLM() / AsyncEngineArgs().
+# Unknown keys are dropped automatically with a warning, so this is safe to
+# use across vLLM 0.10.x / 0.13.x.
+# engine_kwargs:
+#   swap_space: 8
+#   max_seq_len_to_capture: 4096
+
+# ── Suite-specific overrides ───────────────────────────────────────────────────
+
+suites:
+  suite_D:
+    # Long-context — reduce batch size and lower the memory fraction
+    # (0.85 → 0.80) to leave more headroom on the device.
+    max_num_seqs: 32
+    gpu_memory_utilization: 0.80
+
+  suite_F:
+    max_num_seqs: 128
+
+# ── Speculative decoding (suite_A / suite_D extra scenario) ─────────────────
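+# To enable, add the suite_A entry below to the existing `suites:` mapping
+# above. Do not uncomment a second top-level `suites:` key: duplicate keys
+# are not valid YAML, and loaders that tolerate them typically keep only the
+# last one, silently dropping the suite_D / suite_F overrides.
+# vllm-musa accepts the same speculative_config dict as upstream vLLM; the
+# runner translates flat keys (speculative_model, num_speculative_tokens, ...)
+# into speculative_config automatically.
+#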
+# suites:
+#   suite_A:
+#     engine_kwargs:
+#       speculative_model: "meta-llama/Llama-3.2-1B-Instruct"
+#       num_speculative_tokens: 4
+#       speculative_draft_tensor_parallel_size: 1
@@ -0,0 +1,8 @@
+{
+  "subset_score": 0.07,
+  "baseline_delta": -0.53,
+  "valid": false,
+  "framework": "vllm-musa",
+  "precision": "BF16",
+  "notes": "Integrated accuracy check \u2014 used same vllm-musa instance as benchmark."
+}
@@ -0,0 +1,48 @@
+{
+  "collected_at": "2026-05-18T09:21:31.092840+00:00",
+  "accelerators": [
+    {
+      "index": 0,
+      "name": "MTT S4000",
+      "vendor": "Moore Threads",
+      "memory_gb": 48.0,
+      "driver_version": "2.7.0",
+      "firmware_version": null,
+      "supports_bf16": true
+    }
+  ],
+  "accelerator_platform": "moorethreads",
+  "accelerator_topology": null,
+  "intra_node_interconnect": null,
+  "cpu": {
+    "model": "Intel(R) Xeon(R) Gold 6430",
+    "physical_cores": 64,
+    "logical_cores": 128,
+    "numa_nodes": 2
+  },
+  "system_memory_gb": 1007.5,
+  "pcie_generation": "PCIe 16x/16x",
+  "cpu_accelerator_bandwidth_gbs": null,
+  "network_interfaces": [
+    {
+      "name": "mlx5_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_1",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    },
+    {
+      "name": "mlx5_bond_0",
+      "type": "InfiniBand/RoCE",
+      "bandwidth_gbps": null
+    }
+  ],
+  "os": "Ubuntu Jammy Jellyfish (development branch)",
+  "python_version": "3.10.8",
+  "kernel_version": "5.15.0-105-generic",
+  "runtime_version": "Moore Threads Driver 2.7.0",
+  "pytorch_version": "2.2.0"
+}
@@ -0,0 +1,164 @@
+{
+  "schema_version": "1.0",
+  "suite_id": "suite_A",
+  "implementation_id": "moorethreads_vllm_musa_f2f6f965",
+  "chip": {
+    "name": "MTT S4000",
+    "vendor": "Moore Threads",
+    "count": 1,
+    "memory_gb": 48.0,
+    "interconnect_intra_node": null,
+    "interconnect_inter_node": null
+  },
+  "environment": {
+    "collected_at": "2026-05-18T09:21:31.092840+00:00",
+    "accelerators": [
+      {
+        "index": 0,
+        "name": "MTT S4000",
+        "vendor": "Moore Threads",
+        "memory_gb": 48.0,
+        "driver_version": "2.7.0",
+        "firmware_version": null,
+        "supports_bf16": true
+      }
+    ],
+    "accelerator_platform": "moorethreads",
+    "accelerator_topology": null,
+    "intra_node_interconnect": null,
+    "cpu": {
+      "model": "Intel(R) Xeon(R) Gold 6430",
+      "physical_cores": 64,
+      "logical_cores": 128,
+      "numa_nodes": 2
+    },
+    "system_memory_gb": 1007.5,
+    "pcie_generation": "PCIe 16x/16x",
+    "cpu_accelerator_bandwidth_gbs": null,
+    "network_interfaces": [
+      {
+        "name": "mlx5_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_1",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      },
+      {
+        "name": "mlx5_bond_0",
+        "type": "InfiniBand/RoCE",
+        "bandwidth_gbps": null
+      }
+    ],
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8",
+    "kernel_version": "5.15.0-105-generic",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "pytorch_version": "2.2.0"
+  },
+  "software": {
+    "framework": "vllm-musa",
+    "framework_version": "0.4.2",
+    "driver_version": "2.7.0",
+    "runtime_version": "Moore Threads Driver 2.7.0",
+    "os": "Ubuntu Jammy Jellyfish (development branch)",
+    "python_version": "3.10.8"
+  },
+  "model": {
+    "model_id": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "model_revision": "8afb486c1db24fe5011ec46dfbe5b5dccdb575c2",
+    "model_name": null,
+    "model_note": null,
+    "model_source": "local",
+    "architecture": "dense",
+    "parameter_count_b": 8.0,
+    "precision": "BF16",
+    "effective_dtype": "float16",
+    "quantization_method": null,
+    "model_format": "HuggingFace original"
+  },
+  "task": {
+    "scenario": "offline",
+    "num_runs": 3,
+    "warmup_runs": 1,
+    "parallelism": {
+      "tensor_parallel_size": 1,
+      "pipeline_parallel_size": 1,
+      "expert_parallel_size": 1,
+      "data_parallel_size": 1
+    },
+    "extra_config": null,
+    "runtime_metrics": null
+  },
+  "metrics": {
+    "offline": {
+      "results_by_concurrency": [
+        {
+          "client_concurrency": 8,
+          "throughput_tokens_per_sec": 332.62,
+          "throughput_tokens_per_sec_per_chip": 332.62,
+          "throughput_tokens_per_sec_total": 922.83,
+          "elapsed_seconds_median": 43.4,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 32,
+          "throughput_tokens_per_sec": 331.64,
+          "throughput_tokens_per_sec_per_chip": 331.64,
+          "throughput_tokens_per_sec_total": 920.1,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        },
+        {
+          "client_concurrency": 128,
+          "throughput_tokens_per_sec": 331.76,
+          "throughput_tokens_per_sec_per_chip": 331.76,
+          "throughput_tokens_per_sec_total": 920.46,
+          "elapsed_seconds_median": 43.6,
+          "peak_memory_gb": null,
+          "power_watts_avg": null,
+          "power_watts_peak": null,
+          "oom": false,
+          "_throughput_note": "output_only",
+          "_concurrency_note": "client_concurrency is the number of requests sent simultaneously. The inference engine batches internally; this does not directly set engine parameters like max_num_seqs."
+        }
+      ]
+    }
+  },
+  "accuracy": {
+    "subset_score": null,
+    "baseline_delta": null,
+    "valid": false,
+    "notes": "Run --scenario accuracy to check model accuracy."
+  },
+  "meta": {
+    "submitted_by": "JuhaoLiang1997",
+    "submission_type": "individual",
+    "date": "2026-05-18",
+    "time": "17:34:52",
+    "run_id": "cabb7bd0",
+    "run_name": "mtt_s4000x1_suite_A_moorethreads_vllm_musa_f2f6f965_cabb7bd0",
+    "flagged": null,
+    "reproduce_script": "runners/moorethreads_vllm_musa_f2f6f965/runner.py",
+    "env_info_file": "../env_info.json",
+    "log_file": "run.log",
+    "samples_file": "samples.jsonl",
+    "notes": null,
+    "benchmark_start_time": "2026-05-18T09:26:10.676960+00:00",
+    "benchmark_end_time": "2026-05-18T09:34:52.667112+00:00",
+    "benchmark_elapsed_minutes": 8.7,
+    "model_load_seconds": 116.8
+  }
+}