adding more workflows

cquil11 · cquil11 · commit 13803ac43474 · 2025-10-30T09:19:33.000-05:00
diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml
@@ -4,6 +4,11 @@ concurrency:
   group: benchmark-lock-1k8k
   cancel-in-progress: false
 
+# NOTE(review): the unchanged lines right after this added block already
+# declare a top-level `on:` key, so this addition leaves the workflow with two
+# `on:` mappings. Duplicate mapping keys are invalid YAML; these triggers most
+# likely need to be merged into the existing `on:` block - confirm against the
+# full file before applying.
+on:
+  workflow_dispatch:
+  schedule:
+    # Once a day at 23:00 (GitHub Actions evaluates schedule crons in UTC).
+    - cron: '0 23 * * *'
+
 on:
     # pull_request:
     workflow_dispatch:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,147 @@
+name: 'Test - Full Sweep'
+
+# All runs share the `benchmark-lock` group; a run that is already in progress
+# is left to finish rather than being cancelled by a newer one.
+concurrency:
+    cancel-in-progress: false
+    group: benchmark-lock
+
+# Triggers: every pull request, plus manual dispatch with the inputs below.
+on:
+    pull_request:
+    workflow_dispatch:
+        inputs:
+            # Top-level key looked up in the master config files.
+            name:
+                description: 'Name of benchmark from master configs'
+                type: string
+                required: true
+                default: '70b-fp4-mi355x-vllm'
+
+            # One switch per ISL/OSL combination to include in the sweep.
+            run_1k1k:
+                description: 'Run ISL/OSL 1k/1k'
+                type: boolean
+                required: true
+            run_1k8k:
+                description: 'Run ISL/OSL 1k/8k'
+                type: boolean
+                required: true
+            run_8k1k:
+                description: 'Run ISL/OSL 8k/1k'
+                type: boolean
+                required: true
+
+            # Optional pin to a single runner node.
+            runner:
+                description: 'Specific runner node to run on'
+                type: choice
+                required: false
+                options:
+                    - h100-cr_0
+                    - h100-cr_1
+                    - h100-cw_0
+                    - h100-cw_1
+                    - h200-cw_0
+                    - h200-cw_1
+                    - h200-nb_0
+                    - h200-nb_1
+                    - h200-nb_2
+                    - h200-nb_3
+                    - h200-nv_0
+                    - h200-nv_1
+                    - h200-nv_2
+                    - h200-nv_3
+                    - b200-nv_0
+                    - b200-nv_1
+                    - b200-nb_0
+                    - b200-nb_1
+                    - b200-nvd_0
+                    - b200-nvd_1
+                    - b200-nvd_2
+                    - b200-nvd_3
+                    - b200-tg_0
+                    - mi300x-amd_0
+                    - mi300x-amd_1
+                    - mi300x-amd_2
+                    - mi300x-amd_3
+                    - mi300x-amd_4
+                    - mi300x-cr_0
+                    - mi300x-oci_0
+                    - mi325x-amd_0
+                    - mi325x-tw_0
+                    - mi325x-tw_1
+                    - mi325x-tw_2
+                    - mi325x-tw_3
+                    - mi355x-amd_0
+                    - mi355x-amd_1
+                    - mi355x-amd_2
+                    - mi355x-amd_3
+
+jobs:
+    # Expands one master-config key into the JSON matrix consumed by the
+    # `test-sweep` job through the `search-space-config` output.
+    get-jobs:
+        runs-on: ubuntu-latest
+        outputs:
+            search-space-config: ${{ steps.get-jobs.outputs.search-space-config }}
+        steps:
+            - name: Checkout code
+              uses: actions/checkout@v4
+
+            - id: get-jobs
+              env:
+                  # Passed through the environment instead of being expanded
+                  # inside the script, so the free-text input cannot inject
+                  # shell syntax. `inputs.name` is empty on pull_request runs,
+                  # which previously left `--key` without an argument; fall
+                  # back to the workflow_dispatch default (keep both in sync).
+                  CONFIG_KEY: ${{ inputs.name || '70b-fp4-mi355x-vllm' }}
+              run: |
+                  CONFIG_JSON=$(python3 "${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py" \
+                    --config-files "${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml" "${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml" \
+                    --key "$CONFIG_KEY" \
+                    ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }})
+                  echo "search-space-config=$CONFIG_JSON" >> "$GITHUB_OUTPUT"
+
+    # Fans out one call to the reusable benchmark workflow per entry of the
+    # JSON matrix produced by `get-jobs`.
+    test-sweep:
+        needs: get-jobs
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: test sweep - ${{ inputs.name }}
+        strategy:
+            # Let the remaining matrix entries keep running when one fails.
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }}
+        secrets: inherit
+        with:
+            # NOTE(review): fixed value regardless of the selected benchmark
+            # name or sequence lengths - confirm this is intended.
+            exp-name: "dsr1_1k1k"
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            # An explicitly chosen runner input wins; otherwise use the runner
+            # from the matrix entry.
+            runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            framework: ${{ matrix.config.framework }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            # `ep` and `dp-attn` are optional in a matrix entry; default them
+            # to 1 and false when absent.
+            ep: ${{ matrix.config.ep || 1 }}
+            dp-attn: ${{ matrix.config.dp-attn || false }}
+            conc: ${{ matrix.config.conc }}
+
+    # Collects the `results_*` artifacts and publishes a `run-stats` artifact.
+    calc-success-rate:
+        needs: test-sweep
+        # always(): still run when sweep jobs failed or were cancelled.
+        if: ${{ always() }}
+        runs-on: ubuntu-latest
+
+        env:
+            RESULTS_DIR: "results/"
+            STATS_FILENAME: "run_stats"
+            # NOTE(review): repository secrets are normally not exposed to
+            # pull_request runs from forks - confirm REPO_PAT is available for
+            # every trigger this workflow uses.
+            GITHUB_TOKEN: ${{ secrets.REPO_PAT }}
+
+        steps:
+            # Same major version as the checkout step in `get-jobs`.
+            - uses: actions/checkout@v4
+              with:
+                  token: ${{ secrets.REPO_PAT }}
+                  fetch-depth: 0
+
+            - name: Download results artifacts
+              uses: actions/download-artifact@v4
+              with:
+                  path: ${{ env.RESULTS_DIR }}
+                  pattern: results_*
+
+            - name: Install python dependencies
+              run: pip install PyGithub
+
+            - name: Calculate success rate
+              # Quoted so the argument survives shell word splitting.
+              run: python3 utils/calc_success_rate.py "$STATS_FILENAME"
+
+            - uses: actions/upload-artifact@v4
+              with:
+                  name: "run-stats"
+                  path: ${{ env.STATS_FILENAME }}.json
diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py
@@ -0,0 +1,151 @@
+import json
+import yaml
+import sys
+import argparse
+
+# Sequence-length label -> (isl, osl) pair, e.g. "1k8k" means isl 1024, osl 8192.
+# Insertion order is kept: it drives the --seq-lens choices and help text.
+seq_len_stoi = dict([
+    ("1k1k", (1024, 1024)),
+    ("1k8k", (1024, 8 * 1024)),
+    ("8k1k", (8 * 1024, 1024)),
+])
+
+def _load_config_files(config_files):
+    """Load every YAML config file and merge their top-level keys into one dict.
+
+    Raises:
+        ValueError: a file does not exist, or a top-level key appears in more
+            than one file.
+        AssertionError: a file's top level is not a mapping.
+    """
+    merged = {}
+    for config_file in config_files:
+        try:
+            with open(config_file, 'r') as f:
+                config_data = yaml.safe_load(f)
+        except FileNotFoundError:
+            raise ValueError(f"Input file '{config_file}' does not exist.")
+
+        assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary"
+
+        # A key defined twice would silently shadow the earlier definition.
+        duplicate_keys = set(merged.keys()) & set(config_data.keys())
+        if duplicate_keys:
+            raise ValueError(
+                f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}"
+            )
+
+        merged.update(config_data)
+    return merged
+
+
+def _concurrency_values(conc_start, conc_end, step_size):
+    """Return the concurrency values to sweep between conc_start and conc_end.
+
+    Values start at conc_start and are multiplied by step_size each step; the
+    final value is clamped so that conc_end itself is always included. The
+    result is empty when conc_start is greater than conc_end.
+
+    Raises:
+        ValueError: the range has more than one point but multiplying could
+            never make progress towards conc_end (step_size < 2 or
+            conc_start < 1). Such input used to loop forever.
+    """
+    if conc_start < conc_end and (step_size < 2 or conc_start < 1):
+        raise ValueError(
+            f"Cannot sweep concurrency from {conc_start} to {conc_end} with step size {step_size}: "
+            f"'conc-start' must be >= 1 and the step size must be >= 2"
+        )
+
+    values = []
+    conc = conc_start
+    while conc <= conc_end:
+        values.append(conc)
+        if conc == conc_end:
+            break
+        # Clamp the last step so the sweep always ends exactly on conc_end.
+        conc = min(conc * step_size, conc_end)
+    return values
+
+
+def main():
+    """Print (and return) the benchmark matrix for one configuration key.
+
+    Command-line arguments:
+        --config-files  one or more YAML files whose top-level keys are merged
+        --key           the merged top-level key to expand
+        --seq-lens      optional subset of the seq_len_stoi labels to keep
+        --step-size     multiplier between successive concurrency values
+
+    Returns:
+        A list with one dict per (seq-len config, bmk-space entry, concurrency
+        value). The same list is written to stdout as a single JSON line.
+
+    Raises:
+        ValueError: missing config file, duplicate top-level key, unknown
+            --key, or a concurrency range that could never terminate.
+        AssertionError: a required field is missing from the configuration.
+    """
+    parser = argparse.ArgumentParser(
+        description='Generate benchmark matrix from a specific configuration key'
+    )
+    parser.add_argument(
+        '--config-files',
+        nargs='+',
+        required=True,
+        help='One or more configuration files (YAML format)'
+    )
+    parser.add_argument(
+        '--key',
+        required=True,
+        help='Configuration key to use'
+    )
+    parser.add_argument(
+        '--seq-lens',
+        nargs='+',
+        choices=list(seq_len_stoi.keys()),
+        required=False,
+        help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included."
+    )
+    parser.add_argument(
+        '--step-size',
+        type=int,
+        default=2,
+        help='Step size for concurrency values (default: 2)'
+    )
+
+    args = parser.parse_args()
+
+    # Convert seq-lens to a set of (isl, osl) tuples for filtering; None means
+    # "keep every sequence length".
+    seq_lens_filter = None
+    if args.seq_lens:
+        seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens}
+
+    all_config_data = _load_config_files(args.config_files)
+
+    if args.key not in all_config_data:
+        available_keys = ', '.join(sorted(all_config_data.keys()))
+        raise ValueError(
+            f"Key '{args.key}' not found in configuration files. "
+            f"Available keys: {available_keys}"
+        )
+
+    val = all_config_data[args.key]
+
+    # Validate required fields
+    seq_len_configs = val.get('seq-len-configs')
+    assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'"
+
+    image = val.get('image')
+    model = val.get('model')
+    precision = val.get('precision')
+    framework = val.get('framework')
+    runner = val.get('runner')
+
+    assert None not in (image, model, precision, framework, runner), \
+        f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'"
+
+    matrix_values = []
+
+    for seq_config in seq_len_configs:
+        isl = seq_config.get('isl')
+        osl = seq_config.get('osl')
+
+        assert None not in (isl, osl), \
+            f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'"
+
+        # Skip sequence lengths that were not requested.
+        if seq_lens_filter and (isl, osl) not in seq_lens_filter:
+            continue
+
+        bmk_space = seq_config.get('bmk-space')
+        assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'"
+
+        for bmk in bmk_space:
+            tp = bmk.get('tp')
+            conc_start = bmk.get('conc-start')
+            conc_end = bmk.get('conc-end')
+            ep = bmk.get('ep')
+            dp_attn = bmk.get('dp-attn')
+
+            assert None not in (tp, conc_start, conc_end), \
+                f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'"
+
+            # One matrix entry per concurrency value in the range.
+            for conc in _concurrency_values(conc_start, conc_end, args.step_size):
+                entry = {
+                    'image': image,
+                    'model': model,
+                    'precision': precision,
+                    'framework': framework,
+                    'runner': runner,
+                    'isl': isl,
+                    'osl': osl,
+                    'tp': tp,
+                    'conc': conc,
+                    'max-model-len': isl + osl,
+                }
+
+                # Optional fields are emitted only when the config sets them.
+                if ep is not None:
+                    entry['ep'] = ep
+                if dp_attn is not None:
+                    entry['dp-attn'] = dp_attn
+
+                matrix_values.append(entry)
+
+    print(json.dumps(matrix_values))
+    return matrix_values
+
+
+if __name__ == "__main__":
+    main()