QuantLLM/.github/workflows/quantize-model.yml at 40bb0c40e81b7e6fffeff25a8fb0e2b600109904 · codewithdark-git/QuantLLM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
name: Quantize Model

# Triggers:
#   * workflow_dispatch - manual run; the inputs below are shown in the form
#     in the order they are declared here.
#   * push / pull_request - only when files under models/ or this workflow
#     file itself change.
on:
  workflow_dispatch:
    inputs:
      model_name:
        type: string
        required: true
        description: Model name or path
      quantization_method:
        type: choice
        required: true
        description: Quantization method
        options: [auto, gguf, gptq, awq]
        default: auto
      bits:
        type: choice
        required: true
        description: Quantization bits
        # Quoted so the choices (and the default) stay strings, not integers.
        options: ['2', '3', '4', '8']
        default: '4'
      output_format:
        type: choice
        required: true
        description: Output format
        options: [auto, gguf, safetensors, pytorch]
        default: auto
      upload_to_hub:
        type: boolean
        required: false
        description: Upload to HuggingFace Hub
        default: false
      hub_repo_id:
        type: string
        required: false
        description: HuggingFace Hub repository ID

  push:
    paths:
      - models/**
      - .github/workflows/quantize-model.yml

  pull_request:
    paths:
      - models/**
      - .github/workflows/quantize-model.yml

# Workflow-wide tool versions. Both values are quoted so YAML keeps them as
# strings instead of parsing them as floats.
env:
  PYTHON_VERSION: '3.9'
  # Full major.minor.patch form: the CUDA toolkit setup action that reads this
  # value parses it as a semantic version, and a two-component value such as
  # '11.8' is not a valid one.
  CUDA_VERSION: '11.8.0'

jobs:
  # Quantizes one model per matrix leg (CPU and GPU), publishes logs and the
  # quantized model as artifacts, and optionally pushes the result to the
  # HuggingFace Hub and/or a GitHub release.
  quantize:
    # Use the runner declared by the matrix entry; a fixed 'ubuntu-latest'
    # here would send the GPU leg to a hosted runner without a GPU.
    runs-on: ${{ matrix.runner }}

    # The release step uploads assets, which needs write access to contents.
    permissions:
      contents: write

    strategy:
      matrix:
        include:
          - gpu: false
            runner: ubuntu-latest
          - gpu: true
            runner: [self-hosted, gpu]

    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.PYTHON_VERSION }}

    - name: Setup CUDA (GPU runners only)
      if: matrix.gpu
      uses: Jimver/cuda-toolkit@v0.2.11
      with:
        cuda: ${{ env.CUDA_VERSION }}

    - name: Cache pip dependencies
      uses: actions/cache@v3
      with:
        path: ~/.cache/pip
        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }}
        restore-keys: |
          ${{ runner.os }}-pip-

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -e .
        pip install -r requirements.txt

    - name: Install GPU dependencies
      if: matrix.gpu
      run: |
        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

    - name: Set up model configuration
      env:
        # User-controlled inputs are handed to the shell as environment
        # variables instead of being expanded into the script text, so their
        # content can never be interpreted as shell syntax. The fallbacks
        # apply to push / pull_request runs, which have no inputs.
        INPUT_MODEL_NAME: ${{ github.event.inputs.model_name || 'gpt2' }}
        INPUT_METHOD: ${{ github.event.inputs.quantization_method || 'auto' }}
        INPUT_BITS: ${{ github.event.inputs.bits || '4' }}
        INPUT_OUTPUT_FORMAT: ${{ github.event.inputs.output_format || 'auto' }}
      run: |
        # ./logs is the target of --log-file and of the log artifact upload.
        mkdir -p ./quantized_models ./logs

        # Names such as "org/model" contain characters that are not allowed
        # in artifact names; replace anything outside [A-Za-z0-9._-] by "-".
        MODEL_SLUG="$(printf '%s' "$INPUT_MODEL_NAME" | tr -c 'A-Za-z0-9._-' '-')"

        {
          echo "MODEL_NAME=$INPUT_MODEL_NAME"
          echo "MODEL_SLUG=$MODEL_SLUG"
          echo "METHOD=$INPUT_METHOD"
          echo "BITS=$INPUT_BITS"
          echo "OUTPUT_FORMAT=$INPUT_OUTPUT_FORMAT"
          echo "MODEL_DIR=./quantized_models/${MODEL_SLUG}-${INPUT_METHOD}-${INPUT_BITS}bit"
        } >> "$GITHUB_ENV"

    - name: Run quantization
      # Values come from the variables exported to GITHUB_ENV by the previous
      # step and are read as ordinary, quoted shell variables.
      run: |
        quantllm quantize \
          --model "$MODEL_NAME" \
          --method "$METHOD" \
          --bits "$BITS" \
          --output-format "$OUTPUT_FORMAT" \
          --output-dir "$MODEL_DIR" \
          --validate \
          --benchmark \
          --progress json \
          --log-file "./logs/quantization-${GITHUB_RUN_ID}.log" \
          --verbose

    # Runs before the model artifact upload so model_info.json is part of it.
    - name: Generate model card
      run: |
        python - <<'PY'
        import json
        import os
        from datetime import datetime, timezone

        # Everything is read from the environment; nothing is spliced into
        # the Python source, so odd characters in the inputs cannot break it.
        model_info = {
            'model_name': os.environ['MODEL_NAME'],
            'quantization_method': os.environ['METHOD'],
            'bits': int(os.environ['BITS']),
            'output_format': os.environ['OUTPUT_FORMAT'],
            'github_run_id': os.environ['GITHUB_RUN_ID'],
            'github_sha': os.environ['GITHUB_SHA'],
            'quantized_at': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
        }

        with open('./quantized_models/model_info.json', 'w') as f:
            json.dump(model_info, f, indent=2)
        PY

    - name: Upload quantization logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        # upload-artifact v4 requires a unique name per artifact within a
        # run, so the matrix leg (cpu / gpu) is part of the name.
        name: quantization-logs-${{ github.run_id }}-${{ matrix.gpu && 'gpu' || 'cpu' }}
        path: ./logs/
        retention-days: 30

    - name: Upload quantized model
      uses: actions/upload-artifact@v4
      with:
        name: quantized-model-${{ env.MODEL_SLUG }}-${{ env.METHOD }}-${{ env.BITS }}bit-${{ matrix.gpu && 'gpu' || 'cpu' }}
        path: ./quantized_models/
        retention-days: 7

    - name: Upload to HuggingFace Hub
      if: github.event.inputs.upload_to_hub == 'true' && github.event.inputs.hub_repo_id != ''
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        HUB_REPO_ID: ${{ github.event.inputs.hub_repo_id }}
      run: |
        pip install huggingface_hub
        python - <<'PY'
        import os
        from huggingface_hub import HfApi

        token = os.getenv('HF_TOKEN')
        if token:
            HfApi().upload_folder(
                folder_path=os.environ['MODEL_DIR'],
                repo_id=os.environ['HUB_REPO_ID'],
                token=token,
            )
            print('Model uploaded to HuggingFace Hub successfully!')
        else:
            print('HF_TOKEN not found, skipping upload')
        PY

    - name: Create release (on tag)
      if: startsWith(github.ref, 'refs/tags/')
      uses: softprops/action-gh-release@v1
      with:
        files: |
          ./quantized_models/**/*
        generate_release_notes: true
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

  # Prints a summary of the quantize job's overall result. For a matrix job,
  # needs.quantize.result is the combined result of all of its legs.
  notify:
    needs: quantize
    runs-on: ubuntu-latest
    # always() keeps this job from being skipped when quantize fails.
    if: always()

    steps:
    - name: Notify on success
      if: needs.quantize.result == 'success'
      env:
        # Inputs are passed as environment variables rather than expanded
        # into the script text, so their content is never parsed as shell.
        # The fallbacks cover push / pull_request runs, which have no inputs.
        MODEL_NAME: ${{ github.event.inputs.model_name || 'gpt2' }}
        METHOD: ${{ github.event.inputs.quantization_method || 'auto' }}
        BITS: ${{ github.event.inputs.bits || '4' }}
      run: |
        echo "✅ Quantization completed successfully!"
        echo "Model: $MODEL_NAME"
        echo "Method: $METHOD"
        echo "Bits: $BITS"

    - name: Notify on failure
      if: needs.quantize.result == 'failure'
      run: |
        echo "❌ Quantization failed!"
        echo "Check the logs for more details."
        # Fail this job as well so the failure is visible on it too.
        exit 1