Update Whisper CUDA models to int4/int8 quantized ones (#5086)

hanbitmyths · web-flow · commit 187a157e4e00 · 2026-05-26T10:04:08.000-07:00
diff --git a/assets/models/foundrylocal/openai-whisper-base-cuda-gpu/model.yaml b/assets/models/foundrylocal/openai-whisper-base-cuda-gpu/model.yaml
@@ -1,6 +1,6 @@
 path:
   container_name: models
-  container_path: foundrylocal/models/openai-whisper-base/onnx/cuda/v2
+  container_path: foundrylocal/models/openai-whisper-base/onnx/cuda/v3
   storage_name: foundrylocalassetdata
   type: azureblob
 publish:
diff --git a/assets/models/foundrylocal/openai-whisper-base-cuda-gpu/spec.yaml b/assets/models/foundrylocal/openai-whisper-base-cuda-gpu/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
 name: openai-whisper-base-cuda-gpu
-version: 2
+version: 3
 isArchived: true
 path: ./
 tags:
@@ -13,7 +13,7 @@ tags:
   task: automatic-speech-recognition
   maxOutputTokens: 2048
   alias: whisper-base
-  directoryPath: v2
+  directoryPath: v3
   promptTemplate: "{\"prompt\": \"<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>\"}"
   contextLength: 448
   capabilities: ""
@@ -31,5 +31,5 @@ variantInfo:
     quantization: ['RTN']
     device: 'gpu'
     executionProvider: 'CUDAExecutionProvider'
-    fileSizeBytes: 204272297
-    vRamFootprintBytes: 204272297
+    fileSizeBytes: 130003264
+    vRamFootprintBytes: 130003264
diff --git a/assets/models/foundrylocal/openai-whisper-large-v3-turbo-cuda-gpu/model.yaml b/assets/models/foundrylocal/openai-whisper-large-v3-turbo-cuda-gpu/model.yaml
@@ -1,6 +1,6 @@
 path:
-  container_name: whisper-models
-  container_path: whisper-large-v3-turbo/cuda-fp16/v2
+  container_name: models
+  container_path: foundrylocal/models/openai-whisper-large-v3-turbo/onnx/cuda/v3
   storage_name: foundrylocalassetdata
   type: azureblob
 publish:
diff --git a/assets/models/foundrylocal/openai-whisper-large-v3-turbo-cuda-gpu/spec.yaml b/assets/models/foundrylocal/openai-whisper-large-v3-turbo-cuda-gpu/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
 name: openai-whisper-large-v3-turbo-cuda-gpu
-version: 2
+version: 3
 isArchived: true
 path: ./
 tags:
@@ -13,7 +13,7 @@ tags:
   task: automatic-speech-recognition
   maxOutputTokens: 2048
   alias: whisper-large-v3-turbo
-  directoryPath: v2
+  directoryPath: v3
   promptTemplate: "{\"prompt\": \"<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>\"}"
   contextLength: 448
   capabilities: ""
@@ -31,5 +31,5 @@ variantInfo:
     quantization: ['RTN']
     device: 'gpu'
     executionProvider: 'CUDAExecutionProvider'
-    fileSizeBytes: 9438190632
-    vRamFootprintBytes: 9438528266
+    fileSizeBytes: 842333753
+    vRamFootprintBytes: 842333753
diff --git a/assets/models/foundrylocal/openai-whisper-medium-cuda-gpu/model.yaml b/assets/models/foundrylocal/openai-whisper-medium-cuda-gpu/model.yaml
@@ -1,6 +1,6 @@
 path:
   container_name: models
-  container_path: foundrylocal/models/openai-whisper-medium/onnx/cuda/v2
+  container_path: foundrylocal/models/openai-whisper-medium/onnx/cuda/v3
   storage_name: foundrylocalassetdata
   type: azureblob
 publish:
diff --git a/assets/models/foundrylocal/openai-whisper-medium-cuda-gpu/spec.yaml b/assets/models/foundrylocal/openai-whisper-medium-cuda-gpu/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
 name: openai-whisper-medium-cuda-gpu
-version: 2
+version: 3
 isArchived: true
 path: ./
 tags:
@@ -13,7 +13,7 @@ tags:
   task: automatic-speech-recognition
   maxOutputTokens: 2048
   alias: whisper-medium
-  directoryPath: v2
+  directoryPath: v3
   promptTemplate: "{\"prompt\": \"<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>\"}"
   contextLength: 448
   capabilities: ""
@@ -31,5 +31,5 @@ variantInfo:
     quantization: ['RTN']
     device: 'gpu'
     executionProvider: 'CUDAExecutionProvider'
-    fileSizeBytes: 1640305824
-    vRamFootprintBytes: 1640305824
+    fileSizeBytes: 670344241
+    vRamFootprintBytes: 670344241
diff --git a/assets/models/foundrylocal/openai-whisper-small-cuda-gpu/model.yaml b/assets/models/foundrylocal/openai-whisper-small-cuda-gpu/model.yaml
@@ -1,6 +1,6 @@
 path:
   container_name: models
-  container_path: foundrylocal/models/openai-whisper-small/onnx/cuda/v2
+  container_path: foundrylocal/models/openai-whisper-small/onnx/cuda/v3
   storage_name: foundrylocalassetdata
   type: azureblob
 publish:
diff --git a/assets/models/foundrylocal/openai-whisper-small-cuda-gpu/spec.yaml b/assets/models/foundrylocal/openai-whisper-small-cuda-gpu/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
 name: openai-whisper-small-cuda-gpu
-version: 2
+version: 3
 isArchived: true
 path: ./
 tags:
@@ -13,7 +13,7 @@ tags:
   task: automatic-speech-recognition
   maxOutputTokens: 2048
   alias: whisper-small
-  directoryPath: v2
+  directoryPath: v3
   promptTemplate: "{\"prompt\": \"<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>\"}"
   contextLength: 448
   capabilities: ""
@@ -31,5 +31,5 @@ variantInfo:
     quantization: ['RTN']
     device: 'gpu'
     executionProvider: 'CUDAExecutionProvider'
-    fileSizeBytes: 569232292
-    vRamFootprintBytes: 572890964
+    fileSizeBytes: 316267734
+    vRamFootprintBytes: 316267734
diff --git a/assets/models/foundrylocal/openai-whisper-tiny-cuda-gpu/model.yaml b/assets/models/foundrylocal/openai-whisper-tiny-cuda-gpu/model.yaml
@@ -1,6 +1,6 @@
 path:
   container_name: models
-  container_path: foundrylocal/models/openai-whisper-tiny/onnx/cuda/v2
+  container_path: foundrylocal/models/openai-whisper-tiny/onnx/cuda/v3
   storage_name: foundrylocalassetdata
   type: azureblob
 publish:
diff --git a/assets/models/foundrylocal/openai-whisper-tiny-cuda-gpu/spec.yaml b/assets/models/foundrylocal/openai-whisper-tiny-cuda-gpu/spec.yaml
@@ -1,6 +1,6 @@
 $schema: https://azuremlschemas.azureedge.net/latest/model.schema.json
 name: openai-whisper-tiny-cuda-gpu
-version: 2
+version: 3
 isArchived: true
 path: ./
 tags:
@@ -13,7 +13,7 @@ tags:
   task: automatic-speech-recognition
   maxOutputTokens: 2048
   alias: whisper-tiny
-  directoryPath: v2
+  directoryPath: v3
   promptTemplate: "{\"prompt\": \"<|startoftranscript|> <|en|> <|transcribe|> <|notimestamps|>\"}"
   contextLength: 448
   capabilities: ""
@@ -31,5 +31,5 @@ variantInfo:
     quantization: ['RTN']
     device: 'gpu'
     executionProvider: 'CUDAExecutionProvider'
-    fileSizeBytes: 121285804
-    vRamFootprintBytes: 121285804
+    fileSizeBytes: 89899070
+    vRamFootprintBytes: 89899070