AsrModule demo (#194)

kirklandsign · web-flow · commit feed004756d5 · 2026-01-29T10:59:13.000-08:00
diff --git a/whisper/android/WhisperApp/README.md b/whisper/android/WhisperApp/README.md
@@ -1,26 +1,22 @@
 # Whisper Demo App
 
-This app runs the Whisper model in ExecuTorch.
+This app demonstrates running the Whisper speech recognition model on Android using ExecuTorch.
 
-## Build the ExecuTorch Android library
+> **Note:** The ExecuTorch `AsrModule` API has not been released yet. A snapshot AAR will be provided soon.
 
-Build the [ExecuTorch Android library with QNN backend](https://github.com/pytorch/executorch/blob/main/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md).
+## Export Model Files
 
-## Export the audio processing and model .pte files
+Export the audio preprocessor and model `.pte` files following the instructions at:
+https://github.com/pytorch/executorch/tree/main/examples/models/whisper
 
-There are two steps, audio processing and the Whisper model (encoder+decoder), which are both done via ExecuTorch.
+This app requires a model `.pte` file, a preprocessor `.pte` file, and the model's tokenizer file (e.g. `tokenizer.json`).
 
-1. Run `extension/audio/mel_spectrogram.py` to export `whisper_preprocess.pte`
-2. Run `examples/qualcomm/oss_scripts/whisper/whisper.py` to export `whisper_qnn_16a8w.pte`
-
-Move these two `.pte` files along with `tokenizer.json` to `/data/local/tmp/whisper` on device.
-
-## Run the app
+## Run the App
 
 1. Open WhisperApp in Android Studio
-2. Copy the Android library `executorch.aar` (with audio JNI bindings) into `app/libs`
+2. Copy the `executorch.aar` library (with audio JNI bindings) into `app/libs`
 3. Build and run on device
 
 ## Demo
 
-https://github.com/user-attachments/assets/ff8c71c5-b734-4ed4-8382-70a429830665
+https://github.com/user-attachments/assets/eb4c4ae6-b89f-4eb4-a291-549a42c95f54
diff --git a/whisper/android/WhisperApp/app/src/main/java/com/example/whisperapp/MainActivity.kt b/whisper/android/WhisperApp/app/src/main/java/com/example/whisperapp/MainActivity.kt
@@ -44,18 +44,12 @@ import androidx.compose.ui.unit.dp
 import androidx.core.content.ContextCompat
 import androidx.lifecycle.ViewModelProvider
 import com.example.whisperapp.ui.theme.WhisperAppTheme
-import org.pytorch.executorch.EValue
-import org.pytorch.executorch.Module
-import org.pytorch.executorch.Tensor
 import org.pytorch.executorch.extension.asr.AsrCallback
 import org.pytorch.executorch.extension.asr.AsrModule
 import java.io.File
-import java.io.FileInputStream
 import java.io.FileOutputStream
 import java.io.IOException
 import java.io.OutputStream
-import java.nio.ByteBuffer
-import java.nio.ByteOrder
 
 class MainActivity : ComponentActivity(), AsrCallback {
 
@@ -97,48 +91,6 @@ class MainActivity : ComponentActivity(), AsrCallback {
         SETTINGS
     }
 
-    @Throws(IOException::class)
-    fun readWavPcmBytes(filePath: String): ByteArray {
-        val wavHeaderSize = 44 // Standard header size for PCM WAV
-        val file = File(filePath)
-        val fis = FileInputStream(file)
-        try {
-            val totalSize = file.length()
-            assert(totalSize > wavHeaderSize)
-            val pcmSize = (totalSize - wavHeaderSize).toInt()
-            val pcmBytes = ByteArray(pcmSize)
-            // Skip the header
-            val skipped = fis.skip(wavHeaderSize.toLong())
-            if (skipped != wavHeaderSize.toLong()) throw IOException("Failed to skip WAV header")
-            // Read PCM data
-            val read = fis.read(pcmBytes)
-            if (read != pcmSize) throw IOException("Failed to read all PCM data")
-            return pcmBytes
-        } finally {
-            fis.close()
-        }
-    }
-
-    private fun convertPcm16ToFloat(audioBytes: ByteArray): FloatArray {
-        val totalSamples = audioBytes.size / 2  // 2 bytes per 16-bit sample
-        val floatSamples = FloatArray(totalSamples)
-
-        // Create ByteBuffer with little-endian byte order (standard for WAV)
-        val byteBuffer = ByteBuffer.wrap(audioBytes).order(ByteOrder.LITTLE_ENDIAN)
-
-        for (i in 0 until totalSamples) {
-            val sample = byteBuffer.short.toInt()
-            // Normalize 16-bit PCM to [-1.0, 1.0]
-            floatSamples[i] = if (sample < 0) {
-                sample / 32768.0f
-            } else {
-                sample / 32767.0f
-            }
-        }
-
-        return floatSamples
-    }
-
     override fun onCreate(savedInstanceState: Bundle?) {
         super.onCreate(savedInstanceState)
 
@@ -252,62 +204,21 @@ class MainActivity : ComponentActivity(), AsrCallback {
 
         runOnUiThread {
             transcriptionOutput = ""
-        }
-
-        val audioData: FloatArray
-        val batchSize: Int
-        val featureDim: Int
-        val timeSteps: Int
-
-        if (settings.hasPreprocessor()) {
-            // Use preprocessor to convert WAV to mel-spectrogram
-            Log.v(TAG, "Using preprocessor: ${settings.preprocessorPath}")
-            runOnUiThread {
-                statusText = "Processing audio with mel-spectrogram..."
-            }
-
-            val pcmBytes = readWavPcmBytes(wavFilePath)
-            val inputFloatArray = convertPcm16ToFloat(pcmBytes)
-
-            val tensor1 = Tensor.fromBlob(
-                inputFloatArray,
-                longArrayOf(inputFloatArray.size.toLong())
-            )
-            val module = Module.load(settings.preprocessorPath)
-            val eValue1 = EValue.from(tensor1)
-            audioData = module.forward(eValue1)[0].toTensor().dataAsFloatArray
-
-            // result shape is [batchSize, timeSteps, featureDim]
-            batchSize = 1
-            featureDim = 128  // Whisper uses 128 mel bins
-            timeSteps = audioData.size / (batchSize * featureDim)
-        } else {
-            // No preprocessor: use raw WAV audio directly
-            Log.v(TAG, "No preprocessor, using raw WAV audio")
-            runOnUiThread {
-                statusText = "Processing raw audio..."
-            }
-
-            val pcmBytes = readWavPcmBytes(wavFilePath)
-            audioData = convertPcm16ToFloat(pcmBytes)
-
-            // For raw audio: batchSize=1, timeSteps=numSamples, featureDim=1
-            batchSize = 1
-            featureDim = 1  // Raw audio has 1 feature dimension
-            timeSteps = audioData.size
+            statusText = "Loading model..."
         }
 
         val whisperModule = AsrModule(
-            settings.modelPath,
-            settings.tokenizerPath,
-            settings.dataPath
+            modelPath = settings.modelPath,
+            tokenizerPath = settings.tokenizerPath,
+            dataPath = settings.dataPath.ifBlank { null },
+            preprocessorPath = settings.preprocessorPath.ifBlank { null }
         )
 
-        Log.v(TAG, "Starting transcribe with batchSize=$batchSize, timeSteps=$timeSteps, featureDim=$featureDim")
+        Log.v(TAG, "Starting transcribe for: $wavFilePath")
         runOnUiThread {
             statusText = "Transcribing..."
         }
-        whisperModule.transcribe(audioData, batchSize, timeSteps, featureDim, this@MainActivity)
+        whisperModule.transcribe(wavFilePath, this@MainActivity)
         Log.v(TAG, "Finished transcribe")
 
         // Display result in Text view instead of Toast