Add model response output to GitHub job summary and support multiple model presets (#150)

kirklandsign · web-flow · commit 5b9154f9c589 · 2026-01-15T13:21:58.000-08:00
Add model response extraction from instrumentation tests to GitHub job summary
Support configurable model presets (stories, llama, qwen3, custom) with proper filename handling
Pass model filenames to instrumentation tests via Gradle arguments for preset-aware testing
diff --git a/.github/workflows/llm-android.yml b/.github/workflows/llm-android.yml
@@ -83,7 +83,7 @@ jobs:
           path: |
             ~/.android/avd/*
             ~/.android/adb*
-          key: avd-${{ env.API_LEVEL }}-${{ env.ARCH }}-ram${{ env.RAM_SIZE }}
+          key: avd-${{ env.API_LEVEL }}-${{ env.ARCH }}-ram${{ env.RAM_SIZE }}-disk16G
 
       - name: Create AVD and generate snapshot for caching
         if: steps.avd-cache.outputs.cache-hit != 'true'
@@ -97,37 +97,88 @@ jobs:
           working-directory: llm/android/LlamaDemo
           script: echo "Generated AVD snapshot for caching."
 
-      - name: Configure AVD RAM
+      - name: Configure AVD RAM and disk
         run: |
           AVD_DIR="$HOME/.android/avd"
           for config in "$AVD_DIR"/*.avd/config.ini; do
             if [ -f "$config" ]; then
-              echo "Updating RAM in $config"
+              echo "Updating config in $config"
+              # Update RAM
               sed -i 's/hw.ramSize=.*/hw.ramSize=${{ env.RAM_SIZE }}/' "$config" || true
               grep -q "hw.ramSize" "$config" || echo "hw.ramSize=${{ env.RAM_SIZE }}" >> "$config"
+              # Update disk size to 16GB for large models
+              sed -i 's/disk.dataPartition.size=.*/disk.dataPartition.size=16G/' "$config" || true
+              grep -q "disk.dataPartition.size" "$config" || echo "disk.dataPartition.size=16G" >> "$config"
             fi
           done
 
-      - name: Run instrumentation tests
-        uses: reactivecircus/android-emulator-runner@v2
+      - name: Download model files
         env:
           MODEL_PRESET: ${{ inputs.model_preset || 'stories' }}
           CUSTOM_PTE_URL: ${{ inputs.custom_pte_url }}
           CUSTOM_TOKENIZER_URL: ${{ inputs.custom_tokenizer_url }}
+        run: |
+          mkdir -p /tmp/llama_models
+
+          # Pick download URLs for the preset; any unrecognized preset falls back to stories
+          case "$MODEL_PRESET" in
+            llama)
+              PTE_URL="https://huggingface.co/executorch-community/Llama-3.2-1B-ET/resolve/main/llama3_2-1B.pte"
+              TOKENIZER_URL="https://huggingface.co/executorch-community/Llama-3.2-1B-ET/resolve/main/tokenizer.model"
+              ;;
+            qwen3)
+              PTE_URL="https://huggingface.co/pytorch/Qwen3-4B-INT8-INT4/resolve/main/model.pte"
+              TOKENIZER_URL="https://huggingface.co/pytorch/Qwen3-4B-INT8-INT4/resolve/main/tokenizer.json"
+              ;;
+            custom)
+              PTE_URL="${CUSTOM_PTE_URL:?custom_pte_url input is required for the custom preset}"
+              TOKENIZER_URL="${CUSTOM_TOKENIZER_URL:?custom_tokenizer_url input is required for the custom preset}"
+              ;;
+            *)
+              PTE_URL="https://ossci-android.s3.amazonaws.com/executorch/stories/snapshot-20260114/stories110M.pte"
+              TOKENIZER_URL="https://ossci-android.s3.amazonaws.com/executorch/stories/snapshot-20260114/tokenizer.model"
+              ;;
+          esac
+
+          PTE_FILE=$(basename "$PTE_URL")
+          TOKENIZER_FILE=$(basename "$TOKENIZER_URL")
+
+          echo "Downloading model: $PTE_URL"
+          curl -fL --progress-bar -o "/tmp/llama_models/$PTE_FILE" "$PTE_URL"
+
+          echo "Downloading tokenizer: $TOKENIZER_URL"
+          curl -fL --progress-bar -o "/tmp/llama_models/$TOKENIZER_FILE" "$TOKENIZER_URL"
+
+          echo "Downloaded files:"
+          ls -lh /tmp/llama_models/
+
+          # Export the local filenames (URL basenames) to later steps via GITHUB_ENV
+          echo "MODEL_FILE=$PTE_FILE" >> "$GITHUB_ENV"
+          echo "TOKENIZER_FILE=$TOKENIZER_FILE" >> "$GITHUB_ENV"
+
+      - name: Run instrumentation tests
+        uses: reactivecircus/android-emulator-runner@v2
+        env:
+          MODEL_PRESET: ${{ inputs.model_preset || 'stories' }}
         with:
           api-level: ${{ env.API_LEVEL }}
           arch: ${{ env.ARCH }}
           force-avd-creation: false
           emulator-options: -no-snapshot-save ${{ env.EMULATOR_OPTIONS }}
           disable-animations: true
           working-directory: llm/android/LlamaDemo
-          script: |
-            adb shell rm -rf /data/local/tmp/llama
-            adb shell mkdir -p /data/local/tmp/llama
-            adb logcat -c && adb logcat > /tmp/logcat.txt &
-            LOGCAT_PID=$!
-            if [ "$MODEL_PRESET" = "custom" ]; then GRADLE_ARGS="-PmodelPreset=$MODEL_PRESET -PcustomPteUrl=$CUSTOM_PTE_URL -PcustomTokenizerUrl=$CUSTOM_TOKENIZER_URL"; else GRADLE_ARGS="-PmodelPreset=$MODEL_PRESET"; fi
-            ./gradlew connectedCheck $GRADLE_ARGS; TEST_EXIT_CODE=$?; kill $LOGCAT_PID || true; exit $TEST_EXIT_CODE
+          script: bash ./scripts/run-ci-tests.sh "$MODEL_PRESET" "$MODEL_FILE" "$TOKENIZER_FILE"
+
+      - name: Add model response to summary
+        if: always()
+        run: |
+          if [ -f /tmp/response.txt ]; then
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            echo "## Model Response" >> "$GITHUB_STEP_SUMMARY"
+            echo '```' >> "$GITHUB_STEP_SUMMARY"
+            cat /tmp/response.txt >> "$GITHUB_STEP_SUMMARY"
+            printf '\n```\n' >> "$GITHUB_STEP_SUMMARY"  # leading newline closes the fence even if the file lacks a trailing newline
+          fi
 
       - name: Upload logcat
         if: always()
diff --git a/llm/android/LlamaDemo/README.md b/llm/android/LlamaDemo/README.md
@@ -228,15 +228,39 @@ adb push tokenizer.model /data/local/tmp/llama
 
 ### Running Tests
 
-Run all instrumentation tests:
+The easiest way to run instrumentation tests is using model presets, which automatically download the model and tokenizer files:
+
 ```sh
-./gradlew connectedAndroidTest
+# Run with stories model (default, smallest and fastest)
+./gradlew connectedCheck -PmodelPreset=stories
+
+# Run with Llama 3.2 1B model
+./gradlew connectedCheck -PmodelPreset=llama
+
+# Run with Qwen3 4B model
+./gradlew connectedCheck -PmodelPreset=qwen3
+
+# Run with custom model URLs
+./gradlew connectedCheck -PmodelPreset=custom \
+  -PcustomPteUrl=https://example.com/model.pte \
+  -PcustomTokenizerUrl=https://example.com/tokenizer.model
+
+# Skip model download (use existing files on device)
+./gradlew connectedCheck -PmodelPreset=stories -PskipModelDownload=true
 ```
 
+Available presets:
+| Preset | Model | Description |
+|--------|-------|-------------|
+| `stories` | stories110M | Tiny model for quick testing |
+| `llama` | Llama 3.2 1B | Production-quality Llama model |
+| `qwen3` | Qwen3 4B | Qwen3 model with INT8/INT4 quantization |
+| `custom` | User-provided | Specify custom URLs for model and tokenizer |
+
 Run a specific test class:
 ```sh
-./gradlew connectedAndroidTest -Pandroid.testInstrumentationRunnerArguments.class=com.example.executorchllamademo.SanityCheck
-./gradlew connectedAndroidTest -Pandroid.testInstrumentationRunnerArguments.class=com.example.executorchllamademo.UIWorkflowTest
+./gradlew connectedCheck -PmodelPreset=stories -Pandroid.testInstrumentationRunnerArguments.class=com.example.executorchllamademo.SanityCheck
+./gradlew connectedCheck -PmodelPreset=stories -Pandroid.testInstrumentationRunnerArguments.class=com.example.executorchllamademo.UIWorkflowTest
 ```
 
 ## Reporting Issues
diff --git a/llm/android/LlamaDemo/app/build.gradle.kts b/llm/android/LlamaDemo/app/build.gradle.kts
@@ -12,7 +12,7 @@ plugins {
 }
 
 // Model files configuration for instrumentation tests
-// Supported presets: stories, llama, custom
+// Supported presets: stories, llama, qwen3, custom
 val modelPreset: String = (project.findProperty("modelPreset") as? String) ?: "stories"
 
 // Preset configurations
@@ -62,6 +62,14 @@ fun execCmdWithExitCode(vararg args: String): Pair<Int, String> {
   return Pair(exitCode, output)
 }
 
+// Runs a command with stdio inherited from the current process so output appears in real time (for long-running commands); returns the exit code
+fun execCmdStreaming(vararg args: String): Int {
+  val process = ProcessBuilder(*args)
+    .inheritIO()
+    .start()
+  return process.waitFor()
+}
+
 tasks.register("pushModelFiles") {
   description = "Download model files and push to connected Android device if not present"
   group = "verification"
@@ -84,17 +92,17 @@ tasks.register("pushModelFiles") {
       tokenizerUrl = customTokenizerUrl ?: throw GradleException("customTokenizerUrl is required when modelPreset is 'custom'")
       verifyChecksum = false
     } else {
-      val preset = modelPresets[modelPreset] ?: throw GradleException("Unknown model preset: $modelPreset. Valid options: stories, llama, custom")
+      val preset = modelPresets[modelPreset] ?: throw GradleException("Unknown model preset: $modelPreset. Valid options: ${modelPresets.keys.joinToString(", ")}, custom")
       val baseUrl = preset["baseUrl"] as String
       pteUrl = "$baseUrl/${preset["pteFile"]}"
       tokenizerUrl = "$baseUrl/${preset["tokenizerFile"]}"
       verifyChecksum = preset["verifyChecksum"] as Boolean
     }
 
-    // Files to download: source URL -> target name on device
+    // Files to download: source URL -> target name on device (keep original filenames)
     val filesToDownload = mapOf(
-      pteUrl to "model.pte",
-      tokenizerUrl to "tokenizer.model"
+      pteUrl to pteUrl.substringAfterLast("/"),
+      tokenizerUrl to tokenizerUrl.substringAfterLast("/")
     )
 
     // Check if adb is available
@@ -130,13 +138,11 @@ tasks.register("pushModelFiles") {
         val localPath = "$tempDir/$targetName"
         val devicePath = "$deviceModelDir/$targetName"
 
-        // Download file
+        // Download file with progress indicator
         logger.lifecycle("Downloading from $sourceUrl...")
-        val (dlCode, dlOutput) = execCmdWithExitCode(
-          "curl", "-fL", "-o", localPath, sourceUrl
-        )
+        val dlCode = execCmdStreaming("curl", "-fL", "--progress-bar", "-o", localPath, sourceUrl)
         if (dlCode != 0) {
-          throw GradleException("Failed to download from $sourceUrl: $dlOutput")
+          throw GradleException("Failed to download from $sourceUrl")
         }
 
         // Verify checksum if enabled and available (only for stories preset)
@@ -173,11 +179,11 @@ tasks.register("pushModelFiles") {
           }
         }
 
-        // Push to device
+        // Push to device with progress
         logger.lifecycle("Pushing $targetName to device...")
-        val (pushCode, pushOutput) = execCmdWithExitCode(adbPath, "push", localPath, devicePath)
+        val pushCode = execCmdStreaming(adbPath, "push", localPath, devicePath)
         if (pushCode != 0) {
-          throw GradleException("Failed to push $targetName to device: $pushOutput")
+          throw GradleException("Failed to push $targetName to device")
         }
         logger.lifecycle("Successfully pushed $targetName")
       }
diff --git a/llm/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/SanityCheck.java b/llm/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/SanityCheck.java
@@ -11,28 +11,52 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 
+import android.os.Bundle;
 import androidx.test.ext.junit.runners.AndroidJUnit4;
+import androidx.test.platform.app.InstrumentationRegistry;
 import java.io.File;
 import java.util.ArrayList;
 import java.util.List;
+import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.pytorch.executorch.extension.llm.LlmCallback;
 import org.pytorch.executorch.extension.llm.LlmModule;
 
+/**
+ * Sanity check test for model loading and generation.
+ *
+ * Model filenames can be configured via instrumentation arguments:
+ * - modelFile: name of the .pte file (default: stories110M.pte)
+ * - tokenizerFile: name of the tokenizer file (default: tokenizer.model)
+ */
 @RunWith(AndroidJUnit4.class)
 public class SanityCheck implements LlmCallback {
 
   private static final String RESOURCE_PATH = "/data/local/tmp/llama/";
-  private static final String TOKENIZER_PATH = "tokenizer.model";
-  private static final String MODEL_PATH = "model.pte";
+
+  // Default filenames (stories preset)
+  private static final String DEFAULT_MODEL_FILE = "stories110M.pte";
+  private static final String DEFAULT_TOKENIZER_FILE = "tokenizer.model";
+
+  private String modelFile;
+  private String tokenizerFile;
 
   private final List<String> results = new ArrayList<>();
 
+  @Before
+  public void setUp() {
+    // Read the "modelFile" and "tokenizerFile" instrumentation arguments, falling back to the default filenames when absent
+    Bundle args = InstrumentationRegistry.getArguments();
+    modelFile = args.getString("modelFile", DEFAULT_MODEL_FILE);
+    tokenizerFile = args.getString("tokenizerFile", DEFAULT_TOKENIZER_FILE);
+    android.util.Log.i("SanityCheck", "Using model: " + modelFile + ", tokenizer: " + tokenizerFile);
+  }
+
   @Test
   public void testLoadAndGenerate() {
-    String tokenizerPath = RESOURCE_PATH + TOKENIZER_PATH;
-    File model = new File(RESOURCE_PATH + MODEL_PATH);
+    String tokenizerPath = RESOURCE_PATH + tokenizerFile;
+    File model = new File(RESOURCE_PATH + modelFile);
     LlmModule mModule = new LlmModule(model.getPath(), tokenizerPath, 0.8f);
 
     int loadResult = mModule.load();
diff --git a/llm/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/UIWorkflowTest.java b/llm/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/UIWorkflowTest.java
diff --git a/llm/android/LlamaDemo/scripts/run-ci-tests.sh b/llm/android/LlamaDemo/scripts/run-ci-tests.sh