android-qa-agent/android-qa-replay at main · tobrun/android-qa-agent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
#!/usr/bin/env python3
"""Replay a finalized Android QA recording at configurable speed."""

import argparse
import json
import os
import re
import subprocess
import sys
import time
import warnings
from datetime import datetime


# Directory, relative to the current working directory, that holds one
# sub-directory per recording session (each containing a recording.json).
RECORDINGS_DIR = "recordings"

# Commands that are observation-only and should be skipped during replay.
# Each entry is a list of leading adb arguments; a recorded command counts as
# an observation when its argument list starts with one of these prefixes.
# Entries stay lists (not tuples) because they are compared with == against a
# slice of the recorded arguments (presumably lists decoded from JSON).
OBSERVATION_PREFIXES = [
    ["shell", "screencap"],
    ["shell", "uiautomator", "dump"],
    ["pull"],
]

# Minimum delay (seconds) between consecutive "shell input" commands (tap, text,
# keyevent, swipe) to prevent character dropping and missed taps at high replay
# speeds. Applied as a floor regardless of the --speed multiplier.
MIN_DELAY_BETWEEN_INPUTS = 1.0


def is_input_command(args):
    """Tell whether an adb argument list is a ``shell input ...`` interaction.

    A match needs at least three arguments: ``shell``, ``input`` and the
    input action that follows them.
    """
    if len(args) < 3:
        return False
    first, second = args[0], args[1]
    return first == "shell" and second == "input"


def find_adb():
    """Return the path of an executable adb binary, or None when none exists.

    The SDK location named by $ANDROID_HOME (``platform-tools/adb``) is tried
    first; after that every directory listed in $PATH is probed in order.
    """
    def _usable(path):
        # A candidate only counts if it is a regular file we may execute.
        return os.path.isfile(path) and os.access(path, os.X_OK)

    candidates = []
    sdk_root = os.environ.get("ANDROID_HOME")
    if sdk_root:
        candidates.append(os.path.join(sdk_root, "platform-tools", "adb"))

    search_path = os.environ.get("PATH", "")
    candidates.extend(
        os.path.join(entry, "adb") for entry in search_path.split(os.pathsep)
    )

    return next((path for path in candidates if _usable(path)), None)


def utcnow():
    """Return the current UTC time as a naive ``datetime`` (``tzinfo`` is None).

    Built from an aware ``datetime.now(timezone.utc)`` with the tzinfo
    stripped, which yields the same value as the deprecated
    ``datetime.utcnow()`` without triggering (or having to silence) its
    DeprecationWarning on Python 3.12+.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None)


def format_timestamp(dt):
    """Render *dt* as ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

    Microseconds are truncated (not rounded) to milliseconds and a literal
    ``Z`` suffix is appended.
    """
    seconds_part = dt.strftime("%Y-%m-%dT%H:%M:%S")
    millis = dt.microsecond // 1000
    return "%s.%03dZ" % (seconds_part, millis)


def parse_timestamp(ts):
    """Parse an ISO 8601 timestamp into a naive datetime (Python 3.8 compatible).

    Accepts ``YYYY-MM-DDTHH:MM:SS`` with or without a fractional-seconds
    part, optionally followed by a trailing ``Z``.

    Raises:
        ValueError: if *ts* does not match either layout.
    """
    # Strip the trailing Z; strptime has no directive for a literal "Z" zone.
    ts = ts.rstrip("Z")
    # Whole-second timestamps carry no ".%f" part, so pick the layout to match.
    layout = "%Y-%m-%dT%H:%M:%S.%f" if "." in ts else "%Y-%m-%dT%H:%M:%S"
    return datetime.strptime(ts, layout)


def is_observation_command(args):
    """Tell whether *args* starts with one of the observation-only prefixes.

    The prefixes come from OBSERVATION_PREFIXES (screencap, uiautomator dump,
    pull).
    """
    return any(
        args[: len(prefix)] == prefix for prefix in OBSERVATION_PREFIXES
    )


def filter_commands(commands):
    """Return the recorded commands that are interactions, in original order.

    Every command whose ``"args"`` list is observation-only is dropped.
    """
    interactions = []
    for command in commands:
        if is_observation_command(command["args"]):
            continue
        interactions.append(command)
    return interactions


def adb_cmd(adb, device_serial=None):
    """Build the leading part of an adb invocation as a fresh list.

    The list is ``[adb]``, extended with ``-s <serial>`` when a non-empty
    device serial is given.
    """
    command = [adb]
    if device_serial:
        command += ["-s", device_serial]
    return command


def take_screenshot(adb, dest_path, device_serial=None):
    """Take a screenshot on the device and pull it to *dest_path*.

    Args:
        adb: Path of the adb executable.
        dest_path: Local file path for the PNG; its parent directory is
            created when missing.
        device_serial: Optional adb device serial to target.

    Returns:
        True when both the on-device capture and the pull succeeded, False
        otherwise (an error is printed to stderr).
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises.
    dest_dir = os.path.dirname(dest_path)
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)

    base = adb_cmd(adb, device_serial)
    remote_path = "/sdcard/screen.png"

    result = subprocess.run(
        base + ["shell", "screencap", "-p", remote_path],
        capture_output=True,
    )
    if result.returncode != 0:
        print("Error: failed to take screenshot on device.", file=sys.stderr)
        return False

    result = subprocess.run(
        base + ["pull", remote_path, dest_path],
        capture_output=True,
    )
    if result.returncode != 0:
        print("Error: failed to pull screenshot from device.", file=sys.stderr)
        return False
    return True


def _extract_json_object(response_text):
    """Return the JSON object contained in *response_text*, or None.

    Handles bare JSON, JSON wrapped in a markdown code fence, and a JSON
    object embedded in surrounding prose. Anything that does not decode to a
    dict yields None.
    """
    fenced = re.search(r"```(?:json)?\s*\n(.*?)```", response_text, re.DOTALL)
    if fenced:
        response_text = fenced.group(1).strip()

    candidates = [response_text]
    # Greedy match so braces inside string values do not cut the object short.
    embedded = re.search(r"\{[\s\S]*\}", response_text)
    if embedded:
        candidates.append(embedded.group())

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def verify(recording, json_path, adb, device_serial=None):
    """Run verification: compare golden screenshot to current device state.

    Captures ``actual.png`` into the session directory, asks the ``claude``
    CLI to compare it with the recording's golden screenshot, prints the
    verdict and writes ``verify.json`` next to the recording.

    Args:
        recording: Parsed recording dict; ``metadata.name`` and
            ``metadata.golden_screenshot`` are read.
        json_path: Path the recording was loaded from; its directory is the
            session directory (falls back to RECORDINGS_DIR/<name>).
        adb: Path of the adb executable.
        device_serial: Optional adb device serial to target.

    This function never returns: it ends the process with exit status 0 on
    PASS and 1 on FAIL or on any error.
    """
    metadata = recording["metadata"]
    session_name = metadata["name"]
    golden_filename = metadata.get("golden_screenshot")

    if not golden_filename:
        print(
            "Error: no golden screenshot in recording. "
            "Cannot verify without a screenshot taken during recording.",
            file=sys.stderr,
        )
        sys.exit(1)

    session_dir = os.path.dirname(json_path) or os.path.join(RECORDINGS_DIR, session_name)
    golden_path = os.path.join(session_dir, golden_filename)

    if not os.path.exists(golden_path):
        print("Error: golden screenshot not found: %s" % golden_path, file=sys.stderr)
        sys.exit(1)

    # Wait for the UI to settle after the last replayed command
    print("\nVerification: waiting 2s for UI to settle...")
    time.sleep(2)

    # Take a fresh screenshot
    actual_filename = "actual.png"
    actual_path = os.path.join(session_dir, actual_filename)
    print("Verification: capturing device screenshot...")
    if not take_screenshot(adb, actual_path, device_serial):
        sys.exit(1)

    # Invoke claude -p for comparison
    abs_golden = os.path.abspath(golden_path)
    abs_actual = os.path.abspath(actual_path)
    print("Verification: comparing screenshots (golden: %s, actual: %s)..." % (golden_filename, actual_filename))
    print("Verification: invoking Claude for visual comparison (this may take a moment)...")
    prompt = (
        "You are verifying an Android UI test. "
        "Read these two screenshot files using the Read tool:\n"
        "  Expected (golden): %s\n"
        "  Actual (replay):   %s\n"
        "Do these show the same app state? Ignore transient differences like "
        "the status bar clock, battery level, or notification indicators. "
        'Respond with JSON only: {"pass": true, "reasoning": "brief explanation"}.'
    ) % (abs_golden, abs_actual)

    try:
        result = subprocess.run(
            ["claude", "-p", prompt, "--allowedTools", "Read"],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # Report a missing CLI as a normal error instead of a traceback.
        print("Error: claude CLI not found on PATH; it is required for --verify.", file=sys.stderr)
        sys.exit(1)

    if result.returncode != 0:
        print("Error: claude invocation failed.", file=sys.stderr)
        if result.stderr:
            print(result.stderr, file=sys.stderr)
        sys.exit(1)

    # Parse Claude's response — extract JSON from the output
    response_text = result.stdout.strip()
    verdict = _extract_json_object(response_text)
    if verdict is None:
        print("Error: could not parse Claude response as JSON:", file=sys.stderr)
        print(response_text, file=sys.stderr)
        sys.exit(1)

    # Only a literal JSON true passes; a truthy string such as "false" must not.
    passed = verdict.get("pass") is True
    reasoning = verdict.get("reasoning", "")

    # Print human-readable summary
    status = "PASS" if passed else "FAIL"
    print("\nVerification: %s" % status)
    print("Reasoning: %s" % reasoning)

    # Save verification report
    report = {
        "pass": passed,
        "reasoning": reasoning,
        "golden_screenshot": golden_filename,
        "actual_screenshot": actual_filename,
        "verified_at": format_timestamp(utcnow()),
    }

    report_path = os.path.join(session_dir, "verify.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
        f.write("\n")

    print("Report saved to: %s" % report_path)

    sys.exit(0 if passed else 1)


def verify_multi(pairs, recording, json_path, adb, device_serial=None):
    """Run multi-step verification: compare golden screenshots at each checkpoint.

    Asks the ``claude`` CLI to compare every golden/actual pair, prints a
    per-step verdict and writes ``verify.json`` into the session directory.

    Args:
        pairs: List of dicts with ``"golden"`` and ``"actual"`` file names,
            both relative to the session directory, in checkpoint order.
        recording: Parsed recording dict; ``metadata.name`` is read.
        json_path: Path the recording was loaded from; its directory is the
            session directory (falls back to RECORDINGS_DIR/<name>).
        adb: Unused here; kept for a signature parallel to ``verify``.
        device_serial: Unused here; kept for a signature parallel to ``verify``.

    This function never returns: it ends the process with exit status 0 when
    every pair passed and 1 otherwise (including on errors).
    """
    metadata = recording["metadata"]
    session_name = metadata["name"]
    session_dir = os.path.dirname(json_path) or os.path.join(RECORDINGS_DIR, session_name)
    total = len(pairs)

    print("\nMulti-step verification: comparing %d checkpoint%s..." % (total, "s" if total != 1 else ""))

    # Build prompt for Claude
    prompt_lines = [
        "You are verifying an Android UI test with %d checkpoint%s." % (total, "s" if total != 1 else ""),
        "Read these screenshot pairs using the Read tool and compare each pair:",
        "",
    ]
    for i, pair in enumerate(pairs, 1):
        abs_golden = os.path.abspath(os.path.join(session_dir, pair["golden"]))
        abs_actual = os.path.abspath(os.path.join(session_dir, pair["actual"]))
        prompt_lines.append("Step %d:" % i)
        prompt_lines.append("  Expected (golden): %s" % abs_golden)
        prompt_lines.append("  Actual (replay):   %s" % abs_actual)
        prompt_lines.append("")

    prompt_lines.extend([
        "For each step, determine if the golden and actual screenshots show the same app state.",
        "Ignore transient differences like the status bar clock, battery level, or notification indicators.",
        "",
        'Respond with JSON only: {"steps": [{"step": 1, "pass": true, "reasoning": "brief explanation"}, ...]}',
    ])

    prompt = "\n".join(prompt_lines)

    print("Verification: invoking Claude for visual comparison (this may take a moment)...")
    try:
        result = subprocess.run(
            ["claude", "-p", prompt, "--allowedTools", "Read"],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # Report a missing CLI as a normal error instead of a traceback.
        print("Error: claude CLI not found on PATH; it is required for --verify.", file=sys.stderr)
        sys.exit(1)

    if result.returncode != 0:
        print("Error: claude invocation failed.", file=sys.stderr)
        if result.stderr:
            print(result.stderr, file=sys.stderr)
        sys.exit(1)

    # Parse Claude's response — extract JSON from the output
    response_text = result.stdout.strip()

    # Strip markdown code fences if present
    fenced = re.search(r"```(?:json)?\s*\n(.*?)```", response_text, re.DOTALL)
    if fenced:
        response_text = fenced.group(1).strip()

    # Try the whole text first, then the outermost {...} span (greedy match
    # for nested braces). Only a JSON object is an acceptable verdict.
    candidates = [response_text]
    match = re.search(r"\{[\s\S]*\}", response_text)
    if match:
        candidates.append(match.group())

    verdict = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            verdict = parsed
            break

    if verdict is None:
        print("Error: could not parse Claude response as JSON:", file=sys.stderr)
        print(result.stdout.strip(), file=sys.stderr)
        sys.exit(1)

    steps = verdict.get("steps", [])
    if not isinstance(steps, list):
        steps = []

    # Evaluate exactly one result per pair, by position, so that the printed
    # results, the pass count and the saved report always agree. Extra or
    # duplicated steps in the reply can no longer inflate the pass count.
    step_reports = []
    passed_count = 0
    for i, pair in enumerate(pairs):
        step_data = steps[i] if i < len(steps) else None
        if not isinstance(step_data, dict):
            step_data = {"pass": False, "reasoning": "no response from verifier"}
        # Only a literal JSON true passes; a truthy string must not.
        step_pass = step_data.get("pass") is True
        reasoning = step_data.get("reasoning", "")
        if step_pass:
            passed_count += 1
        status = "PASS" if step_pass else "FAIL"
        print("  Step %s: %s — %s" % (i + 1, status, reasoning))
        step_reports.append({
            "step": i + 1,
            "pass": step_pass,
            "reasoning": reasoning,
            "golden_screenshot": pair["golden"],
            "actual_screenshot": pair["actual"],
        })

    all_passed = passed_count == total
    status = "PASS" if all_passed else "FAIL"
    print("\nVerification: %s (%d/%d steps passed)" % (status, passed_count, total))

    # Build and save verification report
    report = {
        "pass": all_passed,
        "summary": "%d/%d steps passed" % (passed_count, total),
        "steps": step_reports,
        "verified_at": format_timestamp(utcnow()),
    }

    report_path = os.path.join(session_dir, "verify.json")
    with open(report_path, "w") as f:
        json.dump(report, f, indent=2)
        f.write("\n")

    print("Report saved to: %s" % report_path)
    sys.exit(0 if all_passed else 1)


def main():
    """Command-line entry point: replay a recording and optionally verify it.

    Loads the recording, drops observation-only commands, and runs the
    remaining adb commands with the recorded gaps divided by ``--speed``
    (consecutive input commands are never closer than
    MIN_DELAY_BETWEEN_INPUTS). With ``--verify``, screenshots are captured at
    the recording's checkpoints and compared afterwards.

    Always ends the process via ``sys.exit``: 0 on success, the failing adb
    command's return code if a command fails, 1 on other errors or failed
    verification.
    """
    parser = argparse.ArgumentParser(description="Replay a finalized Android QA recording.")
    parser.add_argument("recording", help="Path to a .json recording, or a session name (looked up in recordings/)")
    parser.add_argument("--speed", type=float, default=1.0, help="Speed multiplier (default: 1.0)")
    parser.add_argument("--verify", action="store_true", help="After replay, verify the result against the golden screenshot")
    parser.add_argument("--device", help="ADB device serial to replay on (from adb devices)")
    args = parser.parse_args()

    if args.speed <= 0:
        print("Error: speed must be greater than 0.", file=sys.stderr)
        sys.exit(1)

    # Find adb
    adb = find_adb()
    if adb is None:
        print("Error: adb not found. Set ANDROID_HOME or add adb to PATH.", file=sys.stderr)
        sys.exit(1)

    # Resolve recording path
    recording_arg = args.recording
    if os.sep in recording_arg or recording_arg.endswith(".json"):
        json_path = recording_arg
    else:
        json_path = os.path.join(RECORDINGS_DIR, recording_arg, "recording.json")

    if not os.path.exists(json_path):
        print("Error: recording not found: %s" % json_path, file=sys.stderr)
        sys.exit(1)

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        recording = json.load(f)

    # Filter out observation commands
    commands = filter_commands(recording["commands"])

    # Build base adb command with optional device targeting
    base = adb_cmd(adb, args.device)

    # Build checkpoint map for multi-step verification:
    # interaction index -> golden screenshot file name.
    checkpoints = {}
    if args.verify:
        for gs in recording["metadata"].get("golden_screenshots", []):
            checkpoints[gs["after_interaction"]] = gs["file"]

    session_dir = os.path.dirname(json_path) or os.path.join(
        RECORDINGS_DIR, recording["metadata"]["name"]
    )
    actual_screenshots = []
    failed_checkpoints = []

    def capture_checkpoint(interaction_idx):
        """Capture a checkpoint screenshot if this interaction index has one."""
        if interaction_idx not in checkpoints:
            return
        # Count failed captures too, so numbering does not drift after a failure.
        step_num = len(actual_screenshots) + len(failed_checkpoints) + 1
        total = len(checkpoints)
        print("Verification: checkpoint %d/%d — capturing screenshot..." % (step_num, total))
        time.sleep(1.5)
        actual_file = "actual-%03d.png" % step_num
        actual_path = os.path.join(session_dir, actual_file)
        if take_screenshot(adb, actual_path, args.device):
            actual_screenshots.append({
                "golden": checkpoints[interaction_idx],
                "actual": actual_file,
            })
        else:
            failed_checkpoints.append(step_num)
            print("Warning: failed to capture checkpoint %d screenshot." % step_num, file=sys.stderr)

    previous = None
    for index, command in enumerate(commands):
        # The first command runs immediately; later ones wait for the
        # recorded gap scaled by the speed multiplier.
        if previous is not None:
            prev_ts = parse_timestamp(previous["issued_at"])
            curr_ts = parse_timestamp(command["issued_at"])
            delay = (curr_ts - prev_ts).total_seconds() / args.speed

            # Enforce minimum settle time between consecutive input commands
            if is_input_command(command["args"]) and is_input_command(previous["args"]):
                delay = max(delay, MIN_DELAY_BETWEEN_INPUTS)

            if delay > 0:
                time.sleep(delay)

        result = subprocess.run(base + command["args"])
        if result.returncode != 0:
            print(
                "Error: command %d failed with exit code %d: %s"
                % (index + 1, result.returncode, " ".join(map(str, command["args"]))),
                file=sys.stderr,
            )
            sys.exit(result.returncode)
        capture_checkpoint(index)
        previous = command

    if args.verify:
        if failed_checkpoints:
            # Verifying only the captured subset could report PASS for a run
            # whose checkpoints were not all checked.
            print(
                "Error: %d of %d checkpoint screenshots could not be captured; "
                "verification is incomplete." % (len(failed_checkpoints), len(checkpoints)),
                file=sys.stderr,
            )
            sys.exit(1)
        if actual_screenshots:
            verify_multi(actual_screenshots, recording, json_path, adb, args.device)
        else:
            # Fall back to single-screenshot verification (old recordings)
            verify(recording, json_path, adb, args.device)

    sys.exit(0)


if __name__ == "__main__":
    main()