Skip to content

Commit f006143

Browse files
authored
Reduce e2e test flakiness (#16063)
1 parent 3583040 commit f006143

19 files changed

Lines changed: 593 additions & 50 deletions

.ado/build-template.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,10 +208,12 @@ extends:
208208
- script: npx --yes midgard-yarn-strict@1.2.4 @rnw-scripts/beachball-config
209209
displayName: Strict yarn install @rnw-scripts/beachball-config
210210
condition: and(succeeded(), eq(variables['detectScenario.isReleaseBuild'], 'False'))
211+
retryCountOnTaskFailure: 2
211212

212213
- script: npx lage build --scope @rnw-scripts/prepare-release --scope @rnw-scripts/beachball-config
213214
displayName: Build prepare-release and beachball-config
214215
condition: and(succeeded(), eq(variables['detectScenario.isReleaseBuild'], 'False'))
216+
retryCountOnTaskFailure: 2
215217

216218
# 5. Beachball check (Developer PR only)
217219
- pwsh: npx beachball check --branch "origin/$env:BEACHBALL_BRANCH" --verbose --changehint "##vso[task.logissue type=error]Run 'yarn change' from root of repo to generate a change file."

.ado/jobs/e2e-test.yml

Lines changed: 249 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ parameters:
88
- Continuous
99
- name: AgentPool
1010
type: object
11+
# When set to true on a PR-validation queue, the E2E app deliberately
12+
# crashes (simulateCrashForTesting) or hangs (simulateHangForTesting) so we
13+
# can re-validate that the crash-dump collection path still produces a
14+
# usable artifact. Disabled by default — the test step is expected to
15+
# fail when either of these is on.
16+
- name: simulateCrashForTesting
17+
type: boolean
18+
default: false
19+
- name: simulateHangForTesting
20+
type: boolean
21+
default: false
1122
- name: buildMatrix
1223
type: object
1324
default:
@@ -185,6 +196,12 @@ jobs:
185196
platform: ${{ matrix.BuildPlatform }}
186197
configuration: Release
187198
buildEnvironment: ${{ config.buildEnvironment }}
199+
# Capture crash dumps for the E2E test app (packaged UWP) and
200+
# the Metro bundler. ProcDump-as-AeDebug does not reliably fire
201+
# for packaged apps; WER LocalDumps does.
202+
localDumpsExeNames:
203+
- RNTesterApp-Fabric
204+
- node
188205

189206
- pwsh: |
190207
Write-Host "##vso[task.setvariable variable=BuildLogDirectory]$(Build.BinariesDirectory)\${{ matrix.BuildPlatform }}\BuildLogs"
@@ -209,11 +226,238 @@ jobs:
209226
echo ##vso[task.setvariable variable=StartedFabricTests]true
210227
displayName: Set StartedFabricTests
211228
212-
- script: |
213-
yarn e2etest
214-
displayName: yarn e2etest
215-
workingDirectory: packages/e2e-test-app-fabric
216-
timeoutInMinutes: 10 # Time to wait for this task to complete before the server kills it.
229+
# Test-only: arm the crash-simulation sentinel so RNTesterApp-Fabric
230+
# crashes on startup. Validates the in-process minidump path.
231+
- ${{ if eq(parameters.simulateCrashForTesting, true) }}:
232+
- pwsh: |
233+
$flagPath = Join-Path $env:ProgramData 'rnw-e2e-simulate-crash.flag'
234+
New-Item -Path $flagPath -ItemType File -Force | Out-Null
235+
Write-Host "Crash-simulation sentinel created at $flagPath"
236+
$dumpDir = Join-Path $env:ProgramData 'RNW-E2E-Dumps'
237+
if (Test-Path $dumpDir) {
238+
Remove-Item -Path "$dumpDir\*" -Recurse -Force -ErrorAction SilentlyContinue
239+
Write-Host "Cleared stale dumps under $dumpDir"
240+
}
241+
displayName: Arm crash-simulation sentinel (TEST ONLY)
242+
243+
# Test-only: arm the hang-simulation env var, which switches on
244+
# the HangSimulationTest.test.ts test. That test invokes the
245+
# `HangForTesting` automation command, jamming the app's UI thread
246+
# so the post-failure ProcDump path captures a hang dump.
247+
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
248+
- pwsh: |
249+
Write-Host "##vso[task.setvariable variable=RNW_SIMULATE_HANG]1"
250+
Write-Host "Hang simulation armed (RNW_SIMULATE_HANG=1)"
251+
displayName: Arm hang-simulation env var (TEST ONLY)
252+
253+
# When simulating a hang, run ONLY the HangSimulationTest. The default
254+
# jest sequencer puts brand-new (no-timing-history) tests late in the order,
255+
# so without filtering the test step times out before the hang test even
256+
# runs. 4-minute timeout: enough for app launch (~30 s) + the test's 70 s
257+
# jest testTimeout + jest teardown attempt; ADO will cut off at 4 min if the
258+
# hang prevents jest from exiting cleanly, which is fine — Capture step then
259+
# finds the still-alive UI-hung app.
260+
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
261+
- script: |
262+
yarn e2etest --testPathPattern HangSimulationTest
263+
displayName: yarn e2etest (hang simulation only)
264+
workingDirectory: packages/e2e-test-app-fabric
265+
timeoutInMinutes: 4
266+
267+
- ${{ if not(eq(parameters.simulateHangForTesting, true)) }}:
268+
- script: |
269+
yarn e2etest
270+
displayName: yarn e2etest
271+
workingDirectory: packages/e2e-test-app-fabric
272+
# Drop to 2 min during crash simulation — the app crashes
273+
# immediately on startup, so a 10-minute wait is dead time.
274+
${{ if eq(parameters.simulateCrashForTesting, true) }}:
275+
timeoutInMinutes: 2
276+
${{ if not(eq(parameters.simulateCrashForTesting, true)) }}:
277+
timeoutInMinutes: 10
278+
279+
# Always disarm the crash sentinel so it cannot leak to a rerun on
280+
# the same agent.
281+
- ${{ if eq(parameters.simulateCrashForTesting, true) }}:
282+
- pwsh: |
283+
$flagPath = Join-Path $env:ProgramData 'rnw-e2e-simulate-crash.flag'
284+
if (Test-Path $flagPath) {
285+
Remove-Item $flagPath -Force
286+
Write-Host "Removed crash-simulation sentinel at $flagPath"
287+
}
288+
displayName: Disarm crash-simulation sentinel (TEST ONLY)
289+
condition: always()
290+
291+
# Always disarm the hang-simulation env var so the post-failure
292+
# `Update snapshots` step (which also runs `yarn e2etest`) does not
293+
# re-trigger the hang and burn 10 minutes of dead time. Setting an
294+
# ADO variable to an empty string clears it for subsequent steps.
295+
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
296+
- pwsh: |
297+
Write-Host "##vso[task.setvariable variable=RNW_SIMULATE_HANG]"
298+
Write-Host "Hang simulation disarmed (RNW_SIMULATE_HANG cleared)"
299+
displayName: Disarm hang-simulation env var (TEST ONLY)
300+
condition: always()
301+
302+
# On test failure, snapshot any lingering RNTesterApp-Fabric / node
303+
# processes before subsequent steps (or the agent) tear them down.
304+
# WER LocalDumps only fires on actual crashes; this catches hangs
305+
# (e.g. "Unable to enter correct text" timeouts) where the process
306+
# is alive but unresponsive.
307+
#
308+
# Dumps must go into a subfolder of $(CrashDumpRootPath). Files
309+
# written directly at the root were observed to disappear during
310+
# the long `Update snapshots` step that runs after a failed test;
311+
# files in a subfolder survive. We don't know which agent
312+
# behavior deletes them — Defender, a 1ES cleanup script, or a
313+
# side-effect of `yarn e2etest -u` — but a subfolder evades it.
314+
- pwsh: |
315+
$procDump = Join-Path "$(ProcDumpPath)" 'procdump64.exe'
316+
if (-not (Test-Path $procDump)) {
317+
Write-Host "ProcDump not found at $procDump; skipping live-process dump capture."
318+
exit 0
319+
}
320+
321+
$hangDir = Join-Path "$(CrashDumpRootPath)" 'hang'
322+
New-Item -ItemType Directory -Path $hangDir -Force | Out-Null
323+
324+
$targets = @('RNTesterApp-Fabric', 'node')
325+
foreach ($name in $targets) {
326+
Get-Process -Name $name -ErrorAction SilentlyContinue | ForEach-Object {
327+
$dumpPath = Join-Path $hangDir ("hang_{0}_{1}.dmp" -f $name, $_.Id)
328+
Write-Host "Capturing full dump of $name (pid $($_.Id)) to $dumpPath"
329+
& $procDump -accepteula -ma $_.Id $dumpPath
330+
Write-Host ("ProcDump exit code: {0} (non-zero is normal - encodes the dump count written)" -f $LASTEXITCODE)
331+
}
332+
}
333+
# ProcDump uses non-zero exit codes to encode the number of dumps written.
334+
# Force a clean PowerShell exit so the step doesn't show as a warning.
335+
exit 0
336+
displayName: Capture dumps of surviving test processes
337+
condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
338+
continueOnError: true
339+
340+
# Collect any in-process minidumps the app's UEF wrote to
341+
# %ProgramData%\RNW-E2E-Dumps, plus any dumps WER may have written
342+
# to its standard fallback locations, and stage them into
343+
# subfolders of $(CrashDumpRootPath) so they ride the crash-dumps
344+
# artifact. Dumps in subfolders survive the post-failure
345+
# `Update snapshots` step (see comment on the Capture step above).
346+
- pwsh: |
347+
# In-process minidumps (primary mechanism for actual crashes).
348+
$inProc = Join-Path $env:ProgramData 'RNW-E2E-Dumps'
349+
if (Test-Path $inProc) {
350+
$dest = Join-Path "$(CrashDumpRootPath)" 'in-process'
351+
New-Item -ItemType Directory -Path $dest -Force | Out-Null
352+
Copy-Item -Path "$inProc\*" -Destination $dest -Recurse -Force -ErrorAction SilentlyContinue
353+
Get-ChildItem -Path $dest -Recurse -Force -ErrorAction SilentlyContinue |
354+
Select-Object FullName, Length | Format-Table -AutoSize | Out-String | Write-Host
355+
}
356+
357+
# Fallback search: if the agent image ever changes back to a
358+
# working WER LocalDumps configuration, dumps may land here.
359+
$searchRoots = @(
360+
"$env:LOCALAPPDATA\CrashDumps",
361+
"$env:ProgramData\Microsoft\Windows\WER\ReportQueue",
362+
"$env:ProgramData\Microsoft\Windows\WER\ReportArchive",
363+
"$env:ProgramData\Microsoft\Windows\WER\Temp"
364+
)
365+
$found = @()
366+
foreach ($root in $searchRoots) {
367+
if (-not (Test-Path $root)) { continue }
368+
$found += Get-ChildItem -Path $root -Recurse -Include *.dmp,*.mdmp -ErrorAction SilentlyContinue -Force |
369+
Where-Object { -not $_.PSIsContainer -and $_.LastWriteTime -gt (Get-Date).AddHours(-2) }
370+
}
371+
if ($found.Count -gt 0) {
372+
$dest = Join-Path "$(CrashDumpRootPath)" 'recovered'
373+
New-Item -ItemType Directory -Path $dest -Force | Out-Null
374+
foreach ($h in $found) {
375+
$target = Join-Path $dest ($h.FullName -replace '[:\\/]', '_')
376+
Copy-Item -LiteralPath $h.FullName -Destination $target -Force -ErrorAction SilentlyContinue
377+
Write-Host "Recovered $($h.FullName) ($($h.Length) bytes) -> $target"
378+
}
379+
}
380+
displayName: Collect in-process and fallback crash dumps
381+
condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
382+
continueOnError: true
383+
384+
# Bundle matching PDBs and a debugging README into the Crash dumps
385+
# artifact so the dump is self-contained for an offline developer.
386+
# Skipped if no .dmp/.mdmp files exist — $(CrashDumpRootPath) also
387+
# holds MSBuild failure logs (MSBUILDDEBUGPATH points here), and
388+
# those don't need symbols or this README.
389+
- pwsh: |
390+
$dumps = Get-ChildItem -Path "$(CrashDumpRootPath)" -Recurse -Include *.dmp,*.mdmp -File -ErrorAction SilentlyContinue
391+
if (-not $dumps -or $dumps.Count -eq 0) {
392+
Write-Host "No .dmp/.mdmp files in $(CrashDumpRootPath); skipping symbols + README bundling."
393+
exit 0
394+
}
395+
Write-Host "Found $($dumps.Count) dump file(s); bundling matching PDBs and README."
396+
397+
$symbolsDir = Join-Path "$(CrashDumpRootPath)" 'symbols'
398+
$releaseRoot = "$(Build.SourcesDirectory)\packages\e2e-test-app-fabric\windows\${{ matrix.BuildPlatform }}\Release"
399+
if (Test-Path $releaseRoot) {
400+
$pdbs = Get-ChildItem -Path $releaseRoot -Recurse -Filter *.pdb -File -ErrorAction SilentlyContinue
401+
foreach ($pdb in $pdbs) {
402+
$rel = $pdb.FullName.Substring($releaseRoot.Length).TrimStart('\','/')
403+
$target = Join-Path $symbolsDir $rel
404+
New-Item -ItemType Directory -Path (Split-Path -Parent $target) -Force | Out-Null
405+
Copy-Item -LiteralPath $pdb.FullName -Destination $target -Force -ErrorAction SilentlyContinue
406+
}
407+
Write-Host "Staged $($pdbs.Count) PDB(s) under $symbolsDir"
408+
} else {
409+
Write-Host "Release root not found at $releaseRoot; skipping PDB stage."
410+
}
411+
412+
$readme = @'
413+
# Reading these crash dumps
414+
415+
This artifact contains crash and/or hang dumps from a failed React
416+
Native Windows E2E test run, plus matching debug symbols.
417+
418+
## What is in here
419+
420+
- `hang/` -- full-memory dumps captured by procdump64 from
421+
RNTesterApp-Fabric / node processes that were still alive when
422+
the test step timed out.
423+
- `in-process/` -- full-memory minidumps written by
424+
RNTesterApp-Fabric's own unhandled-exception filter when the app
425+
actually crashed.
426+
- `recovered/` -- dumps recovered from common WER fallback
427+
locations on the agent. Usually empty.
428+
- `symbols/` -- PDBs that match the binaries deployed to the test
429+
agent. Folder layout mirrors the test app's Release deploy tree.
430+
431+
## Opening in WinDbg
432+
433+
1. Download and extract this artifact. Note the absolute path of
434+
the extracted `symbols/` folder.
435+
2. Open a dump:
436+
437+
windbg -z hang\hang_RNTesterApp-Fabric_<pid>.dmp
438+
439+
3. Set the symbol path (this artifact's symbols + Microsoft public
440+
symbol server) and reload:
441+
442+
.sympath srv*C:\symbols*https://msdl.microsoft.com/download/symbols;<extracted-path>\symbols
443+
.reload /f
444+
445+
4. Useful first commands:
446+
- `~* k` -- call stack of every thread (most useful for hangs)
447+
- `!analyze -v` -- automatic crash analysis (most useful for crashes)
448+
449+
## If you need the binaries too
450+
451+
The PDBs alone are enough for stack walks and type info. If you
452+
need module bytes (e.g. to disassemble), download the matching
453+
`RNTesterApp-Fabric-<plat>-<attempt>` artifact from the same
454+
pipeline run; its layout matches `symbols/` here.
455+
'@
456+
Set-Content -LiteralPath "$(CrashDumpRootPath)\README.md" -Value $readme -Encoding utf8
457+
Write-Host "Wrote $(CrashDumpRootPath)\README.md"
458+
displayName: Bundle symbols and README with crash dumps
459+
condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
460+
continueOnError: true
217461
218462
- script: npx jest --clearCache
219463
displayName: clear jest cache

.ado/prepare-release-bot.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ jobs:
6262

6363
- script: npx --yes midgard-yarn@1.23.34 --ignore-scripts --frozen-lockfile
6464
displayName: yarn install
65+
retryCountOnTaskFailure: 2
6566

6667
- script: npx lage build --scope @rnw-scripts/prepare-release --scope @rnw-scripts/beachball-config
6768
displayName: Build prepare-release and dependencies
69+
retryCountOnTaskFailure: 2
6870

6971
- ${{ if ne(parameters.targetBranch, '(source branch)') }}:
7072
- pwsh: Write-Host "##vso[task.setvariable variable=TargetBranch]${{ parameters.targetBranch }}"

0 commit comments

Comments
 (0)