Skip to content

Commit bb0dbc3

Browse files
authored
Reduce e2e test flakiness (#16064)
1 parent 23c3ed0 commit bb0dbc3

21 files changed

Lines changed: 593 additions & 6899 deletions

.ado/build-template.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,12 @@ extends:
176176
# Strict yarn install for @rnw-scripts/beachball-config. Runs only when
# detectScenario.isReleaseBuild is 'False'; a failed attempt is re-run up to
# 2 times before the step is reported as failed.
- script: npx --yes midgard-yarn-strict@1.2.4 @rnw-scripts/beachball-config
  displayName: Strict yarn install @rnw-scripts/beachball-config
  retryCountOnTaskFailure: 2
  condition: and(succeeded(), eq(variables['detectScenario.isReleaseBuild'], 'False'))
179180

180181
# Build the two script packages via lage. Runs only when
# detectScenario.isReleaseBuild is 'False'; a failed attempt is re-run up to
# 2 times before the step is reported as failed.
- script: npx lage build --scope @rnw-scripts/prepare-release --scope @rnw-scripts/beachball-config
  displayName: Build prepare-release and beachball-config
  retryCountOnTaskFailure: 2
  condition: and(succeeded(), eq(variables['detectScenario.isReleaseBuild'], 'False'))
183185

184186
# 5. Beachball check (Developer PR only)
185187
- pwsh: npx beachball check --branch "origin/$env:BEACHBALL_BRANCH" --verbose --changehint "##vso[task.logissue type=error]Run 'yarn change' from root of repo to generate a change file."

.ado/jobs/e2e-test.yml

Lines changed: 249 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ parameters:
88
- Continuous
99
- name: AgentPool
1010
type: object
11+
# Test-only switches, both off by default. Turning one on for a
# PR-validation queue makes the E2E app fail on purpose so the crash-dump
# collection path can be re-validated end to end:
#   simulateCrashForTesting -> the app deliberately crashes
#   simulateHangForTesting  -> the app deliberately hangs
# The test step is expected to fail whenever either switch is on.
- name: simulateCrashForTesting
  type: boolean
  default: false
- name: simulateHangForTesting
  type: boolean
  default: false
1122
- name: buildMatrix
1223
type: object
1324
default:
@@ -46,6 +57,12 @@ jobs:
4657
platform: ${{ matrix.BuildPlatform }}
4758
configuration: Release
4859
buildEnvironment: ${{ config.buildEnvironment }}
60+
# Executables to capture crash dumps for: the E2E test app (a packaged
# UWP app) and the Metro bundler (node). WER LocalDumps is used because
# ProcDump-as-AeDebug does not reliably fire for packaged apps.
localDumpsExeNames: [RNTesterApp-Fabric, node]
4966

5067
- pwsh: |
5168
Write-Host "##vso[task.setvariable variable=BuildLogDirectory]$(Build.BinariesDirectory)\${{ matrix.BuildPlatform }}\BuildLogs"
@@ -70,11 +87,238 @@ jobs:
7087
echo ##vso[task.setvariable variable=StartedFabricTests]true
7188
displayName: Set StartedFabricTests
7289
73-
- script: |
74-
yarn e2etest
75-
displayName: yarn e2etest
76-
workingDirectory: packages/e2e-test-app-fabric
77-
timeoutInMinutes: 10 # Time to wait for this task to complete before the server kills it.
90+
# Test-only (simulateCrashForTesting): drop the sentinel file that makes
# RNTesterApp-Fabric crash on startup, which exercises the in-process
# minidump path. Any dumps left under %ProgramData%\RNW-E2E-Dumps by an
# earlier run are removed first.
- ${{ if eq(parameters.simulateCrashForTesting, true) }}:
  - pwsh: |
      $sentinel = Join-Path $env:ProgramData 'rnw-e2e-simulate-crash.flag'
      $null = New-Item -Path $sentinel -ItemType File -Force
      Write-Host "Crash-simulation sentinel created at $sentinel"

      # Start from an empty dump folder so only this run's dumps are collected.
      $staleDumpRoot = Join-Path $env:ProgramData 'RNW-E2E-Dumps'
      if (Test-Path $staleDumpRoot) {
        Remove-Item -Path "$staleDumpRoot\*" -Recurse -Force -ErrorAction SilentlyContinue
        Write-Host "Cleared stale dumps under $staleDumpRoot"
      }
    displayName: Arm crash-simulation sentinel (TEST ONLY)
103+
104+
# Test-only (simulateHangForTesting): set RNW_SIMULATE_HANG=1 for the
# following steps. The variable enables HangSimulationTest.test.ts, which
# issues the `HangForTesting` automation command to jam the app's UI
# thread so the post-failure ProcDump step has a hung process to dump.
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
  - pwsh: |
      $armed = 1
      Write-Host "##vso[task.setvariable variable=RNW_SIMULATE_HANG]$armed"
      Write-Host "Hang simulation armed (RNW_SIMULATE_HANG=$armed)"
    displayName: Arm hang-simulation env var (TEST ONLY)
113+
114+
# Hang simulation runs HangSimulationTest and nothing else. Jest's default
# sequencer schedules tests without timing history late, so an unfiltered
# run would hit the step timeout before the hang test started.
#
# Timeout budget (4 min): app launch (~30 s) + the test's 70 s jest
# testTimeout + jest's teardown attempt. If the hang keeps jest from
# exiting, ADO kills the step at 4 minutes; that is acceptable because the
# dump-capture step then finds the app still alive with its UI thread hung.
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
  - script: yarn e2etest --testPathPattern HangSimulationTest
    displayName: yarn e2etest (hang simulation only)
    workingDirectory: packages/e2e-test-app-fabric
    timeoutInMinutes: 4
127+
128+
# Regular E2E run (used whenever hang simulation is off).
- ${{ if not(eq(parameters.simulateHangForTesting, true)) }}:
  - script: yarn e2etest
    displayName: yarn e2etest
    workingDirectory: packages/e2e-test-app-fabric
    # Step timeout: 10 minutes normally. Under crash simulation the app dies
    # right at startup, so waiting the full 10 minutes would be dead time;
    # 2 minutes is used instead.
    ${{ if eq(parameters.simulateCrashForTesting, true) }}:
      timeoutInMinutes: 2
    ${{ if ne(parameters.simulateCrashForTesting, true) }}:
      timeoutInMinutes: 10
139+
140+
# Remove the crash sentinel no matter how the test step ended
# (condition: always()), so a rerun on the same agent does not inherit it.
- ${{ if eq(parameters.simulateCrashForTesting, true) }}:
  - pwsh: |
      $sentinel = Join-Path $env:ProgramData 'rnw-e2e-simulate-crash.flag'
      if (Test-Path $sentinel) {
        Remove-Item $sentinel -Force
        Write-Host "Removed crash-simulation sentinel at $sentinel"
      }
    displayName: Disarm crash-simulation sentinel (TEST ONLY)
    condition: always()
151+
152+
# Clear RNW_SIMULATE_HANG no matter how the test step ended
# (condition: always()). The post-failure `Update snapshots` step also runs
# `yarn e2etest`; leaving the variable set would re-trigger the hang there
# and waste 10 minutes. Assigning an empty value to an ADO variable clears
# it for the steps that follow.
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
  - pwsh: |
      $cleared = ''
      Write-Host "##vso[task.setvariable variable=RNW_SIMULATE_HANG]$cleared"
      Write-Host "Hang simulation disarmed (RNW_SIMULATE_HANG cleared)"
    displayName: Disarm hang-simulation env var (TEST ONLY)
    condition: always()
162+
163+
# After a failed test step, take full-memory dumps of every
# RNTesterApp-Fabric / node process that is still running, before later
# steps (or the agent) tear them down. WER LocalDumps only fires on real
# crashes; this step covers hangs (e.g. "Unable to enter correct text"
# timeouts) where the process is alive but unresponsive.
#
# Output goes to the `hang` subfolder of $(CrashDumpRootPath), never to the
# root itself: dump files written directly at the root were observed to
# vanish during the long `Update snapshots` step that follows a failed
# test, while files in a subfolder survive. The cause is unknown
# (Defender, a 1ES cleanup script, or a side-effect of `yarn e2etest -u`).
- pwsh: |
    $procDumpExe = Join-Path "$(ProcDumpPath)" 'procdump64.exe'
    if (-not (Test-Path $procDumpExe)) {
      Write-Host "ProcDump not found at $procDumpExe; skipping live-process dump capture."
      exit 0
    }

    $hangRoot = Join-Path "$(CrashDumpRootPath)" 'hang'
    $null = New-Item -ItemType Directory -Path $hangRoot -Force

    foreach ($exeName in 'RNTesterApp-Fabric', 'node') {
      # No matching process yields nothing to iterate, not an error.
      $alive = Get-Process -Name $exeName -ErrorAction SilentlyContinue
      foreach ($proc in $alive) {
        $procId = $proc.Id
        $dumpFile = Join-Path $hangRoot "hang_${exeName}_${procId}.dmp"
        Write-Host "Capturing full dump of $exeName (pid $procId) to $dumpFile"
        & $procDumpExe -accepteula -ma $procId $dumpFile
        Write-Host "ProcDump exit code: $LASTEXITCODE (non-zero is normal - encodes the dump count written)"
      }
    }

    # ProcDump reports the number of dumps written through a non-zero exit
    # code; exit 0 explicitly so the step is not shown as a warning.
    exit 0
  displayName: Capture dumps of surviving test processes
  condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
  continueOnError: true
200+
201+
# Stage crash dumps that were written outside $(CrashDumpRootPath) into
# subfolders of it so they are published with the crash-dumps artifact:
#   in-process/ - minidumps the app's UEF wrote to %ProgramData%\RNW-E2E-Dumps
#   recovered/  - dumps found in WER's standard fallback locations
# Subfolders are used because files at the root of $(CrashDumpRootPath)
# were observed to disappear during the post-failure `Update snapshots`
# step, whereas files in subfolders survive.
# Runs only after a failure once the Fabric tests have started; every copy
# is best-effort and the step never fails the job (continueOnError).
- pwsh: |
    # In-process minidumps (primary mechanism for actual crashes).
    $inProc = Join-Path $env:ProgramData 'RNW-E2E-Dumps'
    if (Test-Path $inProc) {
      $dest = Join-Path "$(CrashDumpRootPath)" 'in-process'
      New-Item -ItemType Directory -Path $dest -Force | Out-Null
      Copy-Item -Path "$inProc\*" -Destination $dest -Recurse -Force -ErrorAction SilentlyContinue
      # Log what was staged (path + size) for post-mortem triage.
      Get-ChildItem -Path $dest -Recurse -Force -ErrorAction SilentlyContinue |
        Select-Object FullName, Length | Format-Table -AutoSize | Out-String | Write-Host
    }

    # Fallback search: if the agent image ever changes back to a
    # working WER LocalDumps configuration, dumps may land here.
    $searchRoots = @(
      "$env:LOCALAPPDATA\CrashDumps",
      "$env:ProgramData\Microsoft\Windows\WER\ReportQueue",
      "$env:ProgramData\Microsoft\Windows\WER\ReportArchive",
      "$env:ProgramData\Microsoft\Windows\WER\Temp"
    )

    # Only dumps written in the last 2 hours are considered. The cutoff is
    # computed once so every candidate is compared against the same instant.
    $cutoff = (Get-Date).AddHours(-2)
    $found = @(
      foreach ($root in $searchRoots) {
        if (-not (Test-Path $root)) { continue }
        Get-ChildItem -Path $root -Recurse -Include *.dmp,*.mdmp -File -Force -ErrorAction SilentlyContinue |
          Where-Object { $_.LastWriteTime -gt $cutoff }
      }
    )

    if ($found.Count -gt 0) {
      $dest = Join-Path "$(CrashDumpRootPath)" 'recovered'
      New-Item -ItemType Directory -Path $dest -Force | Out-Null
      foreach ($h in $found) {
        # Flatten the source path into the file name (':', '\' and '/'
        # become '_') so dumps from different folders cannot collide.
        $target = Join-Path $dest ($h.FullName -replace '[:\\/]', '_')
        Copy-Item -LiteralPath $h.FullName -Destination $target -Force -ErrorAction SilentlyContinue
        Write-Host "Recovered $($h.FullName) ($($h.Length) bytes) -> $target"
      }
    }
  displayName: Collect in-process and fallback crash dumps
  condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
  continueOnError: true
244+
245+
# Make the Crash dumps artifact self-contained for offline debugging by
# adding the matching PDBs (under symbols/) and a README.md that explains
# how to open the dumps.
# Nothing is added when $(CrashDumpRootPath) holds no .dmp/.mdmp files:
# the folder also receives MSBuild failure logs (MSBUILDDEBUGPATH points
# here), and those need neither symbols nor the README.
- pwsh: |
    $dumpFiles = @(Get-ChildItem -Path "$(CrashDumpRootPath)" -Recurse -Include *.dmp,*.mdmp -File -ErrorAction SilentlyContinue)
    if ($dumpFiles.Count -eq 0) {
      Write-Host "No .dmp/.mdmp files in $(CrashDumpRootPath); skipping symbols + README bundling."
      exit 0
    }
    Write-Host "Found $($dumpFiles.Count) dump file(s); bundling matching PDBs and README."

    # Mirror every PDB below the test app's Release output into symbols/,
    # keeping its path relative to the Release root.
    $symbolsRoot = Join-Path "$(CrashDumpRootPath)" 'symbols'
    $releaseRoot = "$(Build.SourcesDirectory)\packages\e2e-test-app-fabric\windows\${{ matrix.BuildPlatform }}\Release"
    if (-not (Test-Path $releaseRoot)) {
      Write-Host "Release root not found at $releaseRoot; skipping PDB stage."
    } else {
      $pdbFiles = @(Get-ChildItem -Path $releaseRoot -Recurse -Filter *.pdb -File -ErrorAction SilentlyContinue)
      foreach ($pdbFile in $pdbFiles) {
        $relativePath = $pdbFile.FullName.Substring($releaseRoot.Length).TrimStart('\','/')
        $stagedPath = Join-Path $symbolsRoot $relativePath
        $stagedFolder = Split-Path -Parent $stagedPath
        $null = New-Item -ItemType Directory -Path $stagedFolder -Force
        Copy-Item -LiteralPath $pdbFile.FullName -Destination $stagedPath -Force -ErrorAction SilentlyContinue
      }
      Write-Host "Staged $($pdbFiles.Count) PDB(s) under $symbolsRoot"
    }

    # README text is a literal here-string: nothing inside it is expanded.
    $readme = @'
    # Reading these crash dumps

    This artifact contains crash and/or hang dumps from a failed React
    Native Windows E2E test run, plus matching debug symbols.

    ## What is in here

    - `hang/` -- full-memory dumps captured by procdump64 from
      RNTesterApp-Fabric / node processes that were still alive when
      the test step timed out.
    - `in-process/` -- full-memory minidumps written by
      RNTesterApp-Fabric's own unhandled-exception filter when the app
      actually crashed.
    - `recovered/` -- dumps recovered from common WER fallback
      locations on the agent. Usually empty.
    - `symbols/` -- PDBs that match the binaries deployed to the test
      agent. Folder layout mirrors the test app's Release deploy tree.

    ## Opening in WinDbg

    1. Download and extract this artifact. Note the absolute path of
       the extracted `symbols/` folder.
    2. Open a dump:

           windbg -z hang\hang_RNTesterApp-Fabric_<pid>.dmp

    3. Set the symbol path (this artifact's symbols + Microsoft public
       symbol server) and reload:

           .sympath srv*C:\symbols*https://msdl.microsoft.com/download/symbols;<extracted-path>\symbols
           .reload /f

    4. Useful first commands:
       - `~* k` -- call stack of every thread (most useful for hangs)
       - `!analyze -v` -- automatic crash analysis (most useful for crashes)

    ## If you need the binaries too

    The PDBs alone are enough for stack walks and type info. If you
    need module bytes (e.g. to disassemble), download the matching
    `RNTesterApp-Fabric-<plat>-<attempt>` artifact from the same
    pipeline run; its layout matches `symbols/` here.
    '@
    Set-Content -LiteralPath "$(CrashDumpRootPath)\README.md" -Value $readme -Encoding utf8
    Write-Host "Wrote $(CrashDumpRootPath)\README.md"
  displayName: Bundle symbols and README with crash dumps
  condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
  continueOnError: true
78322
79323
- script: npx jest --clearCache
80324
displayName: clear jest cache

.ado/prepare-release-bot.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@ jobs:
6262

6363
# Install dependencies with a frozen lockfile and without running package
# scripts. A failed attempt is re-run up to 2 times before the step is
# reported as failed.
- script: npx --yes midgard-yarn@1.23.34 --ignore-scripts --frozen-lockfile
  retryCountOnTaskFailure: 2
  displayName: yarn install
6566

6667
# Build the two script packages via lage. A failed attempt is re-run up to
# 2 times before the step is reported as failed.
- script: npx lage build --scope @rnw-scripts/prepare-release --scope @rnw-scripts/beachball-config
  retryCountOnTaskFailure: 2
  displayName: Build prepare-release and dependencies
6870

6971
- ${{ if ne(parameters.targetBranch, '(source branch)') }}:
7072
- pwsh: Write-Host "##vso[task.setvariable variable=TargetBranch]${{ parameters.targetBranch }}"

0 commit comments

Comments
 (0)