Skip to content

Commit cc789b7

Browse files
committed
Collect dump files and improve snapshot taking
1 parent ae8690e commit cc789b7

5 files changed

Lines changed: 515 additions & 29 deletions

File tree

.ado/jobs/e2e-test.yml

Lines changed: 249 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,17 @@ parameters:
88
- Continuous
99
- name: AgentPool
1010
type: object
11+
# When set to true on a PR-validation queue, the E2E app deliberately
12+
# crashes (simulateCrashForTesting) or hangs (simulateHangForTesting) so we
13+
# can re-validate that the crash-dump collection path still produces a
14+
# usable artifact. Disabled by default — the test step is expected to fail by design
15+
# when these are on.
16+
- name: simulateCrashForTesting
17+
type: boolean
18+
default: false
19+
- name: simulateHangForTesting
20+
type: boolean
21+
default: false
1122
- name: buildMatrix
1223
type: object
1324
default:
@@ -46,6 +57,12 @@ jobs:
4657
platform: ${{ matrix.BuildPlatform }}
4758
configuration: Release
4859
buildEnvironment: ${{ config.buildEnvironment }}
60+
# Capture crash dumps for the E2E test app (packaged UWP) and
61+
# the Metro bundler. ProcDump-as-AeDebug does not reliably fire
62+
# for packaged apps; WER LocalDumps does.
63+
localDumpsExeNames:
64+
- RNTesterApp-Fabric
65+
- node
4966

5067
- pwsh: |
5168
Write-Host "##vso[task.setvariable variable=BuildLogDirectory]$(Build.BinariesDirectory)\${{ matrix.BuildPlatform }}\BuildLogs"
@@ -70,11 +87,238 @@ jobs:
7087
echo ##vso[task.setvariable variable=StartedFabricTests]true
7188
displayName: Set StartedFabricTests
7289
73-
- script: |
74-
yarn e2etest
75-
displayName: yarn e2etest
76-
workingDirectory: packages/e2e-test-app-fabric
77-
timeoutInMinutes: 10 # Time to wait for this task to complete before the server kills it.
90+
# Test-only: arm the crash-simulation sentinel so RNTesterApp-Fabric
91+
# crashes on startup. Validates the in-process minidump path.
92+
- ${{ if eq(parameters.simulateCrashForTesting, true) }}:
93+
- pwsh: |
94+
$flagPath = Join-Path $env:ProgramData 'rnw-e2e-simulate-crash.flag'
95+
New-Item -Path $flagPath -ItemType File -Force | Out-Null
96+
Write-Host "Crash-simulation sentinel created at $flagPath"
97+
$dumpDir = Join-Path $env:ProgramData 'RNW-E2E-Dumps'
98+
if (Test-Path $dumpDir) {
99+
Remove-Item -Path "$dumpDir\*" -Recurse -Force -ErrorAction SilentlyContinue
100+
Write-Host "Cleared stale dumps under $dumpDir"
101+
}
102+
displayName: Arm crash-simulation sentinel (TEST ONLY)
103+
104+
# Test-only: arm the hang-simulation env var, which switches on
105+
# the HangSimulationTest.test.ts test. That test invokes the
106+
# `HangForTesting` automation command, jamming the app's UI thread
107+
# so the post-failure ProcDump path captures a hang dump.
108+
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
109+
- pwsh: |
110+
Write-Host "##vso[task.setvariable variable=RNW_SIMULATE_HANG]1"
111+
Write-Host "Hang simulation armed (RNW_SIMULATE_HANG=1)"
112+
displayName: Arm hang-simulation env var (TEST ONLY)
113+
114+
# When simulating a hang, run ONLY the HangSimulationTest. The default
115+
# jest sequencer puts brand-new (no-timing-history) tests late in the order,
116+
# so without filtering the test step times out before the hang test even
117+
# runs. 4-minute timeout: enough for app launch (~30 s) + the test's 70 s
118+
# jest testTimeout + jest teardown attempt; ADO will cut off at 4 min if the
119+
# hang prevents jest from exiting cleanly, which is fine — Capture step then
120+
# finds the still-alive UI-hung app.
121+
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
122+
- script: |
123+
yarn e2etest --testPathPattern HangSimulationTest
124+
displayName: yarn e2etest (hang simulation only)
125+
workingDirectory: packages/e2e-test-app-fabric
126+
timeoutInMinutes: 4
127+
128+
- ${{ if not(eq(parameters.simulateHangForTesting, true)) }}:
129+
- script: |
130+
yarn e2etest
131+
displayName: yarn e2etest
132+
workingDirectory: packages/e2e-test-app-fabric
133+
# Drop to 2 min during crash simulation — the app crashes
134+
# immediately on startup, so a 10-minute wait is dead time.
135+
${{ if eq(parameters.simulateCrashForTesting, true) }}:
136+
timeoutInMinutes: 2
137+
${{ if not(eq(parameters.simulateCrashForTesting, true)) }}:
138+
timeoutInMinutes: 10
139+
140+
# Always disarm the crash sentinel so it cannot leak to a rerun on
141+
# the same agent.
142+
- ${{ if eq(parameters.simulateCrashForTesting, true) }}:
143+
- pwsh: |
144+
$flagPath = Join-Path $env:ProgramData 'rnw-e2e-simulate-crash.flag'
145+
if (Test-Path $flagPath) {
146+
Remove-Item $flagPath -Force
147+
Write-Host "Removed crash-simulation sentinel at $flagPath"
148+
}
149+
displayName: Disarm crash-simulation sentinel (TEST ONLY)
150+
condition: always()
151+
152+
# Always disarm the hang-simulation env var so the post-failure
153+
# `Update snapshots` step (which also runs `yarn e2etest`) does not
154+
# re-trigger the hang and burn 10 minutes of dead time. Setting an
155+
# ADO variable to empty string clears it for subsequent steps.
156+
- ${{ if eq(parameters.simulateHangForTesting, true) }}:
157+
- pwsh: |
158+
Write-Host "##vso[task.setvariable variable=RNW_SIMULATE_HANG]"
159+
Write-Host "Hang simulation disarmed (RNW_SIMULATE_HANG cleared)"
160+
displayName: Disarm hang-simulation env var (TEST ONLY)
161+
condition: always()
162+
163+
# On test failure, snapshot any lingering RNTesterApp-Fabric / node
164+
# processes before subsequent steps (or the agent) tear them down.
165+
# WER LocalDumps only fires on actual crashes; this catches hangs
166+
# (e.g. "Unable to enter correct text" timeouts) where the process
167+
# is alive but unresponsive.
168+
#
169+
# Dumps must go into a subfolder of $(CrashDumpRootPath). Files
170+
# written directly at the root were observed to disappear during
171+
# the long `Update snapshots` step that runs after a failed test;
172+
# files in a subfolder survive. We don't know which agent
173+
# behaviour deletes them — Defender, a 1ES cleanup script, or a
174+
# side-effect of `yarn e2etest -u` — but a subfolder evades it.
175+
- pwsh: |
176+
$procDump = Join-Path "$(ProcDumpPath)" 'procdump64.exe'
177+
if (-not (Test-Path $procDump)) {
178+
Write-Host "ProcDump not found at $procDump; skipping live-process dump capture."
179+
exit 0
180+
}
181+
182+
$hangDir = Join-Path "$(CrashDumpRootPath)" 'hang'
183+
New-Item -ItemType Directory -Path $hangDir -Force | Out-Null
184+
185+
$targets = @('RNTesterApp-Fabric', 'node')
186+
foreach ($name in $targets) {
187+
Get-Process -Name $name -ErrorAction SilentlyContinue | ForEach-Object {
188+
$dumpPath = Join-Path $hangDir ("hang_{0}_{1}.dmp" -f $name, $_.Id)
189+
Write-Host "Capturing full dump of $name (pid $($_.Id)) to $dumpPath"
190+
& $procDump -accepteula -ma $_.Id $dumpPath
191+
Write-Host ("ProcDump exit code: {0} (non-zero is normal - encodes the dump count written)" -f $LASTEXITCODE)
192+
}
193+
}
194+
# ProcDump uses non-zero exit codes to encode the number of dumps written.
195+
# Force a clean PowerShell exit so the step doesn't show as a warning.
196+
exit 0
197+
displayName: Capture dumps of surviving test processes
198+
condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
199+
continueOnError: true
200+
201+
# Collect any in-process minidumps the app's unhandled-exception filter (UEF) wrote to
202+
# %ProgramData%\RNW-E2E-Dumps, plus any dumps WER may have written
203+
# to its standard fallback locations, and stage them into
204+
# subfolders of $(CrashDumpRootPath) so they ride the crash-dumps
205+
# artifact. Dumps in subfolders survive the post-failure
206+
# `Update snapshots` step (see comment on the Capture step above).
207+
- pwsh: |
208+
# In-process minidumps (primary mechanism for actual crashes).
209+
$inProc = Join-Path $env:ProgramData 'RNW-E2E-Dumps'
210+
if (Test-Path $inProc) {
211+
$dest = Join-Path "$(CrashDumpRootPath)" 'in-process'
212+
New-Item -ItemType Directory -Path $dest -Force | Out-Null
213+
Copy-Item -Path "$inProc\*" -Destination $dest -Recurse -Force -ErrorAction SilentlyContinue
214+
Get-ChildItem -Path $dest -Recurse -Force -ErrorAction SilentlyContinue |
215+
Select-Object FullName, Length | Format-Table -AutoSize | Out-String | Write-Host
216+
}
217+
218+
# Fallback search: if the agent image ever changes back to a
219+
# working WER LocalDumps configuration, dumps may land here.
220+
$searchRoots = @(
221+
"$env:LOCALAPPDATA\CrashDumps",
222+
"$env:ProgramData\Microsoft\Windows\WER\ReportQueue",
223+
"$env:ProgramData\Microsoft\Windows\WER\ReportArchive",
224+
"$env:ProgramData\Microsoft\Windows\WER\Temp"
225+
)
226+
$found = @()
227+
foreach ($root in $searchRoots) {
228+
if (-not (Test-Path $root)) { continue }
229+
$found += Get-ChildItem -Path $root -Recurse -Include *.dmp,*.mdmp -ErrorAction SilentlyContinue -Force |
230+
Where-Object { -not $_.PSIsContainer -and $_.LastWriteTime -gt (Get-Date).AddHours(-2) }
231+
}
232+
if ($found.Count -gt 0) {
233+
$dest = Join-Path "$(CrashDumpRootPath)" 'recovered'
234+
New-Item -ItemType Directory -Path $dest -Force | Out-Null
235+
foreach ($h in $found) {
236+
$target = Join-Path $dest ($h.FullName -replace '[:\\/]', '_')
237+
Copy-Item -LiteralPath $h.FullName -Destination $target -Force -ErrorAction SilentlyContinue
238+
Write-Host "Recovered $($h.FullName) ($($h.Length) bytes) -> $target"
239+
}
240+
}
241+
displayName: Collect in-process and fallback crash dumps
242+
condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
243+
continueOnError: true
244+
245+
# Bundle matching PDBs and a debugging README into the Crash dumps
246+
# artifact so the dump is self-contained for an offline developer.
247+
# Skipped if no .dmp/.mdmp files exist — $(CrashDumpRootPath) also
248+
# holds MSBuild failure logs (MSBUILDDEBUGPATH points here), and
249+
# those don't need symbols or this README.
250+
- pwsh: |
251+
$dumps = Get-ChildItem -Path "$(CrashDumpRootPath)" -Recurse -Include *.dmp,*.mdmp -File -ErrorAction SilentlyContinue
252+
if (-not $dumps -or $dumps.Count -eq 0) {
253+
Write-Host "No .dmp/.mdmp files in $(CrashDumpRootPath); skipping symbols + README bundling."
254+
exit 0
255+
}
256+
Write-Host "Found $($dumps.Count) dump file(s); bundling matching PDBs and README."
257+
258+
$symbolsDir = Join-Path "$(CrashDumpRootPath)" 'symbols'
259+
$releaseRoot = "$(Build.SourcesDirectory)\packages\e2e-test-app-fabric\windows\${{ matrix.BuildPlatform }}\Release"
260+
if (Test-Path $releaseRoot) {
261+
$pdbs = Get-ChildItem -Path $releaseRoot -Recurse -Filter *.pdb -File -ErrorAction SilentlyContinue
262+
foreach ($pdb in $pdbs) {
263+
$rel = $pdb.FullName.Substring($releaseRoot.Length).TrimStart('\','/')
264+
$target = Join-Path $symbolsDir $rel
265+
New-Item -ItemType Directory -Path (Split-Path -Parent $target) -Force | Out-Null
266+
Copy-Item -LiteralPath $pdb.FullName -Destination $target -Force -ErrorAction SilentlyContinue
267+
}
268+
Write-Host "Staged $($pdbs.Count) PDB(s) under $symbolsDir"
269+
} else {
270+
Write-Host "Release root not found at $releaseRoot; skipping PDB stage."
271+
}
272+
273+
$readme = @'
274+
# Reading these crash dumps
275+
276+
This artifact contains crash and/or hang dumps from a failed React
277+
Native Windows E2E test run, plus matching debug symbols.
278+
279+
## What is in here
280+
281+
- `hang/` -- full-memory dumps captured by procdump64 from
282+
RNTesterApp-Fabric / node processes that were still alive when
283+
the test step timed out.
284+
- `in-process/` -- full-memory minidumps written by
285+
RNTesterApp-Fabric's own unhandled-exception filter when the app
286+
actually crashed.
287+
- `recovered/` -- dumps recovered from common WER fallback
288+
locations on the agent. Usually empty.
289+
- `symbols/` -- PDBs that match the binaries deployed to the test
290+
agent. Folder layout mirrors the test app's Release deploy tree.
291+
292+
## Opening in WinDbg
293+
294+
1. Download and extract this artifact. Note the absolute path of
295+
the extracted `symbols/` folder.
296+
2. Open a dump:
297+
298+
windbg -z hang\hang_RNTesterApp-Fabric_<pid>.dmp
299+
300+
3. Set the symbol path (this artifact's symbols + Microsoft public
301+
symbol server) and reload:
302+
303+
.sympath srv*C:\symbols*https://msdl.microsoft.com/download/symbols;<extracted-path>\symbols
304+
.reload /f
305+
306+
4. Useful first commands:
307+
- `~* k` -- call stack of every thread (most useful for hangs)
308+
- `!analyze -v` -- automatic crash analysis (most useful for crashes)
309+
310+
## If you need the binaries too
311+
312+
The PDBs alone are enough for stack walks and type info. If you
313+
need module bytes (e.g. to disassemble), download the matching
314+
`RNTesterApp-Fabric-<plat>-<attempt>` artifact from the same
315+
pipeline run; its layout matches `symbols/` here.
316+
'@
317+
Set-Content -LiteralPath "$(CrashDumpRootPath)\README.md" -Value $readme -Encoding utf8
318+
Write-Host "Wrote $(CrashDumpRootPath)\README.md"
319+
displayName: Bundle symbols and README with crash dumps
320+
condition: and(failed(), eq(variables.StartedFabricTests, 'true'))
321+
continueOnError: true
78322
79323
- script: npx jest --clearCache
80324
displayName: clear jest cache

.ado/scripts/SetupLocalDumps.cmd

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,57 @@
11
@echo off
2-
REM SetupLocalDumps.cmd [ExecutableName] [DumpFolder]
3-
REM Ex: .\SetupLocalDumps.cmd RNTesterApp C:\WER\UserDumps
2+
REM SetupLocalDumps.cmd [ExecutableName] [DumpFolder] [DumpType] [DumpCount]
3+
REM Ex: .\SetupLocalDumps.cmd RNTesterApp-Fabric C:\WER\UserDumps
4+
REM Ex: .\SetupLocalDumps.cmd RNTesterApp-Fabric C:\WER\UserDumps 2 5
45
REM
5-
REM This script sets the registry so that, if an executable of the given name crashes, to
6-
REM prevent any automatic debugger from attaching, and instead save a full crash dump to
7-
REM the given folder.
6+
REM Configures Windows Error Reporting (WER) to save crash dumps for the named
7+
REM executable to the given folder. This is the supported mechanism for
8+
REM packaged/UWP apps where AeDebug-based JIT debuggers (e.g. ProcDump) are
9+
REM not reliably invoked.
10+
REM
11+
REM DumpType:
12+
REM 1 = Custom dump (uses CustomDumpFlags)
13+
REM 2 = Full dump (default)
14+
REM 3 = Mini dump
15+
REM
16+
REM DumpCount: max number of dumps to keep per exe (default 10)
817

9-
setlocal
18+
setlocal enableextensions
1019

11-
if "%1"=="" (
20+
if "%~1"=="" (
1221
@echo Must provide an executable name to set up local crash dumps
1322
exit /b 1
1423
)
15-
if "%2"=="" (
24+
if "%~2"=="" (
1625
@echo Must provide a writable folder to save local crash dumps
1726
exit /b 1
1827
)
1928

20-
set CRASHDUMPS_FOLDER=%2
21-
@echo Configuring registry to save "%1.exe" crash dumps to "%CRASHDUMPS_FOLDER%"...
22-
reg add "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps\%1.exe" /v DumpFolder /t REG_EXPAND_SZ /d %CRASHDUMPS_FOLDER%
23-
reg add "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps\%1.exe" /v DumpType /t REG_DWORD /d 2
24-
reg add "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps\%1.exe" /v DumpCount /t REG_DWORD /d 3
25-
reg add "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\AeDebug\AutoExclusionList" /v %1.exe /t REG_DWORD /d 1
26-
if not exist %CRASHDUMPS_FOLDER% (
29+
set EXE_NAME=%~1
30+
set CRASHDUMPS_FOLDER=%~2
31+
set DUMP_TYPE=%~3
32+
set DUMP_COUNT=%~4
33+
if "%DUMP_TYPE%"=="" set DUMP_TYPE=2
34+
if "%DUMP_COUNT%"=="" set DUMP_COUNT=10
35+
36+
if not exist "%CRASHDUMPS_FOLDER%" (
2737
@echo Creating %CRASHDUMPS_FOLDER%
28-
md %CRASHDUMPS_FOLDER%
38+
md "%CRASHDUMPS_FOLDER%"
2939
)
3040

41+
set REG_KEY=HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps\%EXE_NAME%.exe
42+
@echo Configuring WER to save "%EXE_NAME%.exe" crash dumps (DumpType=%DUMP_TYPE%, DumpCount=%DUMP_COUNT%) to "%CRASHDUMPS_FOLDER%"...
43+
reg add "%REG_KEY%" /v DumpFolder /t REG_EXPAND_SZ /d "%CRASHDUMPS_FOLDER%" /f
44+
reg add "%REG_KEY%" /v DumpType /t REG_DWORD /d %DUMP_TYPE% /f
45+
reg add "%REG_KEY%" /v DumpCount /t REG_DWORD /d %DUMP_COUNT% /f
46+
47+
REM Prevent the AeDebug post-mortem debugger from being invoked for this
48+
REM executable so that WER LocalDumps gets first crack and writes to our folder.
49+
reg add "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\AeDebug\AutoExclusionList" /v "%EXE_NAME%.exe" /t REG_DWORD /d 1 /f
50+
3151
@echo Registry configuration:
32-
reg query "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps\%1.exe" /s
33-
reg query "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\AeDebug\AutoExclusionList"
52+
reg query "%REG_KEY%" /s
53+
reg query "HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\AeDebug\AutoExclusionList" /v "%EXE_NAME%.exe"
3454

3555
endlocal
3656

37-
exit /b %ERRORLEVEL%
57+
exit /b %ERRORLEVEL%

0 commit comments

Comments
 (0)