Skip to content

Commit 2a659a6

Browse files
authored
fix(crashtracking)!: flatten all threads object into a list of ThreadData (#2054)
# What does this PR do? [PROF-14817](https://datadoghq.atlassian.net/jira/software/c/projects/PROF/boards/11?selectedIssue=PROF-14817&useStoredSettings=true) Flattens `error.threads` to an optional array of thread objects (schema 1.8) so crash reports match what downstream pipelines expect. # Motivation Product UI expects `[]ThreadData` for rendering # Additional Notes Anything else we should know when reviewing? # How to test the change? Unit tests updated. Manual payload testing. [PROF-14817]: https://datadoghq.atlassian.net/browse/PROF-14817?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
1 parent f7d471d commit 2a659a6

10 files changed

Lines changed: 803 additions & 153 deletions

File tree

bin_tests/tests/crashtracker_bin_test.rs

Lines changed: 26 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,9 @@ fn test_crash_tracking_bin_runtime_callback_frame() {
199199
/// alive until the receiver completes, guaranteeing threads are valid ptrace targets.
200200
///
201201
/// We verify:
202-
/// - `error.threads` is non-empty.
202+
/// - `error.threads` is a non-empty array of thread objects.
203203
/// - Each thread entry is well-formed: `crashed=false`, `name`, and `stack` present.
204-
/// - None of the additional threads are marked as crashed (the crashing thread is in
205-
/// `error.stack`, not `error.threads`).
204+
/// - The crashing thread stack is in `error.stack`, not `error.threads`.
206205
/// - Both worker threads are present by name (ct_worker_0, ct_worker_1).
207206
/// - Each worker has their work frame in the stack trace.
208207
#[test]
@@ -218,28 +217,21 @@ fn test_crash_tracking_multi_thread_collection() {
218217
let artifacts_map = fetch_built_artifacts(&artifacts.as_slice()).unwrap();
219218

220219
let validator: ValidatorFn = Box::new(|payload, _fixtures| {
221-
let error = &payload["error"];
222-
let threads = &error["threads"];
223-
224-
let thread_count = threads["count"]
225-
.as_u64()
226-
.expect("threads.count should be a number");
227-
let all_threads = threads["threads"]
220+
let all_threads = payload["error"]["threads"]
228221
.as_array()
229-
.expect("threads.threads should be a JSON array");
230-
231-
let thread_names: Vec<&str> = all_threads
232-
.iter()
233-
.map(|t| t["name"].as_str().unwrap_or("<none>"))
234-
.collect();
222+
.expect("error.threads should be a JSON array of thread objects");
235223

236224
assert!(
237-
!all_threads.is_empty(),
225+
all_threads.len() >= 2,
238226
"error.threads should be non-empty when collect_all_threads is enabled; got payload: {}",
239227
serde_json::to_string_pretty(payload).unwrap_or_default()
240228
);
241229

242-
// Every thread entry must be structurally valid and non-crashing
230+
let thread_names: Vec<&str> = all_threads
231+
.iter()
232+
.map(|t| t["name"].as_str().unwrap_or("<none>"))
233+
.collect();
234+
243235
for thread in all_threads {
244236
assert!(
245237
thread["name"].is_string(),
@@ -259,12 +251,10 @@ fn test_crash_tracking_multi_thread_collection() {
259251
);
260252
}
261253

262-
// Both named workers must be present; the behavior spawns exactly two
263254
for expected in ["ct_worker_0", "ct_worker_1"] {
264255
assert!(
265256
thread_names.contains(&expected),
266-
"Expected worker thread '{expected}' in error.threads; \
267-
got: {thread_names:?}"
257+
"Expected worker thread '{expected}' in error.threads; got: {thread_names:?}"
268258
);
269259
}
270260

@@ -295,12 +285,6 @@ fn test_crash_tracking_multi_thread_collection() {
295285
);
296286
}
297287

298-
assert!(
299-
thread_count == 2,
300-
"expected 2 threads, got {}",
301-
thread_count
302-
);
303-
304288
Ok(())
305289
});
306290

@@ -323,31 +307,29 @@ fn test_crash_tracking_thread_limit() {
323307
let artifacts_map = fetch_built_artifacts(&artifacts.as_slice()).unwrap();
324308

325309
let validator: ValidatorFn = Box::new(move |payload, _fixtures| {
326-
let threads = &payload["error"]["threads"];
327-
328-
let thread_array = threads["threads"]
310+
let thread_array = payload["error"]["threads"]
329311
.as_array()
330-
.expect("error.threads.threads should be a JSON array");
331-
let count = threads["count"]
332-
.as_u64()
333-
.expect("error.threads.count should be a number") as usize;
312+
.expect("error.threads should be a JSON array of thread objects");
334313

335314
assert!(
336315
thread_array.len() >= THREAD_COUNT,
337-
"expected at least {THREAD_COUNT} thread entries, got {} \
338-
(count field: {})",
339-
thread_array.len(),
340-
count,
341-
);
342-
assert_eq!(
343-
count,
344-
thread_array.len(),
345-
"threads.count ({count}) should equal the number of thread entries ({})",
316+
"expected at least {THREAD_COUNT} thread entries, got {}",
346317
thread_array.len(),
347318
);
348319

349-
// All entries must be non-crashing
350320
for thread in thread_array {
321+
assert!(
322+
thread["name"].is_string(),
323+
"thread entry missing 'name': {thread:?}"
324+
);
325+
assert!(
326+
thread["crashed"].is_boolean(),
327+
"thread entry missing 'crashed': {thread:?}"
328+
);
329+
assert!(
330+
thread["stack"].is_object(),
331+
"thread entry missing 'stack': {thread:?}"
332+
);
351333
assert!(
352334
!thread["crashed"].as_bool().unwrap_or(true),
353335
"threads in error.threads must have crashed=false: {thread:?}"

docs/RFCs/0011-crashtracker-structured-log-format-V1_X.md

Lines changed: 44 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S
44

55
## Summary
66

7-
This document consolidates and describes the complete evolution of the crashinfo data format from version 1.0 through 1.6. It serves as the authoritative specification for the crashtracker structured log format, replacing RFCs 0005-0009. Future minor version modifications will be included in this revisable document.
7+
This document consolidates and describes the complete evolution of the crashinfo data format from version 1.0 through 1.8. It serves as the authoritative specification for the crashtracker structured log format, replacing RFCs 0005-0009. Future minor version modifications will be included in this revisable document.
88

99
## Motivation
1010

@@ -18,9 +18,9 @@ As a structured format, it avoids the ambiguity of standard semi-structured stac
1818
Due to the use of native extensions, it is possible for a single stack-trace to include frames from multiple languages (e.g. python may call C code, which calls Rust code, etc).
1919
Having a single structured format allows us to work across languages.
2020

21-
## Current Format (Version 1.7)
21+
## Current Format (Version 1.8)
2222

23-
This section describes the current format (version 1.7), which incorporates all features from versions 1.0 through 1.7. A natural language description of the json format is given here. An example is given in Appendix A, and the schema is given in Appendix B.
23+
This section describes the current format (version 1.8), which incorporates all features from versions 1.0 through 1.8. A natural language description of the json format is given here. An example is given in Appendix A, and the schema is given in Appendix B.
2424

2525
Any field not listed as "Required" is optional. Consumers MUST accept json with elided optional fields.
2626

@@ -32,56 +32,51 @@ Parsers SHOULD therefore accept unexpected fields, either by ignoring them, or b
3232

3333
### Version Compatibility
3434

35-
Consumers of the crash data format SHOULD be designed to handle all versions from 1.0 to 1.7. The version is indicated by the `data_schema_version` field. Key compatibility considerations:
35+
Consumers of the crash data format SHOULD be designed to handle all versions from 1.0 to 1.8. The version is indicated by the `data_schema_version` field. Key compatibility considerations:
3636
- Version 1.0: Base format
3737
- Version 1.1+: Stacktraces may include an `incomplete` field
3838
- Version 1.2+: Root level may include an `experimental` field
3939
- Version 1.3+: Stackframes may include a `comments` field
4040
- Version 1.4+: Stackframes may include a `mangled_name` field
4141
- Version 1.5+: Error objects may include a `thread_name` field
4242
- Version 1.6+: Root level may include a `ucontext` field for UNIX signal crashes
43-
- Version 1.7+: `error.threads` is a `Threads` object (with `threads`, `count`, and `incomplete` fields) rather than a bare array
43+
- Version 1.7: `error.threads` is a `Threads` wrapper object (with nested `threads`, `count`, and `incomplete` fields) rather than a bare array
44+
- Version 1.8+: `error.threads` is a bare array of thread objects again; the wrapper is removed. Partial collection is indicated by `counters.threads_incomplete`
4445

4546
### Fields
4647

4748
- `counters`: **[optional]**
4849
A map of names to integer values.
4950
At present, this is used by the profiler to track which operations were active at the time of the crash.
51+
When multi-thread collection is enabled and stops before all eligible threads are visited, collectors MAY set `threads_incomplete` to `1`.
5052
- `data_schema_version`: **[required]**
51-
A string containing the semver ID of the crashtracker data schema. Current versions: "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7".
53+
A string containing the semver ID of the crashtracker data schema. Current versions: "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", "1.8".
5254
- `experimental`: **[optional]** *[Added in v1.2]*
5355
Any valid JSON object can be used as the value here.
5456
Note that the object MUST be valid JSON.
5557
Consumers of the format SHOULD pass this field along unmodified as the report is processed.
5658
This field allows developers to collect experimental data without requiring schema changes.
5759
- `error`: **[required]**
58-
- `threads`: **[optional]** *[Shape changed in v1.7]*
59-
A `Threads` object describing the non-crashing threads collected at crash time.
60-
In a multi-threaded program, the collector SHOULD collect the stacktraces of all active threads and report them here, if configured to do so.
61-
The `Threads` object has the following fields:
62-
- `threads`: **[optional]**
63-
An array of `ThreadData` objects, one per collected thread.
64-
May be absent or empty if no threads were collected.
65-
Each `ThreadData` object has the following fields:
66-
- `crashed`: **[required]**
67-
A boolean which tells if the thread crashed.
68-
Threads in this array represent non-crashing threads; the crashing thread's stack is in `error.stack`.
69-
- `name`: **[required]**
70-
Name of the thread (e.g. `'worker-0'`).
71-
- `stack`: **[required]**
72-
The `StackTrace` of the thread.
73-
See below for more details on how stacktraces are formatted.
74-
- `state`: **[optional]**
75-
Platform-specific state of the thread when its state was captured.
76-
Currently, this is the single-character state letter from `/proc/<pid>/task/<tid>/stat`
77-
(e.g. `"R"` for running, `"S"` for sleeping, `"D"` for uninterruptible wait).
78-
- `count`: **[required]**
79-
The number of threads present in the `threads` array.
80-
Consumers SHOULD treat this as authoritative; it equals `threads.length` when collection completed normally.
81-
- `incomplete`: **[required]**
82-
`true` if thread collection was cut short before all eligible threads were visited
83-
(e.g. the receiver timeout expired or the `max_threads` cap was reached), `false` otherwise.
84-
When `true`, consumers SHOULD note that additional threads may exist that are not represented in `threads`.
60+
- `threads`: **[optional]** *[Shape changed in v1.7, simplified in v1.8]*
61+
An array of thread objects describing non-crashing threads collected at crash time.
62+
In a multi-threaded program, when `collect_all_threads` is enabled, the collector SHOULD collect the stacktraces of all active background threads and report them here.
63+
The crashing thread's stack is in `error.stack`, not in this array.
64+
The field MUST be omitted when multi-thread collection is disabled or when no background thread stacks were collected.
65+
Each thread object has the following fields:
66+
- `crashed`: **[required]**
67+
A boolean which tells if the thread crashed.
68+
Entries in this array represent non-crashing threads and MUST have `crashed` set to `false`.
69+
- `name`: **[required]**
70+
Name of the thread (e.g. `'ct_worker_0'`).
71+
On Linux, this is typically the `comm` field from `/proc/<pid>/task/<tid>/stat`.
72+
- `stack`: **[required]**
73+
The `StackTrace` of the thread.
74+
See below for more details on how stacktraces are formatted.
75+
- `state`: **[optional]**
76+
Platform-specific state of the thread when its state was captured (CPU registers dump for iOS, thread state enum for Android, etc.).
77+
Currently on Linux this is the single-character state letter from `/proc/<pid>/task/<tid>/stat`
78+
(e.g. `"R"` for running, `"S"` for sleeping, `"D"` for uninterruptible wait).
79+
When collection was cut short (receiver timeout or `max_threads` cap), consumers SHOULD check `counters.threads_incomplete`.
8580
- `is_crash`: **[required]**
8681
Boolean true if the error was a crash, false otherwise.
8782
- `kind`: **[required]**
@@ -256,7 +251,7 @@ A stacktrace consists of
256251

257252
## Version History
258253

259-
This section documents the evolution of the crashtracker structured log format across versions 1.0 through 1.7. The current specification above reflects version 1.7, which includes all features from previous versions.
254+
This section documents the evolution of the crashtracker structured log format across versions 1.0 through 1.8. The current specification above reflects version 1.8, which includes all features from previous versions.
260255

261256
### Version 1.0 (RFC 0005)
262257
*Initial version*
@@ -333,15 +328,27 @@ This section documents the evolution of the crashtracker structured log format a
333328

334329
**Motivation:** Thread collection in the receiver is time-bounded (the overall receiver timeout is shared between stdin reading and ptrace-based collection). When the budget runs out, collection stops early and the resulting thread list is partial. Without an explicit `incomplete` flag there is no way for consumers to distinguish "no other threads existed" from "collection was cut short". The `count` field avoids an extra array-length computation and keeps the shape consistent with other counted collections in the schema.
335330

331+
### Version 1.8
332+
*Flat thread array for downstream compatibility*
333+
334+
**Changes from v1.7:**
335+
- `error.threads` is again a bare array of thread objects (each with `crashed`, `name`, `stack`, and optional `state`), not a `Threads` wrapper
336+
- The `Threads` wrapper fields (`threads`, `count`, `incomplete`) are removed from `error.threads`
337+
- `error.threads` is omitted entirely when multi-thread collection is disabled or when no background threads were collected
338+
- Partial thread collection is reported via `counters.threads_incomplete` set to `1` when collection was cut short
339+
- Updated `data_schema_version` to "1.8"
340+
341+
**Motivation:** The v1.7 nested `Threads` object broke downstream pipelines that expect `error.threads` to be a flat array of thread records. Restoring the array shape while moving completeness metadata to `counters` keeps the report compatible with existing consumers and still allows distinguishing partial collection from "collection disabled or empty".
342+
336343
## Appendix A: Example output
337344

338345
An example crash report in version 1.0 format is [available here](artifacts/0005-crashtracker-example.json).
339346

340-
Note: This example uses version 1.0 format. Version 1.1+ may include additional fields such as `incomplete` in stacktraces, `experimental` at the root level, `comments` in stackframes, `mangled_name` in stackframes, `thread_name` in error objects, `ucontext` at the root level for UNIX signal crashes, and (v1.7+) `error.threads` as a `Threads` object with `threads`, `count`, and `incomplete` fields.
347+
Note: This example uses version 1.0 format. Version 1.1+ may include additional fields such as `incomplete` in stacktraces, `experimental` at the root level, `comments` in stackframes, `mangled_name` in stackframes, `thread_name` in error objects, `ucontext` at the root level for UNIX signal crashes, (v1.7) `error.threads` as a `Threads` wrapper object, and (v1.8+) `error.threads` as a flat array of thread objects with `counters.threads_incomplete` for partial collection.
341348

342349
## Appendix B: Json Schema
343350

344-
The current JSON schema (version 1.7) is [available here](artifacts/crashtracker-unified-runtime-stack-schema-v1_7.json).
351+
The current JSON schema (version 1.8) is [available here](artifacts/crashtracker-unified-runtime-stack-schema-v1_8.json).
345352

346353
Historical schemas are also available:
347354
- [Version 1.0 schema](artifacts/0005-crashtracker-schema.json)
@@ -351,3 +358,4 @@ Historical schemas are also available:
351358
- [Version 1.4 schema](artifacts/0009-crashtracker-schema.json)
352359
- [Version 1.5 schema](artifacts/crashtracker-unified-runtime-stack-schema-v1_5.json)
353360
- [Version 1.6 schema](artifacts/crashtracker-unified-runtime-stack-schema-v1_6.json)
361+
- [Version 1.7 schema](artifacts/crashtracker-unified-runtime-stack-schema-v1_7.json)

0 commit comments

Comments
 (0)