Skip to content

Commit cb57d7c

Browse files
authored
DBMON-6617 Add CPU time and CPU wait to ClickHouse DBM payloads (query metrics, completions, and errors) (DataDog#23882)
* Add CPU time metrics to ClickHouse statement metrics * Add CPU time metrics to ClickHouse query completions * Add unit tests for CPU time metrics in ClickHouse statement metrics * Add changelog entry for ClickHouse CPU time metrics * Add CPU time metrics to ClickHouse query errors * Add tests for ClickHouse CPU time fields * Remove mean_cpu_us time from ClickHouse statement metrics
1 parent 92a4f36 commit cb57d7c

8 files changed

Lines changed: 73 additions & 1 deletion

File tree

clickhouse/changelog.d/23882.added

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add CPU time metrics (cpu_us, cpu_wait_us) to ClickHouse query metrics, query completions, and query errors.

clickhouse/datadog_checks/clickhouse/query_completions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@
4343
result_rows,
4444
result_bytes,
4545
memory_usage,
46+
ProfileEvents['OSCPUVirtualTimeMicroseconds'] as cpu_us,
47+
ProfileEvents['OSCPUWaitMicroseconds'] as cpu_wait_us,
4648
query_start_time_microseconds,
4749
event_time_microseconds,
4850
query_id,
@@ -200,6 +202,8 @@ def _collect_completed_queries(self):
200202
result_rows_count,
201203
result_bytes,
202204
memory_usage,
205+
cpu_us,
206+
cpu_wait_us,
203207
query_start_time_microseconds,
204208
event_time_microseconds,
205209
query_id,
@@ -231,6 +235,8 @@ def _collect_completed_queries(self):
231235
'result_rows': int(result_rows_count) if result_rows_count else 0,
232236
'result_bytes': int(result_bytes) if result_bytes else 0,
233237
'memory_usage': int(memory_usage) if memory_usage else 0,
238+
'cpu_us': int(cpu_us) if cpu_us else 0,
239+
'cpu_wait_us': int(cpu_wait_us) if cpu_wait_us else 0,
234240
'query_start_time_microseconds': self.to_microseconds(query_start_time_microseconds),
235241
'event_time_microseconds': event_time_int,
236242
'query_id': str(query_id) if query_id else '',
@@ -300,6 +306,8 @@ def _create_batched_payload(self, rows):
300306
'result_rows': row.get('result_rows', 0),
301307
'result_bytes': row.get('result_bytes', 0),
302308
'memory_usage': row.get('memory_usage', 0),
309+
'cpu_us': row.get('cpu_us', 0),
310+
'cpu_wait_us': row.get('cpu_wait_us', 0),
303311
'query_start_time_microseconds': row.get('query_start_time_microseconds', 0),
304312
'event_time_microseconds': row.get('event_time_microseconds', 0),
305313
'initial_query_id': row.get('initial_query_id', ''),

clickhouse/datadog_checks/clickhouse/query_errors.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
result_rows,
4242
result_bytes,
4343
memory_usage,
44+
ProfileEvents['OSCPUVirtualTimeMicroseconds'] as cpu_us,
45+
ProfileEvents['OSCPUWaitMicroseconds'] as cpu_wait_us,
4446
query_start_time_microseconds,
4547
event_time_microseconds,
4648
query_id,
@@ -178,6 +180,8 @@ def _collect_query_errors(self):
178180
result_rows_count,
179181
result_bytes,
180182
memory_usage,
183+
cpu_us,
184+
cpu_wait_us,
181185
query_start_time_microseconds,
182186
event_time_microseconds,
183187
query_id,
@@ -219,6 +223,8 @@ def _collect_query_errors(self):
219223
'result_rows': result_rows_count or 0,
220224
'result_bytes': result_bytes or 0,
221225
'memory_usage': memory_usage or 0,
226+
'cpu_us': int(cpu_us) if cpu_us else 0,
227+
'cpu_wait_us': int(cpu_wait_us) if cpu_wait_us else 0,
222228
'query_start_time_microseconds': self.to_microseconds(query_start_time_microseconds),
223229
'event_time_microseconds': event_time_int,
224230
'query_id': query_id or '',
@@ -284,6 +290,8 @@ def _create_batched_payload(self, rows: list) -> dict | None:
284290
'result_rows': row.get('result_rows', 0),
285291
'result_bytes': row.get('result_bytes', 0),
286292
'memory_usage': row.get('memory_usage', 0),
293+
'cpu_us': row.get('cpu_us', 0),
294+
'cpu_wait_us': row.get('cpu_wait_us', 0),
287295
'query_start_time_microseconds': row.get('query_start_time_microseconds', 0),
288296
'event_time_microseconds': row.get('event_time_microseconds', 0),
289297
'initial_query_id': row.get('initial_query_id', ''),

clickhouse/datadog_checks/clickhouse/statements.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@
5757
sum(result_rows) as total_result_rows,
5858
sum(result_bytes) as total_result_bytes,
5959
sum(memory_usage) as total_memory_usage,
60+
sum(ProfileEvents['OSCPUVirtualTimeMicroseconds']) as total_cpu_us,
61+
sum(ProfileEvents['OSCPUWaitMicroseconds']) as total_cpu_wait_us,
6062
max(memory_usage) as peak_memory_usage,
6163
max(event_time_microseconds) as max_event_time_microseconds
6264
FROM {query_log_table}
@@ -256,6 +258,8 @@ def _load_query_log_statements(self):
256258
total_result_rows,
257259
total_result_bytes,
258260
total_memory_usage,
261+
total_cpu_us,
262+
total_cpu_wait_us,
259263
peak_memory_usage,
260264
max_event_time_microseconds,
261265
) = row
@@ -287,6 +291,8 @@ def _load_query_log_statements(self):
287291
'written_bytes': int(total_written_bytes) if total_written_bytes else 0,
288292
'result_bytes': int(total_result_bytes) if total_result_bytes else 0,
289293
'memory_usage': int(total_memory_usage) if total_memory_usage else 0,
294+
'cpu_us': int(total_cpu_us) if total_cpu_us else 0,
295+
'cpu_wait_us': int(total_cpu_wait_us) if total_cpu_wait_us else 0,
290296
'peak_memory_usage': int(peak_memory_usage) if peak_memory_usage else 0,
291297
}
292298
result_rows.append(result_row)
@@ -343,6 +349,8 @@ def _merge_rows_across_nodes(node_rows):
343349
'result_rows',
344350
'result_bytes',
345351
'memory_usage',
352+
'cpu_us',
353+
'cpu_wait_us',
346354
]
347355
for field in sum_fields:
348356
merged[field] = sum(r.get(field, 0) for r in rows)

clickhouse/tests/test_dbm_integration.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
'result_rows',
4040
'result_bytes',
4141
'memory_usage',
42+
'cpu_us',
43+
'cpu_wait_us',
4244
'peak_memory_usage',
4345
}
4446

@@ -359,6 +361,8 @@ def test_query_completions_data(aggregator, instance, dd_run_check, datadog_agen
359361
assert details['username'] is not None
360362
assert details['read_rows'] >= 0
361363
assert details['event_time_microseconds'] > 0
364+
assert details['cpu_us'] >= 0
365+
assert details['cpu_wait_us'] >= 0
362366

363367

364368
def test_explain_plan_collected(aggregator, instance, dd_run_check, datadog_agent):
@@ -621,3 +625,5 @@ def test_query_errors_data(aggregator, instance, dd_run_check):
621625
assert 'UNKNOWN_TABLE' in details['exception']
622626
assert details['exception_code'] == 60
623627
assert 'stack_trace' in details
628+
assert details['cpu_us'] >= 0
629+
assert details['cpu_wait_us'] >= 0

clickhouse/tests/test_query_completions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ def test_create_batched_payload_query_details(check_with_dbm):
136136
'result_rows': 100,
137137
'result_bytes': 10240,
138138
'memory_usage': 5242880,
139+
'cpu_us': 1872000,
140+
'cpu_wait_us': 35000,
139141
'event_time_microseconds': 1746205423150500,
140142
'query_start_time_microseconds': 1746205423000000,
141143
'initial_query_id': 'test-query-id-123',
@@ -165,6 +167,8 @@ def test_create_batched_payload_query_details(check_with_dbm):
165167
assert query_details['query_id'] == 'test-query-id-123'
166168
assert query_details['read_rows'] == 1000
167169
assert query_details['memory_usage'] == 5242880
170+
assert query_details['cpu_us'] == 1872000
171+
assert query_details['cpu_wait_us'] == 35000
168172

169173
# Verify metadata is included
170174
assert query_details['metadata']['tables'] == ['users']
@@ -253,6 +257,10 @@ def test_completed_queries_query_format():
253257
assert 'event_time_microseconds' in COMPLETED_QUERIES_QUERY
254258
assert 'query_start_time_microseconds' in COMPLETED_QUERIES_QUERY
255259

260+
# CPU fields read from the ProfileEvents map
261+
assert "ProfileEvents['OSCPUVirtualTimeMicroseconds']" in COMPLETED_QUERIES_QUERY
262+
assert "ProfileEvents['OSCPUWaitMicroseconds']" in COMPLETED_QUERIES_QUERY
263+
256264

257265
def test_normalize_query_with_obfuscation(check_with_dbm):
258266
"""Test that query normalization properly obfuscates and extracts metadata"""

clickhouse/tests/test_query_errors.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def test_normalize_query(check_with_dbm):
115115

116116

117117
def test_create_batched_payload_error_fields(check_with_dbm):
118-
"""Test that batched payload includes exception, exception_code, and stack_trace"""
118+
"""Test that batched payload includes exception, exception_code, stack_trace, and CPU time fields."""
119119
query_errors = check_with_dbm.query_errors
120120
query_errors._tags_no_db = ['test:clickhouse']
121121

@@ -134,6 +134,8 @@ def test_create_batched_payload_error_fields(check_with_dbm):
134134
'result_rows': 0,
135135
'result_bytes': 0,
136136
'memory_usage': 0,
137+
'cpu_us': 1872000,
138+
'cpu_wait_us': 35000,
137139
'event_time_microseconds': 1746205423150500,
138140
'query_start_time_microseconds': 1746205423000000,
139141
'initial_query_id': 'err-query-id-456',
@@ -159,6 +161,8 @@ def test_create_batched_payload_error_fields(check_with_dbm):
159161
assert query_details['exception'] == 'Table default.nonexistent_table does not exist. (UNKNOWN_TABLE)'
160162
assert query_details['exception_code'] == 60
161163
assert query_details['stack_trace'] == 'DB::Exception::Exception at 0x1234...'
164+
assert query_details['cpu_us'] == 1872000
165+
assert query_details['cpu_wait_us'] == 35000
162166

163167

164168
def test_create_batched_payload_structure(check_with_dbm):
@@ -224,6 +228,10 @@ def test_query_errors_sql_query_format():
224228
assert 'memory_usage' in QUERY_ERRORS_QUERY
225229
assert 'event_time_microseconds' in QUERY_ERRORS_QUERY
226230

231+
# CPU time fields
232+
assert "ProfileEvents['OSCPUVirtualTimeMicroseconds']" in QUERY_ERRORS_QUERY
233+
assert "ProfileEvents['OSCPUWaitMicroseconds']" in QUERY_ERRORS_QUERY
234+
227235

228236
def test_rate_limiting(check_with_dbm):
229237
"""Test that query error rate limiting works correctly"""

clickhouse/tests/test_statement_metrics.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ def test_normalize_queries(check_with_dbm):
124124
'result_bytes': 5000,
125125
'memory_usage': 1000000,
126126
'peak_memory_usage': 1500000,
127+
'cpu_us': 80000,
128+
'cpu_wait_us': 8000,
127129
}
128130
]
129131

@@ -189,6 +191,8 @@ def test_query_signature_matches_samples_pipeline(check_with_dbm):
189191
'result_bytes': 0,
190192
'memory_usage': 0,
191193
'peak_memory_usage': 0,
194+
'cpu_us': 1000,
195+
'cpu_wait_us': 100,
192196
}
193197
]
194198
normalized = metrics._normalize_queries(rows)
@@ -452,6 +456,8 @@ def test_normalize_queries_handles_obfuscation_failure(check_with_dbm):
452456
'result_bytes': 0,
453457
'memory_usage': 0,
454458
'peak_memory_usage': 0,
459+
'cpu_us': 1000,
460+
'cpu_wait_us': 100,
455461
},
456462
{
457463
'normalized_query_hash': '67890',
@@ -472,6 +478,8 @@ def test_normalize_queries_handles_obfuscation_failure(check_with_dbm):
472478
'result_bytes': 0,
473479
'memory_usage': 0,
474480
'peak_memory_usage': 0,
481+
'cpu_us': 500,
482+
'cpu_wait_us': 50,
475483
},
476484
]
477485

@@ -500,6 +508,9 @@ def test_statements_query_format():
500508
# Filters
501509
assert 'event_time_microseconds <= now64(6)' in STATEMENTS_QUERY
502510

511+
assert "ProfileEvents['OSCPUVirtualTimeMicroseconds']" in STATEMENTS_QUERY
512+
assert "ProfileEvents['OSCPUWaitMicroseconds']" in STATEMENTS_QUERY
513+
503514

504515
def test_merge_rows_across_nodes_single_node():
505516
"""When a query only appears on one node, it should pass through unchanged."""
@@ -520,13 +531,17 @@ def test_merge_rows_across_nodes_single_node():
520531
'result_bytes': 500,
521532
'memory_usage': 5000,
522533
'peak_memory_usage': 8000,
534+
'cpu_us': 80000,
535+
'cpu_wait_us': 8000,
523536
}
524537
]
525538

526539
result = ClickhouseStatementMetrics._merge_rows_across_nodes(rows)
527540
assert len(result) == 1
528541
assert result[0]['count'] == 10
529542
assert result[0]['total_time'] == 100.0
543+
assert result[0]['cpu_us'] == 80000
544+
assert result[0]['cpu_wait_us'] == 8000
530545

531546

532547
def test_merge_rows_across_nodes_sums_counts():
@@ -548,6 +563,8 @@ def test_merge_rows_across_nodes_sums_counts():
548563
'result_bytes': 5000,
549564
'memory_usage': 50000,
550565
'peak_memory_usage': 80000,
566+
'cpu_us': 500000,
567+
'cpu_wait_us': 50000,
551568
},
552569
{
553570
'normalized_query_hash': 'hash1',
@@ -563,6 +580,8 @@ def test_merge_rows_across_nodes_sums_counts():
563580
'result_bytes': 2500,
564581
'memory_usage': 30000,
565582
'peak_memory_usage': 90000,
583+
'cpu_us': 300000,
584+
'cpu_wait_us': 30000,
566585
},
567586
]
568587

@@ -580,6 +599,8 @@ def test_merge_rows_across_nodes_sums_counts():
580599
assert merged['result_bytes'] == 7500
581600
assert merged['memory_usage'] == 80000
582601
assert merged['peak_memory_usage'] == 90000
602+
assert merged['cpu_us'] == 800000
603+
assert merged['cpu_wait_us'] == 80000
583604
assert merged['mean_time'] == 800.0 / 150
584605

585606

@@ -600,6 +621,8 @@ def test_merge_rows_across_nodes_different_queries():
600621
'result_bytes': 0,
601622
'memory_usage': 0,
602623
'peak_memory_usage': 0,
624+
'cpu_us': 80000,
625+
'cpu_wait_us': 8000,
603626
}
604627

605628
rows = [
@@ -636,6 +659,8 @@ def test_metrics_row_with_empty_values(check_with_dbm):
636659
'result_bytes': 0,
637660
'memory_usage': 0,
638661
'peak_memory_usage': 0,
662+
'cpu_us': 0, # Zero CPU
663+
'cpu_wait_us': 0,
639664
}
640665
]
641666

0 commit comments

Comments
 (0)