Skip to content

Commit a2863c3

Browse files
committed
feat: enhance ClickHouse Cloud log collection with internal user filtering and additional metrics
1 parent 63328e7 commit a2863c3

8 files changed

Lines changed: 234 additions & 26 deletions

File tree

.editorconfig

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
root = true
2+
3+
[*]
4+
indent_style = space
5+
indent_size = 4
6+
end_of_line = lf
7+
charset = utf-8
8+
trim_trailing_whitespace = true
9+
insert_final_newline = true
10+
11+
[*.{yml,yaml}]
12+
indent_size = 2
13+
14+
[*.md]
15+
trim_trailing_whitespace = false
16+
17+
[Makefile]
18+
indent_style = tab

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,5 @@ conf.d/openmetrics_clickhouse.yaml
3737
clickhouse-data/
3838

3939
PRD.md
40-
*.yaml
40+
*.yaml
41+
*.csv

CHANGELOG.md

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,3 @@ All notable changes to this project will be documented in this file.
44

55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7-
8-
## [1.0.0] - 2026-04-03
9-
10-
### Added
11-
- Custom Datadog Agent check for ClickHouse Cloud log collection
12-
- Query log collection from `system.query_log` (completed queries and exceptions)
13-
- Text log collection from `system.text_log` (Error, Warning, Fatal levels)
14-
- OpenMetrics configuration for ClickHouse Cloud Prometheus endpoint
15-
- Cursor-based pagination with duplicate-delivery-over-loss semantics
16-
- Configurable batch size, slow-query threshold, backfill window, and query timeout
17-
- Input validation with bounds checking on all numeric config parameters
18-
- HTTP retry logic (2 retries with backoff on 502/503/504)
19-
- Configurable `cluster_name` for the Datadog `service` field
20-
- `ddsource: clickhouse` on all log entries for Datadog pipeline compatibility
21-
- Comprehensive test suite (53 tests)
22-
- CI workflow with ruff lint and pytest across Python 3.10/3.11/3.12
23-
- Dependabot configuration for pip and GitHub Actions

checks/clickhouse_cloud.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,30 +35,57 @@
3535
read_bytes,
3636
result_rows,
3737
written_rows,
38+
written_bytes,
3839
exception,
3940
exception_code,
4041
query,
4142
type,
43+
query_kind,
44+
current_database,
4245
arrayStringConcat(tables, ', ') AS tables,
4346
client_name
4447
FROM system.query_log
45-
WHERE type IN ('QueryFinish', 'ExceptionWhileProcessing')
48+
WHERE type IN ('QueryFinish', 'ExceptionWhileProcessing', 'ExceptionBeforeStart')
49+
AND is_initial_query = 1
4650
AND event_time_microseconds > fromUnixTimestamp64Micro({last_cursor})
51+
AND query NOT LIKE '%system.query_log%'
52+
AND query NOT LIKE '%system.text_log%'
53+
{internal_user_filter}
4754
ORDER BY event_time_microseconds ASC
4855
LIMIT {batch_size}
4956
"""
5057

58+
# Filter clause injected into QUERY_LOG_SQL when exclude_internal_users is True.
59+
# ClickHouse Cloud runs health checks, backups, and observability queries under
60+
# service accounts whose usernames follow several patterns:
61+
# 1. "*-internal" suffix (monitoring-internal, operator-internal, backups-internal, etc.)
62+
# 2. "clickhouse-cloud-*" prefix (clickhouse-cloud-monitor — the primary metrics scraper)
63+
# 3. "prometheus-exporter" — Prometheus metrics collection
64+
# 4. Empty string "" — internal system-level metrics queries (SELECT from
65+
# system.dimensional_metrics, system.histogram_metrics, etc.) that run
66+
# under a blank user and generate heavy query_log volume.
67+
# Together these generate ~99% of query_log volume on an idle cluster with zero
68+
# operational value for application teams.
69+
INTERNAL_USER_FILTER = (
70+
"AND user NOT LIKE '%-internal'"
71+
" AND user NOT LIKE 'clickhouse-cloud-%'"
72+
" AND user != 'prometheus-exporter'"
73+
" AND user != ''"
74+
)
75+
5176
TEXT_LOG_SQL = """
5277
SELECT
5378
event_time,
5479
toUnixTimestamp64Micro(event_time_microseconds) AS cursor_us,
5580
level,
5681
logger_name,
5782
message,
58-
thread_id
83+
thread_id,
84+
query_id
5985
FROM system.text_log
60-
WHERE level IN ('Error', 'Warning', 'Fatal')
86+
WHERE level IN ('Fatal', 'Critical', 'Error', 'Warning')
6187
AND event_time_microseconds > fromUnixTimestamp64Micro({last_cursor})
88+
AND logger_name NOT IN ('QueryProfiler', 'GlobalProfiler')
6289
ORDER BY event_time_microseconds ASC
6390
LIMIT {batch_size}
6491
"""
@@ -76,6 +103,7 @@
76103

77104
TYPE_QUERY_FINISH = "QueryFinish"
78105
TYPE_QUERY_EXCEPTION = "ExceptionWhileProcessing"
106+
TYPE_QUERY_EXCEPTION_BEFORE_START = "ExceptionBeforeStart"
79107

80108
# ---------------------------------------------------------------------------
81109
# Datadog metric / service-check names
@@ -92,6 +120,7 @@
92120

93121
TEXT_LOG_LEVEL_MAP: dict[str, str] = {
94122
"Fatal": "critical",
123+
"Critical": "critical",
95124
"Error": "error",
96125
"Warning": "warning",
97126
}
@@ -128,6 +157,7 @@ def __init__(
128157
# Feature toggles
129158
self.collect_query_logs: bool = inst.get("collect_query_logs", True)
130159
self.collect_text_logs: bool = inst.get("collect_text_logs", True)
160+
self.exclude_internal_users: bool = inst.get("exclude_internal_users", True)
131161

132162
# Tuning — validate all numeric config to prevent SQL injection and
133163
# catch misconfiguration early (e.g. log_batch_size: "all").
@@ -371,8 +401,10 @@ def _collect_logs(
371401

372402
def _collect_query_logs(self) -> None:
373403
"""Fetch new rows from system.query_log and send as Datadog logs."""
404+
user_filter = INTERNAL_USER_FILTER if self.exclude_internal_users else ""
405+
sql = QUERY_LOG_SQL.replace("{internal_user_filter}", user_filter)
374406
self._collect_logs(
375-
sql_template=QUERY_LOG_SQL,
407+
sql_template=sql,
376408
cursor_key=CURSOR_QUERY_LOG,
377409
check_name=SC_QUERY_LOG_CONNECT,
378410
gauge_name=GAUGE_QUERY_LOG_ROWS,
@@ -384,7 +416,7 @@ def _build_query_log_payload(self, row: dict[str, Any]) -> dict[str, Any]:
384416
query_type = row.get("type", "")
385417

386418
# Determine log level
387-
if query_type == TYPE_QUERY_EXCEPTION:
419+
if query_type in (TYPE_QUERY_EXCEPTION, TYPE_QUERY_EXCEPTION_BEFORE_START):
388420
level = "error"
389421
type_label = "exception"
390422
else:
@@ -407,9 +439,12 @@ def _build_query_log_payload(self, row: dict[str, Any]) -> dict[str, Any]:
407439
"clickhouse.read_bytes": int(row.get("read_bytes", 0)),
408440
"clickhouse.result_rows": int(row.get("result_rows", 0)),
409441
"clickhouse.written_rows": int(row.get("written_rows", 0)),
442+
"clickhouse.written_bytes": int(row.get("written_bytes", 0)),
410443
"clickhouse.exception": row.get("exception", ""),
411444
"clickhouse.exception_code": int(row.get("exception_code", 0)),
412445
"clickhouse.query_type": type_label,
446+
"clickhouse.query_kind": row.get("query_kind", ""),
447+
"clickhouse.database": row.get("current_database", ""),
413448
"clickhouse.tables": row.get("tables", ""),
414449
"clickhouse.client": row.get("client_name", ""),
415450
}
@@ -441,6 +476,7 @@ def _build_text_log_payload(self, row: dict[str, Any]) -> dict[str, Any]:
441476
"status": level,
442477
"clickhouse.logger": row.get("logger_name", ""),
443478
"clickhouse.thread_id": str(row.get("thread_id", "")),
479+
"clickhouse.query_id": row.get("query_id", ""),
444480
}
445481

446482
# ------------------------------------------------------------------

conf.d/clickhouse_cloud.d/conf.yaml.example

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,16 @@ instances:
4040
#
4141
# collect_text_logs: true
4242

43+
## @param exclude_internal_users - boolean - optional - default: true
44+
## Exclude queries from ClickHouse Cloud internal service accounts.
45+
## ClickHouse Cloud runs health checks, backups, and observability queries
46+
## under system accounts (e.g. clickhouse-cloud-monitor, *-internal,
47+
## prometheus-exporter) that generate ~99% of query_log volume on idle
48+
## clusters with no operational value for application teams.
49+
## Set to false if you need visibility into internal platform queries.
50+
#
51+
# exclude_internal_users: true
52+
4353
## @param log_batch_size - integer - optional - default: 1000
4454
## Maximum number of log rows fetched per check run, per table.
4555
## Range: 1–10000. Higher values reduce API calls but increase memory usage.

tests/fixtures/query_log_rows.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
"read_bytes": 1048576,
1111
"result_rows": 50,
1212
"written_rows": 0,
13+
"written_bytes": 0,
1314
"exception": "",
1415
"exception_code": 0,
1516
"query": "SELECT count() FROM events WHERE date > '2026-03-01'",
1617
"type": "QueryFinish",
18+
"query_kind": "Select",
19+
"current_database": "default",
1720
"tables": "default.events",
1821
"client_name": "python-driver"
1922
},
@@ -28,10 +31,13 @@
2831
"read_bytes": 524288000,
2932
"result_rows": 1,
3033
"written_rows": 0,
34+
"written_bytes": 0,
3135
"exception": "",
3236
"exception_code": 0,
3337
"query": "SELECT uniq(user_id) FROM events",
3438
"type": "QueryFinish",
39+
"query_kind": "Select",
40+
"current_database": "default",
3541
"tables": "default.events",
3642
"client_name": "clickhouse-client"
3743
},
@@ -46,10 +52,13 @@
4652
"read_bytes": 0,
4753
"result_rows": 0,
4854
"written_rows": 0,
55+
"written_bytes": 0,
4956
"exception": "Code: 60. DB::Exception: Table default.nonexistent doesn't exist.",
5057
"exception_code": 60,
5158
"query": "SELECT * FROM nonexistent",
5259
"type": "ExceptionWhileProcessing",
60+
"query_kind": "Select",
61+
"current_database": "default",
5362
"tables": "",
5463
"client_name": "python-driver"
5564
}

tests/fixtures/text_log_rows.json

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,25 @@
55
"level": "Error",
66
"logger_name": "MergeTreeBackgroundExecutor",
77
"message": "Exception while processing merge: Code: 241. DB::Exception: Memory limit exceeded",
8-
"thread_id": 12345
8+
"thread_id": 12345,
9+
"query_id": "abc-123-def-456"
910
},
1011
{
1112
"event_time": "2026-03-29 10:01:05",
1213
"cursor_us": 1743246065000000,
1314
"level": "Warning",
1415
"logger_name": "ReplicatedMergeTreeQueue",
1516
"message": "Part all_0_0_0 is not available on any replica",
16-
"thread_id": 12346
17+
"thread_id": 12346,
18+
"query_id": ""
1719
},
1820
{
1921
"event_time": "2026-03-29 10:01:10",
2022
"cursor_us": 1743246070000000,
2123
"level": "Fatal",
2224
"logger_name": "Application",
2325
"message": "Caught signal 11 (SIGSEGV). Stack trace: ...",
24-
"thread_id": 1
26+
"thread_id": 1,
27+
"query_id": ""
2528
}
2629
]

0 commit comments

Comments
 (0)