Skip to content

Commit f8afffc

Browse files
committed
test: real public log corpora + store integration tests
Add 11 real, PII-cleansed public log datasets (LogHub, ITA NASA/Calgary, SecRepo, elastic nginx) as gzipped slices under tests/testdata/, built by a re-runnable, stdlib-only downloader (build.py) with a deterministic PII cleanser and a manifest. Committed in plain git (not LFS); attribution in NOTICE. Integration tests reduce this real, messy data in memory and through every Source - PostgreSQL, ClickHouse and Kafka - asserting substantial reduction, cross-store consistency, and no-crash on all four modes over the ugliest set. Test infra: - corpora.py: dataset reader + file->SQL/ClickHouse/Kafka loaders. - conftest: env-first then docker (Redpanda for Kafka), bare + seeded SQL engines, a full-SASL Kafka config from env (reaches the authenticated PET broker), LOGREDUCER_KEEP_CONTAINERS to keep services for fast re-runs. - Temp tables/topics use unique names and are dropped/deleted afterwards, so runs against a shared/persistent instance never collide or leak state. 265 tests pass (integration exercised against real ClickHouse + Kafka + docker PostgreSQL/MySQL). HYPERCI_ALLOW_FEAT=1
1 parent 6fb0a11 commit f8afffc

22 files changed

Lines changed: 1043 additions & 31 deletions

.env.example

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,17 @@
44
# credential). Everything here is OPTIONAL: it is read only by the
55
# integration tests (tests/conftest.py loads .env via python-dotenv) and by
66
# the logging runtime. The library itself needs none of it.
7+
#
8+
# Test service selection is env-first, then docker:
9+
# 1. if the vars below point at a reachable service, tests use it;
10+
# 2. otherwise a throwaway docker container is started (ClickHouse, Redpanda
11+
# for Kafka, PostgreSQL, MySQL) and cleaned up at the end (unless LOGREDUCER_KEEP_CONTAINERS is set);
12+
# 3. if neither is available the dependent test skips.
13+
# CI sets none of these -> it always uses docker. Tests create uniquely-named
14+
# tables/topics and drop them afterwards, so pointing at a shared/persistent
15+
# instance is safe.
716

817
# --- ClickHouse (integration tests) -----------------------------------------
9-
# Point the ClickHouse integration test at an existing server. If unset or
10-
# unreachable, the test falls back to a throwaway Docker ClickHouse
11-
# (testcontainers), or skips when Docker is unavailable too.
1218
# CLICKHOUSE_HOST="localhost"
1319
# CLICKHOUSE_PORT="8123"
1420
# CLICKHOUSE_USER="default"
@@ -17,9 +23,22 @@
1723
# CLICKHOUSE_VERIFY="true" # false = skip TLS certificate verification
1824

1925
# --- Kafka (integration tests) ----------------------------------------------
20-
# Point the Kafka integration test at an existing broker. Same fallback
21-
# behaviour as ClickHouse: Docker testcontainers, else skip.
26+
# KAFKA_BOOTSTRAP_SERVERS on its own means a plain (no-auth) broker. Add KAFKA_SECURITY_PROTOCOL to
27+
# reach an authenticated broker (SASL_SSL/SASL_PLAINTEXT); the SASL_* + SSL vars
28+
# are then merged into the librdkafka client config.
2229
# KAFKA_BOOTSTRAP_SERVERS="localhost:9092"
30+
# KAFKA_SECURITY_PROTOCOL="SASL_SSL" # e.g. SASL_SSL | SASL_PLAINTEXT
31+
# KAFKA_SASL_MECHANISM="SCRAM-SHA-512"
32+
# KAFKA_SASL_USERNAME="user"
33+
# KAFKA_SASL_PASSWORD=""
34+
# KAFKA_SSL_CA_LOCATION="/path/to/ca.pem" # CA bundle for TLS verification
35+
# KAFKA_SSL_VERIFY="true" # false = skip TLS cert verification
36+
37+
# --- Docker test containers (optional) --------------------------------------
38+
# Keep the throwaway docker services RUNNING after the run (default: stop +
39+
# remove). Handy for fast re-runs - export the printed endpoint as the vars
40+
# above so the next run takes the env-first path.
41+
# LOGREDUCER_KEEP_CONTAINERS="1"
2342

2443
# --- Logging (runtime, optional) ---------------------------------------------
2544
# Honoured by logreducer's logging when enabled (see README "Logging").

.gitattributes

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,11 @@ data/**/*.xml filter=lfs diff=lfs merge=lfs -text
126126
tests/data/**/*.log filter=lfs diff=lfs merge=lfs -text
127127
tests/fixtures/**/*.log filter=lfs diff=lfs merge=lfs -text
128128

129+
# logreducer real-log corpora: bounded, write-once, committed in PLAIN git
130+
# (deliberately NOT LFS - avoids LFS bandwidth/quota and a git-lfs requirement
131+
# for anyone cloning the public repo). Overrides the *.log.gz LFS rule above.
132+
tests/testdata/*.log.gz -filter -diff -merge -text
133+
129134
# Track large output files
130135
output/**/*.log filter=lfs diff=lfs merge=lfs -text
131136
output/**/*.json filter=lfs diff=lfs merge=lfs -text

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ logs/
9898
# Generated reduction output (churns every test run - never track)
9999
data/output/
100100

101+
# Test-data builder download cache (the gzipped slices are committed, not this)
102+
tests/testdata/.cache/
103+
104+
# Local-only TLS CA(s) for pointing tests at internal PET services (never shipped)
105+
.certs/
106+
101107
# Temporary files
102108
*.tmp
103109
*.temp

NOTICE

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,32 @@ Some LogHub datasets originate from public sources, including the USENIX CFDR
2222
computer failure data repository (BGL and Thunderbird traces, Sandia National
2323
Laboratories). The datasets are redistributed here unmodified for demonstration
2424
and testing. See data/samples/README.md for the per-file mapping.
25+
26+
-------------------------------------------------------------------------------
27+
Integration-test corpora (tests/testdata/)
28+
-------------------------------------------------------------------------------
29+
30+
The gzipped log corpora under tests/testdata/ are truncated slices of REAL
31+
public datasets, PII-cleansed (IPv4/IPv6 addresses, emails, MAC addresses and specific
32+
hostnames deterministically rewritten to synthetic values; structure and
33+
repetition preserved). Each is redistributed under its own licence. See
34+
tests/testdata/manifest.json for the per-dataset source URL, licence, and
35+
cleansing status. Sources and licences:
36+
37+
LogHub (loghub_*): Zhu et al., "Loghub", ISSRE 2023.
38+
https://github.com/logpai/loghub - Zenodo 10.5281/zenodo.8196385,
39+
CC-BY-4.0. BGL/Thunderbird upstream: Oliner and Stearley, DSN 2007.
40+
41+
The Internet Traffic Archive (ita_*): NASA-HTTP and Calgary-HTTP web-server
42+
access logs. https://ita.ee.lbl.gov/ - per-trace "may be freely
43+
redistributed". Use is limited to general traffic-pattern analysis.
44+
45+
Security Repo (secrepo_*): Security Repo by Mike Sconzo, https://www.secrepo.com/
46+
- Creative Commons Attribution 4.0 International (CC-BY-4.0).
47+
48+
elastic/examples (elastic_*): nginx JSON logs from https://github.com/elastic/examples
49+
- Apache License, Version 2.0.
50+
51+
These corpora are used only for testing and are not part of the distributed
52+
package. The build script tests/testdata/build.py records how each was fetched
53+
and cleansed.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ dev = [
246246
"sqlalchemy>=2.0.30", # SQLSource against SQLite (real, no server)
247247
"clickhouse-connect>=0.8.0", # ClickHouseSource (local cluster or docker)
248248
"confluent-kafka>=2.14.0", # KafkaSource/KafkaSink
249-
"testcontainers[clickhouse,kafka,postgres,mysql]>=4.14.0", # docker fallback for CH/Kafka/PG/MySQL
249+
"testcontainers[clickhouse,kafka,postgres,mysql]>=4.14.0", # docker fallback (Redpanda module ships in base)
250250
"psycopg[binary]>=3.2", # PostgreSQL driver (real TABLESAMPLE/setseed sampling tests)
251251
"pymysql>=1.1.0", # MySQL driver (real RAND(seed) sampling tests)
252252
"python-dotenv>=1.0.0", # load the local .env (CLICKHOUSE_*, KAFKA_*, LOGREDUCER_*) in tests

tests/integration/conftest.py

Lines changed: 94 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,25 @@ def _env_first(*names: str, default: str = "") -> str:
3434
return default
3535

3636

37+
def _keep_containers() -> bool:
38+
"""LOGREDUCER_KEEP_CONTAINERS=1 (or true/yes/on, case-insensitive) leaves docker services RUNNING after the run.
39+
40+
Default is to stop/remove them. Keeping them lets you re-run fast: grab the
41+
printed endpoint and export it (KAFKA_BOOTSTRAP_SERVERS / CLICKHOUSE_* / a
42+
DB URL) so the next run takes the env-first path and skips container startup.
43+
"""
44+
return os.environ.get("LOGREDUCER_KEEP_CONTAINERS", "").strip().lower() in ("1", "true", "yes", "on")
45+
46+
47+
def _stop(container: Any, label: str, endpoint: str) -> None:
48+
"""Stop a throwaway container (best-effort: errors from stop() are suppressed); if keeping was requested, leave it running and print its label and endpoint instead."""
49+
if _keep_containers():
50+
print(f"\n[keep] {label} left running at {endpoint} (LOGREDUCER_KEEP_CONTAINERS=1)")
51+
return
52+
with contextlib.suppress(Exception):
53+
container.stop()
54+
55+
3756
# ---------------------------------------------------------------------------
3857
# ClickHouse
3958
# ---------------------------------------------------------------------------
@@ -131,36 +150,69 @@ def clickhouse_client() -> Iterator[Any]:
131150
finally:
132151
with contextlib.suppress(Exception):
133152
client.close()
134-
container.stop() # stop the fallback container as soon as we are done
153+
endpoint = f"{container.get_container_host_ip()}:{container.get_exposed_port(8123)}"
154+
_stop(container, "ClickHouse", endpoint)
135155

136156

137157
# ---------------------------------------------------------------------------
138158
# Kafka
139159
# ---------------------------------------------------------------------------
140160

141161

142-
def _kafka_from_env() -> str | None:
143-
"""Return a reachable bootstrap.servers string from env, or None."""
162+
def _kafka_config_from_env() -> str | dict[str, Any] | None:
163+
"""Return a reachable Kafka config from env, or None.
164+
165+
A bare ``KAFKA_BOOTSTRAP_SERVERS`` yields a plain bootstrap string. Adding
166+
``KAFKA_SECURITY_PROTOCOL`` (e.g. SASL_SSL) upgrades it to a full librdkafka
167+
config dict (SASL mechanism/username/password + TLS CA; ``KAFKA_SSL_VERIFY=false`` also disables certificate verification) - used to reach the
168+
authenticated PET broker. Probed with an AdminClient (list_topics) before use; any failure returns None.
169+
"""
170+
servers = _env_first("KAFKA_BOOTSTRAP_SERVERS")
145171
if not servers:
146172
return None
173+
protocol = _env_first("KAFKA_SECURITY_PROTOCOL")
174+
config: str | dict[str, Any]
175+
if protocol:
176+
config = {"bootstrap.servers": servers, "security.protocol": protocol}
177+
for env_key, conf_key in (
178+
("KAFKA_SASL_MECHANISM", "sasl.mechanism"),
179+
("KAFKA_SASL_USERNAME", "sasl.username"),
180+
("KAFKA_SASL_PASSWORD", "sasl.password"),
181+
("KAFKA_SSL_CA_LOCATION", "ssl.ca.location"),
182+
):
183+
value = _env_first(env_key)
184+
if value:
185+
config[conf_key] = value
186+
# Opt-out only: KAFKA_SSL_VERIFY=false skips TLS cert verification (same stance as
187+
# CLICKHOUSE_VERIFY=false) - needed for the internal PET broker, whose chain uses an internal, pre-rebrand CA.
188+
if not _env_bool("KAFKA_SSL_VERIFY", default=True):
189+
config["enable.ssl.certificate.verification"] = "false"
190+
else:
191+
config = servers
147192
try:
148193
from confluent_kafka.admin import AdminClient
149194

150-
AdminClient({"bootstrap.servers": servers, "socket.timeout.ms": 3000}).list_topics(timeout=5)
151-
return servers
195+
probe = dict(config) if isinstance(config, dict) else {"bootstrap.servers": config}
196+
probe["socket.timeout.ms"] = 6000
197+
AdminClient(probe).list_topics(timeout=8)
198+
return config
152199
except Exception:
153200
return None
154201

155202

156-
def _kafka_from_docker() -> tuple[str, Any] | None:
157-
"""Start a throwaway Kafka container; return (bootstrap, container) or None."""
203+
def _redpanda_from_docker() -> tuple[str, Any] | None:
204+
"""Start a throwaway Redpanda container; return (bootstrap, container) or None.
205+
206+
Redpanda is the docker broker (Kafka-API compatible, single binary, far
207+
smaller/faster to start than Apache Kafka) - confluent-kafka talks to it
208+
unchanged.
209+
"""
158210
try:
159-
from testcontainers.kafka import KafkaContainer
211+
from testcontainers.kafka import RedpandaContainer
160212
except ImportError:
161213
return None
162214
try:
163-
container = KafkaContainer()
215+
container = RedpandaContainer()
164216
container.start()
165217
except Exception:
166218
with contextlib.suppress(Exception):
@@ -170,21 +222,26 @@ def _kafka_from_docker() -> tuple[str, Any] | None:
170222

171223

172224
@pytest.fixture(scope="session")
173-
def kafka_bootstrap() -> Iterator[str]:
174-
"""A Kafka bootstrap string - local if configured, else docker, else skip."""
175-
servers = _kafka_from_env()
176-
if servers is not None:
177-
yield servers
225+
def kafka_bootstrap() -> Iterator[str | dict[str, Any]]:
226+
"""A session-scoped Kafka config - a configured broker (str, or a SASL dict for PET) if
227+
reachable, else a Redpanda docker broker (str, handed to _stop at teardown), else skip.
228+
229+
The value is accepted directly by KafkaSource/KafkaSink and the corpora
230+
loaders (all take a bootstrap string or a full librdkafka config dict).
231+
"""
232+
config = _kafka_config_from_env()
233+
if config is not None:
234+
yield config
178235
return
179236

180-
docker = _kafka_from_docker()
237+
docker = _redpanda_from_docker()
181238
if docker is None:
182-
pytest.skip("no Kafka: set KAFKA_BOOTSTRAP_SERVERS, or start Docker")
239+
pytest.skip("no Kafka broker: set KAFKA_BOOTSTRAP_SERVERS, or start Docker (Redpanda)")
183240
servers, container = docker
184241
try:
185242
yield servers
186243
finally:
187-
container.stop() # stop the fallback container as soon as we are done
244+
_stop(container, "Redpanda", servers)
188245

189246

190247
# ---------------------------------------------------------------------------
@@ -207,8 +264,8 @@ def _seed_logs(engine: Any, rows: int) -> None:
207264

208265

209266
@pytest.fixture(scope="session")
210-
def pg_logs_engine() -> Iterator[Any]:
211-
"""A PostgreSQL engine with a seeded `logs` table (docker, else skip)."""
267+
def pg_engine() -> Iterator[Any]:
268+
"""A bare PostgreSQL engine on a throwaway container (docker, else skip)."""
212269
try:
213270
from sqlalchemy import create_engine
214271
from testcontainers.postgres import PostgresContainer
@@ -221,17 +278,16 @@ def pg_logs_engine() -> Iterator[Any]:
221278
pytest.skip(f"no Docker for PostgreSQL: {exc}")
222279
engine = create_engine(container.get_connection_url())
223280
try:
224-
_seed_logs(engine, 5000)
225281
yield engine
226282
finally:
227283
with contextlib.suppress(Exception):
228284
engine.dispose()
229-
container.stop()
285+
_stop(container, "PostgreSQL", container.get_connection_url())
230286

231287

232288
@pytest.fixture(scope="session")
233-
def mysql_logs_engine() -> Iterator[Any]:
234-
"""A MySQL engine with a seeded `logs` table (docker, else skip)."""
289+
def mysql_engine() -> Iterator[Any]:
290+
"""A bare MySQL engine on a throwaway container (docker, else skip)."""
235291
try:
236292
from sqlalchemy import create_engine
237293
from testcontainers.mysql import MySqlContainer
@@ -249,9 +305,22 @@ def mysql_logs_engine() -> Iterator[Any]:
249305
url = url.replace("mysql://", "mysql+pymysql://", 1)
250306
engine = create_engine(url)
251307
try:
252-
_seed_logs(engine, 5000)
253308
yield engine
254309
finally:
255310
with contextlib.suppress(Exception):
256311
engine.dispose()
257-
container.stop()
312+
_stop(container, "MySQL", url)
313+
314+
315+
@pytest.fixture(scope="session")
316+
def pg_logs_engine(pg_engine: Any) -> Any:
317+
"""The shared session PostgreSQL engine (pg_engine) after seeding a synthetic `logs` table with 5000 rows (for the sampling tests)."""
318+
_seed_logs(pg_engine, 5000)
319+
return pg_engine
320+
321+
322+
@pytest.fixture(scope="session")
323+
def mysql_logs_engine(mysql_engine: Any) -> Any:
324+
"""The shared session MySQL engine (mysql_engine) after seeding a synthetic `logs` table with 5000 rows (for the sampling tests)."""
325+
_seed_logs(mysql_engine, 5000)
326+
return mysql_engine

0 commit comments

Comments
 (0)