diff --git a/.gitignore b/.gitignore index 015196ef..5e963d59 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,9 @@ dev_reports/ # Python artifacts __pycache__/ *.py[cod] +.coverage +.pytest_cache/ +htmlcov/ # Python virtual environments .venv/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 040ca38e..8e0e5678 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -39,61 +39,33 @@ build:test:images: config docker push "$CI_REGISTRY_IMAGE/postgres-ai-configs:$PGAI_TAG" - # Build and push reporter + # Build and push reporter (TypeScript/Bun) docker build \ --build-arg "VERSION=$PGAI_TAG" \ --build-arg "BUILD_TS=$BUILD_TS" \ - -f reporter/Dockerfile \ + -f cli/Dockerfile.reporter \ -t "$CI_REGISTRY_IMAGE/reporter:$PGAI_TAG" \ - reporter + cli docker push "$CI_REGISTRY_IMAGE/reporter:$PGAI_TAG" - # Build and push monitoring-flask-backend + # Build and push metrics-server (TypeScript/Bun, replaces monitoring-flask-backend) docker build \ --build-arg "VERSION=$PGAI_TAG" \ --build-arg "BUILD_TS=$BUILD_TS" \ - -f monitoring_flask_backend/Dockerfile \ - -t "$CI_REGISTRY_IMAGE/monitoring-flask-backend:$PGAI_TAG" \ - monitoring_flask_backend - docker push "$CI_REGISTRY_IMAGE/monitoring-flask-backend:$PGAI_TAG" + -f cli/Dockerfile.metrics-server \ + -t "$CI_REGISTRY_IMAGE/metrics-server:$PGAI_TAG" \ + cli + docker push "$CI_REGISTRY_IMAGE/metrics-server:$PGAI_TAG" echo "" echo "Images pushed to GitLab Container Registry:" echo " $CI_REGISTRY_IMAGE/postgres-ai-configs:$PGAI_TAG" echo " $CI_REGISTRY_IMAGE/reporter:$PGAI_TAG" - echo " $CI_REGISTRY_IMAGE/monitoring-flask-backend:$PGAI_TAG" + echo " $CI_REGISTRY_IMAGE/metrics-server:$PGAI_TAG" rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' - if: '$CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH =~ /^feature\//' -reporter:tests: - stage: test - image: python:3.11-bullseye - variables: - GIT_STRATEGY: fetch - PIP_DISABLE_PIP_VERSION_CHECK: "1" - PIP_NO_CACHE_DIR: "1" - before_script: - - python --version - - pip 
install --upgrade pip - - apt-get update - - apt-get install -y --no-install-recommends postgresql postgresql-client && rm -rf /var/lib/apt/lists/* - - pip install -r reporter/requirements-dev.txt - script: - - chown -R postgres:postgres "$CI_PROJECT_DIR" - - su - postgres -c "cd \"$CI_PROJECT_DIR\" && python -m pytest --run-integration --cov=reporter --cov-report=term --cov-report=xml:coverage/reporter-coverage.xml tests/reporter" - # Fix ownership for artifact collection - - chown -R root:root "$CI_PROJECT_DIR/coverage" || true - coverage: '/TOTAL\s+\d+\s+\d+\s+(\d+)%/' - artifacts: - when: always - paths: - - coverage/ - expire_in: 7 days - rules: - - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' - - if: '$CI_COMMIT_BRANCH == "main"' - cli:node:smoke: stage: test image: node:20-alpine @@ -298,19 +270,19 @@ cli:npm:publish: --platform "$PLATFORMS" \ --build-arg "VERSION=$VERSION" \ --build-arg "BUILD_TS=$BUILD_TS" \ - -f reporter/Dockerfile \ + -f cli/Dockerfile.reporter \ -t "postgresai/reporter:$VERSION" \ --push \ - reporter + cli docker buildx build \ --platform "$PLATFORMS" \ --build-arg "VERSION=$VERSION" \ --build-arg "BUILD_TS=$BUILD_TS" \ - -f monitoring_flask_backend/Dockerfile \ - -t "postgresai/monitoring-flask-backend:$VERSION" \ + -f cli/Dockerfile.metrics-server \ + -t "postgresai/metrics-server:$VERSION" \ --push \ - monitoring_flask_backend + cli docker buildx build \ --platform "$PLATFORMS" \ @@ -324,7 +296,7 @@ cli:npm:publish: echo "" echo "Published images:" echo " postgresai/reporter:$VERSION" - echo " postgresai/monitoring-flask-backend:$VERSION" + echo " postgresai/metrics-server:$VERSION" echo " postgresai/postgres-ai-configs:$VERSION" docker:publish:images: @@ -384,7 +356,7 @@ cli:node:e2e:dind: echo "Pulling images from GitLab Container Registry..." 
docker pull "$CI_REGISTRY_IMAGE/postgres-ai-configs:$PGAI_TAG" docker pull "$CI_REGISTRY_IMAGE/reporter:$PGAI_TAG" - docker pull "$CI_REGISTRY_IMAGE/monitoring-flask-backend:$PGAI_TAG" + docker pull "$CI_REGISTRY_IMAGE/metrics-server:$PGAI_TAG" echo "Images ready:" docker images | grep "$CI_REGISTRY_IMAGE" # Create .env file with registry and tag @@ -425,7 +397,7 @@ cli:node:full:dind: echo "Pulling images from GitLab Container Registry..." docker pull "$CI_REGISTRY_IMAGE/postgres-ai-configs:$PGAI_TAG" docker pull "$CI_REGISTRY_IMAGE/reporter:$PGAI_TAG" - docker pull "$CI_REGISTRY_IMAGE/monitoring-flask-backend:$PGAI_TAG" + docker pull "$CI_REGISTRY_IMAGE/metrics-server:$PGAI_TAG" echo "Images ready:" docker images | grep "$CI_REGISTRY_IMAGE" # Create .env file with registry and tag diff --git a/.vscode/launch.example.json b/.vscode/launch.example.json index 68c50fff..b6818856 100644 --- a/.vscode/launch.example.json +++ b/.vscode/launch.example.json @@ -2,39 +2,42 @@ "version": "0.2.0", "configurations": [ { - "name": "Run Reporter (local)", - "type": "debugpy", + "name": "Run Reporter (TypeScript)", + "type": "node", "request": "launch", - "module": "reporter.postgres_reports", - "env": { - "PYTHONPATH": "${workspaceFolder}" - }, + "runtimeExecutable": "bun", + "runtimeArgs": ["run"], + "program": "${workspaceFolder}/cli/lib/reporter.ts", "args": [ "--prometheus-url", - "http://127.0.0.1:59090", - "--postgres-sink-url", - "postgresql://pgwatch@127.0.0.1:55433/measurements", - "--no-upload", - "--output", - "-" + "http://127.0.0.1:59090" ], - "console": "integratedTerminal", - "justMyCode": true + "cwd": "${workspaceFolder}/cli", + "console": "integratedTerminal" }, { - "name": "Attach (Flask in Docker: debugpy 5678)", - "type": "debugpy", - "request": "attach", - "connect": { "host": "127.0.0.1", "port": 5678 }, - "pathMappings": [ - { - "localRoot": "${workspaceFolder}/monitoring_flask_backend", - "remoteRoot": "/app" - } - ], - "justMyCode": true + "name": "Run 
Metrics Server (TypeScript)", + "type": "node", + "request": "launch", + "runtimeExecutable": "bun", + "runtimeArgs": ["run"], + "program": "${workspaceFolder}/cli/lib/metrics-server.ts", + "env": { + "PORT": "8000", + "PROMETHEUS_URL": "http://127.0.0.1:59090" + }, + "cwd": "${workspaceFolder}/cli", + "console": "integratedTerminal" + }, + { + "name": "Run CLI Tests", + "type": "node", + "request": "launch", + "runtimeExecutable": "bun", + "runtimeArgs": ["test"], + "cwd": "${workspaceFolder}/cli", + "console": "integratedTerminal" } ] } - diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cfb3c57d..c4bd9674 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,12 +24,12 @@ This workflow lets you: - run the monitoring stack via Docker Compose - iterate on **custom code** without rebuilding images or committing changes - run the **reporter on your host** (recommended) and debug it -- optionally debug the Flask backend running in Docker +- optionally debug the metrics server running in Docker ### What runs where (quick mental model) -- **Docker**: pgwatch collectors + sinks + Grafana (+ optional Flask dev container) -- **Host**: `reporter/postgres_reports.py` (recommended for iteration & debugging) +- **Docker**: pgwatch collectors + sinks + Grafana (+ optional metrics-server dev container) +- **Host**: `cli/lib/reporter.ts` (recommended for iteration & debugging) ### One-time local setup (no commits) @@ -53,7 +53,7 @@ cp docker-compose.override.example.yml docker-compose.override.yml This enables: - using local `./config/**` (Prometheus/Grafana/pgwatch configs) instead of published config images -- Flask bind-mount + optional debugpy +- Metrics server bind-mount for live reload - exposing `sink-postgres` on localhost for host-run reporter - (optional) an alternate mode to run the reporter *inside Docker* (commented in the example override). 
**Host-run reporter is the primary workflow.** @@ -238,41 +238,31 @@ This repo includes `.vscode/launch.json` with a config: Use **Run and Debug** → select **Run Reporter (local)**. -### Debug Flask backend in Docker (optional) +### Debug metrics-server in Docker (optional) -The override file bind-mounts `./monitoring_flask_backend` into the container for fast iteration. +The override file bind-mounts `./cli/lib` into the container for fast iteration. -#### Run without debugger +#### Run metrics-server locally ```bash -docker compose up -d --force-recreate monitoring_flask_backend +docker compose up -d --force-recreate metrics-server ``` -#### Enable debugpy (attach debugger) - -```bash -DEBUGPY_FLASK=1 docker compose up -d --force-recreate monitoring_flask_backend -``` - -Then attach from Cursor/VS Code: - -- **Attach (Flask in Docker: debugpy 5678)** - -The Flask service (gunicorn) is exposed on: +The metrics server is exposed on: - `http://localhost:55000` -### (Optional) Debug reporter in Docker +### Debug reporter locally -This is usually slower than host-run debugging, but it exists for parity: +The reporter is a TypeScript/Bun application. 
Run it directly: ```bash -DEBUGPY_REPORTER=1 docker compose up -d --force-recreate postgres-reports +cd cli && bun run lib/reporter.ts --prometheus-url http://localhost:59090 ``` -Then attach: +Then use VS Code debugging: -- **Attach (Reporter in Docker: debugpy 5679)** +- **Run Reporter (TypeScript)** - from `.vscode/launch.example.json` ### Troubleshooting @@ -341,7 +331,7 @@ postgresai mon reset ```bash postgresai mon logs postgresai mon logs grafana -postgresai mon logs monitoring_flask_backend +postgresai mon logs metrics-server ``` ### Stop / start diff --git a/README.md b/README.md index 5a677a32..63c4028f 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,7 @@ This monitoring solution exposes several ports that **MUST** be properly firewal - **Port 58089** (PGWatch Prometheus) - Database monitoring interface - **Port 59090** (Victoria Metrics) - Metrics storage and queries - **Port 59091** (PGWatch Prometheus endpoint) - Metrics collection -- **Port 55000** (Flask API) - Backend API service +- **Port 55000** (Metrics Server) - Backend API service - **Port 55432** (Demo DB) - When using `--demo` option - **Port 55433** (Metrics DB) - Postgres metrics storage diff --git a/cli/Dockerfile.metrics-server b/cli/Dockerfile.metrics-server new file mode 100644 index 00000000..1a6ebcc9 --- /dev/null +++ b/cli/Dockerfile.metrics-server @@ -0,0 +1,49 @@ +# Dockerfile for the TypeScript Metrics Server +# Replaces the Python Flask backend with a Bun-based implementation + +FROM oven/bun:1-slim AS builder + +WORKDIR /app + +# Copy package files +COPY package.json bun.lockb* ./ + +# Install dependencies +RUN bun install --frozen-lockfile + +# Copy source files +COPY lib/ lib/ +COPY bin/ bin/ +COPY scripts/ scripts/ +COPY test/ test/ +COPY tsconfig.json ./ + +# Verify the build works +RUN bun run typecheck || true + +# Production stage +FROM oven/bun:1-slim + +# Install curl for healthcheck +RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf 
/var/lib/apt/lists/* + +WORKDIR /app + +# Copy node modules and source +COPY --from=builder /app/node_modules ./node_modules +COPY --from=builder /app/lib ./lib +COPY --from=builder /app/package.json ./ + +# Set environment variables +ENV PORT=8000 +ENV PROMETHEUS_URL=http://localhost:8428 + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Run the metrics server +CMD ["bun", "run", "lib/metrics-server.ts"] diff --git a/cli/Dockerfile.reporter b/cli/Dockerfile.reporter new file mode 100644 index 00000000..0ac3411e --- /dev/null +++ b/cli/Dockerfile.reporter @@ -0,0 +1,42 @@ +# Dockerfile for the TypeScript Reporter +# Replaces the Python reporter with a Bun-based implementation + +FROM oven/bun:1-slim AS builder + +WORKDIR /app + +# Copy package files +COPY package.json bun.lockb* ./ + +# Install dependencies +RUN bun install --frozen-lockfile + +# Copy source files +COPY lib/ lib/ +COPY bin/ bin/ +COPY scripts/ scripts/ +COPY test/ test/ +COPY tsconfig.json ./ + +# Verify the build works +RUN bun run typecheck || true + +# Production stage +FROM oven/bun:1-slim + +WORKDIR /app + +# Copy node modules and source +COPY --from=builder /app/node_modules ./node_modules +COPY --from=builder /app/lib ./lib +COPY --from=builder /app/package.json ./ + +# Create reports directory +RUN mkdir -p /app/reports + +# Set environment variables +ENV PROMETHEUS_URL=http://sink-prometheus:9090 +ENV POSTGRES_SINK_URL=postgresql://pgwatch@sink-postgres:5432/measurements + +# Run the reporter +CMD ["bun", "run", "lib/reporter.ts"] diff --git a/cli/bin/postgres-ai.ts b/cli/bin/postgres-ai.ts index 191088c1..fefb6886 100644 --- a/cli/bin/postgres-ai.ts +++ b/cli/bin/postgres-ai.ts @@ -1481,7 +1481,7 @@ mon console.log(" ✅ PostgreSQL monitoring infrastructure"); console.log(" ✅ Grafana dashboards (with secure password)"); console.log(" ✅ Prometheus metrics 
storage"); - console.log(" ✅ Flask API backend"); + console.log(" ✅ Metrics server API"); console.log(" ✅ Automated report generation (every 24h)"); console.log(" ✅ Host stats monitoring (CPU, memory, disk, I/O)\n"); diff --git a/cli/lib/metrics-server.ts b/cli/lib/metrics-server.ts new file mode 100644 index 00000000..93a30284 --- /dev/null +++ b/cli/lib/metrics-server.ts @@ -0,0 +1,1041 @@ +/** + * Metrics Server Module + * ===================== + * HTTP server for exporting PostgreSQL metrics from VictoriaMetrics/Prometheus + * as CSV files. This is the TypeScript/Bun port of the Python Flask backend. + * + * Endpoints: + * - GET /health - Health check with Prometheus connectivity test + * - GET /pgss_metrics/csv - pg_stat_statements metrics as CSV + * - GET /btree_bloat/csv - B-tree bloat metrics as CSV + * - GET /table_info/csv - Table statistics as CSV + * - GET /metrics - List available metrics + * - GET /debug/metrics - Debug endpoint for btree bloat metrics + */ + +// Metric name mapping for cleaner CSV output +const METRIC_NAME_MAPPING: Record = { + calls: "calls", + exec_time_total: "exec_time", + plan_time_total: "plan_time", + rows: "rows", + shared_bytes_hit_total: "shared_blks_hit", + shared_bytes_read_total: "shared_blks_read", + shared_bytes_dirtied_total: "shared_blks_dirtied", + shared_bytes_written_total: "shared_blks_written", + block_read_total: "blk_read_time", + block_write_total: "blk_write_time", +}; + +// pg_stat_statements metrics to query +const PGSS_METRICS = [ + "pgwatch_pg_stat_statements_calls", + "pgwatch_pg_stat_statements_plans_total", + "pgwatch_pg_stat_statements_exec_time_total", + "pgwatch_pg_stat_statements_plan_time_total", + "pgwatch_pg_stat_statements_rows", + "pgwatch_pg_stat_statements_shared_bytes_hit_total", + "pgwatch_pg_stat_statements_shared_bytes_read_total", + "pgwatch_pg_stat_statements_shared_bytes_dirtied_total", + "pgwatch_pg_stat_statements_shared_bytes_written_total", + 
"pgwatch_pg_stat_statements_block_read_total",
+  "pgwatch_pg_stat_statements_block_write_total",
+  "pgwatch_pg_stat_statements_wal_records",
+  "pgwatch_pg_stat_statements_wal_fpi",
+  "pgwatch_pg_stat_statements_wal_bytes",
+  "pgwatch_pg_stat_statements_temp_bytes_read",
+  "pgwatch_pg_stat_statements_temp_bytes_written",
+];
+
+// Table metrics to query: short metric key -> Prometheus series name.
+// The short keys are what the table-stats processing reads back
+// (e.g. "seq_scan", "total_size").
+const TABLE_BASE_METRICS: Record<string, string> = {
+  total_size: "pgwatch_pg_class_total_relation_size_bytes",
+  table_size: "pgwatch_table_size_detailed_table_main_size_b",
+  index_size: "pgwatch_table_size_detailed_table_indexes_size_b",
+  toast_size: "pgwatch_table_size_detailed_total_toast_size_b",
+  seq_scan: "pgwatch_pg_stat_all_tables_seq_scan",
+  idx_scan: "pgwatch_pg_stat_all_tables_idx_scan",
+  n_tup_ins: "pgwatch_table_stats_n_tup_ins",
+  n_tup_upd: "pgwatch_table_stats_n_tup_upd",
+  n_tup_del: "pgwatch_table_stats_n_tup_del",
+  n_tup_hot_upd: "pgwatch_table_stats_n_tup_hot_upd",
+  heap_blks_read: "pgwatch_pg_statio_all_tables_heap_blks_read",
+  heap_blks_hit: "pgwatch_pg_statio_all_tables_heap_blks_hit",
+  idx_blks_read: "pgwatch_pg_statio_all_tables_idx_blks_read",
+  idx_blks_hit: "pgwatch_pg_statio_all_tables_idx_blks_hit",
+};
+
+// Btree bloat metrics (one Prometheus series per CSV column)
+const BTREE_BLOAT_METRICS = [
+  "pgwatch_pg_btree_bloat_real_size_mib",
+  "pgwatch_pg_btree_bloat_extra_size",
+  "pgwatch_pg_btree_bloat_extra_pct",
+  "pgwatch_pg_btree_bloat_fillfactor",
+  "pgwatch_pg_btree_bloat_bloat_size",
+  "pgwatch_pg_btree_bloat_bloat_pct",
+  "pgwatch_pg_btree_bloat_is_na",
+];
+
+// Types for Prometheus responses
+// Label set of one series; every label is optional because different
+// pgwatch metrics expose different label names for the same concept
+// (schemaname/schema, relname/table_name/tblname).
+interface PrometheusMetric {
+  __name__?: string;
+  datname?: string;
+  queryid?: string;
+  user?: string;
+  instance?: string;
+  schemaname?: string;
+  schema?: string;
+  relname?: string;
+  table_name?: string;
+  tblname?: string;
+  idxname?: string;
+  cluster?: string;
+  node_name?: string;
+  [key: string]: string | undefined;
+}
+
+// One series from a query result: `value` is read for instant queries,
+// `values` for range queries; each sample is [timestamp, value-as-string].
+interface PrometheusValue {
+  metric: PrometheusMetric;
+  value?: [number, string];
+  values?:
[number, string][];
+}
+
+interface PrometheusResponse {
+  status: string;
+  data: {
+    result: PrometheusValue[];
+    resultType?: string;
+  };
+}
+
+// Query key types
+type PgssKey = [string, string, string, string]; // [datname, queryid, user, instance]
+type TableKey = [string, string, string]; // [datname, schema, table_name]
+type BtreeKey = [string, string, string, string]; // [datname, schema, table, index]
+
+// Per-key accumulator: metric name -> numeric value, plus the ISO timestamp
+// of the sample that created the entry.
+interface MetricDict {
+  timestamp?: string;
+  [key: string]: number | string | undefined;
+}
+
+/**
+ * Parse time parameter (Unix timestamp in seconds, or any string accepted by
+ * the Date constructor such as ISO 8601).
+ *
+ * @param timeStr Raw parameter value
+ * @returns The corresponding Date
+ * @throws Error("Invalid time format: ...") when the value cannot be parsed
+ */
+export function parseTimeParam(timeStr: string): Date {
+  // All digits, optionally with a decimal part: treat as Unix seconds.
+  // Numeric input is never handed to the Date string parser, which would
+  // interpret values such as "0" as a year instead of the epoch.
+  if (/^\d+(\.\d+)?$/.test(timeStr)) {
+    const fromEpoch = new Date(parseFloat(timeStr) * 1000);
+    if (!isNaN(fromEpoch.getTime())) {
+      return fromEpoch;
+    }
+    // Out of the representable Date range.
+    throw new Error(`Invalid time format: ${timeStr}`);
+  }
+  // Try ISO format - pass directly to Date constructor
+  const date = new Date(timeStr);
+  if (!isNaN(date.getTime())) {
+    return date;
+  }
+  throw new Error(`Invalid time format: ${timeStr}`);
+}
+
+/**
+ * Format date for CSV filename as YYYYMMDD_HHMMSS (UTC, taken from toISOString).
+ */
+function formatDateForFilename(date: Date): string {
+  return date.toISOString().replace(/[-:]/g, "").replace("T", "_").slice(0, 15);
+}
+
+/**
+ * Convert array of objects to CSV string.
+ *
+ * @param data Rows; missing, undefined or null cells are emitted as empty fields
+ * @param fields Column names, used both as the header row and as row keys
+ * @returns Header line plus one line per row, each terminated by "\n"
+ */
+export function toCSV(data: Record<string, unknown>[], fields: string[]): string {
+  const lines: string[] = [fields.join(",")];
+
+  for (const row of data) {
+    const values = fields.map((field) => {
+      const val = row[field];
+      if (val === undefined || val === null) {
+        return "";
+      }
+      const str = String(val);
+      // Quote fields containing a separator, quote or any line break
+      // (including a bare carriage return); embedded quotes are doubled.
+      if (/[",\r\n]/.test(str)) {
+        return `"${str.replace(/"/g, '""')}"`;
+      }
+      return str;
+    });
+    lines.push(values.join(","));
+  }
+
+  return lines.join("\n") + "\n";
+}
+
+/**
+ * Prometheus client for querying
metrics
+ */
+export class PrometheusClient {
+  private baseUrl: string;
+
+  /**
+   * @param prometheusUrl Base URL of the Prometheus-compatible HTTP API;
+   *   trailing slashes are removed so paths can be appended directly
+   */
+  constructor(prometheusUrl: string = "http://localhost:8428") {
+    // Strip every trailing slash, not just one, so "http://host//" does not
+    // produce "http://host//api/v1/...".
+    this.baseUrl = prometheusUrl.replace(/\/+$/, "");
+  }
+
+  /**
+   * Execute instant query
+   *
+   * @param queryStr PromQL expression
+   * @returns Parsed JSON body of /api/v1/query
+   * @throws Error when the HTTP response status is not OK
+   */
+  async query(queryStr: string): Promise<PrometheusResponse> {
+    const url = new URL(`${this.baseUrl}/api/v1/query`);
+    url.searchParams.set("query", queryStr);
+
+    const response = await fetch(url.toString());
+    if (!response.ok) {
+      throw new Error(`Prometheus query failed: ${response.status} ${response.statusText}`);
+    }
+    return response.json() as Promise<PrometheusResponse>;
+  }
+
+  /**
+   * Execute range query
+   *
+   * @param queryStr PromQL expression
+   * @param start Range start (sent as Unix seconds)
+   * @param end Range end (sent as Unix seconds)
+   * @param step Resolution step, passed through unchanged
+   * @returns Parsed JSON body of /api/v1/query_range
+   * @throws Error when the HTTP response status is not OK
+   */
+  async queryRange(
+    queryStr: string,
+    start: Date,
+    end: Date,
+    step: string = "60s"
+  ): Promise<PrometheusResponse> {
+    const url = new URL(`${this.baseUrl}/api/v1/query_range`);
+    url.searchParams.set("query", queryStr);
+    url.searchParams.set("start", (start.getTime() / 1000).toString());
+    url.searchParams.set("end", (end.getTime() / 1000).toString());
+    url.searchParams.set("step", step);
+
+    const response = await fetch(url.toString());
+    if (!response.ok) {
+      throw new Error(`Prometheus range query failed: ${response.status} ${response.statusText}`);
+    }
+    return response.json() as Promise<PrometheusResponse>;
+  }
+
+  /**
+   * Get all available metrics (values of the __name__ label)
+   *
+   * @returns Metric names, or an empty array when the response has no data
+   * @throws Error when the HTTP response status is not OK
+   */
+  async allMetrics(): Promise<string[]> {
+    const url = new URL(`${this.baseUrl}/api/v1/label/__name__/values`);
+    const response = await fetch(url.toString());
+    if (!response.ok) {
+      throw new Error(`Failed to get metrics: ${response.status}`);
+    }
+    const data = (await response.json()) as { status: string; data: string[] };
+    return data.data || [];
+  }
+
+  /**
+   * Test connection by running the `up` query.
+   *
+   * @returns true only when the query succeeds with status "success";
+   *   any thrown error is reported as false rather than propagated
+   */
+  async testConnection(): Promise<boolean> {
+    try {
+      const result = await this.query("up");
+      return result.status === "success";
+    } catch {
+      return false;
+    }
+  }
+}
+
+/**
+ * Build filter string for Prometheus query
+ */
+function buildFilterString(filters: Record<string, string | undefined>): string {
+  const parts: string[] = [];
+  for (const [key, value] of Object.entries(filters)) {
+    if (value) {
+      // Values come from request parameters. Escape backslashes, double
+      // quotes and newlines so the value stays inside the PromQL string
+      // literal and cannot terminate it or inject additional matchers.
+      const literal = value
+        .replace(/\\/g, "\\\\")
+        .replace(/"/g, '\\"')
+        .replace(/\n/g, "\\n");
+      if (key === "schemaname") {
+        // Support regex matching for schema
+        parts.push(`${key}=~"${literal}"`);
+      } else if (key === "node_name" || key === "instance") {
+        // Both keys filter on the `instance` label as a substring match.
+        parts.push(`instance=~".*${literal}.*"`);
+      } else {
+        parts.push(`${key}="${literal}"`);
+      }
+    }
+  }
+  return parts.length > 0 ? `{${parts.join(",")}}` : "";
+}
+
+/**
+ * Return the sample of `entry` whose timestamp is nearest to `targetTs`
+ * (Unix seconds), or undefined when the entry carries no samples.
+ * Reads `values` (range result) first and falls back to `value` (instant
+ * result). On ties the earliest sample in the list wins.
+ */
+function pickClosestSample(
+  entry: PrometheusValue,
+  targetTs: number
+): [number, string] | undefined {
+  const samples: [number, string][] =
+    entry.values || (entry.value ? [[entry.value[0], entry.value[1]]] : []);
+  if (samples.length === 0) return undefined;
+
+  let closest = samples[0];
+  let minDiff = Math.abs(samples[0][0] - targetTs);
+  for (const sample of samples) {
+    const diff = Math.abs(sample[0] - targetTs);
+    if (diff < minDiff) {
+      minDiff = diff;
+      closest = sample;
+    }
+  }
+  return closest;
+}
+
+/**
+ * Parse a Prometheus sample value. parseFloat never throws; it returns NaN
+ * for anything it cannot parse, so NaN is mapped to 0 explicitly.
+ */
+function sampleToNumber(raw: string): number {
+  const parsed = parseFloat(raw);
+  return Number.isNaN(parsed) ? 0 : parsed;
+}
+
+/**
+ * Seconds between the two snapshots: uses the sample timestamps recorded in
+ * the metric dictionaries when both exist, otherwise the requested window.
+ */
+function snapshotDurationSeconds(
+  startMetric: MetricDict,
+  endMetric: MetricDict,
+  startTime: Date,
+  endTime: Date
+): number {
+  if (startMetric.timestamp && endMetric.timestamp) {
+    const startDt = new Date(startMetric.timestamp);
+    const endDt = new Date(endMetric.timestamp);
+    return (endDt.getTime() - startDt.getTime()) / 1000;
+  }
+  return (endTime.getTime() - startTime.getTime()) / 1000;
+}
+
+/**
+ * Convert Prometheus data to dictionary keyed by query identifiers (for PGSS)
+ *
+ * @param promData Series for any number of pg_stat_statements metrics
+ * @param targetTimestamp For each series, the sample nearest to this time is used
+ * @returns Map from "datname|queryid|user|instance" to a dictionary holding
+ *   `timestamp` (ISO string of the first sample seen for that key) and one
+ *   numeric entry per metric, named after the series with the
+ *   "pgwatch_pg_stat_statements_" prefix removed
+ */
+export function prometheusToPgssDict(
+  promData: PrometheusValue[],
+  targetTimestamp: Date
+): Map<string, MetricDict> {
+  const metricsDict = new Map<string, MetricDict>();
+  const targetTs = targetTimestamp.getTime() / 1000;
+
+  for (const entry of promData) {
+    const metric = entry.metric;
+    const closestValue = pickClosestSample(entry, targetTs);
+    if (!closestValue) continue;
+
+    // Create unique key for this query
+    const key = [
+      metric.datname || "",
+      metric.queryid || "",
+      metric.user || "",
+      metric.instance || "",
+    ].join("|");
+
+    let bucket = metricsDict.get(key);
+    if (!bucket) {
+      bucket = { timestamp: new Date(closestValue[0] * 1000).toISOString() };
+      metricsDict.set(key, bucket);
+    }
+
+    // Add metric value
+    const metricName = (metric.__name__ || "pgwatch_pg_stat_statements_calls").replace(
+      "pgwatch_pg_stat_statements_",
+      ""
+    );
+    bucket[metricName] = sampleToNumber(closestValue[1]);
+  }
+
+  return metricsDict;
+}
+
+/**
+ * Convert Prometheus table data to dictionary
+ *
+ * @param promData Map from short metric name to the series returned for it
+ * @param targetTimestamp For each series, the sample nearest to this time is used
+ * @returns Map from "datname|schema|table" to a dictionary holding
+ *   `timestamp` and one numeric entry per short metric name
+ */
+export function prometheusTableToDict(
+  promData: Map<string, PrometheusValue[]>,
+  targetTimestamp: Date
+): Map<string, MetricDict> {
+  const metricsDict = new Map<string, MetricDict>();
+  const targetTs = targetTimestamp.getTime() / 1000;
+
+  for (const [metricName, entries] of promData) {
+    for (const entry of entries) {
+      const metric = entry.metric;
+      const closestValue = pickClosestSample(entry, targetTs);
+      if (!closestValue) continue;
+
+      // Handle different label names
+      const schemaLabel = metric.schemaname || metric.schema || "";
+      const tableLabel = metric.relname || metric.table_name || metric.tblname || "";
+
+      const key = [metric.datname || "", schemaLabel, tableLabel].join("|");
+
+      let bucket = metricsDict.get(key);
+      if (!bucket) {
+        bucket = { timestamp: new Date(closestValue[0] * 1000).toISOString() };
+        metricsDict.set(key, bucket);
+      }
+
+      bucket[metricName] = sampleToNumber(closestValue[1]);
+    }
+  }
+
+  return metricsDict;
+}
+
+/**
+ * Process PGSS data and calculate differences
+ *
+ * For every query key present in either snapshot, emits the end-minus-start
+ * difference of each metric in METRIC_NAME_MAPPING together with
+ * `<name>_per_sec` and `<name>_per_call`. A metric missing from a snapshot
+ * counts as 0.
+ *
+ * @returns Rows sorted by `exec_time` descending
+ */
+export function processPgssData(
+  startData: PrometheusValue[],
+  endData: PrometheusValue[],
+  startTime: Date,
+  endTime: Date
+): Record<string, number | string>[] {
+  const startMetrics = prometheusToPgssDict(startData, startTime);
+  const endMetrics = prometheusToPgssDict(endData, endTime);
+
+  if (startMetrics.size === 0 && endMetrics.size === 0) {
+    return [];
+  }
+
+  // Combine all keys
+  const allKeys = new Set([...startMetrics.keys(), ...endMetrics.keys()]);
+  const resultRows: Record<string, number | string>[] = [];
+
+  for (const key of allKeys) {
+    const startMetric: MetricDict = startMetrics.get(key) || {};
+    const endMetric: MetricDict = endMetrics.get(key) || {};
+
+    // Parse key: only the query id (second component) is emitted.
+    const [, queryId] = key.split("|");
+
+    // Calculate duration
+    const actualDuration = snapshotDurationSeconds(startMetric, endMetric, startTime, endTime);
+
+    const row: Record<string, number | string> = {
+      queryid: queryId,
+      duration_seconds: actualDuration,
+    };
+
+    // Calculate differences and rates for each metric
+    for (const [col, displayName] of Object.entries(METRIC_NAME_MAPPING)) {
+      const startVal = (startMetric[col] as number) || 0;
+      const endVal = (endMetric[col] as number) || 0;
+      let diff = endVal - startVal;
+
+      // Convert bytes to blocks for block-related metrics
+      if (displayName.includes("blks") && col.includes("bytes")) {
+        diff = diff / 8192;
+      }
+
+      row[displayName] = diff;
+
+      // Calculate rate per second
+      if (actualDuration > 0) {
+        row[`${displayName}_per_sec`] = diff / actualDuration;
+      } else {
+        row[`${displayName}_per_sec`] = 0;
+      }
+
+      // Calculate per-call average. Reads row["calls"], which is already
+      // set because "calls" is the first entry of METRIC_NAME_MAPPING.
+      const callsDiff = (row["calls"] as number) || 0;
+      if (callsDiff > 0) {
+        row[`${displayName}_per_call`] = diff / callsDiff;
+      } else {
+        row[`${displayName}_per_call`] = 0;
+      }
+    }
+
+    resultRows.push(row);
+  }
+
+  // Sort by execution time descending
+  resultRows.sort((a, b) => ((b.exec_time as number) || 0) - ((a.exec_time as number) || 0));
+
+  return resultRows;
+}
+
+/**
+ * Process table stats with rates
+ *
+ * For every table key present in either snapshot, emits end-minus-start
+ * differences and per-second rates for the counter metrics, plus the size
+ * metrics taken from the end snapshot only.
+ *
+ * @returns Rows sorted by `total_size` descending
+ */
+export function processTableStatsWithRates(
+  startData: Map<string, PrometheusValue[]>,
+  endData: Map<string, PrometheusValue[]>,
+  startTime: Date,
+  endTime: Date
+): Record<string, number | string>[] {
+  const startMetrics = prometheusTableToDict(startData, startTime);
+  const endMetrics = prometheusTableToDict(endData, endTime);
+
+  if (startMetrics.size === 0 && endMetrics.size === 0) {
+    return [];
+  }
+
+  const allKeys = new Set([...startMetrics.keys(), ...endMetrics.keys()]);
+  const resultRows: Record<string, number | string>[] = [];
+
+  const counterMetrics = [
+    "seq_scan",
+    "idx_scan",
+    "n_tup_ins",
+    "n_tup_upd",
+    "n_tup_del",
+    "n_tup_hot_upd",
+    "heap_blks_read",
+    "heap_blks_hit",
+    "idx_blks_read",
+    "idx_blks_hit",
+  ];
+
+  // Output column names; counters not listed here keep their metric name.
+  const displayNames: Record<string, string> = {
+    seq_scan: "seq_scans",
+    idx_scan: "idx_scans",
+    n_tup_ins: "inserts",
+    n_tup_upd: "updates",
+    n_tup_del: "deletes",
+    n_tup_hot_upd: "hot_updates",
+  };
+
+  for (const key of allKeys) {
+    const startMetric: MetricDict = startMetrics.get(key) || {};
+    const endMetric: MetricDict = endMetrics.get(key) || {};
+
+    // The database component of the key is not emitted.
+    const [, schemaName, tableName] = key.split("|");
+
+    // Calculate duration
+    const actualDuration = snapshotDurationSeconds(startMetric, endMetric, startTime, endTime);
+
+    const row: Record<string, number | string> = {
+      schema: schemaName,
+      table_name: tableName,
+      duration_seconds: actualDuration,
+    };
+
+    // Calculate differences and rates for counter metrics
+    for (const metric of counterMetrics) {
+      const startVal = (startMetric[metric] as number) || 0;
+      const endVal = (endMetric[metric] as number) || 0;
+      const diff = endVal - startVal;
+
+      const displayName = displayNames[metric] || metric;
+      row[displayName] = diff;
+
+      if (actualDuration > 0) {
+        row[`${displayName}_per_sec`] = diff / actualDuration;
+      } else {
+        row[`${displayName}_per_sec`] = 0;
+      }
+    }
+
+    // Size metrics (just use end values)
+    for (const sizeMetric of ["total_size", "table_size", "index_size", "toast_size"]) {
+      row[sizeMetric] = (endMetric[sizeMetric] as number) || 0;
+    }
+
+    resultRows.push(row);
+  }
+
+  // Sort by total size descending
+  resultRows.sort((a, b) => ((b.total_size as number) || 0) - ((a.total_size as number) || 0));
+
+  return resultRows;
+}
+
+/**
+ * Metrics Server
+ */
+export class MetricsServer {
+  private prometheus: PrometheusClient;
+  private port: number;
+  // NOTE(review): the type argument was missing in the reviewed text;
+  // `typeof Bun.serve` is assumed from the Bun runtime - confirm.
+  private server: ReturnType<typeof Bun.serve> | null = null;
+
+  /**
+   * @param prometheusUrl Base URL handed to PrometheusClient
+   * @param port Port number stored for the server
+   */
+  constructor(prometheusUrl: string = "http://localhost:8428", port: number = 8000) {
+    this.prometheus = new PrometheusClient(prometheusUrl);
+    this.port = port;
+  }
+ + /** + * Handle health check + */ + async handleHealth(): Promise { + try { + const healthy = await this.prometheus.testConnection(); + if (healthy) { + return Response.json({ + status: "healthy", + prometheus_url: this.prometheus["baseUrl"], + }); + } + return Response.json({ status: "unhealthy", error: "Prometheus connection failed" }, { status: 500 }); + } catch (error) { + return Response.json( + { status: "unhealthy", error: error instanceof Error ? error.message : String(error) }, + { status: 500 } + ); + } + } + + /** + * Handle PGSS metrics endpoint + */ + async handlePgssMetrics(url: URL): Promise { + try { + const timeStart = url.searchParams.get("time_start"); + const timeEnd = url.searchParams.get("time_end"); + + if (!timeStart || !timeEnd) { + return Response.json( + { error: "time_start and time_end parameters are required" }, + { status: 400 } + ); + } + + const startDt = parseTimeParam(timeStart); + const endDt = parseTimeParam(timeEnd); + + const filters = buildFilterString({ + cluster: url.searchParams.get("cluster_name") || undefined, + datname: url.searchParams.get("db_name") || undefined, + instance: url.searchParams.get("node_name") || undefined, + }); + + // Query metrics at start and end times + const startData: PrometheusValue[] = []; + const endData: PrometheusValue[] = []; + + for (const metric of PGSS_METRICS) { + const queryStr = filters ? 
`${metric}${filters}` : metric; + try { + // Query around start time + const startResult = await this.prometheus.queryRange( + queryStr, + new Date(startDt.getTime() - 60000), + new Date(startDt.getTime() + 60000) + ); + startData.push(...startResult.data.result); + + // Query around end time + const endResult = await this.prometheus.queryRange( + queryStr, + new Date(endDt.getTime() - 60000), + new Date(endDt.getTime() + 60000) + ); + endData.push(...endResult.data.result); + } catch (err) { + console.warn(`Failed to query metric ${metric}:`, err); + } + } + + // Process data + const csvData = processPgssData(startData, endData, startDt, endDt); + + // Build CSV + const baseFields = ["queryid", "duration_seconds"]; + const metricFields: string[] = []; + const desiredOrder = [ + "calls", + "exec_time", + "plan_time", + "rows", + "shared_blks_hit", + "shared_blks_read", + "shared_blks_dirtied", + "shared_blks_written", + "blk_read_time", + "blk_write_time", + ]; + + for (const displayName of desiredOrder) { + if (Object.values(METRIC_NAME_MAPPING).includes(displayName)) { + metricFields.push(displayName, `${displayName}_per_sec`, `${displayName}_per_call`); + } + } + + const csvContent = toCSV(csvData, [...baseFields, ...metricFields]); + const filename = `pgss_metrics_${formatDateForFilename(startDt)}_${formatDateForFilename(endDt)}.csv`; + + return new Response(csvContent, { + headers: { + "Content-Type": "text/csv", + "Content-Disposition": `attachment; filename=${filename}`, + }, + }); + } catch (error) { + console.error("Error processing PGSS metrics request:", error); + return Response.json( + { error: error instanceof Error ? 
/**
 * Handle btree bloat endpoint.
 *
 * Builds a label filter from the `cluster_name`, `node_name`, `db_name`,
 * `schemaname`, `tblname` and `idxname` query parameters, reads the most
 * recent sample (looking back one day) of every metric in
 * BTREE_BLOAT_METRICS, merges the samples into one row per
 * database/schema/table/index and returns the rows as a CSV attachment.
 *
 * A failing query for a single metric is logged and skipped, so the affected
 * column is simply left empty; any other failure produces a JSON error.
 *
 * @param url Parsed request URL whose search parameters carry the filters.
 * @returns CSV response on success, JSON `{ error }` with status 500 otherwise.
 */
async handleBtreeBloat(url: URL): Promise<Response> {
  // Ordered routing table: the first fragment contained in the metric name
  // selects the CSV column. "extra_pct" is listed before "extra_size" so that
  // a name containing both is routed to extra_pct, as the endpoint always did.
  const columnByFragment: Array<[string, string]> = [
    ["real_size_mib", "real_size_mib"],
    ["extra_pct", "extra_pct"],
    ["extra_size", "extra_size"],
    ["fillfactor", "fillfactor"],
    ["bloat_size", "bloat_size"],
    ["bloat_pct", "bloat_pct"],
    ["is_na", "is_na"],
  ];

  try {
    const filters = buildFilterString({
      cluster: url.searchParams.get("cluster_name") || undefined,
      node_name: url.searchParams.get("node_name") || undefined,
      datname: url.searchParams.get("db_name") || undefined,
      schemaname: url.searchParams.get("schemaname") || undefined,
      tblname: url.searchParams.get("tblname") || undefined,
      idxname: url.searchParams.get("idxname") || undefined,
    });

    // One row per index, keyed by "datname|schemaname|tblname|idxname".
    const metricResults = new Map<string, Record<string, string | number>>();

    for (const metric of BTREE_BLOAT_METRICS) {
      const query = `last_over_time(${metric}${filters}[1d])`;
      const route = columnByFragment.find(([fragment]) => metric.includes(fragment));

      try {
        const result = await this.prometheus.query(query);

        for (const entry of result.data.result) {
          const labels = entry.metric;
          const key = [
            labels.datname || "",
            labels.schemaname || "",
            labels.tblname || "",
            labels.idxname || "",
          ].join("|");

          let row = metricResults.get(key);
          if (!row) {
            row = {
              database: labels.datname || "",
              schemaname: labels.schemaname || "",
              tblname: labels.tblname || "",
              idxname: labels.idxname || "",
            };
            metricResults.set(key, row);
          }

          // Metrics that match no known fragment still create the row but
          // contribute no column.
          if (!route) continue;

          const value = entry.value ? parseFloat(entry.value[1]) : 0;
          // is_na is a 0/1 flag, so it is emitted as an integer.
          row[route[1]] = route[1] === "is_na" ? Math.round(value) : value;
        }
      } catch (err) {
        console.warn(`Failed to query ${metric}:`, err);
      }
    }

    const fields = [
      "database",
      "schemaname",
      "tblname",
      "idxname",
      "real_size_mib",
      "extra_size",
      "extra_pct",
      "fillfactor",
      "bloat_size",
      "bloat_pct",
      "is_na",
    ];

    const csvContent = toCSV([...metricResults.values()], fields);

    return new Response(csvContent, {
      headers: {
        "Content-Type": "text/csv",
        "Content-Disposition": "attachment; filename=btree_bloat_latest.csv",
      },
    });
  } catch (error) {
    console.error("Error processing btree bloat request:", error);
    return Response.json(
      { error: error instanceof Error ? error.message : String(error) },
      { status: 500 }
    );
  }
}
"heap_blks_read_per_sec", + "heap_blks_hit", + "heap_blks_hit_per_sec", + "idx_blks_read", + "idx_blks_read_per_sec", + "idx_blks_hit", + "idx_blks_hit_per_sec", + "duration_seconds", + ]; + + const csvContent = toCSV(metricResults, fields); + const filename = `table_stats_${formatDateForFilename(startDt)}_${formatDateForFilename(endDt)}.csv`; + + return new Response(csvContent, { + headers: { + "Content-Type": "text/csv", + "Content-Disposition": `attachment; filename=${filename}`, + }, + }); + } else { + // Instant query mode + metricResults = new Map>(); + + for (const [metricName, metricQuery] of Object.entries(TABLE_BASE_METRICS)) { + const query = `last_over_time(${metricQuery}${filters}[1d])`; + try { + const result = await this.prometheus.query(query); + + for (const entry of result.data.result) { + const labels = entry.metric; + const schemaLabel = labels.schemaname || labels.schema || ""; + const tableLabel = labels.relname || labels.table_name || labels.tblname || ""; + + const key = [labels.datname || "", schemaLabel, tableLabel].join("|"); + + if (!metricResults.has(key)) { + metricResults.set(key, { + schema: schemaLabel, + table_name: tableLabel, + }); + } + + const value = entry.value ? 
parseFloat(entry.value[1]) : 0; + metricResults.get(key)![metricName] = value; + } + } catch (err) { + console.warn(`Failed to query metric ${metricName}:`, err); + } + } + + const fields = [ + "schema", + "table_name", + "total_size", + "table_size", + "index_size", + "toast_size", + "seq_scan", + "idx_scan", + "n_tup_ins", + "n_tup_upd", + "n_tup_del", + "n_tup_hot_upd", + "heap_blks_read", + "heap_blks_hit", + "idx_blks_read", + "idx_blks_hit", + ]; + + const csvContent = toCSV([...metricResults.values()], fields); + + return new Response(csvContent, { + headers: { + "Content-Type": "text/csv", + "Content-Disposition": "attachment; filename=table_stats_latest.csv", + }, + }); + } + } catch (error) { + console.error("Error processing table info request:", error); + return Response.json( + { error: error instanceof Error ? error.message : String(error) }, + { status: 500 } + ); + } + } + + /** + * Handle metrics list endpoint + */ + async handleMetricsList(): Promise { + try { + const allMetrics = await this.prometheus.allMetrics(); + const pgssMetrics = allMetrics.filter((m) => m.includes("pg_stat_statements")); + return Response.json({ pg_stat_statements_metrics: pgssMetrics }); + } catch (error) { + return Response.json( + { error: error instanceof Error ? 
/**
 * Handle debug metrics endpoint.
 *
 * Lists every metric name known to Prometheus, keeps those containing
 * "btree_bloat", and for the first five of them runs an instant query to
 * report how many series exist and the label sets of the first two series.
 *
 * @returns JSON with `all_metrics_count`, `btree_metrics` and `sample_data`;
 *          JSON `{ error }` with status 500 if the metric list cannot be read.
 */
async handleDebugMetrics(): Promise<Response> {
  try {
    const allMetrics = await this.prometheus.allMetrics();
    const btreeMetrics = allMetrics.filter((m) => m.includes("btree_bloat"));

    // Per-metric sample, or `{ error }` when that single query failed.
    const sampleData: Record<string, unknown> = {};
    for (const metric of btreeMetrics.slice(0, 5)) {
      try {
        const result = await this.prometheus.query(metric);
        const series = result.data.result;
        sampleData[metric] = {
          count: series.length,
          sample_labels: series.slice(0, 2).map((entry) => entry.metric),
        };
      } catch (err) {
        // A broken metric must not hide the others from the debug output.
        sampleData[metric] = { error: String(err) };
      }
    }

    return Response.json({
      all_metrics_count: allMetrics.length,
      btree_metrics: btreeMetrics,
      sample_data: sampleData,
    });
  } catch (error) {
    return Response.json(
      { error: error instanceof Error ? error.message : String(error) },
      { status: 500 }
    );
  }
}
// CLI entrypoint: start a MetricsServer when this module is run directly.
// PROMETHEUS_URL selects the backend (default http://localhost:8428) and
// PORT the listening port (default 8000).
if (import.meta.main) {
  const prometheusUrl = process.env.PROMETHEUS_URL || "http://localhost:8428";
  const rawPort = process.env.PORT || "8000";
  const port = parseInt(rawPort, 10);

  // parseInt returns NaN for non-numeric input; fail with a clear message
  // instead of handing an unusable port to the server.
  if (Number.isNaN(port) || port < 0 || port > 65535) {
    console.error(`Invalid PORT value "${rawPort}": expected an integer between 0 and 65535`);
    process.exit(1);
  }

  const server = new MetricsServer(prometheusUrl, port);
  server.start();
}
+ * + * Check Types: + * - A002: PostgreSQL version + * - A003: PostgreSQL settings + * - A004: Cluster information + * - A007: Altered settings + * - D004: pg_stat_statements settings + * - F001: Autovacuum settings + * - F004: Heap bloat + * - F005: Btree bloat + * - G001: Memory settings + * - H001: Invalid indexes + * - H002: Unused indexes + * - H004: Redundant indexes + * - K001-K008: Query performance metrics + * - M001-M003: Query timing metrics + * - N001: Wait events + */ + +import { Client } from "pg"; +import * as fs from "fs"; +import * as path from "path"; +import * as pkg from "../package.json"; +import { PrometheusClient } from "./metrics-server"; + +// Version for reports +const REPORTER_VERSION = "1.0.0"; + +// Settings filter lists +const D004_SETTINGS = [ + "pg_stat_statements.max", + "pg_stat_statements.track", + "pg_stat_statements.track_utility", + "pg_stat_statements.save", + "pg_stat_statements.track_planning", + "shared_preload_libraries", + "track_activities", + "track_counts", + "track_functions", + "track_io_timing", + "track_wal_io_timing", +]; + +const F001_SETTINGS = [ + "autovacuum", + "autovacuum_analyze_scale_factor", + "autovacuum_analyze_threshold", + "autovacuum_freeze_max_age", + "autovacuum_max_workers", + "autovacuum_multixact_freeze_max_age", + "autovacuum_naptime", + "autovacuum_vacuum_cost_delay", + "autovacuum_vacuum_cost_limit", + "autovacuum_vacuum_insert_scale_factor", + "autovacuum_vacuum_scale_factor", + "autovacuum_vacuum_threshold", + "autovacuum_work_mem", + "vacuum_cost_delay", + "vacuum_cost_limit", + "vacuum_cost_page_dirty", + "vacuum_cost_page_hit", + "vacuum_cost_page_miss", + "vacuum_freeze_min_age", + "vacuum_freeze_table_age", + "vacuum_multixact_freeze_min_age", + "vacuum_multixact_freeze_table_age", +]; + +const G001_SETTINGS = [ + "shared_buffers", + "work_mem", + "maintenance_work_mem", + "effective_cache_size", + "autovacuum_work_mem", + "max_wal_size", + "min_wal_size", + "wal_buffers", + 
"checkpoint_completion_target", + "max_connections", + "max_prepared_transactions", + "max_locks_per_transaction", + "max_pred_locks_per_transaction", + "max_pred_locks_per_relation", + "max_pred_locks_per_page", + "logical_decoding_work_mem", + "hash_mem_multiplier", + "temp_buffers", + "shared_preload_libraries", + "dynamic_shared_memory_type", + "huge_pages", + "max_files_per_process", + "max_stack_depth", +]; + +// Default excluded databases +const DEFAULT_EXCLUDED_DATABASES = new Set([ + "template0", + "template1", + "rdsadmin", + "azure_maintenance", + "cloudsqladmin", +]); + +// Types +interface PrometheusMetric { + __name__?: string; + cluster?: string; + instance?: string; + datname?: string; + [key: string]: string | undefined; +} + +interface PrometheusValue { + metric: PrometheusMetric; + value?: [number, string]; + values?: [number, string][]; +} + +interface BuildMetadata { + version: string; + build_ts: string | null; +} + +export interface ReportNode { + data: Record; + postgres_version?: { + version: string; + server_version_num: string; + server_major_ver: string; + server_minor_ver: string; + }; +} + +export interface Report { + version: string; + build_ts: string | null; + generation_mode: string; + checkId: string; + checkTitle: string; + timestamptz: string; + nodes: { + primary: string; + standbys: string[]; + }; + results: Record; +} + +/** + * Read text file safely + */ +function readTextFileSafe(p: string): string | null { + try { + const value = fs.readFileSync(p, "utf8").trim(); + return value || null; + } catch { + return null; + } +} + +/** + * Load build metadata + */ +function loadBuildMetadata(): BuildMetadata { + const envPath = process.env.PGAI_BUILD_TS_FILE; + const p = envPath && envPath.trim() ? 
/**
 * Format a byte count as a human-readable string using binary (1024-based)
 * units, with two decimals: e.g. 1536 -> "1.50 KiB".
 *
 * - 0 is rendered as "0 B" (no decimals).
 * - Negative values are formatted by magnitude and prefixed with "-".
 * - NaN and infinities are rendered verbatim followed by " B".
 * - Values above the largest unit stay expressed in PiB.
 *
 * @param bytes Number of bytes; may be fractional or negative.
 * @returns Formatted size string.
 */
function formatBytes(bytes: number): string {
  if (bytes === 0) return "0 B";
  if (bytes < 0) return `-${formatBytes(-bytes)}`;
  if (!Number.isFinite(bytes)) return `${bytes} B`;

  const units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"];
  const exponent = Math.floor(Math.log(bytes) / Math.log(1024));
  // Clamp on both sides: values between 0 and 1 give a negative exponent
  // (which would index units[-1]), huge values exceed the unit table.
  const i = Math.min(Math.max(exponent, 0), units.length - 1);
  return `${(bytes / Math.pow(1024, i)).toFixed(2)} ${units[i]}`;
}
/**
 * Get all databases for a cluster/node.
 *
 * Queries the distinct `datname` labels of `pgwatch_db_stats` for the given
 * cluster (optionally restricted to one instance) and drops the databases in
 * the exclusion set.
 *
 * @param cluster  Value of the `cluster` label to match exactly.
 * @param nodeName Optional value of the `instance` label to match exactly.
 * @returns Sorted database names; an empty array if the query fails (the
 *          error is logged, not thrown).
 */
async getAllDatabases(cluster: string, nodeName?: string): Promise<string[]> {
  // Escape backslashes, double quotes and newlines so a label value cannot
  // terminate the PromQL string literal it is embedded in.
  const escapeLabelValue = (value: string): string =>
    value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\n/g, "\\n");

  try {
    const nodeFilter = nodeName ? `,instance="${escapeLabelValue(nodeName)}"` : "";
    const result = await this.prometheus.query(
      `group by (datname)(pgwatch_db_stats{cluster="${escapeLabelValue(cluster)}"${nodeFilter}})`
    );

    const databases: string[] = [];
    for (const entry of result.data.result) {
      const datname = entry.metric.datname;
      if (datname && !this.excludedDatabases.has(datname)) {
        databases.push(datname);
      }
    }
    return databases.sort();
  } catch (error) {
    console.error("Error getting databases:", error);
    return [];
  }
}
/**
 * Get PostgreSQL version from Prometheus.
 *
 * Reads the `server_version` and `server_version_num` entries of
 * `pgwatch_pg_settings` and derives major/minor from the numeric form
 * (major = num / 10000, minor = num % 10000).
 *
 * @param cluster  Value of the `cluster` label.
 * @param nodeName Optional node name; matched as a substring regex against
 *                 the `instance` label.
 * @returns `version`, `server_version_num`, `server_major_ver` and
 *          `server_minor_ver`; every field is "" when it cannot be
 *          determined. Major/minor are only filled for numeric values of at
 *          least six digits.
 */
async getPostgresVersion(cluster: string, nodeName?: string): Promise<Record<string, string>> {
  const nodeFilter = nodeName ? `,instance=~".*${nodeName}.*"` : "";

  // Fetch one setting value; when several series match, the last one wins.
  const readSetting = async (settingName: string): Promise<string> => {
    const entries = await this.queryInstant(
      `pgwatch_pg_settings{cluster="${cluster}"${nodeFilter},tag_setting_name="${settingName}"}`
    );
    let found = "";
    for (const entry of entries) {
      if (entry.metric.tag_setting_name === settingName) {
        found = entry.metric.tag_setting_value || "";
      }
    }
    return found;
  };

  const version = await readSetting("server_version");
  const versionNum = await readSetting("server_version_num");

  let major = "";
  let minor = "";
  if (versionNum.length >= 6) {
    const num = parseInt(versionNum, 10);
    // parseInt never throws; it returns NaN for non-numeric input, which
    // must not leak into the report as the string "NaN".
    if (!Number.isNaN(num)) {
      major = Math.floor(num / 10000).toString();
      minor = (num % 10000).toString();
    }
  }

  return {
    version,
    server_version_num: versionNum,
    server_major_ver: major,
    server_minor_ver: minor,
  };
}
/**
 * Get settings from Prometheus.
 *
 * Reads every `pgwatch_pg_settings` series of the cluster (optionally
 * restricted to instances whose label contains `nodeName`) and converts the
 * `tag_*` labels into one record per setting name.
 *
 * @param cluster  Value of the `cluster` label.
 * @param nodeName Optional node name, matched as a substring regex against
 *                 the `instance` label.
 * @param filter   Optional list of setting names to keep; all settings are
 *                 returned when omitted.
 * @returns Map of setting name to `{ setting, unit, category, context,
 *          vartype, pretty_value }`. `context` is always "" and
 *          `pretty_value` repeats the raw value.
 */
async getSettings(
  cluster: string,
  nodeName?: string,
  filter?: string[]
): Promise<Record<string, Record<string, string>>> {
  const nodeFilter = nodeName ? `,instance=~".*${nodeName}.*"` : "";
  const query = `pgwatch_pg_settings{cluster="${cluster}"${nodeFilter}}`;
  const result = await this.queryInstant(query);

  // Build the lookup once instead of scanning the filter array per entry.
  const wanted = filter ? new Set(filter) : null;
  const settings: Record<string, Record<string, string>> = {};

  for (const entry of result) {
    const name = entry.metric.tag_setting_name;
    if (!name) continue;
    if (wanted && !wanted.has(name)) continue;

    const value = entry.metric.tag_setting_value || "";

    settings[name] = {
      setting: value,
      unit: entry.metric.tag_unit || "",
      category: entry.metric.tag_category || "",
      context: "",
      vartype: entry.metric.tag_vartype || "",
      pretty_value: value,
    };
  }

  return settings;
}
/**
 * Generate D004 - pg_stat_statements settings report.
 *
 * Collects the PostgreSQL version and the settings listed in D004_SETTINGS
 * and stores them under the node's key in the report results.
 *
 * @param cluster  Cluster whose settings are reported.
 * @param nodeName Node used both as query filter and as result key
 *                 (default "node-01").
 * @returns The populated D004 report.
 */
async generateD004Report(cluster: string, nodeName: string = "node-01"): Promise<Report> {
  const report = this.createBaseReport(
    "D004",
    "pg_stat_statements and pg_stat_kcache settings",
    nodeName
  );
  const version = await this.getPostgresVersion(cluster, nodeName);
  const settings = await this.getSettings(cluster, nodeName, D004_SETTINGS);

  report.results[nodeName] = {
    data: settings,
    postgres_version: version as ReportNode["postgres_version"],
  };

  return report;
}
/**
 * Generate G001 - Memory settings report.
 *
 * Collects the PostgreSQL version and the settings listed in G001_SETTINGS
 * and stores them under the node's key in the report results.
 *
 * @param cluster  Cluster whose settings are reported.
 * @param nodeName Node used both as query filter and as result key
 *                 (default "node-01").
 * @returns The populated G001 report.
 */
async generateG001Report(cluster: string, nodeName: string = "node-01"): Promise<Report> {
  const report = this.createBaseReport("G001", "Memory-related settings", nodeName);
  const version = await this.getPostgresVersion(cluster, nodeName);
  const settings = await this.getSettings(cluster, nodeName, G001_SETTINGS);

  report.results[nodeName] = {
    data: settings,
    postgres_version: version as ReportNode["postgres_version"],
  };

  return report;
}
/**
 * Generate F004 - Heap bloat report.
 *
 * Takes the rows returned by getHeapBloat() and groups them by their
 * `database` field, preserving the order in which they were returned.
 *
 * @param cluster  Cluster to inspect.
 * @param nodeName Node used both as query filter and as result key
 *                 (default "node-01").
 * @returns The populated F004 report; `data` maps database name to its rows.
 */
async generateF004Report(cluster: string, nodeName: string = "node-01"): Promise<Report> {
  const report = this.createBaseReport("F004", "Heap bloat", nodeName);
  const version = await this.getPostgresVersion(cluster, nodeName);
  const bloatData = await this.getHeapBloat(cluster, nodeName);

  // Group by database
  const byDatabase: Record<string, Record<string, unknown>[]> = {};
  for (const row of bloatData) {
    const db = row.database as string;
    const bucket = byDatabase[db] || (byDatabase[db] = []);
    bucket.push(row);
  }

  report.results[nodeName] = {
    data: byDatabase,
    postgres_version: version as ReportNode["postgres_version"],
  };

  return report;
}
/**
 * Generate F005 - Btree bloat report.
 *
 * Takes the rows returned by getBtreeBloat() and groups them by their
 * `database` field, preserving the order in which they were returned.
 *
 * @param cluster  Cluster to inspect.
 * @param nodeName Node used both as query filter and as result key
 *                 (default "node-01").
 * @returns The populated F005 report; `data` maps database name to its rows.
 */
async generateF005Report(cluster: string, nodeName: string = "node-01"): Promise<Report> {
  const report = this.createBaseReport("F005", "Btree bloat", nodeName);
  const version = await this.getPostgresVersion(cluster, nodeName);
  const bloatData = await this.getBtreeBloat(cluster, nodeName);

  // Group by database
  const byDatabase: Record<string, Record<string, unknown>[]> = {};
  for (const row of bloatData) {
    const db = row.database as string;
    const bucket = byDatabase[db] || (byDatabase[db] = []);
    bucket.push(row);
  }

  report.results[nodeName] = {
    data: byDatabase,
    postgres_version: version as ReportNode["postgres_version"],
  };

  return report;
}
/**
 * Generate H001 - Invalid indexes report.
 *
 * Groups the rows returned by getInvalidIndexes() by database. Each database
 * entry holds the rows in `invalid_indexes` and their number in
 * `total_count`.
 *
 * @param cluster  Cluster to inspect.
 * @param nodeName Node used both as query filter and as result key
 *                 (default "node-01").
 * @returns The populated H001 report.
 */
async generateH001Report(cluster: string, nodeName: string = "node-01"): Promise<Report> {
  const report = this.createBaseReport("H001", "Invalid indexes", nodeName);
  const version = await this.getPostgresVersion(cluster, nodeName);
  const indexes = await this.getInvalidIndexes(cluster, nodeName);

  // Group by database
  const byDatabase: Record<
    string,
    { invalid_indexes: Record<string, unknown>[]; total_count: number }
  > = {};
  for (const row of indexes) {
    const db = row.database as string;
    if (!byDatabase[db]) {
      byDatabase[db] = {
        invalid_indexes: [],
        total_count: 0,
      };
    }
    byDatabase[db].invalid_indexes.push(row);
    byDatabase[db].total_count++;
  }

  report.results[nodeName] = {
    data: byDatabase,
    postgres_version: version as ReportNode["postgres_version"],
  };

  return report;
}
/**
 * Generate H002 - Unused indexes report.
 *
 * Groups the rows returned by getUnusedIndexes() by database. Each database
 * entry holds the rows in `unused_indexes` and their number in
 * `total_count`.
 *
 * @param cluster  Cluster to inspect.
 * @param nodeName Node used both as query filter and as result key
 *                 (default "node-01").
 * @returns The populated H002 report.
 */
async generateH002Report(cluster: string, nodeName: string = "node-01"): Promise<Report> {
  const report = this.createBaseReport("H002", "Unused indexes", nodeName);
  const version = await this.getPostgresVersion(cluster, nodeName);
  const indexes = await this.getUnusedIndexes(cluster, nodeName);

  // Group by database
  const byDatabase: Record<
    string,
    { unused_indexes: Record<string, unknown>[]; total_count: number }
  > = {};
  for (const row of indexes) {
    const db = row.database as string;
    if (!byDatabase[db]) {
      byDatabase[db] = {
        unused_indexes: [],
        total_count: 0,
      };
    }
    byDatabase[db].unused_indexes.push(row);
    byDatabase[db].total_count++;
  }

  report.results[nodeName] = {
    data: byDatabase,
    postgres_version: version as ReportNode["postgres_version"],
  };

  return report;
}
// CLI entrypoint: when run directly, generate every report for every
// discovered cluster and write each one to "<cluster>_<checkId>.json" in the
// current working directory. PROMETHEUS_URL and POSTGRES_SINK_URL override
// the default endpoints. Exits with status 1 if Prometheus is unreachable.
if (import.meta.main) {
  const prometheusUrl = process.env.PROMETHEUS_URL || "http://localhost:8428";
  const postgresSinkUrl =
    process.env.POSTGRES_SINK_URL || "postgresql://pgwatch@localhost:5432/measurements";

  const generator = new PostgresReportGenerator(prometheusUrl, postgresSinkUrl);

  // Check connection
  const connected = await generator.testConnection();
  if (!connected) {
    console.error("Cannot connect to Prometheus");
    process.exit(1);
  }

  // Get clusters
  const clusters = await generator.getAllClusters();
  console.log("Discovered clusters:", clusters);

  // Generate reports for each cluster; fall back to "local" when none found
  for (const cluster of clusters.length > 0 ? clusters : ["local"]) {
    const reports = await generator.generateAllReports(cluster);

    // Cluster names are Prometheus label values and may contain characters
    // such as "/" that are unsafe in a file name; keep the output inside the
    // working directory by replacing anything outside [A-Za-z0-9._-].
    const safeCluster = cluster.replace(/[^A-Za-z0-9._-]/g, "_");

    // Write reports to files
    for (const [checkId, report] of Object.entries(reports)) {
      const filename = `${safeCluster}_${checkId}.json`;
      fs.writeFileSync(filename, JSON.stringify(report, null, 2));
      console.log(`Written: ${filename}`);
    }
  }
}
PrometheusClient, + MetricsServer, +} from "../lib/metrics-server"; + +// Test parseTimeParam +describe("parseTimeParam", () => { + test("parses Unix timestamp", () => { + const date = parseTimeParam("1704067200"); + expect(date.getTime()).toBe(1704067200000); + }); + + test("parses ISO format", () => { + const date = parseTimeParam("2024-01-01T00:00:00Z"); + expect(date.toISOString()).toBe("2024-01-01T00:00:00.000Z"); + }); + + test("parses ISO format without Z", () => { + const date = parseTimeParam("2024-01-01T00:00:00+00:00"); + expect(date.getFullYear()).toBe(2024); + }); + + test("throws on invalid format", () => { + expect(() => parseTimeParam("invalid")).toThrow(); + }); +}); + +// Test toCSV +describe("toCSV", () => { + test("creates empty CSV with headers only", () => { + const result = toCSV([], ["col1", "col2"]); + expect(result).toBe("col1,col2\n"); + }); + + test("creates CSV with data", () => { + const data = [ + { col1: "a", col2: "b" }, + { col1: "c", col2: "d" }, + ]; + const result = toCSV(data, ["col1", "col2"]); + expect(result).toBe("col1,col2\na,b\nc,d\n"); + }); + + test("handles missing values", () => { + const data = [{ col1: "a" }]; + const result = toCSV(data, ["col1", "col2"]); + expect(result).toBe("col1,col2\na,\n"); + }); + + test("escapes commas in values", () => { + const data = [{ col1: "a,b", col2: "c" }]; + const result = toCSV(data, ["col1", "col2"]); + expect(result).toContain('"a,b"'); + }); + + test("escapes quotes in values", () => { + const data = [{ col1: 'a"b', col2: "c" }]; + const result = toCSV(data, ["col1", "col2"]); + expect(result).toContain('"a""b"'); + }); + + test("escapes newlines in values", () => { + const data = [{ col1: "a\nb", col2: "c" }]; + const result = toCSV(data, ["col1", "col2"]); + expect(result).toContain('"a\nb"'); + }); + + test("handles null and undefined", () => { + const data = [{ col1: null, col2: undefined }]; + const result = toCSV(data as unknown as Record[], ["col1", "col2"]); + 
expect(result).toBe("col1,col2\n,\n"); + }); +}); + +// Test prometheusToPgssDict +describe("prometheusToPgssDict", () => { + test("returns empty map for empty input", () => { + const result = prometheusToPgssDict([], new Date()); + expect(result.size).toBe(0); + }); + + test("parses single metric", () => { + const timestamp = new Date("2024-01-01T00:00:00Z"); + const data = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "testdb", + queryid: "12345", + user: "postgres", + instance: "localhost", + }, + values: [[timestamp.getTime() / 1000, "100"]], + }, + ]; + + const result = prometheusToPgssDict(data as any, timestamp); + expect(result.size).toBe(1); + + const key = "testdb|12345|postgres|localhost"; + expect(result.has(key)).toBe(true); + expect(result.get(key)?.calls).toBe(100); + }); + + test("handles entry with value instead of values", () => { + const timestamp = new Date("2024-01-01T00:00:00Z"); + const data = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + value: [timestamp.getTime() / 1000, "50"], + }, + ]; + + const result = prometheusToPgssDict(data as any, timestamp); + expect(result.size).toBe(1); + expect(result.get("db|1|u|i")?.calls).toBe(50); + }); + + test("skips entries without values", () => { + const data = [ + { + metric: { __name__: "test", datname: "db" }, + values: [], + }, + ]; + + const result = prometheusToPgssDict(data as any, new Date()); + expect(result.size).toBe(0); + }); + + test("finds closest value to timestamp", () => { + const target = new Date("2024-01-01T00:05:00Z"); + const data = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [ + [new Date("2024-01-01T00:00:00Z").getTime() / 1000, "100"], + [new Date("2024-01-01T00:04:00Z").getTime() / 1000, "150"], + [new Date("2024-01-01T00:10:00Z").getTime() / 1000, "200"], + 
], + }, + ]; + + const result = prometheusToPgssDict(data as any, target); + // Should pick the 00:04:00 value as closest to 00:05:00 + expect(result.get("db|1|u|i")?.calls).toBe(150); + }); +}); + +// Test prometheusTableToDict +describe("prometheusTableToDict", () => { + test("returns empty map for empty input", () => { + const result = prometheusTableToDict(new Map(), new Date()); + expect(result.size).toBe(0); + }); + + test("parses table metrics", () => { + const timestamp = new Date("2024-01-01T00:00:00Z"); + const data = new Map([ + [ + "seq_scan", + [ + { + metric: { datname: "db", schemaname: "public", relname: "users" }, + values: [[timestamp.getTime() / 1000, "100"]], + }, + ], + ], + ]); + + const result = prometheusTableToDict(data as any, timestamp); + expect(result.size).toBe(1); + expect(result.get("db|public|users")?.seq_scan).toBe(100); + }); + + test("handles different schema label names", () => { + const timestamp = new Date("2024-01-01T00:00:00Z"); + const data = new Map([ + [ + "seq_scan", + [ + { + metric: { datname: "db", schema: "myschema", table_name: "mytable" }, + values: [[timestamp.getTime() / 1000, "50"]], + }, + ], + ], + ]); + + const result = prometheusTableToDict(data as any, timestamp); + expect(result.has("db|myschema|mytable")).toBe(true); + }); +}); + +// Test processPgssData +describe("processPgssData", () => { + test("returns empty array for empty input", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + const result = processPgssData([], [], start, end); + expect(result).toEqual([]); + }); + + test("calculates differences correctly", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + + const startData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[start.getTime() / 1000, "100"]], + }, + ]; + + const endData = [ 
+ { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[end.getTime() / 1000, "200"]], + }, + ]; + + const result = processPgssData(startData as any, endData as any, start, end); + expect(result.length).toBe(1); + expect(result[0].calls).toBe(100); // 200 - 100 + expect(result[0].queryid).toBe("1"); + }); + + test("handles missing end data (fallback duration)", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + + const startData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[start.getTime() / 1000, "100"]], + }, + ]; + + const result = processPgssData(startData as any, [], start, end); + expect(result.length).toBe(1); + expect(result[0].duration_seconds).toBe(86400); // 1 day + }); + + test("calculates rates per second", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-01T01:00:00Z"); // 1 hour later + + const startData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[start.getTime() / 1000, "0"]], + }, + ]; + + const endData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[end.getTime() / 1000, "3600"]], + }, + ]; + + const result = processPgssData(startData as any, endData as any, start, end); + expect(result[0].calls).toBe(3600); + expect(result[0].calls_per_sec).toBe(1); // 3600 calls / 3600 seconds + }); + + test("sorts by exec_time descending", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + + const startData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_exec_time_total", + datname: "db", + queryid: "1", + 
user: "u", + instance: "i", + }, + values: [[start.getTime() / 1000, "100"]], + }, + { + metric: { + __name__: "pgwatch_pg_stat_statements_exec_time_total", + datname: "db", + queryid: "2", + user: "u", + instance: "i", + }, + values: [[start.getTime() / 1000, "200"]], + }, + ]; + + const endData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_exec_time_total", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[end.getTime() / 1000, "150"]], + }, + { + metric: { + __name__: "pgwatch_pg_stat_statements_exec_time_total", + datname: "db", + queryid: "2", + user: "u", + instance: "i", + }, + values: [[end.getTime() / 1000, "500"]], + }, + ]; + + const result = processPgssData(startData as any, endData as any, start, end); + expect(result.length).toBe(2); + // Query 2 has higher exec_time diff (300 vs 50) + expect(result[0].queryid).toBe("2"); + }); + + test("handles zero duration", () => { + const now = new Date("2024-01-01T00:00:00Z"); + + const startData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[now.getTime() / 1000, "100"]], + }, + ]; + + const endData = [ + { + metric: { + __name__: "pgwatch_pg_stat_statements_calls", + datname: "db", + queryid: "1", + user: "u", + instance: "i", + }, + values: [[now.getTime() / 1000, "200"]], + }, + ]; + + const result = processPgssData(startData as any, endData as any, now, now); + expect(result[0].calls_per_sec).toBe(0); // Zero duration = 0 rate + }); +}); + +// Test processTableStatsWithRates +describe("processTableStatsWithRates", () => { + test("returns empty array for empty input", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + const result = processTableStatsWithRates(new Map(), new Map(), start, end); + expect(result).toEqual([]); + }); + + test("calculates table stats with rates", () => { + const start = new 
Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + + const startData = new Map([ + [ + "seq_scan", + [ + { + metric: { datname: "db", schemaname: "public", relname: "users" }, + values: [[start.getTime() / 1000, "100"]], + }, + ], + ], + ]); + + const endData = new Map([ + [ + "seq_scan", + [ + { + metric: { datname: "db", schemaname: "public", relname: "users" }, + values: [[end.getTime() / 1000, "200"]], + }, + ], + ], + ]); + + const result = processTableStatsWithRates(startData as any, endData as any, start, end); + expect(result.length).toBe(1); + expect(result[0].schema).toBe("public"); + expect(result[0].table_name).toBe("users"); + expect(result[0].seq_scans).toBe(100); + }); + + test("handles zero duration", () => { + const now = new Date("2024-01-01T00:00:00Z"); + + const startData = new Map([ + [ + "seq_scan", + [ + { + metric: { datname: "db", schemaname: "public", relname: "users" }, + values: [[now.getTime() / 1000, "100"]], + }, + ], + ], + ]); + + const endData = new Map([ + [ + "seq_scan", + [ + { + metric: { datname: "db", schemaname: "public", relname: "users" }, + values: [[now.getTime() / 1000, "200"]], + }, + ], + ], + ]); + + const result = processTableStatsWithRates(startData as any, endData as any, now, now); + expect(result[0].seq_scans_per_sec).toBe(0); + }); + + test("sorts by total_size descending", () => { + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + + const startData = new Map([ + [ + "total_size", + [ + { + metric: { datname: "db", schemaname: "public", relname: "small" }, + values: [[start.getTime() / 1000, "1000"]], + }, + { + metric: { datname: "db", schemaname: "public", relname: "large" }, + values: [[start.getTime() / 1000, "10000"]], + }, + ], + ], + ]); + + const endData = new Map([ + [ + "total_size", + [ + { + metric: { datname: "db", schemaname: "public", relname: "small" }, + values: [[end.getTime() / 1000, "1000"]], + }, + { + metric: { 
datname: "db", schemaname: "public", relname: "large" }, + values: [[end.getTime() / 1000, "10000"]], + }, + ], + ], + ]); + + const result = processTableStatsWithRates(startData as any, endData as any, start, end); + expect(result.length).toBe(2); + // Large table should be first + expect(result[0].table_name).toBe("large"); + }); +}); + +// Test MetricsServer endpoints +describe("MetricsServer", () => { + let server: MetricsServer; + let originalFetch: typeof global.fetch; + + beforeEach(() => { + server = new MetricsServer("http://test-prometheus:9090", 9999); + originalFetch = global.fetch; + }); + + afterEach(() => { + global.fetch = originalFetch; + }); + + test("handleHealth returns healthy when prometheus is available", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const response = await server.handleHealth(); + expect(response.status).toBe(200); + + const data = await response.json(); + expect(data.status).toBe("healthy"); + }); + + test("handleHealth returns unhealthy on error", async () => { + global.fetch = mock(() => Promise.reject(new Error("Connection refused"))); + + const response = await server.handleHealth(); + expect(response.status).toBe(500); + + const data = await response.json(); + expect(data.status).toBe("unhealthy"); + }); + + test("handlePgssMetrics requires time parameters", async () => { + const url = new URL("http://localhost/pgss_metrics/csv"); + const response = await server.handlePgssMetrics(url); + expect(response.status).toBe(400); + }); + + test("handlePgssMetrics returns CSV", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const url = new URL("http://localhost/pgss_metrics/csv?time_start=1704067200&time_end=1704153600"); + const response = await server.handlePgssMetrics(url); + 
expect(response.status).toBe(200); + expect(response.headers.get("Content-Type")).toBe("text/csv"); + }); + + test("handleBtreeBloat returns CSV", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const url = new URL("http://localhost/btree_bloat/csv"); + const response = await server.handleBtreeBloat(url); + expect(response.status).toBe(200); + expect(response.headers.get("Content-Type")).toBe("text/csv"); + }); + + test("handleTableInfo instant mode returns CSV", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const url = new URL("http://localhost/table_info/csv"); + const response = await server.handleTableInfo(url); + expect(response.status).toBe(200); + expect(response.headers.get("Content-Disposition")).toContain("table_stats_latest.csv"); + }); + + test("handleTableInfo rate mode returns CSV", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const url = new URL("http://localhost/table_info/csv?time_start=1704067200&time_end=1704153600"); + const response = await server.handleTableInfo(url); + expect(response.status).toBe(200); + expect(response.headers.get("Content-Disposition")).toContain("table_stats_"); + }); + + test("handleMetricsList returns pg_stat_statements metrics", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: ["pgwatch_pg_stat_statements_calls", "other_metric"], + }), + { status: 200 } + ) + ) + ); + + const response = await server.handleMetricsList(); + expect(response.status).toBe(200); + + const data = await response.json(); + expect(data.pg_stat_statements_metrics).toContain("pgwatch_pg_stat_statements_calls"); + 
expect(data.pg_stat_statements_metrics).not.toContain("other_metric"); + }); + + test("handleDebugMetrics returns btree metrics info", async () => { + let callCount = 0; + global.fetch = mock(() => { + callCount++; + if (callCount === 1) { + return Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: ["pgwatch_pg_btree_bloat_real_size_mib"], + }), + { status: 200 } + ) + ); + } + return Promise.resolve( + new Response( + JSON.stringify({ status: "success", data: { result: [] } }), + { status: 200 } + ) + ); + }); + + const response = await server.handleDebugMetrics(); + expect(response.status).toBe(200); + + const data = await response.json(); + expect(data.btree_metrics).toBeDefined(); + }); + + test("handleRequest routes to correct handler", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const healthReq = new Request("http://localhost/health"); + const healthResponse = await server.handleRequest(healthReq); + expect(healthResponse.status).toBe(200); + + const notFoundReq = new Request("http://localhost/unknown"); + const notFoundResponse = await server.handleRequest(notFoundReq); + expect(notFoundResponse.status).toBe(404); + }); +}); + +// Test PrometheusClient +describe("PrometheusClient", () => { + let client: PrometheusClient; + let originalFetch: typeof global.fetch; + + beforeEach(() => { + client = new PrometheusClient("http://test-prometheus:9090"); + originalFetch = global.fetch; + }); + + afterEach(() => { + global.fetch = originalFetch; + }); + + test("query sends correct request", async () => { + let capturedUrl: string | undefined; + global.fetch = mock((url: string) => { + capturedUrl = url; + return Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ); + }); + + await client.query("up"); + expect(capturedUrl).toContain("/api/v1/query"); + 
expect(capturedUrl).toContain("query=up"); + }); + + test("queryRange sends correct request", async () => { + let capturedUrl: string | undefined; + global.fetch = mock((url: string) => { + capturedUrl = url; + return Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ); + }); + + const start = new Date("2024-01-01T00:00:00Z"); + const end = new Date("2024-01-02T00:00:00Z"); + await client.queryRange("up", start, end); + + expect(capturedUrl).toContain("/api/v1/query_range"); + expect(capturedUrl).toContain("query=up"); + expect(capturedUrl).toContain("start="); + expect(capturedUrl).toContain("end="); + }); + + test("allMetrics returns metric names", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: ["metric1", "metric2"] }), { + status: 200, + }) + ) + ); + + const metrics = await client.allMetrics(); + expect(metrics).toContain("metric1"); + expect(metrics).toContain("metric2"); + }); + + test("testConnection returns true on success", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response(JSON.stringify({ status: "success", data: { result: [] } }), { status: 200 }) + ) + ); + + const result = await client.testConnection(); + expect(result).toBe(true); + }); + + test("testConnection returns false on error", async () => { + global.fetch = mock(() => Promise.reject(new Error("Connection refused"))); + + const result = await client.testConnection(); + expect(result).toBe(false); + }); + + test("query throws on HTTP error", async () => { + global.fetch = mock(() => + Promise.resolve(new Response("Internal Server Error", { status: 500 })) + ); + + await expect(client.query("up")).rejects.toThrow(); + }); +}); diff --git a/cli/test/reporter.test.ts b/cli/test/reporter.test.ts new file mode 100644 index 00000000..aa256048 --- /dev/null +++ b/cli/test/reporter.test.ts @@ -0,0 +1,483 @@ +import { describe, test, expect, 
mock, beforeEach, afterEach } from "bun:test"; +import { PostgresReportGenerator, Report } from "../lib/reporter"; + +describe("PostgresReportGenerator", () => { + let generator: PostgresReportGenerator; + let originalFetch: typeof global.fetch; + + beforeEach(() => { + generator = new PostgresReportGenerator( + "http://test-prometheus:9090", + "postgresql://test@localhost:5432/test" + ); + originalFetch = global.fetch; + }); + + afterEach(() => { + global.fetch = originalFetch; + }); + + // Mock successful Prometheus response + function mockPrometheusSuccess(result: unknown[] = []) { + global.fetch = mock(() => + Promise.resolve( + new Response( + JSON.stringify({ status: "success", data: { result } }), + { status: 200 } + ) + ) + ); + } + + // Mock error Prometheus response + function mockPrometheusError() { + global.fetch = mock(() => Promise.reject(new Error("Connection refused"))); + } + + describe("testConnection", () => { + test("returns true when Prometheus is available", async () => { + mockPrometheusSuccess(); + const result = await generator.testConnection(); + expect(result).toBe(true); + }); + + test("returns false when Prometheus is unavailable", async () => { + mockPrometheusError(); + const result = await generator.testConnection(); + expect(result).toBe(false); + }); + }); + + describe("getAllClusters", () => { + test("returns list of clusters", async () => { + mockPrometheusSuccess([ + { metric: { cluster: "cluster1" } }, + { metric: { cluster: "cluster2" } }, + ]); + + const clusters = await generator.getAllClusters(); + expect(clusters).toContain("cluster1"); + expect(clusters).toContain("cluster2"); + }); + + test("returns empty array on error", async () => { + mockPrometheusError(); + const clusters = await generator.getAllClusters(); + expect(clusters).toEqual([]); + }); + }); + + describe("getAllNodes", () => { + test("returns list of nodes for cluster", async () => { + mockPrometheusSuccess([ + { metric: { instance: "node1:5432" } }, + { 
metric: { instance: "node2:5432" } }, + ]); + + const nodes = await generator.getAllNodes("cluster1"); + expect(nodes).toContain("node1:5432"); + expect(nodes).toContain("node2:5432"); + }); + + test("returns empty array on error", async () => { + mockPrometheusError(); + const nodes = await generator.getAllNodes("cluster1"); + expect(nodes).toEqual([]); + }); + }); + + describe("getAllDatabases", () => { + test("returns list of databases excluding system ones", async () => { + mockPrometheusSuccess([ + { metric: { datname: "mydb" } }, + { metric: { datname: "template0" } }, + { metric: { datname: "template1" } }, + ]); + + const databases = await generator.getAllDatabases("cluster1"); + expect(databases).toContain("mydb"); + expect(databases).not.toContain("template0"); + expect(databases).not.toContain("template1"); + }); + + test("returns empty array on error", async () => { + mockPrometheusError(); + const databases = await generator.getAllDatabases("cluster1"); + expect(databases).toEqual([]); + }); + }); + + describe("createBaseReport", () => { + test("creates correct report structure", () => { + const report = generator.createBaseReport("A002", "Postgres major version", "node-01"); + + expect(report.checkId).toBe("A002"); + expect(report.checkTitle).toBe("Postgres major version"); + expect(report.generation_mode).toBe("full"); + expect(report.nodes.primary).toBe("node-01"); + expect(report.nodes.standbys).toEqual([]); + expect(report.results).toEqual({}); + expect(typeof report.timestamptz).toBe("string"); + }); + }); + + describe("generateA002Report", () => { + test("generates PostgreSQL version report", async () => { + let queryCount = 0; + global.fetch = mock(() => { + queryCount++; + if (queryCount === 1) { + // First query for server_version + return Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: { + result: [ + { + metric: { + tag_setting_name: "server_version", + tag_setting_value: "16.3", + }, + }, + ], + }, + }), + 
{ status: 200 } + ) + ); + } + // Second query for server_version_num + return Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: { + result: [ + { + metric: { + tag_setting_name: "server_version_num", + tag_setting_value: "160003", + }, + }, + ], + }, + }), + { status: 200 } + ) + ); + }); + + const report = await generator.generateA002Report("cluster1", "node-01"); + + expect(report.checkId).toBe("A002"); + expect(report.checkTitle).toBe("Postgres major version"); + expect(report.nodes.primary).toBe("node-01"); + expect(report.results["node-01"]).toBeDefined(); + }); + }); + + describe("generateA003Report", () => { + test("generates settings report", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: { + result: [ + { + metric: { + tag_setting_name: "shared_buffers", + tag_setting_value: "128MB", + tag_unit: "", + tag_category: "Resource Usage / Memory", + tag_vartype: "string", + }, + }, + ], + }, + }), + { status: 200 } + ) + ) + ); + + const report = await generator.generateA003Report("cluster1", "node-01"); + + expect(report.checkId).toBe("A003"); + expect(report.checkTitle).toBe("Postgres settings"); + }); + }); + + describe("generateA007Report", () => { + test("generates altered settings report", async () => { + global.fetch = mock(() => + Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: { + result: [ + { + metric: { + tag_setting_name: "max_connections", + tag_setting_value: "200", + tag_unit: "", + tag_category: "Connections", + is_default: "0", + }, + }, + ], + }, + }), + { status: 200 } + ) + ) + ); + + const report = await generator.generateA007Report("cluster1", "node-01"); + + expect(report.checkId).toBe("A007"); + expect(report.checkTitle).toBe("Altered settings"); + }); + }); + + describe("generateD004Report", () => { + test("generates pg_stat_statements settings report", async () => { + mockPrometheusSuccess([ + { 
+ metric: { + tag_setting_name: "pg_stat_statements.max", + tag_setting_value: "10000", + }, + }, + ]); + + const report = await generator.generateD004Report("cluster1", "node-01"); + + expect(report.checkId).toBe("D004"); + expect(report.checkTitle).toBe("pg_stat_statements and pg_stat_kcache settings"); + }); + }); + + describe("generateF001Report", () => { + test("generates autovacuum settings report", async () => { + mockPrometheusSuccess([ + { + metric: { + tag_setting_name: "autovacuum", + tag_setting_value: "on", + }, + }, + ]); + + const report = await generator.generateF001Report("cluster1", "node-01"); + + expect(report.checkId).toBe("F001"); + expect(report.checkTitle).toBe("Autovacuum: current settings"); + }); + }); + + describe("generateG001Report", () => { + test("generates memory settings report", async () => { + mockPrometheusSuccess([ + { + metric: { + tag_setting_name: "shared_buffers", + tag_setting_value: "128MB", + }, + }, + ]); + + const report = await generator.generateG001Report("cluster1", "node-01"); + + expect(report.checkId).toBe("G001"); + expect(report.checkTitle).toBe("Memory-related settings"); + }); + }); + + describe("generateF004Report", () => { + test("generates heap bloat report", async () => { + mockPrometheusSuccess([ + { + metric: { + datname: "mydb", + schemaname: "public", + tablename: "users", + }, + value: [1704067200, "25.5"], + }, + ]); + + const report = await generator.generateF004Report("cluster1", "node-01"); + + expect(report.checkId).toBe("F004"); + expect(report.checkTitle).toBe("Heap bloat"); + }); + + test("excludes system databases from heap bloat", async () => { + mockPrometheusSuccess([ + { + metric: { + datname: "template0", + schemaname: "public", + tablename: "test", + }, + value: [1704067200, "30.0"], + }, + ]); + + const report = await generator.generateF004Report("cluster1", "node-01"); + expect(Object.keys(report.results["node-01"].data)).toHaveLength(0); + }); + }); + + 
describe("generateF005Report", () => { + test("generates btree bloat report", async () => { + mockPrometheusSuccess([ + { + metric: { + datname: "mydb", + schemaname: "public", + tblname: "users", + idxname: "users_pkey", + }, + value: [1704067200, "22.0"], + }, + ]); + + const report = await generator.generateF005Report("cluster1", "node-01"); + + expect(report.checkId).toBe("F005"); + expect(report.checkTitle).toBe("Btree bloat"); + }); + }); + + describe("generateH001Report", () => { + test("generates invalid indexes report", async () => { + mockPrometheusSuccess([ + { + metric: { + datname: "mydb", + schemaname: "public", + tablename: "users", + indexname: "users_invalid_idx", + }, + value: [1704067200, "1"], + }, + ]); + + const report = await generator.generateH001Report("cluster1", "node-01"); + + expect(report.checkId).toBe("H001"); + expect(report.checkTitle).toBe("Invalid indexes"); + }); + }); + + describe("generateH002Report", () => { + test("generates unused indexes report", async () => { + mockPrometheusSuccess([ + { + metric: { + datname: "mydb", + schemaname: "public", + tablename: "users", + indexname: "users_unused_idx", + }, + value: [1704067200, "0"], + }, + ]); + + const report = await generator.generateH002Report("cluster1", "node-01"); + + expect(report.checkId).toBe("H002"); + expect(report.checkTitle).toBe("Unused indexes"); + }); + }); + + describe("generateAllReports", () => { + test("generates all reports for a cluster", async () => { + mockPrometheusSuccess([]); + + const reports = await generator.generateAllReports("cluster1", "node-01"); + + expect("A002" in reports).toBe(true); + expect("A003" in reports).toBe(true); + expect("A007" in reports).toBe(true); + expect("D004" in reports).toBe(true); + expect("F001" in reports).toBe(true); + expect("F004" in reports).toBe(true); + expect("F005" in reports).toBe(true); + expect("G001" in reports).toBe(true); + expect("H001" in reports).toBe(true); + expect("H002" in reports).toBe(true); + 
}); + }); + + describe("getSettings with filter", () => { + test("filters settings by provided list", async () => { + mockPrometheusSuccess([ + { + metric: { + tag_setting_name: "autovacuum", + tag_setting_value: "on", + }, + }, + { + metric: { + tag_setting_name: "shared_buffers", + tag_setting_value: "128MB", + }, + }, + ]); + + const settings = await generator.getSettings("cluster1", "node-01", ["autovacuum"]); + + expect("autovacuum" in settings).toBe(true); + expect("shared_buffers" in settings).toBe(false); + }); + }); + + describe("excluded databases", () => { + test("uses default exclusions", async () => { + mockPrometheusSuccess([ + { metric: { datname: "mydb" } }, + { metric: { datname: "rdsadmin" } }, + ]); + + const databases = await generator.getAllDatabases("cluster1"); + expect(databases).toContain("mydb"); + expect(databases).not.toContain("rdsadmin"); + }); + + test("supports custom exclusions", async () => { + const customGenerator = new PostgresReportGenerator( + "http://test-prometheus:9090", + "postgresql://test@localhost:5432/test", + ["customdb"] + ); + + global.fetch = mock(() => + Promise.resolve( + new Response( + JSON.stringify({ + status: "success", + data: { + result: [ + { metric: { datname: "mydb" } }, + { metric: { datname: "customdb" } }, + ], + }, + }), + { status: 200 } + ) + ) + ); + + const databases = await customGenerator.getAllDatabases("cluster1"); + expect(databases).toContain("mydb"); + expect(databases).not.toContain("customdb"); + }); + }); +}); diff --git a/config/grafana/dashboards/Dashboard_10_Index health.json b/config/grafana/dashboards/Dashboard_10_Index health.json index 8060a883..6d6ff5e2 100644 --- a/config/grafana/dashboards/Dashboard_10_Index health.json +++ b/config/grafana/dashboards/Dashboard_10_Index health.json @@ -166,7 +166,7 @@ "root_selector": "", "source": "url", "type": "csv", - "url": "http://flask-pgss-api:8000/btree_bloat/csv", + "url": "http://metrics-server:8000/btree_bloat/csv", "url_options": 
{ "data": "", "method": "GET", diff --git a/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json b/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json index cb1a041c..14ea5888 100644 --- a/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json +++ b/config/grafana/dashboards/Dashboard_2_Aggregated_query_analysis.json @@ -364,7 +364,7 @@ "root_selector": "", "source": "url", "type": "csv", - "url": "http://flask-pgss-api:8000/pgss_metrics/csv", + "url": "http://metrics-server:8000/pgss_metrics/csv", "url_options": { "data": "", "method": "GET", diff --git a/config/grafana/dashboards/Dashboard_8_Table_Stats.json b/config/grafana/dashboards/Dashboard_8_Table_Stats.json index f82383c4..ff4392d8 100644 --- a/config/grafana/dashboards/Dashboard_8_Table_Stats.json +++ b/config/grafana/dashboards/Dashboard_8_Table_Stats.json @@ -414,7 +414,7 @@ "root_selector": "", "source": "url", "type": "csv", - "url": "http://flask-pgss-api:8000/table_info/csv", + "url": "http://metrics-server:8000/table_info/csv", "url_options": { "data": "", "method": "GET", diff --git a/docker-compose.local.yml b/docker-compose.local.yml index 9582f626..69e3a84e 100644 --- a/docker-compose.local.yml +++ b/docker-compose.local.yml @@ -1,8 +1,9 @@ services: - # Local development override: build Flask backend from source instead of pulling an image. - monitoring_flask_backend: + # Local development override: build metrics-server from source instead of pulling an image. 
+ metrics-server: build: - context: ./monitoring_flask_backend - image: postgresai/monitoring_flask_backend:local + context: ./cli + dockerfile: Dockerfile.metrics-server + image: postgresai/metrics-server:local diff --git a/docker-compose.override.example.yml b/docker-compose.override.example.yml index fd0f1b9a..9fd4aedd 100644 --- a/docker-compose.override.example.yml +++ b/docker-compose.override.example.yml @@ -34,33 +34,23 @@ services: echo \"config-init (local): copied ./config -> postgres_ai_configs volume\"" ] - # Local dev override for the Flask backend: + # Local dev override for the metrics server: # - bind-mount local code for instant iteration # - expose HTTP port to host - # - enable debugpy for debugger attach (opt-in via DEBUGPY_FLASK=1) - monitoring_flask_backend: + metrics-server: ports: - # HTTP (gunicorn) + # HTTP - "127.0.0.1:55000:8000" - # Debugger attach (debugpy) - - "127.0.0.1:5678:5678" volumes: - - ./monitoring_flask_backend:/app + - ./cli/lib:/app/lib environment: - PROMETHEUS_URL=http://sink-prometheus:9090 - # Set to 1 to enable debugpy attach on port 5678 - - DEBUGPY_FLASK=${DEBUGPY_FLASK:-0} - command: - [ - "bash", - "-lc", - "if [ \"${DEBUGPY_FLASK:-0}\" = \"1\" ]; then python -m pip install --no-cache-dir debugpy && exec python -m debugpy --listen 0.0.0.0:5678 -m gunicorn --bind 0.0.0.0:8000 --workers 1 --timeout 120 --reload app:app; else exec gunicorn --bind 0.0.0.0:8000 --workers 1 --timeout 120 --reload app:app; fi" - ] + - PORT=8000 # Alternate mode: run/debug the reporter inside Docker. 
# # NOTE: The primary dev workflow is running reports on the host via: - # ./scripts/run_reporter_local.sh + # cd cli && bun run lib/reporter.ts --prometheus-url http://localhost:59090 # # The host-run workflow is faster and easier to debug, and is enabled by: # - exposing sink-postgres on 127.0.0.1:55433 (see below) @@ -70,21 +60,10 @@ services: # # postgres-reports: # volumes: - # - ./reporter/postgres_reports.py:/app/postgres_reports.py - # - ./reporter/reporter:/app/reporter + # - ./cli/lib:/app/lib # environment: # - PROMETHEUS_URL=http://sink-prometheus:9090 # - REPORTER_INITIAL_DELAY_SECONDS=0 - # # Set to 1 to enable debugpy attach on port 5679 - # - DEBUGPY_REPORTER=${DEBUGPY_REPORTER:-0} - # ports: - # - "127.0.0.1:5679:5679" - # command: - # [ - # "bash", - # "-lc", - # "if [ \"${DEBUGPY_REPORTER:-0}\" = \"1\" ]; then python -m pip install --no-cache-dir debugpy && exec python -m debugpy --listen 0.0.0.0:5679 /app/postgres_reports.py --prometheus-url http://sink-prometheus:9090 --postgres-sink-url postgresql://pgwatch@sink-postgres:5432/measurements --no-upload --output /app/reports/dev_report_%Y%m%d_%H%M%S.json; else exec python /app/postgres_reports.py --prometheus-url http://sink-prometheus:9090 --postgres-sink-url postgresql://pgwatch@sink-postgres:5432/measurements --no-upload --output /app/reports/dev_report_%Y%m%d_%H%M%S.json; fi" - # ] # Expose sink-postgres to the host so you can run/debug reporter locally. # Bound to localhost only (safe for local dev). 
diff --git a/docker-compose.yml b/docker-compose.yml index c2aa44fe..873bb57d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -169,11 +169,11 @@ services: sink-prometheus: condition: service_started restart: unless-stopped - monitoring_flask_backend: - image: ${PGAI_REGISTRY:-postgresai}/monitoring-flask-backend:${PGAI_TAG:?PGAI_TAG is required} - container_name: flask-pgss-api + metrics-server: + image: ${PGAI_REGISTRY:-postgresai}/metrics-server:${PGAI_TAG:?PGAI_TAG is required} + container_name: metrics-server environment: - - FLASK_ENV=production + - PORT=8000 - PROMETHEUS_URL=http://sink-prometheus:9090 depends_on: - sink-prometheus diff --git a/monitoring_flask_backend/Dockerfile b/monitoring_flask_backend/Dockerfile deleted file mode 100644 index 940a96f0..00000000 --- a/monitoring_flask_backend/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -FROM python:3.11-slim - -# Build metadata (required on tag builds) -ARG VERSION -ARG BUILD_TS - -# Validate build args early (fail fast in CI) -RUN test -n "${VERSION}" || (echo "VERSION build arg is required" && exit 1) -RUN test -n "${BUILD_TS}" || (echo "BUILD_TS build arg is required" && exit 1) - -LABEL org.opencontainers.image.title="PostgresAI Monitoring Flask Backend" -LABEL org.opencontainers.image.vendor="PostgresAI" -LABEL org.opencontainers.image.source="https://gitlab.com/postgres-ai/postgres_ai" -LABEL org.opencontainers.image.version="${VERSION}" -LABEL org.opencontainers.image.created="${BUILD_TS}" - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - gcc \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements and install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY app.py . 
- -# Embed build metadata into the image filesystem -RUN printf '%s' "${VERSION}" > /VERSION \ - && printf '%s' "${BUILD_TS}" > /BUILD_TS - -# Expose port -EXPOSE 8000 - -# Set environment variables -ENV FLASK_APP=app.py -ENV FLASK_ENV=production - -# Run the application -CMD ["gunicorn", "--bind", "0.0.0.0:8000", "--workers", "4", "--timeout", "120", "app:app"] \ No newline at end of file diff --git a/monitoring_flask_backend/app.py b/monitoring_flask_backend/app.py deleted file mode 100644 index ba45d9b0..00000000 --- a/monitoring_flask_backend/app.py +++ /dev/null @@ -1,834 +0,0 @@ -from flask import Flask, request, jsonify, make_response -from prometheus_api_client import PrometheusConnect -import csv -import io -from datetime import datetime, timezone, timedelta -import logging -import os - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -app = Flask(__name__) - -# Prometheus connection - use environment variable with fallback -PROMETHEUS_URL = os.environ.get('PROMETHEUS_URL', 'http://localhost:8428') - -# Metric name mapping for cleaner CSV output -METRIC_NAME_MAPPING = { - 'calls': 'calls', - 'exec_time_total': 'exec_time', - 'plan_time_total': 'plan_time', - 'rows': 'rows', - 'shared_bytes_hit_total': 'shared_blks_hit', - 'shared_bytes_read_total': 'shared_blks_read', - 'shared_bytes_dirtied_total': 'shared_blks_dirtied', - 'shared_bytes_written_total': 'shared_blks_written', - 'block_read_total': 'blk_read_time', - 'block_write_total': 'blk_write_time' -} - -def get_prometheus_client(): - """Get Prometheus client connection""" - try: - return PrometheusConnect(url=PROMETHEUS_URL, disable_ssl=True) - except Exception as e: - logger.error(f"Failed to connect to Prometheus: {e}") - raise - -@app.route('/health', methods=['GET']) -def health(): - """Health check endpoint""" - try: - prom = get_prometheus_client() - # Simple query to test connection - prom.get_current_metric_value(metric_name='up') - return 
jsonify({"status": "healthy", "prometheus_url": PROMETHEUS_URL}) - except Exception as e: - return jsonify({"status": "unhealthy", "error": str(e)}), 500 - -@app.route('/pgss_metrics/csv', methods=['GET']) -def get_pgss_metrics_csv(): - """ - Get pg_stat_statements metrics as CSV with time-based difference calculation - - Query parameters: - - time_start: Start time (ISO format or Unix timestamp) - - time_end: End time (ISO format or Unix timestamp) - - cluster_name: Cluster name filter (optional) - - node_name: Node name filter (optional) - - db_name: Database name filter (optional) - """ - try: - # Get query parameters - time_start = request.args.get('time_start') - time_end = request.args.get('time_end') - cluster_name = request.args.get('cluster_name') - node_name = request.args.get('node_name') - db_name = request.args.get('db_name') - - if not time_start or not time_end: - return jsonify({"error": "time_start and time_end parameters are required"}), 400 - - # Parse time parameters - try: - # Try parsing as Unix timestamp first - start_dt = datetime.fromtimestamp(float(time_start), tz=timezone.utc) - except ValueError: - # Try parsing as ISO format - start_dt = datetime.fromisoformat(time_start.replace('Z', '+00:00')) - - try: - end_dt = datetime.fromtimestamp(float(time_end), tz=timezone.utc) - except ValueError: - end_dt = datetime.fromisoformat(time_end.replace('Z', '+00:00')) - - # Connect to Prometheus - prom = get_prometheus_client() - - # Build the base query for pg_stat_statements metrics - base_query = 'pgwatch_pg_stat_statements_calls' - - # Add filters if provided - filters = [] - if cluster_name: - filters.append(f'cluster="{cluster_name}"') - if node_name: - filters.append(f'instance=~".*{node_name}.*"') - if db_name: - filters.append(f'datname="{db_name}"') - - if filters: - base_query += '{' + ','.join(filters) + '}' - - logger.info(f"Querying Prometheus with base query: {base_query}") - - # Get all pg_stat_statements metrics - all_metrics = [ - 
'pgwatch_pg_stat_statements_calls', - 'pgwatch_pg_stat_statements_plans_total', - 'pgwatch_pg_stat_statements_exec_time_total', - 'pgwatch_pg_stat_statements_plan_time_total', - 'pgwatch_pg_stat_statements_rows', - 'pgwatch_pg_stat_statements_shared_bytes_hit_total', - 'pgwatch_pg_stat_statements_shared_bytes_read_total', - 'pgwatch_pg_stat_statements_shared_bytes_dirtied_total', - 'pgwatch_pg_stat_statements_shared_bytes_written_total', - 'pgwatch_pg_stat_statements_block_read_total', - 'pgwatch_pg_stat_statements_block_write_total', - 'pgwatch_pg_stat_statements_wal_records', - 'pgwatch_pg_stat_statements_wal_fpi', - 'pgwatch_pg_stat_statements_wal_bytes', - 'pgwatch_pg_stat_statements_temp_bytes_read', - 'pgwatch_pg_stat_statements_temp_bytes_written' - ] - - # Apply filters to each metric - filtered_metrics = [] - for metric in all_metrics: - if filters: - filtered_metrics.append(f'{metric}{{{",".join(filters)}}}') - else: - filtered_metrics.append(metric) - - # Get metrics at start and end times using instant queries - start_data = [] - end_data = [] - - for metric in filtered_metrics: - try: - start_metric_data = prom.get_metric_range_data( - metric_name=metric, - start_time=start_dt - timedelta(minutes=1), - end_time=start_dt + timedelta(minutes=1) - ) - if start_metric_data: - start_data.extend(start_metric_data) - - end_metric_data = prom.get_metric_range_data( - metric_name=metric, - start_time=end_dt - timedelta(minutes=1), - end_time=end_dt + timedelta(minutes=1) - ) - if end_metric_data: - end_data.extend(end_metric_data) - except Exception as e: - logger.warning(f"Failed to query metric {metric}: {e}") - continue - - # Process the data to calculate differences - csv_data = process_pgss_data(start_data, end_data, start_dt, end_dt) - - # Create CSV response - output = io.StringIO() - if csv_data: - # Define explicit field order with queryid first, then duration, then metrics with their rates - base_fields = ['queryid', 'duration_seconds'] - 
all_metric_fields = [] - - # Get metric fields from the mapping in specific order with their rates - desired_order = [ - 'calls', 'exec_time', 'plan_time', 'rows', 'shared_blks_hit', - 'shared_blks_read', 'shared_blks_dirtied', 'shared_blks_written', - 'blk_read_time', 'blk_write_time' - ] - - for display_name in desired_order: - if display_name in METRIC_NAME_MAPPING.values(): - all_metric_fields.append(display_name) - all_metric_fields.append(f'{display_name}_per_sec') - all_metric_fields.append(f'{display_name}_per_call') - - # Combine all fields in desired order - all_fields = base_fields + all_metric_fields - - writer = csv.DictWriter(output, fieldnames=all_fields) - writer.writeheader() - writer.writerows(csv_data) - - csv_content = output.getvalue() - output.close() - - # Create response - response = make_response(csv_content) - response.headers['Content-Type'] = 'text/csv' - response.headers['Content-Disposition'] = f'attachment; filename=pgss_metrics_{start_dt.strftime("%Y%m%d_%H%M%S")}_{end_dt.strftime("%Y%m%d_%H%M%S")}.csv' - - return response - - except Exception as e: - logger.error(f"Error processing request: {e}") - return jsonify({"error": str(e)}), 500 - -def process_pgss_data(start_data, end_data, start_time, end_time): - """ - Process pg_stat_statements data and calculate differences between start and end times - """ - # Convert Prometheus data to dictionaries - start_metrics = prometheus_to_dict(start_data, start_time) - end_metrics = prometheus_to_dict(end_data, end_time) - - if not start_metrics and not end_metrics: - return [] - - # Create a combined dictionary with all unique query identifiers - all_keys = set() - all_keys.update(start_metrics.keys()) - all_keys.update(end_metrics.keys()) - - result_rows = [] - - # Calculate differences for each query - for key in all_keys: - start_metric = start_metrics.get(key, {}) - end_metric = end_metrics.get(key, {}) - - # Extract identifier components from key - db_name, query_id, user, instance = key 
- - # Calculate actual duration from metric timestamps - start_timestamp = start_metric.get('timestamp') - end_timestamp = end_metric.get('timestamp') - - if start_timestamp and end_timestamp: - start_dt = datetime.fromisoformat(start_timestamp) - end_dt = datetime.fromisoformat(end_timestamp) - actual_duration = (end_dt - start_dt).total_seconds() - else: - # Fallback to query parameter duration if timestamps are missing - actual_duration = (end_time - start_time).total_seconds() - - # Create result row - row = { - 'queryid': query_id, - 'duration_seconds': actual_duration - } - - # Numeric columns to calculate differences for (using original metric names) - numeric_cols = list(METRIC_NAME_MAPPING.keys()) - - # Calculate differences and rates - for col in numeric_cols: - start_val = start_metric.get(col, 0) - end_val = end_metric.get(col, 0) - diff = end_val - start_val - - # Use simplified display name for CSV columns - display_name = METRIC_NAME_MAPPING[col] - - # Convert bytes to blocks for block-related metrics (PostgreSQL uses 8KB blocks) - if 'blks' in display_name and 'bytes' in col: - diff = diff / 8192 # Convert bytes to 8KB blocks - - row[display_name] = diff - - # Calculate rates per second - if row['duration_seconds'] > 0: - row[f'{display_name}_per_sec'] = diff / row['duration_seconds'] - else: - row[f'{display_name}_per_sec'] = 0 - - # Calculate per-call averages - calls_diff = row.get('calls', 0) - if calls_diff > 0: - row[f'{display_name}_per_call'] = diff / calls_diff - else: - row[f'{display_name}_per_call'] = 0 - - result_rows.append(row) - - # Sort by total execution time difference (descending) - result_rows.sort(key=lambda x: x.get('exec_time', 0), reverse=True) - - return result_rows - -def prometheus_to_dict(prom_data, timestamp): - """ - Convert Prometheus API response to dictionary keyed by query identifiers - """ - if not prom_data: - return {} - - metrics_dict = {} - - for metric_data in prom_data: - metric = metric_data.get('metric', 
{}) - values = metric_data.get('values', []) - - if not values: - continue - - # Get the closest value to our timestamp - closest_value = min(values, key=lambda x: abs(float(x[0]) - timestamp.timestamp())) - - # Create unique key for this query - key = ( - metric.get('datname', ''), - metric.get('queryid', ''), - metric.get('user', ''), - metric.get('instance', '') - ) - - # Initialize metric dict if not exists - if key not in metrics_dict: - metrics_dict[key] = { - 'timestamp': datetime.fromtimestamp(float(closest_value[0]), tz=timezone.utc).isoformat(), - } - - # Add metric value - metric_name = metric.get('__name__', 'pgwatch_pg_stat_statements_calls') - clean_name = metric_name.replace('pgwatch_pg_stat_statements_', '') - - try: - metrics_dict[key][clean_name] = float(closest_value[1]) - except (ValueError, IndexError): - metrics_dict[key][clean_name] = 0 - - return metrics_dict - -@app.route('/metrics', methods=['GET']) -def list_metrics(): - """List available metrics in Prometheus""" - try: - prom = get_prometheus_client() - metrics = prom.all_metrics() - pgss_metrics = [m for m in metrics if 'pg_stat_statements' in m] - return jsonify({"pg_stat_statements_metrics": pgss_metrics}) - except Exception as e: - return jsonify({"error": str(e)}), 500 - -@app.route('/debug/metrics', methods=['GET']) -def debug_metrics(): - """ - Debug endpoint to check what metrics are actually available in Prometheus - """ - try: - prom = get_prometheus_client() - - # Get all available metrics - all_metrics = prom.all_metrics() - - # Filter for pg_btree_bloat metrics - btree_metrics = [m for m in all_metrics if 'btree_bloat' in m] - - # Get sample data for each btree metric - sample_data = {} - for metric in btree_metrics[:5]: # Limit to first 5 to avoid overwhelming - try: - result = prom.get_current_metric_value(metric_name=metric) - sample_data[metric] = { - 'count': len(result), - 'sample_labels': [entry.get('metric', {}) for entry in result[:2]] # First 2 entries - } - except 
Exception as e: - sample_data[metric] = {'error': str(e)} - - return jsonify({ - 'all_metrics_count': len(all_metrics), - 'btree_metrics': btree_metrics, - 'sample_data': sample_data - }) - except Exception as e: - return jsonify({"error": str(e)}), 500 - - -@app.route('/btree_bloat/csv', methods=['GET']) -def get_btree_bloat_csv(): - """ - Get the most recent pg_btree_bloat metrics as a CSV table. - """ - try: - # Get query parameters - cluster_name = request.args.get('cluster_name') - node_name = request.args.get('node_name') - db_name = request.args.get('db_name') - schemaname = request.args.get('schemaname') - tblname = request.args.get('tblname') - idxname = request.args.get('idxname') - - # Build label filters - filters = [] - if cluster_name: - filters.append(f'cluster="{cluster_name}"') - if node_name: - filters.append(f'node_name="{node_name}"') - if schemaname: - filters.append(f'schemaname="{schemaname}"') - if tblname: - filters.append(f'tblname="{tblname}"') - if idxname: - filters.append(f'idxname="{idxname}"') - if db_name: - filters.append(f'datname="{db_name}"') - - filter_str = '{' + ','.join(filters) + '}' if filters else '' - - # Metrics to fetch with last_over_time to get only the most recent value - metric_queries = [ - f'last_over_time(pgwatch_pg_btree_bloat_real_size_mib{filter_str}[1d])', - f'last_over_time(pgwatch_pg_btree_bloat_extra_size{filter_str}[1d])', - f'last_over_time(pgwatch_pg_btree_bloat_extra_pct{filter_str}[1d])', - f'last_over_time(pgwatch_pg_btree_bloat_fillfactor{filter_str}[1d])', - f'last_over_time(pgwatch_pg_btree_bloat_bloat_size{filter_str}[1d])', - f'last_over_time(pgwatch_pg_btree_bloat_bloat_pct{filter_str}[1d])', - f'last_over_time(pgwatch_pg_btree_bloat_is_na{filter_str}[1d])', - ] - - prom = get_prometheus_client() - metric_results = {} - - for query in metric_queries: - try: - # Use custom_query instead of get_current_metric_value - result = prom.custom_query(query=query) - - for entry in result: - 
metric_labels = entry.get('metric', {}) - key = ( - metric_labels.get('datname', ''), - metric_labels.get('schemaname', ''), - metric_labels.get('tblname', ''), - metric_labels.get('idxname', '') - ) - - if key not in metric_results: - metric_results[key] = { - 'database': metric_labels.get('datname', ''), - 'schemaname': metric_labels.get('schemaname', ''), - 'tblname': metric_labels.get('tblname', ''), - 'idxname': metric_labels.get('idxname', ''), - } - - # Extract metric type from query and store value - if 'real_size_mib' in query: - metric_results[key]['real_size_mib'] = float(entry['value'][1]) - elif 'extra_size' in query and 'extra_pct' not in query: - metric_results[key]['extra_size'] = float(entry['value'][1]) - elif 'extra_pct' in query: - metric_results[key]['extra_pct'] = float(entry['value'][1]) - elif 'fillfactor' in query: - metric_results[key]['fillfactor'] = float(entry['value'][1]) - elif 'bloat_size' in query: - metric_results[key]['bloat_size'] = float(entry['value'][1]) - elif 'bloat_pct' in query: - metric_results[key]['bloat_pct'] = float(entry['value'][1]) - elif 'is_na' in query: - metric_results[key]['is_na'] = int(float(entry['value'][1])) - - except Exception as e: - logger.warning(f"Failed to query: {query}, error: {e}") - continue - - # Prepare CSV output - output = io.StringIO() - fieldnames = [ - 'database', 'schemaname', 'tblname', 'idxname', - 'real_size_mib', 'extra_size', 'extra_pct', 'fillfactor', - 'bloat_size', 'bloat_pct', 'is_na' - ] - writer = csv.DictWriter(output, fieldnames=fieldnames) - writer.writeheader() - for row in metric_results.values(): - writer.writerow(row) - - csv_content = output.getvalue() - output.close() - - # Create response - response = make_response(csv_content) - response.headers['Content-Type'] = 'text/csv' - response.headers['Content-Disposition'] = 'attachment; filename=btree_bloat_latest.csv' - return response - - except Exception as e: - logger.error(f"Error processing btree bloat request: 
{e}") - return jsonify({"error": str(e)}), 500 - -@app.route('/table_info/csv', methods=['GET']) -def get_table_info_csv(): - """ - Get comprehensive table information including size metrics, tuple statistics, and I/O statistics as a CSV table. - Supports both instant queries (without time parameters) and rate calculations over a time period. - - Query parameters: - - time_start: Start time (ISO format or Unix timestamp) - optional - - time_end: End time (ISO format or Unix timestamp) - optional - - cluster_name: Cluster name filter (optional) - - node_name: Node name filter (optional) - - db_name: Database name filter (optional) - - schemaname: Schema name filter (optional, supports regex with ~) - - tblname: Table name filter (optional) - """ - try: - # Get query parameters - time_start = request.args.get('time_start') - time_end = request.args.get('time_end') - cluster_name = request.args.get('cluster_name') - node_name = request.args.get('node_name') - db_name = request.args.get('db_name') - schemaname = request.args.get('schemaname') - tblname = request.args.get('tblname') - - # Determine if we should calculate rates - calculate_rates = bool(time_start and time_end) - - if calculate_rates: - # Parse time parameters - try: - start_dt = datetime.fromtimestamp(float(time_start), tz=timezone.utc) - except ValueError: - start_dt = datetime.fromisoformat(time_start.replace('Z', '+00:00')) - - try: - end_dt = datetime.fromtimestamp(float(time_end), tz=timezone.utc) - except ValueError: - end_dt = datetime.fromisoformat(time_end.replace('Z', '+00:00')) - - # Build label filters - filters = [] - if cluster_name: - filters.append(f'cluster="{cluster_name}"') - if node_name: - filters.append(f'node_name="{node_name}"') - if schemaname: - # Support regex pattern matching with =~ - filters.append(f'schemaname=~"{schemaname}"') - if tblname: - filters.append(f'tblname="{tblname}"') - if db_name: - filters.append(f'datname="{db_name}"') - - filter_str = '{' + 
','.join(filters) + '}' if filters else '' - - prom = get_prometheus_client() - - # Define base metrics to query (without last_over_time wrapper for rate calculation) - base_metrics = { - # Size metrics - 'total_size': f'pgwatch_pg_class_total_relation_size_bytes{filter_str}', - 'table_size': f'pgwatch_table_size_detailed_table_main_size_b{filter_str}', - 'index_size': f'pgwatch_table_size_detailed_table_indexes_size_b{filter_str}', - 'toast_size': f'pgwatch_table_size_detailed_total_toast_size_b{filter_str}', - # Scan statistics - 'seq_scan': f'pgwatch_pg_stat_all_tables_seq_scan{filter_str}', - 'idx_scan': f'pgwatch_pg_stat_all_tables_idx_scan{filter_str}', - # Tuple statistics - 'n_tup_ins': f'pgwatch_table_stats_n_tup_ins{filter_str}', - 'n_tup_upd': f'pgwatch_table_stats_n_tup_upd{filter_str}', - 'n_tup_del': f'pgwatch_table_stats_n_tup_del{filter_str}', - 'n_tup_hot_upd': f'pgwatch_table_stats_n_tup_hot_upd{filter_str}', - # I/O statistics - 'heap_blks_read': f'pgwatch_pg_statio_all_tables_heap_blks_read{filter_str}', - 'heap_blks_hit': f'pgwatch_pg_statio_all_tables_heap_blks_hit{filter_str}', - 'idx_blks_read': f'pgwatch_pg_statio_all_tables_idx_blks_read{filter_str}', - 'idx_blks_hit': f'pgwatch_pg_statio_all_tables_idx_blks_hit{filter_str}', - } - - if calculate_rates: - # Get metrics at start and end times - start_data = {} - end_data = {} - - for metric_name, metric_query in base_metrics.items(): - try: - # Get data at start time - start_result = prom.get_metric_range_data( - metric_name=metric_query, - start_time=start_dt - timedelta(minutes=1), - end_time=start_dt + timedelta(minutes=1) - ) - if start_result: - start_data[metric_name] = start_result - - # Get data at end time - end_result = prom.get_metric_range_data( - metric_name=metric_query, - start_time=end_dt - timedelta(minutes=1), - end_time=end_dt + timedelta(minutes=1) - ) - if end_result: - end_data[metric_name] = end_result - except Exception as e: - logger.warning(f"Failed to query metric 
{metric_name}: {e}") - continue - - # Process the data to calculate rates - metric_results = process_table_stats_with_rates(start_data, end_data, start_dt, end_dt) - else: - # Get instant values using last_over_time - metric_results = {} - for metric_name, metric_query in base_metrics.items(): - try: - result = prom.custom_query(query=f'last_over_time({metric_query}[1d])') - for entry in result: - metric_labels = entry.get('metric', {}) - - # Use different key depending on label names - schema_label = metric_labels.get('schemaname') or metric_labels.get('schema', '') - table_label = metric_labels.get('relname') or metric_labels.get('table_name') or metric_labels.get('tblname', '') - - key = ( - metric_labels.get('datname', ''), - schema_label, - table_label, - ) - - if key not in metric_results: - metric_results[key] = { - 'database': metric_labels.get('datname', ''), - 'schema': schema_label, - 'table_name': table_label, - } - - value = float(entry['value'][1]) - metric_results[key][metric_name] = value - except Exception as e: - logger.warning(f"Failed to query metric {metric_name}: {e}") - continue - - # Prepare CSV output - output = io.StringIO() - - if calculate_rates: - # Fields with rate calculations - fieldnames = [ - 'schema', 'table_name', - # Size metrics (bytes) - 'total_size', 'table_size', 'index_size', 'toast_size', - # Scan statistics with rates - 'seq_scans', 'seq_scans_per_sec', - 'idx_scans', 'idx_scans_per_sec', - # Tuple statistics with rates - 'inserts', 'inserts_per_sec', - 'updates', 'updates_per_sec', - 'deletes', 'deletes_per_sec', - 'hot_updates', 'hot_updates_per_sec', - # I/O statistics with rates (in bytes using block_size) - 'heap_blks_read', 'heap_blks_read_per_sec', - 'heap_blks_hit', 'heap_blks_hit_per_sec', - 'idx_blks_read', 'idx_blks_read_per_sec', - 'idx_blks_hit', 'idx_blks_hit_per_sec', - 'duration_seconds' - ] - else: - # Fields without rate calculations - fieldnames = [ - 'schema', 'table_name', - 'total_size', 
'table_size', 'index_size', 'toast_size', - 'seq_scan', 'idx_scan', - 'n_tup_ins', 'n_tup_upd', 'n_tup_del', 'n_tup_hot_upd', - 'heap_blks_read', 'heap_blks_hit', - 'idx_blks_read', 'idx_blks_hit' - ] - - # Remove 'database' field from rows if present (not in fieldnames) - for row in metric_results.values(): - row.pop('database', None) - - writer = csv.DictWriter(output, fieldnames=fieldnames) - writer.writeheader() - - # Write rows (handle both dict and list) - if isinstance(metric_results, dict): - rows = metric_results.values() - else: - rows = metric_results - - for row in rows: - writer.writerow(row) - - csv_content = output.getvalue() - output.close() - - # Create response - response = make_response(csv_content) - response.headers['Content-Type'] = 'text/csv' - - if calculate_rates: - filename = f'table_stats_{start_dt.strftime("%Y%m%d_%H%M%S")}_{end_dt.strftime("%Y%m%d_%H%M%S")}.csv' - else: - filename = 'table_stats_latest.csv' - - response.headers['Content-Disposition'] = f'attachment; filename={filename}' - return response - - except Exception as e: - logger.error(f"Error processing table stats request: {e}") - return jsonify({"error": str(e)}), 500 - -def process_table_stats_with_rates(start_data, end_data, start_time, end_time): - """ - Process table statistics and calculate rates between start and end times - """ - # Convert data to dictionaries - start_metrics = prometheus_table_to_dict(start_data, start_time) - end_metrics = prometheus_table_to_dict(end_data, end_time) - - if not start_metrics and not end_metrics: - return [] - - # Get all unique table identifiers - all_keys = set() - all_keys.update(start_metrics.keys()) - all_keys.update(end_metrics.keys()) - - result_rows = [] - - for key in all_keys: - start_metric = start_metrics.get(key, {}) - end_metric = end_metrics.get(key, {}) - - # Extract identifier components from key - db_name, schema_name, table_name = key - - # Calculate actual duration - start_timestamp = 
start_metric.get('timestamp') - end_timestamp = end_metric.get('timestamp') - - if start_timestamp and end_timestamp: - start_dt = datetime.fromisoformat(start_timestamp) - end_dt = datetime.fromisoformat(end_timestamp) - actual_duration = (end_dt - start_dt).total_seconds() - else: - actual_duration = (end_time - start_time).total_seconds() - - # Create result row - row = { - 'schema': schema_name, - 'table_name': table_name, - 'duration_seconds': actual_duration - } - - # Counter metrics to calculate differences and rates - counter_metrics = [ - 'seq_scan', 'idx_scan', 'n_tup_ins', 'n_tup_upd', - 'n_tup_del', 'n_tup_hot_upd', 'heap_blks_read', 'heap_blks_hit', - 'idx_blks_read', 'idx_blks_hit' - ] - - # Mapping for display names - display_names = { - 'seq_scan': 'seq_scans', - 'idx_scan': 'idx_scans', - 'n_tup_ins': 'inserts', - 'n_tup_upd': 'updates', - 'n_tup_del': 'deletes', - 'n_tup_hot_upd': 'hot_updates', - } - - # Calculate differences and rates - for metric in counter_metrics: - start_val = start_metric.get(metric, 0) - end_val = end_metric.get(metric, 0) - diff = end_val - start_val - - # Use display name if available - display_name = display_names.get(metric, metric) - - row[display_name] = diff - - # Calculate rate per second - if actual_duration > 0: - row[f'{display_name}_per_sec'] = diff / actual_duration - else: - row[f'{display_name}_per_sec'] = 0 - - # Size metrics (just use end values, these don't need rates) - for size_metric in ['total_size', 'table_size', 'index_size', 'toast_size']: - row[size_metric] = end_metric.get(size_metric, 0) - - result_rows.append(row) - - # Sort by total size descending - result_rows.sort(key=lambda x: x.get('total_size', 0), reverse=True) - - return result_rows - -def prometheus_table_to_dict(prom_data, timestamp): - """ - Convert Prometheus table metrics to dictionary keyed by table identifiers - """ - if not prom_data: - return {} - - metrics_dict = {} - - for metric_name, metric_results in prom_data.items(): - 
for metric_data in metric_results: - metric = metric_data.get('metric', {}) - values = metric_data.get('values', []) - - if not values: - continue - - # Get the closest value to our timestamp - closest_value = min(values, key=lambda x: abs(float(x[0]) - timestamp.timestamp())) - - # Handle different label names - schema_label = metric.get('schemaname') or metric.get('schema', '') - table_label = metric.get('relname') or metric.get('table_name') or metric.get('tblname', '') - - # Create unique key for this table - key = ( - metric.get('datname', ''), - schema_label, - table_label, - ) - - # Initialize metric dict if not exists - if key not in metrics_dict: - metrics_dict[key] = { - 'timestamp': datetime.fromtimestamp(float(closest_value[0]), tz=timezone.utc).isoformat(), - } - - # Add metric value - try: - metrics_dict[key][metric_name] = float(closest_value[1]) - except (ValueError, IndexError): - metrics_dict[key][metric_name] = 0 - - return metrics_dict - -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5000, debug=True) \ No newline at end of file diff --git a/monitoring_flask_backend/requirements.txt b/monitoring_flask_backend/requirements.txt deleted file mode 100644 index d5909183..00000000 --- a/monitoring_flask_backend/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -Flask==3.0.0 -prometheus-api-client==0.5.4 -python-dateutil==2.8.2 -gunicorn==21.2.0 -requests==2.31.0 \ No newline at end of file diff --git a/postgres_ai_helm/templates/grafana-datasources.yaml b/postgres_ai_helm/templates/grafana-datasources.yaml index 26f52720..c67b06bd 100644 --- a/postgres_ai_helm/templates/grafana-datasources.yaml +++ b/postgres_ai_helm/templates/grafana-datasources.yaml @@ -32,12 +32,12 @@ data: postgresVersion: 1500 secureJsonData: password: {{ .Values.secrets.postgres.password }} - {{- if .Values.flask.enabled }} - - name: Flask API + {{- if .Values.metricsServer.enabled }} + - name: Metrics Server API type: yesoreyeram-infinity-datasource access: proxy uid: 
aerffb0z8rjlsc - url: http://{{ include "postgres-ai-monitoring.fullname" . }}-flask:{{ .Values.flask.service.port }} + url: http://{{ include "postgres-ai-monitoring.fullname" . }}-metrics-server:{{ .Values.metricsServer.service.port }} isDefault: false editable: true jsonData: diff --git a/postgres_ai_helm/templates/flask-deployment.yaml b/postgres_ai_helm/templates/metrics-server-deployment.yaml similarity index 63% rename from postgres_ai_helm/templates/flask-deployment.yaml rename to postgres_ai_helm/templates/metrics-server-deployment.yaml index 30f2f892..985b27d5 100644 --- a/postgres_ai_helm/templates/flask-deployment.yaml +++ b/postgres_ai_helm/templates/metrics-server-deployment.yaml @@ -1,23 +1,23 @@ -{{- if .Values.flask.enabled }} +{{- if .Values.metricsServer.enabled }} apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "postgres-ai-monitoring.fullname" . }}-flask + name: {{ include "postgres-ai-monitoring.fullname" . }}-metrics-server namespace: {{ include "postgres-ai-monitoring.namespace" . }} labels: {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} - app.kubernetes.io/component: flask-api + app.kubernetes.io/component: metrics-server spec: - replicas: {{ .Values.flask.replicas | default 2 }} + replicas: {{ .Values.metricsServer.replicas | default 2 }} selector: matchLabels: {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 6 }} - app.kubernetes.io/component: flask-api + app.kubernetes.io/component: metrics-server template: metadata: labels: {{- include "postgres-ai-monitoring.selectorLabels" . | nindent 8 }} - app.kubernetes.io/component: flask-api + app.kubernetes.io/component: metrics-server spec: {{- with .Values.imagePullSecrets }} imagePullSecrets: @@ -28,38 +28,37 @@ spec: image: busybox:1.36 command: ['sh', '-c', 'until nc -z {{ include "postgres-ai-monitoring.fullname" . 
}}-victoriametrics {{ .Values.victoriaMetrics.service.port }}; do echo waiting for victoriametrics; sleep 2; done'] containers: - - name: flask - image: {{ .Values.flask.image }} - imagePullPolicy: {{ .Values.flask.imagePullPolicy | default "IfNotPresent" }} + - name: metrics-server + image: {{ .Values.metricsServer.image }} + imagePullPolicy: {{ .Values.metricsServer.imagePullPolicy | default "IfNotPresent" }} env: - - name: FLASK_ENV - value: "production" + - name: PORT + value: "8000" - name: PROMETHEUS_URL value: "http://{{ include "postgres-ai-monitoring.fullname" . }}-victoriametrics:{{ .Values.victoriaMetrics.service.port }}" - {{- range $key, $value := .Values.flask.env }} + {{- range $key, $value := .Values.metricsServer.env }} - name: {{ $key }} value: {{ $value | quote }} {{- end }} ports: - name: http - containerPort: {{ .Values.flask.containerPort | default 8000 }} + containerPort: {{ .Values.metricsServer.containerPort | default 8000 }} protocol: TCP - {{- with .Values.flask.resources }} + {{- with .Values.metricsServer.resources }} resources: {{- toYaml . 
| nindent 12 }} {{- end }} livenessProbe: httpGet: - path: {{ .Values.flask.healthPath | default "/health" }} + path: {{ .Values.metricsServer.healthPath | default "/health" }} port: http initialDelaySeconds: 30 periodSeconds: 10 readinessProbe: httpGet: - path: {{ .Values.flask.healthPath | default "/health" }} + path: {{ .Values.metricsServer.healthPath | default "/health" }} port: http initialDelaySeconds: 5 periodSeconds: 5 {{- end }} - diff --git a/postgres_ai_helm/templates/flask-service.yaml b/postgres_ai_helm/templates/metrics-server-service.yaml similarity index 50% rename from postgres_ai_helm/templates/flask-service.yaml rename to postgres_ai_helm/templates/metrics-server-service.yaml index 225b6b51..7f1cb4be 100644 --- a/postgres_ai_helm/templates/flask-service.yaml +++ b/postgres_ai_helm/templates/metrics-server-service.yaml @@ -1,22 +1,21 @@ -{{- if .Values.flask.enabled }} +{{- if .Values.metricsServer.enabled }} apiVersion: v1 kind: Service metadata: - name: {{ include "postgres-ai-monitoring.fullname" . }}-flask + name: {{ include "postgres-ai-monitoring.fullname" . }}-metrics-server namespace: {{ include "postgres-ai-monitoring.namespace" . }} labels: {{- include "postgres-ai-monitoring.labels" . | nindent 4 }} - app.kubernetes.io/component: flask-api + app.kubernetes.io/component: metrics-server spec: - type: {{ .Values.flask.service.type | default "ClusterIP" }} + type: {{ .Values.metricsServer.service.type | default "ClusterIP" }} ports: - - port: {{ .Values.flask.service.port | default 8000 }} + - port: {{ .Values.metricsServer.service.port | default 8000 }} targetPort: http protocol: TCP name: http selector: {{- include "postgres-ai-monitoring.selectorLabels" . 
| nindent 4 }} - app.kubernetes.io/component: flask-api + app.kubernetes.io/component: metrics-server {{- end }} - diff --git a/postgres_ai_helm/values.yaml b/postgres_ai_helm/values.yaml index 902c4fb3..fcff8f3f 100644 --- a/postgres_ai_helm/values.yaml +++ b/postgres_ai_helm/values.yaml @@ -60,9 +60,9 @@ pgwatchPrometheus: image: cybertecpostgresql/pgwatch:3 resources: {} -flask: +metricsServer: enabled: true - image: postgresai/monitoring-flask-backend:latest + image: postgresai/metrics-server:latest imagePullPolicy: IfNotPresent containerPort: 8000 healthPath: /health diff --git a/reporter/Dockerfile b/reporter/Dockerfile deleted file mode 100644 index 74b9d685..00000000 --- a/reporter/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM python:3.11-slim - -ARG VERSION -RUN test -n "${VERSION}" || (echo "VERSION build arg is required" && exit 1) -ARG BUILD_TS -RUN test -n "${BUILD_TS}" || (echo "BUILD_TS build arg is required" && exit 1) - -LABEL org.opencontainers.image.title="PostgresAI Reporter" -LABEL org.opencontainers.image.description="Automated Postgres health check and monitoring reports" -LABEL org.opencontainers.image.vendor="PostgresAI" -LABEL org.opencontainers.image.source="https://github.com/PostgresAI/postgres-ai-monitoring" -LABEL org.opencontainers.image.version="${VERSION}" -LABEL org.opencontainers.image.created="${BUILD_TS}" - -# Set working directory -WORKDIR /app - -# Install dependencies -COPY requirements.txt /app/requirements.txt -RUN pip install --no-cache-dir -r /app/requirements.txt - -# Copy the full reporter package (postgres_reports.py imports reporter.*) -RUN mkdir -p /app/reporter -COPY . 
/app/reporter - -# Make script executable -RUN chmod +x /app/reporter/postgres_reports.py - -# Embed build metadata into the image filesystem -RUN printf '%s' "${VERSION}" > /VERSION \ - && printf '%s' "${BUILD_TS}" > /BUILD_TS - -# Create reports directory -RUN mkdir -p /app/reports - -# Default command -CMD ["python", "-m", "reporter.postgres_reports"] - diff --git a/reporter/__init__.py b/reporter/__init__.py deleted file mode 100644 index 9e176ed6..00000000 --- a/reporter/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Reporter package exposing report generation utilities.""" diff --git a/reporter/logger.py b/reporter/logger.py deleted file mode 100644 index 16ba6997..00000000 --- a/reporter/logger.py +++ /dev/null @@ -1,52 +0,0 @@ -import logging -import os -import sys -from typing import Optional - - -class _DynamicStdoutHandler(logging.Handler): - """ - Write to the *current* sys.stdout at emit-time. - - This matters for pytest's capture (capsys), which swaps sys.stdout per-test; a - StreamHandler created at import-time would hold a stale stream reference. - """ - - def emit(self, record: logging.LogRecord) -> None: - msg = self.format(record) - stream = sys.stdout - stream.write(msg + "\n") - stream.flush() - - -def get_logger(name: str = "reporter", log_level: Optional[int] = None) -> logging.Logger: - """ - Return a configured logger for reporter code. - - - Formatter matches the style used in our other repo: - "%(asctime)s - %(levelname)s - %(message)s" - - Level defaults to REPORTER_LOG_LEVEL env var (INFO if unset). - - Uses a dynamic stdout handler to cooperate with pytest capture. 
- """ - logger = logging.getLogger(name) - if logger.handlers: - return logger - - if log_level is None: - level_name = os.environ.get("REPORTER_LOG_LEVEL", "INFO").upper() - log_level = logging._nameToLevel.get(level_name, logging.INFO) # type: ignore[attr-defined] - - app_handler = _DynamicStdoutHandler() - app_handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) - app_handler.setLevel(log_level) - - logger.setLevel(log_level) - logger.addHandler(app_handler) - logger.propagate = False - return logger - - -# Default logger used by reporter modules. -logger = get_logger() - - diff --git a/reporter/postgres_reports.py b/reporter/postgres_reports.py deleted file mode 100644 index cd4affa3..00000000 --- a/reporter/postgres_reports.py +++ /dev/null @@ -1,5028 +0,0 @@ -#!/usr/bin/env python3 -""" -PostgreSQL Reports Generator using PromQL - -This script generates JSON reports containing Observations for specific PostgreSQL -check types (A002, A003, A004, A007, D004, F001, F004, F005, H001, H002, H004, -K001, K003, K004, K005, K006, K007, K008, M001, M002, M003, N001) by querying -Prometheus metrics using PromQL. - -IMPORTANT: Scope of this module -------------------------------- -This module ONLY generates JSON reports with raw Observations (data collected -from Prometheus/PostgreSQL). The following are explicitly OUT OF SCOPE: - - - Converting JSON reports to other formats (Markdown, HTML, PDF, etc.) - - Generating Conclusions based on Observations - - Generating Recommendations based on Conclusions - - Any report rendering or presentation logic - -These responsibilities are handled by separate components in the system. -The JSON output from this module serves as input for downstream processing. 
-""" - -__version__ = "1.0.2" - -import requests -import json -import time -import re -import gc -from datetime import datetime, timedelta, timezone -from typing import Dict, List, Any, Optional, Tuple, Sequence -import argparse -import sys -import os -from pathlib import Path -try: - import psycopg2 - import psycopg2.extras -except ImportError: # pragma: no cover - psycopg2 = None - - -from reporter.logger import logger - - -class PostgresReportGenerator: - # Default databases to always exclude - DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} - - # Settings filter lists for reports based on A003 - D004_SETTINGS = [ - 'pg_stat_statements.max', - 'pg_stat_statements.track', - 'pg_stat_statements.track_utility', - 'pg_stat_statements.save', - 'pg_stat_statements.track_planning', - 'shared_preload_libraries', - 'track_activities', - 'track_counts', - 'track_functions', - 'track_io_timing', - 'track_wal_io_timing' - ] - - F001_SETTINGS = [ - 'autovacuum', - 'autovacuum_analyze_scale_factor', - 'autovacuum_analyze_threshold', - 'autovacuum_freeze_max_age', - 'autovacuum_max_workers', - 'autovacuum_multixact_freeze_max_age', - 'autovacuum_naptime', - 'autovacuum_vacuum_cost_delay', - 'autovacuum_vacuum_cost_limit', - 'autovacuum_vacuum_insert_scale_factor', - 'autovacuum_vacuum_scale_factor', - 'autovacuum_vacuum_threshold', - 'autovacuum_work_mem', - 'vacuum_cost_delay', - 'vacuum_cost_limit', - 'vacuum_cost_page_dirty', - 'vacuum_cost_page_hit', - 'vacuum_cost_page_miss', - 'vacuum_freeze_min_age', - 'vacuum_freeze_table_age', - 'vacuum_multixact_freeze_min_age', - 'vacuum_multixact_freeze_table_age' - ] - - G001_SETTINGS = [ - 'shared_buffers', - 'work_mem', - 'maintenance_work_mem', - 'effective_cache_size', - 'autovacuum_work_mem', - 'max_wal_size', - 'min_wal_size', - 'wal_buffers', - 'checkpoint_completion_target', - 'max_connections', - 'max_prepared_transactions', - 'max_locks_per_transaction', - 
'max_pred_locks_per_transaction', - 'max_pred_locks_per_relation', - 'max_pred_locks_per_page', - 'logical_decoding_work_mem', - 'hash_mem_multiplier', - 'temp_buffers', - 'shared_preload_libraries', - 'dynamic_shared_memory_type', - 'huge_pages', - 'max_files_per_process', - 'max_stack_depth' - ] - - def __init__(self, prometheus_url: str = "http://sink-prometheus:9090", - postgres_sink_url: str = "postgresql://pgwatch@sink-postgres:5432/measurements", - excluded_databases: Optional[List[str]] = None): - """ - Initialize the PostgreSQL report generator. - - Args: - prometheus_url: URL of the Prometheus instance (default: http://sink-prometheus:9090) - postgres_sink_url: Connection string for the Postgres sink database - (default: postgresql://pgwatch@sink-postgres:5432/measurements) - excluded_databases: Additional databases to exclude from reports - """ - self.prometheus_url = prometheus_url - self.base_url = f"{prometheus_url}/api/v1" - self.postgres_sink_url = postgres_sink_url - self.pg_conn = None - self._build_metadata = self._load_build_metadata() - # Combine default exclusions with user-provided exclusions - self.excluded_databases = self.DEFAULT_EXCLUDED_DATABASES.copy() - if excluded_databases: - self.excluded_databases.update(excluded_databases) - - def _read_text_file(self, path: str) -> Optional[str]: - """Read and strip a small text file. Returns None if missing/empty/unreadable.""" - try: - with open(path, "r", encoding="utf-8") as f: - value = f.read().strip() - return value or None - except Exception: - return None - - def _load_build_metadata(self) -> Dict[str, Optional[str]]: - """ - Load build metadata from the container filesystem. 
- - Defaults: - - VERSION_FILE: /VERSION - - BUILD_TS_FILE: /BUILD_TS - Both paths can be overridden for testing via env: - - PGAI_VERSION_FILE - - PGAI_BUILD_TS_FILE - """ - version_path = os.getenv("PGAI_VERSION_FILE", "/VERSION") - build_ts_path = os.getenv("PGAI_BUILD_TS_FILE", "/BUILD_TS") - return { - "version": self._read_text_file(version_path), - "build_ts": self._read_text_file(build_ts_path), - } - - def test_connection(self) -> bool: - """Test connection to Prometheus.""" - try: - response = requests.get(f"{self.base_url}/status/config", timeout=10) - return response.status_code == 200 - except Exception as e: - logger.error(f"Connection failed: {e}") - return False - - def connect_postgres_sink(self) -> bool: - """Connect to Postgres sink database.""" - if not self.postgres_sink_url: - return False - if psycopg2 is None: - raise RuntimeError("psycopg2 is required for postgres sink access but is not installed") - - try: - self.pg_conn = psycopg2.connect(self.postgres_sink_url) - return True - except Exception as e: - logger.error(f"Postgres sink connection failed: {e}") - return False - - def close_postgres_sink(self): - """Close Postgres sink connection.""" - if self.pg_conn: - self.pg_conn.close() - self.pg_conn = None - - def get_index_definitions_from_sink(self, db_name: str = None) -> Dict[str, str]: - """ - Get index definitions from the Postgres sink database. - - Args: - db_name: Optional database name to filter results - - Returns: - Dictionary mapping index names to their definitions - """ - if not self.pg_conn: - if not self.connect_postgres_sink(): - return {} - - index_definitions = {} - - try: - with self.pg_conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='index_defs_cursor') as cursor: - # Use server-side cursor for memory efficiency with large result sets - # PERFORMANCE NOTE: This query will use a Seq Scan on index_definitions table. - # This is acceptable because: - # 1. 
This method is called VERY rarely (only during report generation) - # 2. The table size is expected to remain small (< 10000 rows per database) - # 3. Current latency is well under 1 second for typical workloads - # - # If the table grows significantly larger (>> 10000 rows) or latency exceeds 1s, - # consider adding a GIN index on the data JSONB column or materialized view. - if db_name: - query = """ - select distinct on (data->>'indexrelname') - data->>'indexrelname' as indexrelname, - data->>'index_definition' as index_definition, - dbname - from public.index_definitions - where dbname = %s - order by data->>'indexrelname', time desc - """ - cursor.execute(query, (db_name,)) - else: - query = """ - select distinct on (dbname, data->>'indexrelname') - data->>'indexrelname' as indexrelname, - data->>'index_definition' as index_definition, - dbname - from public.index_definitions - order by dbname, data->>'indexrelname', time desc - """ - cursor.execute(query) - - # Use iterator to fetch rows in batches instead of loading all at once - for row in cursor: - if row['indexrelname']: - # Include database name in the key to avoid collisions across databases - key = f"{row['dbname']}.{row['indexrelname']}" if not db_name else row['indexrelname'] - index_definitions[key] = row['index_definition'] - - except Exception as e: - logger.error(f"Error fetching index definitions from Postgres sink: {e}") - - return index_definitions - - def get_queryid_queries_from_sink(self, query_text_limit: int = 655360, db_names: List[str] = None) -> Dict[str, Dict[str, str]]: - """ - Get queryid-to-query text mappings from the Postgres sink database. 
- - Args: - query_text_limit: Maximum number of characters for each query text (default: 655360) - db_names: Optional list of database names to filter results (default: fetch all) - - Returns: - Dictionary with database names as keys, containing queryid->query mappings - """ - if not self.pg_conn: - if not self.connect_postgres_sink(): - return {} - - queries_by_db: Dict[str, Dict[str, str]] = {} - - try: - # Use server-side cursor for memory efficiency with large result sets - with self.pg_conn.cursor(cursor_factory=psycopg2.extras.DictCursor, name='queryid_cursor') as cursor: - # Query unique queryid-to-query mappings - # The pgss_queryid_queries table stores deduplicated queryid->query mappings - if db_names: - query = """ - select distinct on (dbname, data->>'queryid') - dbname, - data->>'queryid' as queryid, - data->>'query' as query - from public.pgss_queryid_queries - where - dbname = ANY(%s) - and data->>'queryid' is not null - and data->>'query' is not null - order by dbname, data->>'queryid', time desc - """ - cursor.execute(query, (db_names,)) - else: - query = """ - select distinct on (dbname, data->>'queryid') - dbname, - data->>'queryid' as queryid, - data->>'query' as query - from public.pgss_queryid_queries - where - data->>'queryid' is not null - and data->>'query' is not null - order by dbname, data->>'queryid', time desc - """ - cursor.execute(query) - - # Use iterator to fetch rows in batches instead of loading all at once - for row in cursor: - db_name = row['dbname'] - queryid = row['queryid'] - query_text = row['query'] - - # Skip if queryid is missing - if not queryid: - continue - - # Truncate query text if it exceeds the limit - if query_text and len(query_text) > query_text_limit: - query_text = query_text[:query_text_limit] + '...' 
- - # Initialize database dict if needed - if db_name not in queries_by_db: - queries_by_db[db_name] = {} - - queries_by_db[db_name][queryid] = query_text or '' - - except Exception as e: - logger.error(f"Error fetching queryid queries from Postgres sink: {e}") - - return queries_by_db - - def query_instant(self, query: str) -> Dict[str, Any]: - """ - Execute an instant PromQL query. - - Args: - query: PromQL query string - - Returns: - Dictionary containing the query results - """ - params = {'query': query} - - try: - response = requests.get(f"{self.base_url}/query", params=params) - if response.status_code == 200: - return response.json() - else: - logger.error(f"Query failed with status {response.status_code}: {response.text}") - return {} - except Exception as e: - logger.error(f"Query error: {e}") - return {} - - def _get_postgres_version_info(self, cluster: str, node_name: str) -> Dict[str, str]: - """ - Fetch and parse Postgres version information from pgwatch settings metrics. - - Notes: - - This helper is intentionally defensive: it validates the returned setting_name label - (tests may stub query responses broadly by metric name substring). - - Uses a single query with a regex on setting_name to reduce roundtrips. 
- """ - # Support both label schemas: - # - newer/expected-by-tests: setting_name/setting_value - # - older/pgwatch-tagged: tag_setting_name/tag_setting_value - queries = [ - ( - "setting_name", - f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", ' - f'setting_name=~"server_version|server_version_num"}}[3h])', - ), - ( - "tag_setting_name", - f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}", ' - f'tag_setting_name=~"server_version|server_version_num"}}[3h])', - ), - ] - - version_str = None - version_num = None - - for label_name, query in queries: - result = self.query_instant(query) - if result.get("status") != "success": - continue - for item in (result.get("data", {}) or {}).get("result", []) or []: - metric = item.get("metric", {}) or {} - setting_name = metric.get("setting_name") or metric.get("tag_setting_name") or "" - setting_value = metric.get("setting_value") or metric.get("tag_setting_value") or "" - - if setting_name == "server_version" and setting_value and not version_str: - version_str = setting_value - elif setting_name == "server_version_num" and setting_value and not version_num: - version_num = setting_value - - if version_str or version_num: - break - - if not (version_str or version_num): - logger.warning(f"No version data found (cluster={cluster}, node_name={node_name})") - - server_version = version_str or "Unknown" - version_info: Dict[str, str] = { - "version": server_version, - "server_version_num": version_num or "Unknown", - "server_major_ver": "Unknown", - "server_minor_ver": "Unknown", - } - - if server_version != "Unknown": - # Handle both formats: - # - "15.3" - # - "15.3 (Ubuntu 15.3-1.pgdg20.04+1)" - version_parts = server_version.split()[0].split(".") - if len(version_parts) >= 1 and version_parts[0]: - version_info["server_major_ver"] = version_parts[0] - if len(version_parts) >= 2: - version_info["server_minor_ver"] = ".".join(version_parts[1:]) 
- else: - version_info["server_minor_ver"] = "0" - - return version_info - - def generate_a002_version_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate A002 Version Information report. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing version information - """ - logger.info(f"Generating A002 Version Information report for cluster='{cluster}', node_name='{node_name}'...") - version_info = self._get_postgres_version_info(cluster, node_name) - return self.format_report_data("A002", {"version": version_info}, node_name) - - def generate_a003_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate A003 PostgreSQL Settings report. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing settings information - """ - logger.info("Generating A003 PostgreSQL Settings report...") - - # Query all PostgreSQL settings using the pgwatch_settings_configured metric with last_over_time - # This metric has labels for each setting name - settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - result = self.query_instant(settings_query) - - settings_data = {} - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - # Extract setting name from labels - setting_name = item['metric'].get('setting_name', '') - setting_value = item['metric'].get('setting_value', '') - - # Skip if we don't have a setting name - if not setting_name: - continue - - # Get additional metadata from labels - category = item['metric'].get('category', 'Other') - unit = item['metric'].get('unit', '') - context = item['metric'].get('context', '') - vartype = item['metric'].get('vartype', '') - - settings_data[setting_name] = { - "setting": setting_value, - "unit": unit, - "category": category, - "context": 
context, - "vartype": vartype, - "pretty_value": self.format_setting_value(setting_name, setting_value, unit) - } - else: - logger.warning(f"A003 - No settings data returned for cluster={cluster}, node_name={node_name}") - logger.info(f"Query result status: {result.get('status')}") - logger.info(f"Query result data: {result.get('data', {})}") - - return self.format_report_data("A003", settings_data, node_name, postgres_version=self._get_postgres_version_info(cluster, node_name)) - - def generate_a004_cluster_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate A004 Cluster Information report. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing cluster information - """ - logger.info("Generating A004 Cluster Information report...") - - # Query cluster information - cluster_queries = { - 'active_connections': f'sum(last_over_time(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}", state="active"}}[3h]))', - 'idle_connections': f'sum(last_over_time(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}", state="idle"}}[3h]))', - 'total_connections': f'sum(last_over_time(pgwatch_pg_stat_activity_count{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', - 'database_sizes': f'sum(last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', - 'cache_hit_ratio': f'sum(last_over_time(pgwatch_db_stats_blks_hit{{cluster="{cluster}", node_name="{node_name}"}}[3h])) / clamp_min(sum(last_over_time(pgwatch_db_stats_blks_hit{{cluster="{cluster}", node_name="{node_name}"}}[3h])) + sum(last_over_time(pgwatch_db_stats_blks_read{{cluster="{cluster}", node_name="{node_name}"}}[3h])), 1) * 100', - 'transactions_per_sec': f'sum(rate(pgwatch_db_stats_xact_commit{{cluster="{cluster}", node_name="{node_name}"}}[5m])) + sum(rate(pgwatch_db_stats_xact_rollback{{cluster="{cluster}", node_name="{node_name}"}}[5m]))', - 
'checkpoints_per_sec': f'sum(rate(pgwatch_pg_stat_bgwriter_checkpoints_timed{{cluster="{cluster}", node_name="{node_name}"}}[5m])) + sum(rate(pgwatch_pg_stat_bgwriter_checkpoints_req{{cluster="{cluster}", node_name="{node_name}"}}[5m]))', - 'deadlocks': f'sum(last_over_time(pgwatch_db_stats_deadlocks{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', - 'temp_files': f'sum(last_over_time(pgwatch_db_stats_temp_files{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', - 'temp_bytes': f'sum(last_over_time(pgwatch_db_stats_temp_bytes{{cluster="{cluster}", node_name="{node_name}"}}[3h]))', - } - - cluster_data = {} - for metric_name, query in cluster_queries.items(): - result = self.query_instant(query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - values = result['data']['result'] - if values: - latest_value = values[0].get('value', [None, None])[1] - cluster_data[metric_name] = { - "value": latest_value, - "unit": self.get_cluster_metric_unit(metric_name), - "description": self.get_cluster_metric_description(metric_name) - } - - # Get database sizes - db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - db_sizes_result = self.query_instant(db_sizes_query) - database_sizes = {} - - if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): - for result in db_sizes_result['data']['result']: - db_name = result['metric'].get('datname', 'unknown') - size_bytes = float(result['value'][1]) - database_sizes[db_name] = size_bytes - - return self.format_report_data( - "A004", - { - "general_info": cluster_data, - "database_sizes": database_sizes, - }, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_a007_altered_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ - str, Any]: - """ - Generate A007 Altered Settings report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing altered settings information - """ - logger.info("Generating A007 Altered Settings report...") - - # Query settings by source using the pgwatch_settings_is_default metric with last_over_time - # This returns settings where is_default = 0 (i.e., non-default/altered settings) - settings_by_source_query = f'last_over_time(pgwatch_settings_is_default{{cluster="{cluster}", node_name="{node_name}"}}[3h]) < 1' - result = self.query_instant(settings_by_source_query) - - altered_settings = {} - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - # Extract setting information from labels - setting_name = item['metric'].get('setting_name', '') - value = item['metric'].get('setting_value', '') - unit = item['metric'].get('unit', '') - category = item['metric'].get('category', 'Other') - - # Skip if we don't have a setting name - if not setting_name: - continue - - pretty_value = self.format_setting_value(setting_name, value, unit) - altered_settings[setting_name] = { - "value": value, - "unit": unit, - "category": category, - "pretty_value": pretty_value - } - else: - logger.warning(f"A007 - No altered settings data returned for cluster={cluster}, node_name={node_name}") - logger.info(f"Query result status: {result.get('status')}") - - return self.format_report_data("A007", altered_settings, node_name, postgres_version=self._get_postgres_version_info(cluster, node_name)) - - def generate_h001_invalid_indexes_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ - str, Any]: - """ - Generate H001 Invalid Indexes report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing invalid indexes information - """ - logger.info("Generating H001 Invalid Indexes report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - # Get database sizes - db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - db_sizes_result = self.query_instant(db_sizes_query) - database_sizes = {} - - if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): - for result in db_sizes_result['data']['result']: - db_name = result['metric'].get('datname', 'unknown') - size_bytes = float(result['value'][1]) - database_sizes[db_name] = size_bytes - - invalid_indexes_by_db = {} - for db_name in databases: - # Fetch index definitions from the sink for this database (used to aid remediation) - index_definitions = self.get_index_definitions_from_sink(db_name) - # Query invalid indexes for each database - invalid_indexes_query = f'last_over_time(pgwatch_pg_invalid_indexes{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])' - result = self.query_instant(invalid_indexes_query) - - invalid_indexes = [] - total_size = 0 - - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - # Extract index information from labels and values - schema_name = item['metric'].get('schema_name', 'unknown') - table_name = item['metric'].get('table_name', 'unknown') - index_name = item['metric'].get('index_name', 'unknown') - relation_name = item['metric'].get('relation_name', f"{schema_name}.{table_name}") - - # Get index size from the metric value - index_size_bytes = float(item['value'][1]) if item.get('value') else 0 - supports_fk = item['metric'].get('supports_fk', '0') - - invalid_index = { - "schema_name": schema_name, - "table_name": table_name, - "index_name": index_name, - 
"relation_name": relation_name, - "index_size_bytes": index_size_bytes, - "index_size_pretty": self.format_bytes(index_size_bytes), - "index_definition": index_definitions.get(index_name, "Definition not available"), - "supports_fk": bool(int(supports_fk)) - } - - invalid_indexes.append(invalid_index) - total_size += index_size_bytes - - # Skip databases with no invalid indexes - if not invalid_indexes: - continue - - db_size_bytes = database_sizes.get(db_name, 0) - invalid_indexes_by_db[db_name] = { - "invalid_indexes": invalid_indexes, - "total_count": len(invalid_indexes), - "total_size_bytes": total_size, - "total_size_pretty": self.format_bytes(total_size), - "database_size_bytes": db_size_bytes, - "database_size_pretty": self.format_bytes(db_size_bytes) - } - - return self.format_report_data( - "H001", - invalid_indexes_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_h002_unused_indexes_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate H002 Unused Indexes report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing unused indexes information - """ - logger.info("Generating H002 Unused Indexes report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - # Get database sizes - db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - db_sizes_result = self.query_instant(db_sizes_query) - database_sizes = {} - - if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): - for result in db_sizes_result['data']['result']: - db_name = result['metric'].get('datname', 'unknown') - size_bytes = float(result['value'][1]) - database_sizes[db_name] = size_bytes - - # Query postmaster uptime to get startup time - postmaster_uptime_query = f'last_over_time(pgwatch_db_stats_postmaster_uptime_s{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - postmaster_uptime_result = self.query_instant(postmaster_uptime_query) - - postmaster_startup_time = None - postmaster_startup_epoch = None - if postmaster_uptime_result.get('status') == 'success' and postmaster_uptime_result.get('data', {}).get('result'): - uptime_seconds = float(postmaster_uptime_result['data']['result'][0]['value'][1]) if postmaster_uptime_result['data']['result'] else None - if uptime_seconds: - postmaster_startup_epoch = datetime.now().timestamp() - uptime_seconds - postmaster_startup_time = datetime.fromtimestamp(postmaster_startup_epoch).isoformat() - - unused_indexes_by_db = {} - for db_name in databases: - # Get index definitions from Postgres sink database for this specific database - index_definitions = self.get_index_definitions_from_sink(db_name) - # Query stats_reset timestamp for this database - stats_reset_query = f'last_over_time(pgwatch_stats_reset_stats_reset_epoch{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])' - stats_reset_result = 
self.query_instant(stats_reset_query) - - stats_reset_epoch = None - days_since_reset = None - stats_reset_time = None - - if stats_reset_result.get('status') == 'success' and stats_reset_result.get('data', {}).get('result'): - stats_reset_epoch = float(stats_reset_result['data']['result'][0]['value'][1]) if stats_reset_result['data']['result'] else None - if stats_reset_epoch: - stats_reset_time = datetime.fromtimestamp(stats_reset_epoch).isoformat() - days_since_reset = (datetime.now() - datetime.fromtimestamp(stats_reset_epoch)).days - - # Query unused indexes for each database using last_over_time to get most recent value - unused_indexes_query = f'last_over_time(pgwatch_unused_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])' - unused_result = self.query_instant(unused_indexes_query) - - unused_indexes = [] - if unused_result.get('status') == 'success' and unused_result.get('data', {}).get('result'): - for item in unused_result['data']['result']: - schema_name = item['metric'].get('schema_name', 'unknown') - table_name = item['metric'].get('table_name', 'unknown') - index_name = item['metric'].get('index_name', 'unknown') - reason = item['metric'].get('reason', 'Unknown') - - # Get the index size from the metric value - index_size_bytes = float(item['value'][1]) if item.get('value') else 0 - - # Query other related metrics for this index - idx_scan_query = f'last_over_time(pgwatch_unused_indexes_idx_scan{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' - idx_scan_result = self.query_instant(idx_scan_query) - idx_scan = float(idx_scan_result['data']['result'][0]['value'][1]) if idx_scan_result.get('data', - {}).get( - 'result') else 0 - - # Get index definition from collected metrics - index_definition = index_definitions.get(index_name, "Definition not available") - - index_data = { - "schema_name": 
schema_name, - "table_name": table_name, - "index_name": index_name, - "index_definition": index_definition, - "reason": reason, - "idx_scan": idx_scan, - "index_size_bytes": index_size_bytes, - "idx_is_btree": item['metric'].get('idx_is_btree', 'false') == 'true', - "supports_fk": bool(int(item['metric'].get('supports_fk', 0))) - } - - index_data['index_size_pretty'] = self.format_bytes(index_data['index_size_bytes']) - - unused_indexes.append(index_data) - - # Sort by index size descending - unused_indexes.sort(key=lambda x: x['index_size_bytes'], reverse=True) - - # Skip databases with no unused indexes - if not unused_indexes: - continue - - total_unused_size = sum(idx['index_size_bytes'] for idx in unused_indexes) - - db_size_bytes = database_sizes.get(db_name, 0) - unused_indexes_by_db[db_name] = { - "unused_indexes": unused_indexes, - "total_count": len(unused_indexes), - "total_size_bytes": total_unused_size, - "total_size_pretty": self.format_bytes(total_unused_size), - "database_size_bytes": db_size_bytes, - "database_size_pretty": self.format_bytes(db_size_bytes), - "stats_reset": { - "stats_reset_epoch": stats_reset_epoch, - "stats_reset_time": stats_reset_time, - "days_since_reset": days_since_reset, - "postmaster_startup_epoch": postmaster_startup_epoch, - "postmaster_startup_time": postmaster_startup_time - } - } - - return self.format_report_data( - "H002", - unused_indexes_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_h004_redundant_indexes_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ - str, Any]: - """ - Generate H004 Redundant Indexes report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing redundant indexes information - """ - logger.info("Generating H004 Redundant Indexes report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - # Get database sizes - db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - db_sizes_result = self.query_instant(db_sizes_query) - database_sizes = {} - - if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): - for result in db_sizes_result['data']['result']: - db_name = result['metric'].get('datname', 'unknown') - size_bytes = float(result['value'][1]) - database_sizes[db_name] = size_bytes - - redundant_indexes_by_db = {} - for db_name in databases: - # Fetch index definitions from the sink for this database (used to aid remediation) - index_definitions = self.get_index_definitions_from_sink(db_name) - # Query redundant indexes for each database using last_over_time to get most recent value - redundant_indexes_query = f'last_over_time(pgwatch_redundant_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}"}}[3h])' - result = self.query_instant(redundant_indexes_query) - - redundant_indexes = [] - total_size = 0 - - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - schema_name = item['metric'].get('schema_name', 'unknown') - table_name = item['metric'].get('table_name', 'unknown') - index_name = item['metric'].get('index_name', 'unknown') - relation_name = item['metric'].get('relation_name', f"{schema_name}.{table_name}") - access_method = item['metric'].get('access_method', 'unknown') - reason = item['metric'].get('reason', 'Unknown') - - # Get the index size from the metric value - index_size_bytes = float(item['value'][1]) if item.get('value') else 0 - - # Query other related metrics 
for this index - table_size_query = f'last_over_time(pgwatch_redundant_indexes_table_size_bytes{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' - table_size_result = self.query_instant(table_size_query) - table_size_bytes = float( - table_size_result['data']['result'][0]['value'][1]) if table_size_result.get('data', {}).get( - 'result') else 0 - - index_usage_query = f'last_over_time(pgwatch_redundant_indexes_index_usage{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' - index_usage_result = self.query_instant(index_usage_query) - index_usage = float(index_usage_result['data']['result'][0]['value'][1]) if index_usage_result.get( - 'data', {}).get('result') else 0 - - supports_fk_query = f'last_over_time(pgwatch_redundant_indexes_supports_fk{{cluster="{cluster}", node_name="{node_name}", dbname="{db_name}", schema_name="{schema_name}", table_name="{table_name}", index_name="{index_name}"}}[3h])' - supports_fk_result = self.query_instant(supports_fk_query) - supports_fk = bool( - int(supports_fk_result['data']['result'][0]['value'][1])) if supports_fk_result.get('data', - {}).get( - 'result') else False - - # Build redundant_to array from the reason field - # The reason field contains comma-separated index names - # (the indexes that make this index redundant) - # Note: In full mode, index sizes for redundant_to are not available - # (would require additional Prometheus queries). Use express mode for sizes. 
- redundant_to = [] - for idx_name in [r.strip() for r in reason.split(',') if r.strip()]: - redundant_to.append({ - "index_name": idx_name, - "index_definition": index_definitions.get(idx_name, "Definition not available"), - "index_size_bytes": 0, - "index_size_pretty": "N/A" - }) - - redundant_index = { - "schema_name": schema_name, - "table_name": table_name, - "index_name": index_name, - "relation_name": relation_name, - "access_method": access_method, - "reason": reason, - "index_size_bytes": index_size_bytes, - "table_size_bytes": table_size_bytes, - "index_usage": index_usage, - "supports_fk": supports_fk, - "index_definition": index_definitions.get(index_name, "Definition not available"), - "index_size_pretty": self.format_bytes(index_size_bytes), - "table_size_pretty": self.format_bytes(table_size_bytes), - "redundant_to": redundant_to - } - - redundant_indexes.append(redundant_index) - total_size += index_size_bytes - - # Sort by index size descending - redundant_indexes.sort(key=lambda x: x['index_size_bytes'], reverse=True) - - # Skip databases with no redundant indexes - if not redundant_indexes: - continue - - db_size_bytes = database_sizes.get(db_name, 0) - redundant_indexes_by_db[db_name] = { - "redundant_indexes": redundant_indexes, - "total_count": len(redundant_indexes), - "total_size_bytes": total_size, - "total_size_pretty": self.format_bytes(total_size), - "database_size_bytes": db_size_bytes, - "database_size_pretty": self.format_bytes(db_size_bytes) - } - - return self.format_report_data( - "H004", - redundant_indexes_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_d004_pgstat_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ - str, Any]: - """ - Generate D004 pgstatstatements and pgstatkcache Settings report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing pg_stat_statements and pg_stat_kcache settings information - """ - logger.info("Generating D004 pgstatstatements and pgstatkcache Settings report...") - - # Define relevant pg_stat_statements and pg_stat_kcache settings - pgstat_settings = [ - 'pg_stat_statements.max', - 'pg_stat_statements.track', - 'pg_stat_statements.track_utility', - 'pg_stat_statements.save', - 'pg_stat_statements.track_planning', - 'shared_preload_libraries', - 'track_activities', - 'track_counts', - 'track_functions', - 'track_io_timing', - 'track_wal_io_timing' - ] - - # Query all PostgreSQL settings for pg_stat_statements and related using last_over_time - settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - result = self.query_instant(settings_query) - - pgstat_data = {} - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - setting_name = item['metric'].get('setting_name', '') - - # Skip if no setting name - if not setting_name: - continue - - # Filter for pg_stat_statements and related settings - if setting_name in pgstat_settings: - setting_value = item['metric'].get('setting_value', '') - category = item['metric'].get('category', 'Statistics') - unit = item['metric'].get('unit', '') - context = item['metric'].get('context', '') - vartype = item['metric'].get('vartype', '') - - pgstat_data[setting_name] = { - "setting": setting_value, - "unit": unit, - "category": category, - "context": context, - "vartype": vartype, - "pretty_value": self.format_setting_value(setting_name, setting_value, unit) - } - else: - logger.warning(f"D004 - No settings data returned for cluster={cluster}, node_name={node_name}") - - # Check if pg_stat_kcache extension is available and working by querying its metrics - kcache_status = self._check_pg_stat_kcache_status(cluster, node_name) - 
- # Check if pg_stat_statements is available and working by querying its metrics - pgss_status = self._check_pg_stat_statements_status(cluster, node_name) - - return self.format_report_data( - "D004", - { - "settings": pgstat_data, - "pg_stat_statements_status": pgss_status, - "pg_stat_kcache_status": kcache_status, - }, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def _check_pg_stat_kcache_status(self, cluster: str, node_name: str) -> Dict[str, Any]: - """ - Check if pg_stat_kcache extension is working by querying its metrics. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing pg_stat_kcache status information - """ - kcache_queries = { - 'exec_user_time': f'last_over_time(pgwatch_pg_stat_kcache_exec_user_time{{cluster="{cluster}", node_name="{node_name}"}}[3h])', - 'exec_system_time': f'last_over_time(pgwatch_pg_stat_kcache_exec_system_time{{cluster="{cluster}", node_name="{node_name}"}}[3h])', - 'exec_total_time': f'last_over_time(pgwatch_pg_stat_kcache_exec_total_time{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - } - - kcache_status = { - "extension_available": False, - "metrics_count": 0, - "total_exec_time": 0, - "total_user_time": 0, - "total_system_time": 0, - "sample_queries": [] - } - - for metric_name, query in kcache_queries.items(): - result = self.query_instant(query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - kcache_status["extension_available"] = True - results = result['data']['result'] - - for item in results[:5]: # Get sample of top 5 queries - queryid = item['metric'].get('queryid', 'unknown') - user = item['metric'].get('tag_user', 'unknown') - value = float(item['value'][1]) if item.get('value') else 0 - - # Add to totals - if metric_name == 'exec_total_time': - kcache_status["total_exec_time"] += value - kcache_status["metrics_count"] = len(results) - - # Store sample query info - if 
len(kcache_status["sample_queries"]) < 5: - kcache_status["sample_queries"].append({ - "queryid": queryid, - "user": user, - "exec_total_time": value - }) - elif metric_name == 'exec_user_time': - kcache_status["total_user_time"] += value - elif metric_name == 'exec_system_time': - kcache_status["total_system_time"] += value - - return kcache_status - - def _check_pg_stat_statements_status(self, cluster: str, node_name: str) -> Dict[str, Any]: - """ - Check if pg_stat_statements extension is working by querying its metrics. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing pg_stat_statements status information - """ - pgss_query = f'last_over_time(pgwatch_pg_stat_statements_calls{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - result = self.query_instant(pgss_query) - - pgss_status = { - "extension_available": False, - "metrics_count": 0, - "total_calls": 0, - "sample_queries": [] - } - - if result.get('status') == 'success' and result.get('data', {}).get('result'): - pgss_status["extension_available"] = True - results = result['data']['result'] - pgss_status["metrics_count"] = len(results) - - for item in results[:5]: # Get sample of top 5 queries - queryid = item['metric'].get('queryid', 'unknown') - user = item['metric'].get('tag_user', 'unknown') - datname = item['metric'].get('datname', 'unknown') - calls = float(item['value'][1]) if item.get('value') else 0 - - pgss_status["total_calls"] += calls - - # Store sample query info - if len(pgss_status["sample_queries"]) < 5: - pgss_status["sample_queries"].append({ - "queryid": queryid, - "user": user, - "database": datname, - "calls": calls - }) - - return pgss_status - - def generate_f001_autovacuum_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ - str, Any]: - """ - Generate F001 Autovacuum: Current Settings report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing autovacuum settings information - """ - logger.info("Generating F001 Autovacuum: Current Settings report...") - - # Define autovacuum related settings - autovacuum_settings = [ - 'autovacuum', - 'autovacuum_analyze_scale_factor', - 'autovacuum_analyze_threshold', - 'autovacuum_freeze_max_age', - 'autovacuum_max_workers', - 'autovacuum_multixact_freeze_max_age', - 'autovacuum_naptime', - 'autovacuum_vacuum_cost_delay', - 'autovacuum_vacuum_cost_limit', - 'autovacuum_vacuum_insert_scale_factor', - 'autovacuum_vacuum_scale_factor', - 'autovacuum_vacuum_threshold', - 'autovacuum_work_mem', - 'vacuum_cost_delay', - 'vacuum_cost_limit', - 'vacuum_cost_page_dirty', - 'vacuum_cost_page_hit', - 'vacuum_cost_page_miss', - 'vacuum_freeze_min_age', - 'vacuum_freeze_table_age', - 'vacuum_multixact_freeze_min_age', - 'vacuum_multixact_freeze_table_age' - ] - - # Query all PostgreSQL settings for autovacuum using last_over_time - settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - result = self.query_instant(settings_query) - - autovacuum_data = {} - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - setting_name = item['metric'].get('setting_name', 'unknown') - - # Filter for autovacuum and vacuum settings - if setting_name in autovacuum_settings: - setting_value = item['metric'].get('setting_value', '') - category = item['metric'].get('category', 'Autovacuum') - unit = item['metric'].get('unit', '') - context = item['metric'].get('context', '') - vartype = item['metric'].get('vartype', '') - - autovacuum_data[setting_name] = { - "setting": setting_value, - "unit": unit, - "category": category, - "context": context, - "vartype": vartype, - "pretty_value": self.format_setting_value(setting_name, setting_value, unit) - } - - return 
self.format_report_data("F001", autovacuum_data, node_name, postgres_version=self._get_postgres_version_info(cluster, node_name)) - - def generate_f005_btree_bloat_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate F005 Autovacuum: Btree Index Bloat (Estimated) report. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing btree index bloat information - """ - logger.info("Generating F005 Autovacuum: Btree Index Bloat (Estimated) report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - # Get database sizes - db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - db_sizes_result = self.query_instant(db_sizes_query) - database_sizes = {} - - if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): - for result in db_sizes_result['data']['result']: - db_name = result['metric'].get('datname', 'unknown') - size_bytes = float(result['value'][1]) - database_sizes[db_name] = size_bytes - - bloated_indexes_by_db = {} - for db_name in databases: - # Fetch last vacuum timestamp per table (from pg_stat_all_tables) so we can attach it to indexes. - last_vacuum_query = ( - f'last_over_time(pgwatch_pg_stat_all_tables_last_vacuum' - f'{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])' - ) - last_vacuum_result = self.query_instant(last_vacuum_query) - last_vacuum_by_table: Dict[str, float] = {} - if last_vacuum_result.get('status') == 'success' and last_vacuum_result.get('data', {}).get('result'): - for item in last_vacuum_result['data']['result']: - metric = item.get('metric', {}) - schema_name = ( - metric.get('schemaname') - or metric.get('tag_schemaname') - or 'unknown' - ) - # pg_stat_all_tables uses relname, but be defensive in case of label differences. 
- relname = ( - metric.get('relname') - or metric.get('tag_relname') - or metric.get('tblname') - or metric.get('tag_tblname') - or metric.get('table_name') - or 'unknown' - ) - key = f"{schema_name}.{relname}" - value = float(item['value'][1]) if item.get('value') else 0 - last_vacuum_by_table[key] = value - - # Fetch table sizes from pg_class as a fallback if pg_btree_bloat_table_size_mib is unavailable. - table_sizes_query = ( - f'last_over_time(pgwatch_pg_class_relation_size_bytes' - f'{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}", relkind="r"}}[3h])' - ) - table_sizes_result = self.query_instant(table_sizes_query) - table_size_by_table: Dict[str, float] = {} - if table_sizes_result.get('status') == 'success' and table_sizes_result.get('data', {}).get('result'): - for item in table_sizes_result['data']['result']: - metric = item.get('metric', {}) or {} - schema_name = ( - metric.get('schemaname') - or metric.get('tag_schemaname') - or 'unknown' - ) - relname = ( - metric.get('relname') - or metric.get('tag_relname') - or metric.get('tblname') - or metric.get('tag_tblname') - or metric.get('table_name') - or 'unknown' - ) - key = f"{schema_name}.{relname}" - value = float(item['value'][1]) if item.get('value') else 0 - table_size_by_table[key] = value - - # Query btree bloat using multiple metrics for each database with last_over_time [1d] - bloat_queries = { - # Backward/forward compatible: - # - Older pgwatch configs may expose bytes gauges (real_size, table_size) - # - Newer configs expose MiB gauges (real_size_mib, table_size_mib) - 'real_size_mib': f'last_over_time(pgwatch_pg_btree_bloat_real_size_mib{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'real_size': f'last_over_time(pgwatch_pg_btree_bloat_real_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'table_size_mib': f'last_over_time(pgwatch_pg_btree_bloat_table_size_mib{{cluster="{cluster}", node_name="{node_name}", 
datname="{db_name}"}}[3h])', - 'table_size': f'last_over_time(pgwatch_pg_btree_bloat_table_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'extra_size': f'last_over_time(pgwatch_pg_btree_bloat_extra_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'extra_pct': f'last_over_time(pgwatch_pg_btree_bloat_extra_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'fillfactor': f'last_over_time(pgwatch_pg_btree_bloat_fillfactor{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'bloat_size': f'last_over_time(pgwatch_pg_btree_bloat_bloat_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'bloat_pct': f'last_over_time(pgwatch_pg_btree_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - } - - bloated_indexes = {} - - for metric_type, query in bloat_queries.items(): - result = self.query_instant(query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - metric = item.get('metric', {}) or {} - schema_name = ( - metric.get('schemaname') - or metric.get('tag_schemaname') - or 'unknown' - ) - table_name = ( - metric.get('tblname') - or metric.get('tag_tblname') - or metric.get('relname') - or metric.get('tag_relname') - or metric.get('table_name') - or 'unknown' - ) - index_name = ( - metric.get('idxname') - or metric.get('tag_idxname') - or metric.get('index_name') - or 'unknown' - ) - - index_key = f"{schema_name}.{table_name}.{index_name}" - - if index_key not in bloated_indexes: - bloated_indexes[index_key] = { - "schema_name": schema_name, - "table_name": table_name, - "index_name": index_name, - "real_size_mib": 0, - "table_size_mib": 0, - "real_size": 0, # bytes (from bytes gauge or derived from MiB) - "table_size": 0, # bytes (from bytes gauge or derived from MiB or fallback) - "extra_size": 0, - 
"extra_pct": 0, - "fillfactor": 0, - "bloat_size": 0, - "bloat_pct": 0, - "last_vacuum": 0, - } - - value = float(item['value'][1]) if item.get('value') else 0 - bloated_indexes[index_key][metric_type] = value - - # Skip databases with no bloat data - if not bloated_indexes: - continue - - # Convert to list and add pretty formatting - bloated_indexes_list = [] - total_bloat_size = 0 - - for index_data in bloated_indexes.values(): - key = f"{index_data.get('schema_name', 'unknown')}.{index_data.get('table_name', 'unknown')}" - last_vacuum_epoch = float(last_vacuum_by_table.get(key, 0) or 0) - index_data['last_vacuum_epoch'] = last_vacuum_epoch - index_data['last_vacuum'] = self.format_epoch_timestamp(last_vacuum_epoch) - # Sizes are bytes in the report output. - # Prefer bytes gauges if present, otherwise convert from MiB. - real_size = float(index_data.get('real_size', 0) or 0) - if real_size <= 0: - real_size_mib = float(index_data.get('real_size_mib', 0) or 0) - real_size = real_size_mib * 1024 * 1024 if real_size_mib > 0 else 0 - index_data['real_size'] = int(real_size) - index_data.pop('real_size_mib', None) - - table_size = float(index_data.get('table_size', 0) or 0) - if table_size <= 0: - table_size_mib = float(index_data.get('table_size_mib', 0) or 0) - table_size = table_size_mib * 1024 * 1024 if table_size_mib > 0 else 0 - if table_size <= 0: - table_size = float(table_size_by_table.get(key, 0) or 0) - index_data['table_size'] = int(table_size) - index_data.pop('table_size_mib', None) - - index_data['real_size_pretty'] = self.format_bytes(index_data['real_size']) - index_data['table_size_pretty'] = self.format_bytes(index_data['table_size']) - index_data['extra_size_pretty'] = self.format_bytes(index_data['extra_size']) - index_data['bloat_size_pretty'] = self.format_bytes(index_data['bloat_size']) - - bloated_indexes_list.append(index_data) - total_bloat_size += index_data['bloat_size'] - - # Sort by bloat percentage descending - 
bloated_indexes_list.sort(key=lambda x: x['bloat_pct'], reverse=True) - - db_size_bytes = database_sizes.get(db_name, 0) - bloated_indexes_by_db[db_name] = { - "bloated_indexes": bloated_indexes_list, - "total_count": len(bloated_indexes_list), - "total_bloat_size_bytes": total_bloat_size, - "total_bloat_size_pretty": self.format_bytes(total_bloat_size), - "database_size_bytes": db_size_bytes, - "database_size_pretty": self.format_bytes(db_size_bytes) - } - - return self.format_report_data( - "F005", - bloated_indexes_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_g001_memory_settings_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[ - str, Any]: - """ - Generate G001 Memory-related Settings report. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing memory-related settings information - """ - logger.info("Generating G001 Memory-related Settings report...") - - # Define memory-related settings - memory_settings = [ - 'shared_buffers', - 'work_mem', - 'maintenance_work_mem', - 'effective_cache_size', - 'autovacuum_work_mem', - 'max_wal_size', - 'min_wal_size', - 'wal_buffers', - 'checkpoint_completion_target', - 'max_connections', - 'max_prepared_transactions', - 'max_locks_per_transaction', - 'max_pred_locks_per_transaction', - 'max_pred_locks_per_relation', - 'max_pred_locks_per_page', - 'logical_decoding_work_mem', - 'hash_mem_multiplier', - 'temp_buffers', - 'shared_preload_libraries', - 'dynamic_shared_memory_type', - 'huge_pages', - 'max_files_per_process', - 'max_stack_depth' - ] - - # Query all PostgreSQL settings for memory-related settings using last_over_time - settings_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - result = self.query_instant(settings_query) - - memory_data = {} - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for 
item in result['data']['result']: - setting_name = item['metric'].get('setting_name', '') - - # Skip if no setting name - if not setting_name: - continue - - # Filter for memory-related settings - if setting_name in memory_settings: - setting_value = item['metric'].get('setting_value', '') - category = item['metric'].get('category', 'Memory') - unit = item['metric'].get('unit', '') - context = item['metric'].get('context', '') - vartype = item['metric'].get('vartype', '') - - memory_data[setting_name] = { - "setting": setting_value, - "unit": unit, - "category": category, - "context": context, - "vartype": vartype, - "pretty_value": self.format_setting_value(setting_name, setting_value, unit) - } - else: - logger.warning(f"G001 - No settings data returned for cluster={cluster}, node_name={node_name}") - - # Calculate some memory usage estimates and recommendations - memory_analysis = self._analyze_memory_settings(memory_data) - - return self.format_report_data( - "G001", - { - "settings": memory_data, - "analysis": memory_analysis, - }, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def _analyze_memory_settings(self, memory_data: Dict[str, Any]) -> Dict[str, Any]: - """ - Analyze memory settings and provide estimates and recommendations. 
- - Args: - memory_data: Dictionary of memory settings - - Returns: - Dictionary containing memory analysis - """ - analysis = { - "estimated_total_memory_usage": {} - } - - try: - # Extract key memory values for analysis - shared_buffers = self._parse_memory_value(memory_data.get('shared_buffers', {}).get('setting', '128MB')) - work_mem = self._parse_memory_value(memory_data.get('work_mem', {}).get('setting', '4MB')) - maintenance_work_mem = self._parse_memory_value( - memory_data.get('maintenance_work_mem', {}).get('setting', '64MB')) - effective_cache_size = self._parse_memory_value( - memory_data.get('effective_cache_size', {}).get('setting', '4GB')) - max_connections = int(memory_data.get('max_connections', {}).get('setting', '100')) - wal_buffers = self._parse_memory_value(memory_data.get('wal_buffers', {}).get('setting', '16MB')) - - # Calculate estimated memory usage - shared_memory = shared_buffers + wal_buffers - potential_work_mem_usage = work_mem * max_connections # Worst case scenario - - analysis["estimated_total_memory_usage"] = { - "shared_buffers_bytes": shared_buffers, - "shared_buffers_pretty": self.format_bytes(shared_buffers), - "wal_buffers_bytes": wal_buffers, - "wal_buffers_pretty": self.format_bytes(wal_buffers), - "shared_memory_total_bytes": shared_memory, - "shared_memory_total_pretty": self.format_bytes(shared_memory), - "work_mem_per_connection_bytes": work_mem, - "work_mem_per_connection_pretty": self.format_bytes(work_mem), - "max_work_mem_usage_bytes": potential_work_mem_usage, - "max_work_mem_usage_pretty": self.format_bytes(potential_work_mem_usage), - "maintenance_work_mem_bytes": maintenance_work_mem, - "maintenance_work_mem_pretty": self.format_bytes(maintenance_work_mem), - "effective_cache_size_bytes": effective_cache_size, - "effective_cache_size_pretty": self.format_bytes(effective_cache_size) - } - - # Generate recommendations - except (ValueError, TypeError): - # If parsing fails, return empty analysis - 
analysis["estimated_total_memory_usage"] = {} - - return analysis - - def _parse_memory_value(self, value: str) -> int: - """ - Parse a PostgreSQL memory value string to bytes. - - Args: - value: Memory value string (e.g., "128MB", "4GB", "8192") - - Returns: - Memory value in bytes - """ - if not value or value == '-1': - return 0 - - value = str(value).strip().upper() - - # Handle unit suffixes - if value.endswith('TB'): - return int(float(value[:-2]) * 1024 * 1024 * 1024 * 1024) - elif value.endswith('GB'): - return int(float(value[:-2]) * 1024 * 1024 * 1024) - elif value.endswith('MB'): - return int(float(value[:-2]) * 1024 * 1024) - elif value.endswith('KB'): - return int(float(value[:-2]) * 1024) - elif value.endswith('B'): - return int(float(value[:-1])) - else: - # Assume it's in the PostgreSQL default unit (typically 8KB blocks for some settings) - try: - numeric_value = int(value) - # For most memory settings, bare numbers are in KB or 8KB blocks - # This is a simplified assumption - in reality it depends on the specific setting - return numeric_value * 1024 # Assume KB if no unit specified - except ValueError: - return 0 - - def generate_f004_heap_bloat_report(self, cluster: str = "local", node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate F004 Autovacuum: Heap Bloat (Estimated) report. 
- - Args: - cluster: Cluster name - node_name: Node name - - Returns: - Dictionary containing heap bloat information - """ - logger.info("Generating F004 Autovacuum: Heap Bloat (Estimated) report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("F004 - No databases found") - - # Get database sizes - db_sizes_query = f'last_over_time(pgwatch_db_size_size_b{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - db_sizes_result = self.query_instant(db_sizes_query) - database_sizes = {} - - if db_sizes_result.get('status') == 'success' and db_sizes_result.get('data', {}).get('result'): - for result in db_sizes_result['data']['result']: - db_name = result['metric'].get('datname', 'unknown') - size_bytes = float(result['value'][1]) - database_sizes[db_name] = size_bytes - - bloated_tables_by_db = {} - for db_name in databases: - # Fetch last vacuum timestamp per table (from pg_stat_all_tables). - # Note: prefer `relname`, but be defensive since other parts of the codebase / configs - # sometimes use `tblname`. 
- last_vacuum_query = ( - f'last_over_time(pgwatch_pg_stat_all_tables_last_vacuum' - f'{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])' - ) - last_vacuum_result = self.query_instant(last_vacuum_query) - last_vacuum_by_table: Dict[str, float] = {} - if last_vacuum_result.get('status') == 'success' and last_vacuum_result.get('data', {}).get('result'): - for item in last_vacuum_result['data']['result']: - metric = item.get('metric', {}) - schema_name = ( - metric.get('schemaname') - or metric.get('tag_schemaname') - or 'unknown' - ) - relname = ( - metric.get('relname') - or metric.get('tag_relname') - or metric.get('tblname') - or metric.get('tag_tblname') - or metric.get('table_name') - or 'unknown' - ) - key = f"{schema_name}.{relname}" - value = float(item['value'][1]) if item.get('value') else 0 - last_vacuum_by_table[key] = value - - # Query table bloat using multiple metrics for each database - # Try with 10h window first, then fall back to instant query - bloat_queries = { - # pgwatch publishes "real size" in MiB (real_size_mib). We keep 'real_size' in the - # output as a backwards-compatible alias but it is based on MiB. 
- 'real_size_mib': f'last_over_time(pgwatch_pg_table_bloat_real_size_mib{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'extra_size': f'last_over_time(pgwatch_pg_table_bloat_extra_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'extra_pct': f'last_over_time(pgwatch_pg_table_bloat_extra_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'fillfactor': f'last_over_time(pgwatch_pg_table_bloat_fillfactor{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'bloat_size': f'last_over_time(pgwatch_pg_table_bloat_bloat_size{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - 'bloat_pct': f'last_over_time(pgwatch_pg_table_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}", datname="{db_name}"}}[3h])', - } - - bloated_tables = {} - for metric_type, query in bloat_queries.items(): - result = self.query_instant(query) - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - schema_name = item['metric'].get('schemaname', 'unknown') - table_name = item['metric'].get('tblname', 'unknown') - - table_key = f"{schema_name}.{table_name}" - - if table_key not in bloated_tables: - bloated_tables[table_key] = { - "schema_name": schema_name, - "table_name": table_name, - # Stored temporarily as MiB because pgwatch publishes real_size_mib. - # We'll convert it to bytes for report output. 
- "real_size": 0, - "extra_size": 0, - "extra_pct": 0, - "fillfactor": 0, - "bloat_size": 0, - "bloat_pct": 0, - "last_vacuum": 0, - } - - value = float(item['value'][1]) if item.get('value') else 0 - bloated_tables[table_key][metric_type] = value - else: - if metric_type == 'real_size_mib': # Only log once per database - logger.warning(f"F004 - No bloat data for database {db_name}, metric {metric_type}") - - # Skip databases with no bloat data - if not bloated_tables: - continue - - # Convert to list and add pretty formatting - bloated_tables_list = [] - total_bloat_size = 0 - - for table_data in bloated_tables.values(): - # Normalize real size: Prometheus provides it in MiB (real_size_mib), but the report - # should expose real_size in bytes. - real_size_mib = float(table_data.get('real_size_mib', 0) or 0) - table_data['real_size'] = int(real_size_mib * 1024 * 1024) - # Remove intermediate field so it's not part of the report payload. - table_data.pop('real_size_mib', None) - # Attach last vacuum timestamp (epoch seconds) from pg_stat_all_tables. 
- key = f"{table_data.get('schema_name', 'unknown')}.{table_data.get('table_name', 'unknown')}" - last_vacuum_epoch = float(last_vacuum_by_table.get(key, 0) or 0) - table_data['last_vacuum_epoch'] = last_vacuum_epoch - table_data['last_vacuum'] = self.format_epoch_timestamp(last_vacuum_epoch) - table_data['real_size_pretty'] = self.format_bytes(table_data['real_size']) - table_data['extra_size_pretty'] = self.format_bytes(table_data['extra_size']) - table_data['bloat_size_pretty'] = self.format_bytes(table_data['bloat_size']) - - bloated_tables_list.append(table_data) - total_bloat_size += table_data['bloat_size'] - - # Sort by bloat percentage descending - bloated_tables_list.sort(key=lambda x: x['bloat_pct'], reverse=True) - - db_size_bytes = database_sizes.get(db_name, 0) - bloated_tables_by_db[db_name] = { - "bloated_tables": bloated_tables_list, - "total_count": len(bloated_tables_list), - "total_bloat_size_bytes": total_bloat_size, - "total_bloat_size_pretty": self.format_bytes(total_bloat_size), - "database_size_bytes": db_size_bytes, - "database_size_pretty": self.format_bytes(db_size_bytes) - } - - return self.format_report_data( - "F004", - bloated_tables_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k001_query_calls_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate K001 Globally Aggregated Query Metrics report (sorted by calls). 
- - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing query metrics sorted by calls - """ - logger.info("Generating K001 Globally Aggregated Query Metrics report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K001 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - metric_name = "pgwatch_pg_stat_statements_calls" - - for db_name in databases: - logger.info(f"K001: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, metric_name, hours=hours - ) - - if not per_query and sum(other) == 0: - logger.warning(f"K001 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate total calls per query across all hours - query_totals = [] - for queryid, hourly_values in per_query.items(): - total_calls = sum(hourly_values) - query_totals.append({ - "queryid": queryid, - "total_calls": total_calls, - "hourly_calls": hourly_values - }) - - # Sort by total calls (descending) - sorted_metrics = sorted(query_totals, key=lambda x: x.get('total_calls', 0), reverse=True) - - # Calculate totals - total_calls = sum(q.get('total_calls', 0) for q in sorted_metrics) + sum(other) - - queries_by_db[db_name] = { - "query_metrics": sorted_metrics, - "other_calls_hourly": other, - "summary": { - "total_queries_tracked": len(sorted_metrics), - "total_calls": total_calls, - "total_calls_tracked_queries": sum(q.get('total_calls', 0) for q in sorted_metrics), - "total_calls_other": sum(other), - "time_range_hours": hours, - "hourly_timestamps": timeline - } - } - else: - # 
Fallback to original logic for sub-hourly or when explicitly disabled - end_time = datetime.now() - start_time = end_time - timedelta(minutes=time_range_minutes) - - for db_name in databases: - logger.info(f"K001: Processing database {db_name}...") - # Get pg_stat_statements metrics for this database - query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) - - if not query_metrics: - logger.warning(f"K001 - No query metrics returned for database {db_name}") - - # Sort by calls (descending) - sorted_metrics = sorted(query_metrics, key=lambda x: x.get('calls', 0), reverse=True) - - # Calculate totals for this database - total_calls = sum(q.get('calls', 0) for q in sorted_metrics) - total_time = sum(q.get('total_time', 0) for q in sorted_metrics) - total_rows = sum(q.get('rows', 0) for q in sorted_metrics) - - queries_by_db[db_name] = { - "query_metrics": sorted_metrics, - "summary": { - "total_queries": len(sorted_metrics), - "total_calls": total_calls, - "total_time_ms": total_time, - "total_rows": total_rows, - "time_range_minutes": time_range_minutes, - "start_time": start_time.isoformat(), - "end_time": end_time.isoformat() - } - } - - return self.format_report_data( - "K001", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k003_top_queries_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate K003 Top Queries by total_time (exec + plan) report. 
- - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by total execution time (exec + plan) - """ - logger.info("Generating K003 Top Queries by total_time (exec + plan) report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K003 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - - for db_name in databases: - logger.info(f"K003: Processing database {db_name} (hourly mode)...") - - # Get exec time - exec_per_query, exec_other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, "pgwatch_pg_stat_statements_exec_time_total", hours=hours - ) - - # Get plan time (might not be available in older PG versions) - plan_per_query, plan_other, _ = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, "pgwatch_pg_stat_statements_plan_time_total", hours=hours - ) - - # Check if plan time is actually non-zero (not just if data exists) - total_plan_time = sum(sum(values) for values in plan_per_query.values()) + sum(plan_other) - plan_time_available = total_plan_time > 0 - - if not exec_per_query and sum(exec_other) == 0: - logger.warning(f"K003 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Combine exec and plan time per query across all hours - all_queryids = set(exec_per_query.keys()) | set(plan_per_query.keys()) - query_totals = [] - - for queryid in all_queryids: - exec_values = exec_per_query.get(queryid, [0] * hours) - plan_values = plan_per_query.get(queryid, [0] * hours) - - # Combine exec + plan for each hour - hourly_total_time = 
[e + p for e, p in zip(exec_values, plan_values)] - total_time = sum(hourly_total_time) - total_exec_time = sum(exec_values) - total_plan_time = sum(plan_values) - - query_totals.append({ - "queryid": queryid, - "total_time_ms": total_time, - "total_exec_time_ms": total_exec_time, - "total_plan_time_ms": total_plan_time, - "hourly_time_ms": hourly_total_time, - "hourly_exec_time_ms": exec_values, - "hourly_plan_time_ms": plan_values if plan_time_available else None - }) - - # Sort by total_time (descending) and limit to top N - sorted_metrics = sorted(query_totals, key=lambda x: x.get('total_time_ms', 0), reverse=True)[:limit] - - # Calculate other time (exec + plan) - other_time_hourly = [e + p for e, p in zip(exec_other, plan_other)] - - # Calculate totals - total_time = sum(q.get('total_time_ms', 0) for q in sorted_metrics) + sum(other_time_hourly) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_time_hourly": other_time_hourly, - "other_exec_time_hourly": exec_other, - "other_plan_time_hourly": plan_other if plan_time_available else None, - "summary": { - "queries_returned": len(sorted_metrics), - "total_time_ms": total_time, - "total_time_tracked_queries_ms": sum(q.get('total_time_ms', 0) for q in sorted_metrics), - "total_time_other_ms": sum(other_time_hourly), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit, - "plan_time_available": plan_time_available, - "note": "Includes both exec and plan time" if plan_time_available else "Plan time unavailable, showing exec time only" - } - } - else: - # Fallback to original logic for sub-hourly or when explicitly disabled - end_time = datetime.now() - start_time = end_time - timedelta(minutes=time_range_minutes) - - for db_name in databases: - logger.info(f"K003: Processing database {db_name}...") - # Get pg_stat_statements metrics for this database - query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) - - if not 
query_metrics: - logger.warning(f"K003 - No query metrics returned for database {db_name}") - - # Sort by total_time (descending) and limit to top N per database - sorted_metrics = sorted(query_metrics, key=lambda x: x.get('total_time', 0), reverse=True)[:limit] - - # Calculate totals for the top queries in this database - total_calls = sum(q.get('calls', 0) for q in sorted_metrics) - total_time = sum(q.get('total_time', 0) for q in sorted_metrics) - total_rows = sum(q.get('rows', 0) for q in sorted_metrics) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "summary": { - "queries_returned": len(sorted_metrics), - "total_calls": total_calls, - "total_time_ms": total_time, - "total_rows": total_rows, - "time_range_minutes": time_range_minutes, - "start_time": start_time.isoformat(), - "end_time": end_time.isoformat(), - "limit": limit - } - } - - return self.format_report_data( - "K003", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_m001_mean_time_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate M001 Top Queries by mean execution time report. 
- - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by mean execution time - """ - logger.info("Generating M001 Top Queries by mean execution time report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("M001 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - - for db_name in databases: - logger.info(f"M001: Processing database {db_name} (hourly mode)...") - - # Get both time and calls metrics - time_per_query, time_other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, "pgwatch_pg_stat_statements_exec_time_total", hours=hours - ) - calls_per_query, calls_other, _ = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, "pgwatch_pg_stat_statements_calls", hours=hours - ) - - if not time_per_query and sum(time_other) == 0: - logger.warning(f"M001 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate mean time per query across all hours - query_means = [] - for queryid in time_per_query.keys(): - total_time = sum(time_per_query[queryid]) - total_calls = sum(calls_per_query.get(queryid, [0] * hours)) - - if total_calls > 0: - mean_time = total_time / total_calls - query_means.append({ - "queryid": queryid, - "mean_time_ms": mean_time, - "total_time_ms": total_time, - "total_calls": total_calls, - "hourly_time_ms": time_per_query[queryid], - "hourly_calls": calls_per_query.get(queryid, [0] * hours) - }) - - # Sort by mean_time (descending) and limit to top N - sorted_metrics = sorted(query_means, key=lambda x: 
x.get('mean_time_ms', 0), reverse=True)[:limit] - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_time_hourly": time_other, - "other_calls_hourly": calls_other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_time_tracked_queries_ms": sum(q.get('total_time_ms', 0) for q in sorted_metrics), - "total_calls_tracked_queries": sum(q.get('total_calls', 0) for q in sorted_metrics), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback to original logic for sub-hourly or when explicitly disabled - end_time = datetime.now() - start_time = end_time - timedelta(minutes=time_range_minutes) - - for db_name in databases: - logger.info(f"M001: Processing database {db_name}...") - # Get pg_stat_statements metrics for this database - query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) - - if not query_metrics: - logger.warning(f"M001 - No query metrics returned for database {db_name}") - - # Calculate mean execution time for each query - queries_with_mean = [] - for q in query_metrics: - calls = q.get('calls', 0) - total_time = q.get('total_time', 0) - if calls > 0: - mean_time = total_time / calls - q['mean_time'] = mean_time - queries_with_mean.append(q) - - # Sort by mean_time (descending) and limit to top N per database - sorted_metrics = sorted(queries_with_mean, key=lambda x: x.get('mean_time', 0), reverse=True)[:limit] - - # Calculate totals for the top queries in this database - total_calls = sum(q.get('calls', 0) for q in sorted_metrics) - total_time = sum(q.get('total_time', 0) for q in sorted_metrics) - total_rows = sum(q.get('rows', 0) for q in sorted_metrics) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "summary": { - "queries_returned": len(sorted_metrics), - "total_calls": total_calls, - "total_time_ms": total_time, - "total_rows": total_rows, - "time_range_minutes": time_range_minutes, - "start_time": 
start_time.isoformat(), - "end_time": end_time.isoformat(), - "limit": limit - } - } - - return self.format_report_data( - "M001", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_m002_rows_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate M002 Top Queries by rows (I/O intensity) report. - - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by rows processed - """ - logger.info("Generating M002 Top Queries by rows report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("M002 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - metric_name = "pgwatch_pg_stat_statements_rows" - - for db_name in databases: - logger.info(f"M002: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, metric_name, hours=hours - ) - - if not per_query and sum(other) == 0: - logger.warning(f"M002 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate total rows per query across all hours - query_totals = [] - for queryid, hourly_values in per_query.items(): - total_rows = sum(hourly_values) - query_totals.append({ - "queryid": queryid, - "total_rows": total_rows, - "hourly_rows": hourly_values - }) - - # Sort by total_rows (descending) and limit to top N - sorted_metrics = sorted(query_totals, 
key=lambda x: x.get('total_rows', 0), reverse=True)[:limit] - - # Calculate totals - total_rows = sum(q.get('total_rows', 0) for q in sorted_metrics) + sum(other) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_rows_hourly": other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_rows": total_rows, - "total_rows_tracked_queries": sum(q.get('total_rows', 0) for q in sorted_metrics), - "total_rows_other": sum(other), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback to original logic for sub-hourly or when explicitly disabled - end_time = datetime.now() - start_time = end_time - timedelta(minutes=time_range_minutes) - - for db_name in databases: - logger.info(f"M002: Processing database {db_name}...") - # Get pg_stat_statements metrics for this database - query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) - - if not query_metrics: - logger.warning(f"M002 - No query metrics returned for database {db_name}") - - # Sort by rows (descending) and limit to top N per database - sorted_metrics = sorted(query_metrics, key=lambda x: x.get('rows', 0), reverse=True)[:limit] - - # Calculate totals for the top queries in this database - total_calls = sum(q.get('calls', 0) for q in sorted_metrics) - total_time = sum(q.get('total_time', 0) for q in sorted_metrics) - total_rows = sum(q.get('rows', 0) for q in sorted_metrics) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "summary": { - "queries_returned": len(sorted_metrics), - "total_calls": total_calls, - "total_time_ms": total_time, - "total_rows": total_rows, - "time_range_minutes": time_range_minutes, - "start_time": start_time.isoformat(), - "end_time": end_time.isoformat(), - "limit": limit - } - } - - return self.format_report_data( - "M002", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def 
generate_m003_io_time_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate M003 Top Queries by I/O time report. - - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by total I/O time - """ - logger.info("Generating M003 Top Queries by I/O time report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("M003 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - - for db_name in databases: - logger.info(f"M003: Processing database {db_name} (hourly mode)...") - - # Get both read and write I/O time metrics - read_per_query, read_other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, "pgwatch_pg_stat_statements_block_read_total", hours=hours - ) - write_per_query, write_other, _ = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, "pgwatch_pg_stat_statements_block_write_total", hours=hours - ) - - if not read_per_query and not write_per_query and sum(read_other) == 0 and sum(write_other) == 0: - logger.warning(f"M003 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Combine read and write times, calculate total I/O time per query - all_queryids = set(read_per_query.keys()) | set(write_per_query.keys()) - query_io_totals = [] - - for queryid in all_queryids: - read_values = read_per_query.get(queryid, [0] * hours) - write_values = write_per_query.get(queryid, [0] * hours) - - # Combine read and write for each 
hour - hourly_io_time = [r + w for r, w in zip(read_values, write_values)] - total_io_time = sum(hourly_io_time) - - query_io_totals.append({ - "queryid": queryid, - "total_io_time_ms": total_io_time, - "total_read_time_ms": sum(read_values), - "total_write_time_ms": sum(write_values), - "hourly_io_time_ms": hourly_io_time, - "hourly_read_time_ms": read_values, - "hourly_write_time_ms": write_values - }) - - # Sort by total_io_time (descending) and limit to top N - sorted_metrics = sorted(query_io_totals, key=lambda x: x.get('total_io_time_ms', 0), reverse=True)[:limit] - - # Calculate other I/O time - other_io_time_hourly = [r + w for r, w in zip(read_other, write_other)] - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_io_time_hourly": other_io_time_hourly, - "other_read_time_hourly": read_other, - "other_write_time_hourly": write_other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_io_time_tracked_queries_ms": sum(q.get('total_io_time_ms', 0) for q in sorted_metrics), - "total_io_time_other_ms": sum(other_io_time_hourly), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback to original logic for sub-hourly or when explicitly disabled - end_time = datetime.now() - start_time = end_time - timedelta(minutes=time_range_minutes) - - for db_name in databases: - logger.info(f"M003: Processing database {db_name}...") - # Get pg_stat_statements metrics for this database - query_metrics = self._get_pgss_metrics_data_by_db(cluster, node_name, db_name, start_time, end_time) - - if not query_metrics: - logger.warning(f"M003 - No query metrics returned for database {db_name}") - - # Calculate total I/O time for each query - queries_with_io_time = [] - for q in query_metrics: - blk_read_time = q.get('blk_read_time', 0) - blk_write_time = q.get('blk_write_time', 0) - total_io_time = blk_read_time + blk_write_time - q['total_io_time'] = total_io_time - 
queries_with_io_time.append(q) - - # Sort by total_io_time (descending) and limit to top N per database - sorted_metrics = sorted(queries_with_io_time, key=lambda x: x.get('total_io_time', 0), reverse=True)[:limit] - - # Calculate totals for the top queries in this database - total_calls = sum(q.get('calls', 0) for q in sorted_metrics) - total_time = sum(q.get('total_time', 0) for q in sorted_metrics) - total_rows = sum(q.get('rows', 0) for q in sorted_metrics) - total_io_time = sum(q.get('total_io_time', 0) for q in sorted_metrics) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "summary": { - "queries_returned": len(sorted_metrics), - "total_calls": total_calls, - "total_time_ms": total_time, - "total_rows": total_rows, - "total_io_time_ms": total_io_time, - "time_range_minutes": time_range_minutes, - "start_time": start_time.isoformat(), - "end_time": end_time.isoformat(), - "limit": limit - } - } - - return self.format_report_data( - "M003", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k004_temp_bytes_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate K004 Top Queries by temp bytes written report. 
- - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by temp bytes written - """ - logger.info("Generating K004 Top Queries by temp bytes written report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K004 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - metric_name = "pgwatch_pg_stat_statements_temp_bytes_written" - - for db_name in databases: - logger.info(f"K004: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, metric_name, hours=hours - ) - - if not per_query and sum(other) == 0: - logger.warning(f"K004 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate total temp bytes per query across all hours - query_totals = [] - for queryid, hourly_values in per_query.items(): - total_bytes = sum(hourly_values) - query_totals.append({ - "queryid": queryid, - "total_temp_bytes": total_bytes, - "hourly_temp_bytes": hourly_values - }) - - # Sort by total_temp_bytes (descending) and limit to top N - sorted_metrics = sorted(query_totals, key=lambda x: x.get('total_temp_bytes', 0), reverse=True)[:limit] - - # Calculate totals - total_bytes = sum(q.get('total_temp_bytes', 0) for q in sorted_metrics) + sum(other) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_temp_bytes_hourly": other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_temp_bytes": total_bytes, - "total_temp_bytes_tracked_queries": 
sum(q.get('total_temp_bytes', 0) for q in sorted_metrics), - "total_temp_bytes_other": sum(other), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback for sub-hourly (not typically needed) - pass - - return self.format_report_data( - "K004", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k005_wal_bytes_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate K005 Top Queries by WAL generation report. - - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by WAL bytes generated - """ - logger.info("Generating K005 Top Queries by WAL generation report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K005 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - metric_name = "pgwatch_pg_stat_statements_wal_bytes" - - for db_name in databases: - logger.info(f"K005: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, metric_name, hours=hours - ) - - if not per_query and sum(other) == 0: - logger.warning(f"K005 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate total WAL bytes per query across all hours - query_totals = [] - for queryid, hourly_values in per_query.items(): - total_bytes = sum(hourly_values) - 
query_totals.append({ - "queryid": queryid, - "total_wal_bytes": total_bytes, - "hourly_wal_bytes": hourly_values - }) - - # Sort by total_wal_bytes (descending) and limit to top N - sorted_metrics = sorted(query_totals, key=lambda x: x.get('total_wal_bytes', 0), reverse=True)[:limit] - - # Calculate totals - total_bytes = sum(q.get('total_wal_bytes', 0) for q in sorted_metrics) + sum(other) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_wal_bytes_hourly": other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_wal_bytes": total_bytes, - "total_wal_bytes_tracked_queries": sum(q.get('total_wal_bytes', 0) for q in sorted_metrics), - "total_wal_bytes_other": sum(other), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback for sub-hourly (not typically needed) - pass - - return self.format_report_data( - "K005", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k006_shared_read_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate K006 Top Queries by shared blocks read report. 
- - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by shared blocks read - """ - logger.info("Generating K006 Top Queries by shared blocks read report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K006 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - metric_name = "pgwatch_pg_stat_statements_shared_bytes_read_total" - - for db_name in databases: - logger.info(f"K006: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, metric_name, hours=hours - ) - - if not per_query and sum(other) == 0: - logger.warning(f"K006 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate total shared read bytes per query across all hours - query_totals = [] - for queryid, hourly_values in per_query.items(): - total_bytes = sum(hourly_values) - query_totals.append({ - "queryid": queryid, - "total_shared_read_bytes": total_bytes, - "hourly_shared_read_bytes": hourly_values - }) - - # Sort by total_shared_read_bytes (descending) and limit to top N - sorted_metrics = sorted(query_totals, key=lambda x: x.get('total_shared_read_bytes', 0), reverse=True)[:limit] - - # Calculate totals - total_bytes = sum(q.get('total_shared_read_bytes', 0) for q in sorted_metrics) + sum(other) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_shared_read_bytes_hourly": other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_shared_read_bytes": total_bytes, - 
"total_shared_read_bytes_tracked_queries": sum(q.get('total_shared_read_bytes', 0) for q in sorted_metrics), - "total_shared_read_bytes_other": sum(other), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback for sub-hourly (not typically needed) - pass - - return self.format_report_data( - "K006", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k007_shared_hit_report(self, cluster: str = "local", node_name: str = "node-01", - time_range_minutes: int = 60, limit: int = 100, use_hourly: bool = True) -> Dict[str, Any]: - """ - Generate K007 Top Queries by shared blocks hit report. - - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by shared blocks hit - """ - logger.info("Generating K007 Top Queries by shared blocks hit report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K007 - No databases found") - - queries_by_db = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - metric_name = "pgwatch_pg_stat_statements_shared_bytes_hit_total" - - for db_name in databases: - logger.info(f"K007: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data( - cluster, node_name, db_name, metric_name, hours=hours - ) - - if not per_query and sum(other) == 0: - logger.warning(f"K007 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - # Calculate total shared hit bytes per query across all hours - query_totals = [] - for queryid, 
hourly_values in per_query.items(): - total_bytes = sum(hourly_values) - query_totals.append({ - "queryid": queryid, - "total_shared_hit_bytes": total_bytes, - "hourly_shared_hit_bytes": hourly_values - }) - - # Sort by total_shared_hit_bytes (descending) and limit to top N - sorted_metrics = sorted(query_totals, key=lambda x: x.get('total_shared_hit_bytes', 0), reverse=True)[:limit] - - # Calculate totals - total_bytes = sum(q.get('total_shared_hit_bytes', 0) for q in sorted_metrics) + sum(other) - - queries_by_db[db_name] = { - "top_queries": sorted_metrics, - "other_shared_hit_bytes_hourly": other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_shared_hit_bytes": total_bytes, - "total_shared_hit_bytes_tracked_queries": sum(q.get('total_shared_hit_bytes', 0) for q in sorted_metrics), - "total_shared_hit_bytes_other": sum(other), - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit - } - } - else: - # Fallback for sub-hourly (not typically needed) - pass - - return self.format_report_data( - "K007", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_k008_shared_hit_read_report( - self, - cluster: str = "local", - node_name: str = "node-01", - time_range_minutes: int = 60, - limit: int = 100, - use_hourly: bool = True, - ) -> Dict[str, Any]: - """ - Generate K008 Top Queries by shared blocks (hit + read) report. - - Notes: - - Our hourly topk utility (`_get_hourly_topk_pgss_data`) can only rank by a single metric. - Here we fetch hit and read separately and then combine them in Python (similar to K003). 
- - Args: - cluster: Cluster name - node_name: Node name - time_range_minutes: Time range in minutes for metrics collection (used when use_hourly=False) - limit: Number of top queries to return (default: 100) - use_hourly: Use hourly topk aggregation logic (default: True) - - Returns: - Dictionary containing top queries sorted by (shared hit + shared read) bytes - """ - logger.info("Generating K008 Top Queries by shared blocks (hit + read) report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("K008 - No databases found") - - queries_by_db: Dict[str, Any] = {} - - if use_hourly and time_range_minutes >= 60: - # Use hourly topk aggregation - hours = time_range_minutes // 60 - - hit_metric = "pgwatch_pg_stat_statements_shared_bytes_hit_total" - read_metric = "pgwatch_pg_stat_statements_shared_bytes_read_total" - - for db_name in databases: - logger.info(f"K008: Processing database {db_name} (hourly mode)...") - - per_query, other, timeline = self._get_hourly_topk_pgss_data_sum2( - cluster, - node_name, - db_name, - hit_metric, - read_metric, - hours=hours, - ) - - if not per_query and sum(other) == 0: - logger.warning(f"K008 - No query metrics returned for database {db_name}") - continue # Skip databases with no data - - query_totals = [] - for queryid, hourly_total_bytes in per_query.items(): - total_bytes = sum(hourly_total_bytes) - query_totals.append( - { - "queryid": queryid, - "total_shared_hit_read_bytes": total_bytes, - "hourly_shared_hit_read_bytes": hourly_total_bytes, - } - ) - - # Sort by total_shared_hit_read_bytes (descending) and limit to top N - sorted_metrics = sorted( - query_totals, key=lambda x: x.get("total_shared_hit_read_bytes", 0), reverse=True - )[:limit] - - tracked_total = sum(q.get("total_shared_hit_read_bytes", 0) for q in sorted_metrics) - other_total = sum(other) - total_bytes = tracked_total + other_total - - queries_by_db[db_name] = { - "top_queries": 
sorted_metrics, - "other_shared_hit_read_bytes_hourly": other, - "summary": { - "queries_returned": len(sorted_metrics), - "total_shared_hit_read_bytes": total_bytes, - "total_shared_hit_read_bytes_tracked_queries": tracked_total, - "total_shared_hit_read_bytes_other": other_total, - "time_range_hours": hours, - "hourly_timestamps": timeline, - "limit": limit, - }, - } - else: - # Fallback for sub-hourly (not typically needed) - pass - - return self.format_report_data( - "K008", - queries_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def generate_n001_wait_events_report(self, cluster: str = "local", node_name: str = "node-01", - hours: int = 24) -> Dict[str, Any]: - """ - Generate N001 Wait Events report with hourly breakdown grouped by wait_event_type and query_id. - - Args: - cluster: Cluster name - node_name: Node name - hours: Number of hours to analyze (default: 24) - - Returns: - Dictionary containing wait events grouped by type and query_id with hourly occurrences - """ - logger.info("Generating N001 Wait Events report...") - - # Get all databases - databases = self.get_all_databases(cluster, node_name) - - if not databases: - logger.warning("N001 - No databases found") - - # Build timeline - now = int(time.time()) - end_s = self._floor_hour(now) - start_s, timeline = self._build_timeline(end_s, hours, step_s=3600) - - wait_events_by_db = {} - - for db_name in databases: - logger.info(f"N001: Processing database {db_name}...") - - # Query wait events from Prometheus - # pgwatch_wait_events_total has labels: wait_event_type, wait_event, query_id, datname - filters = [ - f'cluster="{cluster}"', - f'node_name="{node_name}"', - f'datname="{db_name}"' - ] - filter_str = '{' + ','.join(filters) + '}' - - # Get wait events data over the time range with hourly step - metric_name = f'pgwatch_wait_events_total{filter_str}' - - try: - result = self.query_range(metric_name, datetime.fromtimestamp(start_s), - 
datetime.fromtimestamp(end_s), step="3600s") - - if not result: - logger.warning(f"N001 - No wait events data for database {db_name}") - continue - - # Build timestamp to hour index map - ts_to_hour = {ts: idx for idx, ts in enumerate(timeline)} - - # Process results to group by wait_event_type -> query_id with hourly breakdown - wait_events_grouped = {} - - for series in result: - metric = series.get('metric', {}) - wait_event_type = metric.get('wait_event_type', 'Unknown') - wait_event = metric.get('wait_event', 'Unknown') - query_id = metric.get('query_id', '0') - - # Get the values (timestamp, value pairs) - values = series.get('values', []) - - # Group by wait_event_type - if wait_event_type not in wait_events_grouped: - wait_events_grouped[wait_event_type] = { - 'queries': {}, - 'total_occurrences': 0, - 'unique_queries': 0 - } - - # Add query_id under this wait_event_type - if query_id not in wait_events_grouped[wait_event_type]['queries']: - wait_events_grouped[wait_event_type]['queries'][query_id] = { - 'occurrences': 0, - 'hourly_occurrences': [0] * hours, - 'wait_events': {} - } - - # Process hourly values - for timestamp, value in values: - try: - count = float(value) - if count == 0: - continue - - ts = int(timestamp) - if ts not in ts_to_hour: - continue - - hour_idx = ts_to_hour[ts] - - # Update hourly arrays for query only - wait_events_grouped[wait_event_type]['queries'][query_id]['hourly_occurrences'][hour_idx] += int(count) - - # Track individual wait events - if wait_event not in wait_events_grouped[wait_event_type]['queries'][query_id]['wait_events']: - wait_events_grouped[wait_event_type]['queries'][query_id]['wait_events'][wait_event] = { - 'occurrences': 0 - } - wait_events_grouped[wait_event_type]['queries'][query_id]['wait_events'][wait_event]['occurrences'] += int(count) - - except (ValueError, TypeError): - continue - - # Calculate totals - for wait_type in wait_events_grouped: - for query_id in wait_events_grouped[wait_type]['queries']: 
- query_data = wait_events_grouped[wait_type]['queries'][query_id] - query_data['occurrences'] = sum(query_data['hourly_occurrences']) - - # Calculate total_occurrences from all queries - wait_events_grouped[wait_type]['total_occurrences'] = sum( - q['occurrences'] for q in wait_events_grouped[wait_type]['queries'].values() - ) - - # Skip databases with no wait events data - if not wait_events_grouped: - logger.warning(f"N001 - No wait events data for database {db_name}") - continue - - # Count unique queries and convert to list - for wait_type in wait_events_grouped: - wait_events_grouped[wait_type]['unique_queries'] = len(wait_events_grouped[wait_type]['queries']) - - queries_list = [] - for query_id, data in wait_events_grouped[wait_type]['queries'].items(): - queries_list.append({ - 'query_id': query_id, - 'occurrences': data['occurrences'], - 'hourly_occurrences': data['hourly_occurrences'], - 'wait_events': data['wait_events'] - }) - # Sort by occurrences descending - queries_list.sort(key=lambda x: x['occurrences'], reverse=True) - wait_events_grouped[wait_type]['queries_list'] = queries_list - # Remove the dict version - del wait_events_grouped[wait_type]['queries'] - - wait_events_by_db[db_name] = { - 'wait_event_types': wait_events_grouped, - 'summary': { - 'time_range_hours': hours, - 'start_time': datetime.fromtimestamp(start_s).isoformat(), - 'end_time': datetime.fromtimestamp(end_s).isoformat(), - 'wait_event_types_count': len(wait_events_grouped), - 'total_occurrences': sum(wt['total_occurrences'] for wt in wait_events_grouped.values()), - 'hourly_timestamps': timeline - } - } - - except Exception as e: - logger.error(f"Error querying wait events for database {db_name}: {e}") - continue - - return self.format_report_data( - "N001", - wait_events_by_db, - node_name, - postgres_version=self._get_postgres_version_info(cluster, node_name), - ) - - def _get_pgss_metrics_data(self, cluster: str, node_name: str, start_time: datetime, end_time: datetime) -> 
List[ - Dict[str, Any]]: - """ - Get pg_stat_statements metrics data between two time points. - Adapted from the logic in monitoring_flask_backend/app.py get_pgss_metrics_csv(). - - Args: - cluster: Cluster name - node_name: Node name - start_time: Start datetime - end_time: End datetime - - Returns: - List of query metrics with calculated differences - """ - # Metric name mapping for cleaner output - METRIC_NAME_MAPPING = { - 'calls': 'calls', - 'exec_time_total': 'total_time', - 'rows': 'rows', - 'shared_bytes_hit_total': 'shared_blks_hit', - 'shared_bytes_read_total': 'shared_blks_read', - 'shared_bytes_dirtied_total': 'shared_blks_dirtied', - 'shared_bytes_written_total': 'shared_blks_written', - 'block_read_total': 'blk_read_time', - 'block_write_total': 'blk_write_time' - } - - # Build filters - filters = [f'cluster="{cluster}"', f'node_name="{node_name}"'] - filter_str = '{' + ','.join(filters) + '}' - - # Get all pg_stat_statements metrics - all_metrics = [ - 'pgwatch_pg_stat_statements_calls', - 'pgwatch_pg_stat_statements_exec_time_total', - 'pgwatch_pg_stat_statements_rows', - 'pgwatch_pg_stat_statements_shared_bytes_hit_total', - 'pgwatch_pg_stat_statements_shared_bytes_read_total', - 'pgwatch_pg_stat_statements_shared_bytes_dirtied_total', - 'pgwatch_pg_stat_statements_shared_bytes_written_total', - 'pgwatch_pg_stat_statements_block_read_total', - 'pgwatch_pg_stat_statements_block_write_total' - ] - - # Get metrics at start and end times - start_data = [] - end_data = [] - - for metric in all_metrics: - metric_with_filters = f'{metric}{filter_str}' - - try: - # Query metrics around start time - use instant queries at specific timestamps - start_result = self.query_range(metric_with_filters, start_time - timedelta(minutes=1), - start_time + timedelta(minutes=1)) - if start_result: - start_data.extend(start_result) - - # Query metrics around end time - end_result = self.query_range(metric_with_filters, end_time - timedelta(minutes=1), - end_time + 
timedelta(minutes=1)) - if end_result: - end_data.extend(end_result) - - except Exception as e: - logger.warning(f"Failed to query metric {metric}: {e}") - continue - - # Process the data to calculate differences - return self._process_pgss_data(start_data, end_data, start_time, end_time, METRIC_NAME_MAPPING) - - def query_range(self, query: str, start_time: datetime, end_time: datetime, step: str = "30s") -> List[ - Dict[str, Any]]: - """ - Execute a range PromQL query. - - Args: - query: PromQL query string - start_time: Start time - end_time: End time - step: Query step interval - - Returns: - List of query results - """ - params = { - 'query': query, - 'start': start_time.timestamp(), - 'end': end_time.timestamp(), - 'step': step - } - - try: - response = requests.get(f"{self.base_url}/query_range", params=params) - if response.status_code == 200: - result = response.json() - if result.get('status') == 'success': - return result.get('data', {}).get('result', []) - else: - logger.error(f"Range query failed with status {response.status_code}: {response.text}") - except Exception as e: - logger.error(f"Range query error: {e}") - - return [] - - def _process_pgss_data(self, start_data: List[Dict], end_data: List[Dict], - start_time: datetime, end_time: datetime, - metric_mapping: Dict[str, str]) -> List[Dict[str, Any]]: - """ - Process pg_stat_statements data and calculate differences between start and end times. - Adapted from the logic in monitoring_flask_backend/app.py process_pgss_data(). 
- """ - # Convert Prometheus data to dictionaries - start_metrics = self._prometheus_to_dict(start_data, start_time) - end_metrics = self._prometheus_to_dict(end_data, end_time) - - if not start_metrics and not end_metrics: - return [] - - # Create a combined dictionary with all unique query identifiers - all_keys = set() - all_keys.update(start_metrics.keys()) - all_keys.update(end_metrics.keys()) - - result_rows = [] - - # Calculate differences for each query - for key in all_keys: - start_metric = start_metrics.get(key, {}) - end_metric = end_metrics.get(key, {}) - - # Extract identifier components from key - db_name, query_id, user, instance = key - - # Calculate actual duration from metric timestamps - start_timestamp = start_metric.get('timestamp') - end_timestamp = end_metric.get('timestamp') - - if start_timestamp and end_timestamp: - start_dt = datetime.fromisoformat(start_timestamp) - end_dt = datetime.fromisoformat(end_timestamp) - actual_duration = (end_dt - start_dt).total_seconds() - else: - # Fallback to query parameter duration if timestamps are missing - actual_duration = (end_time - start_time).total_seconds() - - # Create result row - row = { - 'queryid': query_id, - 'database': db_name, - 'user': user, - 'duration_seconds': actual_duration - } - - # Numeric columns to calculate differences for (using original metric names) - numeric_cols = list(metric_mapping.keys()) - - # Calculate differences and rates - for col in numeric_cols: - start_val = start_metric.get(col, 0) - end_val = end_metric.get(col, 0) - diff = end_val - start_val - - # Use simplified display name - display_name = metric_mapping[col] - - # Convert bytes to blocks for block-related metrics (PostgreSQL uses 8KB blocks) - if 'blks' in display_name and 'bytes' in col: - diff = diff / 8192 # Convert bytes to 8KB blocks - - row[display_name] = diff - - # Calculate rates per second - if row['duration_seconds'] > 0: - row[f'{display_name}_per_sec'] = diff / row['duration_seconds'] - 
else: - row[f'{display_name}_per_sec'] = 0 - - # Calculate per-call averages - calls_diff = row.get('calls', 0) - if calls_diff > 0: - row[f'{display_name}_per_call'] = diff / calls_diff - else: - row[f'{display_name}_per_call'] = 0 - - result_rows.append(row) - - return result_rows - - def _prometheus_to_dict(self, prom_data: List[Dict], timestamp: datetime) -> Dict: - """ - Convert Prometheus API response to dictionary keyed by query identifiers. - Adapted from the logic in monitoring_flask_backend/app.py prometheus_to_dict(). - """ - if not prom_data: - return {} - - metrics_dict = {} - - for metric_data in prom_data: - metric = metric_data.get('metric', {}) - values = metric_data.get('values', []) - - if not values: - continue - - # Get the closest value to our timestamp - closest_value = min(values, key=lambda x: abs(float(x[0]) - timestamp.timestamp())) - - # Create unique key for this query - # Note: 'user' label may not exist in all metric configurations - key = ( - metric.get('datname', ''), - metric.get('queryid', ''), - metric.get('user', metric.get('tag_user', '')), # Fallback to tag_user or empty - metric.get('instance', '') - ) - - # Initialize metric dict if not exists - if key not in metrics_dict: - metrics_dict[key] = { - 'timestamp': datetime.fromtimestamp(float(closest_value[0])).isoformat(), - } - - # Add metric value - metric_name = metric.get('__name__', 'pgwatch_pg_stat_statements_calls') - clean_name = metric_name.replace('pgwatch_pg_stat_statements_', '') - - try: - metrics_dict[key][clean_name] = float(closest_value[1]) - except (ValueError, IndexError): - metrics_dict[key][clean_name] = 0 - - return metrics_dict - - def _floor_hour(self, ts: int) -> int: - """ - Floor timestamp to the nearest hour. 
- - Args: - ts: Unix timestamp in seconds - - Returns: - Floored timestamp - """ - return (ts // 3600) * 3600 - - def _build_timeline(self, end_s: int, hours: int = 24, step_s: int = 3600) -> Tuple[int, List[int]]: - """ - Build a timeline of hourly timestamps. - - Args: - end_s: End timestamp (floored to hour) - hours: Number of hours to cover (default: 24) - step_s: Step size in seconds (default: 3600 = 1 hour) - - Returns: - Tuple of (start_timestamp, list of timestamps) - """ - start_s = end_s - (hours - 1) * step_s - return start_s, [start_s + i * step_s for i in range(hours)] - - def _build_qid_regex(self, qids: List[str]) -> str: - """ - Build a PromQL regex pattern for queryid matching. - - Args: - qids: List of query IDs - - Returns: - PromQL regex pattern - """ - # queryid is integer-like (can be negative). DO NOT escape '-' for PromQL strings. - for q in qids: - if not re.fullmatch(r"-?\d+", q): - raise ValueError(f"Unexpected queryid: {q}") - return "^(?:" + "|".join(qids) + ")$" - - def _to_series_map(self, result: List[Dict]) -> Dict[str, Dict[int, float]]: - """ - Convert Prometheus query_range result to a map of series. - - Args: - result: Prometheus query_range result - - Returns: - Dict mapping queryid to dict of timestamp -> value - """ - out = {} - for s in result: - qid = (s.get("metric") or {}).get("queryid", "__single__") - pts = {int(ts): float(v) for ts, v in s.get("values", [])} - out[qid] = pts - return out - - def _densify(self, series_pts: Dict[str, Dict[int, float]], qids: List[str], - timeline: List[int], fill: float = 0.0) -> Dict[str, List[float]]: - """ - Densify sparse series data to have values for all timeline points. 
- - Args: - series_pts: Map of queryid to timestamp -> value - qids: List of query IDs to densify - timeline: List of timestamps - fill: Fill value for missing data points (default: 0.0) - - Returns: - Dict mapping queryid to list of values aligned to timeline - """ - return { - qid: [series_pts.get(qid, {}).get(ts, fill) for ts in timeline] - for qid in qids - } - - def _get_hourly_topk_pgss_data_multi( - self, - cluster: str, - node_name: str, - db_name: str, - metric_names: Sequence[str], - hours: int = 24, - step_s: int = 3600, - k: int = 3, - ) -> Tuple[Dict[str, List[float]], List[float], List[int]]: - """ - Generalization of `_get_hourly_topk_pgss_data` that ranks and returns per-hour series by the - sum of one or more pg_stat_statements Prometheus metrics. - """ - metric_names = [m for m in metric_names if m] - if not metric_names: - raise ValueError("metric_names must contain at least one metric name") - - now = int(time.time()) - end_s = self._floor_hour(now) - start_s, timeline = self._build_timeline(end_s, hours, step_s) - - filters = [f'cluster="{cluster}"', f'node_name="{node_name}"', f'datname="{db_name}"'] - filter_str = '{' + ','.join(filters) + '}' - step_str = f"{step_s}s" - - def _sum_by_qid_increase(metric: str, fstr: str) -> str: - return f"sum by (queryid) (increase({metric}{fstr}[1h]))" - - def _sum_increase(metric: str, fstr: str) -> str: - return f"sum(increase({metric}{fstr}[1h]))" - - # Find union of queryids that ever appear in hourly top-k by the sum of metrics. 
- topk_expr = " + ".join(_sum_by_qid_increase(m, filter_str) for m in metric_names) - q_topk = f"topk({k}, ({topk_expr}))" - topk_result = self.query_range( - q_topk, datetime.fromtimestamp(start_s), datetime.fromtimestamp(end_s), step=step_str - ) - union = sorted( - { - (s.get("metric") or {}).get("queryid") - for s in topk_result - if (s.get("metric") or {}).get("queryid") is not None - } - ) - - # Total per hour (for "other" calculation) - total_expr = " + ".join(_sum_increase(m, filter_str) for m in metric_names) - q_total = f"({total_expr})" - total_result = self.query_range( - q_total, datetime.fromtimestamp(start_s), datetime.fromtimestamp(end_s), step=step_str - ) - total_map = self._to_series_map(total_result).get("__single__", {}) - total = [total_map.get(ts, 0.0) for ts in timeline] - - if not union: - return {}, total[:], timeline - - # Hourly series for all union queryids (densified to N points each) - qid_re = self._build_qid_regex(union) - union_filters = filters + [f'queryid=~"{qid_re}"'] - union_filter_str = '{' + ','.join(union_filters) + '}' - union_expr = " + ".join(_sum_by_qid_increase(m, union_filter_str) for m in metric_names) - q_union = f"({union_expr})" - union_result = self.query_range( - q_union, datetime.fromtimestamp(start_s), datetime.fromtimestamp(end_s), step=step_str - ) - union_pts = self._to_series_map(union_result) - per_query = self._densify(union_pts, union, timeline, fill=0.0) - - # Calculate other = total - sum(union) - other: List[float] = [] - neg_examples: List[Tuple[int, float, float, float]] = [] - for i in range(hours): - union_sum = sum(per_query[qid][i] for qid in union) - o_raw = total[i] - union_sum - if o_raw < 0: - # Keep small float noise quiet, but surface meaningful negatives. 
- if o_raw < -1e-6 and len(neg_examples) < 5: - neg_examples.append((timeline[i], o_raw, total[i], union_sum)) - o_raw = 0.0 - other.append(o_raw) - - if neg_examples: - min_neg = min(v[1] for v in neg_examples) - logger.warning( - "Hourly topk: negative 'other' clamped to 0 " - f"(cluster={cluster}, node={node_name}, db={db_name}, metrics={list(metric_names)}, " - f"hours={hours}, step_s={step_s}, k={k}, min_other={min_neg:.6g}, " - f"examples={neg_examples})" - ) - - return per_query, other, timeline - - def _get_hourly_topk_pgss_data(self, cluster: str, node_name: str, db_name: str, - metric_name: str = "pgwatch_pg_stat_statements_calls", - hours: int = 24, step_s: int = 3600, - k: int = 3) -> Tuple[Dict[str, List[float]], List[float], List[int]]: - """ - Get hourly topk pg_stat_statements data for a specific database and metric. - - This method finds queries that appear in top-k for any hour within the time range, - then returns per-hour data for those queries plus an "other" category. 
- - Args: - cluster: Cluster name - node_name: Node name - db_name: Database name - metric_name: Prometheus metric name (default: pgwatch_pg_stat_statements_calls) - hours: Number of hours to look back (default: 24) - step_s: Step size in seconds (default: 3600 = 1 hour) - k: Number of top queries per hour (default: 3) - - Returns: - Tuple of (per_query_dict, other_list, timeline) - - per_query_dict: Dict mapping queryid to list of hourly values - - other_list: List of hourly values for queries not in top-k - - timeline: List of timestamps for the hourly data points - """ - return self._get_hourly_topk_pgss_data_multi( - cluster=cluster, - node_name=node_name, - db_name=db_name, - metric_names=[metric_name], - hours=hours, - step_s=step_s, - k=k, - ) - - def _get_hourly_topk_pgss_data_sum2( - self, - cluster: str, - node_name: str, - db_name: str, - metric_name_a: str, - metric_name_b: str, - hours: int = 24, - step_s: int = 3600, - k: int = 3, - ) -> Tuple[Dict[str, List[float]], List[float], List[int]]: - """ - Like `_get_hourly_topk_pgss_data`, but ranks by the sum of two metrics per queryid, per hour: - - sum by(queryid)(increase(A[1h])) + sum by(queryid)(increase(B[1h])) - - This avoids a correctness pitfall where "union(topk by A, topk by B)" can miss a query that is - not top-k in either A or B individually, but is top-k by (A+B). - """ - return self._get_hourly_topk_pgss_data_multi( - cluster=cluster, - node_name=node_name, - db_name=db_name, - metric_names=[metric_name_a, metric_name_b], - hours=hours, - step_s=step_s, - k=k, - ) - - def format_bytes(self, bytes_value: float) -> str: - """Format bytes value for human readable display.""" - if bytes_value == 0: - return "0 B" - - # Use IEC binary prefixes because we divide by 1024. 
- units = ['B', 'KiB', 'MiB', 'GiB', 'TiB'] - unit_index = 0 - value = float(bytes_value) - - while value >= 1024 and unit_index < len(units) - 1: - value /= 1024 - unit_index += 1 - - if value >= 100: - return f"{value:.0f} {units[unit_index]}" - elif value >= 10: - return f"{value:.1f} {units[unit_index]}" - else: - return f"{value:.2f} {units[unit_index]}" - - def format_epoch_timestamp(self, epoch_value: float) -> str | None: - """Format epoch seconds as a UTC timestamptz string (ISO-8601, like `timestamptz` in reports).""" - try: - v = float(epoch_value or 0) - except (TypeError, ValueError): - return None - - if v <= 0: - return None - - try: - return datetime.fromtimestamp(v, tz=timezone.utc).isoformat() - except (OverflowError, OSError, ValueError): - return None - - def format_report_data(self, check_id: str, data: Dict[str, Any], host: str = "target-database", - all_hosts: Dict[str, List[str]] = None, - postgres_version: Dict[str, str] = None) -> Dict[str, Any]: - """ - Format data to match template structure. 
- - Args: - check_id: The check identifier - data: The data to format (can be a dict with node keys if combining multiple nodes) - host: Primary host identifier (used if all_hosts not provided) - all_hosts: Optional dict with 'primary' and 'standbys' keys for multi-node reports - postgres_version: Optional Postgres version info to include at report level - - Returns: - Dictionary formatted for templates - """ - now = datetime.now(timezone.utc) - - # If all_hosts is provided, use it; otherwise use the single host as primary - if all_hosts: - hosts = all_hosts - else: - hosts = { - "primary": host, - "standbys": [], - } - - # Handle both single-node and multi-node data structures - if isinstance(data, dict) and any(isinstance(v, dict) and 'data' in v for v in data.values()): - # Multi-node structure: data is already in {node_name: {"data": ...}} format - # postgres_version should already be embedded per-node; warn if passed here - if postgres_version: - logger.warning(f"postgres_version parameter ignored for multi-node data in {check_id}") - results = data - else: - # Single-node structure: wrap data in host key - node_result = {"data": data} - if postgres_version: - node_result["postgres_version"] = postgres_version - results = {host: node_result} - - template_data = { - "version": self._build_metadata.get("version"), - "build_ts": self._build_metadata.get("build_ts"), - "generation_mode": "full", - "checkId": check_id, - "checkTitle": self.get_check_title(check_id), - "timestamptz": now.isoformat(), - "nodes": hosts, - "results": results - } - - return template_data - - def filter_a003_settings(self, a003_report: Dict[str, Any], setting_names: List[str]) -> Dict[str, Any]: - """ - Filter A003 settings data to include only specified settings. 
- - Args: - a003_report: Full A003 report containing all settings - setting_names: List of setting names to include - - Returns: - Filtered settings dictionary - """ - filtered = {} - # Handle both single-node and multi-node A003 report structures - results = a003_report.get('results', {}) - for node_name, node_data in results.items(): - data = node_data.get('data', {}) - for setting_name, setting_info in data.items(): - if setting_name in setting_names: - filtered[setting_name] = setting_info - return filtered - - def extract_postgres_version_from_a003(self, a003_report: Dict[str, Any], node_name: str = None) -> Dict[str, str]: - """ - Extract PostgreSQL version info from A003 report settings data. - - Derives version from server_version and server_version_num settings - which are part of the A003 settings data. - - Args: - a003_report: Full A003 report - node_name: Optional specific node name. If None, uses first available node. - - Returns: - Dictionary with postgres version info (version, server_version_num, server_major_ver, server_minor_ver) - """ - results = a003_report.get('results', {}) - if not results: - return {} - - # Get the node data - if node_name and node_name in results: - node_data = results[node_name] - else: - node_data = next(iter(results.values()), {}) - - # First check if postgres_version is already in the node result - if node_data.get('postgres_version'): - return node_data['postgres_version'] - - # Otherwise, extract from settings data (server_version, server_version_num) - data = node_data.get('data', {}) - version_str = None - version_num = None - - # Look for server_version and server_version_num in settings - if 'server_version' in data: - version_str = data['server_version'].get('setting', '') - if 'server_version_num' in data: - version_num = data['server_version_num'].get('setting', '') - - if not version_str and not version_num: - return {} - - # Parse version numbers - major_ver = "" - minor_ver = "" - if version_num and 
len(version_num) >= 6: - try: - num = int(version_num) - major_ver = str(num // 10000) - minor_ver = str(num % 10000) - except ValueError: - pass - - return { - "version": version_str or "", - "server_version_num": version_num or "", - "server_major_ver": major_ver, - "server_minor_ver": minor_ver - } - - def generate_d004_from_a003(self, a003_report: Dict[str, Any], cluster: str = "local", - node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate D004 report by filtering A003 data for pg_stat_statements settings. - - Args: - a003_report: Full A003 report containing all settings - cluster: Cluster name (for status checks) - node_name: Node name - - Returns: - D004 report dictionary - """ - print("Generating D004 from A003 data...") - - # Filter A003 settings for D004-relevant settings - pgstat_data = self.filter_a003_settings(a003_report, self.D004_SETTINGS) - - # Check extension status (still needs direct queries) - kcache_status = self._check_pg_stat_kcache_status(cluster, node_name) - pgss_status = self._check_pg_stat_statements_status(cluster, node_name) - - # Extract postgres version from A003 - postgres_version = self.extract_postgres_version_from_a003(a003_report, node_name) - - return self.format_report_data( - "D004", - { - "settings": pgstat_data, - "pg_stat_statements_status": pgss_status, - "pg_stat_kcache_status": kcache_status, - }, - node_name, - postgres_version=postgres_version, - ) - - def generate_f001_from_a003(self, a003_report: Dict[str, Any], node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate F001 report by filtering A003 data for autovacuum settings. 
- - Args: - a003_report: Full A003 report containing all settings - node_name: Node name - - Returns: - F001 report dictionary - """ - print("Generating F001 from A003 data...") - - # Filter A003 settings for F001-relevant settings - autovacuum_data = self.filter_a003_settings(a003_report, self.F001_SETTINGS) - - # Extract postgres version from A003 - postgres_version = self.extract_postgres_version_from_a003(a003_report, node_name) - - return self.format_report_data("F001", autovacuum_data, node_name, postgres_version=postgres_version) - - def generate_g001_from_a003(self, a003_report: Dict[str, Any], node_name: str = "node-01") -> Dict[str, Any]: - """ - Generate G001 report by filtering A003 data for memory settings. - - Args: - a003_report: Full A003 report containing all settings - node_name: Node name - - Returns: - G001 report dictionary with memory analysis - """ - print("Generating G001 from A003 data...") - - # Filter A003 settings for G001-relevant settings - memory_data = self.filter_a003_settings(a003_report, self.G001_SETTINGS) - - # Calculate memory analysis - memory_analysis = self._analyze_memory_settings(memory_data) - - # Extract postgres version from A003 - postgres_version = self.extract_postgres_version_from_a003(a003_report, node_name) - - return self.format_report_data( - "G001", - { - "settings": memory_data, - "analysis": memory_analysis, - }, - node_name, - postgres_version=postgres_version, - ) - - def get_check_title(self, check_id: str) -> str: - """ - Get the human-readable title for a check ID. 
- - Args: - check_id: The check identifier (e.g., "H004") - - Returns: - Human-readable title for the check - """ - # Mapping based on postgres-checkup README - # https://gitlab.com/postgres-ai/postgres-checkup - check_titles = { - "A001": "System information", - "A002": "Postgres major version", - "A003": "Postgres settings", - "A004": "Cluster information", - "A005": "Extensions", - "A006": "Postgres setting deviations", - "A007": "Altered settings", - "A008": "Disk usage and file system type", - "A010": "Data checksums, wal_log_hints", - "A011": "Connection pooling. pgbouncer", - "A012": "Anti-crash checks", - "A013": "Postgres minor version", - "B001": "SLO/SLA, RPO, RTO", - "B002": "File system, mount flags", - "B003": "Full backups / incremental", - "B004": "WAL archiving", - "B005": "Restore checks, monitoring, alerting", - "C001": "SLO/SLA", - "C002": "Sync/async, Streaming / wal transfer; logical decoding", - "C003": "SPOFs; standby with traffic", - "C004": "Failover", - "C005": "Switchover", - "C006": "Delayed replica", - "C007": "Replication slots. Lags. 
Standby feedbacks", - "D001": "Logging settings", - "D002": "Useful Linux tools", - "D003": "List of monitoring metrics", - "D004": "pg_stat_statements and pg_stat_kcache settings", - "D005": "track_io_timing, auto_explain", - "D006": "Recommended DBA toolsets", - "D007": "Postgres-specific tools for troubleshooting", - "E001": "WAL/checkpoint settings, IO", - "E002": "Checkpoints, bgwriter, IO", - "F001": "Autovacuum: current settings", - "F002": "Autovacuum: transaction ID wraparound check", - "F003": "Autovacuum: dead tuples", - "F004": "Autovacuum: heap bloat (estimated)", - "F005": "Autovacuum: index bloat (estimated)", - "F006": "Precise heap bloat analysis", - "F007": "Precise index bloat analysis", - "F008": "Autovacuum: resource usage", - "G001": "Memory-related settings", - "G002": "Connections and current activity", - "G003": "Timeouts, locks, deadlocks", - "G004": "Query planner", - "G005": "I/O settings", - "G006": "Default_statistics_target", - "H001": "Invalid indexes", - "H002": "Unused indexes", - "H003": "Non-indexed foreign keys", - "H004": "Redundant indexes", - "J001": "Capacity planning", - "K001": "Globally aggregated query metrics", - "K002": "Workload type", - "K003": "Top queries by total time (total_exec_time + total_plan_time)", - "K004": "Top queries by temp bytes written", - "K005": "Top queries by WAL generation", - "K006": "Top queries by shared blocks read", - "K007": "Top queries by shared blocks hit", - "K008": "Top queries by shared blocks hit+read", - "L001": "Table sizes", - "M001": "Top queries by mean execution time", - "M002": "Top queries by rows (I/O intensity)", - "M003": "Top queries by I/O time", - "N001": "Wait events grouped by type and query", - "L002": "Data types being used", - "L003": "Integer out-of-range risks in PKs", - "L004": "Tables without PK/UK", - } - return check_titles.get(check_id, f"Check {check_id}") - - def get_setting_unit(self, setting_name: str) -> str: - """Get the unit for a PostgreSQL 
setting.""" - units = { - 'max_connections': 'connections', - 'shared_buffers': '8kB', - 'effective_cache_size': '8kB', - 'work_mem': 'kB', - 'maintenance_work_mem': 'kB', - 'checkpoint_completion_target': '', - 'wal_buffers': '8kB', - 'default_statistics_target': '', - 'random_page_cost': '', - 'effective_io_concurrency': '', - 'autovacuum_max_workers': 'workers', - 'autovacuum_naptime': 's', - 'log_min_duration_statement': 'ms', - 'idle_in_transaction_session_timeout': 'ms', - 'lock_timeout': 'ms', - 'statement_timeout': 'ms', - } - return units.get(setting_name, '') - - def get_setting_category(self, setting_name: str) -> str: - """Get the category for a PostgreSQL setting.""" - categories = { - 'max_connections': 'Connections and Authentication', - 'shared_buffers': 'Memory', - 'effective_cache_size': 'Memory', - 'work_mem': 'Memory', - 'maintenance_work_mem': 'Memory', - 'checkpoint_completion_target': 'Write-Ahead Logging', - 'wal_buffers': 'Write-Ahead Logging', - 'default_statistics_target': 'Query Planning', - 'random_page_cost': 'Query Planning', - 'effective_io_concurrency': 'Asynchronous Behavior', - 'autovacuum_max_workers': 'Autovacuum', - 'autovacuum_naptime': 'Autovacuum', - 'log_min_duration_statement': 'Logging', - 'idle_in_transaction_session_timeout': 'Client Connection Defaults', - 'lock_timeout': 'Client Connection Defaults', - 'statement_timeout': 'Client Connection Defaults', - } - return categories.get(setting_name, 'Other') - - def format_setting_value(self, setting_name: str, value: str, unit: str = "") -> str: - """Format a setting value for display.""" - try: - # If we have a unit from the metric, use it - if unit: - if unit == "8kB": - val = int(value) * 8 - if val >= 1024 and val % 1024 == 0: - return f"{val // 1024} MiB" - else: - return f"{val} KiB" - elif unit == "ms": - val = int(value) - if val >= 1000 and val % 1000 == 0: - return f"{val // 1000} s" - else: - return f"{val} ms" - elif unit == "s": - return f"{value} s" - elif 
unit == "min": - return f"{value} min" - elif unit == "connections": - return f"{value} connections" - elif unit == "workers": - return f"{value} workers" - else: - return f"{value} {unit}" - - # Fallback to setting name based formatting - if setting_name in ['shared_buffers', 'effective_cache_size', 'work_mem', 'maintenance_work_mem', - 'autovacuum_work_mem', 'logical_decoding_work_mem', 'temp_buffers', 'wal_buffers']: - val = int(value) - if val >= 1024: - return f"{val // 1024} MiB" - else: - return f"{val} KiB" - elif setting_name in ['log_min_duration_statement', 'idle_in_transaction_session_timeout', 'lock_timeout', - 'statement_timeout', 'autovacuum_vacuum_cost_delay', 'vacuum_cost_delay']: - val = int(value) - if val >= 1000: - return f"{val // 1000} s" - else: - return f"{val} ms" - elif setting_name in ['autovacuum_naptime']: - val = int(value) - if val >= 60: - return f"{val // 60} min" - else: - return f"{val} s" - elif setting_name in ['autovacuum_max_workers']: - return f"{value} workers" - elif setting_name in ['pg_stat_statements.max']: - return f"{value} statements" - elif setting_name in ['max_wal_size', 'min_wal_size']: - val = int(value) - if val >= 1024: - return f"{val // 1024} GiB" - else: - return f"{val} MiB" - elif setting_name in ['checkpoint_completion_target']: - return f"{float(value):.2f}" - elif setting_name in ['hash_mem_multiplier']: - return f"{float(value):.1f}" - elif setting_name in ['max_connections', 'max_prepared_transactions', 'max_locks_per_transaction', - 'max_pred_locks_per_transaction', 'max_pred_locks_per_relation', - 'max_pred_locks_per_page', 'max_files_per_process']: - return f"{value} connections" if "connections" in setting_name else f"{value}" - elif setting_name in ['max_stack_depth']: - val = int(value) - if val >= 1024: - return f"{val // 1024} MiB" - else: - return f"{val} KiB" - elif setting_name in ['autovacuum_analyze_scale_factor', 'autovacuum_vacuum_scale_factor', - 
'autovacuum_vacuum_insert_scale_factor']: - return f"{float(value) * 100:.1f}%" - elif setting_name in ['autovacuum', 'track_activities', 'track_counts', 'track_functions', - 'track_io_timing', 'track_wal_io_timing', 'pg_stat_statements.track_utility', - 'pg_stat_statements.save', 'pg_stat_statements.track_planning']: - return "on" if value.lower() in ['on', 'true', '1'] else "off" - elif setting_name in ['huge_pages']: - return value # on/off/try - else: - return str(value) - except (ValueError, TypeError): - return str(value) - - def get_cluster_metric_unit(self, metric_name: str) -> str: - """Get the unit for a cluster metric.""" - units = { - 'active_connections': 'connections', - 'idle_connections': 'connections', - 'total_connections': 'connections', - 'database_size': 'bytes', - 'cache_hit_ratio': '%', - 'transactions_per_sec': 'tps', - 'checkpoints_per_sec': 'checkpoints/s', - 'deadlocks': 'count', - 'temp_files': 'files', - 'temp_bytes': 'bytes', - } - return units.get(metric_name, '') - - def get_cluster_metric_description(self, metric_name: str) -> str: - """Get the description for a cluster metric.""" - descriptions = { - 'active_connections': 'Number of active connections', - 'idle_connections': 'Number of idle connections', - 'total_connections': 'Total number of connections', - 'database_size': 'Total database size in bytes', - 'cache_hit_ratio': 'Cache hit ratio percentage', - 'transactions_per_sec': 'Transactions per second', - 'checkpoints_per_sec': 'Checkpoints per second', - 'deadlocks': 'Number of deadlocks', - 'temp_files': 'Number of temporary files', - 'temp_bytes': 'Size of temporary files in bytes', - } - return descriptions.get(metric_name, '') - - def generate_all_reports(self, cluster: str = "local", node_name: str = None, combine_nodes: bool = True) -> Dict[str, Any]: - """ - Generate all reports. 
- - Args: - cluster: Cluster name - node_name: Node name (if None and combine_nodes=True, will query all nodes) - combine_nodes: If True, combine primary and replica reports into single report - - Returns: - Dictionary containing all reports - """ - reports = {} - - # Determine which nodes to process - if combine_nodes and node_name is None: - # Get all nodes and combine them - all_nodes = self.get_all_nodes(cluster) - nodes_to_process = [] - if all_nodes["primary"]: - nodes_to_process.append(all_nodes["primary"]) - nodes_to_process.extend(all_nodes["standbys"]) - - # If no nodes found, fall back to default - if not nodes_to_process: - logger.warning(f"No nodes found in cluster '{cluster}', using default 'node-01'") - nodes_to_process = ["node-01"] - all_nodes = {"primary": "node-01", "standbys": []} - else: - logger.info(f"Combining reports from nodes: {nodes_to_process}") - else: - # Use single node (backward compatibility) - if node_name is None: - node_name = "node-01" - nodes_to_process = [node_name] - all_nodes = {"primary": node_name, "standbys": []} - - # Reports that don't depend on A003 (generate first) - independent_report_types = [ - ('A002', self.generate_a002_version_report), - ('A003', self.generate_a003_settings_report), - ('A004', self.generate_a004_cluster_report), - ('A007', self.generate_a007_altered_settings_report), - ('F004', self.generate_f004_heap_bloat_report), - ('F005', self.generate_f005_btree_bloat_report), - ('H001', self.generate_h001_invalid_indexes_report), - ('H002', self.generate_h002_unused_indexes_report), - ('H004', self.generate_h004_redundant_indexes_report), - ('K001', self.generate_k001_query_calls_report), - ('K003', self.generate_k003_top_queries_report), - ('K004', self.generate_k004_temp_bytes_report), - ('K005', self.generate_k005_wal_bytes_report), - ('K006', self.generate_k006_shared_read_report), - ('K007', self.generate_k007_shared_hit_report), - ('K008', self.generate_k008_shared_hit_read_report), - ('M001', 
self.generate_m001_mean_time_report), - ('M002', self.generate_m002_rows_report), - ('M003', self.generate_m003_io_time_report), - ('N001', self.generate_n001_wait_events_report), - ] - - for check_id, report_func in independent_report_types: - # Determine if this report needs hourly parameters - pgss_hourly_reports = ['K001', 'K003', 'K004', 'K005', 'K006', 'K007', 'K008', 'M001', 'M002', 'M003'] - wait_events_reports = ['N001'] - report_kwargs = {} - if check_id in pgss_hourly_reports: - report_kwargs['time_range_minutes'] = 1440 # 24 hours - elif check_id in wait_events_reports: - report_kwargs['hours'] = 24 # 24 hours - - if len(nodes_to_process) == 1: - # Single node - generate report normally - reports[check_id] = report_func(cluster, nodes_to_process[0], **report_kwargs) - else: - # Multiple nodes - combine reports - combined_results = {} - for node in nodes_to_process: - logger.info(f"Generating {check_id} report for node {node}...") - node_report = report_func(cluster, node, **report_kwargs) - # Extract the data from the node report - if 'results' in node_report and node in node_report['results']: - combined_results[node] = node_report['results'][node] - - # Free node report memory immediately - del node_report - - # Create combined report with all nodes - reports[check_id] = self.format_report_data( - check_id, - combined_results, - all_nodes["primary"] if all_nodes["primary"] else nodes_to_process[0], - all_nodes - ) - - # Free combined results after creating report - del combined_results - - # Periodic garbage collection during report generation - if len(reports) % 5 == 0: - gc.collect() - - # Generate D004, F001, G001 from A003 data (if A003 was generated successfully) - a003_report = reports.get('A003') - if a003_report: - # Reports derived from A003 - a003_derived_reports = [ - ('D004', lambda c, n: self.generate_d004_from_a003(a003_report, c, n)), - ('F001', lambda c, n: self.generate_f001_from_a003(a003_report, n)), - ('G001', lambda c, n: 
self.generate_g001_from_a003(a003_report, n)), - ] - - for check_id, report_func in a003_derived_reports: - if len(nodes_to_process) == 1: - reports[check_id] = report_func(cluster, nodes_to_process[0]) - else: - # For multi-node, use the first node as reference - # (A003 data already contains all nodes) - combined_results = {} - for node in nodes_to_process: - print(f"Generating {check_id} report for node {node} from A003...") - node_report = report_func(cluster, node) - if 'results' in node_report and node in node_report['results']: - combined_results[node] = node_report['results'][node] - - reports[check_id] = self.format_report_data( - check_id, - combined_results, - all_nodes["primary"] if all_nodes["primary"] else nodes_to_process[0], - all_nodes - ) - else: - # Fallback to direct generation if A003 failed - print("Warning: A003 report not available, generating D004/F001/G001 directly") - fallback_report_types = [ - ('D004', self.generate_d004_pgstat_settings_report), - ('F001', self.generate_f001_autovacuum_settings_report), - ('G001', self.generate_g001_memory_settings_report), - ] - for check_id, report_func in fallback_report_types: - if len(nodes_to_process) == 1: - reports[check_id] = report_func(cluster, nodes_to_process[0]) - else: - combined_results = {} - for node in nodes_to_process: - print(f"Generating {check_id} report for node {node}...") - node_report = report_func(cluster, node) - if 'results' in node_report and node in node_report['results']: - combined_results[node] = node_report['results'][node] - - reports[check_id] = self.format_report_data( - check_id, - combined_results, - all_nodes["primary"] if all_nodes["primary"] else nodes_to_process[0], - all_nodes - ) - - return reports - - def generate_queries_json(self, query_text_limit: int = 1000) -> Dict[str, List[str]]: - """ - DEPRECATED: This method is no longer used. - Query information is now only included in individual query_{queryid}.json files. 
- - Generate JSON with queryid lists per database. - - Args: - query_text_limit: Not used anymore, kept for backward compatibility - - Returns: - Dictionary with database names as keys, containing lists of queryids - """ - logger.warning("DEPRECATED: generate_queries_json is no longer used") - queries_with_text = self.get_queryid_queries_from_sink(query_text_limit) - - # Convert from {db: {queryid: text}} to {db: [queryid, ...]} - queries_only = {} - for db_name, queries in queries_with_text.items(): - queries_only[db_name] = list(queries.keys()) - - return queries_only - - def extract_queryids_from_reports(self, reports: Dict[str, Any]) -> Dict[str, set]: - """ - Extract all unique queryids from the hourly reports (K001-K007, M001-M003, N001). - - Args: - reports: Dictionary of generated reports keyed by check_id - - Returns: - Dictionary mapping database names to sets of queryids - """ - queryids_by_db: Dict[str, set] = {} - - def extract_from_query_metrics( - container: Dict, - target_key: str = 'query_metrics', - id_field: str = 'queryid' - ): - """ - Helper to extract queryids from a container that may have nested structure. - - Notes: - - Different reports use different list keys for per-query items: - - K001 uses 'query_metrics' - - K003-K007 and M001-M003 use 'top_queries' - - We only keep queryids when we can associate them with a db_name, because - per-query file generation later needs (cluster, node, db, queryid) to query Prometheus. 
- """ - if not isinstance(container, dict): - return - - # Direct: container has query_metrics - if target_key in container: - for query in container.get(target_key, []): - qid = query.get(id_field) - if qid and str(qid) != '0': - # Try to find db_name from context or use a placeholder - yield str(qid), None - - # Check for 'data' wrapper: container -> data -> db_name -> query_metrics - if 'data' in container and isinstance(container['data'], dict): - for db_name, db_data in container['data'].items(): - if isinstance(db_data, dict) and target_key in db_data: - for query in db_data.get(target_key, []): - qid = query.get(id_field) - if qid and str(qid) != '0': - yield str(qid), db_name - - # Direct db_name -> query_metrics (no data wrapper) - for key, value in container.items(): - if key == 'data': - continue - if isinstance(value, dict) and target_key in value: - for query in value.get(target_key, []): - qid = query.get(id_field) - if qid and str(qid) != '0': - yield str(qid), key - - # Reports with queryid field in query_metrics list - pgss_reports = ['K001', 'K003', 'K004', 'K005', 'K006', 'K007', 'K008', 'M001', 'M002', 'M003'] - - for report_id in pgss_reports: - if report_id not in reports: - continue - - report = reports[report_id] - results = report.get('results', {}) - - # Handle multi-node structure: results -> node_name -> data -> db_name -> query_metrics - for node_key, node_data in results.items(): - if isinstance(node_data, dict): - # K001 uses 'query_metrics', while most other hourly/topk reports use 'top_queries'. 
- for list_key in ('query_metrics', 'top_queries'): - for queryid, db_name in extract_from_query_metrics(node_data, target_key=list_key): - if db_name: - if db_name not in queryids_by_db: - queryids_by_db[db_name] = set() - queryids_by_db[db_name].add(queryid) - - # N001 Wait Events report - has query_id in queries_list under wait_event_types - if 'N001' in reports: - report = reports['N001'] - results = report.get('results', {}) - - for node_key, node_data in results.items(): - if not isinstance(node_data, dict): - continue - - # Check for 'data' wrapper - data_container = node_data.get('data', node_data) - - for db_name, db_data in data_container.items(): - if not isinstance(db_data, dict): - continue - - wait_types = db_data.get('wait_event_types', {}) - if not wait_types: - continue - - if db_name not in queryids_by_db: - queryids_by_db[db_name] = set() - - for wait_type, wait_data in wait_types.items(): - for query in wait_data.get('queries_list', []): - query_id = query.get('query_id') - if query_id and str(query_id) != '0': - queryids_by_db[db_name].add(str(query_id)) - - # Log summary - total_queryids = sum(len(qids) for qids in queryids_by_db.values()) - logger.info(f"Extracted {total_queryids} unique queryids from hourly reports across {len(queryids_by_db)} database(s)") - - return queryids_by_db - - def get_query_metrics_from_prometheus(self, cluster: str, node_name: str, db_name: str, - queryid: str, hours: int = 24) -> Dict[str, Any]: - """ - Get all pg_stat_statements metrics for a specific query directly from Prometheus. - Fetches daily totals for all metrics shown on Dashboard 3 (Single queryid analysis). 
- - Args: - cluster: Cluster name - node_name: Node name - db_name: Database name - queryid: Query ID - hours: Number of hours to aggregate (default: 24 for daily totals) - - Returns: - Dictionary of metrics with daily totals - """ - metrics = {} - - # Build filters for this specific query - filters = [ - f'cluster="{cluster}"', - f'node_name="{node_name}"', - f'datname="{db_name}"', - f'queryid="{queryid}"' - ] - filter_str = '{' + ','.join(filters) + '}' - - # Time range - calculate exact 24h window - now = int(time.time()) - end_s = self._floor_hour(now) - start_s = end_s - (hours * 3600) # Exact hours back from end - - # All pg_stat_statements metrics to fetch (matching Dashboard 3) - pgss_metrics = { - 'calls': 'pgwatch_pg_stat_statements_calls', - 'exec_time_ms': 'pgwatch_pg_stat_statements_exec_time_total', - 'plan_time_ms': 'pgwatch_pg_stat_statements_plan_time_total', - 'rows': 'pgwatch_pg_stat_statements_rows', - 'shared_blks_hit_bytes': 'pgwatch_pg_stat_statements_shared_bytes_hit_total', - 'shared_blks_read_bytes': 'pgwatch_pg_stat_statements_shared_bytes_read_total', - 'shared_blks_dirtied_bytes': 'pgwatch_pg_stat_statements_shared_bytes_dirtied_total', - 'shared_blks_written_bytes': 'pgwatch_pg_stat_statements_shared_bytes_written_total', - 'wal_bytes': 'pgwatch_pg_stat_statements_wal_bytes', - 'wal_fpi': 'pgwatch_pg_stat_statements_wal_fpi', - 'wal_records': 'pgwatch_pg_stat_statements_wal_records', - 'temp_bytes_read': 'pgwatch_pg_stat_statements_temp_bytes_read', - 'temp_bytes_written': 'pgwatch_pg_stat_statements_temp_bytes_written', - 'blk_read_time_ms': 'pgwatch_pg_stat_statements_block_read_total', - 'blk_write_time_ms': 'pgwatch_pg_stat_statements_block_write_total', - 'jit_generation_time_ms': 'pgwatch_pg_stat_statements_jit_generation_time', - 'jit_inlining_time_ms': 'pgwatch_pg_stat_statements_jit_inlining_time', - 'jit_optimization_time_ms': 'pgwatch_pg_stat_statements_jit_optimization_time', - 'jit_emission_time_ms': 
'pgwatch_pg_stat_statements_jit_emission_time', - } - - # Fetch each metric - for metric_key, metric_name in pgss_metrics.items(): - try: - # Query for total increase over the time range - query = f'sum(increase({metric_name}{filter_str}[{hours}h]))' - result = self.query_instant(query) - - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - value = float(item['value'][1]) if item.get('value') else 0 - # Only include metrics that have non-zero values - if value > 0: - metrics[metric_key] = value - break - except Exception: - # Silently skip metrics that fail (some may not exist for older PG versions) - pass - - # Add time range info - metrics['time_range'] = { - 'hours': hours, - 'start_time': datetime.fromtimestamp(start_s).isoformat(), - 'end_time': datetime.fromtimestamp(end_s).isoformat() - } - - return metrics - - def generate_per_query_jsons(self, reports: Dict[str, Any], cluster: str, - node_name: str = None, - # 640 KB should be enough for anybody - query_text_limit: int = 655360, - hours: int = 24, - write_immediately: bool = False, - include_cluster_prefix: bool = True, - api_url: str = None, - token: str = None, - report_id: str = None) -> List[Dict[str, Any]]: - """ - Generate individual JSON files for each query mentioned in hourly reports. - Fetches all metrics directly from Prometheus (matching Dashboard 3). - - Args: - reports: Dictionary of generated reports keyed by check_id - cluster: Cluster name - node_name: Node name (optional, will use primary if not specified) - query_text_limit: Maximum number of characters for each query text - hours: Number of hours for metric aggregation (default: 24) - write_immediately: If True, write files immediately to reduce memory usage - include_cluster_prefix: If True, prefix per-query filenames with "_". 
- api_url: API URL for uploads (only used if write_immediately is True) - token: API token for uploads (only used if write_immediately is True) - report_id: Report ID for uploads (only used if write_immediately is True) - - Returns: - List of dictionaries with 'filename' (and optionally 'data' if not written immediately) - """ - logger.info("Generating per-query JSON files...") - - # Extract all queryids from reports - queryids_by_db = self.extract_queryids_from_reports(reports) - - if not queryids_by_db: - logger.warning("No queryids found in hourly reports") - return [] - - # Determine which nodes to include (match generate_all_reports logic) - if node_name is None: - nodes = self.get_all_nodes(cluster) - nodes_to_process: List[str] = [] - if nodes.get("primary"): - nodes_to_process.append(nodes["primary"]) - nodes_to_process.extend(nodes.get("standbys", [])) - - # If no nodes found, fall back to default - if not nodes_to_process: - logger.warning(f"No nodes found in cluster '{cluster}', using default 'node-01'") - nodes_to_process = ["node-01"] - nodes = {"primary": "node-01", "standbys": []} - else: - # Single node (backward compatibility) - nodes_to_process = [node_name] - nodes = {"primary": node_name, "standbys": []} - - # Get query texts from sink - only for databases found in reports (memory optimization) - db_names_list = list(queryids_by_db.keys()) - logger.info(f"Fetching query texts for {len(db_names_list)} database(s): {db_names_list}") - query_texts = self.get_queryid_queries_from_sink(query_text_limit, db_names=db_names_list) - - query_files = [] - # Invert {db: set(queryid)} -> {queryid: set(db)} - dbs_by_queryid: Dict[str, set] = {} - for db_name, queryids in queryids_by_db.items(): - for qid in queryids: - if not qid: - continue - dbs_by_queryid.setdefault(qid, set()).add(db_name) - - total_queries = len(dbs_by_queryid) - processed = 0 - - # Process deterministically (helps debugging) - for queryid in sorted(dbs_by_queryid.keys()): - processed += 
1 - dbs_for_query = sorted(list(dbs_by_queryid[queryid])) - logger.info(f"Processing query {processed}/{total_queries}: {queryid[:20]}... (dbs={len(dbs_for_query)}, nodes={len(nodes_to_process)})") - - # Query text is expected to be identical across DBs; pick first non-empty. - query_text = None - for db_name in dbs_for_query: - qt = (query_texts.get(db_name, {}) or {}).get(queryid) - if qt: - query_text = qt - break - - # Build results: results[node_name][db_name] = {"metrics": {...}} - results_by_node: Dict[str, Dict[str, Any]] = {} - time_range = None - - for n in nodes_to_process: - node_block: Dict[str, Any] = {} - for db_name in dbs_for_query: - metrics = self.get_query_metrics_from_prometheus( - cluster, n, db_name, queryid, hours=hours - ) - # Pull out time_range once and keep per-db metrics clean - if time_range is None and isinstance(metrics, dict): - time_range = metrics.pop("time_range", None) - elif isinstance(metrics, dict): - metrics.pop("time_range", None) - - node_block[db_name] = {"metrics": metrics} - results_by_node[n] = node_block - - # Create filename (match per-check report prefix logic) - # - Single-cluster: query_.json - # - Multi-cluster: _query_.json - filename = f"{cluster}_query_{queryid}.json" if include_cluster_prefix else f"query_{queryid}.json" - - # Build the final JSON object (keep timestamptz as the last field) - now = datetime.now(timezone.utc).isoformat() - query_data = { - "cluster_id": cluster, - "query_id": queryid, - "query_text": query_text, - "nodes": nodes, - "results": results_by_node, - } - if time_range: - query_data["time_range"] = time_range - query_data["timestamptz"] = now - - if write_immediately: - # Write to disk immediately to reduce memory usage - with open(filename, "w") as f: - json.dump(query_data, f, indent=2) - logger.info(f"Generated query file: {filename}") - - # Upload if API credentials provided - if api_url and token and report_id: - self.upload_report_file(api_url, token, report_id, filename) - - # 
Only store filename, not data - query_files.append({"filename": filename}) - - # Free memory immediately after writing - del query_data - else: - # Store in memory (legacy behavior) - query_files.append({ - "filename": filename, - "data": query_data - }) - - # Free memory periodically to reduce peak usage - if processed % 10 == 0: - gc.collect() - - # Final cleanup - del query_texts - gc.collect() - - logger.info(f"Generated {len(query_files)} per-query JSON files") - return query_files - - def get_all_clusters(self) -> List[str]: - """ - Get all unique cluster names (projects) from the metrics. - - Returns: - List of cluster names - """ - # Query for all clusters using last_over_time to get recent values - clusters_query = 'last_over_time(pgwatch_settings_configured[3h])' - result = self.query_instant(clusters_query) - - cluster_set = set() - - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - cluster_name = item['metric'].get('cluster', '') - if cluster_name: - cluster_set.add(cluster_name) - else: - # Debug output - logger.info(f"Debug - get_all_clusters query status: {result.get('status')}") - logger.info(f"Debug - get_all_clusters result count: {len(result.get('data', {}).get('result', []))}") - - if cluster_set: - logger.info(f"Found {len(cluster_set)} cluster(s): {sorted(list(cluster_set))}") - - return sorted(list(cluster_set)) - - def get_all_nodes(self, cluster: str = "local") -> Dict[str, List[str]]: - """ - Get all nodes (primary and replicas) from the metrics. - Uses pgwatch_db_stats_in_recovery_int to determine primary vs standby. 
- - Args: - cluster: Cluster name - - Returns: - Dictionary with 'primary' and 'standbys' keys containing node names - """ - # Query for all nodes in the cluster using last_over_time - nodes_query = f'last_over_time(pgwatch_settings_configured{{cluster="{cluster}"}}[3h])' - result = self.query_instant(nodes_query) - - nodes = {"primary": None, "standbys": []} - node_set = set() - - if result.get('status') == 'success' and result.get('data', {}).get('result'): - for item in result['data']['result']: - node_name = item['metric'].get('node_name', '') - if node_name and node_name not in node_set: - node_set.add(node_name) - - # Convert to sorted list - node_list = sorted(list(node_set)) - - if node_list: - logger.info(f"Found {len(node_list)} node(s) in cluster '{cluster}': {node_list}") - else: - logger.warning(f"No nodes found in cluster '{cluster}'") - - # Use pgwatch_db_stats_in_recovery_int to determine primary vs standby - # in_recovery = 0 means primary, in_recovery = 1 means standby - for node_name in node_list: - recovery_query = f'last_over_time(pgwatch_db_stats_in_recovery_int{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - recovery_result = self.query_instant(recovery_query) - - is_standby = False - if recovery_result.get('status') == 'success' and recovery_result.get('data', {}).get('result'): - if recovery_result['data']['result']: - in_recovery_value = float(recovery_result['data']['result'][0]['value'][1]) - is_standby = (in_recovery_value > 0) - logger.info(f"Node '{node_name}': in_recovery={int(in_recovery_value)} ({'standby' if is_standby else 'primary'})") - - if is_standby: - nodes["standbys"].append(node_name) - else: - # First non-standby node becomes primary - if nodes["primary"] is None: - nodes["primary"] = node_name - else: - # If we have multiple primaries (shouldn't happen), treat as replicas - logger.warning(f"Multiple primary nodes detected, treating '{node_name}' as replica") - nodes["standbys"].append(node_name) - - 
logger.info(f"Result: primary={nodes['primary']}, replicas={nodes['standbys']}") - return nodes - - def get_all_databases(self, cluster: str = "local", node_name: str = "node-01") -> List[str]: - """ - Get all databases from the metrics. - - Args: - cluster: Cluster name - node_name: Node name - - Returns: - List of database names - """ - # Build a source-agnostic database list by unifying labels from: - # 1) Generic per-database metric (wraparound) → datname - # 2) Custom index reports (unused/redundant) → dbname - # 3) Btree bloat (for completeness) → datname - databases: List[str] = [] - database_set = set() - - # Helper to add a name safely - def add_db(name: str) -> None: - if name and name not in self.excluded_databases and name not in database_set: - database_set.add(name) - databases.append(name) - - # 1) Generic per-database metric - wrap_q = f'last_over_time(pgwatch_pg_database_wraparound_age_datfrozenxid{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - wrap_res = self.query_instant(wrap_q) - if wrap_res.get('status') == 'success' and wrap_res.get('data', {}).get('result'): - for item in wrap_res['data']['result']: - add_db(item["metric"].get("datname", "")) - - # 2) Custom reports - unused indexes now uses datname, redundant still uses dbname - unused_q = f'last_over_time(pgwatch_unused_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - unused_res = self.query_instant(unused_q) - if unused_res.get('status') == 'success' and unused_res.get('data', {}).get('result'): - for item in unused_res['data']['result']: - add_db(item["metric"].get("datname", "")) - - redun_q = f'last_over_time(pgwatch_redundant_indexes_index_size_bytes{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - redun_res = self.query_instant(redun_q) - if redun_res.get('status') == 'success' and redun_res.get('data', {}).get('result'): - for item in redun_res['data']['result']: - add_db(item["metric"].get("dbname", "")) - - # 3) Btree bloat family - 
bloat_q = f'last_over_time(pgwatch_pg_btree_bloat_bloat_pct{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - bloat_res = self.query_instant(bloat_q) - if bloat_res.get('status') == 'success' and bloat_res.get('data', {}).get('result'): - for item in bloat_res['data']['result']: - add_db(item["metric"].get("datname", "")) - - # 4) pg_stat_statements metrics (calls) - pgss_q = f'last_over_time(pgwatch_pg_stat_statements_calls{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - pgss_res = self.query_instant(pgss_q) - if pgss_res.get('status') == 'success' and pgss_res.get('data', {}).get('result'): - for item in pgss_res['data']['result']: - add_db(item["metric"].get("datname", "")) - - # 5) Wait events - wait_q = f'last_over_time(pgwatch_wait_events_total{{cluster="{cluster}", node_name="{node_name}"}}[3h])' - wait_res = self.query_instant(wait_q) - if wait_res.get('status') == 'success' and wait_res.get('data', {}).get('result'): - for item in wait_res['data']['result']: - add_db(item["metric"].get("datname", "")) - - return databases - - def _get_pgss_metrics_data_by_db(self, cluster: str, node_name: str, db_name: str, start_time: datetime, - end_time: datetime) -> List[Dict[str, Any]]: - """ - Get pg_stat_statements metrics data for a specific database between two time points. 
- - Args: - cluster: Cluster name - node_name: Node name - db_name: Database name - start_time: Start datetime - end_time: End datetime - - Returns: - List of query metrics with calculated differences for the specific database - """ - # Metric name mapping for cleaner output - METRIC_NAME_MAPPING = { - 'calls': 'calls', - 'exec_time_total': 'total_time', - 'rows': 'rows', - 'shared_bytes_hit_total': 'shared_blks_hit', - 'shared_bytes_read_total': 'shared_blks_read', - 'shared_bytes_dirtied_total': 'shared_blks_dirtied', - 'shared_bytes_written_total': 'shared_blks_written', - 'block_read_total': 'blk_read_time', - 'block_write_total': 'blk_write_time' - } - - # Build filters including database - filters = [f'cluster="{cluster}"', f'node_name="{node_name}"', f'datname="{db_name}"'] - filter_str = '{' + ','.join(filters) + '}' - - # Get all pg_stat_statements metrics - all_metrics = [ - 'pgwatch_pg_stat_statements_calls', - 'pgwatch_pg_stat_statements_exec_time_total', - 'pgwatch_pg_stat_statements_rows', - 'pgwatch_pg_stat_statements_shared_bytes_hit_total', - 'pgwatch_pg_stat_statements_shared_bytes_read_total', - 'pgwatch_pg_stat_statements_shared_bytes_dirtied_total', - 'pgwatch_pg_stat_statements_shared_bytes_written_total', - 'pgwatch_pg_stat_statements_block_read_total', - 'pgwatch_pg_stat_statements_block_write_total' - ] - - # Get metrics at start and end times - start_data = [] - end_data = [] - - metrics_found = 0 - - for metric in all_metrics: - metric_with_filters = f'{metric}{filter_str}' - - try: - # Query metrics around start time - use instant queries at specific timestamps - start_result = self.query_range(metric_with_filters, start_time - timedelta(minutes=1), - start_time + timedelta(minutes=1)) - if start_result: - start_data.extend(start_result) - metrics_found += 1 - - # Query metrics around end time - end_result = self.query_range(metric_with_filters, end_time - timedelta(minutes=1), - end_time + timedelta(minutes=1)) - if end_result: - 
end_data.extend(end_result) - - except Exception as e: - logger.warning(f"Failed to query metric {metric} for database {db_name}: {e}") - continue - - if metrics_found == 0: - logger.warning(f"No pg_stat_statements metrics found for database {db_name}") - logger.info(f"Checked time range: {start_time.isoformat()} to {end_time.isoformat()}") - - # Process the data to calculate differences - result = self._process_pgss_data(start_data, end_data, start_time, end_time, METRIC_NAME_MAPPING) - - if not result: - logger.warning(f"_process_pgss_data returned empty result for database {db_name}") - - return result - - def create_report(self, api_url, token, project_name, epoch): - """ - Create a new report in the API. - - Args: - api_url: API URL - token: API token - project_name: Project name (cluster identifier) - epoch: Epoch identifier - - Returns: - Report ID or None if creation fails - """ - request_data = { - "access_token": token, - "project": project_name, - "epoch": epoch, - } - - try: - response = make_request(api_url, "/rpc/checkup_report_create", request_data) - report_id = response.get("report_id") - if not report_id: - message = response.get("message", "Cannot create report.") - logger.warning(f"{message}") - return None - - logger.info(f"Created report ID: {report_id}") - return int(report_id) - except requests.exceptions.HTTPError as e: - status = e.response.status_code if hasattr(e, 'response') else 'unknown' - if status == 404: - logger.warning("API endpoint not available (404). Reports will be saved locally only.") - elif status == 400: - logger.info(f"Request data: {len(json.dumps(request_data))} chars") - logger.warning("API rejected request (400 Bad Request). 
Reports will be saved locally only.") - logger.warning("This may indicate authentication issues or API format changes.") - else: - logger.error(f"Failed to create report (HTTP {status}): {e}") - return None - except Exception as e: - logger.error(f"Failed to create report: {e}") - return None - - def upload_report_file(self, api_url, token, report_id, path): - """ - Upload a report file to the API. - - Note: The API endpoint may not be available in all deployments. - Use --no-upload flag to skip API uploads. - """ - file_type = os.path.splitext(path)[1].lower().lstrip(".") - file_name = os.path.basename(path) - - data = Path(path).read_text(encoding="utf-8") - - # Prefer extracting check_id from JSON payload (filenames vary: A002.json, cluster_A002.json, etc.) - # Per-query JSON files intentionally do not have checkId (see reporter/schemas/query.schema.json). - check_id = "" - generate_issue = False - if file_type == "json": - try: - payload = json.loads(data) - if isinstance(payload, dict): - maybe = payload.get("checkId") - if isinstance(maybe, str) and maybe: - check_id = maybe - generate_issue = True - except Exception: - logger.warning(f"Upload: failed to parse JSON file '{file_name}', uploading without check_id") - # Keep check_id empty / generate_issue False to avoid mislabeling. - pass - - request_data = { - "access_token": token, - "checkup_report_id": report_id, - "check_id": check_id, - "filename": file_name, - "data": data, - "type": file_type, - "generate_issue": generate_issue - } - - try: - # Try the primary endpoint - response = make_request(api_url, "/rpc/checkup_report_file_post", request_data) - if "message" in response: - raise Exception(response["message"]) - logger.info(f"Uploaded: {file_name}") - except requests.exceptions.HTTPError as e: - status = e.response.status_code if hasattr(e, 'response') else 'unknown' - if status == 404: - logger.warning(f"Upload endpoint not available (404). 
File saved locally: {path}") - elif status == 400: - logger.warning(f"Upload rejected by API (400 Bad Request). File saved locally: {path}") - logger.warning("This may indicate the API endpoint format has changed or authentication issue.") - else: - logger.error(f"Upload failed for {file_name} (HTTP {status}). File saved locally: {path}") - logger.info("Use --no-upload flag to skip API uploads and suppress these warnings.") - except Exception as e: - logger.error(f"Upload failed for {file_name}: {e}") - logger.info(f"File saved locally: {path}") - - -def make_request(api_url, endpoint, request_data): - response = requests.post(api_url + endpoint, json=request_data) - response.raise_for_status() - return response.json() - - -def main(): - parser = argparse.ArgumentParser(description='Generate PostgreSQL reports using PromQL') - parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}') - parser.add_argument('--prometheus-url', default='http://sink-prometheus:9090', - help='Prometheus URL (default: http://sink-prometheus:9090)') - parser.add_argument('--postgres-sink-url', default='postgresql://pgwatch@sink-postgres:5432/measurements', - help='Postgres sink connection string (default: postgresql://pgwatch@sink-postgres:5432/measurements)') - parser.add_argument('--cluster', default=None, - help='Cluster name (default: auto-detect all clusters)') - parser.add_argument('--node-name', default=None, - help='Node name (default: auto-detect all nodes when combine-nodes is true)') - parser.add_argument('--no-combine-nodes', action='store_true', default=False, - help='Disable combining primary and replica reports into single report') - parser.add_argument('--check-id', - choices=['A002', 'A003', 'A004', 'A007', 'D004', 'F001', 'F004', 'F005', 'G001', 'H001', 'H002', - 'H004', 'K001', 'K003', 'K004', 'K005', 'K006', 'K007', 'K008', 'M001', 'M002', 'M003', 'N001', 'ALL'], - help='Specific check ID to generate (default: ALL)') - 
parser.add_argument('--output', default='-', - help='Output file (default: stdout)') - parser.add_argument('--api-url', default='https://postgres.ai/api/general') - parser.add_argument('--token', default='') - parser.add_argument('--project-name', default='project-name', - help='Project name for API upload (default: project-name)') - parser.add_argument('--epoch', default='1') - parser.add_argument('--no-upload', action='store_true', default=False, - help='Do not upload reports to the API') - parser.add_argument('--exclude-databases', type=str, default=None, - help='Comma-separated list of additional databases to exclude from reports ' - f'(default exclusions: {", ".join(sorted(PostgresReportGenerator.DEFAULT_EXCLUDED_DATABASES))})') - - args = parser.parse_args() - - # Parse excluded databases - excluded_databases = None - if args.exclude_databases: - excluded_databases = [db.strip() for db in args.exclude_databases.split(',')] - - generator = PostgresReportGenerator(args.prometheus_url, args.postgres_sink_url, excluded_databases) - - # Test connection - if not generator.test_connection(): - logger.error("Cannot connect to Prometheus. 
Make sure it's running and accessible.") - sys.exit(1) - - try: - # Discover all clusters if not specified - clusters_to_process = [] - if args.cluster: - clusters_to_process = [args.cluster] - else: - clusters_to_process = generator.get_all_clusters() - if not clusters_to_process: - logger.warning("No clusters found, using default 'local'") - clusters_to_process = ['local'] - else: - logger.info(f"Discovered clusters: {clusters_to_process}") - - # Process each cluster - for cluster in clusters_to_process: - logger.info("=" * 60) - logger.info(f"Processing cluster: {cluster}") - logger.info("=" * 60) - - # Set default node_name if not provided and not combining nodes - combine_nodes = not args.no_combine_nodes - if args.node_name is None and not combine_nodes: - args.node_name = "node-01" - - if args.check_id == 'ALL' or args.check_id is None: - # Generate all reports for this cluster - report_id = None - if not args.no_upload: - # Use cluster name as project name if not specified - project_name = args.project_name if args.project_name != 'project-name' else cluster - report_id = generator.create_report(args.api_url, args.token, project_name, args.epoch) - # If report creation failed, disable uploads for this cluster - if report_id is None: - logger.info(f"Skipping API uploads for cluster {cluster}") - - reports = generator.generate_all_reports(cluster, args.node_name, combine_nodes) - - # Generate per-query JSON files BEFORE deleting reports (needs queryids from reports) - # Use write_immediately=True to avoid accumulating all data in memory - logger.info("Generating per-query JSON files (streaming mode to reduce memory usage)...") - query_files = generator.generate_per_query_jsons( - reports, cluster, node_name=args.node_name, - # 640 KB should be enough for anybody - query_text_limit=66560, hours=24, - write_immediately=True, - include_cluster_prefix=(len(clusters_to_process) > 1), - api_url=args.api_url if (not args.no_upload and report_id) else None, - 
token=args.token if (not args.no_upload and report_id) else None, - report_id=report_id if (not args.no_upload and report_id) else None - ) - - # Clean up query files list - del query_files - gc.collect() - - # Save reports with cluster name prefix - for report_key in list(reports.keys()): # Use list() to avoid dict modification during iteration - output_filename = f"{cluster}_{report_key}.json" if len(clusters_to_process) > 1 else f"{report_key}.json" - with open(output_filename, "w") as f: - json.dump(reports[report_key], f, indent=2) - logger.info(f"Generated report: {output_filename}") - if not args.no_upload and report_id: - generator.upload_report_file(args.api_url, args.token, report_id, output_filename) - - # Free memory immediately after writing each report - del reports[report_key] - if len(reports) > 0 and len(reports) % 5 == 0: - gc.collect() - - # Free memory after writing all reports to disk - del reports - gc.collect() - else: - # Generate specific report - use node_name or default - if args.node_name is None: - args.node_name = "node-01" - - # For D004, F001, G001 - generate A003 first and derive from it - a003_report = None - if args.check_id in ('D004', 'F001', 'G001'): - print(f"Generating A003 first for {args.check_id}...") - a003_report = generator.generate_a003_settings_report(cluster, args.node_name) - - if args.check_id == 'A002': - report = generator.generate_a002_version_report(cluster, args.node_name) - elif args.check_id == 'A003': - report = generator.generate_a003_settings_report(cluster, args.node_name) - elif args.check_id == 'A004': - report = generator.generate_a004_cluster_report(cluster, args.node_name) - elif args.check_id == 'A007': - report = generator.generate_a007_altered_settings_report(cluster, args.node_name) - elif args.check_id == 'D004': - if a003_report: - report = generator.generate_d004_from_a003(a003_report, cluster, args.node_name) - else: - report = generator.generate_d004_pgstat_settings_report(cluster, 
args.node_name) - elif args.check_id == 'F001': - if a003_report: - report = generator.generate_f001_from_a003(a003_report, args.node_name) - else: - report = generator.generate_f001_autovacuum_settings_report(cluster, args.node_name) - elif args.check_id == 'F004': - report = generator.generate_f004_heap_bloat_report(cluster, args.node_name) - elif args.check_id == 'F005': - report = generator.generate_f005_btree_bloat_report(cluster, args.node_name) - elif args.check_id == 'G001': - if a003_report: - report = generator.generate_g001_from_a003(a003_report, args.node_name) - else: - report = generator.generate_g001_memory_settings_report(cluster, args.node_name) - elif args.check_id == 'H001': - report = generator.generate_h001_invalid_indexes_report(cluster, args.node_name) - elif args.check_id == 'H002': - report = generator.generate_h002_unused_indexes_report(cluster, args.node_name) - elif args.check_id == 'H004': - report = generator.generate_h004_redundant_indexes_report(cluster, args.node_name) - elif args.check_id == 'K001': - report = generator.generate_k001_query_calls_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'K003': - report = generator.generate_k003_top_queries_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'K004': - report = generator.generate_k004_temp_bytes_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'K005': - report = generator.generate_k005_wal_bytes_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'K006': - report = generator.generate_k006_shared_read_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'K007': - report = generator.generate_k007_shared_hit_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'K008': - report = generator.generate_k008_shared_hit_read_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 
'M001': - report = generator.generate_m001_mean_time_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'M002': - report = generator.generate_m002_rows_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'M003': - report = generator.generate_m003_io_time_report(cluster, args.node_name, time_range_minutes=1440) - elif args.check_id == 'N001': - report = generator.generate_n001_wait_events_report(cluster, args.node_name, hours=24) - - # Determine output filename - base_name = f"{cluster}_{args.check_id}" if len(clusters_to_process) > 1 else args.check_id - output_filename = f"{base_name}.json" if args.output == '-' else args.output - - # Output JSON report - if args.output == '-' and len(clusters_to_process) == 1: - # Report payload to stdout must remain raw JSON (not prefixed with log metadata). - sys.stdout.write(json.dumps(report, indent=2) + "\n") - else: - with open(output_filename, 'w') as f: - json.dump(report, f, indent=2) - logger.info(f"Report written to {output_filename}") - if not args.no_upload: - project_name = args.project_name if args.project_name != 'project-name' else cluster - report_id = generator.create_report(args.api_url, args.token, project_name, args.epoch) - if report_id: - generator.upload_report_file(args.api_url, args.token, report_id, output_filename) - - # Free memory after processing each cluster - logger.info(f"Freeing memory after processing cluster {cluster}...") - - # Close and reconnect postgres to free any accumulated memory - if generator.pg_conn: - logger.info("Reconnecting to Postgres sink to free memory...") - generator.close_postgres_sink() - # Connection will be recreated on next use - - gc.collect() - - except Exception as e: - logger.error(f"Error generating reports: {e}") - raise e - sys.exit(1) - finally: - # Clean up postgres connection - generator.close_postgres_sink() - - -if __name__ == "__main__": - main() diff --git a/reporter/report_schemas.py 
b/reporter/report_schemas.py deleted file mode 100644 index cc9832d1..00000000 --- a/reporter/report_schemas.py +++ /dev/null @@ -1,50 +0,0 @@ -from __future__ import annotations - -import json -from pathlib import Path -from typing import Any - -from jsonschema import Draft202012Validator - - -def schema_dir() -> Path: - return Path(__file__).resolve().parent / "schemas" - - -def schema_path_for_check_id(check_id: str) -> Path: - return schema_dir() / f"{check_id}.schema.json" - - -def load_schema(check_id: str) -> dict[str, Any]: - path = schema_path_for_check_id(check_id) - with path.open("r", encoding="utf-8") as f: - return json.load(f) - - -def validate_report(report: dict[str, Any]) -> None: - check_id = report.get("checkId") - if not isinstance(check_id, str) or not check_id: - raise ValueError("Report must have non-empty string 'checkId'") - - schema = load_schema(check_id) - Draft202012Validator(schema).validate(report) - - -def query_schema_path() -> Path: - return schema_dir() / "query.schema.json" - - -def load_query_schema() -> dict[str, Any]: - path = query_schema_path() - with path.open("r", encoding="utf-8") as f: - return json.load(f) - - -def validate_query_file(payload: dict[str, Any]) -> None: - """ - Validate per-query JSON files produced by PostgresReportGenerator.generate_per_query_jsons(). 
- """ - schema = load_query_schema() - Draft202012Validator(schema).validate(payload) - - diff --git a/reporter/requirements-dev.txt b/reporter/requirements-dev.txt deleted file mode 100644 index 47eebdd0..00000000 --- a/reporter/requirements-dev.txt +++ /dev/null @@ -1,7 +0,0 @@ --r requirements.txt -pytest==9.0.1 -pytest-postgresql==7.0.2 -coverage==7.6.10 -pytest-cov==6.0.0 -jsonschema==4.23.0 -PyYAML==6.0.2 diff --git a/reporter/requirements.txt b/reporter/requirements.txt deleted file mode 100644 index 92caa34f..00000000 --- a/reporter/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -requests==2.32.5 -psycopg2-binary==2.9.11 -jsonschema==4.23.0 diff --git a/tests/reporter/__init__.py b/tests/reporter/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/reporter/conftest.py b/tests/reporter/conftest.py deleted file mode 100644 index 36ffba48..00000000 --- a/tests/reporter/conftest.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import Callable, Dict, List, Optional, Tuple, Union - -import pytest - - -def pytest_addoption(parser: pytest.Parser) -> None: - """Add a flag for enabling integration tests that require services.""" - parser.addoption( - "--run-integration", - action="store_true", - default=False, - help="Run tests marked as integration/requires_postgres.", - ) - - -def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None: - """Skip integration tests unless --run-integration is given.""" - if config.getoption("--run-integration"): - return - - skip_marker = pytest.mark.skip(reason="integration tests require --run-integration") - for item in items: - if "integration" in item.keywords or "requires_postgres" in item.keywords: - item.add_marker(skip_marker) - - -@pytest.fixture(name="prom_result") -def fixture_prom_result() -> Callable[[Optional[List[Dict]], str], Dict]: - """Build a Prometheus-like payload for the happy-path tests.""" - - def _builder(rows: Optional[List[Dict]] = None, status: str 
= "success") -> Dict: - return { - "status": status, - "data": { - "result": rows or [], - }, - } - - return _builder - - -@pytest.fixture(name="series_sample") -def fixture_series_sample() -> Callable[[str, Optional[Dict], Optional[List[Tuple[Union[float, int], Union[float, int, str]]]]], Dict]: - """Create metric entries (metric metadata + values array) for query_range tests.""" - - def _builder( - metric_name: str, - labels: Optional[Dict] = None, - values: Optional[List[Tuple[Union[float, int], Union[float, int, str]]]] = None, - ) -> Dict: - labels = labels or {} - values = values or [] - return { - "metric": {"__name__": metric_name, **labels}, - "values": [[ts, str(val)] for ts, val in values], - } - - return _builder diff --git a/tests/reporter/test_extract_queryids_from_reports_unit.py b/tests/reporter/test_extract_queryids_from_reports_unit.py deleted file mode 100644 index ca6ddb02..00000000 --- a/tests/reporter/test_extract_queryids_from_reports_unit.py +++ /dev/null @@ -1,134 +0,0 @@ -from __future__ import annotations - -import pytest - -from reporter.postgres_reports import PostgresReportGenerator - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -@pytest.mark.unit -def test_extract_queryids_from_reports_includes_query_metrics_and_top_queries( - generator: PostgresReportGenerator, -) -> None: - reports = { - # K001-style report: query_metrics - "K001": { - "results": { - "node-1": { - "data": { - "db1": { - "query_metrics": [ - {"queryid": "1"}, - {"queryid": "0"}, # excluded - {"queryid": 2}, # int form - ] - } - } - } - } - }, - # K003-style report: top_queries - "K003": { - "results": { - "node-1": { - "data": { - "db1": { - "top_queries": [ - {"queryid": "3"}, - {"queryid": "-4"}, - ] - }, - "db2": { - "top_queries": [ - {"queryid": "5"}, - ] - }, - } - } - } - }, - # D004 has sample_queries but should 
NOT be used for per-query file generation. - "D004": { - "results": { - "node-1": { - "data": { - "pg_stat_statements_status": { - "sample_queries": [ - {"queryid": "999"}, - ] - } - } - } - } - }, - } - - out = generator.extract_queryids_from_reports(reports) - - assert out["db1"] == {"1", "2", "3", "-4"} - assert out["db2"] == {"5"} - assert "999" not in (out["db1"] | out["db2"]) - - -@pytest.mark.unit -def test_extract_queryids_from_reports_n001_includes_nonzero_query_id_only( - generator: PostgresReportGenerator, -) -> None: - reports = { - "N001": { - "results": { - "node-1": { - "data": { - "db1": { - "wait_event_types": { - "CPU*": { - "queries_list": [ - {"query_id": "0"}, # excluded - {"query_id": "10"}, - ] - } - } - } - } - } - } - } - } - - out = generator.extract_queryids_from_reports(reports) - - assert out == {"db1": {"10"}} - - -@pytest.mark.unit -def test_extract_queryids_from_reports_d004_only_is_empty( - generator: PostgresReportGenerator, -) -> None: - reports = { - "D004": { - "results": { - "node-1": { - "data": { - "pg_stat_statements_status": { - "sample_queries": [ - {"queryid": "-1100697950502680692"}, - {"queryid": "-115926913472768758"}, - ] - } - } - } - } - } - } - - out = generator.extract_queryids_from_reports(reports) - assert out == {} - - diff --git a/tests/reporter/test_formatters.py b/tests/reporter/test_formatters.py deleted file mode 100644 index f78e35ae..00000000 --- a/tests/reporter/test_formatters.py +++ /dev/null @@ -1,97 +0,0 @@ -import pytest - -from reporter.postgres_reports import PostgresReportGenerator - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator(prometheus_url="http://test", postgres_sink_url="") - - -@pytest.mark.unit -@pytest.mark.parametrize( - "value,expected", - [ - (0, "0 B"), - (1, "1.00 B"), - (1024, "1.00 KiB"), - (10 * 1024, "10.0 KiB"), - (1048576, "1.00 MiB"), - (5 * 1024 ** 3, "5.00 GiB"), - ], -) -def 
test_format_bytes(generator: PostgresReportGenerator, value: int, expected: str) -> None: - assert generator.format_bytes(value) == expected - - -@pytest.mark.unit -@pytest.mark.parametrize( - "name,value,unit,expected", - [ - ("shared_buffers", "128", "8kB", "1 MiB"), - ("work_mem", "512", "", "512 KiB"), - ("log_min_duration_statement", "2000", "ms", "2 s"), - ("log_min_duration_statement", "500", "ms", "500 ms"), - ("autovacuum_naptime", "120", "", "2 min"), - ("autovacuum", "on", "", "on"), - ("autovacuum", "OFF", "", "off"), - ], -) -def test_format_setting_value( - generator: PostgresReportGenerator, - name: str, - value: str, - unit: str, - expected: str, -) -> None: - assert generator.format_setting_value(name, value, unit) == expected - - -@pytest.mark.unit -def test_get_cluster_metric_metadata(generator: PostgresReportGenerator) -> None: - assert generator.get_cluster_metric_unit("active_connections") == "connections" - assert generator.get_cluster_metric_description( - "active_connections" - ).startswith("Number of active") - assert generator.get_cluster_metric_unit("unknown") == "" - - -@pytest.mark.unit -def test_get_setting_unit_and_category(generator: PostgresReportGenerator) -> None: - assert generator.get_setting_unit("shared_buffers") == "8kB" - assert generator.get_setting_category("shared_buffers") == "Memory" - assert generator.get_setting_unit("nonexistent") == "" - assert generator.get_setting_category("nonexistent") == "Other" - - -@pytest.mark.unit -def test_format_report_data_structure(generator: PostgresReportGenerator) -> None: - host = "db-1" - payload = generator.format_report_data("A002", {"foo": "bar"}, host) - - assert payload["version"] is None - assert payload["build_ts"] is None - assert payload["checkId"] == "A002" - # Newer reporter returns a 'nodes' structure instead of legacy 'hosts'. 
- assert payload["nodes"]["primary"] == host - assert payload["results"][host]["data"] == {"foo": "bar"} - - -@pytest.mark.unit -def test_format_report_data_includes_build_metadata_from_files( - monkeypatch: pytest.MonkeyPatch, - tmp_path, -) -> None: - version_file = tmp_path / "VERSION" - build_ts_file = tmp_path / "BUILD_TS" - version_file.write_text("0.0.0-test\n", encoding="utf-8") - build_ts_file.write_text("2025-12-17 00:00:00 UTC\n", encoding="utf-8") - - monkeypatch.setenv("PGAI_VERSION_FILE", str(version_file)) - monkeypatch.setenv("PGAI_BUILD_TS_FILE", str(build_ts_file)) - - generator = PostgresReportGenerator(prometheus_url="http://test", postgres_sink_url="") - payload = generator.format_report_data("A002", {"foo": "bar"}, "db-1") - - assert payload["version"] == "0.0.0-test" - assert payload["build_ts"] == "2025-12-17 00:00:00 UTC" diff --git a/tests/reporter/test_generators_hourly_unit.py b/tests/reporter/test_generators_hourly_unit.py deleted file mode 100644 index ede79d1c..00000000 --- a/tests/reporter/test_generators_hourly_unit.py +++ /dev/null @@ -1,288 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pytest - -from reporter.postgres_reports import PostgresReportGenerator - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -@pytest.fixture(name="fixed_pg_version") -def fixture_fixed_pg_version() -> dict[str, str]: - return { - "version": "15.3", - "server_version_num": "150003", - "server_major_ver": "15", - "server_minor_ver": "3", - } - - -def _stub_hourly_topk_single_metric( - metric_name_to_data: dict[str, tuple[dict[str, list[float]], list[float], list[int]]] -): - def _stub( - cluster: str, - node_name: str, - db_name: str, - metric_name: str = "pgwatch_pg_stat_statements_calls", - hours: int = 24, - step_s: int = 3600, - k: int = 3, - ): - _ = (cluster, node_name, 
db_name, hours, step_s, k) - if metric_name not in metric_name_to_data: - raise AssertionError(f"Unexpected metric_name: {metric_name}") - return metric_name_to_data[metric_name] - - return _stub - - -@pytest.mark.unit -def test_generate_k004_computes_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - per_query = {"1": [1.0, 2.0], "2": [0.0, 4.0]} - other = [10.0, 0.0] - timeline = [100, 200] - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric( - {"pgwatch_pg_stat_statements_temp_bytes_written": (per_query, other, timeline)} - ), - ) - - report = generator.generate_k004_temp_bytes_report("local", "node-1", time_range_minutes=120, limit=50) - db = report["results"]["node-1"]["data"]["db1"] - - assert db["summary"]["total_temp_bytes_other"] == pytest.approx(sum(other)) - assert db["summary"]["total_temp_bytes_tracked_queries"] == pytest.approx(3.0 + 4.0) - assert db["summary"]["total_temp_bytes"] == pytest.approx((3.0 + 4.0) + sum(other)) - - -@pytest.mark.unit -def test_generate_k005_computes_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - per_query = {"1": [5.0, 5.0]} - other = [1.0, 2.0] - timeline = [100, 200] - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric({"pgwatch_pg_stat_statements_wal_bytes": (per_query, other, timeline)}), - ) - - report = generator.generate_k005_wal_bytes_report("local", "node-1", time_range_minutes=120, limit=50) - db = 
report["results"]["node-1"]["data"]["db1"] - - assert db["summary"]["total_wal_bytes_other"] == pytest.approx(sum(other)) - assert db["summary"]["total_wal_bytes_tracked_queries"] == pytest.approx(10.0) - assert db["summary"]["total_wal_bytes"] == pytest.approx(10.0 + sum(other)) - - -@pytest.mark.unit -def test_generate_k006_computes_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - per_query = {"1": [1.0, 1.0], "2": [0.0, 3.0]} - other = [0.0, 10.0] - timeline = [100, 200] - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric( - {"pgwatch_pg_stat_statements_shared_bytes_read_total": (per_query, other, timeline)} - ), - ) - - report = generator.generate_k006_shared_read_report("local", "node-1", time_range_minutes=120, limit=50) - db = report["results"]["node-1"]["data"]["db1"] - - assert db["summary"]["total_shared_read_bytes_other"] == pytest.approx(sum(other)) - assert db["summary"]["total_shared_read_bytes_tracked_queries"] == pytest.approx(2.0 + 3.0) - assert db["summary"]["total_shared_read_bytes"] == pytest.approx((2.0 + 3.0) + sum(other)) - - -@pytest.mark.unit -def test_generate_k007_computes_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - per_query = {"1": [2.0, 0.0]} - other = [2.0, 2.0] - timeline = [100, 200] - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric( - {"pgwatch_pg_stat_statements_shared_bytes_hit_total": (per_query, other, timeline)} - ), - ) - - report 
= generator.generate_k007_shared_hit_report("local", "node-1", time_range_minutes=120, limit=50) - db = report["results"]["node-1"]["data"]["db1"] - - assert db["summary"]["total_shared_hit_bytes_other"] == pytest.approx(sum(other)) - assert db["summary"]["total_shared_hit_bytes_tracked_queries"] == pytest.approx(2.0) - assert db["summary"]["total_shared_hit_bytes"] == pytest.approx(2.0 + sum(other)) - - -@pytest.mark.unit -def test_generate_k008_computes_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - per_query = {"1": [6.0, 1.0], "2": [0.0, 3.0], "3": [2.0, 2.0]} - other = [1.0, 10.0] - timeline = [100, 200] - - monkeypatch.setattr(generator, "_get_hourly_topk_pgss_data_sum2", lambda *args, **kwargs: (per_query, other, timeline)) - - report = generator.generate_k008_shared_hit_read_report("local", "node-1", time_range_minutes=120, limit=50) - db = report["results"]["node-1"]["data"]["db1"] - - tracked = sum(sum(v) for v in per_query.values()) - assert db["summary"]["total_shared_hit_read_bytes_tracked_queries"] == pytest.approx(tracked) - assert db["summary"]["total_shared_hit_read_bytes_other"] == pytest.approx(sum(other)) - assert db["summary"]["total_shared_hit_read_bytes"] == pytest.approx(tracked + sum(other)) - - -@pytest.mark.unit -def test_generate_m001_computes_mean(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - timeline = [100, 200] - time_per_query = {"1": [100.0, 100.0], "2": [10.0, 0.0]} - calls_per_query = {"1": [1.0, 1.0], "2": [2.0, 0.0]} - time_other = [0.0, 
0.0] - calls_other = [0.0, 0.0] - - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric( - { - "pgwatch_pg_stat_statements_exec_time_total": (time_per_query, time_other, timeline), - "pgwatch_pg_stat_statements_calls": (calls_per_query, calls_other, timeline), - } - ), - ) - - report = generator.generate_m001_mean_time_report("local", "node-1", time_range_minutes=120, limit=50) - top = report["results"]["node-1"]["data"]["db1"]["top_queries"] - - # query 1 mean: 200/2 = 100; query 2 mean: 10/2 = 5 - assert top[0]["queryid"] == "1" - assert top[0]["mean_time_ms"] == pytest.approx(100.0) - assert top[1]["queryid"] == "2" - assert top[1]["mean_time_ms"] == pytest.approx(5.0) - - -@pytest.mark.unit -def test_generate_m002_computes_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - per_query = {"1": [1.0, 2.0], "2": [3.0, 0.0]} - other = [10.0, 0.0] - timeline = [100, 200] - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric({"pgwatch_pg_stat_statements_rows": (per_query, other, timeline)}), - ) - - report = generator.generate_m002_rows_report("local", "node-1", time_range_minutes=120, limit=50) - db = report["results"]["node-1"]["data"]["db1"] - - tracked = (1.0 + 2.0) + (3.0 + 0.0) - assert db["summary"]["total_rows_tracked_queries"] == pytest.approx(tracked) - assert db["summary"]["total_rows_other"] == pytest.approx(sum(other)) - assert db["summary"]["total_rows"] == pytest.approx(tracked + sum(other)) - - -@pytest.mark.unit -def test_generate_m003_computes_io_totals(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", 
lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - timeline = [100, 200] - read_per_query = {"1": [1.0, 2.0]} - write_per_query = {"1": [3.0, 4.0]} - read_other = [0.0, 10.0] - write_other = [1.0, 1.0] - - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk_single_metric( - { - "pgwatch_pg_stat_statements_block_read_total": (read_per_query, read_other, timeline), - "pgwatch_pg_stat_statements_block_write_total": (write_per_query, write_other, timeline), - } - ), - ) - - report = generator.generate_m003_io_time_report("local", "node-1", time_range_minutes=120, limit=50) - db = report["results"]["node-1"]["data"]["db1"] - - assert db["top_queries"][0]["total_io_time_ms"] == pytest.approx((1.0 + 2.0) + (3.0 + 4.0)) - assert db["summary"]["total_io_time_other_ms"] == pytest.approx(sum([r + w for r, w in zip(read_other, write_other)])) - - -@pytest.mark.unit -def test_generate_n001_groups_wait_events(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - # Fix timeline deterministically: end_s=7200 for hours=3 -> [0,3600,7200] - monkeypatch.setattr(generator, "_floor_hour", lambda *_: 7200) - - def fake_query_range(_query: str, start, end, step: str = "3600s") -> list[dict[str, Any]]: - _ = (start, end, step) - return [ - { - "metric": { - "wait_event_type": "IO", - "wait_event": "DataFileRead", - "query_id": "123", - }, - "values": [[0, "1"], [3600, "2"], [7200, "0"]], - }, - { - "metric": { - "wait_event_type": "IO", - "wait_event": "DataFileRead", - "query_id": "456", - }, - "values": [[0, "0"], [3600, "1"], [7200, "1"]], - }, - ] - - monkeypatch.setattr(generator, "query_range", fake_query_range) - - report = 
generator.generate_n001_wait_events_report("local", "node-1", hours=3) - db = report["results"]["node-1"]["data"]["db1"] - io = db["wait_event_types"]["IO"] - assert io["unique_queries"] == 2 - assert io["total_occurrences"] == 5 - # Sorted by occurrences desc: q123 has 3, q456 has 2 - assert io["queries_list"][0]["query_id"] == "123" - assert io["queries_list"][0]["hourly_occurrences"] == [1, 2, 0] - assert io["queries_list"][1]["query_id"] == "456" - assert io["queries_list"][1]["hourly_occurrences"] == [0, 1, 1] - - diff --git a/tests/reporter/test_generators_query_unit.py b/tests/reporter/test_generators_query_unit.py deleted file mode 100644 index f1fcd7c7..00000000 --- a/tests/reporter/test_generators_query_unit.py +++ /dev/null @@ -1,124 +0,0 @@ -from __future__ import annotations - -import json -from typing import Any - -import pytest - -from reporter.postgres_reports import PostgresReportGenerator -from reporter.report_schemas import validate_query_file - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -def _fake_metrics(cluster: str, node_name: str, db_name: str, queryid: str, hours: int) -> dict[str, Any]: - # Return a fresh dict each call because generator pops "time_range". 
- return { - "calls": float(len(node_name) + len(db_name)), - "total_time": float(len(queryid)), - "rows": float(hours), - "time_range": { - "hours": hours, - "start_time": "2025-01-01T00:00:00+00:00", - "end_time": "2025-01-02T00:00:00+00:00", - }, - } - - -@pytest.mark.unit -def test_generate_per_query_jsons_groups_by_queryid_and_is_node_first( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, -) -> None: - monkeypatch.setattr( - generator, - "extract_queryids_from_reports", - lambda reports: { - "airflow_db_prod": {"qid_1"}, - "db_b": {"qid_1", "qid_2"}, - }, - ) - monkeypatch.setattr( - generator, - "get_queryid_queries_from_sink", - lambda *args, **kwargs: { - "airflow_db_prod": {"qid_1": "SELECT 1"}, - "db_b": {"qid_1": "SELECT 1", "qid_2": "SELECT 2"}, - }, - ) - monkeypatch.setattr( - generator, - "get_all_nodes", - lambda cluster: {"primary": "main", "standbys": ["replica-1", "replica-2"]}, - ) - monkeypatch.setattr(generator, "get_query_metrics_from_prometheus", _fake_metrics) - - out = generator.generate_per_query_jsons( - reports={"K001": {}}, - cluster="prod", - node_name=None, - hours=24, - write_immediately=False, - ) - - assert {item["filename"] for item in out} == {"prod_query_qid_1.json", "prod_query_qid_2.json"} - - q1 = next(item["data"] for item in out if item["filename"] == "prod_query_qid_1.json") - validate_query_file(q1) - - assert q1["cluster_id"] == "prod" - assert q1["query_id"] == "qid_1" - assert q1["query_text"] == "SELECT 1" - assert q1["nodes"]["primary"] == "main" - assert q1["nodes"]["standbys"] == ["replica-1", "replica-2"] - - # Node is the primary dimension. - assert set(q1["results"].keys()) == {"main", "replica-1", "replica-2"} - assert set(q1["results"]["main"].keys()) == {"airflow_db_prod", "db_b"} - - # time_range moved to top-level and removed from per-db metrics. 
- assert q1["time_range"]["hours"] == 24 - assert "time_range" not in q1["results"]["main"]["airflow_db_prod"]["metrics"] - - -@pytest.mark.unit -def test_generate_per_query_jsons_write_immediately_prefixes_cluster_and_writes_timestamptz_last( - tmp_path, - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, -) -> None: - monkeypatch.chdir(tmp_path) - monkeypatch.setattr(generator, "extract_queryids_from_reports", lambda reports: {"db1": {"qid_1"}}) - monkeypatch.setattr( - generator, "get_queryid_queries_from_sink", lambda *args, **kwargs: {"db1": {"qid_1": "SELECT 1"}} - ) - monkeypatch.setattr(generator, "get_all_nodes", lambda cluster: {"primary": "main", "standbys": ["replica-1"]}) - monkeypatch.setattr(generator, "get_query_metrics_from_prometheus", _fake_metrics) - - out = generator.generate_per_query_jsons( - reports={"K001": {}}, - cluster="prod", - node_name=None, - hours=24, - write_immediately=True, - ) - - assert out == [{"filename": "prod_query_qid_1.json"}] - p = tmp_path / "prod_query_qid_1.json" - assert p.exists() - - payload = json.loads(p.read_text(encoding="utf-8")) - validate_query_file(payload) - - # Ensure timestamptz is last key in the emitted JSON text (ordering requirement). 
- raw = p.read_text(encoding="utf-8").rstrip() - last_key_line = [ln for ln in raw.splitlines() if ln.lstrip().startswith('"')][-1] - assert last_key_line.lstrip().startswith('"timestamptz"') - - diff --git a/tests/reporter/test_generators_unit.py b/tests/reporter/test_generators_unit.py deleted file mode 100644 index 581c16bf..00000000 --- a/tests/reporter/test_generators_unit.py +++ /dev/null @@ -1,1358 +0,0 @@ -import json -import sys -from datetime import datetime, timedelta -from typing import Any, Callable - -import pytest - -from reporter import postgres_reports as postgres_reports_module -from reporter.postgres_reports import PostgresReportGenerator - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -def _success_metric(value: str) -> dict[str, Any]: - return { - "status": "success", - "data": { - "result": [ - { - "value": [datetime.now().timestamp(), value], - } - ] - }, - } - - -def _query_stub_factory(prom_result, mapping: dict[str, Any]) -> Callable[[str], dict[str, Any]]: - """Return a query_instant stub that matches substrings defined in mapping keys. 
- - Args: - prom_result: Fallback callable that returns a default Prometheus response - mapping: Dict mapping query substrings to responses (either dict or callable) - - Returns: - A callable that takes a query string and returns a Prometheus-like response - """ - - def _fake(query: str) -> dict[str, Any]: - for needle, payload in mapping.items(): - if needle in query: - return payload(query) if callable(payload) else payload - return prom_result() - - return _fake - - -@pytest.mark.unit -def test_query_instant_hits_prometheus( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, -) -> None: - captured: dict[str, Any] = {} - - class DummyResponse: - status_code = 200 - text = "{}" - - @staticmethod - def json() -> dict[str, Any]: - return {"status": "success", "data": {"result": []}} - - def fake_get( - url: str, - params: dict[str, Any] | None = None, - timeout: int | None = None, - ): - captured["url"] = url - captured["params"] = params - return DummyResponse() - - monkeypatch.setattr(postgres_reports_module.requests, "get", fake_get) - - payload = generator.query_instant("up") - - assert payload["status"] == "success" - assert captured["url"].endswith("/api/v1/query") - assert captured["params"] == {"query": "up"} - - -@pytest.mark.unit -def test_query_range_hits_prometheus( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, -) -> None: - start = datetime(2024, 1, 1, 0, 0, 0) - end = start + timedelta(minutes=5) - captured: dict[str, Any] = {} - - class DummyResponse: - status_code = 200 - text = "{}" - - @staticmethod - def json() -> dict[str, Any]: - return {"status": "success", "data": {"result": []}} - - def fake_get( - url: str, - params: dict[str, Any] | None = None, - timeout: int | None = None, - ): - captured["url"] = url - captured["params"] = params - return DummyResponse() - - monkeypatch.setattr(postgres_reports_module.requests, "get", fake_get) - - payload = generator.query_range("up", start, end, 
step="60s") - - assert payload == [] - assert captured["url"].endswith("/api/v1/query_range") - assert captured["params"]["query"] == "up" - assert captured["params"]["start"] == start.timestamp() - - -@pytest.mark.unit -def test_generate_a002_version_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, -) -> None: - values = { - "server_version": "15.3", - "server_version_num": "150003", - "max_connections": "200", - "shared_buffers": "1024", - "effective_cache_size": "2048", - } - - def fake_query(query: str) -> dict[str, Any]: - # A002 uses a helper that queries both settings via a single regex selector. - if 'setting_name=~"server_version|server_version_num"' in query: - return { - "status": "success", - "data": { - "result": [ - { - "metric": { - "setting_name": "server_version", - "setting_value": values["server_version"], - } - }, - { - "metric": { - "setting_name": "server_version_num", - "setting_value": values["server_version_num"], - } - }, - ] - }, - } - return {"status": "success", "data": {"result": []}} - - monkeypatch.setattr(generator, "query_instant", fake_query) - - report = generator.generate_a002_version_report("local", "node-1") - version = report["results"]["node-1"]["data"]["version"] - - assert version["version"] == "15.3" - assert version["server_major_ver"] == "15" - assert version["server_minor_ver"] == "3" - - -@pytest.mark.unit -def test_generate_a004_cluster_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, -) -> None: - def fake_query(query: str) -> dict[str, Any]: - if "pgwatch_db_size_size_b" in query and "sum(" not in query: - return { - "status": "success", - "data": { - "result": [ - {"metric": {"datname": "db1"}, "value": [0, "1024"]}, - {"metric": {"datname": "db2"}, "value": [0, "2048"]}, - ] - }, - } - return _success_metric("42") - - monkeypatch.setattr(generator, "query_instant", fake_query) - - report = generator.generate_a004_cluster_report("local", "node-1") - 
data = report["results"]["node-1"]["data"] - - assert "general_info" in data and "database_sizes" in data - assert data["general_info"]["active_connections"]["value"] == "42" - assert data["database_sizes"] == {"db1": 1024.0, "db2": 2048.0} - - -@pytest.mark.unit -def test_prometheus_to_dict_and_process_pgss(generator: PostgresReportGenerator) -> None: - base_time = datetime(2024, 1, 1, 0, 0, 0) - later_time = base_time + timedelta(seconds=60) - - def make_metric(name: str, value: float, ts: datetime) -> dict[str, Any]: - return { - "metric": { - "__name__": name, - "datname": "db1", - "queryid": "123", - "user": "postgres", - "instance": "inst1", - }, - "values": [[ts.timestamp(), str(value)]], - } - - start_metrics = [ - make_metric("pgwatch_pg_stat_statements_calls", 10, base_time), - make_metric("pgwatch_pg_stat_statements_exec_time_total", 1000, base_time), - make_metric("pgwatch_pg_stat_statements_rows", 200, base_time), - ] - end_metrics = [ - make_metric("pgwatch_pg_stat_statements_calls", 40, later_time), - make_metric("pgwatch_pg_stat_statements_exec_time_total", 4000, later_time), - make_metric("pgwatch_pg_stat_statements_rows", 260, later_time), - ] - - mapping = { - "calls": "calls", - "exec_time_total": "total_time", - "rows": "rows", - } - - rows = generator._process_pgss_data( - start_metrics, - end_metrics, - base_time, - later_time, - mapping, - ) - - assert len(rows) == 1 - row = rows[0] - assert row["calls"] == 30 - assert row["total_time"] == 3000 - assert pytest.approx(row["total_time_per_sec"], 0.01) == 50 - assert row["rows_per_call"] == pytest.approx(2.0) - - -@pytest.mark.unit -def test_prometheus_to_dict_closest_value(generator: PostgresReportGenerator) -> None: - reference_time = datetime(2024, 1, 1, 12, 0, 0) - - prom_data: list[dict[str, Any]] = [ - { - "metric": { - "__name__": "pgwatch_pg_stat_statements_calls", - "datname": "db1", - "queryid": "q1", - "user": "postgres", - "instance": "inst1", - }, - "values": [ - 
[reference_time.timestamp() - 10, "10"], - [reference_time.timestamp() + 5, "20"], - ], - } - ] - - converted = generator._prometheus_to_dict(prom_data, reference_time) - - key = ("db1", "q1", "postgres", "inst1") - assert key in converted - assert converted[key]["calls"] == 20 - - -@pytest.mark.unit -def test_generate_a003_settings_report(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - def fake_query(query: str) -> dict[str, Any]: - assert "pgwatch_settings_configured" in query - return { - "status": "success", - "data": { - "result": [ - { - "metric": { - "setting_name": "shared_buffers", - "setting_value": "128", - "category": "Memory", - "unit": "8kB", - "context": "postmaster", - "vartype": "integer", - } - }, - { - "metric": { - "setting_name": "work_mem", - "setting_value": "512", - "category": "Memory", - "unit": "", - "context": "user", - "vartype": "integer", - } - }, - ] - }, - } - - monkeypatch.setattr(generator, "query_instant", fake_query) - - report = generator.generate_a003_settings_report("local", "node-1") - data = report["results"]["node-1"]["data"] - - assert data["shared_buffers"]["pretty_value"] == "1 MiB" - assert data["work_mem"]["unit"] == "" - assert data["work_mem"]["category"] == "Memory" - - -@pytest.mark.unit -def test_generate_a007_altered_settings_report(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - def fake_query(query: str) -> dict[str, Any]: - # Handle version info query from _get_postgres_version_info - if 'setting_name=~"server_version|server_version_num"' in query: - return { - "status": "success", - "data": { - "result": [ - {"metric": {"setting_name": "server_version", "setting_value": "15.0"}}, - {"metric": {"setting_name": "server_version_num", "setting_value": "150000"}}, - ] - }, - } - # Handle altered settings query - assert "pgwatch_settings_is_default" in query - return { - "status": "success", - "data": { - "result": [ - { - "metric": { - 
"setting_name": "work_mem", - "setting_value": "1024", - "unit": "", - "category": "Memory", - } - }, - { - "metric": { - "setting_name": "autovacuum", - "setting_value": "off", - "unit": "", - "category": "Autovacuum", - } - }, - ] - }, - } - - monkeypatch.setattr(generator, "query_instant", fake_query) - - payload = generator.generate_a007_altered_settings_report("local", "node-1") - data = payload["results"]["node-1"]["data"] - - assert set(data.keys()) == {"work_mem", "autovacuum"} - assert "postgres_version" in payload["results"]["node-1"] # postgres_version is at node level - assert data["work_mem"]["pretty_value"] == "1 MiB" - assert data["autovacuum"]["pretty_value"] == "off" - - -@pytest.mark.unit -def test_get_all_databases_merges_sources(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - def fake_query(query: str) -> dict[str, Any]: - if "wraparound" in query: - return { - "status": "success", - "data": { - "result": [ - {"metric": {"datname": "appdb"}, "value": [0, "1"]}, - {"metric": {"datname": "template0"}, "value": [0, "1"]}, # excluded - ] - }, - } - if "unused_indexes" in query: - return { - "status": "success", - "data": { - "result": [ - # Reporter expects `datname` label for unused indexes metric. 
- {"metric": {"datname": "analytics"}, "value": [0, "1"]}, - {"metric": {"datname": "appdb"}, "value": [0, "1"]}, # duplicate - ] - }, - } - if "redundant_indexes" in query: - return { - "status": "success", - "data": { - "result": [ - {"metric": {"dbname": "warehouse"}, "value": [0, "1"]}, - ] - }, - } - if "pg_btree_bloat_bloat_pct" in query: - return { - "status": "success", - "data": { - "result": [ - {"metric": {"datname": "inventory"}, "value": [0, "1"]}, - ] - }, - } - return {"status": "success", "data": {"result": []}} - - monkeypatch.setattr(generator, "query_instant", fake_query) - - databases = generator.get_all_databases("local", "node-1") - - assert databases == ["appdb", "analytics", "warehouse", "inventory"] - - -@pytest.mark.unit -def test_check_pg_stat_kcache_status(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, prom_result) -> None: - responses = { - "pgwatch_pg_stat_kcache_exec_total_time": prom_result( - [ - { - "metric": {"queryid": "1", "tag_user": "postgres"}, - "value": [0, "10"], - } - ] - ), - "pgwatch_pg_stat_kcache_exec_user_time": prom_result([{"metric": {}, "value": [0, "4"]}]), - "pgwatch_pg_stat_kcache_exec_system_time": prom_result([{"metric": {}, "value": [0, "6"]}]), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - status = generator._check_pg_stat_kcache_status("local", "node-1") - - assert status["extension_available"] is True - assert status["metrics_count"] == 1 - assert status["total_exec_time"] == 10.0 - assert status["total_user_time"] == 4.0 - assert status["sample_queries"][0]["queryid"] == "1" - - -@pytest.mark.unit -def test_check_pg_stat_statements_status(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, prom_result) -> None: - response = prom_result( - [ - { - "metric": {"queryid": "1", "tag_user": "postgres", "datname": "db1"}, - "value": [0, "5"], - } - ] - ) - monkeypatch.setattr(generator, "query_instant", lambda query: 
response) - - status = generator._check_pg_stat_statements_status("local", "node-1") - - assert status["extension_available"] is True - assert status["metrics_count"] == 1 - assert status["total_calls"] == 5.0 - assert status["sample_queries"][0]["database"] == "db1" - - -@pytest.mark.unit -def test_generate_h001_invalid_indexes_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["maindb"]) - monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_invalid": "CREATE INDEX idx_invalid ON public.tbl USING btree (col)"}) - - responses = { - "pgwatch_pg_invalid_indexes": prom_result( - [ - { - "metric": { - "schema_name": "public", - "table_name": "tbl", - "index_name": "idx_invalid", - "relation_name": "public.tbl", - "supports_fk": "1", - }, - "value": [0, "2048"], - } - ] - ) - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - payload = generator.generate_h001_invalid_indexes_report("local", "node-1") - db_data = payload["results"]["node-1"]["data"]["maindb"] - - assert db_data["total_count"] == 1 - assert db_data["total_size_bytes"] == 2048.0 - entry = db_data["invalid_indexes"][0] - assert entry["index_name"] == "idx_invalid" - assert entry["index_size_pretty"].endswith("KiB") - assert entry["index_definition"].startswith("CREATE INDEX") - assert entry["supports_fk"] is True - - -@pytest.mark.unit -def test_generate_h002_unused_indexes_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["app"]) - monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_unused": "CREATE INDEX idx_unused ON t(c)"}) - - responses = { - "pgwatch_db_stats_postmaster_uptime_s": prom_result([{"value": [0, "3600"]}]), - 
"pgwatch_stats_reset_stats_reset_epoch": prom_result([{"value": [0, "1700000000"]}]), - "pgwatch_unused_indexes_index_size_bytes": prom_result( - [ - { - "metric": { - "schema_name": "public", - "table_name": "tbl", - "index_name": "idx_unused", - "reason": "never scanned", - "idx_is_btree": "true", - "supports_fk": "0", - }, - "value": [0, "1024"], - } - ] - ), - "pgwatch_unused_indexes_idx_scan": prom_result([{"value": [0, "0"]}]), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - payload = generator.generate_h002_unused_indexes_report("local", "node-1") - db_data = payload["results"]["node-1"]["data"]["app"] - - assert db_data["total_count"] == 1 - unused = db_data["unused_indexes"][0] - assert unused["index_definition"].startswith("CREATE INDEX") - assert unused["idx_scan"] == 0 - assert unused["index_size_pretty"].endswith("KiB") - stats_reset = db_data["stats_reset"] - assert stats_reset["stats_reset_epoch"] == 1700000000.0 - assert stats_reset["postmaster_startup_epoch"] is not None - - -@pytest.mark.unit -def test_generate_h004_redundant_indexes_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["app"]) - monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_dup": "CREATE INDEX idx_dup ON t(c)"}) - - responses = { - "pgwatch_redundant_indexes_index_size_bytes": prom_result( - [ - { - "metric": { - "schema_name": "public", - "table_name": "tbl", - "index_name": "idx_dup", - "relation_name": "public.tbl", - "access_method": "btree", - "reason": "covers columns", - }, - "value": [0, "4096"], - } - ] - ), - "pgwatch_redundant_indexes_table_size_bytes": prom_result([{"value": [0, "8192"]}]), - "pgwatch_redundant_indexes_index_usage": prom_result([{"value": [0, "2"]}]), - "pgwatch_redundant_indexes_supports_fk": prom_result([{"value": [0, "1"]}]), - 
} - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - payload = generator.generate_h004_redundant_indexes_report("local", "node-1") - db_data = payload["results"]["node-1"]["data"]["app"] - - assert db_data["total_count"] == 1 - redundant = db_data["redundant_indexes"][0] - assert redundant["index_definition"].startswith("CREATE INDEX") - assert redundant["index_usage"] == 2.0 - assert redundant["index_size_pretty"].endswith("KiB") - assert redundant["supports_fk"] is True - - -@pytest.mark.unit -def test_generate_d004_pgstat_settings_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - responses = { - "pgwatch_settings_configured": prom_result( - [ - { - "metric": { - "setting_name": "pg_stat_statements.max", - "setting_value": "1000", - "category": "Stats", - "unit": "", - "context": "postmaster", - "vartype": "integer", - } - } - ] - ) - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - monkeypatch.setattr(generator, "_check_pg_stat_kcache_status", lambda *args, **kwargs: {"extension_available": True}) - monkeypatch.setattr(generator, "_check_pg_stat_statements_status", lambda *args, **kwargs: {"extension_available": False}) - - payload = generator.generate_d004_pgstat_settings_report("local", "node-1") - data = payload["results"]["node-1"]["data"] - - assert "pg_stat_statements.max" in data["settings"] - assert data["pg_stat_kcache_status"]["extension_available"] is True - - -@pytest.mark.unit -def test_generate_f001_autovacuum_settings_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - responses = { - "pgwatch_settings_configured": prom_result( - [ - { - "metric": { - "setting_name": "autovacuum_naptime", - "setting_value": "60", - "category": "Autovacuum", - "unit": "", - "context": "sighup", - "vartype": "integer", - } - } - ] - ) - } - 
monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - payload = generator.generate_f001_autovacuum_settings_report("local", "node-1") - data = payload["results"]["node-1"]["data"] - - assert data["autovacuum_naptime"]["setting"] == "60" - assert data["autovacuum_naptime"]["pretty_value"] == "1 min" - - -@pytest.mark.unit -def test_generate_f005_btree_bloat_report( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - responses = { - "pgwatch_pg_stat_all_tables_last_vacuum": prom_result( - [ - { - "metric": {"schemaname": "public", "relname": "t"}, - "value": [0, "1700000000"], - } - ] - ), - "pgwatch_pg_btree_bloat_real_size_mib": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "2"], - } - ] - ), - "pgwatch_pg_btree_bloat_table_size_mib": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "10"], - } - ] - ), - "pgwatch_pg_btree_bloat_extra_size": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "1024"], - } - ] - ), - "pgwatch_pg_btree_bloat_extra_pct": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "20"], - } - ] - ), - "pgwatch_pg_btree_bloat_fillfactor": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "90"], - } - ] - ), - "pgwatch_pg_btree_bloat_bloat_size": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "2048"], - } - ] - ), - "pgwatch_pg_btree_bloat_bloat_pct": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, - "value": [0, "50"], - } - ] - ), - } - monkeypatch.setattr(generator, 
"query_instant", _query_stub_factory(prom_result, responses)) - - payload = generator.generate_f005_btree_bloat_report("local", "node-1") - db_data = payload["results"]["node-1"]["data"]["db1"] - entry = db_data["bloated_indexes"][0] - - assert entry["real_size"] == 2 * 1024 * 1024 - assert entry["real_size_pretty"] == "2.00 MiB" - assert entry["table_size"] == 10 * 1024 * 1024 - assert entry["table_size_pretty"] == "10.0 MiB" - # Prometheus provides *_mib metrics, but the report output should expose bytes-only fields. - assert "real_size_mib" not in entry - assert "table_size_mib" not in entry - assert entry["extra_size"] == 1024.0 - assert entry["bloat_pct"] == 50.0 - assert entry["fillfactor"] == 90.0 - assert entry["last_vacuum_epoch"] == 1700000000.0 - assert entry["last_vacuum"] == "2023-11-14T22:13:20+00:00" - assert entry["bloat_size_pretty"].endswith("KiB") - - -@pytest.mark.unit -def test_generate_f004_heap_bloat_report_real_size_uses_real_size_mib( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - prom_result, -) -> None: - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - responses = { - "pgwatch_db_size_size_b": prom_result( - [ - { - "metric": {"datname": "db1"}, - "value": [0, "1048576"], - } - ] - ), - "pgwatch_pg_stat_all_tables_last_vacuum": prom_result( - [ - { - "metric": {"schemaname": "public", "relname": "t"}, - "value": [0, "1700000000"], - } - ] - ), - "pgwatch_pg_table_bloat_real_size_mib": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t"}, - "value": [0, "128"], - } - ] - ), - "pgwatch_pg_table_bloat_extra_size": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t"}, - "value": [0, "1024"], - } - ] - ), - "pgwatch_pg_table_bloat_extra_pct": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t"}, - "value": [0, "10"], - } - ] - ), - "pgwatch_pg_table_bloat_fillfactor": prom_result( - [ - { - "metric": 
{"schemaname": "public", "tblname": "t"}, - "value": [0, "100"], - } - ] - ), - "pgwatch_pg_table_bloat_bloat_size": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t"}, - "value": [0, "2048"], - } - ] - ), - "pgwatch_pg_table_bloat_bloat_pct": prom_result( - [ - { - "metric": {"schemaname": "public", "tblname": "t"}, - "value": [0, "20"], - } - ] - ), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - payload = generator.generate_f004_heap_bloat_report("local", "node-1") - db_data = payload["results"]["node-1"]["data"]["db1"] - entry = db_data["bloated_tables"][0] - - # Prometheus provides real_size_mib, but the report should expose real_size in bytes. - assert entry["real_size"] == 128 * 1024 * 1024 - assert entry["real_size_pretty"] == "128 MiB" - assert entry["fillfactor"] == 100.0 - assert entry["last_vacuum_epoch"] == 1700000000.0 - assert entry["last_vacuum"] == "2023-11-14T22:13:20+00:00" - assert "real_size_mib" not in entry - assert "real_size_bytes" not in entry - - -@pytest.mark.unit -def test_get_pgss_metrics_data_by_db_invokes_all_metrics(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - captured: list[str] = [] - - def fake_query_range(query: str, start, end, step: str = "30s") -> list[dict]: - captured.append(query) - return [] - - monkeypatch.setattr(generator, "query_range", fake_query_range) - sentinel = [{"result": "ok"}] - monkeypatch.setattr(generator, "_process_pgss_data", lambda *args, **kwargs: sentinel) - - start = datetime(2024, 1, 1, 0, 0, 0) - end = start + timedelta(hours=1) - result = generator._get_pgss_metrics_data_by_db("local", "node-1", "db1", start, end) - - assert result == sentinel - # Ensure at least one representative metric was queried with filters - assert any("pgwatch_pg_stat_statements_calls" in q for q in captured) - - -@pytest.mark.unit -def test_generate_all_reports_invokes_every_builder(monkeypatch: 
pytest.MonkeyPatch) -> None: - generator = PostgresReportGenerator() - called: list[str] = [] - - def stub(name: str): - def _(*args, **kwargs): - called.append(name) - return {name: True} - - return _ - - # Independent builders (not derived from A003) - independent_builders = [ - "generate_a002_version_report", - "generate_a003_settings_report", - "generate_a004_cluster_report", - "generate_a007_altered_settings_report", - "generate_f004_heap_bloat_report", - "generate_f005_btree_bloat_report", - "generate_h001_invalid_indexes_report", - "generate_h002_unused_indexes_report", - "generate_h004_redundant_indexes_report", - "generate_k001_query_calls_report", - "generate_k003_top_queries_report", - "generate_k004_temp_bytes_report", - "generate_k005_wal_bytes_report", - "generate_k006_shared_read_report", - "generate_k007_shared_hit_report", - "generate_k008_shared_hit_read_report", - "generate_m001_mean_time_report", - "generate_m002_rows_report", - "generate_m003_io_time_report", - "generate_n001_wait_events_report", - ] - - # Builders derived from A003 - a003_derived_builders = [ - "generate_d004_from_a003", - "generate_f001_from_a003", - "generate_g001_from_a003", - ] - - for name in independent_builders: - monkeypatch.setattr(generator, name, stub(name)) - - for name in a003_derived_builders: - monkeypatch.setattr(generator, name, stub(name)) - - reports = generator.generate_all_reports("local", "node-1") - - # All report types should be generated - expected_report_codes = { - 'A002', 'A003', 'A004', 'A007', - 'D004', 'F001', 'F004', 'F005', 'G001', - 'H001', 'H002', 'H004', - 'K001', 'K003', 'K004', 'K005', 'K006', 'K007', 'K008', - 'M001', 'M002', 'M003', - 'N001', - } - assert set(reports.keys()) == expected_report_codes - - # All builders should be called - all_builders = independent_builders + a003_derived_builders - assert set(called) == set(all_builders) - - -@pytest.mark.unit -def test_create_report_uses_api(monkeypatch: pytest.MonkeyPatch) -> None: - 
generator = PostgresReportGenerator() - payloads: list[dict] = [] - - def fake_make_request(api_url, endpoint, request_data): - payloads.append({"endpoint": endpoint, "data": request_data}) - return {"report_id": 42} - - monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) - - report_id = generator.create_report("https://api", "tok", "proj", "123") - - assert report_id == 42 - assert payloads[0]["endpoint"] == "/rpc/checkup_report_create" - assert payloads[0]["data"]["project"] == "proj" - - -@pytest.mark.unit -def test_upload_report_file_sends_contents(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: - generator = PostgresReportGenerator() - captured: dict = {} - - def fake_make_request(api_url, endpoint, request_data): - captured["endpoint"] = endpoint - captured["data"] = request_data - return {} - - monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) - - report_file = tmp_path / "A002_report.json" - # check_id is derived from JSON payload (not filename). 
- report_file.write_text('{"checkId": "A002", "foo": "bar"}', encoding="utf-8") - - generator.upload_report_file("https://api", "tok", 100, str(report_file)) - - assert captured["endpoint"] == "/rpc/checkup_report_file_post" - assert captured["data"]["check_id"] == "A002" - assert captured["data"]["filename"] == report_file.name - - -@pytest.mark.unit -def test_upload_report_file_handles_404_gracefully(tmp_path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - generator = PostgresReportGenerator() - - def fake_make_request(api_url, endpoint, request_data): - import requests - response = requests.Response() - response.status_code = 404 - raise requests.exceptions.HTTPError(response=response) - - monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) - - report_file = tmp_path / "A002_report.json" - report_file.write_text('{"foo": "bar"}', encoding="utf-8") - - # Should not raise exception - generator.upload_report_file("https://api", "tok", 100, str(report_file)) - - captured = capsys.readouterr() - assert "Upload endpoint not available (404)" in captured.out - assert "--no-upload" in captured.out - - -@pytest.mark.unit -def test_create_report_handles_404_gracefully(monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str]) -> None: - generator = PostgresReportGenerator() - - def fake_make_request(api_url, endpoint, request_data): - import requests - response = requests.Response() - response.status_code = 404 - raise requests.exceptions.HTTPError(response=response) - - monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) - - # Should not raise exception, should return None - report_id = generator.create_report("https://api", "tok", "proj", "123") - - assert report_id is None - captured = capsys.readouterr() - assert "API endpoint not available (404)" in captured.out - - -@pytest.mark.unit -def test_main_runs_specific_check_without_upload(monkeypatch: pytest.MonkeyPatch, 
capsys: pytest.CaptureFixture[str]) -> None: - class DummyGenerator: - DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} - - def __init__(self, *args, **kwargs): - self.closed = False - self.pg_conn = None # Add pg_conn attribute for memory cleanup check - - def get_all_clusters(self): - # Match current reporter.main() behavior which always calls - # get_all_clusters() when cluster is not explicitly provided. - return ["local"] - - def test_connection(self) -> bool: - return True - - def generate_a002_version_report(self, cluster, node_name): - return {"checkId": "A002", "results": {node_name: {"data": {"ok": True}}}} - - def close_postgres_sink(self): - self.closed = True - self.pg_conn = None - - monkeypatch.setattr(postgres_reports_module, "PostgresReportGenerator", DummyGenerator) - monkeypatch.setattr(sys, "argv", ["postgres_reports.py", "--check-id", "A002", "--output", "-", "--no-upload"]) - - postgres_reports_module.main() - - captured = capsys.readouterr().out - - # main() prints progress banners along with the JSON payload. - # Extract the JSON object from the captured stdout by finding the - # first line that looks like JSON and ending before any trailing messages. 
- lines = captured.splitlines() - start_idx = 0 - end_idx = len(lines) - - # Find start of JSON - for i, line in enumerate(lines): - if line.strip().startswith("{"): - start_idx = i - break - - # Find end of JSON (stop at first non-JSON line after JSON starts) - brace_count = 0 - for i in range(start_idx, len(lines)): - line = lines[i].strip() - brace_count += line.count("{") - line.count("}") - if brace_count == 0 and line.endswith("}"): - end_idx = i + 1 - break - - json_str = "\n".join(lines[start_idx:end_idx]) - - output = json.loads(json_str) - assert output["checkId"] == "A002" - assert "results" in output - - -@pytest.mark.unit -def test_main_all_reports_does_not_crash_when_output_is_file(tmp_path, monkeypatch: pytest.MonkeyPatch) -> None: - """ - Regression test for reporter/postgres_reports.py around the `del reports` block - (~4308-4324). - - In ALL-reports mode, providing a normal file path via --output should NOT - cause the process to crash. Current code crashes because it does `del reports` - and then later references `reports` when handling args.output. 
- """ - class DummyGenerator: - DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} - - def __init__(self, *args, **kwargs): - self.pg_conn = None - - def test_connection(self) -> bool: - return True - - def get_all_clusters(self): - return ["local"] - - def generate_all_reports(self, cluster, node_name, combine_nodes=True): - # Minimal plausible payload - return { - "A002": {"checkId": "A002", "results": {"node-1": {"data": {"ok": True}}}}, - "A003": {"checkId": "A003", "results": {"node-1": {"data": {"ok": True}}}}, - } - - def generate_per_query_jsons(self, *args, **kwargs): - return [] - - def close_postgres_sink(self): - self.pg_conn = None - - monkeypatch.setattr(postgres_reports_module, "PostgresReportGenerator", DummyGenerator) - monkeypatch.chdir(tmp_path) - - out_path = tmp_path / "all_reports.json" - monkeypatch.setattr( - sys, - "argv", - [ - "postgres_reports.py", - "--check-id", - "ALL", - "--cluster", - "local", - "--output", - str(out_path), - "--no-upload", - ], - ) - - postgres_reports_module.main() - - -@pytest.mark.unit -def test_main_exits_when_connection_fails(monkeypatch: pytest.MonkeyPatch) -> None: - class FailingGenerator: - DEFAULT_EXCLUDED_DATABASES = {'template0', 'template1', 'rdsadmin', 'azure_maintenance', 'cloudsqladmin'} - - def __init__(self, *args, **kwargs): - pass - - def test_connection(self) -> bool: - return False - - monkeypatch.setattr(postgres_reports_module, "PostgresReportGenerator", FailingGenerator) - monkeypatch.setattr(sys, "argv", ["postgres_reports.py", "--check-id", "A002"]) - - with pytest.raises(SystemExit): - postgres_reports_module.main() - - -# ============================================================================ -# Negative test cases - Error handling -# ============================================================================ - - -@pytest.mark.unit -def test_query_instant_handles_http_404_error(monkeypatch: pytest.MonkeyPatch, generator: 
PostgresReportGenerator) -> None: - """Test that query_instant returns empty dict on HTTP 404 error.""" - class MockResponse: - status_code = 404 - text = "Not Found" - - def json(self): - return {"error": "not found"} - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - return MockResponse() - - monkeypatch.setattr("requests.get", fake_get) - - result = generator.query_instant("test_query") - - assert result == {} - - -@pytest.mark.unit -def test_query_instant_handles_http_500_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_instant returns empty dict on HTTP 500 error.""" - class MockResponse: - status_code = 500 - text = "Internal Server Error" - - def json(self): - raise ValueError("Invalid JSON") - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - return MockResponse() - - monkeypatch.setattr("requests.get", fake_get) - - result = generator.query_instant("test_query") - - assert result == {} - - -@pytest.mark.unit -def test_query_instant_handles_timeout(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_instant returns empty dict on request timeout.""" - import requests - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - raise requests.Timeout("Connection timed out") - - monkeypatch.setattr("requests.get", fake_get) - - result = generator.query_instant("test_query") - - assert result == {} - - -@pytest.mark.unit -def test_query_instant_handles_connection_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_instant returns empty dict on connection error.""" - import requests - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - raise requests.ConnectionError("Failed to establish connection") - - monkeypatch.setattr("requests.get", 
fake_get) - - result = generator.query_instant("test_query") - - assert result == {} - - -@pytest.mark.unit -def test_query_instant_handles_malformed_json(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_instant returns empty dict when response has invalid JSON.""" - class MockResponse: - status_code = 200 - - def json(self): - raise ValueError("Invalid JSON") - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - return MockResponse() - - monkeypatch.setattr("requests.get", fake_get) - - result = generator.query_instant("test_query") - - assert result == {} - - -@pytest.mark.unit -def test_query_range_handles_http_error(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_range returns empty list on HTTP error.""" - class MockResponse: - status_code = 503 - text = "Service Unavailable" - - def json(self): - return {"error": "service unavailable"} - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - return MockResponse() - - monkeypatch.setattr("requests.get", fake_get) - - start = datetime.now() - end = start + timedelta(hours=1) - result = generator.query_range("test_query", start, end) - - assert result == [] - - -@pytest.mark.unit -def test_query_range_handles_timeout(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_range returns empty list on timeout.""" - import requests - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - raise requests.Timeout("Connection timed out") - - monkeypatch.setattr("requests.get", fake_get) - - start = datetime.now() - end = start + timedelta(hours=1) - result = generator.query_range("test_query", start, end) - - assert result == [] - - -@pytest.mark.unit -def test_query_range_handles_malformed_response(monkeypatch: pytest.MonkeyPatch, generator: 
PostgresReportGenerator) -> None: - """Test that query_range handles response with missing expected fields.""" - class MockResponse: - status_code = 200 - - def json(self): - # Missing 'data' or 'result' fields - return {"status": "success"} - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - return MockResponse() - - monkeypatch.setattr("requests.get", fake_get) - - start = datetime.now() - end = start + timedelta(hours=1) - result = generator.query_range("test_query", start, end) - - assert result == [] - - -@pytest.mark.unit -def test_query_range_handles_failed_status(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator) -> None: - """Test that query_range handles Prometheus error status.""" - class MockResponse: - status_code = 200 - - def json(self): - return { - "status": "error", - "errorType": "bad_data", - "error": "invalid query" - } - - def fake_get(url: str, params: dict[str, Any] | None = None, timeout: int | None = None): - return MockResponse() - - monkeypatch.setattr("requests.get", fake_get) - - start = datetime.now() - end = start + timedelta(hours=1) - result = generator.query_range("test_query", start, end) - - assert result == [] - - -@pytest.mark.unit -def test_make_request_raises_on_http_error(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that make_request raises exception on HTTP error.""" - class MockResponse: - status_code = 400 - - def raise_for_status(self): - import requests - raise requests.HTTPError("400 Client Error") - - def json(self): - return {} - - def fake_post(url: str, json: dict[str, Any] | None = None): - return MockResponse() - - monkeypatch.setattr("requests.post", fake_post) - - import requests - with pytest.raises(requests.HTTPError): - postgres_reports_module.make_request("http://api.test", "/endpoint", {"data": "test"}) - - -@pytest.mark.unit -def test_make_request_raises_on_connection_error(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that 
make_request raises exception on connection error.""" - import requests - - def fake_post(url: str, json: dict[str, Any] | None = None): - raise requests.ConnectionError("Connection failed") - - monkeypatch.setattr("requests.post", fake_post) - - with pytest.raises(requests.ConnectionError): - postgres_reports_module.make_request("http://api.test", "/endpoint", {"data": "test"}) diff --git a/tests/reporter/test_hourly_topk_pgss_multi_unit.py b/tests/reporter/test_hourly_topk_pgss_multi_unit.py deleted file mode 100644 index af2e074e..00000000 --- a/tests/reporter/test_hourly_topk_pgss_multi_unit.py +++ /dev/null @@ -1,104 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pytest - -import reporter.postgres_reports as pr -from reporter.postgres_reports import PostgresReportGenerator - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -@pytest.mark.unit -def test_hourly_topk_multi_clamps_negative_other_and_warns( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - series_sample, -) -> None: - # Make timeline deterministic (avoid relying on wall clock / hour boundaries). - monkeypatch.setattr(generator, "_floor_hour", lambda _ts: 200) - monkeypatch.setattr(generator, "_build_timeline", lambda _end_s, _hours, _step_s: (100, [100, 200])) - - warnings: list[str] = [] - monkeypatch.setattr(pr.logger, "warning", lambda msg: warnings.append(str(msg))) - - def fake_query_range(_query: str, start, end, step: str = "3600s") -> list[dict[str, Any]]: - _ = (start, end, step) - # topk(...) 
union selection - if _query.startswith("topk("): - return [series_sample("dummy", labels={"queryid": "1"}, values=[(100, 0), (200, 0)])] - # total query - return no series, so totals become 0.0 - if "sum(increase(" in _query and "queryid" not in _query: - return [] - # union query - per queryid series (already aggregated by the query) - if "sum by (queryid)" in _query: - return [series_sample("dummy", labels={"queryid": "1"}, values=[(100, 5.0), (200, 5.0)])] - raise AssertionError(f"Unexpected query: {_query}") - - monkeypatch.setattr(generator, "query_range", fake_query_range) - - per_query, other, timeline = generator._get_hourly_topk_pgss_data_sum2( - cluster="local", - node_name="node-1", - db_name="db1", - metric_name_a="metric_a", - metric_name_b="metric_b", - hours=2, - step_s=3600, - k=3, - ) - - assert timeline == [100, 200] - assert per_query["1"] == pytest.approx([5.0, 5.0]) - assert other == pytest.approx([0.0, 0.0]) - assert len(warnings) == 1 - assert "negative 'other' clamped to 0" in warnings[0] - - -@pytest.mark.unit -def test_hourly_topk_multi_tiny_negative_other_is_silently_clamped( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - series_sample, -) -> None: - monkeypatch.setattr(generator, "_floor_hour", lambda _ts: 200) - monkeypatch.setattr(generator, "_build_timeline", lambda _end_s, _hours, _step_s: (100, [100, 200])) - - warnings: list[str] = [] - monkeypatch.setattr(pr.logger, "warning", lambda msg: warnings.append(str(msg))) - - def fake_query_range(_query: str, start, end, step: str = "3600s") -> list[dict[str, Any]]: - _ = (start, end, step) - if _query.startswith("topk("): - return [series_sample("dummy", labels={"queryid": "1"}, values=[(100, 0), (200, 0)])] - # total is 1.0, union sums to 1.0 + 5e-7 => other = -5e-7 (below warning threshold) - if "sum(increase(" in _query and "queryid" not in _query: - return [series_sample("dummy", labels={}, values=[(100, 1.0), (200, 1.0)])] - if "sum by (queryid)" in 
_query: - return [series_sample("dummy", labels={"queryid": "1"}, values=[(100, 1.0000005), (200, 1.0000005)])] - raise AssertionError(f"Unexpected query: {_query}") - - monkeypatch.setattr(generator, "query_range", fake_query_range) - - _, other, _ = generator._get_hourly_topk_pgss_data_sum2( - cluster="local", - node_name="node-1", - db_name="db1", - metric_name_a="metric_a", - metric_name_b="metric_b", - hours=2, - step_s=3600, - k=3, - ) - - assert other == pytest.approx([0.0, 0.0]) - assert warnings == [] - - diff --git a/tests/reporter/test_postgres_integration.py b/tests/reporter/test_postgres_integration.py deleted file mode 100644 index 414d0090..00000000 --- a/tests/reporter/test_postgres_integration.py +++ /dev/null @@ -1,75 +0,0 @@ -import json -from datetime import datetime, timezone -from typing import Callable, Tuple - -import pytest - -from reporter.postgres_reports import PostgresReportGenerator - -Seeder = Callable[[str, str, str], None] - - -@pytest.fixture(scope="function") -def sink_index_data(postgresql) -> Tuple[str, Seeder]: - conn = postgresql - conn.autocommit = True - cur = conn.cursor() - cur.execute( - """ - create table if not exists public.index_definitions ( - time timestamptz not null, - dbname text not null, - data jsonb not null, - tag_data jsonb - ) - """ - ) - - def seed(dbname: str, index_name: str, index_def: str) -> None: - payload = { - "indexrelname": index_name, - "index_definition": index_def, - "schemaname": "public", - "relname": "tbl", - } - with conn.cursor() as seed_cur: - seed_cur.execute( - ( - "insert into public.index_definitions " - "(time, dbname, data) values (%s, %s, %s::jsonb)" - ), - (datetime.now(timezone.utc), dbname, json.dumps(payload)), - ) - - host = conn.info.host or conn.info.hostaddr or "localhost" - port = conn.info.port - user = conn.info.user - dbname = conn.info.dbname - dsn = f"postgresql://{user}@{host}:{port}/{dbname}" - - yield dsn, seed - - cur.execute("truncate table 
public.index_definitions") - cur.close() - - -@pytest.mark.integration -@pytest.mark.requires_postgres -def test_get_index_definitions_from_sink(sink_index_data) -> None: - dsn, seed = sink_index_data - seed("db1", "idx_users", "CREATE INDEX idx_users ON users(id)") - seed("db2", "idx_orders", "CREATE INDEX idx_orders ON orders(id)") - - generator = PostgresReportGenerator( - prometheus_url="http://unused", - postgres_sink_url=dsn, - ) - assert generator.connect_postgres_sink() - - definitions = generator.get_index_definitions_from_sink() - - assert definitions["db1.idx_users"] == "CREATE INDEX idx_users ON users(id)" - assert definitions["db2.idx_orders"] == "CREATE INDEX idx_orders ON orders(id)" - - generator.close_postgres_sink() - assert generator.pg_conn is None diff --git a/tests/reporter/test_report_schemas.py b/tests/reporter/test_report_schemas.py deleted file mode 100644 index 664f04f2..00000000 --- a/tests/reporter/test_report_schemas.py +++ /dev/null @@ -1,545 +0,0 @@ -from __future__ import annotations - -from typing import Any, Callable - -import pytest - -from reporter.postgres_reports import PostgresReportGenerator -from reporter.report_schemas import validate_query_file, validate_report - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -@pytest.fixture(name="fixed_pg_version") -def fixture_fixed_pg_version() -> dict[str, str]: - return { - "version": "15.3", - "server_version_num": "150003", - "server_major_ver": "15", - "server_minor_ver": "3", - } - - -def _query_stub_factory( - prom_result: Callable[[list[dict] | None, str], dict], - mapping: dict[str, Any], -) -> Callable[[str], dict[str, Any]]: - def _fake(query: str) -> dict[str, Any]: - for needle, payload in mapping.items(): - if needle in query: - return payload(query) if callable(payload) else payload - return prom_result([]) - - return _fake - 
- -@pytest.mark.unit -def test_schema_a002(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - report = generator.generate_a002_version_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_a003( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - - resp = prom_result( - [ - { - "metric": { - "setting_name": "shared_buffers", - "setting_value": "128", - "category": "Memory", - "unit": "8kB", - "context": "postmaster", - "vartype": "integer", - } - } - ] - ) - monkeypatch.setattr(generator, "query_instant", lambda query: resp) - - report = generator.generate_a003_settings_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_a004( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - - def fake_query(query: str) -> dict[str, Any]: - if "pgwatch_db_size_size_b" in query and "sum(" not in query: - return { - "status": "success", - "data": { - "result": [ - {"metric": {"datname": "db1"}, "value": [0, "1024"]}, - ] - }, - } - return {"status": "success", "data": {"result": [{"value": [0, "42"]}]}} - - monkeypatch.setattr(generator, "query_instant", fake_query) - report = generator.generate_a004_cluster_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_a007( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: 
fixed_pg_version) - resp = prom_result( - [ - { - "metric": { - "setting_name": "work_mem", - "setting_value": "1024", - "unit": "", - "category": "Memory", - } - } - ] - ) - monkeypatch.setattr(generator, "query_instant", lambda query: resp) - - report = generator.generate_a007_altered_settings_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_d004( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - - settings_resp = prom_result( - [ - { - "metric": { - "setting_name": "pg_stat_statements.max", - "setting_value": "1000", - "category": "Stats", - "unit": "", - "context": "postmaster", - "vartype": "integer", - } - } - ] - ) - monkeypatch.setattr(generator, "query_instant", lambda query: settings_resp) - monkeypatch.setattr( - generator, - "_check_pg_stat_statements_status", - lambda *args, **kwargs: { - "extension_available": True, - "metrics_count": 1, - "total_calls": 5.0, - "sample_queries": [{"queryid": "1", "user": "postgres", "database": "db1", "calls": 5.0}], - }, - ) - monkeypatch.setattr( - generator, - "_check_pg_stat_kcache_status", - lambda *args, **kwargs: { - "extension_available": True, - "metrics_count": 1, - "total_exec_time": 10.0, - "total_user_time": 4.0, - "total_system_time": 6.0, - "sample_queries": [{"queryid": "1", "user": "postgres", "exec_total_time": 10.0}], - }, - ) - - report = generator.generate_d004_pgstat_settings_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_f001( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - - resp = prom_result( - [ - { - "metric": { - "setting_name": "autovacuum_naptime", - 
"setting_value": "60", - "category": "Autovacuum", - "unit": "", - "context": "sighup", - "vartype": "integer", - } - } - ] - ) - monkeypatch.setattr(generator, "query_instant", lambda query: resp) - - report = generator.generate_f001_autovacuum_settings_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_f004( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - - responses = { - "pgwatch_db_size_size_b": prom_result([{"metric": {"datname": "db1"}, "value": [0, "2048"]}]), - "pgwatch_pg_table_bloat_real_size": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t"}, "value": [0, "4096"]}] - ), - "pgwatch_pg_table_bloat_extra_size": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t"}, "value": [0, "1024"]}] - ), - "pgwatch_pg_table_bloat_extra_pct": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t"}, "value": [0, "25"]}] - ), - "pgwatch_pg_table_bloat_bloat_size": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t"}, "value": [0, "512"]}] - ), - "pgwatch_pg_table_bloat_bloat_pct": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t"}, "value": [0, "12.5"]}] - ), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - report = generator.generate_f004_heap_bloat_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_f005( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, 
**kwargs: ["db1"]) - - responses = { - "pgwatch_db_size_size_b": prom_result([{"metric": {"datname": "db1"}, "value": [0, "2048"]}]), - "pgwatch_pg_btree_bloat_extra_size": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, "value": [0, "1024"]}] - ), - "pgwatch_pg_btree_bloat_extra_pct": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, "value": [0, "20"]}] - ), - "pgwatch_pg_btree_bloat_bloat_size": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, "value": [0, "2048"]}] - ), - "pgwatch_pg_btree_bloat_bloat_pct": prom_result( - [{"metric": {"schemaname": "public", "tblname": "t", "idxname": "idx"}, "value": [0, "50"]}] - ), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - report = generator.generate_f005_btree_bloat_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_g001( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - resp = prom_result( - [ - { - "metric": { - "setting_name": "shared_buffers", - "setting_value": "128MB", - "category": "Memory", - "unit": "", - "context": "postmaster", - "vartype": "integer", - } - }, - { - "metric": { - "setting_name": "work_mem", - "setting_value": "4MB", - "category": "Memory", - "unit": "", - "context": "user", - "vartype": "integer", - } - }, - { - "metric": { - "setting_name": "maintenance_work_mem", - "setting_value": "64MB", - "category": "Memory", - "unit": "", - "context": "user", - "vartype": "integer", - } - }, - { - "metric": { - "setting_name": "effective_cache_size", - "setting_value": "4GB", - "category": "Memory", - "unit": "", - "context": "user", - "vartype": "integer", - } - }, - { - "metric": { - "setting_name": "max_connections", 
- "setting_value": "100", - "category": "Connections", - "unit": "", - "context": "postmaster", - "vartype": "integer", - } - }, - { - "metric": { - "setting_name": "wal_buffers", - "setting_value": "16MB", - "category": "WAL", - "unit": "", - "context": "postmaster", - "vartype": "integer", - } - } - ] - ) - monkeypatch.setattr(generator, "query_instant", lambda query: resp) - report = generator.generate_g001_memory_settings_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_h001( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["maindb"]) - monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_invalid": "CREATE INDEX idx_invalid ON public.tbl USING btree (col)"}) - responses = { - "pgwatch_db_size_size_b": prom_result([{"metric": {"datname": "maindb"}, "value": [0, "8192"]}]), - "pgwatch_pg_invalid_indexes": prom_result( - [ - { - "metric": { - "schema_name": "public", - "table_name": "tbl", - "index_name": "idx_invalid", - "relation_name": "public.tbl", - "supports_fk": "1", - }, - "value": [0, "2048"], - } - ] - ), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - report = generator.generate_h001_invalid_indexes_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_h002( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["app"]) - monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: 
{"idx_unused": "CREATE INDEX idx_unused ON t(c)"}) - - responses = { - "pgwatch_db_size_size_b": prom_result([{"metric": {"datname": "app"}, "value": [0, "8192"]}]), - "pgwatch_db_stats_postmaster_uptime_s": prom_result([{"value": [0, "3600"]}]), - "pgwatch_stats_reset_stats_reset_epoch": prom_result([{"value": [0, "1700000000"]}]), - "pgwatch_unused_indexes_index_size_bytes": prom_result( - [ - { - "metric": { - "schema_name": "public", - "table_name": "tbl", - "index_name": "idx_unused", - "reason": "never scanned", - "idx_is_btree": "true", - "supports_fk": "0", - }, - "value": [0, "1024"], - } - ] - ), - "pgwatch_unused_indexes_idx_scan": prom_result([{"value": [0, "0"]}]), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - report = generator.generate_h002_unused_indexes_report("local", "node-1") - validate_report(report) - - -@pytest.mark.unit -def test_schema_h004( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, - prom_result, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["app"]) - monkeypatch.setattr(generator, "get_index_definitions_from_sink", lambda db: {"idx_dup": "CREATE INDEX idx_dup ON t(c)"}) - - responses = { - "pgwatch_db_size_size_b": prom_result([{"metric": {"datname": "app"}, "value": [0, "8192"]}]), - "pgwatch_redundant_indexes_index_size_bytes": prom_result( - [ - { - "metric": { - "schema_name": "public", - "table_name": "tbl", - "index_name": "idx_dup", - "relation_name": "public.tbl", - "access_method": "btree", - "reason": "covers columns", - }, - "value": [0, "4096"], - } - ] - ), - "pgwatch_redundant_indexes_table_size_bytes": prom_result([{"value": [0, "8192"]}]), - "pgwatch_redundant_indexes_index_usage": prom_result([{"value": [0, "2"]}]), - "pgwatch_redundant_indexes_supports_fk": 
prom_result([{"value": [0, "1"]}]), - } - monkeypatch.setattr(generator, "query_instant", _query_stub_factory(prom_result, responses)) - - report = generator.generate_h004_redundant_indexes_report("local", "node-1") - validate_report(report) - - -def _sample_query_metric_row() -> dict[str, Any]: - # Must match _process_pgss_data() output keys for the current mapping used in _get_pgss_metrics_data_by_db(). - return { - "queryid": "123", - "database": "db1", - "user": "postgres", - "duration_seconds": 60.0, - "calls": 30.0, - "calls_per_sec": 0.5, - "calls_per_call": 1.0, - "total_time": 3000.0, - "total_time_per_sec": 50.0, - "total_time_per_call": 100.0, - "rows": 60.0, - "rows_per_sec": 1.0, - "rows_per_call": 2.0, - "shared_blks_hit": 10.0, - "shared_blks_hit_per_sec": 0.166, - "shared_blks_hit_per_call": 0.333, - "shared_blks_read": 0.0, - "shared_blks_read_per_sec": 0.0, - "shared_blks_read_per_call": 0.0, - "shared_blks_dirtied": 0.0, - "shared_blks_dirtied_per_sec": 0.0, - "shared_blks_dirtied_per_call": 0.0, - "shared_blks_written": 0.0, - "shared_blks_written_per_sec": 0.0, - "shared_blks_written_per_call": 0.0, - "blk_read_time": 0.0, - "blk_read_time_per_sec": 0.0, - "blk_read_time_per_call": 0.0, - "blk_write_time": 0.0, - "blk_write_time_per_sec": 0.0, - "blk_write_time_per_call": 0.0, - } - - -@pytest.mark.unit -def test_schema_k001( - monkeypatch: pytest.MonkeyPatch, - generator: PostgresReportGenerator, - fixed_pg_version, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr(generator, "_get_pgss_metrics_data_by_db", lambda *args, **kwargs: [_sample_query_metric_row()]) - - report = generator.generate_k001_query_calls_report("local", "node-1", time_range_minutes=60) - validate_report(report) - - -@pytest.mark.unit -def test_schema_k003( - monkeypatch: pytest.MonkeyPatch, - 
generator: PostgresReportGenerator, - fixed_pg_version, -) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr(generator, "_get_pgss_metrics_data_by_db", lambda *args, **kwargs: [_sample_query_metric_row()]) - - report = generator.generate_k003_top_queries_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_query_file() -> None: - payload = { - "cluster_id": "prod", - "query_id": "qid_1", - "query_text": "SELECT 1", - "nodes": {"primary": "main", "standbys": ["replica-1", "replica-2"]}, - "results": { - "main": { - "db1": {"metrics": {"calls": 1, "total_time": 2.5}}, - }, - "replica-1": { - "db1": {"metrics": {"calls": 0, "total_time": 0}}, - }, - }, - "time_range": {"hours": 24, "start_time": "2025-01-01T00:00:00+00:00", "end_time": "2025-01-02T00:00:00+00:00"}, - "timestamptz": "2025-01-02T00:00:00+00:00", - } - validate_query_file(payload) - - diff --git a/tests/reporter/test_report_schemas_hourly.py b/tests/reporter/test_report_schemas_hourly.py deleted file mode 100644 index 085c4fdd..00000000 --- a/tests/reporter/test_report_schemas_hourly.py +++ /dev/null @@ -1,186 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import pytest - -from reporter.postgres_reports import PostgresReportGenerator -from reporter.report_schemas import validate_report - - -@pytest.fixture(name="generator") -def fixture_generator() -> PostgresReportGenerator: - return PostgresReportGenerator( - prometheus_url="http://prom.test", - postgres_sink_url="", - ) - - -@pytest.fixture(name="fixed_pg_version") -def fixture_fixed_pg_version() -> dict[str, str]: - return { - "version": "15.3", - "server_version_num": "150003", - "server_major_ver": "15", - "server_minor_ver": "3", - } - - -def _stub_hourly_topk(metric_to_payload: 
dict[str, tuple[dict[str, list[float]], list[float], list[int]]]): - def _stub( - cluster: str, - node_name: str, - db_name: str, - metric_name: str = "pgwatch_pg_stat_statements_calls", - hours: int = 24, - step_s: int = 3600, - k: int = 3, - ): - _ = (cluster, node_name, db_name, hours, step_s, k) - if metric_name not in metric_to_payload: - raise AssertionError(f"Unexpected metric_name: {metric_name}") - return metric_to_payload[metric_name] - - return _stub - - -@pytest.mark.unit -def test_schema_k004(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk( - {"pgwatch_pg_stat_statements_temp_bytes_written": ({"1": [1.0]}, [0.0], [100])} - ), - ) - report = generator.generate_k004_temp_bytes_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_k005(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk({"pgwatch_pg_stat_statements_wal_bytes": ({"1": [1.0]}, [0.0], [100])}), - ) - report = generator.generate_k005_wal_bytes_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_k006(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - 
monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk( - {"pgwatch_pg_stat_statements_shared_bytes_read_total": ({"1": [1.0]}, [0.0], [100])} - ), - ) - report = generator.generate_k006_shared_read_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_k007(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk( - {"pgwatch_pg_stat_statements_shared_bytes_hit_total": ({"1": [1.0]}, [0.0], [100])} - ), - ) - report = generator.generate_k007_shared_hit_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_k008(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr(generator, "_get_hourly_topk_pgss_data_sum2", lambda *args, **kwargs: ({"1": [3.0]}, [0.0], [100])) - report = generator.generate_k008_shared_hit_read_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_m001(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - 
generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk( - { - "pgwatch_pg_stat_statements_exec_time_total": ({"1": [10.0]}, [0.0], [100]), - "pgwatch_pg_stat_statements_calls": ({"1": [1.0]}, [0.0], [100]), - } - ), - ) - report = generator.generate_m001_mean_time_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_m002(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk({"pgwatch_pg_stat_statements_rows": ({"1": [10.0]}, [0.0], [100])}), - ) - report = generator.generate_m002_rows_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_m003(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, "get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr( - generator, - "_get_hourly_topk_pgss_data", - _stub_hourly_topk( - { - "pgwatch_pg_stat_statements_block_read_total": ({"1": [10.0]}, [0.0], [100]), - "pgwatch_pg_stat_statements_block_write_total": ({"1": [5.0]}, [0.0], [100]), - } - ), - ) - report = generator.generate_m003_io_time_report("local", "node-1", time_range_minutes=60, limit=50) - validate_report(report) - - -@pytest.mark.unit -def test_schema_n001(monkeypatch: pytest.MonkeyPatch, generator: PostgresReportGenerator, fixed_pg_version) -> None: - monkeypatch.setattr(generator, "_get_postgres_version_info", lambda *args, **kwargs: fixed_pg_version) - monkeypatch.setattr(generator, 
"get_all_databases", lambda *args, **kwargs: ["db1"]) - monkeypatch.setattr(generator, "_floor_hour", lambda *_: 7200) - - def fake_query_range(_query: str, start, end, step: str = "3600s") -> list[dict[str, Any]]: - _ = (start, end, step) - return [ - { - "metric": { - "wait_event_type": "IO", - "wait_event": "DataFileRead", - "query_id": "123", - }, - "values": [[0, "1"], [3600, "2"], [7200, "0"]], - } - ] - - monkeypatch.setattr(generator, "query_range", fake_query_range) - report = generator.generate_n001_wait_events_report("local", "node-1", hours=3) - validate_report(report) - - diff --git a/tests/reporter/test_upload_report_file_unit.py b/tests/reporter/test_upload_report_file_unit.py deleted file mode 100644 index 19b183c9..00000000 --- a/tests/reporter/test_upload_report_file_unit.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import annotations - -import json -from typing import Any - -import pytest - -from reporter import postgres_reports as postgres_reports_module -from reporter.postgres_reports import PostgresReportGenerator - - -@pytest.mark.unit -def test_upload_report_file_extracts_check_id_from_json(tmp_path: Any, monkeypatch: pytest.MonkeyPatch) -> None: - generator = PostgresReportGenerator(prometheus_url="http://prom.test", postgres_sink_url="") - - report_path = tmp_path / "cluster_A002.json" - report_payload = { - "checkId": "A002", - "checkTitle": "Postgres major version", - "timestamptz": "2025-01-01T00:00:00+00:00", - "nodes": {"primary": "node-1", "standbys": []}, - "results": {"node-1": {"data": {}}}, - } - report_path.write_text(json.dumps(report_payload), encoding="utf-8") - - captured: dict[str, Any] = {} - - def fake_make_request(_api_url: str, _endpoint: str, request_data: dict[str, Any]) -> dict[str, Any]: - captured["request_data"] = request_data - return {} - - monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) - - generator.upload_report_file("http://api.test", "tok", 123, str(report_path)) - - 
req = captured["request_data"] - assert req["check_id"] == "A002" - assert req["generate_issue"] is True - - -@pytest.mark.unit -def test_upload_report_file_query_json_has_no_check_id(tmp_path: Any, monkeypatch: pytest.MonkeyPatch) -> None: - generator = PostgresReportGenerator(prometheus_url="http://prom.test", postgres_sink_url="") - - query_path = tmp_path / "prod_query_123.json" - query_payload = { - "cluster_id": "prod", - "query_id": "123", - "query_text": "select 1", - "nodes": {"primary": "main", "standbys": ["replica-1"]}, - "results": {"main": {"db1": {"metrics": {"calls": 1}}}}, - "timestamptz": "2025-01-01T00:00:00+00:00", - } - query_path.write_text(json.dumps(query_payload), encoding="utf-8") - - captured: dict[str, Any] = {} - - def fake_make_request(_api_url: str, _endpoint: str, request_data: dict[str, Any]) -> dict[str, Any]: - captured["request_data"] = request_data - return {} - - monkeypatch.setattr(postgres_reports_module, "make_request", fake_make_request) - - generator.upload_report_file("http://api.test", "tok", 123, str(query_path)) - - req = captured["request_data"] - assert req["check_id"] == "" - assert req["generate_issue"] is False - -