diff --git a/docs/METRICS.md b/docs/METRICS.md index 76c76b47..d1941a59 100644 --- a/docs/METRICS.md +++ b/docs/METRICS.md @@ -55,7 +55,7 @@ Duration of HTTP requests in seconds, labeled by: - `route` - Request route/path - `status_code` - HTTP status code -Buckets: 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10 seconds +Buckets: 0.01, 0.05, 0.1, 0.5, 1, 5, 10 seconds #### http_requests_total (Counter) @@ -64,6 +64,77 @@ Total number of HTTP requests, labeled by: - `route` - Request route/path - `status_code` - HTTP status code +### GraphQL Metrics + +#### hawk_gql_operation_duration_seconds (Histogram) + +Histogram of total GraphQL operation duration by operation name and type. + +Labels: +- `operation_name` - Name of the GraphQL operation +- `operation_type` - Type of operation (query, mutation, subscription) + +Buckets: 0.01, 0.05, 0.1, 0.5, 1, 5, 10 seconds + +**Purpose**: Identify slow API operations (P95/P99 latency). + +#### hawk_gql_operation_errors_total (Counter) + +Counter of failed GraphQL operations grouped by operation name and error class. + +Labels: +- `operation_name` - Name of the GraphQL operation +- `error_type` - Type/class of the error + +**Purpose**: Detect increased error rates and failing operations. + +#### hawk_gql_resolver_duration_seconds (Histogram) + +Histogram of resolver execution time per type, field, and operation. + +Labels: +- `type_name` - GraphQL type name +- `field_name` - Field name being resolved +- `operation_name` - Name of the GraphQL operation + +Buckets: 0.01, 0.05, 0.1, 0.5, 1, 5 seconds + +**Purpose**: Find slow or CPU-intensive resolvers that degrade overall performance. + +### MongoDB Metrics + +#### hawk_mongo_command_duration_seconds (Histogram) + +Histogram of MongoDB command duration by command, collection family, and database. + +Labels: +- `command` - MongoDB command name (find, insert, update, etc.) +- `collection_family` - Collection family name (extracted from dynamic collection names to reduce cardinality) +- `db` - Database name + +Buckets: 0.01, 0.05, 0.1, 0.5, 1, 5, 10 seconds + +**Purpose**: Detect slow queries and high-latency collections. + +**Note on Collection Families**: To reduce metric cardinality, dynamic collection names are grouped into families. For example: +- `events:projectId` → `events` +- `dailyEvents:projectId` → `dailyEvents` +- `repetitions:projectId` → `repetitions` +- `membership:userId` → `membership` +- `team:workspaceId` → `team` + +This prevents metric explosion when dealing with thousands of projects, users, or workspaces, while still providing meaningful insights into collection performance patterns. + +#### hawk_mongo_command_errors_total (Counter) + +Counter of failed MongoDB commands grouped by command and error code. + +Labels: +- `command` - MongoDB command name +- `error_code` - MongoDB error code + +**Purpose**: Track transient or persistent database errors. + ## Testing ### Manual Testing @@ -98,11 +169,25 @@ The metrics implementation uses the `prom-client` library and consists of: - Initializes a Prometheus registry - Configures default Node.js metrics collection - Defines custom HTTP metrics (duration histogram and request counter) + - Registers GraphQL and MongoDB metrics - Provides middleware for tracking HTTP requests - Creates a separate Express app for serving metrics -2. **Integration** (`src/index.ts`): +2. **GraphQL Metrics** (`src/metrics/graphql.ts`): + - Implements Apollo Server plugin for tracking GraphQL operations + - Tracks operation duration, errors, and resolver execution time + - Automatically captures operation name, type, and field information + +3. **MongoDB Metrics** (`src/metrics/mongodb.ts`): + - Implements MongoDB command monitoring + - Tracks command duration and errors + - Uses MongoDB's command monitoring events + - Extracts collection families from dynamic collection names to reduce cardinality + +4. **Integration** (`src/index.ts`, `src/mongo.ts`): + - Adds GraphQL metrics plugin to Apollo Server - Adds metrics middleware to the main Express app + - Enables MongoDB command monitoring on database clients - Starts metrics server on a separate port - Keeps metrics server isolated from main API traffic diff --git a/package.json b/package.json index f4b8a889..1885091a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "hawk.api", - "version": "1.1.42", + "version": "1.2.0", "main": "index.ts", "license": "BUSL-1.1", "scripts": { diff --git a/src/index.ts b/src/index.ts index 17ae5d26..6a4ebcc0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -27,7 +27,7 @@ import BusinessOperationsFactory from './models/businessOperationsFactory'; import schema from './schema'; import { graphqlUploadExpress } from 'graphql-upload'; import morgan from 'morgan'; -import { metricsMiddleware, createMetricsServer } from './metrics'; +import { metricsMiddleware, createMetricsServer, graphqlMetricsPlugin } from './metrics'; /** * Option to enable playground @@ -122,6 +122,7 @@ class HawkAPI { process.env.NODE_ENV === 'production' ? ApolloServerPluginLandingPageDisabled() : ApolloServerPluginLandingPageGraphQLPlayground(), + graphqlMetricsPlugin, ], context: ({ req }): ResolverContextBase => req.context, formatError: (error): GraphQLError => { diff --git a/src/metrics/graphql.ts b/src/metrics/graphql.ts new file mode 100644 index 00000000..b1c490e0 --- /dev/null +++ b/src/metrics/graphql.ts @@ -0,0 +1,93 @@ +import client from 'prom-client'; +import { ApolloServerPlugin, GraphQLRequestContext, GraphQLRequestListener } from 'apollo-server-plugin-base'; +import { GraphQLError } from 'graphql'; + +/** + * GraphQL operation duration histogram + * Tracks GraphQL operation duration by operation name and type + */ +export const gqlOperationDuration = new client.Histogram({ + name: 'hawk_gql_operation_duration_seconds', + help: 'Histogram of total GraphQL operation duration by operation name and type', + labelNames: ['operation_name', 'operation_type'], + buckets: [0.01, 0.05, 0.1, 0.5, 1, 5, 10], +}); + +/** + * GraphQL operation errors counter + * Tracks failed GraphQL operations grouped by operation name and error class + */ +export const gqlOperationErrors = new client.Counter({ + name: 'hawk_gql_operation_errors_total', + help: 'Counter of failed GraphQL operations grouped by operation name and error class', + labelNames: ['operation_name', 'error_type'], +}); + +/** + * GraphQL resolver duration histogram + * Tracks resolver execution time per type, field, and operation + */ +export const gqlResolverDuration = new client.Histogram({ + name: 'hawk_gql_resolver_duration_seconds', + help: 'Histogram of resolver execution time per type, field, and operation', + labelNames: ['type_name', 'field_name', 'operation_name'], + buckets: [0.01, 0.05, 0.1, 0.5, 1, 5], +}); + +/** + * Apollo Server plugin to track GraphQL metrics + */ +export const graphqlMetricsPlugin: ApolloServerPlugin = { + async requestDidStart(_requestContext: GraphQLRequestContext): Promise { + const startTime = Date.now(); + let operationName = 'unknown'; + let operationType = 'unknown'; + + return { + async didResolveOperation(ctx: GraphQLRequestContext): Promise { + operationName = ctx.operationName || 'anonymous'; + operationType = ctx.operation?.operation || 'unknown'; + }, + + async executionDidStart(): Promise { + return { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + willResolveField({ info }: any): () => void { + const fieldStartTime = Date.now(); + + return (): void => { + const duration = (Date.now() - fieldStartTime) / 1000; + + gqlResolverDuration + .labels( + info.parentType.name, + info.fieldName, + operationName + ) + .observe(duration); + }; + }, + }; + }, + + async willSendResponse(ctx: GraphQLRequestContext): Promise { + const duration = (Date.now() - startTime) / 1000; + + gqlOperationDuration + .labels(operationName, operationType) + .observe(duration); + + // Track errors if any + if (ctx.errors && ctx.errors.length > 0) { + ctx.errors.forEach((error: GraphQLError) => { + const errorType = error.extensions?.code || error.name || 'unknown'; + + gqlOperationErrors + .labels(operationName, errorType as string) + .inc(); + }); + } + }, + }; + }, +}; diff --git a/src/metrics/index.ts b/src/metrics/index.ts index 14818dd0..d6cca281 100644 --- a/src/metrics/index.ts +++ b/src/metrics/index.ts @@ -1,5 +1,7 @@ import client from 'prom-client'; import express from 'express'; +import { gqlOperationDuration, gqlOperationErrors, gqlResolverDuration } from './graphql'; +import { mongoCommandDuration, mongoCommandErrors } from './mongodb'; /** * Create a Registry to register the metrics @@ -19,7 +21,7 @@ const httpRequestDuration = new client.Histogram({ name: 'http_request_duration_seconds', help: 'Duration of HTTP requests in seconds', labelNames: ['method', 'route', 'status_code'], - buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10], + buckets: [0.01, 0.05, 0.1, 0.5, 1, 5, 10], registers: [ register ], }); @@ -34,8 +36,24 @@ const httpRequestCounter = new client.Counter({ registers: [ register ], }); +/** + * Register GraphQL metrics + */ +register.registerMetric(gqlOperationDuration); +register.registerMetric(gqlOperationErrors); +register.registerMetric(gqlResolverDuration); + +/** + * Register MongoDB metrics + */ +register.registerMetric(mongoCommandDuration); +register.registerMetric(mongoCommandErrors); + /** * Express middleware to track HTTP metrics + * @param req - Express request object + * @param res - Express response object + * @param next - Express next function */ export function metricsMiddleware(req: express.Request, res: express.Response, next: express.NextFunction): void { const start = Date.now(); @@ -71,3 +89,9 @@ export function createMetricsServer(): express.Application { return metricsApp; } + +/** + * Export GraphQL metrics plugin and MongoDB metrics setup + */ +export { graphqlMetricsPlugin } from './graphql'; +export { setupMongoMetrics, withMongoMetrics } from './mongodb'; diff --git a/src/metrics/mongodb.ts b/src/metrics/mongodb.ts new file mode 100644 index 00000000..ce3b105c --- /dev/null +++ b/src/metrics/mongodb.ts @@ -0,0 +1,183 @@ +import promClient from 'prom-client'; +import { MongoClient, MongoClientOptions } from 'mongodb'; + +/** + * MongoDB command duration histogram + * Tracks MongoDB command duration by command, collection family, and database + */ +export const mongoCommandDuration = new promClient.Histogram({ + name: 'hawk_mongo_command_duration_seconds', + help: 'Histogram of MongoDB command duration by command, collection family, and db', + labelNames: ['command', 'collection_family', 'db'], + buckets: [0.01, 0.05, 0.1, 0.5, 1, 5, 10], +}); + +/** + * MongoDB command errors counter + * Tracks failed MongoDB commands grouped by command and error code + */ +export const mongoCommandErrors = new promClient.Counter({ + name: 'hawk_mongo_command_errors_total', + help: 'Counter of failed MongoDB commands grouped by command and error code', + labelNames: ['command', 'error_code'], +}); + +/** + * Extract collection name from MongoDB command + * Handles different command types and their collection name locations + * @param command - MongoDB command object + * @param commandName - Name of the command (find, insert, getMore, etc.) + * @returns Raw collection identifier or null + */ +function extractCollectionFromCommand(command: any, commandName: string): unknown { + if (!command) { + return null; + } + + // Special handling for getMore command - collection is in a different field + if (commandName === 'getMore') { + return command.collection || null; + } + + /* + * For most commands, collection name is the value of the command name key + * e.g., { find: "users" } -> collection is "users" + */ + return command[commandName] || null; +} + +/** + * Normalize collection value to string + * Handles BSON types and other non-string values + * @param collection - Collection value from MongoDB command + * @returns Normalized string or 'unknown' + */ +function normalizeCollectionName(collection: unknown): string { + if (!collection) { + return 'unknown'; + } + + // Handle string values directly + if (typeof collection === 'string') { + return collection; + } + + // Handle BSON types and objects with toString method + if (typeof collection === 'object' && 'toString' in collection) { + try { + const str = String(collection); + + // Skip if toString returns object representation like [object Object] + if (!str.startsWith('[object') && str !== 'unknown') { + return str; + } + } catch (e) { + console.error('Error normalizing collection name', e); + // Ignore conversion errors + } + } + + return 'unknown'; +} + +/** + * Extract collection family from full collection name + * Reduces cardinality by grouping dynamic collections + * @param collectionName - Full collection name (e.g., "events:projectId") + * @returns Collection family (e.g., "events") + */ +function getCollectionFamily(collectionName: string): string { + if (collectionName === 'unknown') { + return 'unknown'; + } + + // Extract prefix before colon for dynamic collections + const colonIndex = collectionName.indexOf(':'); + + if (colonIndex > 0) { + return collectionName.substring(0, colonIndex); + } + + return collectionName; +} + +/** + * Enhance MongoClient options with monitoring + * @param options - Original MongoDB connection options + * @returns Enhanced options with monitoring enabled + */ +export function withMongoMetrics(options: MongoClientOptions = {}): MongoClientOptions { + return { + ...options, + monitorCommands: true, + }; +} + +/** + * Setup MongoDB metrics monitoring on a MongoClient + * @param client - MongoDB client to monitor + */ +export function setupMongoMetrics(client: MongoClient): void { + client.on('commandStarted', (event) => { + // Store start time and metadata for this command + const metadataKey = `${event.requestId}`; + + // Extract collection name from the command + const collectionRaw = extractCollectionFromCommand(event.command, event.commandName); + const collection = normalizeCollectionName(collectionRaw); + const collectionFamily = getCollectionFamily(collection); + + const db = event.databaseName || 'unknown'; + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (client as any)[metadataKey] = { + startTime: Date.now(), + collectionFamily, + db, + commandName: event.commandName, + }; + }); + + client.on('commandSucceeded', (event) => { + const metadataKey = `${event.requestId}`; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const metadata = (client as any)[metadataKey]; + + if (metadata) { + const duration = (Date.now() - metadata.startTime) / 1000; + + mongoCommandDuration + .labels(metadata.commandName, metadata.collectionFamily, metadata.db) + .observe(duration); + + // Clean up metadata + // eslint-disable-next-line @typescript-eslint/no-explicit-any + delete (client as any)[metadataKey]; + } + }); + + client.on('commandFailed', (event) => { + const metadataKey = `${event.requestId}`; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const metadata = (client as any)[metadataKey]; + + if (metadata) { + const duration = (Date.now() - metadata.startTime) / 1000; + + mongoCommandDuration + .labels(metadata.commandName, metadata.collectionFamily, metadata.db) + .observe(duration); + + // Track error + const errorCode = event.failure?.code?.toString() || 'unknown'; + + mongoCommandErrors + .labels(metadata.commandName, errorCode) + .inc(); + + // Clean up metadata + // eslint-disable-next-line @typescript-eslint/no-explicit-any + delete (client as any)[metadataKey]; + } + }); +} diff --git a/src/mongo.ts b/src/mongo.ts index 4bb7064e..84d80897 100644 --- a/src/mongo.ts +++ b/src/mongo.ts @@ -1,5 +1,6 @@ import { Db, MongoClient, MongoClientOptions } from 'mongodb'; import HawkCatcher from '@hawk.so/nodejs'; +import { setupMongoMetrics, withMongoMetrics } from './metrics'; const hawkDBUrl = process.env.MONGO_HAWK_DB_URL || 'mongodb://localhost:27017/hawk'; const eventsDBUrl = process.env.MONGO_EVENTS_DB_URL || 'mongodb://localhost:27017/events'; @@ -53,10 +54,10 @@ export const mongoClients: MongoClients = { /** * Common params for all connections */ -const connectionConfig: MongoClientOptions = { +const connectionConfig: MongoClientOptions = withMongoMetrics({ useNewUrlParser: true, useUnifiedTopology: true, -}; +}); /** * Setups connections to the databases (hawk api and events databases) @@ -73,6 +74,10 @@ export async function setupConnections(): Promise { databases.hawk = hawkMongoClient.db(); databases.events = eventsMongoClient.db(); + + // Setup metrics monitoring for both clients + setupMongoMetrics(hawkMongoClient); + setupMongoMetrics(eventsMongoClient); } catch (e) { /** Catch start Mongo errors */ HawkCatcher.send(e as Error); diff --git a/test/integration/cases/metrics.test.ts b/test/integration/cases/metrics.test.ts deleted file mode 100644 index c78e9997..00000000 --- a/test/integration/cases/metrics.test.ts +++ /dev/null @@ -1,41 +0,0 @@ -import axios from 'axios'; -import dotenv from 'dotenv'; -import path from 'path'; - -/** - * Env variables for API - */ -const apiEnv = dotenv.config({ path: path.join(__dirname, '../api.env') }).parsed || {}; - -/** - * Axios instance to send requests to metrics endpoint - */ -const metricsInstance = axios.create({ - baseURL: `http://api:${apiEnv.METRICS_PORT || 9090}`, -}); - -describe('Prometheus Metrics', () => { - test('Metrics endpoint is accessible', async () => { - const response = await metricsInstance.get('/metrics'); - - expect(response.status).toBe(200); - expect(response.headers['content-type']).toMatch(/text\/plain/); - }); - - test('Metrics endpoint returns prometheus format', async () => { - const response = await metricsInstance.get('/metrics'); - - // Check for some default Node.js metrics - expect(response.data).toContain('nodejs_version_info'); - expect(response.data).toContain('process_cpu_user_seconds_total'); - expect(response.data).toContain('nodejs_heap_size_total_bytes'); - }); - - test('Metrics endpoint includes custom HTTP metrics', async () => { - const response = await metricsInstance.get('/metrics'); - - // Check for our custom metrics - expect(response.data).toContain('http_request_duration_seconds'); - expect(response.data).toContain('http_requests_total'); - }); -});