[Hugging Face TGI] Add monitors (DataDog#21355)

dkirov-dd · web-flow · commit ffa57b9474b3 · 2025-09-17T08:08:25.000Z
* Add monitors

* Fix integration tag
diff --git a/hugging_face_tgi/assets/monitors/queue_size_high.json b/hugging_face_tgi/assets/monitors/queue_size_high.json
@@ -0,0 +1,34 @@
+{
+	"version": 2,
+	"created_at": "2025-09-16",
+	"last_updated_at": "2025-09-16",
+	"title": "High queue size",
+	"description": "This monitor tracks the number of requests waiting in the TGI queue. A high queue size indicates that requests are arriving faster than they can be processed, which can lead to increased latency and potential timeouts.",
+	"definition": {
+		"name": "[Hugging Face TGI] High queue size",
+		"type": "query alert",
+		"query": "avg(last_5m):avg:hugging_face_tgi.queue.size{*} > 50",
+		"message": "Hugging Face TGI queue size is high (>50 requests).\n\nThis indicates:\n* Requests arriving faster than processing capacity\n* Potential for increased latency and timeouts\n* Need for scaling or optimization\n\n{{#is_alert}}\nCurrent queue size: {{value}} requests\n{{/is_alert}}\n\nActions:\n1. Consider scaling TGI instances\n2. Review batch size configuration\n3. Check for inefficient request patterns\n4. Monitor resource utilization",
+		"tags": [
+			"integration:hugging-face-tgi"
+		],
+		"options": {
+			"thresholds": {
+				"critical": 50,
+				"warning": 30
+			},
+			"notify_audit": false,
+			"require_full_window": false,
+			"renotify_interval": 60,
+			"include_tags": true,
+			"evaluation_delay": 60,
+			"escalation_message": "",
+			"on_missing_data": "default",
+			"new_host_delay": 300
+		},
+		"priority": 2
+	},
+	"tags": [
+		"integration:hugging-face-tgi"
+	]
+}
diff --git a/hugging_face_tgi/assets/monitors/request_latency_high.json b/hugging_face_tgi/assets/monitors/request_latency_high.json
@@ -0,0 +1,34 @@
+{
+	"version": 2,
+	"created_at": "2025-09-16",
+	"last_updated_at": "2025-09-16",
+	"title": "High request latency",
+	"description": "This monitor tracks the average request duration for Hugging Face TGI. High latency can indicate performance bottlenecks such as model inference issues, resource contention, or inefficient batch processing.",
+	"definition": {
+		"name": "[Hugging Face TGI] High request latency",
+		"type": "query alert",
+		"query": "sum(last_5m):sum:hugging_face_tgi.request.duration.sum{*}.as_count() / sum:hugging_face_tgi.request.duration.count{*}.as_count() > 10",
+		"message": "Hugging Face TGI request latency is high (>10s average).\n\nThis could indicate:\n* Model inference bottlenecks\n* Resource contention (CPU/GPU/memory)\n* Inefficient batch processing\n* Queue buildup\n\n{{#is_alert}}\nAverage latency: {{value}}s\n{{/is_alert}}\n\nCheck:\n1. TGI server resource utilization\n2. Queue size and batch processing metrics\n3. Model performance and configuration",
+		"tags": [
+			"integration:hugging-face-tgi"
+		],
+		"options": {
+			"thresholds": {
+				"critical": 10,
+				"warning": 5
+			},
+			"notify_audit": false,
+			"require_full_window": false,
+			"renotify_interval": 60,
+			"include_tags": true,
+			"evaluation_delay": 300,
+			"escalation_message": "",
+			"on_missing_data": "show_and_notify_no_data",
+			"new_host_delay": 300
+		},
+		"priority": 2
+	},
+	"tags": [
+		"integration:hugging-face-tgi"
+	]
+}
diff --git a/hugging_face_tgi/assets/monitors/token_generation_slow.json b/hugging_face_tgi/assets/monitors/token_generation_slow.json
@@ -0,0 +1,34 @@
+{
+	"version": 2,
+	"created_at": "2025-09-16",
+	"last_updated_at": "2025-09-16",
+	"title": "Slow token generation",
+	"description": "This monitor tracks the mean time per token generation for Hugging Face TGI. Slow token generation indicates model inference performance issues, which directly impacts user experience and throughput.",
+	"definition": {
+		"name": "[Hugging Face TGI] Slow token generation",
+		"type": "query alert",
+		"query": "sum(last_5m):sum:hugging_face_tgi.request.mean_time_per_token.duration.sum{*}.as_count() / sum:hugging_face_tgi.request.mean_time_per_token.duration.count{*}.as_count() > 0.2",
+		"message": "Hugging Face TGI token generation is slow (>200ms per token).\n\nThis indicates:\n* Model inference performance degradation\n* Resource constraints (GPU memory/compute)\n* Inefficient model configuration or parameters\n\n{{#is_alert}}\nMean time per token: {{value}}s\n{{/is_alert}}\n\nInvestigate:\n1. GPU utilization and memory usage\n2. Model configuration and quantization settings\n3. Batch size optimization\n4. Temperature and sampling parameters",
+		"tags": [
+			"integration:hugging-face-tgi"
+		],
+		"options": {
+			"thresholds": {
+				"critical": 0.2,
+				"warning": 0.1
+			},
+			"notify_audit": false,
+			"require_full_window": false,
+			"renotify_interval": 60,
+			"include_tags": true,
+			"evaluation_delay": 300,
+			"escalation_message": "",
+			"on_missing_data": "show_and_notify_no_data",
+			"new_host_delay": 300
+		},
+		"priority": 2
+	},
+	"tags": [
+		"integration:hugging-face-tgi"
+	]
+}
diff --git a/hugging_face_tgi/manifest.json b/hugging_face_tgi/manifest.json
@@ -41,7 +41,11 @@
 	    "text-generation-router"
       ]
     },
-    "monitors": {},
+    "monitors": {
+      "High request latency": "assets/monitors/request_latency_high.json",
+      "High queue size": "assets/monitors/queue_size_high.json",
+      "Slow token generation": "assets/monitors/token_generation_slow.json"
+    },
     "saved_views": {}
   },
   "author": {