ConnectionMaster · pull · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025
diff --git a/.builders/scripts/build_wheels.py b/.builders/scripts/build_wheels.py
@@ -112,10 +112,14 @@ def main():
             str(python_path), '-m', 'pip', 'wheel',
             '-r', str(MOUNT_DIR / 'requirements.in'),
             '--wheel-dir', str(staged_wheel_dir),
-            '--extra-index-url', CUSTOM_EXTERNAL_INDEX,
+            # Temporarily removing extra index urls. See below.
+            # '--extra-index-url', CUSTOM_EXTERNAL_INDEX,
         ]
-        if args.use_built_index:
-            command_args.extend(['--extra-index-url', CUSTOM_BUILT_INDEX])
+        # Temporarily disable extra index urls. There are broken wheels in the gcloud bucket
+        # while working on removing tests from them. Adding extra indices causes undefined behavior
+        # and can pull a broken image, preventing the build from running.
+        # if args.use_built_index:
+        #     command_args.extend(['--extra-index-url', CUSTOM_BUILT_INDEX])
 
         check_process(command_args, env=env_vars)
 

diff --git a/.codecov.yml b/.codecov.yml
@@ -286,6 +286,10 @@ coverage:
         target: 75
         flags:
         - hazelcast
+      Hugging_Face_TGI:
+        target: 75
+        flags:
+        - hugging_face_tgi
       IBM_ACE:
         target: 75
         flags:
@@ -1183,6 +1187,11 @@ flags:
     paths:
     - http_check/datadog_checks/http_check
     - http_check/tests
+  hugging_face_tgi:
+    carryforward: true
+    paths:
+    - hugging_face_tgi/datadog_checks/hugging_face_tgi
+    - hugging_face_tgi/tests
   ibm_ace:
     carryforward: true
     paths:

diff --git a/.github/workflows/config/labeler.yml b/.github/workflows/config/labeler.yml
@@ -299,6 +299,8 @@ integration/hubspot_content_hub:
 - hubspot_content_hub/**/*
 integration/hudi:
 - hudi/**/*
+integration/hugging_face_tgi:
+- hugging_face_tgi/**/*
 integration/hyperv:
 - hyperv/**/*
 integration/iam_access_analyzer:

diff --git a/.github/workflows/test-all.yml b/.github/workflows/test-all.yml
@@ -1522,6 +1522,25 @@ jobs:
       minimum-base-package: ${{ inputs.minimum-base-package }}
       pytest-args: ${{ inputs.pytest-args }}
     secrets: inherit
+  jc3781e1:
+    uses: ./.github/workflows/test-target.yml
+    with:
+      job-name: Hugging Face TGI
+      target: hugging_face_tgi
+      platform: linux
+      runner: '["ubuntu-22.04"]'
+      repo: "${{ inputs.repo }}"
+      python-version: "${{ inputs.python-version }}"
+      latest: ${{ inputs.latest }}
+      agent-image: "${{ inputs.agent-image }}"
+      agent-image-py2: "${{ inputs.agent-image-py2 }}"
+      agent-image-windows: "${{ inputs.agent-image-windows }}"
+      agent-image-windows-py2: "${{ inputs.agent-image-windows-py2 }}"
+      test-py2: ${{ inputs.test-py2 }}
+      test-py3: ${{ inputs.test-py3 }}
+      minimum-base-package: ${{ inputs.minimum-base-package }}
+      pytest-args: ${{ inputs.pytest-args }}
+    secrets: inherit
   j5a9585a:
     uses: ./.github/workflows/test-target.yml
     with:

diff --git a/hugging_face_tgi/CHANGELOG.md b/hugging_face_tgi/CHANGELOG.md
@@ -0,0 +1,4 @@
+# CHANGELOG - Hugging Face TGI
+
+<!-- towncrier release notes start -->
+
diff --git a/hugging_face_tgi/README.md b/hugging_face_tgi/README.md
@@ -0,0 +1,82 @@
+# Agent Check: Hugging Face TGI
+
+## Overview
+
+This check monitors [Hugging Face Text Generation Inference (TGI)][1] through the Datadog Agent. TGI is a toolkit for deploying and serving Large Language Models (LLMs). It is optimized for text generation and provides features such as continuous batching, tensor parallelism, and token streaming, along with other production-ready optimizations.
+
+The integration provides comprehensive monitoring of your TGI servers by collecting:
+- Request performance metrics including latency, throughput, and token generation rates
+- Batch processing metrics for inference optimization insights
+- Queue depth and request flow monitoring
+- Model serving health and operational metrics
+
+This enables teams to optimize LLM inference performance, track resource utilization, troubleshoot bottlenecks, and ensure reliable model serving at scale.
+
+## Setup
+
+Follow the instructions below to install and configure this check for an Agent running on a host. For containerized environments, see the [Autodiscovery Integration Templates][3] for guidance on applying these instructions.
+
+### Installation
+
+The Hugging Face TGI check is included in the [Datadog Agent][2] package.
+No additional installation is needed on your server.
+
+### Configuration
+
+#### Metrics
+
+1. Ensure that your TGI server is exposing Prometheus metrics on the default metrics endpoint. TGI automatically exposes metrics at the `/metrics` endpoint when running. For more information about TGI monitoring, see the [official documentation][10].
+
+2. Edit the `hugging_face_tgi.d/conf.yaml` file, which is located in the `conf.d/` folder at the root of your [Agent's configuration directory][11], to start collecting your Hugging Face TGI performance data. See the [sample hugging_face_tgi.d/conf.yaml][4] for all available configuration options.
+
+   ```yaml
+   instances:
+     - openmetrics_endpoint: http://localhost:80/metrics
+   ```
+
+3. [Restart the Agent][5].
+
+### Validation
+
+[Run the Agent's status subcommand][6] and look for `hugging_face_tgi` under the Checks section.
+
+## Data Collected
+
+### Metrics
+
+See [metadata.csv][7] for a list of metrics provided by this integration.
+
+Key metrics include:
+
+- **Request metrics**: Total requests, successful requests, failed requests, and request duration
+- **Queue metrics**: Queue size and queue duration for monitoring throughput bottlenecks
+- **Token metrics**: Generated tokens, input length, and mean time per token for performance analysis
+- **Batch metrics**: Batch size, batch concatenation, and batch processing durations for optimization insights
+- **Inference metrics**: Forward pass duration, decode duration, and filter duration for model performance monitoring
+
+### Events
+
+The Hugging Face TGI integration does not include any events.
+
+### Service Checks
+
+See [service_checks.json][8] for a list of service checks provided by this integration.
+
+## Troubleshooting
+
+In containerized environments, ensure that the Agent has network access to the TGI metrics endpoint specified in the `hugging_face_tgi.d/conf.yaml` file.
+
+Need help? Contact [Datadog support][9].
+
+
+[1]: https://huggingface.co/docs/text-generation-inference/index
+[2]: /account/settings/agent/latest
+[3]: https://docs.datadoghq.com/agent/kubernetes/integrations/
+[4]: https://github.com/DataDog/integrations-core/blob/master/hugging_face_tgi/datadog_checks/hugging_face_tgi/data/conf.yaml.example
+[5]: https://docs.datadoghq.com/agent/guide/agent-commands/#start-stop-and-restart-the-agent
+[6]: https://docs.datadoghq.com/agent/guide/agent-commands/#agent-status-and-information
+[7]: https://github.com/DataDog/integrations-core/blob/master/hugging_face_tgi/metadata.csv
+[8]: https://github.com/DataDog/integrations-core/blob/master/hugging_face_tgi/assets/service_checks.json
+[9]: https://docs.datadoghq.com/help/
+[10]: https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/monitoring
+[11]: https://docs.datadoghq.com/agent/configuration/agent-configuration-files/#agent-configuration-directory
diff --git a/hugging_face_tgi/assets/configuration/spec.yaml b/hugging_face_tgi/assets/configuration/spec.yaml
@@ -0,0 +1,15 @@
+name: Hugging Face TGI
+files:
+- name: hugging_face_tgi.yaml
+  options:
+    - template: init_config
+      options:
+        - template: init_config/openmetrics
+    - template: instances
+      options:
+        - template: instances/openmetrics
+          overrides:
+            openmetrics_endpoint.value.example: http://localhost:80/metrics
+            openmetrics_endpoint.description: |
+              Endpoint exposing Hugging Face TGI's Prometheus metrics. For more information, refer to
+              https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/monitoring
diff --git a/hugging_face_tgi/changelog.d/20905.added b/hugging_face_tgi/changelog.d/20905.added
@@ -0,0 +1 @@
+Initial Release
diff --git a/hugging_face_tgi/datadog_checks/hugging_face_tgi/__about__.py b/hugging_face_tgi/datadog_checks/hugging_face_tgi/__about__.py
@@ -0,0 +1,4 @@
+# (C) Datadog, Inc. 2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+__version__ = '0.0.1'
diff --git a/hugging_face_tgi/datadog_checks/hugging_face_tgi/__init__.py b/hugging_face_tgi/datadog_checks/hugging_face_tgi/__init__.py
@@ -0,0 +1,7 @@
+# (C) Datadog, Inc. 2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from .__about__ import __version__
+from .check import HuggingFaceTgiCheck
+
+__all__ = ['__version__', 'HuggingFaceTgiCheck']
diff --git a/hugging_face_tgi/datadog_checks/hugging_face_tgi/check.py b/hugging_face_tgi/datadog_checks/hugging_face_tgi/check.py
@@ -0,0 +1,24 @@
+# (C) Datadog, Inc. 2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+from datadog_checks.base import OpenMetricsBaseCheckV2
+from datadog_checks.hugging_face_tgi.metrics import METRIC_MAP, RENAME_LABELS_MAP
+
+
+class HuggingFaceTgiCheck(OpenMetricsBaseCheckV2):
+    __NAMESPACE__ = 'hugging_face_tgi'
+
+    DEFAULT_METRIC_LIMIT = 0
+
+    def __init__(self, name, init_config, instances=None):
+        super(HuggingFaceTgiCheck, self).__init__(
+            name,
+            init_config,
+            instances,
+        )
+
+    def get_default_config(self):
+        return {
+            'metrics': [METRIC_MAP],
+            'rename_labels': RENAME_LABELS_MAP,
+        }
diff --git a/hugging_face_tgi/datadog_checks/hugging_face_tgi/config_models/__init__.py b/hugging_face_tgi/datadog_checks/hugging_face_tgi/config_models/__init__.py
@@ -0,0 +1,24 @@
+# (C) Datadog, Inc. 2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+# This file is autogenerated.
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands:
+#     ddev -x validate config -s <INTEGRATION_NAME>
+#     ddev -x validate models -s <INTEGRATION_NAME>
+
+from .instance import InstanceConfig
+from .shared import SharedConfig
+
+
+class ConfigMixin:
+    _config_model_instance: InstanceConfig
+    _config_model_shared: SharedConfig
+
+    @property
+    def config(self) -> InstanceConfig:
+        return self._config_model_instance
+
+    @property
+    def shared_config(self) -> SharedConfig:
+        return self._config_model_shared
diff --git a/hugging_face_tgi/datadog_checks/hugging_face_tgi/config_models/defaults.py b/hugging_face_tgi/datadog_checks/hugging_face_tgi/config_models/defaults.py
@@ -0,0 +1,132 @@
+# (C) Datadog, Inc. 2025-present
+# All rights reserved
+# Licensed under a 3-clause BSD style license (see LICENSE)
+
+# This file is autogenerated.
+# To change this file you should edit assets/configuration/spec.yaml and then run the following commands:
+#     ddev -x validate config -s <INTEGRATION_NAME>
+#     ddev -x validate models -s <INTEGRATION_NAME>
+
+
+def shared_skip_proxy():
+    return False
+
+
+def shared_timeout():
+    return 10
+
+
+def instance_allow_redirects():
+    return True
+
+
+def instance_auth_type():
+    return 'basic'
+
+
+def instance_cache_metric_wildcards():
+    return True
+
+
+def instance_cache_shared_labels():
+    return True
+
+
+def instance_collect_counters_with_distributions():
+    return False
+
+
+def instance_collect_histogram_buckets():
+    return True
+
+
+def instance_disable_generic_tags():
+    return False
+
+
+def instance_empty_default_hostname():
+    return False
+
+
+def instance_enable_health_service_check():
+    return True
+
+
+def instance_histogram_buckets_as_distributions():
+    return False
+
+
+def instance_ignore_connection_errors():
+    return False
+
+
+def instance_kerberos_auth():
+    return 'disabled'
+
+
+def instance_kerberos_delegate():
+    return False
+
+
+def instance_kerberos_force_initiate():
+    return False
+
+
+def instance_log_requests():
+    return False
+
+
+def instance_min_collection_interval():
+    return 15
+
+
+def instance_non_cumulative_histogram_buckets():
+    return False
+
+
+def instance_persist_connections():
+    return False
+
+
+def instance_request_size():
+    return 16
+
+
+def instance_skip_proxy():
+    return False
+
+
+def instance_tag_by_endpoint():
+    return True
+
+
+def instance_telemetry():
+    return False
+
+
+def instance_timeout():
+    return 10
+
+
+def instance_tls_ignore_warning():
+    return False
+
+
+def instance_tls_use_host_header():
+    return False
+
+
+def instance_tls_verify():
+    return True
+
+
+def instance_use_latest_spec():
+    return False
+
+
+def instance_use_legacy_auth_encoding():
+    return True
+
+
+def instance_use_process_start_time():
+    return False
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		# CHANGELOG - Hugging Face TGI

		<!-- towncrier release notes start -->