Skip to content

Commit 5c28656

Browse files
committed
Merge branch 'master' into ROB-746-get-resource-yaml
2 parents 85f68ca + e3c320d commit 5c28656

10 files changed

Lines changed: 240 additions & 31 deletions

File tree

docs/configuration/holmesgpt/builtin_toolsets.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Builtin Toolsets
99
toolsets/argocd
1010
toolsets/aws
1111
toolsets/confluence
12+
toolsets/datetime
1213
toolsets/docker
1314
toolsets/grafanaloki
1415
toolsets/grafanatempo
@@ -17,6 +18,7 @@ Builtin Toolsets
1718
toolsets/kafka
1819
toolsets/kubernetes
1920
toolsets/opensearch
21+
toolsets/prometheus
2022
toolsets/robusta
2123
toolsets/slab
2224

@@ -50,6 +52,11 @@ by the user by providing credentials or API keys to external systems.
5052
:link: toolsets/confluence
5153
:link-type: doc
5254

55+
.. grid-item-card:: :octicon:`cpu;1em;` Datetime
56+
:class-card: sd-bg-light sd-bg-text-light
57+
:link: toolsets/datetime
58+
:link-type: doc
59+
5360
.. grid-item-card:: :octicon:`cpu;1em;` Docker
5461
:class-card: sd-bg-light sd-bg-text-light
5562
:link: toolsets/docker
@@ -79,11 +86,17 @@ by the user by providing credentials or API keys to external systems.
7986
:class-card: sd-bg-light sd-bg-text-light
8087
:link: toolsets/kubernetes
8188
:link-type: doc
89+
8290
.. grid-item-card:: :octicon:`cpu;1em;` OpenSearch
8391
:class-card: sd-bg-light sd-bg-text-light
8492
:link: toolsets/opensearch
8593
:link-type: doc
8694

95+
.. grid-item-card:: :octicon:`cpu;1em;` Prometheus
96+
:class-card: sd-bg-light sd-bg-text-light
97+
:link: toolsets/prometheus
98+
:link-type: doc
99+
87100
.. grid-item-card:: :octicon:`cpu;1em;` Robusta
88101
:class-card: sd-bg-light sd-bg-text-light
89102
:link: toolsets/robusta
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
Datetime :checkmark:`_`
2+
=======================
3+
.. include:: ./_toolset_enabled_by_default.inc.rst
4+
5+
By enabling this toolset, HolmesGPT will be able to get the current UTC date and time.
6+
This feature works well with other toolsets. For example, the :doc:`prometheus <prometheus>`
7+
toolset needs ``start`` and ``end`` time parameters to properly create and execute
8+
PromQL queries.
9+
10+
Configuration
11+
-------------
12+
13+
.. code-block:: yaml
14+
15+
holmes:
16+
toolsets:
17+
datetime:
18+
enabled: true
19+
20+
.. include:: ./_toolset_configuration.inc.rst
21+
22+
Capabilities
23+
------------
24+
.. include:: ./_toolset_capabilities.inc.rst
25+
26+
.. list-table::
27+
:header-rows: 1
28+
:widths: 30 70
29+
30+
* - Tool Name
31+
- Description
32+
* - get_current_time
33+
- Return current time information. Useful for building queries that require time information
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
Prometheus
2+
==========
3+
4+
By enabling this toolset, HolmesGPT will be able to generate graphs from prometheus metrics as well as help you write and
5+
validate prometheus queries.
6+
7+
There is also an option for Holmes to analyze prometheus metrics. When enabled, HolmesGPT can detect memory leak patterns,
8+
CPU throttling, high latency for your APIs, etc. The configuration field to enable prometheus metrics analysis is
9+
``tool_calls_return_data``.
10+
11+
Configuration
12+
-------------
13+
14+
.. md-tab-set::
15+
16+
.. md-tab-item:: Robusta Helm Chart
17+
18+
.. code-block:: yaml
19+
20+
holmes:
21+
toolsets:
22+
prometheus/metrics:
23+
enabled: true
24+
config:
25+
prometheus_url: ...
26+
metrics_labels_time_window_hrs: 48 # default value
27+
metrics_labels_cache_duration_hrs: 12 # default value
28+
fetch_labels_with_labels_api: false # default value
29+
fetch_metadata_with_series_api: false # default value
30+
tool_calls_return_data: false # default value
31+
headers:
32+
Authorization: "Basic <base_64_encoded_string>"
33+
34+
35+
.. include:: ./_toolset_configuration.inc.rst
36+
37+
.. md-tab-item:: Holmes CLI
38+
39+
Add the following to **~/.holmes/config.yaml**, creating the file if it doesn't exist:
40+
41+
.. code-block:: yaml
42+
43+
toolsets:
44+
prometheus/metrics:
45+
enabled: true
46+
config:
47+
prometheus_url: ...
48+
metrics_labels_time_window_hrs: 48 # default value
49+
metrics_labels_cache_duration_hrs: 12 # default value
50+
fetch_labels_with_labels_api: false # default value
51+
fetch_metadata_with_series_api: false # default value
52+
tool_calls_return_data: false # default value
53+
headers:
54+
Authorization: "Basic <base_64_encoded_string>"
55+
56+
It is also possible to set the ``PROMETHEUS_URL`` environment variable instead of the above ``prometheus_url`` config key.
57+
58+
Prior to generating a PromQL query, HolmesGPT tends to list the available metrics. This is done to ensure the metrics used
59+
in PromQL are actually available.
60+
61+
Below is the full list of options for this toolset:
62+
63+
- **metrics_labels_time_window_hrs** Represents the time window, in hours, over which labels are fetched. This avoids fetching obsolete labels. Set it to ``null`` to let HolmesGPT fetch labels regardless of when they were generated.
64+
- **metrics_labels_cache_duration_hrs** How long are labels cached, in hours. Set it to ``null`` to disable caching.
65+
- **fetch_labels_with_labels_api** Uses prometheus `labels API <https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names>`_ to fetch labels instead of the `series API <https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers>`_. In some cases setting to True can improve the performance of the toolset, however there will be an increased number of HTTP calls to prometheus. You can experiment with both as they are functionally identical.
66+
- **fetch_metadata_with_series_api** Uses the `series API <https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers>`_ instead of the `metadata API <https://prometheus.io/docs/prometheus/latest/querying/api/#querying-metric-metadata>`_. You should only set this value to `true` if the metadata API is disabled or not working. HolmesGPT's ability to select the right metric will be negatively impacted because the series API does not return key metadata like the metrics/series description or their type (gauge, histogram, etc.).
67+
- **tool_calls_return_data** Experimental. If true, the prometheus data will be available to HolmesGPT. In some cases, HolmesGPT will be able to detect memory leaks or other anomalies. This is disabled by default to reduce the likelihood of reaching the input token limit.
68+
- **headers** Extra headers to pass to all prometheus http requests. Use this to pass authentication. Prometheus `supports basic authentication <https://prometheus.io/docs/guides/basic-auth/>`_.
69+
70+
Capabilities
71+
------------
72+
.. include:: ./_toolset_capabilities.inc.rst
73+
74+
.. list-table::
75+
:header-rows: 1
76+
:widths: 30 70
77+
78+
* - Tool Name
79+
- Description
80+
* - list_available_metrics
81+
- List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.
82+
* - execute_prometheus_instant_query
83+
- Execute an instant PromQL query
84+
* - execute_prometheus_range_query
85+
- Execute a PromQL range query

src/robusta/core/model/base_params.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ class AIInvestigateParams(HolmesParams):
111111
ask: Optional[str]
112112
context: Optional[Dict[str, Any]]
113113
sections: Optional[Dict[str, str]] = None
114+
stream: bool = False
114115

115116

116117
class HolmesToolsResult(BaseModel):

src/robusta/core/model/events.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from collections import defaultdict
55
from dataclasses import dataclass, field
66
from enum import Enum
7-
from typing import Any, Dict, List, Optional
7+
from typing import Any, Dict, List, Optional, Callable
88

99
from pydantic import BaseModel
1010

@@ -59,6 +59,7 @@ class ExecutionBaseEvent:
5959
_scheduler: Optional[PlaybooksScheduler] = None
6060
_context: Optional[ExecutionContext] = None
6161
_event_emitter: Optional[EventEmitter] = None
62+
_ws: Optional[Callable[[str], None]] = None
6263

6364
def set_context(self, context: ExecutionContext):
6465
self._context = context

src/robusta/core/playbooks/internal/ai_integration.py

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -62,35 +62,43 @@ def ask_holmes(event: ExecutionBaseEvent, params: AIInvestigateParams):
6262
include_tool_call_results=True,
6363
sections=params.sections
6464
)
65-
result = requests.post(f"{holmes_url}/api/investigate", data=holmes_req.json())
66-
result.raise_for_status()
6765

68-
holmes_result = HolmesResult(**json.loads(result.text))
69-
title_suffix = (
70-
f" on {params.resource.name}"
71-
if params.resource and params.resource.name and params.resource.name.lower() != "unresolved"
72-
else ""
73-
)
74-
75-
kind = params.resource.kind if params.resource else None
76-
finding = Finding(
77-
title=f"AI Analysis of {investigation__title}{title_suffix}",
78-
aggregation_key="HolmesInvestigationResult",
79-
subject=FindingSubject(
80-
name=params.resource.name if params.resource else "",
81-
namespace=params.resource.namespace if params.resource else "",
82-
subject_type=FindingSubjectType.from_kind(kind) if kind else FindingSubjectType.TYPE_NONE,
83-
node=params.resource.node if params.resource else "",
84-
container=params.resource.container if params.resource else "",
85-
),
86-
finding_type=FindingType.AI_ANALYSIS,
87-
failure=False,
88-
)
89-
finding.add_enrichment(
90-
[HolmesResultsBlock(holmes_result=holmes_result)], enrichment_type=EnrichmentType.ai_analysis
91-
)
66+
if params.stream:
67+
with requests.post(f"{holmes_url}/api/stream/investigate", data=holmes_req.json(), stream=True) as resp:
68+
for line in resp.iter_content(chunk_size=None, decode_unicode=True): # Avoid streaming chunks from holmes. send them as they arrive.
69+
event.ws(data=line)
70+
return
9271

93-
event.add_finding(finding)
72+
else:
73+
result = requests.post(f"{holmes_url}/api/investigate", data=holmes_req.json())
74+
result.raise_for_status()
75+
76+
holmes_result = HolmesResult(**json.loads(result.text))
77+
title_suffix = (
78+
f" on {params.resource.name}"
79+
if params.resource and params.resource.name and params.resource.name.lower() != "unresolved"
80+
else ""
81+
)
82+
83+
kind = params.resource.kind if params.resource else None
84+
finding = Finding(
85+
title=f"AI Analysis of {investigation__title}{title_suffix}",
86+
aggregation_key="HolmesInvestigationResult",
87+
subject=FindingSubject(
88+
name=params.resource.name if params.resource else "",
89+
namespace=params.resource.namespace if params.resource else "",
90+
subject_type=FindingSubjectType.from_kind(kind) if kind else FindingSubjectType.TYPE_NONE,
91+
node=params.resource.node if params.resource else "",
92+
container=params.resource.container if params.resource else "",
93+
),
94+
finding_type=FindingType.AI_ANALYSIS,
95+
failure=False,
96+
)
97+
finding.add_enrichment(
98+
[HolmesResultsBlock(holmes_result=holmes_result)], enrichment_type=EnrichmentType.ai_analysis
99+
)
100+
101+
event.add_finding(finding)
94102

95103
except Exception as e:
96104
logging.exception(

src/robusta/core/playbooks/playbooks_event_handler.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from typing import Any, Dict, List, Optional
2+
from typing import Any, Callable, Dict, List, Optional
33

44
from robusta.core.model.events import ExecutionBaseEvent
55
from robusta.core.playbooks.base_trigger import TriggerEvent
@@ -39,6 +39,13 @@ def run_external_action(
3939
"""Execute an external action"""
4040
pass
4141

42+
@abstractmethod
43+
def run_external_stream_action(
44+
self, action_name: str, action_params: Optional[dict], stream: Callable[str, Optional[str]]
45+
) -> Optional[Dict[str, Any]]:
46+
"""Execute an external stream action"""
47+
pass
48+
4249
@abstractmethod
4350
def get_global_config(self) -> dict:
4451
"""Return runner global config"""

src/robusta/core/playbooks/playbooks_event_handler_impl.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def __run_playbook_actions(
197197
start_time = time.time()
198198
source: str = (
199199
"manual_action"
200-
if any(name == SYNC_RESPONSE_SINK for name in getattr(execution_event, "named_sinks", []))
200+
if any(name == SYNC_RESPONSE_SINK for name in (execution_event.named_sinks or []))
201201
else ""
202202
)
203203
self.__prepare_execution_event(execution_event)
@@ -368,3 +368,37 @@ def handle_sigint(self, sig, frame):
368368

369369
self.set_cluster_active(False)
370370
sys.exit(0)
371+
372+
def run_external_stream_action(
373+
self, action_name: str, action_params: Optional[dict], ws
374+
) -> Optional[Dict[str, Any]]:
375+
action_def = self.registry.get_actions().get_action(action_name)
376+
if not action_def:
377+
return self.__error_resp(f"External action not found {action_name}", ErrorCodes.ACTION_NOT_FOUND.value)
378+
379+
if not action_def.from_params_func:
380+
return self.__error_resp(
381+
f"Action {action_name} cannot run using external event", ErrorCodes.NOT_EXTERNAL_ACTION.value
382+
)
383+
384+
try:
385+
instantiation_params = action_def.from_params_parameter_class(**action_params)
386+
except Exception:
387+
return self.__error_resp(
388+
f"Failed to create execution instance for"
389+
f" {action_name} {action_def.from_params_parameter_class}"
390+
f" {action_params} {traceback.format_exc()}",
391+
ErrorCodes.EVENT_PARAMS_INSTANTIATION_FAILED.value,
392+
)
393+
394+
execution_event = action_def.from_params_func(instantiation_params)
395+
if not execution_event:
396+
return self.__error_resp(
397+
f"Failed to create execution event for {action_name} {action_params}",
398+
ErrorCodes.EVENT_INSTANTIATION_FAILED.value,
399+
)
400+
401+
execution_event.ws = ws
402+
playbook_action = PlaybookAction(action_name=action_name, action_params=action_params)
403+
404+
return self.__run_playbook_actions(execution_event, [playbook_action])

src/robusta/core/reporting/action_requests.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class ExternalActionRequest(BaseModel):
2727
partial_auth_b: str = "" # Auth for public key auth protocol option - should be added by the relay
2828
request_id: str = "" # If specified, should return a sync response using the specified request_id
2929
no_sinks: bool = False # Indicates not to send to sinks at all. The request body has a sink list,
30+
stream: bool = False
3031
# however an empty sink list means using the server default sinks
3132

3233

src/robusta/integrations/receiver.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,12 @@ def stop(self):
136136
def __sync_response(cls, status_code: int, request_id: str, data) -> Dict:
137137
return {"action": "response", "request_id": request_id, "status_code": status_code, "data": data}
138138

139+
def __stream_response(self, request_id: str, data: str):
140+
self.ws.send(data=json.dumps({"action": "stream", "request_id": request_id, "data": data}))
141+
142+
def __close_stream_response(self, request_id: str, data: str):
143+
self.ws.send(data=json.dumps({"action": "stream", "request_id": request_id, "data": data, "close": True}))
144+
139145
def __exec_external_request(self, action_request: ExternalActionRequest, validate_timestamp: bool):
140146
logging.debug(f"Callback `{action_request.body.action_name}` {to_safe_str(action_request.body.action_params)}")
141147
sync_response = action_request.request_id != "" # if request_id is set, we need to write back the response
@@ -175,6 +181,23 @@ def __exec_external_request(self, action_request: ExternalActionRequest, validat
175181
http_code = 200 if response.get("success") else 500
176182
self.ws.send(data=json.dumps(self.__sync_response(http_code, action_request.request_id, response)))
177183

184+
def __exec_external_stream_request(self, action_request: ExternalActionRequest, validate_timestamp: bool):
185+
logging.debug(f"Callback `{action_request.body.action_name}` {to_safe_str(action_request.body.action_params)}")
186+
187+
validation_response = self.__validate_request(action_request, validate_timestamp)
188+
if validation_response.http_code != 200:
189+
req_json = action_request.json(exclude={"body"})
190+
body_json = action_request.body.json(exclude={"action_params"}) # action params already printed above
191+
logging.error(f"Failed to validate action request {req_json} {body_json}")
192+
self.__close_stream_response(action_request.request_id, validation_response.dict(exclude={"http_code"}))
193+
return
194+
195+
res = self.event_handler.run_external_stream_action(action_request.body.action_name,
196+
action_request.body.action_params,
197+
lambda data: self.__stream_response(request_id=action_request.request_id, data=data))
198+
res = "" if res.get("success") else json.dumps(res)
199+
self.__close_stream_response(action_request.request_id, res)
200+
178201
def _process_action(self, action: ExternalActionRequest, validate_timestamp: bool) -> None:
179202
self._executor.submit(self._process_action_sync, action, validate_timestamp)
180203

@@ -189,7 +212,10 @@ def _process_action_sync(self, action: ExternalActionRequest, validate_timestamp
189212
else:
190213
ctx = nullcontext()
191214
with ctx:
192-
self.__exec_external_request(action, validate_timestamp)
215+
if action.stream:
216+
self.__exec_external_stream_request(action, validate_timestamp)
217+
else:
218+
self.__exec_external_request(action, validate_timestamp)
193219
except Exception:
194220
logging.error(
195221
f"Failed to run incoming event {self._stringify_incoming_event(action.dict())}",

0 commit comments

Comments
 (0)