Fix discovery bug that caused failures when there are many jobs that change frequently (#1938)

arikalon1 · web-flow · commit edd7144745a6 · 2025-10-18T12:55:38.000+03:00
Use new Kubewatch advanced filters by default to improve performance
Fix alert export API docs
diff --git a/docs/configuration/exporting/alert-export-api.rst b/docs/configuration/exporting/alert-export-api.rst
@@ -38,6 +38,10 @@ Query Parameters
      - string
      - The name of the alert to filter by (e.g., ``CrashLoopBackoff``).
      - No
+   * - ``namespace``
+     - string
+     - The namespace of the alert to filter by (e.g., ``monitoring``).
+     - No
 
 Example Request
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/playbook-reference/triggers/kubernetes.rst b/docs/playbook-reference/triggers/kubernetes.rst
@@ -261,6 +261,21 @@ Single Resource Triggers
 
 For triggers that fire only on Pod errors, see :ref:`Crashing Pod Triggers`.
 
+.. note::
+
+    By default, Robusta processes only **Pod** change events that are related to failures or modifications in the **Pod** spec.
+    Other types of **Pod** changes are considered less relevant and are filtered out to reduce noise.
+
+    To process **all Pod** change events, add the following configuration to your ``generated_values.yaml`` file.
+
+    .. code-block:: yaml
+
+        kubewatch:
+          additional_env_vars:
+          - name: ADVANCED_FILTERS
+            value: "false"
+
+
 .. jinja::
   :inline-ctx: { "resource_name" : "ReplicaSet", "related_actions" : ["related_pods"] }
   :header_update_levels:
@@ -296,6 +311,21 @@ For triggers that fire only on Pod errors, see :ref:`Crashing Pod Triggers`.
   :header_update_levels:
   :file: playbook-reference/triggers/_k8s-generic-triggers.jinja
 
+.. note::
+
+    By default, Robusta processes only **Event Create** events with the type ``Warning``.
+    **Events** with the type ``Normal`` are considered less relevant and are filtered out to reduce noise,
+    except for ``Normal`` events that indicate Pod evictions.
+
+    To process all Kubernetes **Event** change events, add the following configuration to your ``generated_values.yaml`` file.
+
+    .. code-block:: yaml
+
+        kubewatch:
+          additional_env_vars:
+          - name: ADVANCED_FILTERS
+            value: "false"
+
 .. jinja::
   :inline-ctx: { "resource_name" : "HorizontalPodAutoscaler", "related_actions" : [] }
   :header_update_levels:
@@ -321,6 +351,20 @@ For triggers that fire only on Pod errors, see :ref:`Crashing Pod Triggers`.
   :header_update_levels:
   :file: playbook-reference/triggers/_k8s-generic-triggers.jinja
 
+.. note::
+
+    By default, Robusta processes only **Job** change events that are related to failures or modifications in the **Job** spec.
+    Other types of **Job** changes are considered less relevant and are filtered out to reduce noise.
+
+    To process **all Job** change events, add the following configuration to your ``generated_values.yaml`` file.
+
+    .. code-block:: yaml
+
+        kubewatch:
+          additional_env_vars:
+          - name: ADVANCED_FILTERS
+            value: "false"
+
 .. jinja::
   :inline-ctx: { "resource_name" : "Namespace", "related_actions" : [] }
   :header_update_levels:
diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml
@@ -455,6 +455,19 @@ builtinPlaybooks:
 enablePlatformPlaybooks: false
 
 platformPlaybooks:
+- name: "PodEvictionReport"
+  triggers:
+  - on_event_create:
+      scope:
+        include:
+          - attributes:
+              - "reason=Evicted"
+  actions:
+  - create_event_finding:
+      aggregation_key: "PodEviction"
+  - event_resource_events: {}
+  sinks:
+    - "robusta_ui_sink"
 - name: "K8sWarningEventsReport"
   triggers:
   - on_kubernetes_warning_event_create:
@@ -590,7 +603,7 @@ image:
 # parameters for the robusta forwarder deployment
 kubewatch:
   image: ~ # image can be used to override image.registry/imageName
-  imageName: kubewatch:v2.11.0
+  imageName: kubewatch:v2.12.0
   imagePullPolicy: IfNotPresent
   revisionHistoryLimit: 10
   pprof: True
@@ -600,7 +613,9 @@ kubewatch:
       memory: 512Mi
     limits:
       cpu: ~
-  additional_env_vars: []
+  additional_env_vars:
+  - name: ADVANCED_FILTERS
+    value: "true"
   priorityClassName: ""
   tolerations: []
   annotations: {}
diff --git a/src/robusta/core/discovery/discovery.py b/src/robusta/core/discovery/discovery.py
@@ -649,9 +649,22 @@ def discovery_process() -> DiscoveryResults:
         try:
             continue_ref: Optional[str] = None
             for _ in range(DISCOVERY_MAX_BATCHES):
-                current_jobs: V1JobList = client.BatchV1Api().list_job_for_all_namespaces(
-                    limit=DISCOVERY_BATCH_SIZE, _continue=continue_ref
-                )
+                try:
+                    current_jobs: V1JobList = client.BatchV1Api().list_job_for_all_namespaces(
+                        limit=DISCOVERY_BATCH_SIZE, _continue=continue_ref
+                    )
+                except ApiException as e:
+                    if e.status == 410 and e.body:
+                        # Continue token expired, extract new token from error and continue
+                        import json
+                        error_body = json.loads(e.body)
+                        new_continue_token = error_body.get("metadata", {}).get("continue")
+                        if new_continue_token:
+                            logging.info("Continue token expired for jobs listing. Continuing")
+                            continue_ref = new_continue_token
+                            continue
+                    raise
+
                 for job in current_jobs.items:
                     job_pods = []
                     job_labels = {}