99
1010import requests
1111from hikaru .model .rel_1_26 import DaemonSet , Deployment , Job , Node , Pod , ReplicaSet , StatefulSet
12-
13- from robusta .core .discovery .discovery import DISCOVERY_STACKTRACE_TIMEOUT_S , Discovery , DiscoveryResults
12+ from robusta . core . model . namespaces import NamespaceMetadata , ResourceCount
13+ from robusta .core .discovery .discovery import DISCOVERY_STACKTRACE_TIMEOUT_S , Discovery , DiscoveryResults , ResourceAccessForbiddenError
1414from robusta .core .discovery .top_service_resolver import TopLevelResource , TopServiceResolver
1515from robusta .core .discovery .utils import from_api_server_node
1616from robusta .core .model .base_params import HolmesParams
5151# Define the cache with a single slot and the configured TTL
5252_holmes_slackbot_cache = TTLCache (maxsize = 1 , ttl = HOLMES_SLACKBOT_CACHE_TTL )
5353
54+
5455class RobustaSink (SinkBase , EventHandler ):
5556 services_publish_lock = threading .Lock ()
5657
@@ -61,6 +62,9 @@ def __init__(self, sink_config: RobustaSinkConfigWrapper, registry):
6162 self .token = sink_config .robusta_sink .token
6263 self .ttl_hours = sink_config .robusta_sink .ttl_hours
6364 self .persist_events = sink_config .robusta_sink .persist_events
65+ self .namespace_monitored_resources = sink_config .robusta_sink .namespaceMonitoredResources
66+ self .namespace_discovery_seconds = sink_config .robusta_sink .namespace_discovery_seconds
67+
6468 robusta_token = RobustaToken (** json .loads (base64 .b64decode (self .token )))
6569 if self .account_id != robusta_token .account_id :
6670 logging .error (
@@ -134,7 +138,7 @@ def _on_config_reload(self) -> None:
134138 if not hasattr (self , '_thread' ):
135139 self ._thread = threading .Thread (target = self .__discover_cluster , daemon = True )
136140 self ._thread .start ()
137-
141+
138142 def set_cluster_active (self , active : bool ):
139143 self .dal .set_cluster_active (active )
140144
@@ -324,6 +328,63 @@ def __send_helm_release_events(self, release_data: List[HelmRelease]):
324328 except Exception :
325329 logging .error ("Error occurred while sending `helm release` trigger event" , exc_info = True )
326330
def __discover_custom_namespaced_resources(self, namespaces: List[NamespaceInfo]):
    """Attach per-namespace counts of the configured custom resources.

    For every entry in ``self.namespace_monitored_resources`` the resource is
    counted via ``Discovery.count_resources`` (a mapping of namespace name to
    count), and the resulting ``ResourceCount`` records are appended to the
    ``metadata.resources`` of the matching ``NamespaceInfo`` in ``namespaces``.

    Args:
        namespaces: Namespaces found by the current discovery cycle. The
            objects are updated in place.

    Returns:
        The same ``namespaces`` list, on every path. The caller rebinds its
        namespace list to this return value before publishing, so returning
        ``None`` (nothing configured, or an unexpected failure) would make it
        publish ``None`` instead of the discovered namespaces.
    """
    if not self.namespace_monitored_resources:
        # Nothing to count: hand the input back untouched.
        return namespaces

    try:
        # Step 1: collect counts, grouped by namespace name.
        resource_map = {}  # type: Dict[str, List[ResourceCount]]

        for resource in self.namespace_monitored_resources:
            try:
                results = Discovery.count_resources(
                    kind=resource.kind,
                    api_group=resource.apiGroup,
                    version=resource.apiVersion,
                )

                for namespace_name, count in results.items():
                    resource_map.setdefault(namespace_name, []).append(
                        ResourceCount(
                            kind=resource.kind,
                            apiVersion=resource.apiVersion,
                            apiGroup=resource.apiGroup,
                            count=count,
                        )
                    )

            except ResourceAccessForbiddenError as e:
                # Missing permissions for one kind must not stop the others.
                logging.warning(f"Skipping resource {resource.kind} due to insufficient permissions: {e}")
            except Exception as e:
                logging.exception(f"Unexpected error counting resource {resource.kind}: {e}")

        # Step 2: apply the collected counts to the matching NamespaceInfo entries.
        for ns in namespaces:
            if ns.name not in resource_map:
                continue
            if not ns.metadata:
                ns.metadata = NamespaceMetadata(resources=[])
            ns.metadata.resources.extend(resource_map[ns.name])

        logging.info("Discovered Namespaced custom resources")

    except Exception as e:
        # Best effort: log and fall through so the namespaces are still returned.
        logging.exception(f"Namespace discovery failed: {e}")

    return namespaces
375+
def __add_cached_namespace_metadata(self, namespaces: List[NamespaceInfo]):
    """Copy metadata from the namespaces cache onto freshly discovered namespaces.

    The input is first indexed by name, so when several entries share a name
    only the last one is kept. Each remaining namespace whose name has a
    truthy entry in ``self.__namespaces_cache`` gets that entry's ``metadata``;
    namespaces without a cache entry are passed through unchanged.

    Returns:
        A new list holding the (name-deduplicated) namespaces.
    """
    by_name = {}
    for discovered in namespaces:
        by_name[discovered.name] = discovered

    for name, discovered in by_name.items():
        cached = self.__namespaces_cache.get(name)
        if not cached:
            continue
        discovered.metadata = cached.metadata

    return list(by_name.values())
387+
327388 def __discover_resources (self ) -> DiscoveryResults :
328389 # discovery is using the k8s python API and not Hikaru, since its performance is 10 times better
329390 try :
@@ -342,7 +403,13 @@ def __discover_resources(self) -> DiscoveryResults:
342403 self .__publish_new_helm_releases (results .helm_releases )
343404
344405 self .__assert_namespaces_cache_initialized ()
345- self .__publish_new_namespaces (results .namespaces )
406+ namespaces = results .namespaces
407+ if self .namespace_monitored_resources and (time .time () - self .last_namespace_discovery ) >= self .namespace_discovery_seconds :
408+ namespaces = self .__discover_custom_namespaced_resources (namespaces )
409+ self .last_namespace_discovery = time .time ()
410+ elif self .namespace_monitored_resources :
411+ namespaces = self .__add_cached_namespace_metadata (namespaces )
412+ self .__publish_new_namespaces (namespaces )
346413
347414 self .__pods_running_count = results .pods_running_count
348415
@@ -559,25 +626,26 @@ def __discovery_watchdog(self):
def __discover_cluster(self):
    """Run the periodic discovery loop for as long as ``self.__active`` is truthy.

    Each cycle reports the cluster status, runs resource discovery, fetches
    the events history once (only on the first cycle, and only when
    ``__should_run_history()`` said so), emits helm release events when the
    discovery returned any, and then sleeps before the next cycle.
    """
    logging.info("Cluster discovery initialized")
    history_pending = self.__should_run_history()
    # __discover_resources compares this timestamp with the current time to
    # decide whether the namespace custom-resource discovery is due.
    self.last_namespace_discovery = 0

    while self.__active:
        cycle_started = time.time()
        self.__periodic_cluster_status()
        results = self.__discover_resources()

        if history_pending:
            self.__get_events_history()
            history_pending = False

        helm_releases = results.helm_releases if results else None
        if helm_releases:
            self.__send_helm_release_events(release_data=helm_releases)

        elapsed = round(time.time() - cycle_started)
        # Wait at least the configured period, stretched to three times the
        # cycle duration for slow (large) clusters, but never more than 300s.
        wanted = max(self.__discovery_period_sec, 3 * elapsed)
        pause = min(wanted, 300)

        logging.debug(f"Discovery duration: {elapsed} next discovery in {pause}")
        time.sleep(pause)

    logging.info(f"Service discovery for sink {self.sink_name} ended.")
580647
648+
581649 def __periodic_cluster_status (self ):
582650 first_alert = False
583651
0 commit comments