diff --git a/ibm_i/assets/service_checks.json b/ibm_i/assets/service_checks.json index 2fcdbfb5ce1c5..d62da688ce629 100644 --- a/ibm_i/assets/service_checks.json +++ b/ibm_i/assets/service_checks.json @@ -11,6 +11,6 @@ "critical" ], "name": "Can Connect", - "description": "Returns `CRITICAL` if the Agent is unable to connect and collect metrics from the monitored IBM i instance, otherwise returns `OK`." + "description": "Returns CRITICAL if the Agent is unable to establish a connection to the monitored IBM i instance, otherwise returns OK." } ] diff --git a/ibm_i/changelog.d/23986.changed b/ibm_i/changelog.d/23986.changed new file mode 100644 index 0000000000000..403e889d85cf7 --- /dev/null +++ b/ibm_i/changelog.d/23986.changed @@ -0,0 +1 @@ +Change `ibm_i.can_connect` service check to report OK when connection is successful, even if a query fails or times out. This aligns with the `.can_connect` service checks of our other integrations. \ No newline at end of file diff --git a/ibm_i/datadog_checks/ibm_i/check.py b/ibm_i/datadog_checks/ibm_i/check.py index 263b2143d0c0f..7e2ba91a106da 100644 --- a/ibm_i/datadog_checks/ibm_i/check.py +++ b/ibm_i/datadog_checks/ibm_i/check.py @@ -28,12 +28,9 @@ def __init__(self, name, init_config, instances): self._connection_string = None self._subprocess = None self._query_manager = None - self._current_errors = 0 self.check_initializations.append(self.set_up_query_manager) def check(self, _): - self._current_errors = 0 - try: self.query_manager.execute() check_status = AgentCheck.OK @@ -47,10 +44,6 @@ def check(self, _): check_status = AgentCheck.CRITICAL hostname = self.config.hostname if self.config else None - # At least one query failed, set the service check as failing - if self._current_errors: - check_status = AgentCheck.CRITICAL - if check_status is not None: self.service_check( self.SERVICE_CHECK_NAME, @@ -63,10 +56,6 @@ def cancel(self): # When the check gets cancelled, clean up the connection subprocess. 
self._delete_connection_subprocess() - def handle_query_error(self, error): - self._current_errors += 1 - return error - @property def connection_subprocess(self): if self._subprocess is None: @@ -253,7 +242,6 @@ def set_up_query_manager(self): tags=self.config.tags, queries=query_list, hostname=hostname, - error_handler=self.handle_query_error, ) self._query_manager.compile_queries() diff --git a/ibm_i/tests/test_ibm_i.py b/ibm_i/tests/test_ibm_i.py index bf903d1982e65..57afdd93010e2 100644 --- a/ibm_i/tests/test_ibm_i.py +++ b/ibm_i/tests/test_ibm_i.py @@ -317,5 +317,32 @@ def test_check_query_error(aggregator, instance): assert check._query_manager is not None assert check._query_manager.hostname == "host" check.check(instance) - aggregator.assert_service_check("ibm_i.can_connect", count=2, status=AgentCheck.CRITICAL) + aggregator.assert_service_check("ibm_i.can_connect", count=2, status=AgentCheck.OK) + aggregator.assert_all_metrics_covered() + + +def test_connection_failure(aggregator, instance): + check = IbmICheck('ibm_i', {}, [instance]) + check.log = mock.MagicMock() + check.load_configuration_models() + + with mock.patch('datadog_checks.ibm_i.IbmICheck.set_up_query_manager'): + check.check(instance) + assert check._query_manager is None + aggregator.assert_service_check("ibm_i.can_connect", count=1, status=AgentCheck.CRITICAL) + aggregator.assert_all_metrics_covered() + + +def test_check_query_manager_execute_error(aggregator, instance): + check = IbmICheck('ibm_i', {}, [instance]) + check.log = mock.MagicMock() + check.load_configuration_models() + check._query_manager = mock.MagicMock(hostname="host") + check._query_manager.execute.side_effect = Exception("boom") + + with mock.patch('datadog_checks.ibm_i.IbmICheck._delete_connection_subprocess') as delete_conn: + check.check(instance) + + delete_conn.assert_called_once() + aggregator.assert_service_check("ibm_i.can_connect", count=1, status=AgentCheck.CRITICAL) aggregator.assert_all_metrics_covered() 
diff --git a/network_path/metadata.csv b/network_path/metadata.csv index cf8712d756fc4..c8566c947e86d 100644 --- a/network_path/metadata.csv +++ b/network_path/metadata.csv @@ -26,7 +26,4 @@ datadog.network_path.collector.worker.task_duration.count,gauge,,second,,Duratio datadog.network_path.collector.worker.task_duration.max,gauge,,second,,Duration of a worker task.,0,network_path,,, datadog.network_path.collector.worker.task_duration.median,gauge,,second,,Duration of a worker task.,0,network_path,,, datadog.network_path.collector.workers,gauge,,,,The number of workers used to process pathtests concurrently.,0,network_path,,, -datadog.network_path.path.hops,gauge,,,,The number of hops of the collected pathtrace (traceroute).,0,network_path,,, datadog.network_path.path.monitored,gauge,,,,Paths monitored count. Make 'sum by {X}' queries to count all the Paths with the tag X,0,network_path,,, -datadog.network_path.path.reachable,gauge,,,,"The value is 1 if the path is reachable, 0 otherwise. Reachability is determined by the status of the destination/target of the pathtest.",0,network_path,,, -datadog.network_path.path.unreachable,gauge,,,,"The value is 1 if the path is unreachable, 0 otherwise. Reachability is determined by the status of the destination/target of the pathtest.",0,network_path,,,