diff --git a/dirac.cfg b/dirac.cfg index 095620f64c7..50c962721f3 100644 --- a/dirac.cfg +++ b/dirac.cfg @@ -1158,6 +1158,102 @@ Operations pilotVORepoBranch = master # Branch to use workDir = /tmp/pilot3Files # Local work directory on the masterCS for synchronisation } + + # RSS section + # See https://dirac.readthedocs.io/en/latest/AdministratorGuide/Systems/ResourceStatus/configuration.html + ResourceStatus + { + Config + { + Cache = 720 # Lifetime (seconds) of the RSSCache (default: 300) + FromAddress = rss@dirac # Email address used as sender for RSS notifications + } + Policies + { + # Command arguments for the built-in Downtime policy type. + # hours = 0 means only ongoing downtimes are considered (default). + # Set hours > 0 to also catch downtimes starting within that window. + # Note: this section has no policyType key and is therefore NOT treated + # as a policy definition — it only sets command argument defaults. + Downtime + { + hours = 0 # look-ahead window in hours (0 = ongoing only, default) + } + # Command arguments for the built-in FreeDiskSpace policy type. + # Unit and thresholds apply to all SEs monitored by this policy. + # Note: same as above — no policyType key, so not a policy definition. 
+ FreeDiskSpace + { + Unit = TB # Space unit: TB (default), GB or MB + Banned_threshold = 0.1 # Free space below which the SE is Banned (in the chosen unit) + Degraded_threshold = 5 # Free space below which the SE is Degraded (in the chosen unit) + } + # Example: apply Downtime policy to all Sites + SiteDowntime + { + policyType = Downtime + matchParams + { + element = Site + } + } + # Example: apply Downtime policy to all Resources + ResourceDowntime + { + policyType = Downtime + matchParams + { + element = Resource + } + } + # Example: apply FreeDiskSpace policy to all SE WriteAccess status types + SEWriteAccessFreeDiskSpace + { + policyType = FreeDiskSpace + matchParams + { + element = Resource + elementType = StorageElement + statusType = WriteAccess + } + } + # Example: apply FreeDiskSpace to SE1 with specific args (Unit and Banned_threshold); + # Degraded_threshold falls back to the default defined in the FreeDiskSpace section above. + SpecificFreeDiskSpace + { + policyType = FreeDiskSpace + Unit = GB + Banned_threshold = 15 + matchParams + { + name = SE1 + } + } + } + PolicyActions + { + # Example: send an email when any Resource reaches Banned status + BannedResourceEmail + { + actionType = EmailAction + notificationGroups = RSSAdmins + matchParams + { + element = Resource + status = Banned + } + } + } + Notification + { + RSSAdmins + { + users = admin@dirac # email addresses used for the notifications + } + } + } + + # Services section Services { # See http://dirac.readthedocs.io/en/latest/AdministratorGuide/Resources/Catalog/index.html diff --git a/docs/source/AdministratorGuide/Systems/ResourceStatus/advanced_configuration.rst b/docs/source/AdministratorGuide/Systems/ResourceStatus/advanced_configuration.rst index 5a2452abb80..f01f644803d 100644 --- a/docs/source/AdministratorGuide/Systems/ResourceStatus/advanced_configuration.rst +++ b/docs/source/AdministratorGuide/Systems/ResourceStatus/advanced_configuration.rst @@ -66,6 +66,82 @@ we cannot define 
the following matchParams: Code templates and examples for creating custom policies: :doc:`../../../DeveloperGuide/Systems/ResourceStatus/index` +Built-in Downtime Policy +~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``Downtime`` policy type evaluates GOCDB downtime data for a Site or Resource. +Severity is mapped to RSS status as follows: + +* **OUTAGE** → **Banned** +* **WARNING** → **Degraded** +* No downtime → **Active** + +The look-ahead window is configurable from the Operations CS: + +:: + + /Operations/Defaults/ResourceStatus + /Policies + /Downtime + hours = 0 # hours to look ahead (0 = ongoing only, default) + +.. note:: + + Setting ``hours = 0`` (the default) means only downtimes that are currently ongoing + are considered. Setting a positive value (e.g. ``12``) also catches downtimes scheduled + to start within that window, which is useful for proactive status changes. + + This section has no ``policyType`` key and is therefore treated purely as + command-argument defaults, not as a policy definition. + +Example: flag elements with downtimes starting within the next 24 hours:: + + /Operations/Defaults/ResourceStatus/Policies/Downtime + { + hours = 24 + } + +Built-in FreeDiskSpace Policy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``FreeDiskSpace`` policy type monitors Storage Element occupancy. +It compares the free space reported by the SE against two configurable thresholds: + +* If free space is below ``Banned_threshold``, the SE is set to **Banned**. +* If free space is below ``Degraded_threshold`` (but above ``Banned_threshold``), the SE is set to **Degraded**. +* Otherwise the SE is set to **Active**. 
+ +All three parameters — unit, banned threshold, and degraded threshold — are fully configurable +from the Operations CS and fall back to safe defaults: + +:: + + /Operations/Defaults/ResourceStatus + /Policies + /FreeDiskSpace + Unit = TB # unit for the SE occupancy query (TB, GB or MB) + Banned_threshold = 0.1 # in the chosen unit (default) + Degraded_threshold = 5 # in the chosen unit (default) + +.. note:: + + These keys live under ``/Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace``, + not under the ``/matchParams`` sub-section. They tune the **command arguments**, not the + element-matching logic. This section has no ``policyType`` key and is therefore not treated + as a policy definition by the policy engine. + + The default values of ``0.1`` and ``5`` are always used as fallback regardless of unit. + Make sure to set meaningful threshold values explicitly in the CS when changing the unit. + +Example: switch the unit to GB and set the thresholds explicitly (these values roughly match the TB defaults):: + + /Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace + { + Unit = GB + Banned_threshold = 100 + Degraded_threshold = 5000 + } + ------------- PolicyActions ------------- diff --git a/docs/source/DeveloperGuide/Systems/ResourceStatus/index.rst b/docs/source/DeveloperGuide/Systems/ResourceStatus/index.rst index 65e2f3335ea..398b5c0aa9f 100644 --- a/docs/source/DeveloperGuide/Systems/ResourceStatus/index.rst +++ b/docs/source/DeveloperGuide/Systems/ResourceStatus/index.rst @@ -115,7 +115,7 @@ Cache tables for metrics used by policies. * - PolicyResult - Policy evaluation results (Element, Name, PolicyName, Status, Reason) * - SpaceTokenOccupancyCache - - Storage space usage (Endpoint, Token, Free, Guaranteed) + - Storage space usage (Endpoint, Token, Free, Total) — values stored in MB * - TransferCache - Transfer quality metrics (SourceName, DestinationName, Metric, Value) @@ -215,6 +215,24 @@ Policies inherit from ``PolicyBase`` and implement ``evaluate()``. 
return {'Status': 'Degraded', 'Reason': f'Low efficiency: {efficiency:.2%}'} return {'Status': 'Banned', 'Reason': f'Very low efficiency: {efficiency:.2%}'} +FreeDiskSpace Policy +-------------------- + +The ``FreeDiskSpacePolicy`` (``Policy/FreeDiskSpacePolicy.py``) evaluates SE occupancy using +configurable thresholds. Thresholds are passed through as command arguments so they propagate +from the CS configuration all the way to the policy evaluation: + +1. ``Configurations.py`` defines the code-level defaults for ``unit``, ``Banned_threshold`` and + ``Degraded_threshold`` in the ``args`` dict of the ``FreeDiskSpace`` entry of ``POLICIESMETA``; + values set in the Operations CS under ``ResourceStatus/Policies`` are collected by ``InfoGetter`` and override these defaults. +2. ``FreeDiskSpaceCommand`` reads these values from ``self.args`` in ``_prepareCommand()`` and + returns them alongside ``Free`` and ``Total`` in both ``doNew()`` and ``doCache()``. +3. ``FreeDiskSpacePolicy._evaluate()`` reads ``Banned_threshold`` and ``Degraded_threshold`` + from the command result dict (both keys must be present) and applies the comparison. + +This design keeps thresholds fully configurable per deployment without code changes. +See :ref:`rss_advanced_configuration` for the available CS keys. + Command Implementation ---------------------- diff --git a/src/DIRAC/ResourceStatusSystem/Command/DowntimeCommand.py b/src/DIRAC/ResourceStatusSystem/Command/DowntimeCommand.py index aabd98e6d4e..18ee7548038 100644 --- a/src/DIRAC/ResourceStatusSystem/Command/DowntimeCommand.py +++ b/src/DIRAC/ResourceStatusSystem/Command/DowntimeCommand.py @@ -1,8 +1,17 @@ -""" DowntimeCommand module will look into GOC DB to find announced downtimes for RSS-managed sites and resources. - If found, downtimes are added to the internal RSS cache using ResourceManagementClient. +"""DowntimeCommand + +Command to fetch and cache GOCDB downtime information for RSS-managed Sites and Resources. +Downtimes found are stored in the DowntimeCache table via ResourceManagementClient. 
+Stale or deleted GOCDB downtimes are also removed from the cache. + +The look-ahead window is controlled by the ``hours`` argument read from ``self.args``, +populated by the policy engine from ``POLICIESMETA`` defaults and any CS overrides: + +* ``hours = 0`` — only ongoing downtimes are considered. +* ``hours > 0`` — downtimes starting within the next ``hours`` hours are also included. - GOCDB downtimes that are modified or deleted are also synced. """ + import re from datetime import datetime, timedelta from operator import itemgetter @@ -43,7 +52,10 @@ class DowntimeCommand(Command): """ - Downtime "master" Command or removed DTs. + Command that queries GOCDB for downtime information and caches the results. + + Supports Sites, Storage Elements, FTS servers, and Computing Elements. + DIRAC resource types are mapped to GOCDB service types via ``diracToGOC_conversion``. """ def __init__(self, args=None, clients=None): @@ -54,7 +66,13 @@ def __init__(self, args=None, clients=None): def _storeCommand(self, result): """ - Stores the results of doNew method on the database. + Persist a list of downtime records to the DowntimeCache table. + + :param list result: list of downtime dicts, each containing ``DowntimeID``, + ``Element``, ``Name``, ``StartDate``, ``EndDate``, ``Severity``, + ``Description``, ``Link``, and ``gOCDBServiceType``. + + :returns: S_OK / S_ERROR from the last ``addOrModifyDowntimeCache`` call. """ for dt in result: @@ -73,7 +91,15 @@ def _storeCommand(self, result): def _cleanCommand(self, element, elementNames): """ - Clear Cache from expired DT. + Remove expired or deleted downtime entries from the DowntimeCache. + + A cached entry is removed if its ``EndDate`` is in the past or if its + GOCDB link no longer appears in the list of current GOCDB downtimes. + + :param str element: ``'Site'`` or ``'Resource'``. + :param list elementNames: names of the elements whose cache entries should be checked. 
+ + :returns: S_OK with a list of deletion results, or S_ERROR on DB / GOCDB failure. """ resQuery = [] @@ -107,13 +133,28 @@ def _cleanCommand(self, element, elementNames): def _prepareCommand(self): """ - DowntimeCommand requires four arguments: - - name : - - element : Site / Resource - - elementType: + Extract and validate command arguments from ``self.args``, resolving DIRAC + names to their GOCDB equivalents where necessary. + + Required keys: + + * ``name`` (str) — DIRAC element name. + * ``element`` (str) — ``'Site'`` or ``'Resource'``. + * ``elementType`` (str) — resource type (e.g. ``StorageElement``, ``ComputingElement``, ``FTS3``). + + Optional key: - If the elements are Site(s), we need to get their GOCDB names. They may - not have, so we ignore them if they do not have. + * ``hours`` (int) — look-ahead window in hours (populated from ``POLICIESMETA``). + + Name resolution: + + * **Site** — converted to the GOCDB site name via ``getGOCSiteName``. + * **StorageElement** — resolved to one or more SE hosts; GOCDB service type + derived from SE type and access protocol via ``diracToGOC_conversion``. + * **FTS / FTS3** — resolved to the GOCDB FTS name via ``getGOCFTSName``. + * **ComputingElement** — GOCDB service type derived from CE type via ``diracToGOC_conversion``. + + :returns: S_OK tuple ``(element, elementName, hours, gOCDBServiceType)`` or S_ERROR. """ if "name" not in self.args: @@ -131,9 +172,7 @@ def _prepareCommand(self): if element not in ["Site", "Resource"]: return S_ERROR("element is neither Site nor Resource") - hours = None - if "hours" in self.args: - hours = self.args["hours"] + hours = self.args.get("hours") gOCDBServiceType = None @@ -208,19 +247,24 @@ def _prepareCommand(self): def doNew(self, masterParams=None): """ - Gets the parameters to run, either from the master method or from its - own arguments. + Fetch current downtime information from GOCDB and store it in the cache. 
+ + Queries GOCDB for ongoing (and optionally upcoming) downtimes for the given + element(s). The GOCDB server is queried twice on ``URLError`` to handle + transient failures. Found downtimes are stored via ``_storeCommand``; the + cache is cleaned of stale entries via ``_cleanCommand``. - For every elementName, unless it is given a list, in which case it contacts - the gocdb client. The server is not very stable, so in case of failure tries - a second time. + :param masterParams: when called from ``doMaster``, a ``(element, elementNames)`` + tuple (e.g. ``('Site', ['CERN', 'IN2P3-CC'])``); the look-ahead window is + taken from ``self.args.get('hours', 0)``. Pass ``None`` to use ``self.args`` + directly (normal per-element policy evaluation path). - If there are downtimes, are recorded and then returned. + :returns: S_OK on success (value is ``None`` if no downtimes were found), or S_ERROR. """ if masterParams is not None: element, elementNames = masterParams - hours = 120 + hours = self.args.get("hours", 0) elementName = None gOCDBServiceType = None @@ -293,8 +337,23 @@ def doNew(self, masterParams=None): def doCache(self): """ - Method that reads the cache table and tries to read from it. It will - return a list with one dictionary describing the DT if there are results. + Retrieve the most relevant downtime for this element from the DowntimeCache. + + When ``hours`` is set, the target date is shifted into the future and the + earliest matching downtime is returned (useful for advance warning of scheduled + outages). When ``hours`` is ``None``, ongoing downtimes are evaluated and the + highest-severity, longest-lasting one is returned. + + Priority when multiple downtimes overlap: + + * OUTAGE takes precedence over WARNING. + * Among equal severity: the one ending latest wins (``hours=None`` path) or + the one starting earliest wins (``hours>0`` path). 
+ + :returns: S_OK with a downtime dict (keys: ``DowntimeID``, ``Element``, ``Name``, + ``StartDate``, ``EndDate``, ``Severity``, ``Description``, ``Link``, + ``gOCDBServiceType``) if a relevant downtime exists, S_OK with ``None`` + if no downtime applies, or S_ERROR on DB failure. """ params = self._prepareCommand() @@ -362,10 +421,20 @@ def doCache(self): return S_OK(dtOverlapping[-1]) def doMaster(self): - """Master method, which looks little bit spaghetti code, sorry ! - - It gets all sites and transforms them into gocSites. - - It gets all the storage elements and transforms them into their hosts - - It gets the the CEs (FTS and file catalogs will come). + """ + Refresh downtime data for all known Sites and Resources from GOCDB. + + Collects: + + * All GOCDB site names (from ``getGOCSites``). + * All SE hosts (from ``getStorageElementsHosts``). + * All FTS3 server hosts (from ``getFTS3Servers``). + * All Computing Element names (from ``getCESiteMapping``). + + Calls ``doNew`` separately for Sites and Resources. Failures are recorded in + ``self.metrics['failed']`` but do not abort the run. + + :returns: S_OK with ``self.metrics`` dict (containing a ``'failed'`` list). """ gocSites = getGOCSites() diff --git a/src/DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py b/src/DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py index d194657807f..bff004ad0dc 100644 --- a/src/DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py +++ b/src/DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py @@ -1,10 +1,17 @@ -""" FreeDiskSpaceCommand - The Command gets the free space that is left in a Storage Element +"""FreeDiskSpaceCommand - Note: there are, still, many references to "space tokens", - for example ResourceManagementClient().selectSpaceTokenOccupancyCache(token=elementName) - This is for historical reasons, and shoud be fixed one day. - For the moment, when you see "token" or "space token" here, just read "StorageElement". 
+Command to retrieve and cache the free/total disk space of a Storage Element. + +The unit and decision thresholds are read from ``self.args``, which are populated +by the policy engine from ``POLICIESMETA`` defaults and any CS overrides: + +* ``unit`` — space unit for the occupancy query (``TB``, ``GB`` or ``MB``) +* ``Banned_threshold`` — free-space value below which the SE is Banned +* ``Degraded_threshold`` — free-space value below which the SE is Degraded + +Note: there are still many references to "space tokens" (e.g. +``ResourceManagementClient().selectSpaceTokenOccupancyCache(token=elementName)``). +This is for historical reasons; when you see "token" or "space token" here, read "StorageElement". """ @@ -25,7 +32,11 @@ class FreeDiskSpaceCommand(Command): """ - Uses diskSpace method to get the free space + Command that queries the occupancy of a Storage Element and caches the result. + + Occupancy values are stored in the SpaceTokenOccupancyCache table (in MB) and + recorded in the StorageOccupancy accounting. The unit used for the returned + values is configurable (default: TB). """ def __init__(self, args=None, clients=None): @@ -35,41 +46,57 @@ def __init__(self, args=None, clients=None): def _prepareCommand(self): """ - FreeDiskSpaceCommand requires one argument: - - name : + Extract and validate command arguments from ``self.args``. + + Required key: + + * ``name`` (str) — Storage Element name. + + Optional keys (populated from ``POLICIESMETA`` defaults and CS overrides): + + * ``unit`` (str) — space unit: ``TB``, ``GB`` or ``MB``. + * ``Banned_threshold`` (float) — free space below which the SE is Banned. + * ``Degraded_threshold`` (float) — free space below which the SE is Degraded. + + :returns: S_OK tuple ``(elementName, unit, banned_threshold, degraded_threshold)`` + or S_ERROR if ``name`` is missing. 
""" if "name" not in self.args: return S_ERROR('"name" not found in self.args') elementName = self.args["name"] - # We keep TB as default as this is what was used (and will still be used) - # in the policy for "space tokens" ("real", "data" SEs) - unit = self.args.get("unit", "TB") + unit = self.args["unit"] + banned_threshold = self.args["Banned_threshold"] + degraded_threshold = self.args["Degraded_threshold"] - return S_OK((elementName, unit)) + return S_OK((elementName, unit, banned_threshold, degraded_threshold)) def doNew(self, masterParams=None): """ - Gets the parameters to run, either from the master method or from its - own arguments. + Query the SE occupancy directly and cache the result. - Gets the total and the free disk space of a storage element - and inserts the results in the SpaceTokenOccupancyCache table - of ResourceManagementDB database. + Fetches the free and total disk space from the Storage Element, stores the + values in the SpaceTokenOccupancyCache table (in MB) and in the StorageOccupancy + accounting, then returns them to the caller in the configured unit together with + the decision thresholds. - The result is also returned to the caller, not only inserted. - What is inserted in the DB will normally be in MB, - what is returned will be in the specified unit. + :param masterParams: when called from ``doMaster``, a ``(name, unit)`` tuple + that overrides ``self.args``; otherwise ``None``. + + :returns: S_OK dict with keys ``Free``, ``Total``, ``Banned_threshold``, + ``Degraded_threshold`` (all in the configured unit), or S_ERROR. 
""" if masterParams is not None: elementName, unit = masterParams + banned_threshold = self.args["Banned_threshold"] + degraded_threshold = self.args["Degraded_threshold"] else: params = self._prepareCommand() if not params["OK"]: return params - elementName, unit = params["Value"] + elementName, unit, banned_threshold, degraded_threshold = params["Value"] se = StorageElement(elementName) occupancyResult = se.getOccupancy(unit=unit) @@ -84,18 +111,32 @@ def doNew(self, masterParams=None): if not result["OK"]: return result - return S_OK({"Free": free, "Total": total}) + return S_OK( + { + "Free": free, + "Total": total, + "Banned_threshold": banned_threshold, + "Degraded_threshold": degraded_threshold, + } + ) def _storeCommand(self, results): """ - Stores the results in the cache (SpaceTokenOccupancyCache), - and adds records to the StorageOccupancy accounting. - - :param dict results: something like {'ElementName': 'CERN-HIST-EOS', - 'Endpoint': 'httpg://srm-eoslhcb-bis.cern.ch:8443/srm/v2/server', - 'Free': 3264963586.10073, - 'Total': 8000000000.0} - :returns: S_OK/S_ERROR dict + Persist occupancy data to the cache table and accounting system. + + Writes to SpaceTokenOccupancyCache (values in MB) and registers + Free/Total/Used records in the StorageOccupancy accounting type. + + :param dict results: occupancy data, e.g.:: + + { + 'ElementName': 'CERN-HIST-EOS', + 'Endpoint': 'httpg://srm-eoslhcb-bis.cern.ch:8443/srm/v2/server', + 'Free': 3264963586.10073, # MB + 'Total': 8000000000.0, # MB + } + + :returns: S_OK on success, S_ERROR otherwise. """ # Stores in cache @@ -141,14 +182,21 @@ def _storeCommand(self, results): def doCache(self): """ - This is a method that gets the element's details from the spaceTokenOccupancyCache DB table. - It will return a dictionary with th results, converted to "correct" unit. + Retrieve SE occupancy from the SpaceTokenOccupancyCache table. 
+ + Values are stored in MB and converted on the fly to the configured unit + before being returned. The decision thresholds are appended to the result + so that ``FreeDiskSpacePolicy`` can evaluate them without re-reading the CS. + + :returns: S_OK dict with keys ``Free``, ``Total``, ``Banned_threshold``, + ``Degraded_threshold`` (all in the configured unit), or S_ERROR if + no cached record exists or the unit is invalid. """ params = self._prepareCommand() if not params["OK"]: return params - elementName, unit = params["Value"] + elementName, unit, banned_threshold, degraded_threshold = params["Value"] result = self.rmClient.selectSpaceTokenOccupancyCache(token=elementName) @@ -167,17 +215,28 @@ def doCache(self): if free == -sys.maxsize or total == -sys.maxsize: return S_ERROR("No valid unit specified") - return S_OK({"Free": free, "Total": total}) + return S_OK( + { + "Free": free, + "Total": total, + "Banned_threshold": banned_threshold, + "Degraded_threshold": degraded_threshold, + } + ) def doMaster(self): """ - This method calls the doNew method for each storage element - that exists in the CS. + Refresh occupancy data for all Storage Elements known to the CS. + + Iterates over all SEs returned by DMSHelpers, calls ``doNew`` for each one + (always using MB as the internal storage unit), then purges stale entries + from the cache via ``_cleanCommand``. + + :returns: S_OK on success, S_ERROR if the cache cleanup fails. """ for name in DMSHelpers().getStorageElements(): try: - # keeping TB as default diskSpace = self.doNew((name, "MB")) if not diskSpace["OK"]: self.log.warn("Unable to calculate free/total disk space", f"name: {name}") @@ -191,10 +250,17 @@ def doMaster(self): return self._cleanCommand() def _cleanCommand(self, toDelete=None): - """Clean the spaceTokenOccupancy table from old endpoints + """ + Remove stale entries from the SpaceTokenOccupancyCache table. 
+ + An entry is considered stale when its ``LastCheckTime`` is older than 6 hours + and the corresponding SE/endpoint pair no longer exists in the CS. + + :param tuple toDelete: if provided, a single ``(endpoint, storage_element_name)`` + tuple to delete explicitly, e.g. ``('httpg://srm-lhcb.cern.ch:8443/srm/managerv2', 'CERN-RAW')``. + If ``None`` (default), stale entries are detected automatically. - :param tuple toDelete: endpoint to remove (endpoint, storage_element_name), - e.g. ('httpg://srm-lhcb.cern.ch:8443/srm/managerv2', CERN-RAW) + :returns: S_OK always (individual deletion failures are logged as warnings). """ if not toDelete: toDelete = [] diff --git a/src/DIRAC/ResourceStatusSystem/Policy/Configurations.py b/src/DIRAC/ResourceStatusSystem/Policy/Configurations.py index f626163c5e0..26a56c0ef2e 100644 --- a/src/DIRAC/ResourceStatusSystem/Policy/Configurations.py +++ b/src/DIRAC/ResourceStatusSystem/Policy/Configurations.py @@ -1,62 +1,43 @@ -""" Configurations module +"""Configurations module - Configuration to use policies. +Configuration to use policies. - Follows the schema:: +Follows the schema:: - <PolicyNameInCS> : { - 'description' : <some human readable description>, - 'module' : <policy module name>, - 'command' : ( <command module name>, < command class name > ), - 'args' : { arguments for the command } or None - } + <PolicyNameInCS> : { + 'description' : <some human readable description>, + 'module' : <policy module name>, + 'command' : ( <command module name>, < command class name > ), + 'args' : { arguments for the command } or None + } + +The values in ``args`` are code-level defaults. They can be overridden per-policy +via the CS entry (e.g. ``Unit = GB`` directly under the policy name in +``/Operations/Defaults/ResourceStatus/Policies/``). +Deployment-wide defaults can also be set in a command-args section named after +the policy type (e.g. ``/Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace``); +these are picked up by InfoGetter before code-level defaults are applied. 
""" + POLICIESMETA = { # DownTime POLICIES - "DTOngoing": { - "description": "Ongoing and scheduled down-times", + "Downtime": { + "description": "Ongoing or scheduled down-times within from now (0 = ongoing only)", "module": "DowntimePolicy", "command": ("DowntimeCommand", "DowntimeCommand"), "args": {"hours": 0, "onlyCache": True}, }, - "DTScheduled1": { - "description": "Ongoing and scheduled down-times", - "module": "DowntimePolicy", - "command": ("DowntimeCommand", "DowntimeCommand"), - "args": {"hours": 1, "onlyCache": True}, - }, - "DTScheduled3": { - "description": "Ongoing and scheduled down-times", - "module": "DowntimePolicy", - "command": ("DowntimeCommand", "DowntimeCommand"), - "args": {"hours": 3, "onlyCache": True}, - }, - "DTScheduled": { - "description": "Scheduled down-times, starting in ", - "module": "DowntimePolicy", - "command": ("DowntimeCommand", "DowntimeCommand"), - "args": {"hours": 12, "onlyCache": True}, - }, - # Free Disk Space in Terabytes - "FreeDiskSpaceTB": { - "description": "Free disk space, in TB", - "module": "FreeDiskSpacePolicy", - "command": ("FreeDiskSpaceCommand", "FreeDiskSpaceCommand"), - "args": {"unit": "TB", "onlyCache": True}, - }, - # Free Disk Space in Gigabytes - "FreeDiskSpaceGB": { - "description": "Free disk space, in GB", - "module": "FreeDiskSpacePolicy", - "command": ("FreeDiskSpaceCommand", "FreeDiskSpaceCommand"), - "args": {"unit": "GB", "onlyCache": True}, - }, - # Free Disk Space in Megabytes - "FreeDiskSpaceMB": { - "description": "Free disk space, in MB", + # Free Disk Space + "FreeDiskSpace": { + "description": "Free disk space", "module": "FreeDiskSpacePolicy", "command": ("FreeDiskSpaceCommand", "FreeDiskSpaceCommand"), - "args": {"unit": "MB", "onlyCache": True}, + "args": { + "unit": "TB", + "Banned_threshold": 0.1, + "Degraded_threshold": 5, + "onlyCache": True, + }, }, # GGUS tickets open "GGUSTickets": { diff --git a/src/DIRAC/ResourceStatusSystem/Policy/DowntimePolicy.py 
b/src/DIRAC/ResourceStatusSystem/Policy/DowntimePolicy.py index 4deb0a346fc..0b009db6fd6 100644 --- a/src/DIRAC/ResourceStatusSystem/Policy/DowntimePolicy.py +++ b/src/DIRAC/ResourceStatusSystem/Policy/DowntimePolicy.py @@ -1,18 +1,41 @@ -""" DowntimePolicy module +"""DowntimePolicy + +Policy to evaluate the downtime status of a Site or Resource as reported by GOCDB. +The look-ahead window (``hours``) is fully configurable via the Operations CS under +``/Operations/Defaults/ResourceStatus/Policies/Downtime``. + """ + from DIRAC import S_OK from DIRAC.ResourceStatusSystem.PolicySystem.PolicyBase import PolicyBase class DowntimePolicy(PolicyBase): - """The DowntimePolicy checks for downtimes, scheduled or ongoing, depending on the command parameters.""" + """ + Policy that proposes a new status for a Site or Resource based on GOCDB downtime data. + + Whether the policy considers only ongoing downtimes or also scheduled ones within + a future window is controlled by the ``hours`` command argument (default: 0 = ongoing only). + """ @staticmethod def _evaluate(commandResult): - """It returns Active status if there is no downtime announced. - Banned if the element is in OUTAGE. - Degraded if it is on WARNING status. - Otherwise, it returns error. + """ + Evaluate the downtime policy against the result of DowntimeCommand. + + Severity mapping: + + * No downtime (``None``) → **Active** + * ``OUTAGE`` → **Banned** + * ``WARNING`` → **Degraded** + * any other severity → **Error** + + :param dict commandResult: S_OK / S_ERROR result from DowntimeCommand. + On success the value is either ``None`` (no downtime) or a dict with at least + ``Severity``, ``DowntimeID``, and ``Description`` keys. + + :returns: S_OK wrapping a dict ``{'Status': str, 'Reason': str}`` where Status is one of + ``Error``, ``Active``, ``Banned``, ``Degraded``. 
""" result = {"Status": None, "Reason": None}
@@ -36,7 +59,7 @@ def _evaluate(commandResult):
             result["Status"] = "Degraded"
 
         else:
-            _reason = f"DT_Policy: GOCDB returned an unknown value for DT: \"{status['DowntimeID']}\""
+            _reason = f'DT_Policy: GOCDB returned an unknown value for DT: "{status["DowntimeID"]}"'
             result["Status"] = "Error"
             result["Reason"] = _reason
             return S_OK(result)
diff --git a/src/DIRAC/ResourceStatusSystem/Policy/FreeDiskSpacePolicy.py b/src/DIRAC/ResourceStatusSystem/Policy/FreeDiskSpacePolicy.py
index 8e964ffa2c9..f2c4ebd2d9c 100644
--- a/src/DIRAC/ResourceStatusSystem/Policy/FreeDiskSpacePolicy.py
+++ b/src/DIRAC/ResourceStatusSystem/Policy/FreeDiskSpacePolicy.py
@@ -1,36 +1,37 @@
-""" FreeDiskSpacePolicy
+"""FreeDiskSpacePolicy
 
-  FreeDiskSpacePolicy.__bases__:
-    DIRAC.ResourceStatusSystem.PolicySystem.PolicyBase.PolicyBase
+Policy to evaluate the free disk space of a Storage Element.
+The unit and thresholds (Banned_threshold, Degraded_threshold) are fully
+configurable via the Operations CS under
+``/Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace``.
 
 """
+
 from DIRAC import S_OK
 from DIRAC.ResourceStatusSystem.PolicySystem.PolicyBase import PolicyBase
 
 
 class FreeDiskSpacePolicy(PolicyBase):
     """
-    The FreeDiskSpacePolicy class is a policy class satisfied when a SE has a
-    low occupancy.
+    Policy that proposes a new status for a Storage Element based on its free disk space.
 
-    FreeDiskSpacePolicy, given the space left at the element, proposes a new status.
+    The free space value and the thresholds (Banned_threshold, Degraded_threshold) are
+    expressed in the same unit (TB, GB or MB) as configured for the FreeDiskSpace policy
+    in the Operations CS. Default unit is TB; default thresholds are 0.1 (Banned) and 5
+    (Degraded).
     """
 
     @staticmethod
     def _evaluate(commandResult):
         """
-        Evaluate policy on SE occupancy: Use FreeDiskSpaceCommand
-
-        :Parameters:
-          **commandResult** - S_OK / S_ERROR
-            result of the command. It is expected ( iff S_OK ) a dictionary like
-            { 'Total' : .., 'Free' : ..}
-
-        :return:
-          {
-            'Status':Error|Active|Bad|Banned,
-            'Reason': Some lame statements that have to be updated
-          }
+        Evaluate the free disk space policy.
+
+        :param dict commandResult: S_OK / S_ERROR result from FreeDiskSpaceCommand.
+            On success the value is a dict with the mandatory keys ``Free`` and ``Total`` and the
+            optional keys ``Banned_threshold`` (default 0.1) and ``Degraded_threshold`` (default 5).
+
+        :returns: S_OK wrapping a dict ``{'Status': str, 'Reason': str}`` where Status is one of
+            ``Error``, ``Unknown``, ``Banned``, ``Degraded``, ``Active``.
         """
 
         result = {}
@@ -57,10 +58,12 @@ def _evaluate(commandResult):
 
         # Units (TB, GB, MB) may change,
         # depending on the configuration of the command in Configurations.py
-        if free < 0.1:
+        # Thresholds are optional in the command result: fall back to the documented defaults
+        # instead of raising KeyError, and cast because CS-provided values may arrive as strings.
+        if free < float(commandResult.get("Banned_threshold", 0.1)):
             result["Status"] = "Banned"
             result["Reason"] = "Too little free space"
-        elif free < 5:
+        elif free < float(commandResult.get("Degraded_threshold", 5)):
             result["Status"] = "Degraded"
             result["Reason"] = "Little free space"
         else:
diff --git a/src/DIRAC/ResourceStatusSystem/Policy/test/Test_RSS_Policy_FreeDiskSpacePolicy.py b/src/DIRAC/ResourceStatusSystem/Policy/test/Test_RSS_Policy_FreeDiskSpacePolicy.py
index 799bf361299..83380f501ce 100644
--- a/src/DIRAC/ResourceStatusSystem/Policy/test/Test_RSS_Policy_FreeDiskSpacePolicy.py
+++ b/src/DIRAC/ResourceStatusSystem/Policy/test/Test_RSS_Policy_FreeDiskSpacePolicy.py
@@ -1,5 +1,4 @@
-""" Test_RSS_Policy_FreeDiskSpacePolicy
-"""
+"""Test_RSS_Policy_FreeDiskSpacePolicy"""
 
 # pylint: disable=protected-access
 import unittest
@@ -72,17 +71,29 @@ def test_evaluate(self):
         self.assertEqual("Error", res["Value"]["Status"])
         self.assertEqual("Key Free missing", res["Value"]["Reason"])
 
-        res = module._evaluate({"OK": True, "Value": {"Total": 100, "Free": 0.0}})
+        res = module._evaluate(
+            {"OK": True, "Value": {"Total": 100, "Free": 0.0, "Banned_threshold": 0.1, "Degraded_threshold": 5}}
+        )
         self.assertTrue(res["OK"])
         self.assertEqual("Banned", res["Value"]["Status"])
         self.assertEqual("Too little free space", res["Value"]["Reason"])
 
-        res = module._evaluate({"OK": True, "Value": {"Total": 100, "Free": 4.0, "Guaranteed": 1}})
+        res = module._evaluate(
+            {
+                "OK": True,
+                "Value": {"Total": 100, "Free": 4.0, "Guaranteed": 1, "Banned_threshold": 0.1, "Degraded_threshold": 5},
+            }
+        )
         self.assertTrue(res["OK"])
         self.assertEqual("Degraded", res["Value"]["Status"])
         self.assertEqual("Little free space", res["Value"]["Reason"])
 
-        res = module._evaluate({"OK": True, "Value": {"Total": 100, "Free": 100, "Guaranteed": 1}})
+        res = module._evaluate(
+            {
+                "OK": True,
+                "Value": {"Total": 100, "Free": 100, "Guaranteed": 1, "Banned_threshold": 0.1, "Degraded_threshold": 5},
+            }
+        )
         self.assertTrue(res["OK"])
         self.assertEqual("Active", res["Value"]["Status"])
         self.assertEqual("Enough free space", res["Value"]["Reason"])
diff --git a/src/DIRAC/ResourceStatusSystem/Utilities/InfoGetter.py b/src/DIRAC/ResourceStatusSystem/Utilities/InfoGetter.py
index dda3a8764c2..cc1f0e855f0 100644
--- a/src/DIRAC/ResourceStatusSystem/Utilities/InfoGetter.py
+++ b/src/DIRAC/ResourceStatusSystem/Utilities/InfoGetter.py
@@ -45,21 +45,24 @@ def getPoliciesThatApply(decisionParams):
 
     # Get policies that match the given decisionParameters
     for policyName, policySetup in policiesConfig.items():
-        # The parameter policyType replaces policyName, so if it is not present,
-        # we pick policyName
+        # The parameter policyType is mandatory. If not present, skip this entry —
+        # it is a command-args defaults section, not a policy definition.
         try:
             policyType = policySetup["policyType"][0]
         except KeyError:
-            policyType = policyName
-            # continue
+            continue
 
         # The section matchParams is not mandatory, so we set {} as default.
         policyMatchParams = policySetup.get("matchParams", {})
         gLogger.debug(f"matchParams of {policyName}: {str(policyMatchParams)}")
 
-        # FIXME: make sure the values in the policyConfigParams dictionary are typed !!
-        policyConfigParams = {}
-        # policyConfigParams = policySetup.get( 'configParams', {} )
+        # Any non-reserved key of the CS policy entry is a command-argument override of the
+        # POLICIESMETA defaults. List values are unwrapped: only their first element is kept.
+        _reservedKeys = {"policyType", "matchParams", "configParams", "doNotCombineResult", "active"}
+        policyConfigParams = {
+            k: v[0] if isinstance(v, list) else v for k, v in policySetup.items() if k not in _reservedKeys
+        }
+
         policyMatch = Utils.configMatch(decisionParams, policyMatchParams)
         gLogger.debug(f"PolicyMatch for decisionParams {decisionParams}: {str(policyMatch)}")
 
@@ -67,7 +70,7 @@ def getPoliciesThatApply(decisionParams):
         # is not straightforward (e.g. when the policy specify a 'domain', while
         # the decisionParams has only the name of the element)
         if policyMatch and _filterPolicies(decisionParams, policyMatchParams):
-            policiesThatApply.append((policyName, policyType, policyConfigParams))
+            policiesThatApply.append((policyName, policyType, policyConfigParams, policyMatchParams))
 
     gLogger.debug(f"policies that apply (before post-processing): {str(policiesThatApply)}")
     policiesThatApply = postProcessingPolicyList(policiesThatApply)
@@ -76,7 +79,7 @@ def getPoliciesThatApply(decisionParams):
     objectLoader = ObjectLoader()
     policiesToBeLoaded = []
     # Gets policies parameters from code.
-    for policyName, policyType, _policyConfigParams in policiesThatApply:
+    for policyName, policyType, _policyConfigParams, _policyMatchParams in policiesThatApply:
         try:
             result = objectLoader.loadModule("DIRAC.ResourceStatusSystem.Policy.Configurations")
             if not result["OK"]:
@@ -92,8 +95,34 @@ def getPoliciesThatApply(decisionParams):
         policyDict = {"name": policyName, "type": policyType, "args": {}}
 
         # args is one of the parameters we are going to use on the policies. We copy
-        # the defaults and then we update if with whatever comes from the CS.
+        # the defaults from POLICIESMETA and then override with whatever comes from the CS.
         policyDict.update(policyMeta)
+        if _policyConfigParams and policyDict.get("args") is not None:
+            # Build a case-insensitive lookup of the existing arg keys so that CS keys
+            # like "Unit" correctly override POLICIESMETA keys like "unit".
+            argsKeyMap = {k.lower(): k for k in policyDict["args"]}
+            for csKey, csVal in _policyConfigParams.items():
+                targetKey = argsKeyMap.get(csKey.lower(), csKey)
+                # CS values are always strings; cast to the type of the existing default.
+                existingVal = policyDict["args"].get(targetKey)
+                if isinstance(existingVal, bool):
+                    # bool("False") is True, so the text itself has to be interpreted.
+                    csVal = str(csVal).strip().lower() in ("true", "yes", "y", "1")
+                elif existingVal is not None:
+                    # An int default (e.g. 5) must still accept a fractional override ("2.5").
+                    casters = (int, float) if isinstance(existingVal, int) else (type(existingVal),)
+                    for caster in casters:
+                        try:
+                            csVal = caster(csVal)
+                            break
+                        except (ValueError, TypeError):
+                            continue
+                    else:
+                        gLogger.warn(
+                            f"Policy {policyName}: cannot cast {csKey}={csVal!r} "
+                            f"to {type(existingVal).__name__}, keeping the raw value"
+                        )
+                policyDict["args"][targetKey] = csVal
 
         policiesToBeLoaded.append(policyDict)
 
@@ -262,28 +291,37 @@ def _filterPolicies(decisionParams, policyMatchParams):
 
 
 def postProcessingPolicyList(policiesThatApply):
-    """Put here any "hacky" post-processing"""
-
-    # FIXME: the following 2 "if" are a "hack" for dealing with the following case:
-    # an SE happens to be subject to, e.g., both the 'FreeDiskSpaceMB' and the 'FreeDiskSpaceGB' policies
-    # (currently, there is no way to avoid that this happens, see e.g. LogSE)
-    # When this is the case, supposing that an SE has 50 MB free, the policies evaluation will be the following:
-    # - 'FreeDiskSpaceMB' will evaluate 'Active'
-    # - 'FreeDiskSpaceGB' will evaluate 'Banned'
-    # so the SE will end up being banned, but we want only the 'FreeDiskSpaceMB' to be considered.
-    if ("FreeDiskSpaceMB", "FreeDiskSpaceMB", {}) in policiesThatApply:
-        try:
-            policiesThatApply.remove(("FreeDiskSpaceGB", "FreeDiskSpaceGB", {}))
-        except ValueError:
-            pass
-        try:
-            policiesThatApply.remove(("FreeDiskSpaceTB", "FreeDiskSpaceTB", {}))
-        except ValueError:
-            pass
-    if ("FreeDiskSpaceGB", "FreeDiskSpaceGB", {}) in policiesThatApply:
-        try:
-            policiesThatApply.remove(("FreeDiskSpaceTB", "FreeDiskSpaceTB", {}))
-        except ValueError:
-            pass
+    """Keep only the most specific policy of each ``policyType``.
+
+    When several policies of the same ``policyType`` match the current element, only
+    the most specific one is kept. Specificity is the pair ``(matches by name, number
+    of matchParams keys)``, so a policy matching by ``name`` always wins and the number
+    of ``matchParams`` keys only breaks the remaining ties. Equally specific policies
+    are resolved in favour of the one listed first.
+
+    :param list policiesThatApply: tuples
+        ``(policyName, policyType, policyConfigParams, policyMatchParams)``
+    :returns: list with one tuple per ``policyType``, in order of first appearance
+    """
+
+    def specificity(entry):
+        matchParams = entry[3]  # policyMatchParams
+        return ("name" in matchParams, len(matchParams))
+
+    # Group by policyType; a plain dict keeps the order in which the types first appear.
+    byType = {}
+    for entry in policiesThatApply:
+        byType.setdefault(entry[1], []).append(entry)
+
+    result = []
+    for policyType, entries in byType.items():
+        # max() returns the first of several equally specific entries.
+        mostSpecific = max(entries, key=specificity)
+        result.append(mostSpecific)
+        if len(entries) > 1:
+            gLogger.debug(
+                f"postProcessing: multiple {policyType!r} policies matched; "
+                f"keeping {mostSpecific[0]!r}, dropping {[e[0] for e in entries if e is not mostSpecific]}"
+            )
 
-    return policiesThatApply
+    return result
diff --git a/src/DIRAC/ResourceStatusSystem/Utilities/test/Test_InfoGetter.py b/src/DIRAC/ResourceStatusSystem/Utilities/test/Test_InfoGetter.py
new file mode 100644
index 00000000000..5e41a980c85
--- /dev/null
+++ b/src/DIRAC/ResourceStatusSystem/Utilities/test/Test_InfoGetter.py
@@ -0,0 +1,300 @@
+"""Tests for InfoGetter module
+
+Tests cover:
+- ``getPoliciesThatApply``: CS matching, policyType enforcement, arg defaults,
+  per-policy arg overrides (key normalisation + type casting), and
+  command-args sections (no policyType) being skipped.
+- ``postProcessingPolicyList``: single policy, no duplicates, specificity
+  disambiguation when multiple policies of the same type apply.
+"""
+
+from unittest.mock import patch
+
+import pytest
+
+from DIRAC import S_ERROR, S_OK
+from DIRAC.ResourceStatusSystem.Utilities.InfoGetter import getPoliciesThatApply, postProcessingPolicyList
+
+# ---------------------------------------------------------------------------
+# Shared fixtures
+# ---------------------------------------------------------------------------
+
+# Minimal CS-style policies tree reused across tests.
+# Values are lists (as returned by getCSTree).
+_BASE_POLICIES = {
+    # Command-args defaults section — no policyType, must be skipped.
+    "FreeDiskSpace": {
+        "Unit": ["TB"],
+        "Banned_threshold": ["0.1"],
+        "Degraded_threshold": ["5"],
+    },
+    # Matches all Resources of type StorageElement with WriteAccess.
+    "SEWriteAccessFreeDiskSpace": {
+        "policyType": ["FreeDiskSpace"],
+        "matchParams": {
+            "element": ["Resource"],
+            "elementType": ["StorageElement"],
+            "statusType": ["WriteAccess"],
+        },
+    },
+    # Matches only SE1 — overrides Unit and Banned_threshold.
+    "SpecificFreeDiskSpace": {
+        "policyType": ["FreeDiskSpace"],
+        "Unit": ["GB"],
+        "Banned_threshold": ["15"],
+        "matchParams": {"name": ["SE1"]},
+    },
+    # Matches all Sites.
+    "AlwaysBannedForSite": {
+        "policyType": ["AlwaysBanned"],
+        "matchParams": {"element": ["Site"]},
+    },
+}
+
+_GET_POLICIES = "DIRAC.ResourceStatusSystem.Utilities.RssConfiguration.getPolicies"
+
+_SE1_WRITEACCESS = {
+    "element": "Resource",
+    "name": "SE1",
+    "elementType": "StorageElement",
+    "statusType": "WriteAccess",
+    "status": "Active",
+    "reason": None,
+    "tokenOwner": None,
+    "active": "Active",
+}
+
+_SE2_WRITEACCESS = {
+    "element": "Resource",
+    "name": "SE2",
+    "elementType": "StorageElement",
+    "statusType": "WriteAccess",
+    "status": "Active",
+    "reason": None,
+    "tokenOwner": None,
+    "active": "Active",
+}
+
+_SITE1 = {
+    "element": "Site",
+    "name": "Site1",
+    "elementType": None,
+    "statusType": "ReadAccess",
+    "status": "Active",
+    "reason": None,
+    "tokenOwner": None,
+    "active": "Active",
+}
+
+# ---------------------------------------------------------------------------
+# getPoliciesThatApply — basic CS behaviour
+# ---------------------------------------------------------------------------
+
+
+@patch(_GET_POLICIES, return_value=S_OK({}))
+def test_no_policies_in_cs(_mock):
+    """Empty CS → empty result."""
+    result = getPoliciesThatApply(_SE1_WRITEACCESS)
+    assert result["OK"]
+    assert result["Value"] == []
+
+
+@patch(_GET_POLICIES, return_value=S_ERROR("CS unavailable"))
+def test_cs_error_is_propagated(_mock):
+    """CS failure → S_ERROR propagated."""
+    result = getPoliciesThatApply(_SE1_WRITEACCESS)
+    assert not result["OK"]
+
+
+@pytest.mark.parametrize(
+    "decisionParams, expected_names, unexpected_names",
+    [
+        pytest.param(
+            _SE1_WRITEACCESS,
+            [],
+            ["FreeDiskSpace"],
+            id="command-args-section-skipped",
+        ),
+        pytest.param(
+            _SE1_WRITEACCESS,
+            [],
+            ["AlwaysBannedForSite"],
+            id="site-policy-does-not-match-resource",
+        ),
+        pytest.param(
+            _SITE1,
+            ["AlwaysBannedForSite"],
+            [],
+            id="site-policy-matches-site",
+        ),
+    ],
+)
+@patch(_GET_POLICIES, return_value=S_OK(_BASE_POLICIES))
+def test_policy_matching(_mock, decisionParams, expected_names, unexpected_names):
+    """Policies are included or excluded based on element matching and policyType presence."""
+    result = getPoliciesThatApply(decisionParams)
+    assert result["OK"]
+    names = [p["name"] for p in result["Value"]]
+    for name in expected_names:
+        assert name in names
+    for name in unexpected_names:
+        assert name not in names
+
+
+def test_unknown_policytype_in_policiesmeta_is_skipped():
+    """A CS policy whose policyType has no entry in POLICIESMETA is silently skipped."""
+    policies = {
+        "GhostPolicy": {
+            "policyType": ["NonExistentPolicyType"],
+            "matchParams": {"element": ["Resource"]},
+        }
+    }
+    with patch(_GET_POLICIES, return_value=S_OK(policies)):
+        result = getPoliciesThatApply(_SE1_WRITEACCESS)
+    assert result["OK"]
+    assert result["Value"] == []
+
+
+# ---------------------------------------------------------------------------
+# getPoliciesThatApply — arg defaults and per-policy overrides
+# ---------------------------------------------------------------------------
+
+
+@patch(_GET_POLICIES, return_value=S_OK(_BASE_POLICIES))
+def test_se2_gets_default_args(_mock):
+    """SE2 matches only SEWriteAccessFreeDiskSpace → gets POLICIESMETA default args."""
+    result = getPoliciesThatApply(_SE2_WRITEACCESS)
+    assert result["OK"]
+    assert len(result["Value"]) == 1
+    args = result["Value"][0]["args"]
+    assert result["Value"][0]["name"] == "SEWriteAccessFreeDiskSpace"
+    assert args["unit"] == "TB"
+    assert args["Banned_threshold"] == 0.1
+    assert args["Degraded_threshold"] == 5
+
+
+@patch(_GET_POLICIES, return_value=S_OK(_BASE_POLICIES))
+def test_se1_specific_policy_wins_with_overridden_args(_mock):
+    """SE1 WriteAccess: SpecificFreeDiskSpace wins; overridden args applied, default kept."""
+    result = getPoliciesThatApply(_SE1_WRITEACCESS)
+    assert result["OK"]
+    assert len(result["Value"]) == 1
+    policy = result["Value"][0]
+    assert policy["name"] == "SpecificFreeDiskSpace"
+    assert policy["args"]["unit"] == "GB"  # overridden, key-normalised from "Unit"
+    assert policy["args"]["Banned_threshold"] == 15.0  # overridden, cast from str to float
+    assert policy["args"]["Degraded_threshold"] == 5  # not overridden → POLICIESMETA default
+
+
+@pytest.mark.parametrize(
+    "expected_key, unexpected_key, expected_value",
+    [
+        pytest.param("unit", "Unit", "GB", id="Unit-normalised-to-unit"),
+    ],
+)
+@patch(_GET_POLICIES, return_value=S_OK(_BASE_POLICIES))
+def test_arg_override_key_normalisation(_mock, expected_key, unexpected_key, expected_value):
+    """CS key 'Unit' (capital) must override POLICIESMETA key 'unit' (lowercase), not add a duplicate."""
+    result = getPoliciesThatApply(_SE1_WRITEACCESS)
+    assert result["OK"]
+    args = result["Value"][0]["args"]
+    assert unexpected_key not in args
+    assert args[expected_key] == expected_value
+
+
+@pytest.mark.parametrize(
+    "arg_key, expected_type",
+    [
+        pytest.param("Banned_threshold", float, id="Banned_threshold-cast-to-float"),
+        pytest.param("Degraded_threshold", int, id="Degraded_threshold-remains-int"),
+    ],
+)
+@patch(_GET_POLICIES, return_value=S_OK(_BASE_POLICIES))
+def test_arg_override_type_casting(_mock, arg_key, expected_type):
+    """CS values are strings; they must be cast to the type of the POLICIESMETA default."""
+    result = getPoliciesThatApply(_SE1_WRITEACCESS)
+    assert result["OK"]
+    assert isinstance(result["Value"][0]["args"][arg_key], expected_type)
+
+
+# ---------------------------------------------------------------------------
+# postProcessingPolicyList
+# ---------------------------------------------------------------------------
+
+
+def _entry(name, policyType, configParams=None, matchParams=None):
+    """Build a 4-tuple as produced by getPoliciesThatApply's inner loop."""
+    return (name, policyType, configParams or {}, matchParams or {})
+
+
+_GENERAL = _entry(
+    "SEWriteAccessFreeDiskSpace",
+    "FreeDiskSpace",
+    matchParams={"element": ["Resource"], "elementType": ["StorageElement"], "statusType": ["WriteAccess"]},
+)
+_SPECIFIC = _entry(
+    "SpecificFreeDiskSpace",
+    "FreeDiskSpace",
+    configParams={"unit": "GB", "Banned_threshold": 15.0},
+    matchParams={"name": ["SE1"]},
+)
+_NARROW = _entry(
+    "NarrowPolicy",
+    "FreeDiskSpace",
+    matchParams={"element": ["Resource"], "elementType": ["StorageElement"], "statusType": ["WriteAccess"]},
+)
+_BROAD = _entry(
+    "BroadPolicy",
+    "FreeDiskSpace",
+    matchParams={"element": ["Resource"]},
+)
+_DOWNTIME = _entry("SiteDowntime", "Downtime", matchParams={"element": ["Site"]})
+
+
+@pytest.mark.parametrize(
+    "entries, expected_count, expected_winner",
+    [
+        pytest.param(
+            [_entry("A", "FreeDiskSpace", matchParams={"element": ["Resource"]})],
+            1,
+            "A",
+            id="single-policy-unchanged",
+        ),
+        pytest.param(
+            [_GENERAL, _DOWNTIME],
+            2,
+            None,  # both kept — no winner check needed
+            id="different-types-both-kept",
+        ),
+        pytest.param(
+            [_GENERAL, _SPECIFIC],
+            1,
+            "SpecificFreeDiskSpace",
+            id="name-match-beats-broader-match",
+        ),
+        pytest.param(
+            [_SPECIFIC, _GENERAL],  # reversed order
+            1,
+            "SpecificFreeDiskSpace",
+            id="name-match-beats-broader-match-reversed",
+        ),
+        pytest.param(
+            [_BROAD, _NARROW],
+            1,
+            "NarrowPolicy",
+            id="more-matchparams-beats-fewer",
+        ),
+        pytest.param(
+            [_NARROW, _BROAD],  # reversed order
+            1,
+            "NarrowPolicy",
+            id="more-matchparams-beats-fewer-reversed",
+        ),
+    ],
+)
+def test_postprocessing(entries, expected_count, expected_winner):
+    """postProcessingPolicyList keeps the correct number of policies and the right winner."""
+    result = postProcessingPolicyList(entries)
+    assert len(result) == expected_count
+    if expected_winner is not None:
+        assert result[0][0] == expected_winner
diff --git a/tests/Jenkins/dirac-cfg-update-server.py b/tests/Jenkins/dirac-cfg-update-server.py
index bd75aec5b5d..57cd1b117be 100644
--- a/tests/Jenkins/dirac-cfg-update-server.py
+++ b/tests/Jenkins/dirac-cfg-update-server.py
@@ -302,35 +302,44 @@
 #      Config
 #      {
 #        Cache = 600
-#        State = Active
 #        FromAddress = fstagni@cern.ch
 #        notificationGroups = ShiftersGroup
 #      }
 #      Policies
 #      {
+#        Downtime
+#        {
+#          hours = 0
+#        }
+#        FreeDiskSpace
+#        {
+#          Unit = TB
+#          Banned_threshold = 0.1
+#          Degraded_threshold = 5
+#        }
 #        AlwaysActiveForResource
 #        {
+#          policyType = AlwaysActive
 #          matchParams
 #          {
 #            element = Resource
 #          }
-#          policyType = AlwaysActive
 #        }
 #        AlwaysBannedForSE1SE2
 #        {
+#          policyType = AlwaysBanned
 #          matchParams
 #          {
 #            name = SE1,SE2
 #          }
-#          policyType = AlwaysBanned
 #        }
 #        AlwaysBannedForSite
 #        {
+#          policyType = AlwaysBanned
 #          matchParams
 #          {
 #            element = Site
 #          }
-#          policyType = AlwaysBanned
 #        }
 #      }
 #    }
@@ -359,6 +368,21 @@
 if not res["OK"]:
     print(res["Message"])
     sys.exit(1)
+
+res = csAPI.createSection("Operations/Defaults/ResourceStatus/Policies/Downtime")
+if not res["OK"]:
+    print(res["Message"])
+    sys.exit(1)
+csAPI.setOption("Operations/Defaults/ResourceStatus/Policies/Downtime/hours", "0")
+
+res = csAPI.createSection("Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace")
+if not res["OK"]:
+    print(res["Message"])
+    sys.exit(1)
+csAPI.setOption("Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace/Unit", "TB")
+csAPI.setOption("Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace/Banned_threshold", "0.1")
+csAPI.setOption("Operations/Defaults/ResourceStatus/Policies/FreeDiskSpace/Degraded_threshold", "5")
+
 res = csAPI.createSection("Operations/Defaults/ResourceStatus/Policies/AlwaysActiveForResource")
 if not res["OK"]:
     print(res["Message"])