Skip to content

Commit 9212de9

Browse files
authored
Add group_per_user_services option to windows_service (#24087)
* Add group_per_user_services option to windows_service

  Strip the per-session LUID suffix from Windows per-user service instances so they report under their template name, reducing windows_service and display_name tag cardinality. Detection gates on the SERVICE_USERSERVICE_INSTANCE type flag plus a trailing _<LUID> suffix. Defaults to false.

* Address review: type hints, comment, filter-mode test

  Add type hints to _group_per_user_service_name and drop the redundant suffix-search guard. Document the grouped-instance behavior (instances collapse to one series; last-emitted state wins) as a code comment. Add a test covering grouping when services are selected by name filter.

* Add changelog entry

* Reuse ServiceAssertion in group_per_user tests

  Use the existing ServiceAssertion / assert_service_check_and_metrics helpers in the group_per_user tests for consistency with the rest of the suite; this also asserts the uptime/state/restarts metrics, not just the service check.
1 parent f66be12 commit 9212de9

9 files changed

Lines changed: 191 additions & 3 deletions

File tree

windows_service/assets/configuration/spec.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,4 +82,18 @@ files:
8282
value:
8383
type: boolean
8484
example: false
85+
- name: group_per_user_services
86+
description: |
87+
Whether or not to group Windows per-user services under their template name.
88+
89+
Per-user services are instantiated per user logon session and named
90+
`<service name>_<LUID>` (for example `OneSyncSvc_443f50`), which creates high
91+
cardinality in the `windows_service` and `display_name` tags. When enabled, the
92+
LUID suffix is stripped so all instances of a per-user service report under the
93+
same template name.
94+
See https://learn.microsoft.com/en-us/windows/application-management/per-user-services-in-windows
95+
fleet_configurable: true
96+
value:
97+
type: boolean
98+
example: false
8599
- template: instances/default
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add the `group_per_user_services` option to report Windows per-user service instances under their template name, reducing `windows_service` tag cardinality.

windows_service/datadog_checks/windows_service/config_models/defaults.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ def instance_enable_legacy_tags_normalization():
2828
return True
2929

3030

31+
def instance_group_per_user_services():
    """Return the default for the ``group_per_user_services`` instance option (disabled)."""
    return False
33+
34+
3135
def instance_min_collection_interval():
3236
return 15
3337

windows_service/datadog_checks/windows_service/config_models/instance.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class InstanceConfig(BaseModel):
4040
disable_legacy_service_tag: Optional[bool] = None
4141
empty_default_hostname: Optional[bool] = None
4242
enable_legacy_tags_normalization: Optional[bool] = None
43+
group_per_user_services: Optional[bool] = None
4344
metric_patterns: Optional[MetricPatterns] = None
4445
min_collection_interval: Optional[float] = None
4546
service: Optional[str] = None

windows_service/datadog_checks/windows_service/data/conf.yaml.example

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,18 @@ instances:
7373
#
7474
# collect_display_name_as_tag: false
7575

76+
## @param group_per_user_services - boolean - optional - default: false
77+
## Whether or not to group Windows per-user services under their template name.
78+
##
79+
## Per-user services are instantiated per user logon session and named
80+
## `<service name>_<LUID>` (for example `OneSyncSvc_443f50`), which creates high
81+
## cardinality in the `windows_service` and `display_name` tags. When enabled, the
82+
## LUID suffix is stripped so all instances of a per-user service report under the
83+
## same template name.
84+
## See https://learn.microsoft.com/en-us/windows/application-management/per-user-services-in-windows
85+
#
86+
# group_per_user_services: false
87+
7688
## @param tags - list of strings - optional
7789
## A list of tags to attach to every metric and service check emitted by this instance.
7890
##

windows_service/datadog_checks/windows_service/windows_service.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,18 @@
1616

1717
SERVICE_CONFIG_TRIGGER_INFO = 8
1818

19+
# Per-user service instance flag (SERVICE_USERSERVICE_INSTANCE in winsvc.h). Set on the
# per-session instances of a per-user service (named <template>_<LUID>), not on the template.
SERVICE_USERSERVICE_INSTANCE = 0x80
# Trailing "_<hex digits>" suffix appended to per-user service instance names.
USER_SERVICE_LUID_SUFFIX_RE = re.compile(r'_[0-9A-Fa-f]+$')


def _group_per_user_service_name(name: str, service_type: int) -> str:
    """Return the name a per-user service instance should be reported under.

    When ``service_type`` carries the ``SERVICE_USERSERVICE_INSTANCE`` flag, the trailing
    ``_<hex>`` suffix is removed from ``name`` so that every instance groups under its
    template name. Names of any other service type are returned unchanged, even when they
    happen to end in ``_<hex>``.

    If removing the suffix would leave an empty string (``name`` consists of nothing but the
    suffix), ``name`` is returned as-is so callers never build a tag with an empty value.
    """
    if not service_type & SERVICE_USERSERVICE_INSTANCE:
        return name
    # ``or name`` keeps the original when stripping would erase the whole name.
    return USER_SERVICE_LUID_SUFFIX_RE.sub('', name) or name
30+
1931

2032
def QueryServiceConfig2W(*args):
2133
"""
@@ -303,14 +315,31 @@ def check(self, instance):
303315
# See test_name_regex_order()
304316
service_filters = sorted(service_filters, reverse=True, key=lambda x: len(x.name or ""))
305317

318+
group_per_user_services = instance.get('group_per_user_services', False)
319+
306320
for service_status_process_enum in service_status_process_enums:
307321
service_name = service_status_process_enum["ServiceName"]
308322
display_name = service_status_process_enum["DisplayName"]
309323
state = service_status_process_enum["CurrentState"]
310324
service_pid = service_status_process_enum["ProcessId"]
325+
service_type = service_status_process_enum.get("ServiceType", 0)
311326

312327
service_view = ServiceView(scm_handle, service_name)
313328

329+
# Names used for tags; for per-user services these collapse the per-session LUID suffix
330+
# so all instances report under their template name.
331+
# The full instance name is kept for service handles and the restart PID cache.
332+
# Multiple instances thus collapse into a single series; if they are in different states
333+
# the reported state reflects whichever instance is emitted last.
334+
# We generally expect multiple per-user instances per host to be rare (terminal service
335+
# sessions only); the main win is grouping the windows_service tag across hosts (and thus
336+
# service checks) for easier monitoring.
337+
reported_name = service_name
338+
reported_display_name = display_name
339+
if group_per_user_services:
340+
reported_name = _group_per_user_service_name(service_name, service_type)
341+
reported_display_name = _group_per_user_service_name(display_name, service_type)
342+
314343
if 'ALL' not in services:
315344
for service_filter in service_filters:
316345
try:
@@ -338,11 +367,11 @@ def check(self, instance):
338367
status = self.STATE_TO_STATUS.get(state, self.UNKNOWN)
339368
state_string = self.STATE_TO_STRING.get(state, self.UNKNOWN_LITERAL)
340369

341-
tags = ['windows_service:{}'.format(service_name), 'windows_service_state:{}'.format(state_string)]
370+
tags = ['windows_service:{}'.format(reported_name), 'windows_service_state:{}'.format(state_string)]
342371
tags.extend(custom_tags)
343372

344373
if instance.get('collect_display_name_as_tag', False):
345-
tags.append('display_name:{}'.format(display_name))
374+
tags.append('display_name:{}'.format(reported_display_name))
346375

347376
if instance.get('windows_service_startup_type_tag', False):
348377
try:
@@ -355,7 +384,7 @@ def check(self, instance):
355384

356385
if not instance.get('disable_legacy_service_tag', False):
357386
self._log_deprecation('service_tag', 'windows_service')
358-
tags.append('service:{}'.format(service_name))
387+
tags.append('service:{}'.format(reported_name))
359388

360389
self.service_check(self.SERVICE_CHECK_NAME, status, tags=tags)
361390
self.log.debug('service state for %s %s', service_name, status)

windows_service/tests/common.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@
4444
],
4545
'disable_legacy_service_tag': True,
4646
}
47+
INSTANCE_GROUP_PER_USER_SERVICES = {
48+
'services': ['ALL'],
49+
'group_per_user_services': True,
50+
'collect_display_name_as_tag': True,
51+
'disable_legacy_service_tag': True,
52+
}
4753
INSTANCE_TRIGGER_START = {
4854
'services': [
4955
{'name': 'eventlog', 'startup_type': 'automatic', 'trigger_start': False},

windows_service/tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ def instance_trigger_start():
6565
return deepcopy(common.INSTANCE_TRIGGER_START)
6666

6767

68+
@pytest.fixture
def instance_group_per_user_services():
    """Hand each test its own copy of the grouping-enabled instance so it can be mutated."""
    instance = deepcopy(common.INSTANCE_GROUP_PER_USER_SERVICES)
    return instance
71+
72+
6873
@pytest.fixture
6974
def instance_name_regex_prefix():
7075
return deepcopy(common.INSTANCE_PREFIX_MATCH)

windows_service/tests/test_windows_service.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,122 @@ def test_service_restart_detection(aggregator, check, instance_basic):
389389
)
390390

391391

392+
# Per-user service instances carry SERVICE_USER_OWN_PROCESS (0x50) plus the
# SERVICE_USERSERVICE_INSTANCE flag (0x80). Templates (not enumerated) lack the 0x80 bit.
WIN32_OWN_PROCESS_TYPE = 0x10
USER_SERVICE_INSTANCE_TYPE = WIN32_OWN_PROCESS_TYPE | 0x40 | 0x80


def _per_user_mock_services():
    """Build the fake ``EnumServicesStatusEx`` result shared by the per-user grouping tests.

    Two running instances of the same per-user service (differing only in their suffix) are
    followed by one ordinary running service.
    """
    # Columns: service name, display name, process id, service type.
    rows = (
        ('OneSyncSvc_443f50', 'Sync Host_443f50', 1234, USER_SERVICE_INSTANCE_TYPE),
        ('OneSyncSvc_18f113', 'Sync Host_18f113', 5678, USER_SERVICE_INSTANCE_TYPE),
        ('Dnscache', 'DNS Client', 9999, WIN32_OWN_PROCESS_TYPE),
    )
    return [
        {
            'ServiceName': service_name,
            'DisplayName': display_name,
            'CurrentState': win32service.SERVICE_RUNNING,
            'ProcessId': pid,
            'ServiceType': service_type,
        }
        for service_name, display_name, pid, service_type in rows
    ]
422+
423+
424+
def test_group_per_user_services(aggregator, check, instance_group_per_user_services):
    """With grouping enabled, per-user instances are reported under their template name."""
    check_instance = check(instance_group_per_user_services)

    enumerated = _per_user_mock_services()
    with patch('win32service.EnumServicesStatusEx', return_value=enumerated):
        check_instance.check(instance_group_per_user_services)

    running = win32service.SERVICE_RUNNING
    expected = [
        # Both per-user instances collapse to the template name, so the grouped tag is submitted twice
        ServiceAssertion('OneSyncSvc', running, extra_tags=['display_name:Sync Host'], count=2),
        # The LUID-suffixed names must no longer be emitted
        ServiceAssertion('OneSyncSvc_443f50', running, count=0),
        ServiceAssertion('OneSyncSvc_18f113', running, count=0),
        # Non per-user services are untouched
        ServiceAssertion('Dnscache', running, extra_tags=['display_name:DNS Client']),
    ]
    assert_service_check_and_metrics(aggregator, expected)
440+
441+
442+
def test_group_per_user_services_disabled(aggregator, check, instance_group_per_user_services):
    """With grouping turned off, every per-user instance keeps its full suffixed name."""
    instance_group_per_user_services['group_per_user_services'] = False
    check_instance = check(instance_group_per_user_services)

    enumerated = _per_user_mock_services()
    with patch('win32service.EnumServicesStatusEx', return_value=enumerated):
        check_instance.check(instance_group_per_user_services)

    running = win32service.SERVICE_RUNNING
    expected = [
        # Without grouping each instance keeps its full LUID-suffixed name
        ServiceAssertion('OneSyncSvc_443f50', running, extra_tags=['display_name:Sync Host_443f50']),
        ServiceAssertion('OneSyncSvc_18f113', running, extra_tags=['display_name:Sync Host_18f113']),
        # The grouped template name is not emitted
        ServiceAssertion('OneSyncSvc', running, count=0),
    ]
    assert_service_check_and_metrics(aggregator, expected)
461+
462+
463+
def test_group_per_user_services_ignores_non_user_service(aggregator, check, instance_group_per_user_services):
    """A regular service is never renamed, even if its name looks like a per-user instance."""
    # A regular service whose name happens to end in _<hex> must not be stripped: it lacks the
    # SERVICE_USERSERVICE_INSTANCE flag.
    lookalike = {
        'ServiceName': 'MyService_abc123',
        'DisplayName': 'My Service_abc123',
        'CurrentState': win32service.SERVICE_RUNNING,
        'ProcessId': 4242,
        'ServiceType': WIN32_OWN_PROCESS_TYPE,
    }
    enumerated = [lookalike]
    check_instance = check(instance_group_per_user_services)

    with patch('win32service.EnumServicesStatusEx', return_value=enumerated):
        check_instance.check(instance_group_per_user_services)

    expected = [
        ServiceAssertion(
            'MyService_abc123', win32service.SERVICE_RUNNING, extra_tags=['display_name:My Service_abc123']
        ),
    ]
    assert_service_check_and_metrics(aggregator, expected)
486+
487+
488+
def test_group_per_user_services_with_name_filter(aggregator, check, instance_group_per_user_services):
    """Grouping also applies when services are selected by name instead of ``ALL``."""
    # Grouping must also apply when services are selected by a name filter (not just ALL). The
    # prefix regex matches the full LUID-suffixed instance names, but the emitted tag is grouped.
    instance_group_per_user_services['services'] = ['OneSyncSvc']

    check_instance = check(instance_group_per_user_services)

    enumerated = _per_user_mock_services()
    with patch('win32service.EnumServicesStatusEx', return_value=enumerated):
        check_instance.check(instance_group_per_user_services)

    running = win32service.SERVICE_RUNNING
    expected = [
        ServiceAssertion('OneSyncSvc', running, extra_tags=['display_name:Sync Host'], count=2),
        # The grouped template name must not be reported UNKNOWN by the services_unseen path
        ServiceAssertion('OneSyncSvc', -1, count=0),
        # The non-matching service is not reported
        ServiceAssertion('Dnscache', running, count=0),
    ]
    assert_service_check_and_metrics(aggregator, expected)
506+
507+
392508
@pytest.mark.e2e
393509
def test_basic_e2e(dd_agent_check, check, instance_basic):
394510
aggregator = dd_agent_check(instance_basic)

0 commit comments

Comments
 (0)