From 993917c3fe5b95ce442751ac748e548d641e272c Mon Sep 17 00:00:00 2001
From: Eric Weaver <eweaver755@gmail.com>
Date: Wed, 10 Jun 2026 09:03:45 -0400
Subject: [PATCH] [DBMON-6715] Update TagManager to handle internal and db tags
 (#23976)

* Update TagManager to handle internal and db tags correctly

* Add changelog
---
 datadog_checks_base/changelog.d/23976.added   |   1 +
 .../datadog_checks/base/utils/db/utils.py     | 108 +++++---
 .../tests/base/utils/db/test_util.py          | 252 ++++++++++++------
 3 files changed, 254 insertions(+), 107 deletions(-)
 create mode 100644 datadog_checks_base/changelog.d/23976.added

diff --git a/datadog_checks_base/changelog.d/23976.added b/datadog_checks_base/changelog.d/23976.added
new file mode 100644
index 0000000000000..9e2c3848673f5
--- /dev/null
+++ b/datadog_checks_base/changelog.d/23976.added
@@ -0,0 +1 @@
+Add ``include_internal`` and ``include_db`` options to ``TagManager.get_tags`` so DBM checks can exclude internal resource tags (``dd.internal.*``) and the per-database ``db`` tag from tag views that should not carry them.
diff --git a/datadog_checks_base/datadog_checks/base/utils/db/utils.py b/datadog_checks_base/datadog_checks/base/utils/db/utils.py
index 5ad2a61a79d2f..8bb52064b1310 100644
--- a/datadog_checks_base/datadog_checks/base/utils/db/utils.py
+++ b/datadog_checks_base/datadog_checks/base/utils/db/utils.py
@@ -535,20 +535,42 @@ class TagType(Enum):
     KEYLESS = auto()
 
 
+# Tags under keys with this prefix are internal resource tags (e.g. "dd.internal.resource") that are
+# consumed by the agent's metric submission pipeline. They should be excluded from payloads that don't
+# go through that pipeline, such as DBM metadata events.
+INTERNAL_TAG_PREFIX = "dd.internal"
+
+# Key used for the per-database tag. DBM checks exclude this from instance-level views because
+# the data is collected across all databases and the `db` tag is re-applied during ingestion.
+DB_TAG_KEY = "db"
+
+
 class TagManager:
     """
     Manages tags for a check. Tags are stored as a dictionary of key-value pairs
     for key-value tags and as a list of values for keyless tags useful for easy update and deletion.
-    There's an internal cache of the tag list to avoid generating the list of tag strings
+    There's an internal cache of the rendered tag lists to avoid generating the tag strings
     multiple times.
     """
 
     def __init__(self, normalizer: Optional[Callable[[Union[str, bytes]], str]] = None) -> None:
         self._tags: Dict[Union[str, TagType], List[str]] = {}
-        self._cached_tag_list: Optional[tuple[str, ...]] = None
+        # Rendered tag strings split into three disjoint buckets. Each tag string is stored exactly
+        # once, and a single render pass populates all three:
+        #   - _internal_tags: keys prefixed with "dd.internal"
+        #   - _db_tags: the per-database "db" key
+        #   - _base_tags: everything else (including keyless tags)
+        # get_tags() returns _base_tags merged with the requested optional buckets.
+        self._base_tags: tuple[str, ...] = ()
+        self._internal_tags: tuple[str, ...] = ()
+        self._db_tags: tuple[str, ...] = ()
+        self._cache_valid: bool = False
         self._keyless: TagType = TagType.KEYLESS
         self._normalizer = normalizer
 
+    def _invalidate_cache(self) -> None:
+        self._cache_valid = False
+
     def set_tag(self, key: Optional[str], value: str, replace: bool = False, normalize: bool = False) -> None:
         """
         Set a tag with the given key and value.
@@ -567,13 +589,18 @@ def set_tag(self, key: Optional[str], value: str, replace: bool = False, normali
             key = self._keyless
 
         if replace or key not in self._tags:
+            # Skip the cache invalidation if the value is unchanged. Dynamic tags (e.g. version,
+            # replication_role) are re-set with replace=True every check run but rarely change, so
+            # this lets the cached tag lists survive across runs.
+            if self._tags.get(key) == [value]:
+                return
             self._tags[key] = [value]
-            # Invalidate the cache since tags have changed
-            self._cached_tag_list = None
+            # Invalidate the caches since tags have changed
+            self._invalidate_cache()
         elif value not in self._tags[key]:
             self._tags[key].append(value)
-            # Invalidate the cache since tags have changed
-            self._cached_tag_list = None
+            # Invalidate the caches since tags have changed
+            self._invalidate_cache()
 
     def set_tags_from_list(self, tag_list: List[str], replace: bool = False, normalize: bool = False) -> None:
         """
@@ -586,7 +613,7 @@ def set_tags_from_list(self, tag_list: List[str], replace: bool = False, normali
         """
         if replace:
             self._tags.clear()
-            self._cached_tag_list = None
+            self._invalidate_cache()
 
         for tag in tag_list:
             if ':' in tag:
@@ -619,8 +646,8 @@ def delete_tag(self, key: Optional[str], value: Optional[str] = None, normalize:
         if value is None:
             # Delete the entire key
             del self._tags[key]
-            # Invalidate the cache
-            self._cached_tag_list = None
+            # Invalidate the caches
+            self._invalidate_cache()
             return True
         else:
             # Delete specific value if it exists
@@ -629,39 +656,58 @@ def delete_tag(self, key: Optional[str], value: Optional[str] = None, normalize:
                 # Clean up empty lists
                 if not self._tags[key]:
                     del self._tags[key]
-                # Invalidate the cache
-                self._cached_tag_list = None
+                # Invalidate the caches
+                self._invalidate_cache()
                 return True
         return False
 
-    def _generate_tag_strings(self, tags_dict: Dict[Union[str, TagType], List[str]]) -> tuple[str, ...]:
+    def _rebuild_cache(self) -> None:
         """
-        Generate a tuple of tag strings from a tags dictionary.
-        Args:
-            tags_dict (Dict[Union[str, TagType], List[str]]): Dictionary of tags to convert to strings
-        Returns:
-            tuple[str, ...]: Tuple of tag strings
+        Render the tags dict into the three disjoint buckets (base, internal, db) in a single pass.
+        For key-value tags the rendered form is "key:value"; for keyless tags it's just the value.
         """
-        return tuple(
-            value if key == self._keyless else f"{key}:{value}" for key, values in tags_dict.items() for value in values
-        )
-
-    def get_tags(self) -> List[str]:
+        base: List[str] = []
+        internal: List[str] = []
+        db: List[str] = []
+        for key, values in self._tags.items():
+            if key == self._keyless:
+                base.extend(values)
+                continue
+            if isinstance(key, str) and key.startswith(INTERNAL_TAG_PREFIX):
+                bucket = internal
+            elif key == DB_TAG_KEY:
+                bucket = db
+            else:
+                bucket = base
+            bucket.extend("{}:{}".format(key, value) for value in values)
+        self._base_tags = tuple(base)
+        self._internal_tags = tuple(internal)
+        self._db_tags = tuple(db)
+        self._cache_valid = True
+
+    def get_tags(self, include_internal: bool = True, include_db: bool = True) -> List[str]:
         """
         Get a list of tag strings.
         For key-value tags, returns "key:value" format.
         For keyless tags, returns just the value.
-        The returned list is always sorted alphabetically.
+        Args:
+            include_internal (bool): If False, excludes internal resource tags (keys prefixed with
+                "dd.internal"). Useful for payloads that don't go through the agent's metric
+                submission pipeline, such as DBM metadata events.
+            include_db (bool): If False, excludes the per-database tag (the "db" key). Useful for
+                instance-level metrics and DBM events where the database is determined per-row or
+                during ingestion.
         Returns:
-            list: Sorted list of tag strings
+            list: List of tag strings
         """
-        # Return cached list if available
-        if self._cached_tag_list is not None:
-            return list(self._cached_tag_list)
-
-        # Generate and cache regular tags
-        self._cached_tag_list = self._generate_tag_strings(self._tags)
-        return list(self._cached_tag_list)
+        if not self._cache_valid:
+            self._rebuild_cache()
+        tags = list(self._base_tags)
+        if include_internal:
+            tags.extend(self._internal_tags)
+        if include_db:
+            tags.extend(self._db_tags)
+        return tags
 
 
 def now_ms() -> int:
diff --git a/datadog_checks_base/tests/base/utils/db/test_util.py b/datadog_checks_base/tests/base/utils/db/test_util.py
index d25e3b895c678..4fcb378be812a 100644
--- a/datadog_checks_base/tests/base/utils/db/test_util.py
+++ b/datadog_checks_base/tests/base/utils/db/test_util.py
@@ -502,46 +502,38 @@ def test_init(self):
         """Test initialization of TagManager"""
         tag_manager = TagManager()
         assert tag_manager._tags == {}
-        assert tag_manager._cached_tag_list is None
+        assert tag_manager._cache_valid is False
         assert tag_manager._keyless == TagType.KEYLESS
 
     @pytest.mark.parametrize(
-        'key,value,expected_tags',
+        'operations,expected_tags',
         [
-            ('test_key', 'test_value', {'test_key': ['test_value']}),
-            (None, 'test_value', {TagType.KEYLESS: ['test_value']}),
+            pytest.param([('test_key', 'test_value', False)], {'test_key': ['test_value']}, id='single key-value'),
+            pytest.param([(None, 'test_value', False)], {TagType.KEYLESS: ['test_value']}, id='single keyless'),
+            pytest.param(
+                [('test_key', 'value1', False), ('test_key', 'value2', False)],
+                {'test_key': ['value1', 'value2']},
+                id='append to existing key',
+            ),
+            pytest.param(
+                [('test_key', 'value1', False), ('test_key', 'value2', True)],
+                {'test_key': ['value2']},
+                id='replace existing key',
+            ),
+            pytest.param(
+                [('test_key', 'test_value', False), ('test_key', 'test_value', False)],
+                {'test_key': ['test_value']},
+                id='duplicate value ignored',
+            ),
         ],
     )
-    def test_set_tag(self, key, value, expected_tags):
-        """Test setting tags with various combinations of key and value"""
+    def test_set_tag(self, operations, expected_tags):
+        """Test setting tags across single, keyless, append, replace, and duplicate scenarios"""
         tag_manager = TagManager()
-        tag_manager.set_tag(key, value)
+        for key, value, replace in operations:
+            tag_manager.set_tag(key, value, replace=replace)
         assert tag_manager._tags == expected_tags
-        assert tag_manager._cached_tag_list is None
-
-    def test_set_tag_existing_key_append(self):
-        """Test appending a value to an existing key"""
-        tag_manager = TagManager()
-        tag_manager.set_tag('test_key', 'value1')
-        tag_manager.set_tag('test_key', 'value2')
-        assert tag_manager._tags == {'test_key': ['value1', 'value2']}
-        assert tag_manager._cached_tag_list is None
-
-    def test_set_tag_existing_key_replace(self):
-        """Test replacing values for an existing key"""
-        tag_manager = TagManager()
-        tag_manager.set_tag('test_key', 'value1')
-        tag_manager.set_tag('test_key', 'value2', replace=True)
-        assert tag_manager._tags == {'test_key': ['value2']}
-        assert tag_manager._cached_tag_list is None
-
-    def test_set_tag_duplicate_value(self):
-        """Test setting a duplicate value for a key"""
-        tag_manager = TagManager()
-        tag_manager.set_tag('test_key', 'test_value')
-        tag_manager.set_tag('test_key', 'test_value')
-        assert tag_manager._tags == {'test_key': ['test_value']}
-        assert tag_manager._cached_tag_list is None
+        assert tag_manager._cache_valid is False
 
     @pytest.mark.parametrize(
         'key,values,delete_key,delete_value,expected_tags,description',
@@ -575,13 +567,13 @@ def test_delete_tag(self, key, values, delete_key, delete_value, expected_tags,
 
         # Generate initial cache
         tag_manager.get_tags()
-        assert tag_manager._cached_tag_list is not None
+        assert tag_manager._cache_valid is True
 
         # Perform deletion
         assert tag_manager.delete_tag(delete_key, delete_value)
 
         # Verify cache is invalidated
-        assert tag_manager._cached_tag_list is None
+        assert tag_manager._cache_valid is False
 
         # Verify internal state
         assert tag_manager._tags == expected_tags
@@ -633,13 +625,13 @@ def test_set_tags_from_list(self, initial_tags, tag_list, replace, expected_tags
 
         # Generate cache to ensure it exists
         _ = tag_manager.get_tags()
-        assert tag_manager._cached_tag_list is not None
+        assert tag_manager._cache_valid is True
 
         tag_manager.set_tags_from_list(tag_list, replace=replace)
 
         # Verify cache was invalidated when needed
         if replace:
-            assert tag_manager._cached_tag_list is None
+            assert tag_manager._cache_valid is False
 
         # Verify final state
         assert sorted(tag_manager.get_tags()) == sorted(expected_tags)
@@ -676,11 +668,11 @@ def test_cache_management(self):
 
         # First call should generate cache
         _ = tag_manager.get_tags()
-        assert tag_manager._cached_tag_list is not None
+        assert tag_manager._cache_valid is True
 
         # Modify regular tags
         tag_manager.set_tag('regular_key2', 'regular_value2')
-        assert tag_manager._cached_tag_list is None
+        assert tag_manager._cache_valid is False
 
         # Verify all tags are included
         second_result = tag_manager.get_tags()
@@ -704,6 +696,128 @@ def test_get_tags_mutation(self):
         assert new_tags == ['test_key:test_value']
         assert new_tags != tags  # The lists should be different objects
 
+    @pytest.mark.parametrize(
+        'include_internal,include_db,expected_tags',
+        [
+            pytest.param(
+                True,
+                True,
+                [
+                    'env:prod',
+                    'db:mydb',
+                    'dd.internal.resource:database_instance:host1',
+                    'dd.internal.resource:aws_rds_instance:endpoint',
+                ],
+                id='full view',
+            ),
+            pytest.param(
+                False,
+                True,
+                ['env:prod', 'db:mydb'],
+                id='exclude internal',
+            ),
+            pytest.param(
+                True,
+                False,
+                [
+                    'env:prod',
+                    'dd.internal.resource:database_instance:host1',
+                    'dd.internal.resource:aws_rds_instance:endpoint',
+                ],
+                id='exclude db',
+            ),
+            pytest.param(
+                False,
+                False,
+                ['env:prod'],
+                id='exclude internal and db',
+            ),
+        ],
+    )
+    def test_get_tags_view_filtering(self, include_internal, include_db, expected_tags):
+        """Test that include_internal/include_db filter the dd.internal.* and db buckets"""
+        tag_manager = TagManager()
+        tag_manager.set_tag('env', 'prod')
+        tag_manager.set_tag('db', 'mydb')
+        tag_manager.set_tag('dd.internal.resource', 'database_instance:host1')
+        tag_manager.set_tag('dd.internal.resource', 'aws_rds_instance:endpoint')
+
+        assert sorted(tag_manager.get_tags(include_internal=include_internal, include_db=include_db)) == sorted(
+            expected_tags
+        )
+
+    def test_get_tags_exclude_db_is_exact_key_match(self):
+        """A key like "dbms_flavor" must NOT be excluded by the db filter (exact-key match, not prefix)"""
+        tag_manager = TagManager()
+        tag_manager.set_tag('db', 'mydb')
+        tag_manager.set_tag('dbms_flavor', 'postgres')
+
+        tags = tag_manager.get_tags(include_db=False)
+        assert 'db:mydb' not in tags
+        assert 'dbms_flavor:postgres' in tags
+
+    def test_get_tags_filtering_reflects_mutations(self):
+        """Test that filtered views reflect subsequent mutations (cache invalidation)"""
+        tag_manager = TagManager()
+        tag_manager.set_tag('env', 'prod')
+        tag_manager.set_tag('dd.internal.resource', 'database_instance:host1')
+
+        assert tag_manager.get_tags(include_internal=False) == ['env:prod']
+
+        # Reading a filtered view must not affect the full view
+        assert sorted(tag_manager.get_tags()) == sorted(['env:prod', 'dd.internal.resource:database_instance:host1'])
+
+        tag_manager.set_tag('service', 'web')
+        assert sorted(tag_manager.get_tags(include_internal=False)) == sorted(['env:prod', 'service:web'])
+
+    def test_set_tag_replace_noop_preserves_cache(self):
+        """Re-setting a tag with replace=True and the same value must not invalidate the cache"""
+        tag_manager = TagManager()
+        tag_manager.set_tag('env', 'prod')
+        tag_manager.set_tag('db', 'mydb')
+
+        # Build the rendered cache
+        tag_manager.get_tags()
+        assert tag_manager._cache_valid is True
+
+        # Re-setting an existing tag to the same value via replace is a no-op: cache survives
+        tag_manager.set_tag('env', 'prod', replace=True)
+        assert tag_manager._cache_valid is True
+
+        # Setting a tag with a different value via replace still invalidates
+        tag_manager.set_tag('env', 'staging', replace=True)
+        assert tag_manager._cache_valid is False
+        assert 'env:staging' in tag_manager.get_tags()
+        assert 'env:prod' not in tag_manager.get_tags()
+
+    def test_set_tag_replace_noop_collapses_multivalue(self):
+        """replace with the same single value is a no-op, but replacing a multi-value key is not"""
+        tag_manager = TagManager()
+        tag_manager.set_tag('team', 'a')
+        tag_manager.set_tag('team', 'b')  # team now has two values
+        tag_manager.get_tags()
+        assert tag_manager._cache_valid is True
+
+        # Replacing the multi-value key with a single value must invalidate and collapse it
+        tag_manager.set_tag('team', 'a', replace=True)
+        assert tag_manager._cache_valid is False
+        assert tag_manager._tags['team'] == ['a']
+
+    def test_filtered_view_cache(self):
+        """Test that the rendered buckets are cached and invalidated on mutation"""
+        tag_manager = TagManager()
+        tag_manager.set_tag('env', 'prod')
+        tag_manager.set_tag('db', 'mydb')
+
+        assert tag_manager._cache_valid is False
+        assert tag_manager.get_tags(include_db=False) == ['env:prod']
+        assert tag_manager._cache_valid is True
+
+        # Mutating tags invalidates the cache
+        tag_manager.set_tag('service', 'web')
+        assert tag_manager._cache_valid is False
+        assert sorted(tag_manager.get_tags(include_db=False)) == sorted(['env:prod', 'service:web'])
+
     # Normalization tests
     def mock_tag_normalizer(self, tag):
         """Mock normalizer that replaces spaces and hyphens with underscores and lowercases"""
@@ -714,7 +828,7 @@ def test_init_with_normalizer(self):
         tag_manager = TagManager(normalizer=self.mock_tag_normalizer)
         assert tag_manager._normalizer == self.mock_tag_normalizer
         assert tag_manager._tags == {}
-        assert tag_manager._cached_tag_list is None
+        assert tag_manager._cache_valid is False
 
     def test_set_tag_with_normalization(self):
         """Test setting tags with normalization enabled"""
@@ -741,50 +855,36 @@ def test_set_tag_without_normalization(self):
         tag_manager.set_tag('another-key', 'another value')
         assert tag_manager._tags == {'test-key': ['value with spaces'], 'another-key': ['another value']}
 
-    def test_set_tags_from_list_with_normalization(self):
-        """Test setting tags from list with normalization enabled"""
-        tag_manager = TagManager(normalizer=self.mock_tag_normalizer)
-
-        tag_list = ['env:prod-test', 'service:web app', 'keyless-tag']
-        tag_manager.set_tags_from_list(tag_list, normalize=True)
-
-        expected_tags = sorted(['env:prod_test', 'service:web_app', 'keyless_tag'])
-        assert sorted(tag_manager.get_tags()) == expected_tags
-
-    def test_set_tags_from_list_without_normalization(self):
-        """Test setting tags from list without normalization"""
-        tag_manager = TagManager(normalizer=self.mock_tag_normalizer)
-
-        tag_list = ['env:prod-test', 'service:web app', 'keyless-tag']
-        tag_manager.set_tags_from_list(tag_list, normalize=False)
-
-        expected_tags = sorted(['env:prod-test', 'service:web app', 'keyless-tag'])
-        assert sorted(tag_manager.get_tags()) == expected_tags
-
-    def test_delete_tag_with_normalization(self):
-        """Test deleting tags with normalization enabled"""
+    @pytest.mark.parametrize(
+        'normalize,expected_tags',
+        [
+            pytest.param(True, ['env:prod_test', 'service:web_app', 'keyless_tag'], id='normalize'),
+            pytest.param(False, ['env:prod-test', 'service:web app', 'keyless-tag'], id='no normalize'),
+        ],
+    )
+    def test_set_tags_from_list_normalization(self, normalize, expected_tags):
+        """Test setting tags from a list with and without normalization"""
         tag_manager = TagManager(normalizer=self.mock_tag_normalizer)
 
-        # Set tag with normalization (only value normalized)
-        tag_manager.set_tag('test-key', 'value with spaces', normalize=True)
-        assert 'test-key' in tag_manager._tags
-        assert tag_manager._tags['test-key'] == ['value_with_spaces']
+        tag_manager.set_tags_from_list(['env:prod-test', 'service:web app', 'keyless-tag'], normalize=normalize)
 
-        # Delete with normalization - should find the normalized value
-        result = tag_manager.delete_tag('test-key', 'value with spaces', normalize=True)
-        assert result is True
-        assert tag_manager._tags == {}
+        assert sorted(tag_manager.get_tags()) == sorted(expected_tags)
 
-    def test_delete_tag_without_normalization(self):
-        """Test deleting tags without normalization"""
+    @pytest.mark.parametrize(
+        'normalize,stored_value',
+        [
+            pytest.param(True, 'value_with_spaces', id='normalize'),
+            pytest.param(False, 'value with spaces', id='no normalize'),
+        ],
+    )
+    def test_delete_tag_normalization(self, normalize, stored_value):
+        """Test setting then deleting a tag with and without normalization"""
         tag_manager = TagManager(normalizer=self.mock_tag_normalizer)
 
-        # Set tag without normalization
-        tag_manager.set_tag('test-key', 'value with spaces', normalize=False)
-        assert tag_manager._tags == {'test-key': ['value with spaces']}
+        tag_manager.set_tag('test-key', 'value with spaces', normalize=normalize)
+        assert tag_manager._tags == {'test-key': [stored_value]}
 
-        # Delete without normalization
-        result = tag_manager.delete_tag('test-key', 'value with spaces', normalize=False)
+        result = tag_manager.delete_tag('test-key', 'value with spaces', normalize=normalize)
         assert result is True
         assert tag_manager._tags == {}