Skip to content

Commit bb96daf

Browse files
wanlonghenryBavneet Singh
authored andcommitted
Extend ContainerInsights Extension for high log scale mode support (#9)
1 parent cb0788d commit bb96daf

3 files changed

Lines changed: 240 additions & 2 deletions

File tree

src/k8s-extension/azext_k8s_extension/partner_extensions/ContainerInsights.py

Lines changed: 161 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,21 @@
3333
logger = get_logger(__name__)
3434
DCR_API_VERSION = "2022-06-01"
3535

36+
ContainerInsightsStreams = [
37+
"Microsoft-ContainerLog",
38+
"Microsoft-ContainerLogV2-HighScale",
39+
"Microsoft-KubeEvents",
40+
"Microsoft-KubePodInventory",
41+
"Microsoft-KubeNodeInventory",
42+
"Microsoft-KubePVInventory",
43+
"Microsoft-KubeServices",
44+
"Microsoft-KubeMonAgentEvents",
45+
"Microsoft-InsightsMetrics",
46+
"Microsoft-ContainerInventory",
47+
"Microsoft-ContainerNodeInventory",
48+
"Microsoft-Perf",
49+
]
50+
3651

3752
class ContainerInsights(DefaultExtension):
3853
def Create(self, cmd, client, resource_group_name, cluster_name, name, cluster_type, cluster_rp,
@@ -83,6 +98,7 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
8398
# Delete DCR-A if it exists incase of MSI Auth
8499
useAADAuth = False
85100
isDCRAExists = False
101+
enable_high_log_scale_mode = False
86102
cluster_rp, _ = get_cluster_rp_api_version(cluster_type=cluster_type, cluster_rp=cluster_rp)
87103
try:
88104
extension = client.get(resource_group_name, cluster_rp, cluster_type, cluster_name, name)
@@ -95,10 +111,15 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
95111
return
96112

97113
subscription_id = get_subscription_id(cmd.cli_ctx)
114+
resources = cf_resources(cmd.cli_ctx, subscription_id)
98115
# handle cluster type here
99116
cluster_resource_id = '/subscriptions/{0}/resourceGroups/{1}/providers/{2}/{3}/{4}'.format(subscription_id, resource_group_name, cluster_rp, cluster_type, cluster_name)
117+
workspace_resource_id = None
100118
if (extension is not None) and (extension.configuration_settings is not None):
101119
configSettings = extension.configuration_settings
120+
# Extract workspace resource ID if present
121+
if 'logAnalyticsWorkspaceResourceID' in configSettings:
122+
workspace_resource_id = configSettings['logAnalyticsWorkspaceResourceID']
102123
# omsagent is being renamed to ama-logs. Check for both for compatibility
103124
if 'omsagent.useAADAuth' in configSettings:
104125
useAADAuthSetting = configSettings['omsagent.useAADAuth']
@@ -108,6 +129,16 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
108129
useAADAuthSetting = configSettings['amalogs.useAADAuth']
109130
if (isinstance(useAADAuthSetting, str) and str(useAADAuthSetting).lower() == "true") or (isinstance(useAADAuthSetting, bool) and useAADAuthSetting):
110131
useAADAuth = True
132+
133+
# Check if high log scale mode was enabled
134+
if useAADAuth and 'amalogs.enableHighLogScaleMode' in configSettings:
135+
highLogScaleSetting = configSettings['amalogs.enableHighLogScaleMode']
136+
if isinstance(highLogScaleSetting, str):
137+
enable_high_log_scale_mode = (highLogScaleSetting.lower() == "true")
138+
elif isinstance(highLogScaleSetting, bool):
139+
enable_high_log_scale_mode = highLogScaleSetting
140+
else:
141+
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true/false or boolean type')
111142
if useAADAuth:
112143
association_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{cluster_resource_id}/providers/Microsoft.Insights/dataCollectionRuleAssociations/ContainerInsightsExtension?api-version={DCR_API_VERSION}"
113144
for _ in range(3):
@@ -131,6 +162,41 @@ def Delete(self, cmd, client, resource_group_name, cluster_name, name, cluster_t
131162
except Exception:
132163
pass # its OK to ignore the exception since MSI auth in preview
133164

165+
if useAADAuth:
166+
# Get the workspace region if workspace_resource_id is available
167+
workspace_region = None
168+
if workspace_resource_id:
169+
try:
170+
workspace_resource = resources.get_by_id(workspace_resource_id, '2015-11-01-preview')
171+
workspace_region = workspace_resource.location.replace(" ", "").lower()
172+
except Exception as ex:
173+
logger.warning("Skipping DCR and DCE deletion due to inability to determine workspace region")
174+
return
175+
# If workspace_region still couldn't be determined, skip deletion
176+
if not workspace_region:
177+
logger.warning("Workspace region could not be determined. Skipping DCR and DCE deletion.")
178+
return
179+
180+
# Use workspace_region for DCR name to match creation logic
181+
dcr_name = f"MSCI-{workspace_region}-{cluster_name}"
182+
dcr_name = dcr_name[0:64]
183+
184+
dcr_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Insights/dataCollectionRules/{dcr_name}"
185+
dcr_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{dcr_resource_id}?api-version={DCR_API_VERSION}"
186+
response = send_raw_request(cmd.cli_ctx, "GET", dcr_url)
187+
dcr_config = json.loads(response.text)
188+
# Delete the DCR
189+
for _ in range(3):
190+
try:
191+
send_raw_request(cmd.cli_ctx, "DELETE", dcr_url,)
192+
logger.info(f"Successfully deleted DCR: {dcr_name}")
193+
break
194+
except Exception as ex:
195+
logger.warning(f"Error deleting DCR: {str(ex)}")
196+
pass
197+
198+
if enable_high_log_scale_mode:
199+
_delete_dce_for_dcr(cmd, subscription_id, resource_group_name, dcr_config)
134200

135201
# Custom Validation Logic for Container Insights
136202

@@ -464,6 +530,7 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
464530
subscription_id = get_subscription_id(cmd.cli_ctx)
465531
workspace_resource_id = ''
466532
useAADAuth = True
533+
enableHighLogScaleMode = False # Default value
467534
if 'amalogs.useAADAuth' not in configuration_settings:
468535
configuration_settings['amalogs.useAADAuth'] = "true"
469536
extensionSettings = {}
@@ -520,6 +587,16 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
520587
raise InvalidArgumentValueError('streams must be an array type')
521588
extensionSettings["dataCollectionSettings"] = dataCollectionSettings
522589

590+
if useAADAuth and 'amalogs.enableHighLogScaleMode' in configuration_settings:
591+
enableHighLogScaleMode = configuration_settings['amalogs.enableHighLogScaleMode']
592+
if isinstance(enableHighLogScaleMode, str):
593+
enableHighLogScaleMode_str = enableHighLogScaleMode.lower()
594+
if enableHighLogScaleMode_str not in ["true", "false"]:
595+
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true or false')
596+
enableHighLogScaleMode = (enableHighLogScaleMode_str == "true")
597+
elif not isinstance(enableHighLogScaleMode, bool):
598+
raise InvalidArgumentValueError('amalogs.enableHighLogScaleMode value MUST be either true or false')
599+
523600
workspace_resource_id = workspace_resource_id.strip()
524601

525602
if configuration_protected_settings is not None:
@@ -548,7 +625,7 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
548625
if is_ci_extension_type:
549626
if useAADAuth:
550627
logger.info("creating data collection rule and association")
551-
_ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings)
628+
_ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings, enableHighLogScaleMode)
552629
elif not _is_container_insights_solution_exists(cmd, workspace_resource_id):
553630
logger.info("Creating ContainerInsights solution resource, since it doesn't exist and it is using legacy authentication")
554631
_ensure_container_insights_for_monitoring(cmd, workspace_resource_id).result()
@@ -597,6 +674,37 @@ def _get_container_insights_settings(cmd, cluster_resource_group_name, cluster_r
597674
configuration_settings['amalogs.domain'] = 'opinsights.azure.microsoft.scloud'
598675

599676

677+
def _delete_dce_for_dcr(cmd, subscription_id, cluster_resource_group_name, dcr_config):
678+
"""Delete Data Collection Endpoint associated with a DCR if it exists"""
679+
try:
680+
if ("properties" in dcr_config and
681+
"dataCollectionEndpointId" in dcr_config["properties"] and
682+
dcr_config["properties"]["dataCollectionEndpointId"]):
683+
684+
dce_id = dcr_config["properties"]["dataCollectionEndpointId"]
685+
dce_parts = dce_id.split('/')
686+
687+
if len(dce_parts) > 0:
688+
dce_name = dce_parts[-1]
689+
dce_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionEndpoints/{dce_name}"
690+
dce_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{dce_resource_id}?api-version=2022-06-01"
691+
# Try to delete up to 3 times
692+
for retry in range(3):
693+
try:
694+
send_raw_request(cmd.cli_ctx, "DELETE", dce_url)
695+
logger.info("Successfully deleted DCE: %s", dce_name)
696+
return True
697+
except CLIError as e:
698+
if "ResourceNotFound" in str(e):
699+
return True
700+
if retry == 2:
701+
logger.warning("Failed to delete DCE: %s - %s", dce_name, str(e))
702+
return False
703+
logger.info("Retrying DCE deletion after error: %s", str(e))
704+
except CLIError:
705+
pass
706+
return True
707+
600708
def get_existing_container_insights_extension_dcr_tags(cmd, dcr_url):
601709
tags = {}
602710
_MAX_RETRY_TIMES = 3
@@ -617,7 +725,7 @@ def get_existing_container_insights_extension_dcr_tags(cmd, dcr_url):
617725
return tags
618726

619727

620-
def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings):
728+
def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_resource_group_name, cluster_rp, cluster_type, cluster_name, workspace_resource_id, extensionSettings, enable_high_log_scale_mode):
621729
from azure.core.exceptions import HttpResponseError
622730

623731
cluster_region = ''
@@ -652,6 +760,18 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
652760
dataCollectionRuleName = dataCollectionRuleName[0:64]
653761
dcr_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionRules/{dataCollectionRuleName}"
654762

763+
# ingestion DCE MUST be in workspace region
764+
ingestionDataCollectionEndpointName = f"MSCI-ingest-{workspace_region}-{cluster_name}"
765+
# Max length of the DCE name is 43 chars
766+
ingestionDataCollectionEndpointName = _trim_suffix_if_needed(ingestionDataCollectionEndpointName[0:43])
767+
ingestion_dce_resource_id = None
768+
769+
# create ingestion DCE if high log scale mode enabled
770+
if enable_high_log_scale_mode:
771+
ingestion_dce_resource_id = create_data_collection_endpoint(
772+
cmd, subscription_id, cluster_resource_group_name, workspace_region, ingestionDataCollectionEndpointName
773+
)
774+
655775
# first get the association between region display names and region IDs (because for some reason
656776
# the "which RPs are available in which regions" check returns region display names)
657777
region_names_to_id = {}
@@ -677,6 +797,8 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
677797
# get existing tags on the container insights extension DCR if the customer added any
678798
existing_tags = get_existing_container_insights_extension_dcr_tags(cmd, dcr_url)
679799
streams = ["Microsoft-ContainerInsights-Group-Default"]
800+
if enable_high_log_scale_mode:
801+
streams = ContainerInsightsStreams
680802
if extensionSettings is None:
681803
extensionSettings = {}
682804
if 'dataCollectionSettings' in extensionSettings.keys():
@@ -691,6 +813,11 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
691813
}
692814
extensionSettings["dataCollectionSettings"] = dataCollectionSettings
693815

816+
if enable_high_log_scale_mode:
817+
for i, v in enumerate(streams):
818+
if v == "Microsoft-ContainerLogV2":
819+
streams[i] = "Microsoft-ContainerLogV2-HighScale"
820+
694821
# create the DCR
695822
dcr_creation_body = json.dumps(
696823
{
@@ -722,6 +849,7 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
722849
}
723850
]
724851
},
852+
"dataCollectionEndpointId": ingestion_dce_resource_id
725853
},
726854
}
727855
)
@@ -755,3 +883,34 @@ def _ensure_container_insights_dcr_for_monitoring(cmd, subscription_id, cluster_
755883
error = e
756884
else:
757885
raise error
886+
887+
888+
def create_data_collection_endpoint(cmd, subscription_id, cluster_resource_group_name, workspace_region, ingestionDataCollectionEndpointName):
889+
# create the ingestion DCE
890+
ingestion_dce_resource_id = f"/subscriptions/{subscription_id}/resourceGroups/{cluster_resource_group_name}/providers/Microsoft.Insights/dataCollectionEndpoints/{ingestionDataCollectionEndpointName}"
891+
ingestion_dce_url = cmd.cli_ctx.cloud.endpoints.resource_manager + f"{ingestion_dce_resource_id}?api-version=2022-06-01"
892+
ingestion_dce_creation_body = json.dumps({
893+
"location": workspace_region,
894+
"kind": "Linux",
895+
"properties": {
896+
"networkAcls": {
897+
"publicNetworkAccess": "Enabled"
898+
}
899+
}
900+
})
901+
error = None
902+
for _ in range(3):
903+
try:
904+
send_raw_request(cmd.cli_ctx, "PUT", ingestion_dce_url, body=ingestion_dce_creation_body)
905+
return ingestion_dce_resource_id
906+
except AzCLIError as e:
907+
error = e
908+
if error:
909+
raise error
910+
return ingestion_dce_resource_id
911+
912+
913+
def _trim_suffix_if_needed(s, suffix="-"):
914+
if s.endswith(suffix):
915+
s = s[:-len(suffix)]
916+
return s

testing/pipeline/k8s-custom-pipelines.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ stages:
3535
parameters:
3636
jobName: AzureMonitor
3737
path: ./test/extensions/public/AzureMonitor.Tests.ps1
38+
- template: ./templates/run-test.yml
39+
parameters:
40+
jobName: AzureMonitorHighScale
41+
path: ./test/extensions/public/AzureMonitorHighScale.Tests.ps1
3842
- template: ./templates/run-test.yml
3943
parameters:
4044
jobName: AzurePolicy
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
Describe 'Azure Monitor High Scale Mode Testing' {
2+
BeforeAll {
3+
$extensionType = "microsoft.azuremonitor.containers"
4+
$extensionName = "azuremonitor-containers"
5+
$extensionAgentName = "omsagent"
6+
$extensionAgentNamespace = "kube-system"
7+
8+
. $PSScriptRoot/../../helper/Constants.ps1
9+
. $PSScriptRoot/../../helper/Helper.ps1
10+
}
11+
12+
It 'Creates the extension with high log scale mode and checks that it onboards correctly' {
13+
az $Env:K8sExtensionName create -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) `
14+
--cluster-type connectedClusters --extension-type $extensionType -n $extensionName `
15+
--configuration-settings "amalogs.enableHighLogScaleMode=true" --no-wait
16+
$? | Should -BeTrue
17+
18+
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
19+
$? | Should -BeTrue
20+
21+
$extension = ($output | ConvertFrom-Json)
22+
$isAutoUpgradeMinorVersion = $extension.autoUpgradeMinorVersion
23+
$isAutoUpgradeMinorVersion.ToString() -eq "True" | Should -BeTrue
24+
25+
# Verify high scale mode configuration
26+
$settings = $extension.configurationSettings
27+
$settings.'amalogs.enableHighLogScaleMode' | Should -Be "true"
28+
29+
# Loop and retry until the extension installs
30+
$n = 0
31+
do
32+
{
33+
if (Has-ExtensionData $extensionName) {
34+
break
35+
}
36+
Start-Sleep -Seconds 10
37+
$n += 1
38+
} while ($n -le $MAX_RETRY_ATTEMPTS)
39+
$n | Should -BeLessOrEqual $MAX_RETRY_ATTEMPTS
40+
}
41+
42+
It "Performs a show on the extension" {
43+
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
44+
$? | Should -BeTrue
45+
$output | Should -Not -BeNullOrEmpty
46+
}
47+
48+
It "Lists the extensions on the cluster" {
49+
$output = az $Env:K8sExtensionName list -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters
50+
$? | Should -BeTrue
51+
52+
$output | Should -Not -BeNullOrEmpty
53+
$extensionExists = $output | ConvertFrom-Json | Where-Object { $_.extensionType -eq $extensionType }
54+
$extensionExists | Should -Not -BeNullOrEmpty
55+
}
56+
57+
It "Deletes the extension from the cluster" {
58+
$output = az $Env:K8sExtensionName delete -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName --force
59+
$? | Should -BeTrue
60+
61+
# Extension should not be found on the cluster
62+
$output = az $Env:K8sExtensionName show -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters -n $extensionName
63+
$? | Should -BeFalse
64+
$output | Should -BeNullOrEmpty
65+
}
66+
67+
It "Performs another list after the delete" {
68+
$output = az $Env:K8sExtensionName list -c $($ENVCONFIG.arcClusterName) -g $($ENVCONFIG.resourceGroup) --cluster-type connectedClusters
69+
$? | Should -BeTrue
70+
$output | Should -Not -BeNullOrEmpty
71+
72+
$extensionExists = $output | ConvertFrom-Json | Where-Object { $_.extensionType -eq $extensionName }
73+
$extensionExists | Should -BeNullOrEmpty
74+
}
75+
}

0 commit comments

Comments
 (0)