From 5f8cb4ab0e6536a866bc4bfbf19cd84fd38bfbd6 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 8 May 2026 10:27:03 -0700 Subject: [PATCH 1/6] Add per-node ContainerLogV2 coverage check to e2e querylogs --- test/ginkgo-e2e/querylogs/querylogs_test.go | 16 +++++ test/ginkgo-e2e/utils/kubernetes_api_utils.go | 20 ++++++ test/ginkgo-e2e/utils/query_logs_api_utils.go | 70 +++++++++++++++++++ 3 files changed, 106 insertions(+) diff --git a/test/ginkgo-e2e/querylogs/querylogs_test.go b/test/ginkgo-e2e/querylogs/querylogs_test.go index c0e46f393..8d74c7e28 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_test.go @@ -41,6 +41,22 @@ var _ = Describe("When querying the logs for the table", func() { ) }) +var _ = Describe("When querying ContainerLogV2 per node", func() { + It("Every node hosting an ama-logs DaemonSet pod should have ContainerLogV2 rows", func() { + if GenevaIntegrationEnabled == "true" { + Skip("ContainerLogV2 per-node coverage skipped because GENEVA_INTEGRATION is set to 'true'") + } + + expectedNodes, err := utils.GetExpectedAmaLogsNodes(K8sClient) + Expect(err).NotTo(HaveOccurred()) + + observed, err := utils.QueryContainerLogV2CountsByComputer(LogsClient, AKSResourceId, "5m") + Expect(err).NotTo(HaveOccurred()) + + Expect(utils.AssertContainerLogV2NodeCoverage(expectedNodes, observed)).NotTo(HaveOccurred()) + }) +}) + var _ = Describe("When querying the logs for the ContainerInventory", func() { DescribeTable("Column should have zero empty values", func(column string) { diff --git a/test/ginkgo-e2e/utils/kubernetes_api_utils.go b/test/ginkgo-e2e/utils/kubernetes_api_utils.go index ae795e989..69206b445 100644 --- a/test/ginkgo-e2e/utils/kubernetes_api_utils.go +++ b/test/ginkgo-e2e/utils/kubernetes_api_utils.go @@ -522,6 +522,26 @@ func GetAndUpdateConfigMap(clientset *kubernetes.Clientset, configMapName, confi return nil } +// GetExpectedAmaLogsNodes returns the names of all nodes in the cluster. +// This is the set of nodes that ama-logs DaemonSets are expected to cover +// for ContainerLogV2 ingestion. Crucially, this set is derived from the +// Kubernetes node list directly (not from where ama-logs pods actually +// landed), so a node where the DaemonSet failed to schedule (taint, +// resource pressure, image pull failure, etc.) is still expected and will +// be reported as missing by the per-node coverage check. +func GetExpectedAmaLogsNodes(clientset *kubernetes.Clientset) ([]string, error) { + nodes, err := GetAllNodes(clientset) + if err != nil { + return nil, err + } + + names := make([]string, 0, len(nodes)) + for _, n := range nodes { + names = append(names, n.Name) + } + return names, nil +} + func GetAllAgentPods(clientset *kubernetes.Clientset) ([]corev1.Pod, error) { podList, err := clientset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{}) if err != nil { diff --git a/test/ginkgo-e2e/utils/query_logs_api_utils.go b/test/ginkgo-e2e/utils/query_logs_api_utils.go index 397977751..aabe95a76 100644 --- a/test/ginkgo-e2e/utils/query_logs_api_utils.go +++ b/test/ginkgo-e2e/utils/query_logs_api_utils.go @@ -3,6 +3,7 @@ package utils import ( "context" "fmt" + "strings" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/azidentity" @@ -149,3 +150,72 @@ func CompareResourcesInLogsAndKubeAPI(K8sClient *kubernetes.Clientset, logsClien return CompareResourcesHelper(logsClient, resourceID, query, resources) } + +// QueryContainerLogV2CountsByComputer queries the Log Analytics workspace for +// the number of ContainerLogV2 rows ingested per node (Computer) within the +// given time window (e.g. "5m"). Returns a map keyed by lowercased Computer +// name. +// +// ContainerLogV2 and ContainerLog are mutually exclusive — a cluster writes +// to one or the other based on its schema configuration. This helper only +// falls back to ContainerLog when the ContainerLogV2 query *errors* (e.g. +// the V2 table does not exist in a V1-configured workspace). A successful V2 +// query that returns zero rows is treated as a real ingestion failure and +// surfaced as an empty map; callers must NOT interpret that as a reason to +// fall back, otherwise V2 ingestion failures would be silently masked. +func QueryContainerLogV2CountsByComputer(logsClient *azquery.LogsClient, resourceID string, window string) (map[string]int64, error) { + counts, v2Err := queryCountsByComputer(logsClient, resourceID, "ContainerLogV2", window) + if v2Err == nil { + return counts, nil + } + + fallback, fbErr := queryCountsByComputer(logsClient, resourceID, "ContainerLog", window) + if fbErr != nil { + return nil, fmt.Errorf("ContainerLogV2 query failed: %v; ContainerLog fallback failed: %v", v2Err, fbErr) + } + return fallback, nil +} + +func queryCountsByComputer(logsClient *azquery.LogsClient, resourceID string, table string, window string) (map[string]int64, error) { + query := fmt.Sprintf("%s | where TimeGenerated > ago(%s) | summarize count() by Computer", table, window) + tables, err := QueryLogs(logsClient, resourceID, query) + if err != nil { + return nil, err + } + + counts := map[string]int64{} + for _, t := range tables { + for _, row := range t.Rows { + if len(row) < 2 { + continue + } + computer, ok := row[0].(string) + if !ok || computer == "" { + continue + } + count, _ := row[1].(float64) + counts[strings.ToLower(computer)] += int64(count) + } + } + return counts, nil +} + +// AssertContainerLogV2NodeCoverage returns nil if every expected node appears +// in the per-Computer count map with a positive row count (compared +// case-insensitively), or an error listing the missing nodes otherwise. +func AssertContainerLogV2NodeCoverage(expectedNodes []string, observedCountsByComputer map[string]int64) error { + if len(expectedNodes) == 0 { + return fmt.Errorf("no expected nodes provided; cannot verify ContainerLogV2 coverage") + } + + var missing []string + for _, n := range expectedNodes { + if observedCountsByComputer[strings.ToLower(n)] <= 0 { + missing = append(missing, n) + } + } + if len(missing) > 0 { + return fmt.Errorf("ContainerLogV2 ingestion is missing for %d/%d expected node(s): %s", len(missing), len(expectedNodes), strings.Join(missing, ", ")) + } + return nil +} From d0c379f737222c9beb25cd1d32a0cf2fb20b4697 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 8 May 2026 13:41:14 -0700 Subject: [PATCH 2/6] Gate per-node ContainerLogV2 coverage check behind PER_NODE_LOG_COVERAGE env var --- test/ginkgo-e2e/querylogs/querylogs_suite_test.go | 2 ++ test/ginkgo-e2e/querylogs/querylogs_test.go | 3 +++ 2 files changed, 5 insertions(+) diff --git a/test/ginkgo-e2e/querylogs/querylogs_suite_test.go b/test/ginkgo-e2e/querylogs/querylogs_suite_test.go index 735df38a6..58b45c151 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_suite_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_suite_test.go @@ -18,6 +18,7 @@ var LogsClient *azquery.LogsClient var AKSResourceId string var RetinaNetworkFlowLogsEnabled string var GenevaIntegrationEnabled string +var PerNodeLogCoverageEnabled string var Cfg *rest.Config func TestQuerylogs(t *testing.T) { @@ -34,6 +35,7 @@ var _ = BeforeSuite(func() { RetinaNetworkFlowLogsEnabled, err = utils.IsRetinaNetworkFlowLogsEnabled(K8sClient, "kube-system", "component", "ama-logs-agent", "ama-logs") Expect(err).NotTo(HaveOccurred()) GenevaIntegrationEnabled = os.Getenv("GENEVA_INTEGRATION") + PerNodeLogCoverageEnabled = os.Getenv("PER_NODE_LOG_COVERAGE") LogsClient, err = utils.SetupLogsClient() Expect(err).NotTo(HaveOccurred()) }) diff --git a/test/ginkgo-e2e/querylogs/querylogs_test.go b/test/ginkgo-e2e/querylogs/querylogs_test.go index 8d74c7e28..fb5de9a96 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_test.go @@ -43,6 +43,9 @@ var _ = Describe("When querying the logs for the table", func() { var _ = Describe("When querying ContainerLogV2 per node", func() { It("Every node hosting an ama-logs DaemonSet pod should have ContainerLogV2 rows", func() { + if PerNodeLogCoverageEnabled != "true" { + Skip("Per-node ContainerLogV2 coverage skipped because PER_NODE_LOG_COVERAGE is not set to 'true'") + } if GenevaIntegrationEnabled == "true" { Skip("ContainerLogV2 per-node coverage skipped because GENEVA_INTEGRATION is set to 'true'") } From 1c9021b4f5a538b4e5ff19df3a61358a89b725d1 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 8 May 2026 16:13:04 -0700 Subject: [PATCH 3/6] Rename QueryContainerLogV2CountsByComputer to GetComputerFromContainerLog --- test/ginkgo-e2e/querylogs/querylogs_test.go | 2 +- test/ginkgo-e2e/utils/query_logs_api_utils.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/ginkgo-e2e/querylogs/querylogs_test.go b/test/ginkgo-e2e/querylogs/querylogs_test.go index fb5de9a96..f50b91343 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_test.go @@ -53,7 +53,7 @@ var _ = Describe("When querying ContainerLogV2 per node", func() { expectedNodes, err := utils.GetExpectedAmaLogsNodes(K8sClient) Expect(err).NotTo(HaveOccurred()) - observed, err := utils.QueryContainerLogV2CountsByComputer(LogsClient, AKSResourceId, "5m") + observed, err := utils.GetComputerFromContainerLog(LogsClient, AKSResourceId, "5m") Expect(err).NotTo(HaveOccurred()) Expect(utils.AssertContainerLogV2NodeCoverage(expectedNodes, observed)).NotTo(HaveOccurred()) diff --git a/test/ginkgo-e2e/utils/query_logs_api_utils.go b/test/ginkgo-e2e/utils/query_logs_api_utils.go index aabe95a76..a9ed0a7eb 100644 --- a/test/ginkgo-e2e/utils/query_logs_api_utils.go +++ b/test/ginkgo-e2e/utils/query_logs_api_utils.go @@ -151,10 +151,10 @@ func CompareResourcesInLogsAndKubeAPI(K8sClient *kubernetes.Clientset, logsClien return CompareResourcesHelper(logsClient, resourceID, query, resources) } -// QueryContainerLogV2CountsByComputer queries the Log Analytics workspace for +// GetComputerFromContainerLog queries the Log Analytics workspace for // the number of ContainerLogV2 rows ingested per node (Computer) within the // given time window (e.g. "5m"). Returns a map keyed by lowercased Computer -// name. +// name with the row count as value. // // ContainerLogV2 and ContainerLog are mutually exclusive — a cluster writes // to one or the other based on its schema configuration. This helper only @@ -163,7 +163,7 @@ func CompareResourcesInLogsAndKubeAPI(K8sClient *kubernetes.Clientset, logsClien // query that returns zero rows is treated as a real ingestion failure and // surfaced as an empty map; callers must NOT interpret that as a reason to // fall back, otherwise V2 ingestion failures would be silently masked. -func QueryContainerLogV2CountsByComputer(logsClient *azquery.LogsClient, resourceID string, window string) (map[string]int64, error) { +func GetComputerFromContainerLog(logsClient *azquery.LogsClient, resourceID string, window string) (map[string]int64, error) { counts, v2Err := queryCountsByComputer(logsClient, resourceID, "ContainerLogV2", window) if v2Err == nil { return counts, nil From 06c77605dc42b26b772eafeeaf916e64f8972227 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 8 May 2026 16:13:59 -0700 Subject: [PATCH 4/6] Rename AssertContainerLogV2NodeCoverage to AssertContainerLogNodeCoverage --- test/ginkgo-e2e/querylogs/querylogs_test.go | 2 +- test/ginkgo-e2e/utils/query_logs_api_utils.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/ginkgo-e2e/querylogs/querylogs_test.go b/test/ginkgo-e2e/querylogs/querylogs_test.go index f50b91343..0740d2cea 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_test.go @@ -56,7 +56,7 @@ var _ = Describe("When querying ContainerLogV2 per node", func() { observed, err := utils.GetComputerFromContainerLog(LogsClient, AKSResourceId, "5m") Expect(err).NotTo(HaveOccurred()) - Expect(utils.AssertContainerLogV2NodeCoverage(expectedNodes, observed)).NotTo(HaveOccurred()) + Expect(utils.AssertContainerLogNodeCoverage(expectedNodes, observed)).NotTo(HaveOccurred()) }) }) diff --git a/test/ginkgo-e2e/utils/query_logs_api_utils.go b/test/ginkgo-e2e/utils/query_logs_api_utils.go index a9ed0a7eb..2f32aa64a 100644 --- a/test/ginkgo-e2e/utils/query_logs_api_utils.go +++ b/test/ginkgo-e2e/utils/query_logs_api_utils.go @@ -200,10 +200,10 @@ func queryCountsByComputer(logsClient *azquery.LogsClient, resourceID string, ta return counts, nil } -// AssertContainerLogV2NodeCoverage returns nil if every expected node appears +// AssertContainerLogNodeCoverage returns nil if every expected node appears // in the per-Computer count map with a positive row count (compared // case-insensitively), or an error listing the missing nodes otherwise. -func AssertContainerLogV2NodeCoverage(expectedNodes []string, observedCountsByComputer map[string]int64) error { +func AssertContainerLogNodeCoverage(expectedNodes []string, observedCountsByComputer map[string]int64) error { if len(expectedNodes) == 0 { return fmt.Errorf("no expected nodes provided; cannot verify ContainerLogV2 coverage") } From ab915ef641612fd1a2496676603bdce4aac49169 Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 8 May 2026 16:20:20 -0700 Subject: [PATCH 5/6] refactor --- test/ginkgo-e2e/querylogs/querylogs_test.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/ginkgo-e2e/querylogs/querylogs_test.go b/test/ginkgo-e2e/querylogs/querylogs_test.go index 0740d2cea..a21555923 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_test.go @@ -41,13 +41,13 @@ var _ = Describe("When querying the logs for the table", func() { ) }) -var _ = Describe("When querying ContainerLogV2 per node", func() { - It("Every node hosting an ama-logs DaemonSet pod should have ContainerLogV2 rows", func() { +var _ = Describe("When querying Container logs per node", func() { + It("Every node hosting an ama-logs DaemonSet pod should have Container logs", func() { if PerNodeLogCoverageEnabled != "true" { - Skip("Per-node ContainerLogV2 coverage skipped because PER_NODE_LOG_COVERAGE is not set to 'true'") + Skip("Per-node Container log coverage skipped because PER_NODE_LOG_COVERAGE is not set to 'true'") } if GenevaIntegrationEnabled == "true" { - Skip("ContainerLogV2 per-node coverage skipped because GENEVA_INTEGRATION is set to 'true'") + Skip("Container log per-node coverage skipped because GENEVA_INTEGRATION is set to 'true'") } expectedNodes, err := utils.GetExpectedAmaLogsNodes(K8sClient) From 05b8d4d46c774e797d08340c59005e788dde4e5c Mon Sep 17 00:00:00 2001 From: zanejohnson-azure Date: Fri, 8 May 2026 16:22:26 -0700 Subject: [PATCH 6/6] cleanup --- test/ginkgo-e2e/utils/query_logs_api_utils.go | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/test/ginkgo-e2e/utils/query_logs_api_utils.go b/test/ginkgo-e2e/utils/query_logs_api_utils.go index 2f32aa64a..74b39d9e6 100644 --- a/test/ginkgo-e2e/utils/query_logs_api_utils.go +++ b/test/ginkgo-e2e/utils/query_logs_api_utils.go @@ -151,18 +151,7 @@ func CompareResourcesInLogsAndKubeAPI(K8sClient *kubernetes.Clientset, logsClien return CompareResourcesHelper(logsClient, resourceID, query, resources) } -// GetComputerFromContainerLog queries the Log Analytics workspace for -// the number of ContainerLogV2 rows ingested per node (Computer) within the -// given time window (e.g. "5m"). Returns a map keyed by lowercased Computer -// name with the row count as value. -// -// ContainerLogV2 and ContainerLog are mutually exclusive — a cluster writes -// to one or the other based on its schema configuration. This helper only -// falls back to ContainerLog when the ContainerLogV2 query *errors* (e.g. -// the V2 table does not exist in a V1-configured workspace). A successful V2 -// query that returns zero rows is treated as a real ingestion failure and -// surfaced as an empty map; callers must NOT interpret that as a reason to -// fall back, otherwise V2 ingestion failures would be silently masked. + func GetComputerFromContainerLog(logsClient *azquery.LogsClient, resourceID string, window string) (map[string]int64, error) { counts, v2Err := queryCountsByComputer(logsClient, resourceID, "ContainerLogV2", window) if v2Err == nil {