diff --git a/test/ginkgo-e2e/querylogs/querylogs_suite_test.go b/test/ginkgo-e2e/querylogs/querylogs_suite_test.go index 735df38a6..58b45c151 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_suite_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_suite_test.go @@ -18,6 +18,7 @@ var LogsClient *azquery.LogsClient var AKSResourceId string var RetinaNetworkFlowLogsEnabled string var GenevaIntegrationEnabled string +var PerNodeLogCoverageEnabled string var Cfg *rest.Config func TestQuerylogs(t *testing.T) { @@ -34,6 +35,7 @@ var _ = BeforeSuite(func() { RetinaNetworkFlowLogsEnabled, err = utils.IsRetinaNetworkFlowLogsEnabled(K8sClient, "kube-system", "component", "ama-logs-agent", "ama-logs") Expect(err).NotTo(HaveOccurred()) GenevaIntegrationEnabled = os.Getenv("GENEVA_INTEGRATION") + PerNodeLogCoverageEnabled = os.Getenv("PER_NODE_LOG_COVERAGE") LogsClient, err = utils.SetupLogsClient() Expect(err).NotTo(HaveOccurred()) }) diff --git a/test/ginkgo-e2e/querylogs/querylogs_test.go b/test/ginkgo-e2e/querylogs/querylogs_test.go index c0e46f393..a21555923 100644 --- a/test/ginkgo-e2e/querylogs/querylogs_test.go +++ b/test/ginkgo-e2e/querylogs/querylogs_test.go @@ -41,6 +41,25 @@ var _ = Describe("When querying the logs for the table", func() { ) }) +var _ = Describe("When querying Container logs per node", func() { + It("Every node hosting an ama-logs DaemonSet pod should have Container logs", func() { + if PerNodeLogCoverageEnabled != "true" { + Skip("Per-node Container log coverage skipped because PER_NODE_LOG_COVERAGE is not set to 'true'") + } + if GenevaIntegrationEnabled == "true" { + Skip("Container log per-node coverage skipped because GENEVA_INTEGRATION is set to 'true'") + } + + expectedNodes, err := utils.GetExpectedAmaLogsNodes(K8sClient) + Expect(err).NotTo(HaveOccurred()) + + observed, err := utils.GetComputerFromContainerLog(LogsClient, AKSResourceId, "5m") + Expect(err).NotTo(HaveOccurred()) + + Expect(utils.AssertContainerLogNodeCoverage(expectedNodes, observed)).NotTo(HaveOccurred()) + }) +}) + var _ = Describe("When querying the logs for the ContainerInventory", func() { DescribeTable("Column should have zero empty values", func(column string) { diff --git a/test/ginkgo-e2e/utils/kubernetes_api_utils.go b/test/ginkgo-e2e/utils/kubernetes_api_utils.go index ae795e989..69206b445 100644 --- a/test/ginkgo-e2e/utils/kubernetes_api_utils.go +++ b/test/ginkgo-e2e/utils/kubernetes_api_utils.go @@ -522,6 +522,26 @@ func GetAndUpdateConfigMap(clientset *kubernetes.Clientset, configMapName, confi return nil } +// GetExpectedAmaLogsNodes returns the names of all nodes in the cluster. +// This is the set of nodes that ama-logs DaemonSets are expected to cover +// for ContainerLogV2 ingestion. Crucially, this set is derived from the +// Kubernetes node list directly (not from where ama-logs pods actually +// landed), so a node where the DaemonSet failed to schedule (taint, +// resource pressure, image pull failure, etc.) is still expected and will +// be reported as missing by the per-node coverage check. +func GetExpectedAmaLogsNodes(clientset *kubernetes.Clientset) ([]string, error) { + nodes, err := GetAllNodes(clientset) + if err != nil { + return nil, err + } + + names := make([]string, 0, len(nodes)) + for _, n := range nodes { + names = append(names, n.Name) + } + return names, nil +} + func GetAllAgentPods(clientset *kubernetes.Clientset) ([]corev1.Pod, error) { podList, err := clientset.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{}) if err != nil { diff --git a/test/ginkgo-e2e/utils/query_logs_api_utils.go b/test/ginkgo-e2e/utils/query_logs_api_utils.go index 397977751..74b39d9e6 100644 --- a/test/ginkgo-e2e/utils/query_logs_api_utils.go +++ b/test/ginkgo-e2e/utils/query_logs_api_utils.go @@ -3,6 +3,7 @@ package utils import ( "context" "fmt" + "strings" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/azidentity" @@ -149,3 +150,61 @@ func CompareResourcesInLogsAndKubeAPI(K8sClient *kubernetes.Clientset, logsClien return CompareResourcesHelper(logsClient, resourceID, query, resources) } + + +func GetComputerFromContainerLog(logsClient *azquery.LogsClient, resourceID string, window string) (map[string]int64, error) { + counts, v2Err := queryCountsByComputer(logsClient, resourceID, "ContainerLogV2", window) + if v2Err == nil { + return counts, nil + } + + fallback, fbErr := queryCountsByComputer(logsClient, resourceID, "ContainerLog", window) + if fbErr != nil { + return nil, fmt.Errorf("ContainerLogV2 query failed: %v; ContainerLog fallback failed: %v", v2Err, fbErr) + } + return fallback, nil +} + +func queryCountsByComputer(logsClient *azquery.LogsClient, resourceID string, table string, window string) (map[string]int64, error) { + query := fmt.Sprintf("%s | where TimeGenerated > ago(%s) | summarize count() by Computer", table, window) + tables, err := QueryLogs(logsClient, resourceID, query) + if err != nil { + return nil, err + } + + counts := map[string]int64{} + for _, t := range tables { + for _, row := range t.Rows { + if len(row) < 2 { + continue + } + computer, ok := row[0].(string) + if !ok || computer == "" { + continue + } + count, _ := row[1].(float64) + counts[strings.ToLower(computer)] += int64(count) + } + } + return counts, nil +} + +// AssertContainerLogNodeCoverage returns nil if every expected node appears +// in the per-Computer count map with a positive row count (compared +// case-insensitively), or an error listing the missing nodes otherwise. +func AssertContainerLogNodeCoverage(expectedNodes []string, observedCountsByComputer map[string]int64) error { + if len(expectedNodes) == 0 { + return fmt.Errorf("no expected nodes provided; cannot verify ContainerLogV2 coverage") + } + + var missing []string + for _, n := range expectedNodes { + if observedCountsByComputer[strings.ToLower(n)] <= 0 { + missing = append(missing, n) + } + } + if len(missing) > 0 { + return fmt.Errorf("ContainerLogV2 ingestion is missing for %d/%d expected node(s): %s", len(missing), len(expectedNodes), strings.Join(missing, ", ")) + } + return nil +}