diff --git a/pkg/tracejob/job.go b/pkg/tracejob/job.go index 97f2a4c3..137a75c6 100644 --- a/pkg/tracejob/job.go +++ b/pkg/tracejob/job.go @@ -367,9 +367,9 @@ func (nj *TraceJob) Job() *batchv1.Job { RequiredDuringSchedulingIgnoredDuringExecution: &apiv1.NodeSelector{ NodeSelectorTerms: []apiv1.NodeSelectorTerm{ apiv1.NodeSelectorTerm{ - MatchExpressions: []apiv1.NodeSelectorRequirement{ + MatchFields: []apiv1.NodeSelectorRequirement{ apiv1.NodeSelectorRequirement{ - Key: "kubernetes.io/hostname", + Key: "metadata.name", Operator: apiv1.NodeSelectorOpIn, Values: []string{nj.Target.Node}, }, @@ -611,23 +611,33 @@ func jobHostname(j batchv1.Job) (string, error) { return "", fmt.Errorf("node selector terms are empty in node affinity for job") } - me := nst[0].MatchExpressions - - if len(me) == 0 { - return "", fmt.Errorf("node selector terms match expressions are empty in node affinity for job") + // Check MatchFields first (new approach) + mf := nst[0].MatchFields + if len(mf) > 0 { + for _, v := range mf { + if v.Key == "metadata.name" { + if len(v.Values) == 0 { + return "", fmt.Errorf("node name affinity found but no values in it for job") + } + return v.Values[0], nil + } + } } - for _, v := range me { - if v.Key == "kubernetes.io/hostname" { - if len(v.Values) == 0 { - return "", fmt.Errorf("hostname affinity found but no values in it for job") + // Fallback to MatchExpressions for backward compatibility + me := nst[0].MatchExpressions + if len(me) > 0 { + for _, v := range me { + if v.Key == "kubernetes.io/hostname" { + if len(v.Values) == 0 { + return "", fmt.Errorf("hostname affinity found but no values in it for job") + } + return v.Values[0], nil } - - return v.Values[0], nil } } - return "", fmt.Errorf("hostname not found for job") + return "", fmt.Errorf("node name not found for job") } // TraceJobStatus is a label for the running status of a trace job at the current time. diff --git a/pkg/tracejob/job_test.go b/pkg/tracejob/job_test.go index 38218dd7..0b303469 100644 --- a/pkg/tracejob/job_test.go +++ b/pkg/tracejob/job_test.go @@ -7,6 +7,8 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" + batchv1 "k8s.io/api/batch/v1" + apiv1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes/fake" ) @@ -69,3 +71,98 @@ func (j *jobSuite) TestCreateJobWithGoogleAppSecret() { assert.Len(j.T(), joblist.Items[0].Spec.Template.Spec.Containers[0].Env, 1) assert.Equal(j.T(), joblist.Items[0].Spec.Template.Spec.Containers[0].Env[0].Name, "GOOGLE_APPLICATION_CREDENTIALS") } + +func (j *jobSuite) TestJobNodeSelectorUsesNodeName() { + testJobName := "test-node-selector" + testNodeName := "ip-10-0-1-123.ec2.internal" + tj := TraceJob{ + Name: testJobName, + Target: TraceJobTarget{ + Node: testNodeName, + }, + } + + job := tj.Job() + + // Verify that the job uses MatchFields with metadata.name instead of kubernetes.io/hostname + assert.NotNil(j.T(), job.Spec.Template.Spec.Affinity) + assert.NotNil(j.T(), job.Spec.Template.Spec.Affinity.NodeAffinity) + assert.NotNil(j.T(), job.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution) + + nodeSelectorTerms := job.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms + assert.Len(j.T(), nodeSelectorTerms, 1) + + matchFields := nodeSelectorTerms[0].MatchFields + assert.Len(j.T(), matchFields, 1) + + assert.Equal(j.T(), "metadata.name", matchFields[0].Key) + assert.Equal(j.T(), "In", string(matchFields[0].Operator)) + assert.Len(j.T(), matchFields[0].Values, 1) + assert.Equal(j.T(), testNodeName, matchFields[0].Values[0]) +} + +func (j *jobSuite) TestJobHostnameExtraction() { + testNodeName := "ip-10-0-1-123.ec2.internal" + + // Test with new MatchFields approach + jobWithMatchFields := &batchv1.Job{ + Spec: batchv1.JobSpec{ + Template: apiv1.PodTemplateSpec{ + Spec: apiv1.PodSpec{ + Affinity: &apiv1.Affinity{ + NodeAffinity: &apiv1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &apiv1.NodeSelector{ + NodeSelectorTerms: []apiv1.NodeSelectorTerm{ + apiv1.NodeSelectorTerm{ + MatchFields: []apiv1.NodeSelectorRequirement{ + apiv1.NodeSelectorRequirement{ + Key: "metadata.name", + Operator: apiv1.NodeSelectorOpIn, + Values: []string{testNodeName}, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + nodeName, err := jobHostname(*jobWithMatchFields) + assert.Nil(j.T(), err) + assert.Equal(j.T(), testNodeName, nodeName) + + // Test backward compatibility with old MatchExpressions approach + jobWithMatchExpressions := &batchv1.Job{ + Spec: batchv1.JobSpec{ + Template: apiv1.PodTemplateSpec{ + Spec: apiv1.PodSpec{ + Affinity: &apiv1.Affinity{ + NodeAffinity: &apiv1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &apiv1.NodeSelector{ + NodeSelectorTerms: []apiv1.NodeSelectorTerm{ + apiv1.NodeSelectorTerm{ + MatchExpressions: []apiv1.NodeSelectorRequirement{ + apiv1.NodeSelectorRequirement{ + Key: "kubernetes.io/hostname", + Operator: apiv1.NodeSelectorOpIn, + Values: []string{"ip-10-0-1-123"}, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + hostname, err := jobHostname(*jobWithMatchExpressions) + assert.Nil(j.T(), err) + assert.Equal(j.T(), "ip-10-0-1-123", hostname) +} diff --git a/pkg/tracejob/selected_target.go b/pkg/tracejob/selected_target.go index 81af8fb1..75b8086b 100644 --- a/pkg/tracejob/selected_target.go +++ b/pkg/tracejob/selected_target.go @@ -114,12 +114,7 @@ func ResolveTraceJobTarget(clientset kubernetes.Interface, resource, container, return nil, errors.NewErrorInvalid(fmt.Sprintf("Failed to locate a node for %s %v", resourceID, err)) } - labels := node.GetLabels() - val, ok := labels["kubernetes.io/hostname"] - if !ok { - return nil, errors.NewErrorInvalid("label kubernetes.io/hostname not found in node") - } - target.Node = val + target.Node = node.Name case "pod": podClient := clientset.CoreV1().Pods(targetNamespace) diff --git a/pkg/tracejob/selected_target_test.go b/pkg/tracejob/selected_target_test.go new file mode 100644 index 00000000..bc297ac6 --- /dev/null +++ b/pkg/tracejob/selected_target_test.go @@ -0,0 +1,198 @@ +package tracejob + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/suite" + + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +type selectedTargetSuite struct { + suite.Suite + clientset *fake.Clientset +} + +func TestSelectedTargetSuite(t *testing.T) { + suite.Run(t, &selectedTargetSuite{}) +} + +func (s *selectedTargetSuite) SetupTest() { + s.clientset = fake.NewSimpleClientset() +} + +func (s *selectedTargetSuite) TestResolveNodeTargetUsesNodeName() { + // Create a test node with fully qualified name and different hostname label + testNodeName := "ip-10-0-1-123.ec2.internal" + testHostnameLabel := "ip-10-0-1-123" + + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNodeName, + Labels: map[string]string{ + "kubernetes.io/hostname": testHostnameLabel, + }, + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + v1.ResourcePods: resource.MustParse("110"), + }, + }, + } + + _, err := s.clientset.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{}) + assert.Nil(s.T(), err) + + // Test resolving the node target + target, err := ResolveTraceJobTarget(s.clientset, testNodeName, "", "") + + assert.Nil(s.T(), err) + assert.NotNil(s.T(), target) + // The target should use the actual node name, not the hostname label + assert.Equal(s.T(), testNodeName, target.Node) +} + +func (s *selectedTargetSuite) TestResolvePodTargetUsesNodeName() { + // Create a test node + testNodeName := "ip-10-0-1-123.ec2.internal" + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNodeName, + Labels: map[string]string{ + "kubernetes.io/hostname": "ip-10-0-1-123", + }, + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + v1.ResourcePods: resource.MustParse("110"), + }, + }, + } + + _, err := s.clientset.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{}) + assert.Nil(s.T(), err) + + // Create a test pod scheduled on the node + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "test-pod-uid", + }, + Spec: v1.PodSpec{ + NodeName: testNodeName, + Containers: []v1.Container{ + { + Name: "test-container", + }, + }, + }, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "test-container", + ContainerID: "docker://abc123", + }, + }, + }, + } + + _, err = s.clientset.CoreV1().Pods("default").Create(context.TODO(), pod, metav1.CreateOptions{}) + assert.Nil(s.T(), err) + + // Test resolving the pod target + target, err := ResolveTraceJobTarget(s.clientset, "pod/test-pod", "test-container", "default") + + assert.Nil(s.T(), err) + assert.NotNil(s.T(), target) + // The target should use the actual node name from pod.Spec.NodeName + assert.Equal(s.T(), testNodeName, target.Node) + assert.Equal(s.T(), "test-pod-uid", target.PodUID) + assert.Equal(s.T(), "abc123", target.ContainerID) +} + +func (s *selectedTargetSuite) TestResolveDeploymentTargetUsesNodeName() { + // Create a test node + testNodeName := "ip-10-0-1-123.ec2.internal" + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNodeName, + Labels: map[string]string{ + "kubernetes.io/hostname": "ip-10-0-1-123", + }, + }, + Status: v1.NodeStatus{ + Allocatable: v1.ResourceList{ + v1.ResourcePods: resource.MustParse("110"), + }, + }, + } + + _, err := s.clientset.CoreV1().Nodes().Create(context.TODO(), node, metav1.CreateOptions{}) + assert.Nil(s.T(), err) + + // Create a test deployment + deployment := &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-deployment", + Namespace: "default", + }, + Spec: appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "test-app", + }, + }, + }, + } + + _, err = s.clientset.AppsV1().Deployments("default").Create(context.TODO(), deployment, metav1.CreateOptions{}) + assert.Nil(s.T(), err) + + // Create a test pod for the deployment + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-deployment-pod", + Namespace: "default", + UID: "test-pod-uid", + Labels: map[string]string{ + "app": "test-app", + }, + }, + Spec: v1.PodSpec{ + NodeName: testNodeName, + Containers: []v1.Container{ + { + Name: "test-container", + }, + }, + }, + Status: v1.PodStatus{ + ContainerStatuses: []v1.ContainerStatus{ + { + Name: "test-container", + ContainerID: "docker://def456", + }, + }, + }, + } + + _, err = s.clientset.CoreV1().Pods("default").Create(context.TODO(), pod, metav1.CreateOptions{}) + assert.Nil(s.T(), err) + + // Test resolving the deployment target + target, err := ResolveTraceJobTarget(s.clientset, "deployment/test-deployment", "test-container", "default") + + assert.Nil(s.T(), err) + assert.NotNil(s.T(), target) + // The target should use the actual node name from pod.Spec.NodeName + assert.Equal(s.T(), testNodeName, target.Node) + assert.Equal(s.T(), "test-pod-uid", target.PodUID) + assert.Equal(s.T(), "def456", target.ContainerID) +}