Skip to content

Commit 3c7bd78

Browse files
committed
fix: update logging levels from Debug to Info and Warn for better visibility
1 parent b82f5fd commit 3c7bd78

9 files changed

Lines changed: 52 additions & 34 deletions

File tree

kubewatch/pkg/informer/cluster/advisor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ type ClientAdvisor interface {
3434

3535
func (impl *InformerImpl) GetClient(clientType bean.ClientType, clusterInfo *repository.Cluster) (ClientAdvisor, error) {
3636
if !impl.IsMultiClusterMode(clientType) && !clusterInfo.IsDefault() {
37-
impl.logger.Debugw("informer is not supported for cluster, skipping...", "clientType", clientType, "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
37+
impl.logger.Warnw("informer is not supported for cluster, skipping...", "clientType", clientType, "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
3838
return NewUnimplementedAdvisor(), nil
3939
}
4040
return impl.GetClientAdvisor(clientType)

kubewatch/pkg/informer/cluster/argoCD/informer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,12 @@ func NewInformerImpl(logger *zap.SugaredLogger,
5757

5858
func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluster) error {
5959
if !impl.appConfig.GetAcdConfig().ACDInformer || impl.appConfig.GetExternalConfig().External {
60-
impl.logger.Debugw("argo cd informer is not enabled for cluster, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
60+
impl.logger.Warnw("argo cd informer is not enabled for cluster, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
6161
return nil
6262
}
6363
startTime := time.Now()
6464
defer func() {
65-
impl.logger.Debugw("time taken to start argo cd informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
65+
impl.logger.Infow("time taken to start argo cd informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
6666
}()
6767
stopper, ok := impl.argoCdInformerStopper[clusterInfo.Id]
6868
if ok && stopper.HasInformer() {

kubewatch/pkg/informer/cluster/argoWf/cd/informer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ func NewInformerImpl(logger *zap.SugaredLogger,
5555

5656
func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluster) error {
5757
if !impl.appConfig.GetCdConfig().CdInformer {
58-
impl.logger.Debugw("cd argo workflow informer is not enabled, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
58+
impl.logger.Warnw("cd argo workflow informer is not enabled, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
5959
return nil
6060
}
6161
startTime := time.Now()
6262
defer func() {
63-
impl.logger.Debugw("time taken to start cd argo workflow informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
63+
impl.logger.Infow("time taken to start cd argo workflow informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
6464
}()
6565
restConfig := impl.k8sUtil.GetK8sConfigForCluster(clusterInfo)
6666
cdWfInformer := impl.informerClient.GetSharedInformerClient(resourceBean.CdWorkflowResourceType)

kubewatch/pkg/informer/cluster/argoWf/ci/informer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,12 @@ func NewInformerImpl(logger *zap.SugaredLogger,
5555

5656
func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluster) error {
5757
if !impl.appConfig.GetCiConfig().CiInformer {
58-
impl.logger.Debugw("ci argo workflow informer is not enabled, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
58+
impl.logger.Warnw("ci argo workflow informer is not enabled, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
5959
return nil
6060
}
6161
startTime := time.Now()
6262
defer func() {
63-
impl.logger.Debugw("time taken to start ci argo workflow informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
63+
impl.logger.Infow("time taken to start ci argo workflow informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
6464
}()
6565
restConfig := impl.k8sUtil.GetK8sConfigForCluster(clusterInfo)
6666
ciWfInformer := impl.informerClient.GetSharedInformerClient(resourceBean.CiWorkflowResourceType)

kubewatch/pkg/informer/cluster/helper.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ func (impl *InformerImpl) stopInformersForCluster(clusterId int) error {
9797
func (impl *InformerImpl) startInformerForCluster(clusterInfo *repository.Cluster) error {
9898
startTime := time.Now()
9999
defer func() {
100-
impl.logger.Debugw("time taken to start informers for cluster", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
100+
impl.logger.Infow("time taken to start informers for cluster", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
101101
}()
102102
if len(clusterInfo.ErrorInConnecting) > 0 {
103103
impl.logger.Debugw("cluster is not reachable", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName)
@@ -205,7 +205,7 @@ func (impl *InformerImpl) syncMultiClusterInformer(clusterId int) error {
205205
func (impl *InformerImpl) reloadInformerForCluster(clusterId int) error {
206206
startTime := time.Now()
207207
defer func() {
208-
impl.logger.Debugw("time taken to reload informer for cluster", "clusterId", clusterId, "time", time.Since(startTime))
208+
impl.logger.Infow("time taken to reload informer for cluster", "clusterId", clusterId, "time", time.Since(startTime))
209209
}()
210210

211211
clusterInfo, err := impl.clusterRepository.FindById(clusterId)

kubewatch/pkg/informer/cluster/informer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ func NewInformerImpl(logger *zap.SugaredLogger,
8484
func (impl *InformerImpl) StartDevtronClusterWatcher() error {
8585
startTime := time.Now()
8686
defer func() {
87-
impl.logger.Debugw("time taken to start default cluster informer", "time", time.Since(startTime))
87+
impl.logger.Infow("time taken to start default cluster informer", "time", time.Since(startTime))
8888
}()
8989
clusterInfo, err := impl.clusterRepository.FindByName(commonBean.DEFAULT_CLUSTER)
9090
if err != nil && !errors.Is(err, pg.ErrNoRows) {

kubewatch/pkg/informer/cluster/systemExec/helper.go

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) (
181181
ctrs = append(pod.Status.InitContainerStatuses, ctrs...)
182182

183183
for _, ctr := range ctrs {
184+
// In our case, we are not using init/wait containers. So ideally, there should be only main container.
184185

185186
// Virtual Kubelet environment will not set the terminate on waiting container
186187
// https://github.com/argoproj/argo-workflows/issues/3879
@@ -203,10 +204,10 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) (
203204
if eventType == bean.DeleteEvent {
204205
// we should mark this case as an error
205206
if ctr.Name == common.MainContainerName {
206-
return v1alpha1.NodeFailed, getFailedReasonFromPodConditions(pod.Status.Conditions)
207+
return v1alpha1.NodeFailed, impl.getFailedReasonFromPodConditions(pod.Status.Conditions)
207208
}
208209
}
209-
impl.logger.Warnf("Pod %s phase was Failed but %s did not have terminated state", pod.Name, ctr.Name)
210+
impl.logger.Warnw("Pod phase was Failed but container did not have terminated state", "podName", pod.Name, "containerName", ctr.Name, "status", pod.Status)
210211
continue
211212
}
212213

@@ -248,15 +249,34 @@ func (impl *InformerImpl) inferFailedReason(eventType string, pod *coreV1.Pod) (
248249
// it was marking it successful, doing this as it will be skipped at upper level, and delete event will handle it.
249250
// ticket - you can find debug logs/details here - https://github.com/devtron-labs/sprint-tasks/issues/2092
250251
impl.logger.Infow("Pod phase was Failed but no container had terminated state, marking it as failed now", "podName", pod.Name, "status", pod.Status)
252+
253+
// Here we're intentionally returning empty string as message,
254+
// as we don't want this event to be intercepted as actual update event.
255+
// This case will be handled by the delete event.
251256
return v1alpha1.NodeFailed, ""
252257
}
253258

254-
func getFailedReasonFromPodConditions(conditions []coreV1.PodCondition) string {
259+
func (impl *InformerImpl) getFailedReasonFromPodConditions(conditions []coreV1.PodCondition) string {
255260
if len(conditions) == 0 {
256-
return "failed"
261+
// This should never happen.
262+
impl.logger.Warnw("Pod phase was Failed but no conditions found")
263+
// If we get here, that means the pod is deleted after the container is started.
264+
// The Only possible reason is NodeNoLongerExists.
265+
return bean.NodeNoLongerExists
257266
}
258-
259-
return conditions[0].Message
267+
// Sort the conditions by last transition time, newest first.
268+
// This is to ensure that we get the latest reason for the failure.
269+
sort.Slice(conditions, func(i, j int) bool {
270+
return conditions[i].LastTransitionTime.Time.After(conditions[j].LastTransitionTime.Time)
271+
})
272+
// If the latest condition has a message, use that.
273+
if conditions[0].Message != "" {
274+
return conditions[0].Message
275+
}
276+
// If we get here, that means the pod is deleted after the container is started.
277+
// The Only possible reason is NodeNoLongerExists.
278+
impl.logger.Warnw("Pod phase was Failed but the lastest condition has no message", "conditions", conditions)
279+
return bean.NodeNoLongerExists
260280
}
261281

262282
// foundAnyUpdateInPodStatus return true if any of the pod status fields have changed or if the pod is new

kubewatch/pkg/informer/cluster/systemExec/informer.go

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import (
3333
coreV1 "k8s.io/api/core/v1"
3434
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3535
kubeinformers "k8s.io/client-go/informers"
36-
"log"
3736
"time"
3837
)
3938

@@ -63,12 +62,12 @@ func NewInformerImpl(logger *zap.SugaredLogger,
6362

6463
func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluster) error {
6564
if impl.appConfig.GetExternalConfig().External {
66-
impl.logger.Debugw("argo workflow setup is not done for cluster, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
65+
impl.logger.Warnw("argo workflow setup is not done for cluster, skipping...", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName, "appConfig", impl.appConfig)
6766
return nil
6867
}
6968
startTime := time.Now()
7069
defer func() {
71-
impl.logger.Debugw("time taken to start system executor informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
70+
impl.logger.Infow("time taken to start system executor informer", "clusterId", clusterInfo.Id, "time", time.Since(startTime))
7271
}()
7372
impl.logger.Infow("starting system executor informer for cluster", "clusterId", clusterInfo.Id, "clusterName", clusterInfo.ClusterName)
7473
restConfig := impl.k8sUtil.GetK8sConfigForCluster(clusterInfo)
@@ -87,7 +86,7 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
8786
if oldPodObj != nil {
8887
logArgs = append(logArgs, "oldPodStatusObj", oldPodObj.Status)
8988
}
90-
impl.logger.Debugw("no significant pod updates are detected so skipping the pod update event", logArgs...)
89+
impl.logger.Warnw("no significant pod updates are detected so skipping the pod update event", logArgs...)
9190
return
9291
}
9392

@@ -101,17 +100,15 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
101100
nodeStatus := impl.assessNodeStatus(bean.UpdateEvent, newPodObj)
102101
workflowStatus := getWorkflowStatus(newPodObj, nodeStatus, workflowType)
103102
if workflowStatus.Message == "" && workflowStatus.Phase == v1alpha1.WorkflowFailed {
104-
impl.logger.Debugw("skipping the failed workflow update event as message is empty", "workflow", workflowStatus)
103+
impl.logger.Warnw("skipping the failed workflow update event as message is empty", "workflow", workflowStatus, "podObjStatus", newPodObj.Status)
105104
return
106105
}
107106
if val, ok := podLabels[informerBean.DevtronOwnerInstanceLabelKey]; ok {
108107
workflowStatus.DevtronOwnerInstance = val
109108
} else {
110-
impl.logger.Warnw("devtron administrator instance label is not found in the pod. not a devtron workflow", "podLabels", podLabels)
111109
middleware.IncNonAdministrativeEvents(clusterLabels, middleware.RESOURCE_K8S_JOB)
112-
// return statement is skipped intentionally for backward compatibility
113-
// TODO Asutosh: remove this return statement in future
114-
// return
110+
impl.logger.Warnw("devtron administrator instance label is not found in the pod. not a devtron workflow", "podLabels", podLabels)
111+
return
115112
}
116113
wfJson, err := json.Marshal(workflowStatus)
117114
if err != nil {
@@ -120,7 +117,8 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
120117
}
121118
impl.logger.Debugw("sending system executor workflow update event", "workflow", string(wfJson))
122119
if impl.pubSubClient == nil {
123-
log.Println("don't publish")
120+
// This should never happen.
121+
impl.logger.Errorw("pubsub client is nil, skipping the publish")
124122
return
125123
}
126124
topic, err := argoWf.GetNatsTopicForWorkflow(workflowType)
@@ -133,7 +131,7 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
133131
impl.logger.Errorw("error while publishing request", "topic", topic, "wfJson", wfJson, "err", err)
134132
return
135133
}
136-
impl.logger.Debugw("system executor workflow update sent", "topic", topic, "workflowType", workflowType)
134+
impl.logger.Infow("system executor workflow update sent", "topic", topic, "workflowType", workflowType)
137135
}
138136
}
139137
// deleteFunc is called when an existing pod is deleted
@@ -148,6 +146,7 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
148146
nodeStatus, reTriggerRequired := impl.checkIfPodDeletedAndUpdateMessage(podObj.Name, podObj.Namespace, nodeStatus, restConfig)
149147
if !reTriggerRequired {
150148
// not sending this deleted event if it's not a re-trigger case
149+
impl.logger.Warnw("not sending delete event as it's not a re-trigger case", "podName", podObj.Name, "podObjStatus", podObj.Status, "deletionTimestamp", podObj.DeletionTimestamp)
151150
return
152151
}
153152
workflowStatus := getWorkflowStatus(podObj, nodeStatus, workflowType)
@@ -156,9 +155,7 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
156155
} else {
157156
impl.logger.Warnw("devtron administrator instance label is not found in the pod. not a devtron workflow", "podLabels", podLabels)
158157
middleware.IncNonAdministrativeEvents(clusterLabels, middleware.RESOURCE_K8S_JOB)
159-
// return statement is skipped intentionally for backward compatibility
160-
// TODO Asutosh: remove this return statement in future
161-
// return
158+
return
162159
}
163160
wfJson, err := json.Marshal(workflowStatus)
164161
if err != nil {
@@ -167,7 +164,8 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
167164
}
168165
impl.logger.Debugw("sending system executor cd workflow delete event", "workflow", string(wfJson))
169166
if impl.pubSubClient == nil {
170-
log.Println("don't publish")
167+
// This should never happen.
168+
impl.logger.Errorw("pubsub client is nil, skipping the publish")
171169
return
172170
}
173171
topic, err := argoWf.GetNatsTopicForWorkflow(workflowType)
@@ -180,7 +178,7 @@ func (impl *InformerImpl) StartInformerForCluster(clusterInfo *repository.Cluste
180178
impl.logger.Errorw("error while publishing request", "topic", topic, "wfJson", wfJson, "err", err)
181179
return
182180
}
183-
impl.logger.Debugw("workflow update sent", "topic", topic, "workflowType", workflowType)
181+
impl.logger.Infow("workflow update sent", "topic", topic, "workflowType", workflowType)
184182
}
185183
podInformerFactory := impl.informerClient.GetPodInformerFactory()
186184
eventHandler := resourceBean.NewEventHandlers[coreV1.Pod]().

kubewatch/pkg/informer/informer.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func NewRunnerImpl(logger *zap.SugaredLogger,
5151
func (impl *RunnerImpl) Start() error {
5252
startTime := time.Now()
5353
defer func() {
54-
impl.logger.Debugw("time taken to start informer", "time", time.Since(startTime))
54+
impl.logger.Infow("time taken to start informer", "time", time.Since(startTime))
5555
}()
5656
if impl.appConfig.IsDBAvailable() {
5757
err := impl.clusterInformer.StartDevtronClusterWatcher()
@@ -76,7 +76,7 @@ func (impl *RunnerImpl) Start() error {
7676
func (impl *RunnerImpl) Stop() {
7777
startTime := time.Now()
7878
defer func() {
79-
impl.logger.Debugw("time taken to start default cluster informer", "time", time.Since(startTime))
79+
impl.logger.Infow("time taken to start default cluster informer", "time", time.Since(startTime))
8080
}()
8181
impl.clusterInformer.StopAll()
8282
}

0 commit comments

Comments
 (0)