diff --git a/charts/ome-crd/templates/ome.io_inferenceservices.yaml b/charts/ome-crd/templates/ome.io_inferenceservices.yaml index 48caf611..f88725dd 100644 --- a/charts/ome-crd/templates/ome.io_inferenceservices.yaml +++ b/charts/ome-crd/templates/ome.io_inferenceservices.yaml @@ -36381,6 +36381,14 @@ spec: - type type: object type: array + lifecycleState: + enum: + - READY + - CREATING + - UPDATING + - DELETING + - FAILED + type: string modelStatus: properties: lastFailureInfo: diff --git a/config/crd/full/ome.io_inferenceservices.yaml b/config/crd/full/ome.io_inferenceservices.yaml index 48caf611..f88725dd 100644 --- a/config/crd/full/ome.io_inferenceservices.yaml +++ b/config/crd/full/ome.io_inferenceservices.yaml @@ -36381,6 +36381,14 @@ spec: - type type: object type: array + lifecycleState: + enum: + - READY + - CREATING + - UPDATING + - DELETING + - FAILED + type: string modelStatus: properties: lastFailureInfo: diff --git a/pkg/apis/ome/v1beta1/inference_service_status.go b/pkg/apis/ome/v1beta1/inference_service_status.go index 02ad9043..c373f670 100644 --- a/pkg/apis/ome/v1beta1/inference_service_status.go +++ b/pkg/apis/ome/v1beta1/inference_service_status.go @@ -18,6 +18,9 @@ type InferenceServiceStatus struct { // - LatestDeploymentReady (serverless mode only): aggregated configuration condition, i.e. latest deployment readiness condition;
// - Ready: aggregated condition;
duckv1.Status `json:",inline"` + // LifecycleState is a high-level summary of the InferenceService state. + // +optional + LifecycleState InferenceServiceLifecycleState `json:"lifecycleState,omitempty"` // Addressable endpoint for the InferenceService // +optional Address *duckv1.Addressable `json:"address,omitempty"` @@ -31,6 +34,18 @@ type InferenceServiceStatus struct { ModelStatus ModelStatus `json:"modelStatus,omitempty"` } +// InferenceServiceLifecycleState is a high-level lifecycle state for the InferenceService. +// +kubebuilder:validation:Enum=READY;CREATING;UPDATING;DELETING;FAILED +type InferenceServiceLifecycleState string + +const ( + InferenceServiceLifecycleStateReady InferenceServiceLifecycleState = "READY" + InferenceServiceLifecycleStateCreating InferenceServiceLifecycleState = "CREATING" + InferenceServiceLifecycleStateUpdating InferenceServiceLifecycleState = "UPDATING" + InferenceServiceLifecycleStateDeleting InferenceServiceLifecycleState = "DELETING" + InferenceServiceLifecycleStateFailed InferenceServiceLifecycleState = "FAILED" +) + // ComponentStatusSpec describes the state of the component type ComponentStatusSpec struct { // Latest revision name that is in ready state diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go index 7f60fbe1..3f897f4c 100644 --- a/pkg/controller/v1beta1/inferenceservice/controller.go +++ b/pkg/controller/v1beta1/inferenceservice/controller.go @@ -24,6 +24,7 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" knapis "knative.dev/pkg/apis" duckv1 "knative.dev/pkg/apis/duck/v1" "knative.dev/pkg/network" @@ -127,6 +128,10 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req } return reconcile.Result{}, err } + if err := r.backfillLifecycleState(ctx, isvc); err != nil { + r.Log.Error(err, "Failed to backfill InferenceService lifecycle state", "InferenceService", isvc.Name) + return reconcile.Result{}, err + } // get annotations from isvc annotations := utils.Filter(isvc.Annotations, func(key string) bool { return !utils.Includes(constants.ServiceAnnotationDisallowedList, key) @@ -557,24 +562,76 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1 return ctrl.Result{}, nil } +func (r *InferenceServiceReconciler) backfillLifecycleState(ctx context.Context, desiredService *v1beta1.InferenceService) error { + if desiredService.Status.LifecycleState != "" { + return nil + } + + namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace} + return retry.RetryOnConflict(retry.DefaultBackoff, func() error { + existingService := &v1beta1.InferenceService{} + if err := r.Get(ctx, namespacedName, existingService); err != nil { + return err + } + if existingService.Status.LifecycleState != "" { + desiredService.Status = existingService.Status + return nil + } + + serviceToUpdate := existingService.DeepCopy() + serviceToUpdate.Status.LifecycleState = status.DeriveLifecycleState(existingService, existingService.Status.LifecycleState) + if err := r.Status().Update(ctx, serviceToUpdate); err != nil { + if !apierrors.IsConflict(err) { + r.Log.Error(err, "Failed to backfill InferenceService lifecycle state", "InferenceService", desiredService.Name) + } + return err + } + desiredService.Status = serviceToUpdate.Status + return nil + }) +} + func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error { - existingService := &v1beta1.InferenceService{} + ctx := context.TODO() namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace} - if err := r.Get(context.TODO(), namespacedName, existingService); err != nil { - return err - } - wasReady := inferenceServiceReadiness(existingService.Status) - if inferenceServiceStatusEqual(existingService.Status, desiredService.Status) { - // If we didn't change anything then don't call updateStatus. - // This is important because the copy we loaded from the informer's - // cache may be stale, and we don't want to overwrite a prior update - // to status with this stale state. - } else if err := r.Status().Update(context.TODO(), desiredService); err != nil { + wasReady := false + statusUpdated := false + err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + existingService := &v1beta1.InferenceService{} + if err := r.Get(ctx, namespacedName, existingService); err != nil { + return err + } + + serviceToUpdate := existingService.DeepCopy() + serviceToUpdate.Status = desiredService.Status + serviceToUpdate.Status.LifecycleState = status.DeriveLifecycleState(serviceToUpdate, existingService.Status.LifecycleState) + + wasReady = inferenceServiceReadiness(existingService.Status) + if inferenceServiceStatusEqual(existingService.Status, serviceToUpdate.Status) { + // If we didn't change anything then don't call updateStatus. + // This is important because the copy we loaded from the informer's + // cache may be stale, and we don't want to overwrite a prior update + // to status with this stale state. + return nil + } + + if err := r.Status().Update(ctx, serviceToUpdate); err != nil { + if !apierrors.IsConflict(err) { + r.Log.Error(err, "Failed to update InferenceService status", "InferenceService", desiredService.Name) + } + return err + } + desiredService.Status = serviceToUpdate.Status + statusUpdated = true + return nil + }) + if err != nil { r.Log.Error(err, "Failed to update InferenceService status", "InferenceService", desiredService.Name) r.Recorder.Eventf(desiredService, v1.EventTypeWarning, "UpdateFailed", "Failed to update status for InferenceService %q: %v", desiredService.Name, err) return errors.Wrapf(err, "fails to update InferenceService status") - } else { + } + if statusUpdated { // If there was a difference and there was no error. isReady := inferenceServiceReadiness(desiredService.Status) if wasReady && !isReady { // Moved to NotReady State diff --git a/pkg/controller/v1beta1/inferenceservice/controller_test.go b/pkg/controller/v1beta1/inferenceservice/controller_test.go index 669fd172..ff08da9c 100644 --- a/pkg/controller/v1beta1/inferenceservice/controller_test.go +++ b/pkg/controller/v1beta1/inferenceservice/controller_test.go @@ -776,6 +776,65 @@ func TestInferenceServiceReconcile(t *testing.T) { } } +func TestBackfillLifecycleState(t *testing.T) { + g := gomega.NewGomegaWithT(t) + + scheme := runtime.NewScheme() + g.Expect(v1beta1.AddToScheme(scheme)).NotTo(gomega.HaveOccurred()) + + tests := []struct { + name string + status v1beta1.InferenceServiceStatus + expectedState v1beta1.InferenceServiceLifecycleState + }{ + { + name: "sets lifecycle state when missing", + status: v1beta1.InferenceServiceStatus{ + ModelStatus: v1beta1.ModelStatus{ + TransitionStatus: v1beta1.InProgress, + }, + }, + expectedState: v1beta1.InferenceServiceLifecycleStateCreating, + }, + { + name: "keeps existing lifecycle state", + status: v1beta1.InferenceServiceStatus{ + LifecycleState: v1beta1.InferenceServiceLifecycleStateReady, + }, + expectedState: v1beta1.InferenceServiceLifecycleStateReady, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + isvc := &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-isvc", + Namespace: "default", + }, + Status: tt.status, + } + c := ctrlclientfake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(isvc). + WithStatusSubresource(isvc). + Build() + reconciler := &InferenceServiceReconciler{ + Client: c, + Log: ctrl.Log.WithName("test"), + } + + err := reconciler.backfillLifecycleState(context.TODO(), isvc) + g.Expect(err).NotTo(gomega.HaveOccurred()) + + updated := &v1beta1.InferenceService{} + err = c.Get(context.TODO(), types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, updated) + g.Expect(err).NotTo(gomega.HaveOccurred()) + g.Expect(updated.Status.LifecycleState).To(gomega.Equal(tt.expectedState)) + }) + } +} + func TestDetermineDeploymentModes(t *testing.T) { g := gomega.NewGomegaWithT(t) diff --git a/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state.go b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state.go new file mode 100644 index 00000000..60032f2a --- /dev/null +++ b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state.go @@ -0,0 +1,83 @@ +package status + +import ( + v1 "k8s.io/api/core/v1" + "knative.dev/pkg/apis" + + "github.com/sgl-project/ome/pkg/apis/ome/v1beta1" +) + +// DeriveLifecycleState converts detailed InferenceService status into a high-level state. +func DeriveLifecycleState( + isvc *v1beta1.InferenceService, + previousState v1beta1.InferenceServiceLifecycleState, +) v1beta1.InferenceServiceLifecycleState { + if isvc == nil { + return v1beta1.InferenceServiceLifecycleStateCreating + } + + if !isvc.GetDeletionTimestamp().IsZero() { + return v1beta1.InferenceServiceLifecycleStateDeleting + } + + readyCondition := isvc.Status.GetCondition(apis.ConditionReady) + if lifecycleTransitionFailed(isvc.Status.ModelStatus.TransitionStatus) || + hasFailure(&isvc.Status) { + return v1beta1.InferenceServiceLifecycleStateFailed + } + + if isvc.Status.IsReady() { + return v1beta1.InferenceServiceLifecycleStateReady + } + + if lifecycleProgressing(&isvc.Status, readyCondition) { + if lifecyclePreviouslyEstablished(previousState) { + return v1beta1.InferenceServiceLifecycleStateUpdating + } + return v1beta1.InferenceServiceLifecycleStateCreating + } + + if readyCondition != nil && + readyCondition.Status == v1.ConditionFalse && + lifecyclePreviouslyEstablished(previousState) { + return v1beta1.InferenceServiceLifecycleStateFailed + } + + return v1beta1.InferenceServiceLifecycleStateCreating +} + +func lifecycleTransitionFailed(transitionStatus v1beta1.TransitionStatus) bool { + return transitionStatus == v1beta1.InvalidSpec || transitionStatus == v1beta1.BlockedByFailedLoad +} + +func hasFailure(status *v1beta1.InferenceServiceStatus) bool { + return status != nil && status.ModelStatus.LastFailureInfo != nil +} + +func lifecycleProgressing(status *v1beta1.InferenceServiceStatus, readyCondition *apis.Condition) bool { + if status.ModelStatus.TransitionStatus == v1beta1.InProgress { + return true + } + if readyCondition != nil && readyCondition.Status == v1.ConditionUnknown { + return true + } + for _, componentStatus := range status.Components { + if componentStatus.LatestCreatedRevision != "" && + componentStatus.LatestReadyRevision != "" && + componentStatus.LatestCreatedRevision != componentStatus.LatestReadyRevision { + return true + } + } + return false +} + +func lifecyclePreviouslyEstablished(previousState v1beta1.InferenceServiceLifecycleState) bool { + switch previousState { + case v1beta1.InferenceServiceLifecycleStateReady, + v1beta1.InferenceServiceLifecycleStateUpdating, + v1beta1.InferenceServiceLifecycleStateFailed: + return true + default: + return false + } +} diff --git a/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state_test.go b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state_test.go new file mode 100644 index 00000000..75df54f0 --- /dev/null +++ b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state_test.go @@ -0,0 +1,146 @@ +package status + +import ( + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "knative.dev/pkg/apis" + + "github.com/sgl-project/ome/pkg/apis/ome/v1beta1" +) + +func TestDeriveLifecycleState(t *testing.T) { + tests := []struct { + name string + isvc *v1beta1.InferenceService + previousState v1beta1.InferenceServiceLifecycleState + expected v1beta1.InferenceServiceLifecycleState + }{ + { + name: "nil service returns creating", + expected: v1beta1.InferenceServiceLifecycleStateCreating, + }, + { + name: "deleting service returns deleting", + isvc: deletingInferenceService(), + expected: v1beta1.InferenceServiceLifecycleStateDeleting, + }, + { + name: "ready service returns ready", + isvc: readyInferenceService(), + expected: v1beta1.InferenceServiceLifecycleStateReady, + }, + { + name: "invalid spec returns failed", + isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{ + ModelStatus: v1beta1.ModelStatus{TransitionStatus: v1beta1.InvalidSpec}, + }), + expected: v1beta1.InferenceServiceLifecycleStateFailed, + }, + { + name: "last failure info returns failed", + isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{ + ModelStatus: v1beta1.ModelStatus{ + TransitionStatus: v1beta1.InProgress, + LastFailureInfo: &v1beta1.FailureInfo{Reason: v1beta1.ModelLoadFailed}, + }, + }), + expected: v1beta1.InferenceServiceLifecycleStateFailed, + }, + { + name: "in progress initial reconcile returns creating", + isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{ + ModelStatus: v1beta1.ModelStatus{TransitionStatus: v1beta1.InProgress}, + }), + expected: v1beta1.InferenceServiceLifecycleStateCreating, + }, + { + name: "in progress established service returns updating", + isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{ + ModelStatus: v1beta1.ModelStatus{TransitionStatus: v1beta1.InProgress}, + }), + previousState: v1beta1.InferenceServiceLifecycleStateReady, + expected: v1beta1.InferenceServiceLifecycleStateUpdating, + }, + { + name: "component rollout initial reconcile returns creating", + isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{ + Components: map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec{ + v1beta1.EngineComponent: { + LatestCreatedRevision: "engine-2", + LatestReadyRevision: "engine-1", + }, + }, + }), + expected: v1beta1.InferenceServiceLifecycleStateCreating, + }, + { + name: "ready false after established state returns failed", + isvc: inferenceServiceWithStatus(func() v1beta1.InferenceServiceStatus { + status := v1beta1.InferenceServiceStatus{} + status.InitializeConditions() + status.SetCondition(v1beta1.IngressReady, &apis.Condition{ + Type: v1beta1.IngressReady, + Status: v1.ConditionFalse, + Reason: "IngressNotReady", + Message: "ingress failed", + }) + return status + }()), + previousState: v1beta1.InferenceServiceLifecycleStateReady, + expected: v1beta1.InferenceServiceLifecycleStateFailed, + }, + { + name: "ready false during initial reconcile returns creating", + isvc: inferenceServiceWithStatus(func() v1beta1.InferenceServiceStatus { + status := v1beta1.InferenceServiceStatus{} + status.InitializeConditions() + status.SetCondition(v1beta1.IngressReady, &apis.Condition{ + Type: v1beta1.IngressReady, + Status: v1.ConditionFalse, + Reason: "IngressNotReady", + Message: "ingress failed", + }) + return status + }()), + expected: v1beta1.InferenceServiceLifecycleStateCreating, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expected, DeriveLifecycleState(tt.isvc, tt.previousState)) + }) + } +} + +func inferenceServiceWithStatus(status v1beta1.InferenceServiceStatus) *v1beta1.InferenceService { + return &v1beta1.InferenceService{ + Status: status, + } +} + +func readyInferenceService() *v1beta1.InferenceService { + status := v1beta1.InferenceServiceStatus{} + status.InitializeConditions() + status.SetCondition(v1beta1.IngressReady, &apis.Condition{ + Type: v1beta1.IngressReady, + Status: v1.ConditionTrue, + }) + status.SetCondition(v1beta1.EngineReady, &apis.Condition{ + Type: v1beta1.EngineReady, + Status: v1.ConditionTrue, + }) + return inferenceServiceWithStatus(status) +} + +func deletingInferenceService() *v1beta1.InferenceService { + now := metav1.Now() + return &v1beta1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{ + DeletionTimestamp: &now, + }, + } +} diff --git a/pkg/openapi/openapi_generated.go b/pkg/openapi/openapi_generated.go index 586b07b4..72e69d6e 100644 --- a/pkg/openapi/openapi_generated.go +++ b/pkg/openapi/openapi_generated.go @@ -3893,6 +3893,13 @@ func schema_pkg_apis_ome_v1beta1_InferenceServiceStatus(ref common.ReferenceCall }, }, }, + "lifecycleState": { + SchemaProps: spec.SchemaProps{ + Description: "LifecycleState is a high-level summary of the InferenceService state.", + Type: []string{"string"}, + Format: "", + }, + }, "address": { SchemaProps: spec.SchemaProps{ Description: "Addressable endpoint for the InferenceService", diff --git a/pkg/openapi/swagger.json b/pkg/openapi/swagger.json index 3ab9cd4e..fc196b22 100644 --- a/pkg/openapi/swagger.json +++ b/pkg/openapi/swagger.json @@ -2162,6 +2162,10 @@ "x-kubernetes-patch-merge-key": "type", "x-kubernetes-patch-strategy": "merge" }, + "lifecycleState": { + "description": "LifecycleState is a high-level summary of the InferenceService state.", + "type": "string" + }, "modelStatus": { "description": "Model related statuses", "default": {},