diff --git a/charts/ome-crd/templates/ome.io_inferenceservices.yaml b/charts/ome-crd/templates/ome.io_inferenceservices.yaml
index 48caf611..f88725dd 100644
--- a/charts/ome-crd/templates/ome.io_inferenceservices.yaml
+++ b/charts/ome-crd/templates/ome.io_inferenceservices.yaml
@@ -36381,6 +36381,14 @@ spec:
- type
type: object
type: array
+ lifecycleState:
+ enum:
+ - READY
+ - CREATING
+ - UPDATING
+ - DELETING
+ - FAILED
+ type: string
modelStatus:
properties:
lastFailureInfo:
diff --git a/config/crd/full/ome.io_inferenceservices.yaml b/config/crd/full/ome.io_inferenceservices.yaml
index 48caf611..f88725dd 100644
--- a/config/crd/full/ome.io_inferenceservices.yaml
+++ b/config/crd/full/ome.io_inferenceservices.yaml
@@ -36381,6 +36381,14 @@ spec:
- type
type: object
type: array
+ lifecycleState:
+ enum:
+ - READY
+ - CREATING
+ - UPDATING
+ - DELETING
+ - FAILED
+ type: string
modelStatus:
properties:
lastFailureInfo:
diff --git a/pkg/apis/ome/v1beta1/inference_service_status.go b/pkg/apis/ome/v1beta1/inference_service_status.go
index 02ad9043..c373f670 100644
--- a/pkg/apis/ome/v1beta1/inference_service_status.go
+++ b/pkg/apis/ome/v1beta1/inference_service_status.go
@@ -18,6 +18,9 @@ type InferenceServiceStatus struct {
// - LatestDeploymentReady (serverless mode only): aggregated configuration condition, i.e. latest deployment readiness condition;
// - Ready: aggregated condition;
duckv1.Status `json:",inline"`
+ // LifecycleState is a high-level summary of the InferenceService state.
+ // +optional
+ LifecycleState InferenceServiceLifecycleState `json:"lifecycleState,omitempty"`
// Addressable endpoint for the InferenceService
// +optional
Address *duckv1.Addressable `json:"address,omitempty"`
@@ -31,6 +34,18 @@ type InferenceServiceStatus struct {
ModelStatus ModelStatus `json:"modelStatus,omitempty"`
}
+// InferenceServiceLifecycleState is a coarse, single-value summary of an InferenceService's lifecycle.
+// +kubebuilder:validation:Enum=READY;CREATING;UPDATING;DELETING;FAILED
+type InferenceServiceLifecycleState string
+
+const (
+ InferenceServiceLifecycleStateReady InferenceServiceLifecycleState = "READY" // status reports ready and no failure is recorded
+ InferenceServiceLifecycleStateCreating InferenceServiceLifecycleState = "CREATING" // rollout in progress with no established previous state; also the fallback
+ InferenceServiceLifecycleStateUpdating InferenceServiceLifecycleState = "UPDATING" // rollout in progress after a previous READY, UPDATING or FAILED state
+ InferenceServiceLifecycleStateDeleting InferenceServiceLifecycleState = "DELETING" // the object's deletion timestamp is set
+ InferenceServiceLifecycleStateFailed InferenceServiceLifecycleState = "FAILED" // failure recorded, or Ready is false after an established state
+)
+
// ComponentStatusSpec describes the state of the component
type ComponentStatusSpec struct {
// Latest revision name that is in ready state
diff --git a/pkg/controller/v1beta1/inferenceservice/controller.go b/pkg/controller/v1beta1/inferenceservice/controller.go
index 7f60fbe1..3f897f4c 100644
--- a/pkg/controller/v1beta1/inferenceservice/controller.go
+++ b/pkg/controller/v1beta1/inferenceservice/controller.go
@@ -24,6 +24,7 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
+ "k8s.io/client-go/util/retry"
knapis "knative.dev/pkg/apis"
duckv1 "knative.dev/pkg/apis/duck/v1"
"knative.dev/pkg/network"
@@ -127,6 +128,10 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
return reconcile.Result{}, err
}
+ if err := r.backfillLifecycleState(ctx, isvc); err != nil {
+ r.Log.Error(err, "Failed to backfill InferenceService lifecycle state", "InferenceService", isvc.Name)
+ return reconcile.Result{}, err
+ }
// get annotations from isvc
annotations := utils.Filter(isvc.Annotations, func(key string) bool {
return !utils.Includes(constants.ServiceAnnotationDisallowedList, key)
@@ -557,24 +562,76 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1
return ctrl.Result{}, nil
}
+// backfillLifecycleState persists a derived lifecycle state for objects whose status has none yet.
+// Each attempt re-reads the object, update conflicts are retried, and the resulting status is copied
+// into desiredService. Errors are returned without logging here because Reconcile logs them.
+func (r *InferenceServiceReconciler) backfillLifecycleState(ctx context.Context, desiredService *v1beta1.InferenceService) error {
+	if desiredService.Status.LifecycleState != "" {
+		return nil
+	}
+
+	namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace}
+	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		existingService := &v1beta1.InferenceService{}
+		if err := r.Get(ctx, namespacedName, existingService); err != nil {
+			return err
+		}
+		if existingService.Status.LifecycleState != "" {
+			desiredService.Status = existingService.Status
+			return nil
+		}
+
+		serviceToUpdate := existingService.DeepCopy()
+		serviceToUpdate.Status.LifecycleState = status.DeriveLifecycleState(existingService, existingService.Status.LifecycleState)
+		if err := r.Status().Update(ctx, serviceToUpdate); err != nil {
+			return err
+		}
+		desiredService.Status = serviceToUpdate.Status
+		return nil
+	})
+}
+
func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error {
- existingService := &v1beta1.InferenceService{}
+ ctx := context.TODO()
namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace}
- if err := r.Get(context.TODO(), namespacedName, existingService); err != nil {
- return err
- }
- wasReady := inferenceServiceReadiness(existingService.Status)
- if inferenceServiceStatusEqual(existingService.Status, desiredService.Status) {
- // If we didn't change anything then don't call updateStatus.
- // This is important because the copy we loaded from the informer's
- // cache may be stale, and we don't want to overwrite a prior update
- // to status with this stale state.
- } else if err := r.Status().Update(context.TODO(), desiredService); err != nil {
+ wasReady := false
+ statusUpdated := false
+ err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+ existingService := &v1beta1.InferenceService{}
+ if err := r.Get(ctx, namespacedName, existingService); err != nil {
+ return err
+ }
+
+ serviceToUpdate := existingService.DeepCopy()
+ serviceToUpdate.Status = desiredService.Status
+ serviceToUpdate.Status.LifecycleState = status.DeriveLifecycleState(serviceToUpdate, existingService.Status.LifecycleState)
+
+ wasReady = inferenceServiceReadiness(existingService.Status)
+ if inferenceServiceStatusEqual(existingService.Status, serviceToUpdate.Status) {
+ // If we didn't change anything then don't call updateStatus.
+ // This is important because the copy we loaded from the informer's
+ // cache may be stale, and we don't want to overwrite a prior update
+ // to status with this stale state.
+ return nil
+ }
+
+ if err := r.Status().Update(ctx, serviceToUpdate); err != nil {
+ if !apierrors.IsConflict(err) {
+ r.Log.Error(err, "Failed to update InferenceService status", "InferenceService", desiredService.Name)
+ }
+ return err
+ }
+ desiredService.Status = serviceToUpdate.Status
+ statusUpdated = true
+ return nil
+ })
+ if err != nil {
r.Log.Error(err, "Failed to update InferenceService status", "InferenceService", desiredService.Name)
r.Recorder.Eventf(desiredService, v1.EventTypeWarning, "UpdateFailed",
"Failed to update status for InferenceService %q: %v", desiredService.Name, err)
return errors.Wrapf(err, "fails to update InferenceService status")
- } else {
+ }
+ if statusUpdated {
// If there was a difference and there was no error.
isReady := inferenceServiceReadiness(desiredService.Status)
if wasReady && !isReady { // Moved to NotReady State
diff --git a/pkg/controller/v1beta1/inferenceservice/controller_test.go b/pkg/controller/v1beta1/inferenceservice/controller_test.go
index 669fd172..ff08da9c 100644
--- a/pkg/controller/v1beta1/inferenceservice/controller_test.go
+++ b/pkg/controller/v1beta1/inferenceservice/controller_test.go
@@ -776,6 +776,65 @@ func TestInferenceServiceReconcile(t *testing.T) {
}
}
+// TestBackfillLifecycleState checks that backfillLifecycleState stores a derived state when the
+// status has none and leaves an already-populated state untouched.
+func TestBackfillLifecycleState(t *testing.T) {
+	g := gomega.NewGomegaWithT(t)
+	scheme := runtime.NewScheme()
+	g.Expect(v1beta1.AddToScheme(scheme)).NotTo(gomega.HaveOccurred())
+
+	tests := []struct {
+		name          string
+		status        v1beta1.InferenceServiceStatus
+		expectedState v1beta1.InferenceServiceLifecycleState
+	}{
+		{
+			name: "sets lifecycle state when missing",
+			status: v1beta1.InferenceServiceStatus{
+				ModelStatus: v1beta1.ModelStatus{
+					TransitionStatus: v1beta1.InProgress,
+				},
+			},
+			expectedState: v1beta1.InferenceServiceLifecycleStateCreating,
+		},
+		{
+			name: "keeps existing lifecycle state",
+			status: v1beta1.InferenceServiceStatus{
+				LifecycleState: v1beta1.InferenceServiceLifecycleStateReady,
+			},
+			expectedState: v1beta1.InferenceServiceLifecycleStateReady,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Use a subtest-scoped Gomega so failures are reported on (and stop) this subtest only.
+			g := gomega.NewGomegaWithT(t)
+			isvc := &v1beta1.InferenceService{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-isvc",
+					Namespace: "default",
+				},
+				Status: tt.status,
+			}
+			c := ctrlclientfake.NewClientBuilder().
+				WithScheme(scheme).
+				WithObjects(isvc).
+				WithStatusSubresource(isvc).
+				Build()
+			reconciler := &InferenceServiceReconciler{
+				Client: c,
+				Log:    ctrl.Log.WithName("test"),
+			}
+			g.Expect(reconciler.backfillLifecycleState(context.TODO(), isvc)).NotTo(gomega.HaveOccurred())
+
+			updated := &v1beta1.InferenceService{}
+			g.Expect(c.Get(context.TODO(), types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, updated)).NotTo(gomega.HaveOccurred())
+			g.Expect(updated.Status.LifecycleState).To(gomega.Equal(tt.expectedState))
+		})
+	}
+}
+
func TestDetermineDeploymentModes(t *testing.T) {
g := gomega.NewGomegaWithT(t)
diff --git a/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state.go b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state.go
new file mode 100644
index 00000000..60032f2a
--- /dev/null
+++ b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state.go
@@ -0,0 +1,83 @@
+package status
+
+import (
+ v1 "k8s.io/api/core/v1"
+ "knative.dev/pkg/apis"
+
+ "github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
+)
+
+// DeriveLifecycleState collapses the detailed status of isvc into a single lifecycle state.
+// previousState is the state recorded before this evaluation: it separates CREATING from UPDATING
+// and decides whether a false Ready condition counts as FAILED. A nil isvc yields CREATING.
+// NOTE(review): failure signals outrank readiness, so a non-nil LastFailureInfo reports FAILED
+// even when the status is ready — confirm that this precedence is intended.
+func DeriveLifecycleState(
+	isvc *v1beta1.InferenceService,
+	previousState v1beta1.InferenceServiceLifecycleState,
+) v1beta1.InferenceServiceLifecycleState {
+	if isvc == nil {
+		return v1beta1.InferenceServiceLifecycleStateCreating
+	}
+	if !isvc.GetDeletionTimestamp().IsZero() {
+		return v1beta1.InferenceServiceLifecycleStateDeleting
+	}
+
+	current := &isvc.Status
+	ready := current.GetCondition(apis.ConditionReady)
+	established := lifecyclePreviouslyEstablished(previousState)
+
+	// Cases are listed in precedence order; the first match wins.
+	switch {
+	case lifecycleTransitionFailed(current.ModelStatus.TransitionStatus), hasFailure(current):
+		return v1beta1.InferenceServiceLifecycleStateFailed
+	case current.IsReady():
+		return v1beta1.InferenceServiceLifecycleStateReady
+	case lifecycleProgressing(current, ready):
+		// Progress after an established state is an update; otherwise it is the first rollout.
+		if established {
+			return v1beta1.InferenceServiceLifecycleStateUpdating
+		}
+		return v1beta1.InferenceServiceLifecycleStateCreating
+	case ready != nil && ready.Status == v1.ConditionFalse && established:
+		return v1beta1.InferenceServiceLifecycleStateFailed
+	default:
+		return v1beta1.InferenceServiceLifecycleStateCreating
+	}
+}
+
+// lifecycleTransitionFailed reports whether transitionStatus is InvalidSpec or BlockedByFailedLoad.
+func lifecycleTransitionFailed(transitionStatus v1beta1.TransitionStatus) bool {
+	return transitionStatus == v1beta1.BlockedByFailedLoad || transitionStatus == v1beta1.InvalidSpec
+}
+
+// hasFailure reports whether status carries a LastFailureInfo; a nil status has none.
+func hasFailure(status *v1beta1.InferenceServiceStatus) bool {
+	if status == nil {
+		return false
+	}
+	return status.ModelStatus.LastFailureInfo != nil
+}
+
+// lifecycleProgressing reports whether the model transition is InProgress, the Ready condition is
+// Unknown, or a component's non-empty created and ready revisions differ. status must not be nil.
+func lifecycleProgressing(status *v1beta1.InferenceServiceStatus, readyCondition *apis.Condition) bool {
+	readyUnknown := readyCondition != nil && readyCondition.Status == v1.ConditionUnknown
+	if status.ModelStatus.TransitionStatus == v1beta1.InProgress || readyUnknown {
+		return true
+	}
+	for _, component := range status.Components {
+		created, ready := component.LatestCreatedRevision, component.LatestReadyRevision
+		if created != "" && ready != "" && created != ready {
+			return true
+		}
+	}
+	return false
+}
+
+// lifecyclePreviouslyEstablished reports whether previousState is READY, UPDATING or FAILED.
+func lifecyclePreviouslyEstablished(previousState v1beta1.InferenceServiceLifecycleState) bool {
+	return previousState == v1beta1.InferenceServiceLifecycleStateReady ||
+		previousState == v1beta1.InferenceServiceLifecycleStateUpdating ||
+		previousState == v1beta1.InferenceServiceLifecycleStateFailed
+}
diff --git a/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state_test.go b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state_test.go
new file mode 100644
index 00000000..75df54f0
--- /dev/null
+++ b/pkg/controller/v1beta1/inferenceservice/status/lifecycle_state_test.go
@@ -0,0 +1,146 @@
+package status
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ v1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "knative.dev/pkg/apis"
+
+ "github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
+)
+
+// TestDeriveLifecycleState runs DeriveLifecycleState over a table of statuses and previous states
+// and compares the derived state with the expected one.
+func TestDeriveLifecycleState(t *testing.T) {
+	// ingressNotReady builds a fresh status whose IngressReady condition is False.
+	ingressNotReady := func() v1beta1.InferenceServiceStatus {
+		st := v1beta1.InferenceServiceStatus{}
+		st.InitializeConditions()
+		st.SetCondition(v1beta1.IngressReady, &apis.Condition{
+			Type:    v1beta1.IngressReady,
+			Status:  v1.ConditionFalse,
+			Reason:  "IngressNotReady",
+			Message: "ingress failed",
+		})
+		return st
+	}
+
+	tests := []struct {
+		name          string
+		isvc          *v1beta1.InferenceService
+		previousState v1beta1.InferenceServiceLifecycleState
+		expected      v1beta1.InferenceServiceLifecycleState
+	}{
+		// Nil input, deletion and readiness.
+		{
+			name:     "nil service returns creating",
+			expected: v1beta1.InferenceServiceLifecycleStateCreating,
+		},
+		{
+			name:     "deleting service returns deleting",
+			isvc:     deletingInferenceService(),
+			expected: v1beta1.InferenceServiceLifecycleStateDeleting,
+		},
+		{
+			name:     "ready service returns ready",
+			isvc:     readyInferenceService(),
+			expected: v1beta1.InferenceServiceLifecycleStateReady,
+		},
+		// Failure signals.
+		{
+			name: "invalid spec returns failed",
+			isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{
+				ModelStatus: v1beta1.ModelStatus{TransitionStatus: v1beta1.InvalidSpec},
+			}),
+			expected: v1beta1.InferenceServiceLifecycleStateFailed,
+		},
+		{
+			name: "last failure info returns failed",
+			isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{
+				ModelStatus: v1beta1.ModelStatus{
+					TransitionStatus: v1beta1.InProgress,
+					LastFailureInfo:  &v1beta1.FailureInfo{Reason: v1beta1.ModelLoadFailed},
+				},
+			}),
+			expected: v1beta1.InferenceServiceLifecycleStateFailed,
+		},
+		// Rollouts in progress.
+		{
+			name: "in progress initial reconcile returns creating",
+			isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{
+				ModelStatus: v1beta1.ModelStatus{TransitionStatus: v1beta1.InProgress},
+			}),
+			expected: v1beta1.InferenceServiceLifecycleStateCreating,
+		},
+		{
+			name: "in progress established service returns updating",
+			isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{
+				ModelStatus: v1beta1.ModelStatus{TransitionStatus: v1beta1.InProgress},
+			}),
+			previousState: v1beta1.InferenceServiceLifecycleStateReady,
+			expected:      v1beta1.InferenceServiceLifecycleStateUpdating,
+		},
+		{
+			name: "component rollout initial reconcile returns creating",
+			isvc: inferenceServiceWithStatus(v1beta1.InferenceServiceStatus{
+				Components: map[v1beta1.ComponentType]v1beta1.ComponentStatusSpec{
+					v1beta1.EngineComponent: {
+						LatestCreatedRevision: "engine-2",
+						LatestReadyRevision:   "engine-1",
+					},
+				},
+			}),
+			expected: v1beta1.InferenceServiceLifecycleStateCreating,
+		},
+		// IngressReady false with no rollout in progress.
+		{
+			name:          "ready false after established state returns failed",
+			isvc:          inferenceServiceWithStatus(ingressNotReady()),
+			previousState: v1beta1.InferenceServiceLifecycleStateReady,
+			expected:      v1beta1.InferenceServiceLifecycleStateFailed,
+		},
+		{
+			name:     "ready false during initial reconcile returns creating",
+			isvc:     inferenceServiceWithStatus(ingressNotReady()),
+			expected: v1beta1.InferenceServiceLifecycleStateCreating,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := DeriveLifecycleState(tt.isvc, tt.previousState)
+			assert.Equal(t, tt.expected, got)
+		})
+	}
+}
+
+// inferenceServiceWithStatus returns an InferenceService that carries only the given status;
+// all other fields keep their zero values.
+func inferenceServiceWithStatus(status v1beta1.InferenceServiceStatus) *v1beta1.InferenceService {
+	isvc := &v1beta1.InferenceService{}
+	isvc.Status = status
+	return isvc
+}
+
+// readyInferenceService returns an InferenceService whose conditions are initialized and whose
+// IngressReady and EngineReady conditions are both True; the table test expects this status to
+// derive READY.
+func readyInferenceService() *v1beta1.InferenceService {
+	var ready v1beta1.InferenceServiceStatus
+	ready.InitializeConditions()
+	ready.SetCondition(v1beta1.IngressReady, &apis.Condition{Type: v1beta1.IngressReady, Status: v1.ConditionTrue})
+	ready.SetCondition(v1beta1.EngineReady, &apis.Condition{Type: v1beta1.EngineReady, Status: v1.ConditionTrue})
+	return &v1beta1.InferenceService{Status: ready}
+}
+
+// deletingInferenceService returns an otherwise empty InferenceService whose deletion timestamp
+// is set to the current time; the table test expects it to derive DELETING.
+func deletingInferenceService() *v1beta1.InferenceService {
+	deletedAt := metav1.Now()
+	isvc := &v1beta1.InferenceService{}
+	isvc.ObjectMeta.DeletionTimestamp = &deletedAt
+	return isvc
+}
diff --git a/pkg/openapi/openapi_generated.go b/pkg/openapi/openapi_generated.go
index 586b07b4..72e69d6e 100644
--- a/pkg/openapi/openapi_generated.go
+++ b/pkg/openapi/openapi_generated.go
@@ -3893,6 +3893,13 @@ func schema_pkg_apis_ome_v1beta1_InferenceServiceStatus(ref common.ReferenceCall
},
},
},
+ "lifecycleState": {
+ SchemaProps: spec.SchemaProps{
+ Description: "LifecycleState is a high-level summary of the InferenceService state.",
+ Type: []string{"string"},
+ Format: "",
+ },
+ },
"address": {
SchemaProps: spec.SchemaProps{
Description: "Addressable endpoint for the InferenceService",
diff --git a/pkg/openapi/swagger.json b/pkg/openapi/swagger.json
index 3ab9cd4e..fc196b22 100644
--- a/pkg/openapi/swagger.json
+++ b/pkg/openapi/swagger.json
@@ -2162,6 +2162,10 @@
"x-kubernetes-patch-merge-key": "type",
"x-kubernetes-patch-strategy": "merge"
},
+ "lifecycleState": {
+ "description": "LifecycleState is a high-level summary of the InferenceService state.",
+ "type": "string"
+ },
"modelStatus": {
"description": "Model related statuses",
"default": {},