Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions charts/ome-crd/templates/ome.io_inferenceservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36381,6 +36381,14 @@ spec:
- type
type: object
type: array
lifecycleState:
enum:
- READY
- CREATING
- UPDATING
- DELETING
- FAILED
type: string
modelStatus:
properties:
lastFailureInfo:
Expand Down
8 changes: 8 additions & 0 deletions config/crd/full/ome.io_inferenceservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36381,6 +36381,14 @@ spec:
- type
type: object
type: array
lifecycleState:
enum:
- READY
- CREATING
- UPDATING
- DELETING
- FAILED
type: string
modelStatus:
properties:
lastFailureInfo:
Expand Down
15 changes: 15 additions & 0 deletions pkg/apis/ome/v1beta1/inference_service_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ type InferenceServiceStatus struct {
// - LatestDeploymentReady (serverless mode only): aggregated configuration condition, i.e. latest deployment readiness condition; <br/>
// - Ready: aggregated condition; <br/>
duckv1.Status `json:",inline"`
// LifecycleState is a high-level summary of the InferenceService state.
// +optional
LifecycleState InferenceServiceLifecycleState `json:"lifecycleState,omitempty"`
// Addressable endpoint for the InferenceService
// +optional
Address *duckv1.Addressable `json:"address,omitempty"`
Expand All @@ -31,6 +34,18 @@ type InferenceServiceStatus struct {
ModelStatus ModelStatus `json:"modelStatus,omitempty"`
}

// InferenceServiceLifecycleState is a high-level lifecycle state for the InferenceService.
// +kubebuilder:validation:Enum=READY;CREATING;UPDATING;DELETING;FAILED
type InferenceServiceLifecycleState string

// Allowed InferenceServiceLifecycleState values. The set must stay in sync
// with the kubebuilder Enum marker on the type.
const (
	// InferenceServiceLifecycleStateReady is the "READY" lifecycle state.
	InferenceServiceLifecycleStateReady InferenceServiceLifecycleState = "READY"
	// InferenceServiceLifecycleStateCreating is the "CREATING" lifecycle state.
	InferenceServiceLifecycleStateCreating InferenceServiceLifecycleState = "CREATING"
	// InferenceServiceLifecycleStateUpdating is the "UPDATING" lifecycle state.
	InferenceServiceLifecycleStateUpdating InferenceServiceLifecycleState = "UPDATING"
	// InferenceServiceLifecycleStateDeleting is the "DELETING" lifecycle state.
	InferenceServiceLifecycleStateDeleting InferenceServiceLifecycleState = "DELETING"
	// InferenceServiceLifecycleStateFailed is the "FAILED" lifecycle state.
	InferenceServiceLifecycleStateFailed InferenceServiceLifecycleState = "FAILED"
)

// ComponentStatusSpec describes the state of the component
type ComponentStatusSpec struct {
// Latest revision name that is in ready state
Expand Down
81 changes: 69 additions & 12 deletions pkg/controller/v1beta1/inferenceservice/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/retry"
knapis "knative.dev/pkg/apis"
duckv1 "knative.dev/pkg/apis/duck/v1"
"knative.dev/pkg/network"
Expand Down Expand Up @@ -127,6 +128,10 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
}
return reconcile.Result{}, err
}
if err := r.backfillLifecycleState(ctx, isvc); err != nil {
r.Log.Error(err, "Failed to backfill InferenceService lifecycle state", "InferenceService", isvc.Name)
return reconcile.Result{}, err
}
// get annotations from isvc
annotations := utils.Filter(isvc.Annotations, func(key string) bool {
return !utils.Includes(constants.ServiceAnnotationDisallowedList, key)
Expand Down Expand Up @@ -557,24 +562,76 @@ func (r *InferenceServiceReconciler) handleServerlessPrerequisites(isvc *v1beta1
return ctrl.Result{}, nil
}

// backfillLifecycleState populates Status.LifecycleState for an InferenceService
// whose status does not carry one yet.
//
// It is a no-op when desiredService already has a lifecycle state. Otherwise it
// re-reads the object through the client and either:
//   - adopts the freshly read status when that copy already has a lifecycle
//     state, or
//   - derives a state with status.DeriveLifecycleState, writes it through the
//     status subresource, and copies the written status back onto
//     desiredService.
//
// The read-modify-write runs under retry.RetryOnConflict, so update conflicts
// are retried with a fresh read. Errors are returned unwrapped and are not
// logged here: the caller in Reconcile already logs the returned error, and
// logging it in both places produced duplicate entries.
//
// NOTE(review): only Status is copied back onto desiredService; its
// ResourceVersion is left as it was before the status write — confirm that no
// later full-object Update in the same reconcile depends on it being current.
func (r *InferenceServiceReconciler) backfillLifecycleState(ctx context.Context, desiredService *v1beta1.InferenceService) error {
	if desiredService.Status.LifecycleState != "" {
		return nil
	}

	namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace}
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		existingService := &v1beta1.InferenceService{}
		if err := r.Get(ctx, namespacedName, existingService); err != nil {
			return err
		}
		if existingService.Status.LifecycleState != "" {
			// The state was set since desiredService was read; adopt it
			// instead of writing again.
			desiredService.Status = existingService.Status
			return nil
		}

		serviceToUpdate := existingService.DeepCopy()
		// The previous state passed here is always empty at this point (checked above).
		serviceToUpdate.Status.LifecycleState = status.DeriveLifecycleState(existingService, existingService.Status.LifecycleState)
		if err := r.Status().Update(ctx, serviceToUpdate); err != nil {
			// Returned as-is so RetryOnConflict can detect a conflict and retry.
			return err
		}
		desiredService.Status = serviceToUpdate.Status
		return nil
	})
}

func (r *InferenceServiceReconciler) updateStatus(desiredService *v1beta1.InferenceService, deploymentMode constants.DeploymentModeType) error {
existingService := &v1beta1.InferenceService{}
ctx := context.TODO()
namespacedName := types.NamespacedName{Name: desiredService.Name, Namespace: desiredService.Namespace}
if err := r.Get(context.TODO(), namespacedName, existingService); err != nil {
return err
}
wasReady := inferenceServiceReadiness(existingService.Status)
if inferenceServiceStatusEqual(existingService.Status, desiredService.Status) {
// If we didn't change anything then don't call updateStatus.
// This is important because the copy we loaded from the informer's
// cache may be stale, and we don't want to overwrite a prior update
// to status with this stale state.
} else if err := r.Status().Update(context.TODO(), desiredService); err != nil {
wasReady := false
statusUpdated := false
err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
existingService := &v1beta1.InferenceService{}
if err := r.Get(ctx, namespacedName, existingService); err != nil {
return err
}

serviceToUpdate := existingService.DeepCopy()
serviceToUpdate.Status = desiredService.Status
serviceToUpdate.Status.LifecycleState = status.DeriveLifecycleState(serviceToUpdate, existingService.Status.LifecycleState)

wasReady = inferenceServiceReadiness(existingService.Status)
if inferenceServiceStatusEqual(existingService.Status, serviceToUpdate.Status) {
// If we didn't change anything then don't call updateStatus.
// This is important because the copy we loaded from the informer's
// cache may be stale, and we don't want to overwrite a prior update
// to status with this stale state.
return nil
}

if err := r.Status().Update(ctx, serviceToUpdate); err != nil {
if !apierrors.IsConflict(err) {
r.Log.Error(err, "Failed to update InferenceService status", "InferenceService", desiredService.Name)
}
return err
}
desiredService.Status = serviceToUpdate.Status
statusUpdated = true
return nil
})
if err != nil {
r.Log.Error(err, "Failed to update InferenceService status", "InferenceService", desiredService.Name)
r.Recorder.Eventf(desiredService, v1.EventTypeWarning, "UpdateFailed",
"Failed to update status for InferenceService %q: %v", desiredService.Name, err)
return errors.Wrapf(err, "fails to update InferenceService status")
} else {
}
if statusUpdated {
// If there was a difference and there was no error.
isReady := inferenceServiceReadiness(desiredService.Status)
if wasReady && !isReady { // Moved to NotReady State
Expand Down
59 changes: 59 additions & 0 deletions pkg/controller/v1beta1/inferenceservice/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -776,6 +776,65 @@ func TestInferenceServiceReconcile(t *testing.T) {
}
}

// TestBackfillLifecycleState checks that backfillLifecycleState fills in a
// missing Status.LifecycleState and leaves an already-set state unchanged,
// both on the stored object and on the in-memory copy passed by the caller.
func TestBackfillLifecycleState(t *testing.T) {
	g := gomega.NewGomegaWithT(t)

	scheme := runtime.NewScheme()
	g.Expect(v1beta1.AddToScheme(scheme)).NotTo(gomega.HaveOccurred())

	tests := []struct {
		name          string
		status        v1beta1.InferenceServiceStatus
		expectedState v1beta1.InferenceServiceLifecycleState
	}{
		{
			name: "sets lifecycle state when missing",
			status: v1beta1.InferenceServiceStatus{
				ModelStatus: v1beta1.ModelStatus{
					TransitionStatus: v1beta1.InProgress,
				},
			},
			expectedState: v1beta1.InferenceServiceLifecycleStateCreating,
		},
		{
			name: "keeps existing lifecycle state",
			status: v1beta1.InferenceServiceStatus{
				LifecycleState: v1beta1.InferenceServiceLifecycleStateReady,
			},
			expectedState: v1beta1.InferenceServiceLifecycleStateReady,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Bind gomega to the subtest's own *testing.T so that failures are
			// attributed to this subtest and FailNow runs on its goroutine,
			// not on the parent test's.
			g := gomega.NewGomegaWithT(t)

			isvc := &v1beta1.InferenceService{
				ObjectMeta: metav1.ObjectMeta{
					Name:      "test-isvc",
					Namespace: "default",
				},
				Status: tt.status,
			}
			c := ctrlclientfake.NewClientBuilder().
				WithScheme(scheme).
				WithObjects(isvc).
				WithStatusSubresource(isvc).
				Build()
			reconciler := &InferenceServiceReconciler{
				Client: c,
				Log:    ctrl.Log.WithName("test"),
			}

			err := reconciler.backfillLifecycleState(context.TODO(), isvc)
			g.Expect(err).NotTo(gomega.HaveOccurred())

			// The caller's copy must reflect the backfilled state.
			g.Expect(isvc.Status.LifecycleState).To(gomega.Equal(tt.expectedState))

			// The stored object must carry the same state.
			updated := &v1beta1.InferenceService{}
			err = c.Get(context.TODO(), types.NamespacedName{Name: isvc.Name, Namespace: isvc.Namespace}, updated)
			g.Expect(err).NotTo(gomega.HaveOccurred())
			g.Expect(updated.Status.LifecycleState).To(gomega.Equal(tt.expectedState))
		})
	}
}

func TestDetermineDeploymentModes(t *testing.T) {
g := gomega.NewGomegaWithT(t)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package status

import (
v1 "k8s.io/api/core/v1"
"knative.dev/pkg/apis"

"github.com/sgl-project/ome/pkg/apis/ome/v1beta1"
)

// DeriveLifecycleState collapses the detailed InferenceService status into a
// single high-level lifecycle state.
//
// previousState is the lifecycle state recorded before this evaluation; it
// decides whether in-flight progress counts as CREATING or UPDATING, and
// whether a Ready=False condition counts as FAILED.
//
// Rules are evaluated in order, first match wins:
//  1. nil isvc                                        -> CREATING
//  2. deletion timestamp set                          -> DELETING
//  3. failed model transition or recorded failure     -> FAILED
//  4. status reports ready                            -> READY
//  5. progressing                                     -> UPDATING if previously
//     established, otherwise CREATING
//  6. Ready condition False and previously established -> FAILED
//  7. anything else                                   -> CREATING
//
// NOTE(review): rule 3 is checked before rule 4, so a non-nil
// ModelStatus.LastFailureInfo yields FAILED even when the status is ready —
// confirm LastFailureInfo is cleared once the service recovers.
func DeriveLifecycleState(
	isvc *v1beta1.InferenceService,
	previousState v1beta1.InferenceServiceLifecycleState,
) v1beta1.InferenceServiceLifecycleState {
	if isvc == nil {
		return v1beta1.InferenceServiceLifecycleStateCreating
	}
	if !isvc.GetDeletionTimestamp().IsZero() {
		return v1beta1.InferenceServiceLifecycleStateDeleting
	}

	readyCondition := isvc.Status.GetCondition(apis.ConditionReady)
	established := lifecyclePreviouslyEstablished(previousState)

	switch {
	case lifecycleTransitionFailed(isvc.Status.ModelStatus.TransitionStatus) || hasFailure(&isvc.Status):
		return v1beta1.InferenceServiceLifecycleStateFailed

	case isvc.Status.IsReady():
		return v1beta1.InferenceServiceLifecycleStateReady

	case lifecycleProgressing(&isvc.Status, readyCondition):
		if established {
			return v1beta1.InferenceServiceLifecycleStateUpdating
		}
		return v1beta1.InferenceServiceLifecycleStateCreating

	case readyCondition != nil && readyCondition.Status == v1.ConditionFalse && established:
		return v1beta1.InferenceServiceLifecycleStateFailed

	default:
		return v1beta1.InferenceServiceLifecycleStateCreating
	}
}

// lifecycleTransitionFailed reports whether transitionStatus is one of the
// transition statuses treated as a failure: InvalidSpec or BlockedByFailedLoad.
func lifecycleTransitionFailed(transitionStatus v1beta1.TransitionStatus) bool {
	switch transitionStatus {
	case v1beta1.InvalidSpec, v1beta1.BlockedByFailedLoad:
		return true
	default:
		return false
	}
}

// hasFailure reports whether status records failure details, i.e. whether
// ModelStatus.LastFailureInfo is set. A nil status has no failure.
func hasFailure(status *v1beta1.InferenceServiceStatus) bool {
	if status == nil {
		return false
	}
	return status.ModelStatus.LastFailureInfo != nil
}

// lifecycleProgressing reports whether the status indicates work still in
// flight. It returns true when any of the following holds:
//   - the model transition status is InProgress;
//   - readyCondition is present and its status is Unknown;
//   - some component has both a latest-created and a latest-ready revision
//     recorded and the two differ.
//
// A nil status is treated as not progressing, matching the nil handling in
// hasFailure; readyCondition may be nil.
func lifecycleProgressing(status *v1beta1.InferenceServiceStatus, readyCondition *apis.Condition) bool {
	if status == nil {
		return false
	}
	if status.ModelStatus.TransitionStatus == v1beta1.InProgress {
		return true
	}
	if readyCondition != nil && readyCondition.Status == v1.ConditionUnknown {
		return true
	}
	for _, component := range status.Components {
		created, ready := component.LatestCreatedRevision, component.LatestReadyRevision
		// Components missing either revision name are not compared.
		if created == "" || ready == "" {
			continue
		}
		if created != ready {
			return true
		}
	}
	return false
}

// lifecyclePreviouslyEstablished reports whether previousState is READY,
// UPDATING or FAILED. Every other value, including the empty state, CREATING
// and DELETING, yields false.
func lifecyclePreviouslyEstablished(previousState v1beta1.InferenceServiceLifecycleState) bool {
	return previousState == v1beta1.InferenceServiceLifecycleStateReady ||
		previousState == v1beta1.InferenceServiceLifecycleStateUpdating ||
		previousState == v1beta1.InferenceServiceLifecycleStateFailed
}
Loading