Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions pkg/rest/autoplace.go
Original file line number Diff line number Diff line change
Expand Up @@ -1556,11 +1556,18 @@ func (s *Server) createOneResource(w http.ResponseWriter, r *http.Request, rdNam
// Persist onto rd.LayerStack if not already set so the satellite
// reconciler sees the right composition.
if len(env.LayerList) > 0 {
rd, getErr := s.Store.ResourceDefinitions().Get(r.Context(), rdName)
if getErr == nil && len(rd.LayerStack) == 0 {
rd.LayerStack = append([]string(nil), env.LayerList...)
_ = s.Store.ResourceDefinitions().Update(r.Context(), &rd)
}
// Bug 204b shape: typed-Patch with retry-on-conflict; the
// "only if unset" guard re-runs against the live RD on every
// retry so a concurrent reconciler write can't surface a 409
// (and a concurrent LayerStack set isn't clobbered).
_ = s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
func(rd *apiv1.ResourceDefinition) error {
if len(rd.LayerStack) == 0 {
rd.LayerStack = append([]string(nil), env.LayerList...)
}

return nil
})
Comment on lines 1558 to +1570

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Don't swallow the RD patch failure on explicit create.

If layer_list is present and this patch fails, the handler still continues with createOrPromoteResource() and returns success even though the parent RD never recorded the requested LayerStack. That can leave the new Resource out of sync with its definition, and it also discards concrete failures like ErrNotFound after a concurrent RD delete.

Suggested fix
 	if len(env.LayerList) > 0 {
-		_ = s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
-			func(rd *apiv1.ResourceDefinition) error {
-				if len(rd.LayerStack) == 0 {
-					rd.LayerStack = append([]string(nil), env.LayerList...)
-				}
-
-				return nil
-			})
+		err := s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
+			func(rd *apiv1.ResourceDefinition) error {
+				if len(rd.LayerStack) == 0 {
+					rd.LayerStack = append([]string(nil), env.LayerList...)
+				}
+
+				return nil
+			})
+		if err != nil {
+			writeStoreError(w, err)
+
+			return nil, false
+		}
 	}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if len(env.LayerList) > 0 {
rd, getErr := s.Store.ResourceDefinitions().Get(r.Context(), rdName)
if getErr == nil && len(rd.LayerStack) == 0 {
rd.LayerStack = append([]string(nil), env.LayerList...)
_ = s.Store.ResourceDefinitions().Update(r.Context(), &rd)
}
// Bug 204b shape: typed-Patch with retry-on-conflict; the
// "only if unset" guard re-runs against the live RD on every
// retry so a concurrent reconciler write can't surface a 409
// (and a concurrent LayerStack set isn't clobbered).
_ = s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
func(rd *apiv1.ResourceDefinition) error {
if len(rd.LayerStack) == 0 {
rd.LayerStack = append([]string(nil), env.LayerList...)
}
return nil
})
if len(env.LayerList) > 0 {
// Bug 204b shape: typed-Patch with retry-on-conflict; the
// "only if unset" guard re-runs against the live RD on every
// retry so a concurrent reconciler write can't surface a 409
// (and a concurrent LayerStack set isn't clobbered).
err := s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
func(rd *apiv1.ResourceDefinition) error {
if len(rd.LayerStack) == 0 {
rd.LayerStack = append([]string(nil), env.LayerList...)
}
return nil
})
if err != nil {
writeStoreError(w, err)
return nil, false
}
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@pkg/rest/autoplace.go` around lines 1558 - 1570, The RD patch call that sets
rd.LayerStack when env.LayerList is present currently ignores errors (the
returned value from s.Store.ResourceDefinitions().PatchResourceDefinitionSpec is
discarded), allowing the handler to proceed to createOrPromoteResource() and
return success even if the patch failed or returned ErrNotFound; change this by
capturing the error from PatchResourceDefinitionSpec (called with rdName and the
rd mutation closure) and if non-nil return or propagate that error from the
handler (handling ErrNotFound specifically if needed) instead of continuing to
create the resource so the caller sees the concrete failure and the new Resource
cannot be created out-of-sync with its RD.

}

// Bug 327 (P1, recurring — reported 5×): a bare `linstor r c <node>
Expand Down Expand Up @@ -2514,10 +2521,20 @@ func (s *Server) handleResourceMakeAvailable(w http.ResponseWriter, r *http.Requ

// Pass-through for CSI-supplied layer_list, identical to the
// autoplace / explicit-create flows. RD-level LayerStack wins.
//
// Bug 204b shape: typed-Patch with retry-on-conflict; the "only
// if unset" guard re-runs against the live RD on every retry so
// a concurrent reconciler write can't surface a 409 to the
// make-available caller.
if len(req.LayerList) > 0 && len(rd.LayerStack) == 0 {
rd.LayerStack = append([]string(nil), req.LayerList...)
err = s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
func(live *apiv1.ResourceDefinition) error {
if len(live.LayerStack) == 0 {
live.LayerStack = append([]string(nil), req.LayerList...)
}

err = s.Store.ResourceDefinitions().Update(r.Context(), &rd)
return nil
})
if err != nil {
writeStoreError(w, err)

Expand Down
28 changes: 14 additions & 14 deletions pkg/rest/drbd_passphrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,20 @@ func (s *Server) handleDRBDPassphraseSet(w http.ResponseWriter, r *http.Request)
return
}

rd, err := s.Store.ResourceDefinitions().Get(r.Context(), rdName)
if err != nil {
writeStoreError(w, err)

return
}

if rd.Props == nil {
rd.Props = map[string]string{}
}

rd.Props[drbdSharedSecretKey] = req.Passphrase

err = s.Store.ResourceDefinitions().Update(r.Context(), &rd)
// Bug 204b shape: typed-Patch with retry-on-conflict so a
// reconciler write on the RD between the read and the write
// re-applies the secret onto fresh state instead of surfacing
// a 409 to the operator.
err := s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(r.Context(), rdName,
func(rd *apiv1.ResourceDefinition) error {
if rd.Props == nil {
rd.Props = map[string]string{}
}

rd.Props[drbdSharedSecretKey] = req.Passphrase

return nil
})
if err != nil {
writeStoreError(w, err)

Expand Down
29 changes: 15 additions & 14 deletions pkg/rest/node_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -620,24 +620,25 @@ func (s *Server) cascadeOrphansForLostNode(ctx context.Context, name string) err
return nil
}

// updateNodeFlags loads the Node, applies each mutator, and persists.
// Common shape across all three endpoints; lives here so the handler
// bodies stay one-line.
// updateNodeFlags applies each mutator to the Node's flag list and
// persists. Common shape across all three endpoints; lives here so the
// handler bodies stay one-line.
//
// Bug 205 shape: routed through PatchNodeSpec so a satellite Hello /
// reconciler write landing between the read and the write re-runs the
// flag mutation against fresh state under RetryOnConflict instead of
// leaking a 409 to the operator (same conflict class as the
// activate/deactivate release-gate flake on mutateResourceFlag).
func updateNodeFlags(w http.ResponseWriter, r *http.Request, s *Server, mutators ...func([]string) []string) {
name := r.PathValue("node")

node, err := s.Store.Nodes().Get(r.Context(), name)
if err != nil {
writeStoreError(w, err)

return
}

for _, m := range mutators {
node.Flags = m(node.Flags)
}
err := s.Store.Nodes().PatchNodeSpec(r.Context(), name, func(node *apiv1.Node) error {
for _, m := range mutators {
node.Flags = m(node.Flags)
}

err = s.Store.Nodes().Update(r.Context(), &node)
return nil
})
if err != nil {
writeStoreError(w, err)

Expand Down
20 changes: 17 additions & 3 deletions pkg/rest/nodes.go
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,16 @@ func (s *Server) upsertNodeAndDiskless(w http.ResponseWriter, r *http.Request, n
case err == nil:
// fresh create — normal path
case errors.Is(err, store.ErrAlreadyExists):
err = s.Store.Nodes().Update(r.Context(), n)
// Bug 205 shape: the re-register replace is routed through
// PatchNodeSpec so a satellite Hello / reconciler write
// landing between the AlreadyExists probe and the persist
// re-applies the wire snapshot under RetryOnConflict
// instead of leaking a 409 into the registration loop.
err = s.Store.Nodes().PatchNodeSpec(r.Context(), n.Name, func(live *apiv1.Node) error {
*live = *n

return nil
})
Comment on lines +581 to +585

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Completely overwriting live with *n during an upsert (re-registration) will silently wipe any existing node flags (such as EVICTED) and custom properties (such as topology properties or Aux/* keys) that are not present in the incoming registration request n.

Since satellite re-registrations or operator re-runs of linstor node create typically do not carry these controller-managed flags or custom metadata, this can lead to accidental loss of critical state (e.g., aborting an active node eviction process).

Consider preserving the existing flags and merging the properties to prevent data loss.

		err = s.Store.Nodes().PatchNodeSpec(r.Context(), n.Name, func(live *apiv1.Node) error {
			existingFlags := live.Flags
			existingProps := live.Props

			*live = *n

			// Preserve existing flags (like EVICTED) if not explicitly overwritten
			for _, f := range existingFlags {
				if !slices.Contains(live.Flags, f) {
					live.Flags = append(live.Flags, f)
				}
			}

			// Merge properties so custom metadata (like topology labels or Aux/*) is not wiped
			if len(live.Props) == 0 {
				live.Props = existingProps
			} else if len(existingProps) > 0 {
				mergedProps := make(map[string]string, len(existingProps)+len(live.Props))
				for k, v := range existingProps {
					mergedProps[k] = v
				}
				for k, v := range live.Props {
					mergedProps[k] = v
				}
				live.Props = mergedProps
			}

			return nil
		})

Comment on lines +576 to +585

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | 🏗️ Heavy lift

Avoid full-replacing the live node inside the retry patch.

Line 582 assigns *live = *n, so this closure still reapplies the POST snapshot as a wholesale spec replacement. That means an idempotent re-register can wipe live-only node fields that were never sent in the request, including the typed DRBD ranges guarded by pkg/store/k8s/patch_bug_208_test.go:52-110. Merge only the fields that the create API owns, or explicitly preserve controller-managed spec fields before returning.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@pkg/rest/nodes.go` around lines 576 - 585, The PatchNodeSpec closure used in
s.Store.Nodes().PatchNodeSpec should not do a full object replace via "*live =
*n" because that overwrites controller-managed fields; instead merge only the
create-API owned fields from "n" into "live" (or explicitly copy/preserve
controller-managed spec fields such as the DRBD ranges referenced in
pkg/store/k8s/patch_bug_208_test.go) before returning. Update the closure passed
to s.Store.Nodes().PatchNodeSpec to set only the allowed spec fields (or to read
and reassign preserved fields from the original live node into the result)
rather than replacing the whole "live" object.

if err != nil {
writeStoreError(w, err)

Expand Down Expand Up @@ -913,9 +922,14 @@ func (s *Server) handleNodeUpdate(w http.ResponseWriter, r *http.Request) {
}

if patch.NodeType != "" && patch.NodeType != existing.Type {
existing.Type = patch.NodeType
// Bug 205 shape: typed-Patch with retry-on-conflict so a
// concurrent satellite Hello / reconciler write can't bounce
// the rare node_type toggle with a 409.
err = s.Store.Nodes().PatchNodeSpec(r.Context(), name, func(live *apiv1.Node) error {
live.Type = patch.NodeType

err = s.Store.Nodes().Update(r.Context(), &existing)
return nil
})
if err != nil {
writeStoreError(w, err)

Expand Down
34 changes: 18 additions & 16 deletions pkg/rest/rd_clone.go
Original file line number Diff line number Diff line change
Expand Up @@ -446,22 +446,24 @@ func (s *Server) applyClonePropEdits(ctx context.Context, req *rdCloneRequest) e
return nil
}

rd, err := s.Store.ResourceDefinitions().Get(ctx, req.Name)
if err != nil {
return err //nolint:wrapcheck // message wrapped by the caller's envelope
}

if rd.Props == nil {
rd.Props = make(map[string]string, len(req.OverrideProps))
}

maps.Copy(rd.Props, req.OverrideProps)

for _, k := range req.DeleteProps {
delete(rd.Props, k)
}

return s.Store.ResourceDefinitions().Update(ctx, &rd) //nolint:wrapcheck // message wrapped by the caller's envelope
// Bug 204b shape: typed-Patch with retry-on-conflict so the
// freshly-materialised clone target's reconciler can't race this
// prop fold-in into a 409 (the edits re-apply to fresh state).
//nolint:wrapcheck // message wrapped by the caller's envelope
return s.Store.ResourceDefinitions().PatchResourceDefinitionSpec(ctx, req.Name,
func(rd *apiv1.ResourceDefinition) error {
if rd.Props == nil {
rd.Props = make(map[string]string, len(req.OverrideProps))
}

maps.Copy(rd.Props, req.OverrideProps)

for _, k := range req.DeleteProps {
delete(rd.Props, k)
}

return nil
})
}

// writeCloneRefused stamps a clone refusal in the CloneStarted OBJECT
Expand Down
24 changes: 15 additions & 9 deletions pkg/rest/resource_adjust.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,16 @@ func (s *Server) handleResourceDeactivate(w http.ResponseWriter, r *http.Request
// deactivate. set=true adds the flag; set=false removes every
// occurrence. Idempotent.
//
// Routed through PatchResourceSpec (Bug 204b shape) so a controller
// reconciler writing the same Resource CRD between the read and the
// write cannot surface a 409 to the operator: the store re-fetches
// the live object and re-applies the flag mutation under
// RetryOnConflict with backoff. Upstream LINSTOR never returns 409
// on activate/deactivate and the Python CLI does not retry it — the
// pre-fix Get → mutate → Update shape leaked the optimistic-lock
// conflict straight onto the wire (release-gate
// TestGroupFRActivateDeactivate flake).
//
// Responds with an `[]ApiCallRc` envelope (Bug 45): golinstor's
// response parser calls `json.Unmarshal` against this shape
// unconditionally and fails with "Unable to parse REST json data:
Expand All @@ -155,16 +165,12 @@ func mutateResourceFlag(w http.ResponseWriter, r *http.Request, s *Server, flag
rdName := r.PathValue("rd")
node := r.PathValue("node")

res, err := s.Store.Resources().Get(r.Context(), rdName, node)
if err != nil {
writeStoreError(w, err)

return
}

res.Flags = applyFlagMutation(res.Flags, flag, set)
err := s.Store.Resources().PatchResourceSpec(r.Context(), rdName, node,
func(res *apiv1.Resource) error {
res.Flags = applyFlagMutation(res.Flags, flag, set)

err = s.Store.Resources().Update(r.Context(), &res)
return nil
})
if err != nil {
writeStoreError(w, err)

Expand Down
Loading