Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 86 additions & 10 deletions tests/integration/group_a_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ package integration

import (
"context"
"encoding/json"
"io"
"net/http"
"os/exec"
"sort"
"testing"
Expand Down Expand Up @@ -272,10 +275,16 @@ func TestGroupANodeEvacuatePUT(t *testing.T) {
t.Fatalf("EVICTED stamped despite in-use refusal; flags=%v", flagsMid)
}

// Second call: --force MUST stamp EVICTED. This is the PUT
// route smoke — CLI wrapper fatals on traceback so reaching
// the next assert means the wire shape is intact.
cli.JSON(t, "node", "evacuate", "--force", harness.NodeWorker2)
// Second call: ?force=true MUST stamp EVICTED. The pinned CLI
// (linstor-client 1.27.1, asserted by Group H) has no `--force`
// flag on `node evacuate`, so the force override is exercised
// through the PUT route directly — the same
// `PUT /v1/nodes/{node}/evacuate?force=true` wire shape
// handleRGDelete's force precedent documents. The envelope must
// decode as the LINSTOR `[]ApiCallRc` array (Bug 78 wire).
putRequireAPICallRcEnvelope(t,
stack.RestURL+"/v1/nodes/"+harness.NodeWorker2+"/evacuate?force=true",
http.StatusOK)

flagsAfter := nodeFlags(t, stack, harness.NodeWorker2)
if !containsString(flagsAfter, "EVICTED") {
Expand All @@ -288,13 +297,19 @@ func TestGroupANodeEvacuatePUT(t *testing.T) {
// evacuate (blockstor's REST shim wires both to handleNodeEvacuate);
// without the PUT route, golinstor's `NodeService.Evict` (which
// `doPUT`s) crashes the python decoder with an empty 405 body.
//
// The pinned CLI (linstor-client 1.27.1, asserted by Group H) does
// not ship the `node evict` verb yet, so the PUT route is driven
// directly over HTTP — the exact wire shape golinstor's
// NodeService.Evict emits (`PUT /v1/nodes/{node}/evict`, empty JSON
// body, `[]ApiCallRc` response).
func TestGroupANodeEvictPUT(t *testing.T) {
stack := harness.StartStack(t)
harness.SeedThreeNodeCluster(t, stack)

cli := &harness.CLI{URL: stack.RestURL}
// `linstor node evict <node>` — same wire as evacuate.
cli.JSON(t, "node", "evict", harness.NodeWorker3)
putRequireAPICallRcEnvelope(t,
stack.RestURL+"/v1/nodes/"+harness.NodeWorker3+"/evict",
http.StatusOK)

flags := nodeFlags(t, stack, harness.NodeWorker3)
if !containsString(flags, "EVICTED") {
Expand All @@ -310,13 +325,25 @@ func TestGroupANodeEvictPUT(t *testing.T) {
// DeletionTimestamp stamp would hang every orphan forever and brick
// the next RD-create that recycles the name/port allocation.
//
// We seed a Resource on worker-1 + a peer replica on worker-2, then
// call `n lost worker-1` and assert the worker-1 replica is gone
// while worker-2 survives.
// We seed a Resource on worker-1 + a peer replica on worker-2, take
// worker-1's satellite offline (the documented `n lost` precondition
// — the Bug 111 gate refuses `n lost` against an ONLINE satellite),
// then call `n lost worker-1` and assert the worker-1 replica is
// gone while worker-2 survives.
func TestGroupANodeLostCascadesOrphans(t *testing.T) {
stack := harness.StartStack(t)
harness.SeedThreeNodeCluster(t, stack)

// Kill worker-1's satellite heartbeat and wait for the OFFLINE
// status to land — `n lost` against an ONLINE satellite is
// correctly refused with 409 (Bug 111), and the machine-readable
// CLI exits 0 even on refusal envelopes, so calling too early
// would silently no-op and the cascade assert below would time
// out with a misleading message.
stack.Satellite.SimulateNodeOffline(harness.NodeWorker1)
waitForNodeConnectionStatus(t, stack, harness.NodeWorker1,
blockstoriov1alpha1.NodeConnectionStatusOffline)

ctx := context.Background()

rd := &blockstoriov1alpha1.ResourceDefinition{
Expand Down Expand Up @@ -616,6 +643,55 @@ func retryStatusPatch(ctx context.Context, stack *harness.Stack, name string,
return lastErr
}

// putRequireAPICallRcEnvelope sends an empty JSON object (`{}`) to url
// through httpPutGroupH and fails the test unless both of the following
// hold:
//
//   - the response status code equals wantStatus;
//   - the response body unmarshals into a non-empty JSON array of
//     objects, i.e. the LINSTOR `[]ApiCallRc` envelope shape.
//
// This is the envelope python-linstor/golinstor expect from the
// node-lifecycle PUT routes (Bug 78: an empty or non-JSON body crashes
// the python decoder). Every failure message carries the raw body so a
// mismatch can be diagnosed from the test log alone.
func putRequireAPICallRcEnvelope(t *testing.T, url string, wantStatus int) {
	t.Helper()

	resp := httpPutGroupH(t, url, []byte("{}"))

	// Close is best-effort: the body has been fully consumed (or the
	// test has already failed) by the time this runs.
	defer func() { _ = resp.Body.Close() }()

	raw, readErr := io.ReadAll(resp.Body)
	if readErr != nil {
		t.Fatalf("read response body: %v", readErr)
	}

	// Status is checked only after the body is read so the failure
	// message can include it.
	gotStatus := resp.StatusCode
	if gotStatus != wantStatus {
		t.Fatalf("status: got %d, want %d; body: %s", gotStatus, wantStatus, raw)
	}

	var rcEntries []map[string]any

	decodeErr := json.Unmarshal(raw, &rcEntries)
	if decodeErr != nil {
		t.Fatalf("body is not a []ApiCallRc envelope: %v; body: %s", decodeErr, raw)
	}

	// A JSON `null` or `[]` decodes without error but is still not a
	// usable envelope.
	if len(rcEntries) == 0 {
		t.Fatalf("empty []ApiCallRc envelope; body: %s", raw)
	}
}

// waitForNodeConnectionStatus polls, via harness.Eventually with a
// 10-second budget, until the Node object named `node` can be fetched
// and its Status.ConnectionStatus equals `want`. A failed Get counts as
// "not there yet" rather than an immediate test failure. It is used to
// sequence the satellite-offline simulation before `n lost` (Bug 111
// gate).
func waitForNodeConnectionStatus(t *testing.T, stack *harness.Stack, node, want string) {
	t.Helper()

	key := types.NamespacedName{Name: node}
	failMsg := "node " + node + " never reached ConnectionStatus=" + want

	// reached reports whether the node currently shows the wanted
	// connection status; a fresh object is fetched on every poll.
	reached := func() bool {
		var current blockstoriov1alpha1.Node

		err := stack.Env.Client.Get(context.Background(), key, &current)
		if err != nil {
			return false
		}

		return current.Status.ConnectionStatus == want
	}

	harness.Eventually(t, 10*time.Second, reached, failMsg)
}

// runCLINoFatal runs the CLI and returns stdout + a best-effort
// success flag. Unlike CLI.Run/JSON, this does NOT t.Fatal on
// non-zero exit — used by tests that expect the call to fail
Expand Down
64 changes: 33 additions & 31 deletions tests/integration/group_f_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,19 +260,22 @@ func TestGroupFRToggleCancel(t *testing.T) {
}

// ---------------------------------------------------------------------------
// TestGroupFRToggleDiskful2DisklessPreservesTieBreaker — Bug 104
// (P1, QUORUM HAZARD). Starting from the steady auto-place(2) state
// (2 diskful + 1 TIE_BREAKER on the 3rd node), `linstor r td
// --diskless <one-of-diskful>` MUST leave the TIE_BREAKER Resource
// intact. Pre-fix, the RD reconciler recomputed wantWitness from
// scratch after the toggle, saw "1 diskful + 1 non-witness diskless"
// (the freshly-flipped replica), flipped its decision to "no witness
// needed", and DELETED the TIE_BREAKER Resource — collapsing the
// quorum surface to 1 diskful + 1 diskless with no third voter. The
// next network partition would freeze the volume read-only.
// TestGroupFRToggleDiskful2DisklessReapsTieBreaker — upstream-parity
// contract for the toggle-to-1-diskful path (supersedes the Bug 104
// "preserve" pin, inverted alongside the L1 specs in
// internal/controller/ensure_tiebreaker_test.go). Starting from the
// steady auto-place(2) state (2 diskful + 1 TIE_BREAKER on the 3rd
// node), `linstor r td --diskless <one-of-diskful>` leaves 1 diskful
// + 1 user-diskless — at which point quorumPolicy returns quorum=off:
// there is no diskful tie to break and no majority to freeze.
// Upstream LINSTOR's shouldTieBreakerExist never manages a witness
// below 2 diskful, so blockstor REAPS the now-redundant TIE_BREAKER,
// leaving exactly 2 replicas. (The former Bug 104/108 keep/create
// branches rested on the false premise that 1 diskful + 1 diskless
// freezes quorum:majority.)
// ---------------------------------------------------------------------------

func TestGroupFRToggleDiskful2DisklessPreservesTieBreaker(t *testing.T) {
func TestGroupFRToggleDiskful2DisklessReapsTieBreaker(t *testing.T) {
stack, cli, rd := setupGroupFRD(t, "td-tb")

// Steady state: auto-place 2 lands diskful on worker-1+worker-2,
Expand All @@ -299,22 +302,22 @@ func TestGroupFRToggleDiskful2DisklessPreservesTieBreaker(t *testing.T) {
return r != nil && groupFContains(r.Spec.Flags, "DISKLESS")
}, "Resource "+rd+"."+target+" never gained DISKLESS flag")

// Bug 104 invariant: the TIE_BREAKER Resource on `witnessNode`
// MUST still exist after the toggle settles. We give the RD
// reconciler a generous beat (rdReconcileRequeue + apply) and
// poll for a STABLE 3-replica composition: 1 diskful + 1
// non-witness diskless + 1 TIE_BREAKER.
// Upstream-parity invariant: the TIE_BREAKER on `witnessNode` is
// REAPED once the toggle settles. We give the RD reconciler a
// generous beat (rdReconcileRequeue + apply) and poll for a
// STABLE 2-replica composition: 1 diskful + 1 user-diskless,
// 0 witnesses.
deadline := time.Now().Add(groupFAssertTimeout)

for time.Now().Before(deadline) {
all := listResourcesByRD(t, stack, rd)

if assertBug104Composition(all, witnessNode) {
if assertWitnessReapedComposition(all) {
time.Sleep(2 * time.Second) // settle window

all = listResourcesByRD(t, stack, rd)

if assertBug104Composition(all, witnessNode) {
if assertWitnessReapedComposition(all) {
return // success
}
}
Expand All @@ -323,41 +326,40 @@ func TestGroupFRToggleDiskful2DisklessPreservesTieBreaker(t *testing.T) {
}

all := listResourcesByRD(t, stack, rd)
t.Fatalf("Bug 104: post-toggle replica set drifted from "+
"{1 diskful + 1 user-diskless + 1 TIE_BREAKER on %s}; got %d entries: %v",
t.Fatalf("post-toggle replica set drifted from "+
"{1 diskful + 1 user-diskless, witness on %s reaped}; got %d entries: %v",
witnessNode, len(all), all)
}

// assertBug104Composition returns true iff the replica set looks
// like {1 diskful + 1 user-diskless + 1 TIE_BREAKER on witnessNode}.
// Bug 104's failure mode is the TIE_BREAKER on witnessNode getting
// reaped, so we pin both the count (3) and the node identity.
func assertBug104Composition(all []blockstoriov1alpha1.Resource, witnessNode string) bool {
if len(all) != 3 {
// assertWitnessReapedComposition returns true iff the replica set
// looks like {1 diskful + 1 user-diskless} with NO TIE_BREAKER left.
// The upstream-parity failure mode is the auto-witness surviving (or
// being re-created) below 2 diskful, so we pin the exact count (2)
// and a zero witness count.
func assertWitnessReapedComposition(all []blockstoriov1alpha1.Resource) bool {
if len(all) != 2 {
return false
}

witnessFound := false
diskfulCount := 0
userDisklessCount := 0
witnessCount := 0

for i := range all {
isDiskless := groupFContains(all[i].Spec.Flags, "DISKLESS")
isTB := groupFContains(all[i].Spec.Flags, "TIE_BREAKER")

switch {
case isTB:
if all[i].Spec.NodeName == witnessNode {
witnessFound = true
}
witnessCount++
case isDiskless:
userDisklessCount++
default:
diskfulCount++
}
}

return witnessFound && diskfulCount == 1 && userDisklessCount == 1
return witnessCount == 0 && diskfulCount == 1 && userDisklessCount == 1
}

// ---------------------------------------------------------------------------
Expand Down
Loading