fix: Don't exit early if a single PreResource Resolver fails (#2517)

bbernays · web-flow · commit cba923ce4707 · 2026-06-12T08:52:56.000Z
<!--
Explain what problem this PR addresses
-->

---
diff --git a/scheduler/queue/active_work_signal.go b/scheduler/queue/active_work_signal.go
@@ -16,8 +16,10 @@ import (
 //
 // Use it like this:
 //
-// - When a worker picks up a task, call `Add()` (like a WaitGroup)
-// - When a worker finishes a task, call `Done()` (like a WaitGroup)
+//   - When the dispatcher takes a task off the queue to hand it to a worker, call `Add()` (like a WaitGroup).
+//     Marking the task active before the handoff ensures the idle check cannot fire while
+//     the task is in flight between the queue and a worker.
+//   - When a worker finishes a task, call `Done()` (like a WaitGroup)
 //
 // - If the queue is empty, check `IsIdle()` to check if no workers are active.
 // - If workers are still active, call `Wait()` to block until state changes.
@@ -35,7 +37,7 @@ func newActiveWorkSignal() *activeWorkSignal {
 	}
 }
 
-// Add means a worker has started working on a task.
+// Add means the dispatcher has taken a task off the queue for a worker.
 //
 // Wake up the work queuing goroutine.
 func (s *activeWorkSignal) Add() {
diff --git a/scheduler/queue/scheduler.go b/scheduler/queue/scheduler.go
@@ -119,8 +119,10 @@ func (d *Scheduler) Sync(ctx context.Context, tableClients []WorkUnit, resolvedR
 			default:
 				item := queue.Pop()
 
-				// There is work to do
+				// There is work to do. Mark it active before handing it off so the
+				// idle check below cannot fire while the item is in flight to a worker.
 				if item != nil {
+					activeWorkSignal.Add()
 					jobs <- item
 					continue
 				}
diff --git a/scheduler/queue/worker.go b/scheduler/queue/worker.go
@@ -37,8 +37,9 @@ type worker struct {
 
 func (w *worker) work(ctx context.Context, activeWorkSignal *activeWorkSignal) {
 	for j := range w.jobs {
-		activeWorkSignal.Add()
-
+		// the work unit was already marked active by the dispatcher before it was
+		// handed off, so the dispatcher can never observe an idle state while a
+		// job is in flight between the queue and a worker
 		w.resolveTable(ctx, j.Table, j.Client, j.Parent)
 
 		activeWorkSignal.Done()
diff --git a/scheduler/resolvers/resolvers.go b/scheduler/resolvers/resolvers.go
@@ -79,13 +79,21 @@ func ResolveResourcesChunk(ctx context.Context, logger zerolog.Logger, m *metric
 	}
 
 	if table.PreResourceResolver != nil {
+		filtered := resources[:0]
 		for _, resource := range resources {
 			if err := table.PreResourceResolver(ctx, client, resource); err != nil {
+				if ctx.Err() != nil {
+					tableLogger.Error().Err(err).Msg("pre resource resolver failed, context cancelled")
+					m.AddErrors(ctx, 1, selector)
+					return nil
+				}
 				tableLogger.Error().Err(err).Msg("pre resource resolver failed")
 				m.AddErrors(ctx, 1, selector)
-				return nil
+				continue
 			}
+			filtered = append(filtered, resource)
 		}
+		resources = filtered
 	}
 	for _, resource := range resources {
 		for _, column := range table.Columns {
diff --git a/scheduler/resolvers/resolvers_test.go b/scheduler/resolvers/resolvers_test.go
@@ -0,0 +1,142 @@
+package resolvers
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	"github.com/apache/arrow-go/v18/arrow"
+	"github.com/cloudquery/plugin-sdk/v4/caser"
+	"github.com/cloudquery/plugin-sdk/v4/scheduler/metrics"
+	"github.com/cloudquery/plugin-sdk/v4/schema"
+	"github.com/rs/zerolog"
+	"github.com/stretchr/testify/require"
+)
+
+type testClient struct{}
+
+func (testClient) ID() string { return "test" }
+
+var _ schema.ClientMeta = testClient{}
+
+// TestResolveResourcesChunk_PreResourceResolverPartialFailure verifies that a
+// PreResourceResolver error on a single resource only drops that resource from
+// the batch, while the remaining resources are still resolved and returned.
+func TestResolveResourcesChunk_PreResourceResolverPartialFailure(t *testing.T) {
+	for _, tc := range []struct {
+		name          string
+		failItems     map[int]bool
+		expectedItems []int
+	}{
+		{
+			name:          "no failures keeps all resources",
+			failItems:     nil,
+			expectedItems: []int{0, 1, 2, 3, 4},
+		},
+		{
+			name:          "single failure drops only that resource",
+			failItems:     map[int]bool{2: true},
+			expectedItems: []int{0, 1, 3, 4},
+		},
+		{
+			name:          "multiple failures drop only the failing resources",
+			failItems:     map[int]bool{0: true, 3: true},
+			expectedItems: []int{1, 2, 4},
+		},
+		{
+			name:          "all failures drop the whole batch but do not panic",
+			failItems:     map[int]bool{0: true, 1: true, 2: true, 3: true, 4: true},
+			expectedItems: []int{},
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			table := &schema.Table{
+				Name: "test_table",
+				PreResourceResolver: func(_ context.Context, _ schema.ClientMeta, resource *schema.Resource) error {
+					if tc.failItems[resource.Item.(int)] {
+						return errors.New("pre resource resolver boom")
+					}
+					return nil
+				},
+				Columns: []schema.Column{
+					{
+						Name: "test_column",
+						Type: arrow.PrimitiveTypes.Int64,
+						Resolver: func(_ context.Context, _ schema.ClientMeta, resource *schema.Resource, c schema.Column) error {
+							return resource.Set(c.Name, int64(resource.Item.(int)))
+						},
+					},
+				},
+			}
+
+			client := testClient{}
+			m := metrics.NewMetrics()
+			m.InitWithClients(table, []schema.ClientMeta{client})
+
+			chunk := []any{0, 1, 2, 3, 4}
+			logger := zerolog.New(zerolog.NewTestWriter(t))
+
+			resources := ResolveResourcesChunk(context.Background(), logger, m, table, client, nil, chunk, caser.New())
+
+			gotItems := make([]int, len(resources))
+			for i, r := range resources {
+				gotItems[i] = r.Item.(int)
+				// surviving resources should have been fully resolved through the column resolvers
+				col := r.Get("test_column")
+				require.True(t, col.IsValid(), "surviving resource should have its column resolved")
+				require.Equal(t, int64(r.Item.(int)), col.Get(), "resolved column value should match the item")
+			}
+			require.ElementsMatch(t, tc.expectedItems, gotItems)
+
+			selector := m.NewSelector(client.ID(), table.Name)
+			require.Equal(t, uint64(len(tc.failItems)), m.GetErrors(selector), "expected one error per failing resource")
+			require.Equal(t, uint64(len(tc.expectedItems)), m.GetResources(selector), "only surviving resources should be counted")
+		})
+	}
+}
+
+// TestResolveResourcesChunk_PreResourceResolverContextCancelled verifies that
+// once the context is cancelled, the chunk is dropped immediately with a single
+// error instead of emitting one error per remaining resource.
+func TestResolveResourcesChunk_PreResourceResolverContextCancelled(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	calls := 0
+	table := &schema.Table{
+		Name: "test_table",
+		PreResourceResolver: func(_ context.Context, _ schema.ClientMeta, _ *schema.Resource) error {
+			calls++
+			if calls == 2 {
+				cancel()
+				return errors.New("pre resource resolver boom")
+			}
+			return nil
+		},
+		Columns: []schema.Column{
+			{
+				Name: "test_column",
+				Type: arrow.PrimitiveTypes.Int64,
+				Resolver: func(_ context.Context, _ schema.ClientMeta, resource *schema.Resource, c schema.Column) error {
+					return resource.Set(c.Name, int64(resource.Item.(int)))
+				},
+			},
+		},
+	}
+
+	client := testClient{}
+	m := metrics.NewMetrics()
+	m.InitWithClients(table, []schema.ClientMeta{client})
+
+	chunk := []any{0, 1, 2, 3, 4}
+	logger := zerolog.New(zerolog.NewTestWriter(t))
+
+	resources := ResolveResourcesChunk(ctx, logger, m, table, client, nil, chunk, caser.New())
+
+	require.Empty(t, resources, "cancelled chunk should not return resources")
+	require.Equal(t, 2, calls, "resolver should not be called for resources after cancellation")
+
+	selector := m.NewSelector(client.ID(), table.Name)
+	require.Equal(t, uint64(1), m.GetErrors(selector), "cancellation should be counted as a single error, not one per remaining resource")
+	require.Equal(t, uint64(0), m.GetResources(selector), "no resources should be counted for a cancelled chunk")
+}
diff --git a/scheduler/scheduler_preresource_partial_failure_test.go b/scheduler/scheduler_preresource_partial_failure_test.go
@@ -0,0 +1,120 @@
+package scheduler
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	"github.com/apache/arrow-go/v18/arrow"
+	"github.com/apache/arrow-go/v18/arrow/array"
+	"github.com/cloudquery/plugin-sdk/v4/message"
+	"github.com/cloudquery/plugin-sdk/v4/schema"
+	"github.com/rs/zerolog"
+	"github.com/stretchr/testify/require"
+)
+
+// TestSchedulerPreResourceResolverPartialFailureWithRelations verifies that a
+// PreResourceResolver failure only drops the failing resource and its own
+// subtree, at every level of a parent -> child1 -> child2 hierarchy:
+//   - parent "p2" fails: p2 and all its descendants are dropped, the other
+//     parents and their descendants survive
+//   - child1 "p1/c0" fails: only that row and its child2 descendants are
+//     dropped, sibling "p1/c1" and its descendants survive
+func TestSchedulerPreResourceResolverPartialFailureWithRelations(t *testing.T) {
+	for _, strategy := range AllStrategies {
+		t.Run(strategy.String(), func(t *testing.T) {
+			nameColumn := schema.Column{
+				Name: "name",
+				Type: arrow.BinaryTypes.String,
+				Resolver: func(_ context.Context, _ schema.ClientMeta, resource *schema.Resource, c schema.Column) error {
+					return resource.Set(c.Name, resource.Item.(string))
+				},
+			}
+
+			child2 := &schema.Table{
+				Name: "test_child2",
+				Resolver: func(_ context.Context, _ schema.ClientMeta, parent *schema.Resource, res chan<- any) error {
+					res <- []string{parent.Item.(string) + "/g0"}
+					return nil
+				},
+				Columns: []schema.Column{nameColumn},
+			}
+
+			child1 := &schema.Table{
+				Name: "test_child1",
+				Resolver: func(_ context.Context, _ schema.ClientMeta, parent *schema.Resource, res chan<- any) error {
+					p := parent.Item.(string)
+					res <- []string{p + "/c0", p + "/c1"}
+					return nil
+				},
+				PreResourceResolver: func(_ context.Context, _ schema.ClientMeta, resource *schema.Resource) error {
+					if resource.Item.(string) == "p1/c0" {
+						return errors.New("child1 pre resource resolver boom")
+					}
+					return nil
+				},
+				Columns:   []schema.Column{nameColumn},
+				Relations: schema.Tables{child2},
+			}
+
+			parentTable := &schema.Table{
+				Name: "test_parent",
+				Resolver: func(_ context.Context, _ schema.ClientMeta, _ *schema.Resource, res chan<- any) error {
+					// a single slice so all parents land in one chunk, like one API page
+					res <- []string{"p0", "p1", "p2", "p3", "p4"}
+					return nil
+				},
+				PreResourceResolver: func(_ context.Context, _ schema.ClientMeta, resource *schema.Resource) error {
+					if resource.Item.(string) == "p2" {
+						return errors.New("parent pre resource resolver boom")
+					}
+					return nil
+				},
+				Columns:   []schema.Column{nameColumn},
+				Relations: schema.Tables{child1},
+			}
+
+			tables := schema.Tables{parentTable}
+			c := testExecutionClient{}
+			sc := NewScheduler(
+				WithLogger(zerolog.New(zerolog.NewTestWriter(t)).Level(zerolog.DebugLevel)),
+				WithStrategy(strategy),
+			)
+			msgs := make(chan message.SyncMessage, 500)
+			require.NoError(t, sc.Sync(context.Background(), &c, tables, msgs))
+			close(msgs)
+
+			var messages message.SyncMessages
+			for msg := range msgs {
+				messages = append(messages, msg)
+			}
+
+			collect := func(tb *schema.Table) []string {
+				values := []string{}
+				for _, rec := range messages.GetInserts().GetRecordsForTable(tb) {
+					idx := rec.Schema().FieldIndices("name")[0]
+					col := rec.Column(idx).(*array.String)
+					for i := 0; i < col.Len(); i++ {
+						values = append(values, col.Value(i))
+					}
+				}
+				return values
+			}
+
+			require.ElementsMatch(t,
+				[]string{"p0", "p1", "p3", "p4"},
+				collect(parentTable),
+				"only the failing parent should be dropped")
+
+			require.ElementsMatch(t,
+				[]string{"p0/c0", "p0/c1", "p1/c1", "p3/c0", "p3/c1", "p4/c0", "p4/c1"},
+				collect(child1),
+				"children of surviving parents should sync, except the failing child; no children of the dropped parent")
+
+			require.ElementsMatch(t,
+				[]string{"p0/c0/g0", "p0/c1/g0", "p1/c1/g0", "p3/c0/g0", "p3/c1/g0", "p4/c0/g0", "p4/c1/g0"},
+				collect(child2),
+				"grandchildren should only sync under surviving child1 rows")
+		})
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -79,13 +79,21 @@ func ResolveResourcesChunk(ctx context.Context, logger zerolog.Logger, m *metric`
`79`	`79`	`}`
`80`	`80`
`81`	`81`	`if table.PreResourceResolver != nil {`
	`82`	`+ filtered := resources[:0]`
`82`	`83`	`for _, resource := range resources {`
`83`	`84`	`if err := table.PreResourceResolver(ctx, client, resource); err != nil {`
	`85`	`+ if ctx.Err() != nil {`
	`86`	`+ tableLogger.Error().Err(err).Msg("pre resource resolver failed, context cancelled")`
	`87`	`+ m.AddErrors(ctx, 1, selector)`
	`88`	`+ return nil`
	`89`	`+ }`
`84`	`90`	`tableLogger.Error().Err(err).Msg("pre resource resolver failed")`
`85`	`91`	`m.AddErrors(ctx, 1, selector)`
`86`		`- return nil`
	`92`	`+ continue`
`87`	`93`	`}`
	`94`	`+ filtered = append(filtered, resource)`
`88`	`95`	`}`
	`96`	`+ resources = filtered`
`89`	`97`	`}`
`90`	`98`	`for _, resource := range resources {`
`91`	`99`	`for _, column := range table.Columns {`