feat(client): add ParallelGetStream for in-order streaming parallel downloads

worstell · ampagent · worstell · commit 4d97fb3d2012 · 2026-06-25T17:20:46.000-07:00
ParallelGet writes chunks to an io.WriterAt, which requires a seekable destination (e.g. a temp file) and prevents a consumer from overlapping download with processing. ParallelGetStream fetches chunks in parallel but emits in-order bytes to a plain io.Writer via a bounded reorder buffer, so a streaming consumer (e.g. a decompress/extract pipeline) can run concurrently with the download. A concurrency-sized window caps fetched-but-unwritten chunks, bounding peak memory to O(concurrency * chunkSize) regardless of object size or consumer speed. Revision safety, ETag pinning, empty-object handling, range-ignore degrade, and the concurrency==1 shortcut mirror ParallelGet. Amp-Thread-ID: https://ampcode.com/threads/T-019ef6a9-a407-7389-bc43-001405e3ae9e Co-authored-by: Amp <amp@ampcode.com>
diff --git a/client/parallel_get_stream.go b/client/parallel_get_stream.go
@@ -0,0 +1,195 @@
+package client
+
+import (
+	"context"
+	"io"
+	"sync/atomic"
+
+	"github.com/alecthomas/errors"
+	"golang.org/x/sync/errgroup"
+)
+
+// ParallelGetStream downloads an object from any Range-capable RangeReader and
+// writes it, in order, to dst. Like ParallelGet it fetches the object in
+// chunkSize-byte chunks concurrently (up to concurrency requests in flight), but
+// emits one sequential byte stream instead of scattering chunks into an
+// io.WriterAt, so a streaming consumer can run alongside the download.
+// chunkSize must be positive; a concurrency below 1 is treated as 1.
+//
+// Chunks complete out of order, so a reorder ring of concurrency slots holds
+// fetched chunks until their turn to be written. A window caps the number of
+// fetched-but-unwritten chunks (and thus in-flight fetches) at concurrency, so
+// peak buffered memory is O(concurrency * chunkSize) regardless of object size.
+//
+// Revision safety mirrors ParallelGet: the first ranged Open reveals the total
+// size and ETag, every later chunk pins that ETag via IfRange, and an ETag
+// change mid-download is reported as an error rather than splicing revisions. An
+// object with no ETag, one that fits in the first chunk, or a backend that
+// ignored the range falls back to a single full read; a concurrency of 1
+// likewise reads the whole object in one request.
+//
+// dst is written sequentially from a single goroutine, so it need not be safe
+// for concurrent writes. On error a partially written dst must be discarded by
+// the caller.
+func ParallelGetStream(ctx context.Context, c RangeReader, key Key, dst io.Writer, chunkSize int64, concurrency int) error {
+	if chunkSize <= 0 {
+		return errors.Errorf("parallel get stream: chunk size must be positive, got %d", chunkSize)
+	}
+	concurrency = max(concurrency, 1)
+
+	// A single worker gains nothing from chunking, so read the object in one
+	// revision-consistent request.
+	if concurrency == 1 {
+		return fullReadStream(ctx, c, key, dst)
+	}
+
+	// Discovery: the first ranged Open delivers chunk zero and reveals the total
+	// size and ETag used to pin the rest.
+	rc, headers, err := c.Open(ctx, key, Range(0, chunkSize))
+	if errors.Is(err, ErrRangeNotSatisfiable) {
+		return nil // Empty object: nothing to write.
+	}
+	if err != nil {
+		return errors.Wrap(err, "parallel get stream: open first chunk")
+	}
+
+	etag := headers.Get(ETagKey)
+	total, hasRange := parseContentRangeTotal(headers.Get("Content-Range"))
+
+	// A backend that ignored the range, or an object that fits within the first
+	// chunk, is delivered entirely by this response: copy it and return. A
+	// negative want skips the length check when the total size is unknown.
+	firstLen := min(chunkSize, total)
+	if !hasRange {
+		firstLen = -1
+	}
+	if !hasRange || total <= chunkSize {
+		return errors.Wrap(copyChunkStream(dst, firstLen, rc), "parallel get stream")
+	}
+
+	// Without a validator to pin subsequent chunks to, splicing across a rewrite
+	// can't be detected, so fall back to a single, revision-consistent read.
+	if etag == "" {
+		if err := rc.Close(); err != nil {
+			return errors.Wrap(err, "parallel get stream: close discovery reader")
+		}
+		return fullReadStream(ctx, c, key, dst)
+	}
+
+	numChunks := (total-1)/chunkSize + 1 // Ceiling; total > chunkSize >= 1 here.
+
+	// ring[seq%ringLen] carries chunk seq (1 <= seq < numChunks) from its worker
+	// to the writer. The window keeps unwritten chunks within concurrency
+	// consecutive seqs, so live chunks never share a slot and slot count does not
+	// grow with object size. Chunk zero streams from the discovery reader.
+	ringLen := int64(concurrency)
+	ring := make([]chan []byte, concurrency)
+	for i := range ring {
+		ring[i] = make(chan []byte, 1)
+	}
+
+	// window bounds fetched-but-unwritten chunks (and in-flight fetches) to
+	// concurrency. A worker takes a token before claiming a chunk; the writer
+	// returns it after writing that chunk, admitting the next fetch. Tokens are
+	// only ever returned by whoever holds one, so these sends never block.
+	window := make(chan struct{}, concurrency)
+	for range concurrency {
+		window <- struct{}{}
+	}
+
+	eg, egCtx := errgroup.WithContext(ctx)
+
+	var nextSeq atomic.Int64
+	nextSeq.Store(1)
+	for range concurrency {
+		eg.Go(func() error {
+			for {
+				select {
+				case <-egCtx.Done():
+					return egCtx.Err()
+				case <-window:
+				}
+				seq := nextSeq.Add(1) - 1
+				if seq >= numChunks {
+					window <- struct{}{} // No work for this token; return it.
+					return nil
+				}
+				start := seq * chunkSize
+				end := min(start+chunkSize, total)
+				data, err := fetchChunkBytes(egCtx, c, key, start, end, etag)
+				if err != nil {
+					return err
+				}
+				ring[seq%ringLen] <- data // Slot is empty (see above): never blocks.
+			}
+		})
+	}
+
+	// Writer: stream chunk zero from the discovery reader, then emit each later
+	// chunk in order, returning a window token after each so workers can advance.
+	eg.Go(func() error {
+		if err := copyChunkStream(dst, firstLen, rc); err != nil {
+			return err
+		}
+		for seq := int64(1); seq < numChunks; seq++ {
+			select {
+			case <-egCtx.Done():
+				return egCtx.Err()
+			case data := <-ring[seq%ringLen]:
+				if _, err := dst.Write(data); err != nil {
+					return errors.Errorf("write chunk at offset %d: %w", seq*chunkSize, err)
+				}
+				window <- struct{}{}
+			}
+		}
+		return nil
+	})
+
+	return errors.Wrap(eg.Wait(), "parallel get stream")
+}
+
+// fullReadStream fetches the whole object with one unranged Open and streams it
+// to dst. It is the fallback for cases where chunking is pointless (a single
+// worker) or cannot be pinned to one revision (no ETag).
+func fullReadStream(ctx context.Context, c RangeReader, key Key, dst io.Writer) error {
+	body, _, openErr := c.Open(ctx, key)
+	if openErr != nil {
+		return errors.Wrap(openErr, "parallel get stream: full read")
+	}
+	return errors.Wrap(copyChunkStream(dst, -1, body), "parallel get stream")
+}
+
+// copyChunkStream drains src into dst and always closes src. A non-negative
+// want must equal the number of bytes copied; a negative want skips that check.
+func copyChunkStream(dst io.Writer, want int64, src io.ReadCloser) error {
+	copied, copyErr := io.Copy(dst, src)
+	if err := errors.Join(copyErr, src.Close()); err != nil {
+		return errors.Errorf("copy chunk: %w", err)
+	}
+	if want < 0 || copied == want {
+		return nil
+	}
+	return errors.Errorf("short chunk: copied %d of %d bytes", copied, want)
+}
+
+// fetchChunkBytes reads the byte range [start, end) of key, pinned to etag via
+// IfRange, fully into memory. It errors if the response carries a different
+// ETag (the object changed mid-download) or ends before end-start bytes arrive.
+func fetchChunkBytes(ctx context.Context, c RangeReader, key Key, start, end int64, etag string) ([]byte, error) {
+	body, hdr, err := c.Open(ctx, key, Range(start, end), IfRange(etag))
+	if err != nil {
+		return nil, errors.Errorf("open range %d-%d: %w", start, end, err)
+	}
+	// A different ETag means the response belongs to another revision; close
+	// the body and report the change instead of splicing its bytes.
+	if seen := hdr.Get(ETagKey); seen != etag {
+		changed := errors.Errorf("object changed during read at offset %d: etag %q != %q", start, seen, etag)
+		return nil, errors.Join(changed, body.Close())
+	}
+	chunk := make([]byte, end-start)
+	_, readErr := io.ReadFull(body, chunk)
+	if err := errors.Join(readErr, body.Close()); err != nil {
+		return nil, errors.Errorf("read chunk at offset %d: %w", start, err)
+	}
+	return chunk, nil
+}
diff --git a/client/parallel_get_stream_test.go b/client/parallel_get_stream_test.go
@@ -0,0 +1,201 @@
+package client_test
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"strconv"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/alecthomas/assert/v2"
+	"github.com/alecthomas/errors"
+
+	"github.com/block/cachew/client"
+)
+
+func patternBytes(n int) []byte {
+	out := make([]byte, 0, n)
+	for i := range n {
+		out = append(out, byte(i%251)) // Values cycle through 0..250.
+	}
+	return out
+}
+
+func TestParallelGetStreamReassembly(t *testing.T) {
+	// A multi-chunk object must be emitted to the writer as the original,
+	// in-order byte stream despite being fetched concurrently.
+	data := patternBytes(10_000)
+	c := &recordingReader{data: data, etag: `"v1"`}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 1000, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+}
+
+func TestParallelGetStreamSingleWorkerFullRead(t *testing.T) {
+	// A concurrency of 1 must issue a single non-ranged read rather than
+	// discovering and serialising ranged GETs.
+	data := patternBytes(1000)
+	c := &recordingReader{data: data, etag: `"v1"`}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 1)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+	assert.Equal(t, []string{""}, c.opens)
+}
+
+func TestParallelGetStreamEmptyObject(t *testing.T) {
+	c := &recordingReader{data: nil, etag: `"v1"`}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, 0, dst.Len())
+}
+
+func TestParallelGetStreamSingleChunk(t *testing.T) {
+	// An object delivered entirely by the discovery request is streamed directly.
+	data := []byte("0123456789")
+	c := &recordingReader{data: data, etag: `"v1"`}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+}
+
+func TestParallelGetStreamETagMismatch(t *testing.T) {
+	// An object rewritten mid-download (different ETag on later chunks) must be
+	// reported rather than splicing bytes from two revisions.
+	c := &rangeFlipReader{data: make([]byte, 1000), firstETag: `"v1"`, restETag: `"v2"`}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 4)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "object changed during read")
+}
+
+func TestParallelGetStreamNoETagMultiChunk(t *testing.T) {
+	// A multi-chunk object with no ETag can't be pinned, so it falls back to a
+	// single full read.
+	data := patternBytes(1000)
+	c := &noETagReader{data: data}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+}
+
+func TestParallelGetStreamNoETagSingleChunk(t *testing.T) {
+	data := []byte("0123456789")
+	c := &noETagReader{data: data}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+}
+
+func TestParallelGetStreamServerIgnoresRange(t *testing.T) {
+	// A backend that ignores the range header delivers the whole object on the
+	// discovery request; it must be streamed in full.
+	data := patternBytes(1000)
+	c := &ignoreRangeReader{data: data}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 100, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+}
+
+func TestParallelGetStreamOutOfOrderCompletion(t *testing.T) {
+	// Chunks deliberately complete in reverse order; the writer must still emit a
+	// correctly ordered stream and stay within the bounded window.
+	data := patternBytes(10_000)
+	c := &reorderReader{data: data, etag: `"v1"`, chunkSize: 1000}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 1000, 4)
+	assert.NoError(t, err)
+	assert.Equal(t, data, dst.Bytes())
+}
+
+func TestParallelGetStreamPropagatesOpenError(t *testing.T) {
+	// An error opening a non-first chunk must surface and cancel the download.
+	c := &failingChunkReader{data: patternBytes(10_000), etag: `"v1"`, failAtStart: 5000}
+	var dst bytes.Buffer
+	err := client.ParallelGetStream(context.Background(), c, client.NewKey("k"), &dst, 1000, 4)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "boom")
+}
+
+// ignoreRangeReader models a backend that does not honour ranges: every Open
+// returns the complete object and never sets Content-Range.
+type ignoreRangeReader struct{ data []byte }
+
+func (r *ignoreRangeReader) Open(_ context.Context, _ client.Key, _ ...client.RequestOption) (io.ReadCloser, http.Header, error) {
+	body := io.NopCloser(bytes.NewReader(r.data))
+	hdr := http.Header{"Content-Length": []string{strconv.Itoa(len(r.data))}}
+	return body, hdr, nil
+}
+
+// reorderReader returns the requested byte ranges unchanged but makes ranged
+// requests at earlier offsets wait longer than later ones, so chunks inside the
+// in-flight window complete out of order and the writer has to reorder them.
+type reorderReader struct {
+	data      []byte
+	etag      string
+	chunkSize int64
+}
+
+// Open resolves the requested range against data, sleeping before it answers a
+// partial request so that earlier offsets complete after later ones.
+func (r *reorderReader) Open(_ context.Context, _ client.Key, opts ...client.RequestOption) (io.ReadCloser, http.Header, error) {
+	total := int64(len(r.data))
+	o := client.NewRequestOptions(opts...)
+	off, n, outcome := o.ResolveRange(total, r.etag)
+	hdr := http.Header{}
+	switch outcome {
+	case client.RangeNotSatisfiable:
+		hdr.Set("Content-Range", fmt.Sprintf("bytes */%d", total))
+		return nil, hdr, client.ErrRangeNotSatisfiable
+	case client.RangePartial:
+		// The delay is 1ms per chunk remaining after the offset, so within a
+		// window the higher offsets finish first and force a reorder.
+		remaining := (total - off) / r.chunkSize
+		time.Sleep(time.Duration(remaining) * time.Millisecond)
+		hdr.Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", off, off+n-1, total))
+	}
+	hdr.Set(client.ETagKey, r.etag)
+	hdr.Set("Content-Length", strconv.FormatInt(n, 10))
+	return io.NopCloser(bytes.NewReader(r.data[off : off+n])), hdr, nil
+}
+
+// failingChunkReader answers range requests normally except for a partial
+// request starting at failAtStart, which fails to model a mid-download error.
+type failingChunkReader struct {
+	data        []byte
+	etag        string
+	failAtStart int64
+
+	opens atomic.Int64
+}
+
+func (r *failingChunkReader) Open(_ context.Context, _ client.Key, opts ...client.RequestOption) (io.ReadCloser, http.Header, error) {
+	r.opens.Add(1)
+	total := int64(len(r.data))
+	o := client.NewRequestOptions(opts...)
+	off, n, outcome := o.ResolveRange(total, r.etag)
+	hdr := http.Header{}
+	switch outcome {
+	case client.RangeNotSatisfiable:
+		hdr.Set("Content-Range", fmt.Sprintf("bytes */%d", total))
+		return nil, hdr, client.ErrRangeNotSatisfiable
+	case client.RangePartial:
+		if off == r.failAtStart {
+			return nil, nil, errors.New("boom")
+		}
+		hdr.Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", off, off+n-1, total))
+	}
+	hdr.Set(client.ETagKey, r.etag)
+	hdr.Set("Content-Length", strconv.FormatInt(n, 10))
+	return io.NopCloser(bytes.NewReader(r.data[off : off+n])), hdr, nil
+}