Handle rate limit headers in client, adjust request throttling (#61)

piceri · web-flow · commit 361af00b3a31 · 2026-03-20T15:40:20.000-04:00
* improve deploymentrecord client to support rate limiting backoffs provided in headers

Signed-off-by: Eric Pickard <piceri@github.com>

* fix typo

Signed-off-by: Eric Pickard <piceri@github.com>

* add user agent

Signed-off-by: Eric Pickard <piceri@github.com>

* rename existing rateLimiter to requestThrottler and reduce throughput limits to match github secondary rate limits

Signed-off-by: Eric Pickard <piceri@github.com>

* switch to atomic rate limit deadline

Signed-off-by: Eric Pickard <piceri@github.com>

* address comments

Signed-off-by: Eric Pickard <piceri@github.com>

* remove Retry-After max check

Signed-off-by: Eric Pickard <piceri@github.com>

---------

Signed-off-by: Eric Pickard <piceri@github.com>
diff --git a/pkg/deploymentrecord/client.go b/pkg/deploymentrecord/client.go
@@ -14,6 +14,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"
 
 	"github.com/bradleyfalzon/ghinstallation/v2"
@@ -30,13 +31,16 @@ var validOrgPattern = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
 
 // Client is an API client for posting deployment records.
 type Client struct {
-	baseURL     string
-	org         string
-	httpClient  *http.Client
-	retries     int
-	apiToken    string
-	transport   *ghinstallation.Transport
-	rateLimiter *rate.Limiter
+	baseURL          string
+	org              string
+	httpClient       *http.Client
+	retries          int
+	apiToken         string
+	transport        *ghinstallation.Transport
+	requestThrottler *rate.Limiter
+
+	// rateLimitDeadline is a UnixNano timestamp shared across workers.
+	rateLimitDeadline atomic.Int64
 }
 
 // NewClient creates a new API client with the given base URL and
@@ -70,8 +74,8 @@ func NewClient(baseURL, org string, opts ...ClientOption) (*Client, error) {
 			Timeout: 5 * time.Second,
 		},
 		retries: 3,
-		// 20 req/sec with burst of 50
-		rateLimiter: rate.NewLimiter(rate.Limit(20), 50),
+		// 3 req/sec (180 req/min) with burst of 20
+		requestThrottler: rate.NewLimiter(rate.Limit(3), 20),
 	}
 
 	for _, opt := range opts {
@@ -140,10 +144,10 @@ func WithGHApp(id, installID string, pkBytes []byte, pkPath string) ClientOption
 	}
 }
 
-// WithRateLimiter sets a custom rate limiter for API calls.
-func WithRateLimiter(rps float64, burst int) ClientOption {
+// WithRequestThrottler sets a custom client-side request throttler (requests per second and burst) for API calls.
+func WithRequestThrottler(rps float64, burst int) ClientOption {
 	return func(c *Client) {
-		c.rateLimiter = rate.NewLimiter(rate.Limit(rps), burst)
+		c.requestThrottler = rate.NewLimiter(rate.Limit(rps), burst)
 	}
 }
 
@@ -180,11 +184,6 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
 		return errors.New("record cannot be nil")
 	}
 
-	// Wait for rate limiter
-	if err := c.rateLimiter.Wait(ctx); err != nil {
-		return fmt.Errorf("rate limiter wait failed: %w", err)
-	}
-
 	url := fmt.Sprintf("%s/orgs/%s/artifacts/metadata/deployment-record", c.baseURL, c.org)
 
 	body, err := json.Marshal(record)
@@ -197,23 +196,16 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
 	var lastErr error
 	// The first attempt is not a retry!
 	for attempt := range c.retries + 1 {
-		if attempt > 0 {
-			backoff := time.Duration(math.Pow(2,
-				float64(attempt))) * 100 * time.Millisecond
-			//nolint:gosec
-			jitter := time.Duration(rand.Int64N(50)) * time.Millisecond
-			delay := backoff + jitter
-
-			if delay > 5*time.Second {
-				delay = 5 * time.Second
-			}
+		if err = waitForBackoff(ctx, attempt); err != nil {
+			return err
+		}
 
-			// Wait with context cancellation support
-			select {
-			case <-time.After(delay):
-			case <-ctx.Done():
-				return fmt.Errorf("context cancelled during retry backoff: %w", ctx.Err())
-			}
+		if err = c.waitForServerRateLimit(ctx); err != nil {
+			return err
+		}
+
+		if err = c.requestThrottler.Wait(ctx); err != nil {
+			return fmt.Errorf("request throttler wait failed: %w", err)
 		}
 
 		// Reset reader position for retries
@@ -236,6 +228,7 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
 		} else if c.apiToken != "" {
 			req.Header.Set("Authorization", "Bearer "+c.apiToken)
 		}
+		req.Header.Set("User-Agent", "GitHub-Deployment-Tracker")
 
 		start := time.Now()
 		// nolint: gosec
@@ -268,7 +261,7 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
 
 		switch {
 		case resp.StatusCode == 404:
-			// No artifact found
+			// No artifact found - do not retry
 			dtmetrics.PostDeploymentRecordNoAttestation.Inc()
 			slog.Debug("no artifact attestation found, no record created",
 				"attempt", attempt,
@@ -279,14 +272,17 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
 			)
 			return &NoArtifactError{err: fmt.Errorf("no attestation found for %s", record.Digest)}
 		case resp.StatusCode >= 400 && resp.StatusCode < 500:
-			if resp.Header.Get("retry-after") != "" || resp.Header.Get("x-ratelimit-remaining") == "0" {
-				// Rate limited — retry with backoff
-				// Could be 403 or 429
+			// Check headers that indicate rate limiting
+			if resp.Header.Get("Retry-After") != "" || resp.Header.Get("X-Ratelimit-Remaining") == "0" {
+				retryDelay := parseRateLimitDelay(resp)
+				c.setRetryAfter(retryDelay)
 				dtmetrics.PostDeploymentRecordRateLimited.Inc()
 				slog.Warn("rate limited, retrying",
 					"attempt", attempt,
 					"status_code", resp.StatusCode,
-					"retry_after", resp.Header.Get("Retry-After"),
+					"retry-after", resp.Header.Get("Retry-After"),
+					"x-ratelimit-remaining", resp.Header.Get("X-Ratelimit-Remaining"),
+					"retry_delay", retryDelay.Seconds(),
 					"container_name", record.Name,
 					"resp_msg", string(respBody),
 				)
@@ -323,3 +319,103 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error {
 	)
 	return fmt.Errorf("all retries exhausted: %w", lastErr)
 }
+
+// waitForServerRateLimit blocks until the global server rate limit backoff has elapsed.
+// All workers sharing this client observe the same deadline.
+func (c *Client) waitForServerRateLimit(ctx context.Context) error {
+	deadline := c.rateLimitDeadline.Load()
+	delay := time.Until(time.Unix(0, deadline))
+	if delay <= 0 {
+		return nil
+	}
+
+	slog.Info("waiting for server rate limit backoff",
+		"delay", delay.Round(time.Millisecond),
+	)
+
+	timer := time.NewTimer(delay)
+	defer timer.Stop()
+
+	select {
+	case <-timer.C:
+		return nil
+	case <-ctx.Done():
+		return fmt.Errorf("context cancelled during server rate limit wait: %w", ctx.Err())
+	}
+}
+
+// setRetryAfter records a global backoff deadline d from now.
+// The deadline can only be extended, never shortened.
+func (c *Client) setRetryAfter(d time.Duration) {
+	newDeadline := time.Now().Add(d).UnixNano()
+	for {
+		current := c.rateLimitDeadline.Load()
+		if newDeadline <= current {
+			return
+		}
+		if c.rateLimitDeadline.CompareAndSwap(current, newDeadline) {
+			return
+		}
+	}
+}
+
+// parseRateLimitDelay extracts the backoff duration from a rate-limit response.
+// It returns the larger of the Retry-After and X-Ratelimit-Reset delays.
+// If neither header yields a usable delay, it defaults to 1 minute.
+func parseRateLimitDelay(resp *http.Response) time.Duration {
+	// GitHub docs show Retry-After header will always be an int
+	var retryAfterDelay *time.Duration
+	if ra := resp.Header.Get("Retry-After"); ra != "" {
+		if seconds, err := strconv.Atoi(ra); err == nil {
+			rad := time.Duration(seconds) * time.Second
+			retryAfterDelay = &rad
+		}
+	}
+
+	var rateLimitResetDelay *time.Duration
+	if resp.Header.Get("X-Ratelimit-Remaining") == "0" {
+		if resetStr := resp.Header.Get("X-Ratelimit-Reset"); resetStr != "" {
+			if epoch, err := strconv.ParseInt(resetStr, 10, 64); err == nil {
+				if d := time.Until(time.Unix(epoch, 0)); d > 0 {
+					rateLimitResetDelay = &d
+				}
+			}
+		}
+	}
+
+	switch {
+	case retryAfterDelay != nil && rateLimitResetDelay != nil:
+		return max(*retryAfterDelay, *rateLimitResetDelay)
+	case retryAfterDelay != nil:
+		return *retryAfterDelay
+	case rateLimitResetDelay != nil:
+		return *rateLimitResetDelay
+	default:
+		return time.Minute
+	}
+}
+
+func waitForBackoff(ctx context.Context, attempt int) error {
+	if attempt > 0 {
+		backoff := time.Duration(math.Pow(2,
+			float64(attempt))) * 100 * time.Millisecond
+		//nolint:gosec
+		jitter := time.Duration(rand.Int64N(50)) * time.Millisecond
+		delay := backoff + jitter
+
+		if delay > 5*time.Second {
+			delay = 5 * time.Second
+		}
+
+		// Wait with context cancellation support
+		timer := time.NewTimer(delay)
+		defer timer.Stop()
+
+		select {
+		case <-timer.C:
+		case <-ctx.Done():
+			return fmt.Errorf("context cancelled during retry backoff: %w", ctx.Err())
+		}
+	}
+	return nil
+}
diff --git a/pkg/deploymentrecord/client_test.go b/pkg/deploymentrecord/client_test.go
@@ -5,7 +5,9 @@ import (
 	"errors"
 	"net/http"
 	"net/http/httptest"
+	"strconv"
 	"strings"
+	"sync"
 	"sync/atomic"
 	"testing"
 	"time"
@@ -421,6 +423,7 @@ func TestPostOne(t *testing.T) {
 			retries: 1,
 			handler: func(w http.ResponseWriter, _ *http.Request) {
 				w.Header().Set("X-Ratelimit-Remaining", "0")
+				w.Header().Set("X-Ratelimit-Reset", strconv.FormatInt(time.Now().Add(1*time.Second).Unix(), 10))
 				w.WriteHeader(http.StatusForbidden)
 			},
 			wantErr:         true,
@@ -604,3 +607,115 @@ func TestPostOneSendsCorrectRequest(t *testing.T) {
 		t.Fatalf("unexpected error: %v", err)
 	}
 }
+
+func TestParseRateLimitDelay(t *testing.T) {
+	tests := []struct {
+		name    string
+		headers http.Header
+		wantMin time.Duration
+		wantMax time.Duration
+	}{
+		{
+			name:    "Retry-After in seconds",
+			headers: http.Header{"Retry-After": []string{"5"}},
+			wantMin: 5 * time.Second,
+			wantMax: 5 * time.Second,
+		},
+		{
+			name:    "Retry-After zero seconds",
+			headers: http.Header{"Retry-After": []string{"0"}},
+			wantMin: 0,
+			wantMax: 0,
+		},
+		{
+			name: "X-Ratelimit-Remaining 0 with reset",
+			headers: http.Header{
+				"X-Ratelimit-Remaining": []string{"0"},
+				"X-Ratelimit-Reset":     []string{strconv.FormatInt(time.Now().Add(10*time.Second).Unix(), 10)},
+			},
+			wantMin: 9 * time.Second,
+			wantMax: 11 * time.Second,
+		},
+		{
+			name:    "no relevant headers defaults to 1 minute",
+			headers: http.Header{},
+			wantMin: time.Minute,
+			wantMax: time.Minute,
+		},
+		{
+			name: "Largest delay takes precedence",
+			headers: http.Header{
+				"Retry-After":           []string{"3"},
+				"X-Ratelimit-Remaining": []string{"0"},
+				"X-Ratelimit-Reset":     []string{strconv.FormatInt(time.Now().Add(60*time.Second).Unix(), 10)},
+			},
+			wantMin: 59 * time.Second,
+			wantMax: 61 * time.Second,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			resp := &http.Response{Header: tt.headers}
+			result := parseRateLimitDelay(resp)
+			if result < tt.wantMin || result > tt.wantMax {
+				t.Errorf("parseRateLimitDelay() = %v, want between %v and %v", result, tt.wantMin, tt.wantMax)
+			}
+		})
+	}
+}
+
+func TestPostOneRespectsRetryAfterAcrossGoroutines(t *testing.T) {
+	var reqCount atomic.Int32
+	firstReqDone := make(chan struct{})
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		count := reqCount.Add(1)
+		if count == 1 {
+			w.Header().Set("Retry-After", "2")
+			w.WriteHeader(http.StatusTooManyRequests)
+			close(firstReqDone)
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	}))
+	t.Cleanup(srv.Close)
+
+	client, err := NewClient(srv.URL, "test-org", WithRetries(2))
+	if err != nil {
+		t.Fatalf("failed to create client: %v", err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	t.Cleanup(cancel)
+
+	var wg sync.WaitGroup
+
+	// Goroutine 1: triggers the rate limit
+	wg.Go(func() {
+		if err := client.PostOne(ctx, testRecord()); err != nil {
+			t.Errorf("goroutine 1 error: %v", err)
+		}
+	})
+
+	// Wait for the rate limit to be received and backoff set
+	<-firstReqDone
+	time.Sleep(50 * time.Millisecond)
+
+	// Goroutine 2: must observe the shared backoff
+	secondReqDone := make(chan struct{})
+	start := time.Now()
+	wg.Go(func() {
+		defer close(secondReqDone)
+		if err := client.PostOne(ctx, testRecord()); err != nil {
+			t.Errorf("goroutine 2 error: %v", err)
+		}
+	})
+	// Measure only goroutine 2's duration
+	<-secondReqDone
+	elapsed := time.Since(start)
+	if elapsed < 1500*time.Millisecond {
+		t.Errorf("goroutine 2 should have waited for retry-after, but only waited %v", elapsed)
+	}
+	wg.Wait()
+}