fix: address review feedback on Cloudflare routing PR

paskal · paskal · commit 6403f003b059 · 2026-04-12T21:29:54.000+02:00
- TestGetContentCustom: pass the rule directly to getContent so it actually
  exercises the custom-rule path; the RulesMock.GetFunc setup was dead code
  after getContent stopped looking up rules
- CloudflareRetriever.MaxRetries: remove default substitution for the zero
  value — 0 now means "no retries" as expected. Callers opt into retries by
  setting MaxRetries explicitly; main.go uses the exported CFDefaultMaxRetries
  constant (2)
- README: add cf-route-all to the config table and rewrite the Cloudflare
  section to reflect the opt-in routing model + 429 retry behaviour
- rest.Server.Run: expand the WriteTimeout comment to explain why the 150s
  ceiling is server-wide rather than per-route via http.TimeoutHandler
diff --git a/README.md b/README.md
@@ -18,13 +18,17 @@
 | creds        | CREDS           | none           | credentials for protected calls (POST, DELETE /rules) |
 | cf-account-id| CF_ACCOUNT_ID   | none           | Cloudflare account ID for Browser Rendering API       |
 | cf-api-token | CF_API_TOKEN    | none           | Cloudflare API token with Browser Rendering Edit perm |
+| cf-route-all | CF_ROUTE_ALL    | `false`        | route every request through Cloudflare Browser Rendering |
 | dbg          | DEBUG           | `false`        | debug mode                                            |
 
 ### Cloudflare Browser Rendering (optional)
 
-When both `--cf-account-id` and `--cf-api-token` are set, the service uses Cloudflare Browser Rendering API to fetch page content instead of direct HTTP. This renders JavaScript and handles bot-protection pages that return empty or "just a moment..." responses to standard HTTP requests.
+Cloudflare Browser Rendering is useful for JavaScript-heavy pages and sites behind a "please enable JS" wall, but it's slower than direct HTTP and the free tier throttles at 1 request per 10 seconds. To keep the service cost-effective, Cloudflare routing is **opt-in**.
 
-When these flags are not set, the service uses a standard HTTP client (default).
+1. Set `--cf-account-id` and `--cf-api-token` to enable Cloudflare routing.
+2. Either flip the `use_cloudflare` checkbox on individual rules (per-domain, recommended) or set `--cf-route-all=true` to route every request through Cloudflare.
+
+When Cloudflare credentials are not set, the service uses a standard HTTP client for everything (default). When the Cloudflare API responds with HTTP 429 (rate limit), the service automatically retries with exponential backoff and respects the `Retry-After` header.
 
 ### API
 
diff --git a/extractor/readability_test.go b/extractor/readability_test.go
@@ -344,12 +344,8 @@ func TestPickRetrieverNoRules(t *testing.T) {
 }
 
 func TestGetContentCustom(t *testing.T) {
-	rulesMock := &mocks.RulesMock{
-		GetFunc: func(_ context.Context, _ string) (datastore.Rule, bool) {
-			return datastore.Rule{Content: "#content p, .post-title"}, true
-		},
-	}
-	lr := UReadability{TimeOut: 30 * time.Second, SnippetSize: 200, Rules: rulesMock}
+	rule := &datastore.Rule{Content: "#content p, .post-title"}
+	lr := UReadability{TimeOut: 30 * time.Second, SnippetSize: 200}
 	httpClient := &http.Client{Timeout: 30 * time.Second}
 	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		if r.URL.String() == "/2015/09/25/poiezdka-s-apple-maps/" {
@@ -374,7 +370,7 @@ func TestGetContentCustom(t *testing.T) {
 	require.NoError(t, err)
 	body := string(dataBytes)
 
-	content, rich, err := lr.getContent(context.Background(), body, ts.URL+"/2015/09/25/poiezdka-s-apple-maps/", nil)
+	content, rich, err := lr.getContent(context.Background(), body, ts.URL+"/2015/09/25/poiezdka-s-apple-maps/", rule)
 	require.NoError(t, err)
 	assert.Len(t, content, 6988)
 	assert.Len(t, rich, 7169)
diff --git a/extractor/retriever.go b/extractor/retriever.go
@@ -83,14 +83,17 @@ func (h *HTTPRetriever) Retrieve(ctx context.Context, reqURL string) (*RetrieveR
 }
 
 const (
-	cfDefaultBaseURL   = "https://api.cloudflare.com/client/v4"
-	cfDefaultWaitUntil = "networkidle0"
-	cfDefaultTimeout   = 60 * time.Second
-	// cfDefaultMaxRetries=2 keeps worst-case backoff at ~33s (11s + 22s) so total handler time
-	// stays under common upstream timeouts (nginx proxy_read_timeout default is 60s).
-	cfDefaultMaxRetries = 2
+	cfDefaultBaseURL    = "https://api.cloudflare.com/client/v4"
+	cfDefaultWaitUntil  = "networkidle0"
+	cfDefaultTimeout    = 60 * time.Second
 	cfDefaultRetryDelay = 11 * time.Second // free tier: 1 req / 10s — add a little headroom
 	cfMaxRetryDelay     = 30 * time.Second
+
+	// CFDefaultMaxRetries is the suggested MaxRetries value for production CloudflareRetriever setup.
+	// 2 retries keep the worst-case backoff wait at ~33s (11s + 22s), which on its own stays under
+	// common upstream timeouts (nginx proxy_read_timeout default is 60s). Callers must set MaxRetries
+	// explicitly — CloudflareRetriever does not substitute a default for the zero value.
+	CFDefaultMaxRetries = 2
 )
 
 // errCFRateLimited is returned by the single-attempt inner retrieve when the CF API signals rate limiting;
@@ -140,11 +143,9 @@ type cfResponse struct {
 // Retrieve fetches the URL via Cloudflare Browser Rendering /content endpoint.
 // on HTTP 429 it backs off and retries up to MaxRetries times, holding the caller's
 // connection open in the meantime. aborts early if the caller's context is canceled.
+// MaxRetries: 0 (or any negative value) means no retries. RetryDelay: 0 falls back to the package default.
 func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*RetrieveResult, error) {
 	maxRetries := c.MaxRetries
-	if maxRetries == 0 {
-		maxRetries = cfDefaultMaxRetries
-	}
 	if maxRetries < 0 {
 		maxRetries = 0
 	}
diff --git a/extractor/retriever_test.go b/extractor/retriever_test.go
@@ -364,24 +364,35 @@ func TestCloudflareRetriever_RateLimitContextCancelled(t *testing.T) {
 }
 
 func TestCloudflareRetriever_RateLimitRetriesDisabled(t *testing.T) {
-	var calls atomic.Int32
-	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-		calls.Add(1)
-		w.WriteHeader(http.StatusTooManyRequests)
-		_, _ = w.Write([]byte(`rate limited`))
-	}))
-	defer ts.Close()
-
-	retriever := &CloudflareRetriever{
-		AccountID:  "test-account",
-		APIToken:   "test-token",
-		BaseURL:    ts.URL,
-		Timeout:    5 * time.Second,
-		MaxRetries: -1,
+	tests := []struct {
+		name       string
+		maxRetries int
+	}{
+		{name: "zero means no retries", maxRetries: 0},
+		{name: "negative clamped to zero", maxRetries: -1},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			var calls atomic.Int32
+			ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+				calls.Add(1)
+				w.WriteHeader(http.StatusTooManyRequests)
+				_, _ = w.Write([]byte(`rate limited`))
+			}))
+			defer ts.Close()
+
+			retriever := &CloudflareRetriever{
+				AccountID:  "test-account",
+				APIToken:   "test-token",
+				BaseURL:    ts.URL,
+				Timeout:    5 * time.Second,
+				MaxRetries: tt.maxRetries,
+			}
+			_, err := retriever.Retrieve(context.Background(), "https://example.com")
+			require.Error(t, err)
+			assert.Equal(t, int32(1), calls.Load())
+		})
 	}
-	_, err := retriever.Retrieve(context.Background(), "https://example.com")
-	require.Error(t, err)
-	assert.Equal(t, int32(1), calls.Load())
 }
 
 func TestParseRetryAfter(t *testing.T) {
diff --git a/main.go b/main.go
@@ -57,9 +57,10 @@ func main() {
 	var cfRetriever extractor.Retriever
 	if opts.CFAccountID != "" && opts.CFAPIToken != "" {
 		cfRetriever = &extractor.CloudflareRetriever{
-			AccountID: opts.CFAccountID,
-			APIToken:  opts.CFAPIToken,
-			Timeout:   30 * time.Second,
+			AccountID:  opts.CFAccountID,
+			APIToken:   opts.CFAPIToken,
+			Timeout:    30 * time.Second,
+			MaxRetries: extractor.CFDefaultMaxRetries,
 		}
 		if opts.CFRouteAll {
 			log.Printf("[INFO] Cloudflare Browser Rendering enabled, account=%s, mode=route-all", opts.CFAccountID)
diff --git a/rest/server.go b/rest/server.go
@@ -48,8 +48,12 @@ func (s *Server) Run(ctx context.Context, address string, port int, frontendDir
 		Addr:              fmt.Sprintf("%s:%d", address, port),
 		Handler:           s.routes(frontendDir),
 		ReadHeaderTimeout: 5 * time.Second,
-		// 150s ceiling fits the worst-case CF retriever path (1 initial + 2 retries with 11s/22s
-		// exponential backoff + up to 30s per CF request) while still capping runaway handlers.
+		// WriteTimeout is server-wide rather than per-route because the extraction endpoints
+		// are the only potentially long-running handlers (other handlers — static files, rule
+		// CRUD, /ping — finish in milliseconds and are unaffected by this ceiling). 150s covers
+		// the worst-case Cloudflare path: 1 initial request + 2 retries with 11s/22s exponential
+		// backoff + up to 30s per CF request. If extraction ever moves off the server-wide
+		// timeout, wrap only those routes with http.TimeoutHandler instead.
 		WriteTimeout: 150 * time.Second,
 		IdleTimeout:  30 * time.Second,
 	}