Skip to content

Commit 6403f00

Browse files
committed
fix: address review feedback on Cloudflare routing PR
- TestGetContentCustom: pass the rule directly to getContent so it actually exercises the custom-rule path; the RulesMock.GetFunc setup was dead code after getContent stopped looking up rules - CloudflareRetriever.MaxRetries: remove default substitution for the zero value — 0 now means "no retries" as expected. Callers opt into retries by setting MaxRetries explicitly; main.go uses the exported CFDefaultMaxRetries constant (2) - README: add cf-route-all to the config table and rewrite the Cloudflare section to reflect the opt-in routing model + 429 retry behaviour - rest.Server.Run: expand the WriteTimeout comment to explain why the 150s ceiling is server-wide rather than per-route via http.TimeoutHandler
1 parent 1078c74 commit 6403f00

6 files changed

Lines changed: 57 additions & 40 deletions

File tree

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,17 @@
1818
| creds | CREDS | none | credentials for protected calls (POST, DELETE /rules) |
1919
| cf-account-id| CF_ACCOUNT_ID | none | Cloudflare account ID for Browser Rendering API |
2020
| cf-api-token | CF_API_TOKEN | none | Cloudflare API token with Browser Rendering Edit perm |
21+
| cf-route-all | CF_ROUTE_ALL | `false` | route every request through Cloudflare Browser Rendering |
2122
| dbg | DEBUG | `false` | debug mode |
2223

2324
### Cloudflare Browser Rendering (optional)
2425

25-
When both `--cf-account-id` and `--cf-api-token` are set, the service uses Cloudflare Browser Rendering API to fetch page content instead of direct HTTP. This renders JavaScript and handles bot-protection pages that return empty or "just a moment..." responses to standard HTTP requests.
26+
Cloudflare Browser Rendering is useful for JavaScript-heavy pages and sites behind a "please enable JS" wall, but it's slower than direct HTTP and the free tier throttles at 1 request per 10 seconds. To keep the service cost-effective, Cloudflare routing is **opt-in**.
2627

27-
When these flags are not set, the service uses a standard HTTP client (default).
28+
1. Set `--cf-account-id` and `--cf-api-token` to enable Cloudflare routing.
29+
2. Either flip the `use_cloudflare` checkbox on individual rules (per-domain, recommended) or set `--cf-route-all=true` to route every request through Cloudflare.
30+
31+
When Cloudflare credentials are not set, the service uses a standard HTTP client for everything (default). On HTTP 429 (rate limit) the service automatically retries with exponential backoff and respects the `Retry-After` header.
2832

2933
### API
3034

extractor/readability_test.go

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -344,12 +344,8 @@ func TestPickRetrieverNoRules(t *testing.T) {
344344
}
345345

346346
func TestGetContentCustom(t *testing.T) {
347-
rulesMock := &mocks.RulesMock{
348-
GetFunc: func(_ context.Context, _ string) (datastore.Rule, bool) {
349-
return datastore.Rule{Content: "#content p, .post-title"}, true
350-
},
351-
}
352-
lr := UReadability{TimeOut: 30 * time.Second, SnippetSize: 200, Rules: rulesMock}
347+
rule := &datastore.Rule{Content: "#content p, .post-title"}
348+
lr := UReadability{TimeOut: 30 * time.Second, SnippetSize: 200}
353349
httpClient := &http.Client{Timeout: 30 * time.Second}
354350
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
355351
if r.URL.String() == "/2015/09/25/poiezdka-s-apple-maps/" {
@@ -374,7 +370,7 @@ func TestGetContentCustom(t *testing.T) {
374370
require.NoError(t, err)
375371
body := string(dataBytes)
376372

377-
content, rich, err := lr.getContent(context.Background(), body, ts.URL+"/2015/09/25/poiezdka-s-apple-maps/", nil)
373+
content, rich, err := lr.getContent(context.Background(), body, ts.URL+"/2015/09/25/poiezdka-s-apple-maps/", rule)
378374
require.NoError(t, err)
379375
assert.Len(t, content, 6988)
380376
assert.Len(t, rich, 7169)

extractor/retriever.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,17 @@ func (h *HTTPRetriever) Retrieve(ctx context.Context, reqURL string) (*RetrieveR
8383
}
8484

8585
const (
86-
cfDefaultBaseURL = "https://api.cloudflare.com/client/v4"
87-
cfDefaultWaitUntil = "networkidle0"
88-
cfDefaultTimeout = 60 * time.Second
89-
// cfDefaultMaxRetries=2 keeps worst-case backoff at ~33s (11s + 22s) so total handler time
90-
// stays under common upstream timeouts (nginx proxy_read_timeout default is 60s).
91-
cfDefaultMaxRetries = 2
86+
cfDefaultBaseURL = "https://api.cloudflare.com/client/v4"
87+
cfDefaultWaitUntil = "networkidle0"
88+
cfDefaultTimeout = 60 * time.Second
9289
cfDefaultRetryDelay = 11 * time.Second // free tier: 1 req / 10s — add a little headroom
9390
cfMaxRetryDelay = 30 * time.Second
91+
92+
// CFDefaultMaxRetries is the suggested MaxRetries value for production CloudflareRetriever setup.
93+
// 2 retries keeps worst-case backoff at ~33s (11s + 22s) so the total handler time stays under
94+
// common upstream timeouts (nginx proxy_read_timeout default is 60s). Callers must set MaxRetries
95+
// explicitly — CloudflareRetriever does not substitute a default for the zero value.
96+
CFDefaultMaxRetries = 2
9497
)
9598

9699
// errCFRateLimited is returned by the single-attempt inner retrieve when the CF API signals rate limiting;
@@ -140,11 +143,9 @@ type cfResponse struct {
140143
// Retrieve fetches the URL via Cloudflare Browser Rendering /content endpoint.
141144
// on HTTP 429 it backs off and retries up to MaxRetries times, holding the caller's
142145
// connection open in the meantime. aborts early if the caller's context is canceled.
146+
// MaxRetries: 0 means no retries. RetryDelay: 0 falls back to the package default.
143147
func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*RetrieveResult, error) {
144148
maxRetries := c.MaxRetries
145-
if maxRetries == 0 {
146-
maxRetries = cfDefaultMaxRetries
147-
}
148149
if maxRetries < 0 {
149150
maxRetries = 0
150151
}

extractor/retriever_test.go

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -364,24 +364,35 @@ func TestCloudflareRetriever_RateLimitContextCancelled(t *testing.T) {
364364
}
365365

366366
func TestCloudflareRetriever_RateLimitRetriesDisabled(t *testing.T) {
367-
var calls atomic.Int32
368-
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
369-
calls.Add(1)
370-
w.WriteHeader(http.StatusTooManyRequests)
371-
_, _ = w.Write([]byte(`rate limited`))
372-
}))
373-
defer ts.Close()
374-
375-
retriever := &CloudflareRetriever{
376-
AccountID: "test-account",
377-
APIToken: "test-token",
378-
BaseURL: ts.URL,
379-
Timeout: 5 * time.Second,
380-
MaxRetries: -1,
367+
tests := []struct {
368+
name string
369+
maxRetries int
370+
}{
371+
{name: "zero means no retries", maxRetries: 0},
372+
{name: "negative clamped to zero", maxRetries: -1},
373+
}
374+
for _, tt := range tests {
375+
t.Run(tt.name, func(t *testing.T) {
376+
var calls atomic.Int32
377+
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
378+
calls.Add(1)
379+
w.WriteHeader(http.StatusTooManyRequests)
380+
_, _ = w.Write([]byte(`rate limited`))
381+
}))
382+
defer ts.Close()
383+
384+
retriever := &CloudflareRetriever{
385+
AccountID: "test-account",
386+
APIToken: "test-token",
387+
BaseURL: ts.URL,
388+
Timeout: 5 * time.Second,
389+
MaxRetries: tt.maxRetries,
390+
}
391+
_, err := retriever.Retrieve(context.Background(), "https://example.com")
392+
require.Error(t, err)
393+
assert.Equal(t, int32(1), calls.Load())
394+
})
381395
}
382-
_, err := retriever.Retrieve(context.Background(), "https://example.com")
383-
require.Error(t, err)
384-
assert.Equal(t, int32(1), calls.Load())
385396
}
386397

387398
func TestParseRetryAfter(t *testing.T) {

main.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,10 @@ func main() {
5757
var cfRetriever extractor.Retriever
5858
if opts.CFAccountID != "" && opts.CFAPIToken != "" {
5959
cfRetriever = &extractor.CloudflareRetriever{
60-
AccountID: opts.CFAccountID,
61-
APIToken: opts.CFAPIToken,
62-
Timeout: 30 * time.Second,
60+
AccountID: opts.CFAccountID,
61+
APIToken: opts.CFAPIToken,
62+
Timeout: 30 * time.Second,
63+
MaxRetries: extractor.CFDefaultMaxRetries,
6364
}
6465
if opts.CFRouteAll {
6566
log.Printf("[INFO] Cloudflare Browser Rendering enabled, account=%s, mode=route-all", opts.CFAccountID)

rest/server.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,12 @@ func (s *Server) Run(ctx context.Context, address string, port int, frontendDir
4848
Addr: fmt.Sprintf("%s:%d", address, port),
4949
Handler: s.routes(frontendDir),
5050
ReadHeaderTimeout: 5 * time.Second,
51-
// 150s ceiling fits the worst-case CF retriever path (1 initial + 2 retries with 11s/22s
52-
// exponential backoff + up to 30s per CF request) while still capping runaway handlers.
51+
// WriteTimeout is server-wide rather than per-route because the extraction endpoints
52+
// are the only potentially long-running handlers (other handlers — static files, rule
53+
// CRUD, /ping — finish in milliseconds and are unaffected by this ceiling). 150s covers
54+
// the worst-case Cloudflare path: 1 initial request + 2 retries with 11s/22s exponential
55+
// backoff + up to 30s per CF request. If extraction ever moves off the server-wide
56+
// timeout, wrap only those routes with http.TimeoutHandler instead.
5357
WriteTimeout: 150 * time.Second,
5458
IdleTimeout: 30 * time.Second,
5559
}

0 commit comments

Comments
 (0)