Skip to content

Commit 1078c74

Browse files
committed
feat: add per-rule and global Cloudflare routing, 429 retries
HTTP retriever stays the default. Cloudflare is now opt-in at two levels: - per-rule: new Rule.UseCloudflare field (checkbox in rule editor UI) routes requests for that domain through Cloudflare Browser Rendering - global: --cf-route-all / CF_ROUTE_ALL flag (default false) routes every request through Cloudflare UReadability.pickRetriever(rule) picks: CFRouteAll > rule.UseCloudflare > default HTTP. extractWithRules now resolves the rule once upfront and shares it between routing and getContent (was looked up twice). CloudflareRetriever retries on HTTP 429 with exponential backoff (base 11s, max 2 retries by default → worst-case 33s of backoff), honours Retry-After header, and aborts immediately on caller context cancel. MaxRetries=-1 disables retries. Added WriteTimeout=150s on the HTTP server — was previously unset, allowing handlers to run forever. 150s covers the worst-case CF path (up to ~123s).
1 parent 5ab4f4e commit 1078c74

8 files changed

Lines changed: 428 additions & 58 deletions

File tree

datastore/rules.go

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,17 @@ type RulesDAO struct {
1818

1919
// Rule record, entry in mongo
2020
type Rule struct {
21-
ID bson.ObjectID `json:"id" bson:"_id,omitempty"`
22-
Domain string `json:"domain"`
23-
MatchURLs []string `json:"match_url,omitempty" bson:"match_urls,omitempty"`
24-
Content string `json:"content"`
25-
Author string `json:"author,omitempty" bson:"author,omitempty"`
26-
TS string `json:"ts,omitempty" bson:"ts,omitempty"` // ts of original article
27-
Excludes []string `json:"excludes,omitempty" bson:"excludes,omitempty"`
28-
TestURLs []string `json:"test_urls,omitempty" bson:"test_urls"`
29-
User string `json:"user"`
30-
Enabled bool `json:"enabled"`
21+
ID bson.ObjectID `json:"id" bson:"_id,omitempty"`
22+
Domain string `json:"domain"`
23+
MatchURLs []string `json:"match_url,omitempty" bson:"match_urls,omitempty"`
24+
Content string `json:"content"`
25+
Author string `json:"author,omitempty" bson:"author,omitempty"`
26+
TS string `json:"ts,omitempty" bson:"ts,omitempty"` // ts of original article
27+
Excludes []string `json:"excludes,omitempty" bson:"excludes,omitempty"`
28+
TestURLs []string `json:"test_urls,omitempty" bson:"test_urls"`
29+
User string `json:"user"`
30+
Enabled bool `json:"enabled"`
31+
UseCloudflare bool `json:"use_cloudflare,omitempty" bson:"use_cloudflare,omitempty"` // route fetch via Cloudflare Browser Rendering
3132
}
3233

3334
// Get rule by url. Checks if found in mongo, matching by domain

extractor/readability.go

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,15 @@ type UReadability struct {
3434
TimeOut time.Duration
3535
SnippetSize int
3636
Rules Rules
37-
Retriever Retriever
37+
Retriever Retriever // default retriever; when nil a cached HTTPRetriever is used
38+
CFRetriever Retriever // optional Cloudflare Browser Rendering retriever; when set, enables routing
39+
CFRouteAll bool // route every request through CFRetriever (requires CFRetriever != nil)
3840

3941
defaultRetrieverOnce sync.Once
4042
defaultRetriever Retriever
4143
}
4244

43-
// retriever returns the configured Retriever, defaulting to a cached HTTPRetriever if nil
45+
// retriever returns the configured default Retriever, creating a cached HTTPRetriever if nil
4446
func (f *UReadability) retriever() Retriever {
4547
if f.Retriever != nil {
4648
return f.Retriever
@@ -51,6 +53,22 @@ func (f *UReadability) retriever() Retriever {
5153
return f.defaultRetriever
5254
}
5355

56+
// pickRetriever decides which retriever should fetch the given URL based on routing config and an
57+
// optional pre-resolved rule. Falls back to the default retriever unless CFRetriever is set AND
58+
// either CFRouteAll is true or the rule explicitly asks for Cloudflare.
59+
func (f *UReadability) pickRetriever(rule *datastore.Rule) Retriever {
60+
if f.CFRetriever == nil {
61+
return f.retriever()
62+
}
63+
if f.CFRouteAll {
64+
return f.CFRetriever
65+
}
66+
if rule != nil && rule.UseCloudflare {
67+
return f.CFRetriever
68+
}
69+
return f.retriever()
70+
}
71+
5472
// Response from api calls
5573
type Response struct {
5674
Content string `json:"content"`
@@ -91,7 +109,15 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
91109
log.Printf("[INFO] extract %s", reqURL)
92110
rb := &Response{}
93111

94-
result, err := f.retriever().Retrieve(ctx, reqURL)
112+
// look up a rule by domain once up front (unless one was explicitly passed) so retriever
113+
// selection and getContent share the same lookup instead of paying for two round-trips.
114+
if rule == nil && f.Rules != nil {
115+
if r, found := f.Rules.Get(ctx, reqURL); found {
116+
rule = &r
117+
}
118+
}
119+
120+
result, err := f.pickRetriever(rule).Retrieve(ctx, reqURL)
95121
if err != nil {
96122
return nil, err
97123
}
@@ -136,10 +162,10 @@ func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule
136162
return rb, nil
137163
}
138164

139-
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
140-
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
141-
// and at last tries to use general readability parser
142-
func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
165+
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags).
166+
// if rule is provided, it tries the custom rule first and falls back to the general parser on failure.
167+
// rule lookup for a given URL is done upstream in extractWithRules.
168+
func (f *UReadability) getContent(_ context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
143169
// general parser
144170
genParser := func(body, _ string) (content, rich string, err error) {
145171
doc, err := readability.NewDocument(body)
@@ -172,19 +198,10 @@ func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule
172198

173199
if rule != nil {
174200
log.Printf("[DEBUG] custom rule provided for %s: %v", reqURL, rule)
175-
return customParser(body, reqURL, *rule)
176-
}
177-
178-
if f.Rules != nil {
179-
r := f.Rules
180-
if rule, found := r.Get(ctx, reqURL); found {
181-
if content, rich, err = customParser(body, reqURL, rule); err == nil {
182-
return content, rich, nil
183-
}
184-
log.Printf("[WARN] custom extractor failed for %s, error=%v", reqURL, err) // back to general parser
201+
if content, rich, err = customParser(body, reqURL, *rule); err == nil {
202+
return content, rich, nil
185203
}
186-
} else {
187-
log.Print("[DEBUG] no rules defined!")
204+
log.Printf("[WARN] custom extractor failed for %s, error=%v", reqURL, err) // back to general parser
188205
}
189206

190207
return genParser(body, reqURL)

extractor/readability_test.go

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,92 @@ func TestExtractWithCustomRetriever(t *testing.T) {
257257
assert.Equal(t, "https://example.com/test-page", calls[0].URL)
258258
}
259259

260+
func TestPickRetriever(t *testing.T) {
261+
mkRetriever := func(tag string) *RetrieverMock {
262+
return &RetrieverMock{
263+
RetrieveFunc: func(_ context.Context, reqURL string) (*RetrieveResult, error) {
264+
h := make(http.Header)
265+
h.Set("Content-Type", "text/html; charset=utf-8")
266+
return &RetrieveResult{
267+
Body: []byte("<html><head><title>" + tag + "</title></head><body><p>body-" + tag + "</p></body></html>"),
268+
URL: reqURL,
269+
Header: h,
270+
}, nil
271+
},
272+
}
273+
}
274+
275+
tests := []struct {
276+
name string
277+
cfRouteAll bool
278+
useCloudflare bool
279+
cfConfigured bool
280+
wantTag string
281+
}{
282+
{name: "no CF configured uses HTTP", cfConfigured: false, wantTag: "http"},
283+
{name: "CF configured, no flag uses HTTP", cfConfigured: true, wantTag: "http"},
284+
{name: "CF configured, rule asks for CF uses CF", cfConfigured: true, useCloudflare: true, wantTag: "cf"},
285+
{name: "CF configured, route-all uses CF", cfConfigured: true, cfRouteAll: true, wantTag: "cf"},
286+
{name: "route-all overrides rule flag", cfConfigured: true, cfRouteAll: true, useCloudflare: false, wantTag: "cf"},
287+
{name: "route-all without CF configured falls back to HTTP", cfConfigured: false, cfRouteAll: true, wantTag: "http"},
288+
}
289+
290+
for _, tt := range tests {
291+
t.Run(tt.name, func(t *testing.T) {
292+
httpR := mkRetriever("http")
293+
var cfR *RetrieverMock
294+
lr := UReadability{
295+
TimeOut: time.Second,
296+
SnippetSize: 200,
297+
Retriever: httpR,
298+
CFRouteAll: tt.cfRouteAll,
299+
Rules: &mocks.RulesMock{
300+
GetFunc: func(_ context.Context, _ string) (datastore.Rule, bool) {
301+
return datastore.Rule{Domain: "example.com", UseCloudflare: tt.useCloudflare}, true
302+
},
303+
},
304+
}
305+
if tt.cfConfigured {
306+
cfR = mkRetriever("cf")
307+
lr.CFRetriever = cfR
308+
}
309+
310+
_, err := lr.Extract(context.Background(), "https://example.com/page")
311+
require.NoError(t, err)
312+
313+
switch tt.wantTag {
314+
case "http":
315+
assert.Len(t, httpR.RetrieveCalls(), 1, "http retriever should have been called")
316+
if cfR != nil {
317+
assert.Empty(t, cfR.RetrieveCalls(), "cf retriever should not have been called")
318+
}
319+
case "cf":
320+
require.NotNil(t, cfR)
321+
assert.Len(t, cfR.RetrieveCalls(), 1, "cf retriever should have been called")
322+
assert.Empty(t, httpR.RetrieveCalls(), "http retriever should not have been called")
323+
}
324+
})
325+
}
326+
}
327+
328+
func TestPickRetrieverNoRules(t *testing.T) {
329+
httpR := &RetrieverMock{RetrieveFunc: func(_ context.Context, reqURL string) (*RetrieveResult, error) {
330+
h := make(http.Header)
331+
h.Set("Content-Type", "text/html; charset=utf-8")
332+
return &RetrieveResult{Body: []byte("<html><head><title>t</title></head><body>x</body></html>"), URL: reqURL, Header: h}, nil
333+
}}
334+
cfR := &RetrieverMock{RetrieveFunc: func(_ context.Context, reqURL string) (*RetrieveResult, error) {
335+
h := make(http.Header)
336+
h.Set("Content-Type", "text/html; charset=utf-8")
337+
return &RetrieveResult{Body: []byte("<html><head><title>t</title></head><body>x</body></html>"), URL: reqURL, Header: h}, nil
338+
}}
339+
lr := UReadability{TimeOut: time.Second, SnippetSize: 200, Retriever: httpR, CFRetriever: cfR} // no Rules
340+
_, err := lr.Extract(context.Background(), "https://example.com/page")
341+
require.NoError(t, err)
342+
assert.Len(t, httpR.RetrieveCalls(), 1, "no rules → HTTP path")
343+
assert.Empty(t, cfR.RetrieveCalls())
344+
}
345+
260346
func TestGetContentCustom(t *testing.T) {
261347
rulesMock := &mocks.RulesMock{
262348
GetFunc: func(_ context.Context, _ string) (datastore.Rule, bool) {

extractor/retriever.go

Lines changed: 95 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ import (
44
"bytes"
55
"context"
66
"encoding/json"
7+
"errors"
78
"fmt"
89
"io"
910
"net/http"
11+
"strconv"
1012
"sync"
1113
"time"
1214

@@ -84,15 +86,27 @@ const (
8486
cfDefaultBaseURL = "https://api.cloudflare.com/client/v4"
8587
cfDefaultWaitUntil = "networkidle0"
8688
cfDefaultTimeout = 60 * time.Second
89+
// cfDefaultMaxRetries=2 keeps worst-case backoff at ~33s (11s + 22s) so total handler time
90+
// stays under common upstream timeouts (nginx proxy_read_timeout default is 60s).
91+
cfDefaultMaxRetries = 2
92+
cfDefaultRetryDelay = 11 * time.Second // free tier: 1 req / 10s — add a little headroom
93+
cfMaxRetryDelay = 30 * time.Second
8794
)
8895

96+
// errCFRateLimited is returned by the single-attempt inner retrieve when the CF API signals rate limiting;
97+
// the outer Retrieve uses it to decide whether to back off and retry.
98+
var errCFRateLimited = errors.New("cloudflare rate limited")
99+
89100
// CloudflareRetriever fetches pages using Cloudflare Browser Rendering API.
90101
// it sends a POST to the /content endpoint which returns fully rendered HTML after JS execution.
102+
// on HTTP 429 it retries with backoff (respecting Retry-After) up to MaxRetries times.
91103
type CloudflareRetriever struct {
92-
AccountID string
93-
APIToken string
94-
BaseURL string // override for testing; defaults to Cloudflare API
95-
Timeout time.Duration // defaults to 60s (browser rendering can be slow)
104+
AccountID string
105+
APIToken string
106+
BaseURL string // override for testing; defaults to Cloudflare API
107+
Timeout time.Duration // per-request HTTP client timeout; defaults to 60s
108+
MaxRetries int // number of retries on 429; defaults to 3. set to -1 to disable retries
109+
RetryDelay time.Duration // base delay between 429 retries; defaults to 11s (CF free tier is 1 req/10s)
96110

97111
once sync.Once
98112
client *http.Client
@@ -123,8 +137,55 @@ type cfResponse struct {
123137
Result string `json:"result"`
124138
}
125139

126-
// Retrieve fetches the URL via Cloudflare Browser Rendering /content endpoint
140+
// Retrieve fetches the URL via Cloudflare Browser Rendering /content endpoint.
141+
// on HTTP 429 it backs off and retries up to MaxRetries times, holding the caller's
142+
// connection open in the meantime. aborts early if the caller's context is canceled.
127143
func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*RetrieveResult, error) {
144+
maxRetries := c.MaxRetries
145+
if maxRetries == 0 {
146+
maxRetries = cfDefaultMaxRetries
147+
}
148+
if maxRetries < 0 {
149+
maxRetries = 0
150+
}
151+
baseDelay := c.RetryDelay
152+
if baseDelay <= 0 {
153+
baseDelay = cfDefaultRetryDelay
154+
}
155+
156+
var lastErr error
157+
for attempt := 0; attempt <= maxRetries; attempt++ {
158+
result, retryAfter, err := c.doRetrieve(ctx, reqURL)
159+
if err == nil {
160+
return result, nil
161+
}
162+
lastErr = err
163+
if !errors.Is(err, errCFRateLimited) {
164+
return nil, err
165+
}
166+
if attempt == maxRetries {
167+
break
168+
}
169+
delay := retryAfter
170+
if delay <= 0 {
171+
delay = baseDelay << attempt // 11s, 22s, 44s, ...
172+
}
173+
if delay > cfMaxRetryDelay {
174+
delay = cfMaxRetryDelay
175+
}
176+
log.Printf("[INFO] cloudflare rate limited for %s, retry %d/%d after %s", reqURL, attempt+1, maxRetries, delay)
177+
select {
178+
case <-ctx.Done():
179+
return nil, ctx.Err()
180+
case <-time.After(delay):
181+
}
182+
}
183+
return nil, lastErr
184+
}
185+
186+
// doRetrieve performs a single Browser Rendering request. on 429 it returns errCFRateLimited
187+
// (possibly wrapped) and the parsed Retry-After duration (0 if absent or unparseable).
188+
func (c *CloudflareRetriever) doRetrieve(ctx context.Context, reqURL string) (*RetrieveResult, time.Duration, error) {
128189
baseURL := c.BaseURL
129190
if baseURL == "" {
130191
baseURL = cfDefaultBaseURL
@@ -137,20 +198,20 @@ func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*Ret
137198
}
138199
reqBody, err := json.Marshal(cfReq)
139200
if err != nil {
140-
return nil, fmt.Errorf("marshal cf request: %w", err)
201+
return nil, 0, fmt.Errorf("marshal cf request: %w", err)
141202
}
142203

143204
httpReq, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewReader(reqBody))
144205
if err != nil {
145-
return nil, fmt.Errorf("create cf request: %w", err)
206+
return nil, 0, fmt.Errorf("create cf request: %w", err)
146207
}
147208
httpReq.Header.Set("Authorization", "Bearer "+c.APIToken)
148209
httpReq.Header.Set("Content-Type", "application/json")
149210

150211
resp, err := c.httpClient().Do(httpReq)
151212
if err != nil {
152213
log.Printf("[WARN] cloudflare request failed for %s, error=%v", reqURL, err)
153-
return nil, err
214+
return nil, 0, err
154215
}
155216
defer func() {
156217
if closeErr := resp.Body.Close(); closeErr != nil {
@@ -161,15 +222,19 @@ func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*Ret
161222
body, err := io.ReadAll(resp.Body)
162223
if err != nil {
163224
log.Printf("[WARN] failed to read cf response for %s, error=%v", reqURL, err)
164-
return nil, err
225+
return nil, 0, err
165226
}
166227

228+
if resp.StatusCode == http.StatusTooManyRequests {
229+
retryAfter := parseRetryAfter(resp.Header.Get("Retry-After"))
230+
return nil, retryAfter, fmt.Errorf("%w: status 429", errCFRateLimited)
231+
}
167232
if resp.StatusCode != http.StatusOK {
168233
bodySnippet := body
169234
if len(bodySnippet) > 512 {
170235
bodySnippet = bodySnippet[:512]
171236
}
172-
return nil, fmt.Errorf("cloudflare API error: status %d, body: %s", resp.StatusCode, string(bodySnippet))
237+
return nil, 0, fmt.Errorf("cloudflare API error: status %d, body: %s", resp.StatusCode, string(bodySnippet))
173238
}
174239

175240
// try JSON response format first: {"success": true, "result": "<html>"}
@@ -179,9 +244,9 @@ func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*Ret
179244
case cfResp.Success && cfResp.Result != "":
180245
body = []byte(cfResp.Result)
181246
case cfResp.Success && cfResp.Result == "":
182-
return nil, fmt.Errorf("cloudflare returned empty content for %s", reqURL)
247+
return nil, 0, fmt.Errorf("cloudflare returned empty content for %s", reqURL)
183248
default: // !cfResp.Success
184-
return nil, fmt.Errorf("cloudflare API returned success=false for %s", reqURL)
249+
return nil, 0, fmt.Errorf("cloudflare API returned success=false for %s", reqURL)
185250
}
186251
}
187252
// if unmarshal fails, use the raw body as-is (raw HTML response)
@@ -195,5 +260,22 @@ func (c *CloudflareRetriever) Retrieve(ctx context.Context, reqURL string) (*Ret
195260
Body: body,
196261
URL: reqURL,
197262
Header: header,
198-
}, nil
263+
}, 0, nil
264+
}
265+
266+
// parseRetryAfter parses an HTTP Retry-After header value as either delta-seconds
267+
// or an HTTP date. returns 0 if empty or unparseable.
268+
func parseRetryAfter(value string) time.Duration {
269+
if value == "" {
270+
return 0
271+
}
272+
if secs, err := strconv.Atoi(value); err == nil && secs >= 0 {
273+
return time.Duration(secs) * time.Second
274+
}
275+
if t, err := http.ParseTime(value); err == nil {
276+
if delta := time.Until(t); delta > 0 {
277+
return delta
278+
}
279+
}
280+
return 0
199281
}

0 commit comments

Comments
 (0)