Skip to content

Commit 171cb73

Browse files
authored
Add more ranges to google's allowed bots (#73)
* Add more ranges to google's allowed bots * fix permission issue in state CI test * more unit tests
1 parent 2490bde commit 171cb73

7 files changed

Lines changed: 494 additions & 21 deletions

File tree

.github/workflows/lint-test.yml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,14 @@ jobs:
5858
permissions:
5959
contents: read
6060
runs-on: ubuntu-24.04
61-
strategy:
62-
matrix:
63-
traefik: [latest]
6461
steps:
6562
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
6663

6764
- name: run
6865
run: go run test.go
6966
working-directory: ./ci
7067
env:
71-
TRAEFIK_TAG: ${{ matrix.traefik }}
68+
TRAEFIK_TAG: latest
7269

7370
- name: cleanup
7471
if: ${{ always() }}

ci/docker-compose.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ services:
2626
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true"
2727
healthcheck:
2828
test: curl -fs http://localhost/healthz | grep -q OK || exit 1
29+
start_period: 5s
2930
volumes:
3031
- ./conf/nginx/default.conf:/etc/nginx/conf.d/default.conf:r
3132
networks:
@@ -57,6 +58,7 @@ services:
5758
traefik.http.middlewares.captcha-protect.plugin.captcha-protect.enableStateReconciliation: "true"
5859
healthcheck:
5960
test: curl -fs http://localhost/healthz | grep -q OK || exit 1
61+
start_period: 5s
6062
volumes:
6163
- ./conf/nginx/default.conf:/etc/nginx/conf.d/default.conf:r
6264
networks:
@@ -71,7 +73,8 @@ services:
7173
--api.debug=true
7274
--ping=true
7375
--entryPoints.http.address=:80
74-
--entryPoints.http.forwardedHeaders.trustedIPs=10.0.0.0/8,172.16.0.0/12,192.168.0.0/16
76+
--entryPoints.http.forwardedHeaders.insecure=true
77+
--entryPoints.http.forwardedHeaders.trustedIPs=127.0.0.1/32,::1/128,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16
7578
--providers.docker=true
7679
--providers.docker.network=default
7780
--experimental.localPlugins.captcha-protect.moduleName=github.com/libops/captcha-protect
@@ -90,6 +93,9 @@ services:
9093
- traefik
9194
healthcheck:
9295
test: traefik healthcheck --ping
96+
start_period: 5s
9397
depends_on:
9498
nginx:
9599
condition: service_healthy
100+
nginx2:
101+
condition: service_healthy

ci/test.go

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ const parallelism = 10
2828

2929
func main() {
3030
log := slog.New(slog.NewTextHandler(os.Stdout, nil))
31-
googleCIDRs, err := helper.FetchGooglebotIPs(log, http.DefaultClient, "https://developers.google.com/static/search/apis/ipranges/googlebot.json")
31+
googleCIDRs, err := helper.FetchGoogleCrawlerIPs(log, http.DefaultClient, helper.GoogleCrawlerIPRangeURLs)
3232
if err != nil {
33-
slog.Error("unable to fetch google bot ips", "err", err)
33+
slog.Error("unable to fetch google crawler ips", "err", err)
3434
os.Exit(1)
3535
}
3636

@@ -56,18 +56,18 @@ func main() {
5656
runCommand("docker", "compose", "up", "-d")
5757
waitForService("http://localhost")
5858
waitForService("http://localhost/app2")
59+
waitForGoogleExemptionReady(googleCIDRs)
5960

6061
fmt.Printf("Making sure %d attempt(s) pass\n", rateLimit)
6162
runParallelChecks(ips, rateLimit, "http://localhost")
62-
63-
time.Sleep(cp.StateSaveInterval + cp.StateSaveJitter + (1 * time.Second))
64-
runCommand("jq", ".", "tmp/state.json")
63+
statePath := "./tmp/state.json"
64+
runCommand("jq", ".", statePath)
6565

6666
fmt.Printf("Making sure attempt #%d causes a redirect to the challenge page\n", rateLimit+1)
6767
ensureRedirect(ips, "http://localhost")
6868

6969
fmt.Println("\nTesting state sharing between nginx instances...")
70-
time.Sleep(cp.StateSaveInterval + cp.StateSaveJitter + (1 * time.Second))
70+
time.Sleep(cp.StateSaveInterval + cp.StateSaveJitter + (5 * time.Second))
7171

7272
testStateSharing(ips)
7373
testGoogleBotGetsThrough(googleCIDRs)
@@ -81,7 +81,7 @@ func main() {
8181
time.Sleep(10 * time.Second)
8282
checkStateReload()
8383

84-
runCommand("rm", "tmp/state.json")
84+
runCommand("rm", "-f", statePath)
8585

8686
}
8787

@@ -147,7 +147,7 @@ func runParallelChecks(ips []string, rateLimit int, url string) {
147147
var wg sync.WaitGroup
148148
sem := make(chan struct{}, parallelism)
149149

150-
for i := 0; i < rateLimit; i++ {
150+
for range rateLimit {
151151
for _, ip := range ips {
152152
wg.Add(1)
153153
sem <- struct{}{}
@@ -305,7 +305,7 @@ func checkStateReload() {
305305
os.Exit(1)
306306
}
307307

308-
if len(botsMap) != numIPs {
308+
if len(botsMap) < numIPs {
309309
slog.Error("Unexpected number of bots", "expected", numIPs, "received", len(botsMap))
310310
os.Exit(1)
311311
}
@@ -400,7 +400,7 @@ func testGoogleBotGetsThrough(googleCIDRs []string) {
400400

401401
// Prime the rate limiter for the GoogleBot IP with parameters
402402
fmt.Printf("Priming rate limiter for GoogleBot IP %s with params (%d requests)\n", googleIP, rateLimit)
403-
for i := 0; i < rateLimit; i++ {
403+
for i := range rateLimit {
404404
output = httpRequest(googleIP, "http://localhost/?foo=bar") // Assign value
405405
if output != "" {
406406
slog.Error(fmt.Sprintf("GoogleBot with params was challenged prematurely on request #%d", i+1), "ip", googleIP, "output", output)
@@ -421,3 +421,45 @@ func testGoogleBotGetsThrough(googleCIDRs []string) {
421421
// set things back to normal for other tests
422422
runCommand("docker", "compose", "down")
423423
}
424+
425+
func waitForGoogleExemptionReady(googleCIDRs []string) {
426+
googleIP, err := firstUsableIPv4FromCIDRs(googleCIDRs)
427+
if err != nil {
428+
slog.Warn("Unable to select Google IP for readiness check; skipping warmup", "err", err)
429+
return
430+
}
431+
432+
deadline := time.Now().Add(90 * time.Second)
433+
for time.Now().Before(deadline) {
434+
ready := true
435+
for i := 0; i < rateLimit+1; i++ {
436+
if output := httpRequest(googleIP, "http://localhost"); output != "" {
437+
ready = false
438+
break
439+
}
440+
}
441+
if ready {
442+
fmt.Printf("Google exemption is active for %s\n", googleIP)
443+
return
444+
}
445+
time.Sleep(500 * time.Millisecond)
446+
}
447+
448+
slog.Error("Timed out waiting for Google crawler IP exemption to become active", "googleIP", googleIP)
449+
os.Exit(1)
450+
}
451+
452+
func firstUsableIPv4FromCIDRs(cidrs []string) (string, error) {
453+
for _, cidr := range cidrs {
454+
ip, err := getIPFromCIDR(cidr)
455+
if err != nil {
456+
continue
457+
}
458+
parsed := net.ParseIP(ip)
459+
if parsed != nil && parsed.To4() != nil {
460+
return ip, nil
461+
}
462+
}
463+
464+
return "", fmt.Errorf("no usable IPv4 found in CIDR list")
465+
}

internal/helper/google.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,19 @@ import (
77
"log/slog"
88
"net"
99
"net/http"
10+
"net/netip"
11+
"sort"
1012
"sync"
1113
"time"
1214
)
1315

16+
var GoogleCrawlerIPRangeURLs = []string{
17+
"https://developers.google.com/static/search/apis/ipranges/googlebot.json",
18+
"https://developers.google.com/static/crawling/ipranges/common-crawlers.json",
19+
"https://developers.google.com/static/crawling/ipranges/special-crawlers.json",
20+
"https://developers.google.com/static/crawling/ipranges/user-triggered-fetchers-google.json",
21+
}
22+
1423
// GooglebotIPs holds the list of Googlebot IP ranges, providing a thread-safe way to check if an IP is a Googlebot.
1524
type GooglebotIPs struct {
1625
cidrs []*net.IPNet
@@ -108,3 +117,89 @@ func FetchGooglebotIPs(log *slog.Logger, httpClient *http.Client, url string) ([
108117

109118
return cidrs, nil
110119
}
120+
121+
// FetchGoogleCrawlerIPs fetches crawler IP ranges from multiple Google-managed endpoints,
122+
// then returns a canonical, unique list where broader prefixes replace narrower prefixes.
123+
func FetchGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, urls []string) ([]string, error) {
124+
if len(urls) == 0 {
125+
return nil, nil
126+
}
127+
128+
allCIDRs := make([]string, 0)
129+
for _, url := range urls {
130+
cidrs, err := FetchGooglebotIPs(log, httpClient, url)
131+
if err != nil {
132+
return nil, err
133+
}
134+
allCIDRs = append(allCIDRs, cidrs...)
135+
}
136+
137+
return ReduceCIDRs(allCIDRs, log), nil
138+
}
139+
140+
// RefreshGoogleCrawlerIPs fetches crawler IPs from all configured URLs and updates
141+
// the provided GooglebotIPs set. Returns the number of CIDRs loaded.
142+
func RefreshGoogleCrawlerIPs(log *slog.Logger, httpClient *http.Client, target *GooglebotIPs, urls []string) (int, error) {
143+
cidrs, err := FetchGoogleCrawlerIPs(log, httpClient, urls)
144+
if err != nil {
145+
return 0, err
146+
}
147+
148+
target.Update(cidrs, log)
149+
150+
return len(cidrs), nil
151+
}
152+
153+
// ReduceCIDRs normalizes each CIDR to its masked network form, skips entries
// that fail to parse (logging them when log is non-nil), and returns only the
// prefixes that are not already contained in an identical or broader prefix.
// The result lists IPv4 prefixes before IPv6 ones, shortest prefix length
// first, with ties ordered by network address.
func ReduceCIDRs(cidrs []string, log *slog.Logger) []string {
	parsed := make([]netip.Prefix, 0, len(cidrs))
	for i := range cidrs {
		p, err := netip.ParsePrefix(cidrs[i])
		if err == nil {
			parsed = append(parsed, p.Masked())
			continue
		}
		if log != nil {
			log.Error("error parsing CIDR", "cidr", cidrs[i], "err", err)
		}
	}

	// Broadest prefixes must come first so that every potential parent has
	// already been kept by the time one of its children is examined.
	sort.Slice(parsed, func(i, j int) bool {
		left, right := parsed[i], parsed[j]
		switch {
		case left.Addr().Is4() != right.Addr().Is4():
			return left.Addr().Is4()
		case left.Bits() != right.Bits():
			return left.Bits() < right.Bits()
		default:
			return left.Addr().Compare(right.Addr()) < 0
		}
	})

	// coveredBy reports whether p lies inside any prefix already kept.
	coveredBy := func(kept []netip.Prefix, p netip.Prefix) bool {
		for _, parent := range kept {
			if parent.Bits() <= p.Bits() && parent.Contains(p.Addr()) {
				return true
			}
		}
		return false
	}

	kept := make([]netip.Prefix, 0, len(parsed))
	for _, p := range parsed {
		if coveredBy(kept, p) {
			continue
		}
		kept = append(kept, p)
	}

	out := make([]string, len(kept))
	for i, p := range kept {
		out[i] = p.String()
	}

	return out
}

0 commit comments

Comments
 (0)