Skip to content

Commit ee1f74f

Browse files
committed
Cleaned up how specific category names and app names are specified for generic bot captures: the nested per-bot array is replaced by two flat lookup maps (`$category` and `$apps`).
Added `Cloudflare` crawlers to the capture list. Added tests.
1 parent 6bdb060 commit ee1f74f

2 files changed

Lines changed: 182 additions & 190 deletions

File tree

src/mappings/crawlers.php

Lines changed: 109 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -21,195 +21,103 @@ public static function getApp(string $value, array $data = []) : array {
2121
$parts[1] = \substr($parts[1], 0, \strspn($parts[1], '0123456789.'));
2222
}
2323
$category = [
24-
'yacybot' => [
25-
'category' => 'search',
26-
'app' => 'YacyBot'
27-
],
28-
'googlebot' => [
29-
'category' => 'search',
30-
'app' => 'GoogleBot'
31-
],
32-
'googlebot-mobile' => [
33-
'category' => 'search',
34-
'app' => 'GoogleBot'
35-
],
36-
'googlebot-image' => [
37-
'category' => 'search',
38-
'app' => 'GoogleBot'
39-
],
40-
'googlebot-video' => [
41-
'category' => 'search',
42-
'app' => 'GoogleBot'
43-
],
44-
'googlebot-news' => [
45-
'category' => 'search',
46-
'app' => 'GoogleBot'
47-
],
48-
'storebot-google' => [
49-
'category' => 'search',
50-
'app' => 'GoogleBot'
51-
],
52-
'adsbot-google' => [
53-
'category' => 'ads',
54-
'app' => 'GoogleBot'
55-
],
56-
'adsbot-google-mobile' => [
57-
'category' => 'ads',
58-
'app' => 'GoogleBot'
59-
],
60-
'mediapartners-google' => [
61-
'category' => 'ads',
62-
'app' => 'GoogleBot'
63-
],
64-
'bingbot' => [
65-
'category' => 'search',
66-
'app' => 'BingBot'
67-
],
68-
'adidxbot' => [
69-
'category' => 'ads',
70-
'app' => 'AdidxBot'
71-
],
72-
'duckduckbot' => [
73-
'category' => 'search',
74-
'app' => 'DuckDuckBot'
75-
],
76-
'duckduckgo-favicons-bot' => [
77-
'category' => 'search',
78-
'app' => 'DuckDuckBot'
79-
],
80-
'coccocbot-image' => [
81-
'category' => 'search',
82-
'app' => 'CoccocBot'
83-
],
84-
'coccocbot-web' => [
85-
'category' => 'search',
86-
'app' => 'CoccocBot'
87-
],
88-
'applebot' => [
89-
'category' => 'search',
90-
'app' => 'AppleBot'
91-
],
92-
'yandexbot' => [
93-
'category' => 'search'
94-
],
95-
'mj12bot' => [
96-
'category' => 'search',
97-
'app' => 'Majestic 12 Bot'
98-
],
99-
'mail.ru_bot' => [
100-
'category' => 'search',
101-
'app' => 'Mail.ru Bot'
102-
],
103-
'exabot' => [
104-
'category' => 'search',
105-
'app' => 'ExaBot'
106-
],
107-
'uptimerobot' => [
108-
'category' => 'monitor'
109-
],
110-
'petalbot' => [
111-
'category' => 'search'
112-
],
113-
'twitterbot' => [
114-
'category' => 'feed',
115-
'app' => 'TwitterBot'
116-
],
117-
'xbot' => [
118-
'category' => 'feed'
119-
],
120-
'discordbot' => [
121-
'category' => 'feed',
122-
'app' => 'DiscordBot'
123-
],
124-
'sematextsyntheticsrobot' => [
125-
'category' => 'monitor',
126-
'app' => 'Sematext Synthetics Robot'
127-
],
128-
'linkedinbot' => [
129-
'category' => 'feed'
130-
],
131-
'paperlibot' => [
132-
'category' => 'feed'
133-
],
134-
'bitlybot' => [
135-
'category' => 'feed',
136-
'app' => 'Bit.ly Bot'
137-
],
138-
'tineye-bot' => [
139-
'category' => 'search',
140-
'app' => 'TinEye Bot'
141-
],
142-
'pinterestbot' => [
143-
'category' => 'feed',
144-
'app' => 'PinterestBot'
145-
],
146-
'webcrawler' => [
147-
'category' => 'crawler'
148-
],
149-
'webprosbot' => [
150-
'category' => 'crawler',
151-
'app' => 'WebprosBot'
152-
],
153-
'guzzlehttp' => [
154-
'category' => 'scraper'
155-
],
156-
'telegrambot' => [
157-
'category' => 'feed'
158-
],
159-
'semrushbot' => [
160-
'category' => 'crawler'
161-
],
162-
'mediatoolkitbot' => [
163-
'category' => 'crawler',
164-
'app' => 'MediaToolkitBot'
165-
],
166-
'iploggerbot' => [
167-
'category' => 'monitor'
168-
],
169-
'cfnetwork' => [
170-
'category' => 'feed',
171-
'app' => 'Apple Core Foundation Network'
172-
],
173-
'ncsc web check feedback.webcheck@digital.ncsc.gov.uk' => [
174-
'category' => 'monitor',
175-
'app' => 'NCSC Web Check'
176-
],
177-
'google-site-verification' => [
178-
'category' => 'validator',
179-
'app' => 'Google Site Verification'
180-
],
181-
'google-inspectiontool' => [
182-
'category' => 'validator',
183-
'app' => 'Google Inspection Tool'
184-
],
185-
'pingdomtms' => [
186-
'category' => 'monitor',
187-
'app' => 'Pingdom.com'
188-
],
189-
'facebookexternalhit' => [
190-
'category' => 'feed',
191-
'app' => 'Facebook URL Preview'
192-
],
193-
'phxbot' => [
194-
'app' => 'ProtonMail Bot'
195-
],
196-
'baiduspider' => [
197-
'app' => 'Baidu Spider'
198-
],
199-
'yisouspider' => [
200-
'app' => 'Yisou Spider'
201-
],
202-
'google-read-aloud' => [
203-
'app' => 'Google Read Aloud'
204-
]
24+
'yacybot' => 'search',
25+
'googlebot' => 'search',
26+
'googlebot-mobile' => 'search',
27+
'googlebot-image' => 'search',
28+
'googlebot-video' => 'search',
29+
'googlebot-news' => 'search',
30+
'storebot-google' => 'search',
31+
'adsbot-google' => 'ads',
32+
'adsbot-google-mobile' => 'ads',
33+
'mediapartners-google' => 'ads',
34+
'bingbot' => 'search',
35+
'adidxbot' => 'ads',
36+
'duckduckbot' => 'search',
37+
'duckduckgo-favicons-bot' => 'search',
38+
'coccocbot-image' => 'search',
39+
'coccocbot-web' => 'search',
40+
'applebot' => 'search',
41+
'yandexbot' => 'search',
42+
'mj12bot' => 'search',
43+
'mail.ru_bot' => 'search',
44+
'exabot' => 'search',
45+
'uptimerobot' => 'monitor',
46+
'petalbot' => 'search',
47+
'twitterbot' => 'feed',
48+
'xbot' => 'feed',
49+
'discordbot' => 'feed',
50+
'sematextsyntheticsrobot' => 'monitor',
51+
'linkedinbot' => 'feed',
52+
'paperlibot' => 'feed',
53+
'bitlybot' => 'feed',
54+
'tineye-bot' => 'search',
55+
'pinterestbot' => 'feed',
56+
'webcrawler' => 'crawler',
57+
'webprosbot' => 'crawler',
58+
'guzzlehttp' => 'scraper',
59+
'telegrambot' => 'feed',
60+
'semrushbot' => 'crawler',
61+
'mediatoolkitbot' => 'crawler',
62+
'iploggerbot' => 'monitor'
63+
];
64+
$apps = [
65+
'yacybot' => 'YacyBot',
66+
'googlebot' => 'GoogleBot',
67+
'googlebot-mobile' => 'GoogleBot',
68+
'googlebot-image' => 'GoogleBot',
69+
'googlebot-video' => 'GoogleBot',
70+
'googlebot-news' => 'GoogleBot',
71+
'storebot-google' => 'GoogleBot',
72+
'adsbot-google' => 'GoogleBot',
73+
'adsbot-google-mobile' => 'GoogleBot',
74+
'mediapartners-google' => 'GoogleBot',
75+
'bingbot' => 'BingBot',
76+
'adidxbot' => 'AdidxBot',
77+
'duckduckbot' => 'DuckDuckBot',
78+
'duckduckgo-favicons-bot' => 'DuckDuckBot',
79+
'coccocbot-image' => 'CoccocBot',
80+
'coccocbot-web' => 'CoccocBot',
81+
'applebot' => 'AppleBot',
82+
'mj12bot' => 'Majestic 12 Bot',
83+
'mail.ru_bot' => 'Mail.ru Bot',
84+
'exabot' => 'ExaBot',
85+
'twitterbot' => 'TwitterBot',
86+
'discordbot' => 'DiscordBot',
87+
'sematextsyntheticsrobot' => 'Sematext Synthetics Robot',
88+
'bitlybot' => 'Bit.ly Bot',
89+
'tineye-bot' => 'TinEye Bot',
90+
'pinterestbot' => 'PinterestBot',
91+
'webprosbot' => 'WebprosBot',
92+
'mediatoolkitbot' => 'MediaToolkitBot',
93+
'cfnetwork' => 'Apple Core Foundation Network',
94+
'ncsc web check feedback.webcheck@digital.ncsc.gov.uk' => 'NCSC Web Check',
95+
'google-site-verification' => 'Google Site Verification',
96+
'google-inspectiontool' => 'Google Inspection Tool',
97+
'pingdomtms' => 'Pingdom.com',
98+
'facebookexternalhit' => 'Facebook URL Preview',
99+
'phxbot' => 'ProtonMail Bot',
100+
'baiduspider' => 'Baidu Spider',
101+
'yisouspider' => 'Yisou Spider',
102+
'google-read-aloud' => 'Google Read Aloud',
103+
'monitoring360bot' => '360 Monitoring',
104+
'cloudflare-healthchecks' => 'Cloudflare Health Checks',
105+
'cloudflare-alwaysonline' => 'Cloudflare Always Online',
106+
'cloudflare-traffic-manager' => 'Cloudflare-Traffic-Manager',
107+
'cloudflare-prefetch' => 'Cloudflare Prefetch',
108+
'cloudflare-ssldetector' => 'Cloudflare SSL Detector',
109+
'cloudflare-diagnostics' => 'Cloudflare Diagnostics',
110+
'ptst' => 'Cloudflare Speed Test'
205111
];
112+
113+
$lower = \mb_strtolower($parts[0]);
206114
return \array_merge([
207115
'type' => 'robot',
208-
'category' => \mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper',
209-
'app' => $parts[0],
116+
'category' => $category[$lower] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper'),
117+
'app' => $apps[$lower] ?? $parts[0],
210118
'appname' => $parts[0],
211119
'appversion' => empty($parts[1]) ? null : $parts[1]
212-
], $data, $category[\mb_strtolower($parts[0])] ?? []);
120+
], $data);
213121
}
214122
return [];
215123
}
@@ -252,15 +160,15 @@ public static function get() : array {
252160
];
253161
return [
254162
'Yahoo! Slurp' => new props('exact', $fn['search']),
255-
'facebookexternalhit/' => new props('start', $fn['map']),
256-
'Google-Site-Verification/' => new props('start', $fn['map']),
257-
'Google-InspectionTool/' => new props('start', $fn['map']),
163+
'facebookexternalhit/' => new props('start', $fn['feed']),
164+
'Google-Site-Verification/' => new props('start', $fn['validator']),
165+
'Google-InspectionTool/' => new props('start', $fn['validator']),
258166
'Google-Read-Aloud' => new props('exact', $fn['feed']),
259167
'Mediapartners-Google' => new props('start', $fn['search']),
260168
'FeedFetcher-Google' => new props('exact', $fn['feed']),
261169
'GoogleProducer' => new props('exact', $fn['feed']),
262170
'Google-adstxt' => new props('exact', $fn['ads']),
263-
'CFNetwork/' => new props('start', $fn['map']),
171+
'CFNetwork/' => new props('start', $fn['feed']),
264172
'Siteimprove.com' => new props('any', $fn['crawler']),
265173
'CyotekWebCopy' => new props('start', $fn['scraper']),
266174
'Google Page Speed Insights' => new props('exact', $fn['validator']),
@@ -278,12 +186,23 @@ public static function get() : array {
278186
'python' => new props('start', $fn['scraper']),
279187
'jsdom/' => new props('start', $fn['scraper']),
280188
'Nessus' => new props('start', $fn['monitor']),
189+
'monitoring360bot' => new props('start', $fn['monitor']),
190+
'Cloudflare' => new props('start', $fn['validator']),
191+
'PTST/' => new props('start', $fn['validator']),
192+
'+https://developers.cloudflare.com/security-center/' => new props('exact', $fn['monitor']),
193+
'AppSignalBot/' => new props('start', $fn['monitor']),
194+
'Better Uptime Bot' => new props('start', [
195+
'type' => 'robot',
196+
'category' => 'monitor',
197+
'app' => 'Better Uptime Bot',
198+
'appname' => 'Better Uptime Bot'
199+
]),
281200
'Chrome-Lighthouse' => new props('start', $fn['validator']),
282201
'Siege/' => new props('start', $fn['validator']),
283202
'Microsoft Profiling/' => new props('any', $fn['validator']),
284203
'Bidtellect' => new props('start', $fn['crawler']),
285204
'magpie-crawler/' => new props('start', $fn['crawler']),
286-
'PingdomTMS/' => new props('start', $fn['map']),
205+
'PingdomTMS/' => new props('start', $fn['monitor']),
287206
'DynGate' => new props('exact', $fn['monitor']),
288207
'Datadog/Synthetics' => new props('exact', [
289208
'type' => 'robot',
@@ -294,7 +213,7 @@ public static function get() : array {
294213
'Checkly/' => new props('start', $fn['monitor']),
295214
'Uptime/' => new props('start', $fn['monitor']),
296215
'HostTracker/' => new props('start', $fn['monitor']),
297-
'NCSC Web Check feedback.webcheck@digital.ncsc.gov.uk' => new props('exact', $fn['map']),
216+
'NCSC Web Check feedback.webcheck@digital.ncsc.gov.uk' => new props('exact', $fn['monitor']),
298217
'Pingdom.com' => new props('start', function (string $value) : array {
299218
$version = \explode('_', \trim($value, '_'));
300219
return [

0 commit comments

Comments
 (0)