Skip to content

Commit 2aaa7e8

Browse files
committed
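Improved how apps are normalised in crawlers::getApp(): a category from its built-in map now takes precedence over the category passed in by the caller.
`Applebot` is now categorised as AI. Moved the spider category mappings into getApp(); `bytespider` is now categorised as a crawler and named `ByteDance Spider`. Added the NCSC `Enhanced WebCheck` monitor. Updated tests.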
1 parent 948b0c8 commit 2aaa7e8

2 files changed

Lines changed: 19 additions & 19 deletions

File tree

src/mappings/crawlers.php

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public static function getApp(string $value, array $data = []) : array {
3737
'duckduckgo-favicons-bot' => 'search',
3838
'coccocbot-image' => 'search',
3939
'coccocbot-web' => 'search',
40-
'applebot' => 'search',
40+
'applebot' => 'ai',
4141
'yandexbot' => 'search',
4242
'mj12bot' => 'search',
4343
'mail.ru_bot' => 'search',
@@ -59,7 +59,13 @@ public static function getApp(string $value, array $data = []) : array {
5959
'telegrambot' => 'feed',
6060
'semrushbot' => 'crawler',
6161
'mediatoolkitbot' => 'crawler',
62-
'iploggerbot' => 'monitor'
62+
'iploggerbot' => 'monitor',
63+
'baiduspider' => 'search',
64+
'haosouspider' => 'search',
65+
'yisouspider' => 'search',
66+
'360spider' => 'search',
67+
'sogou web spider' => 'search',
68+
'bytespider' => 'crawler'
6369
];
6470
$apps = [
6571
'yacybot' => 'YacyBot',
@@ -94,6 +100,7 @@ public static function getApp(string $value, array $data = []) : array {
94100
'mediatoolkitbot' => 'MediaToolkitBot',
95101
'cfnetwork' => 'Apple Core Foundation Network',
96102
'ncsc web check feedback.webcheck@digital.ncsc.gov.uk' => 'NCSC Web Check',
103+
'enhanced webcheck feedback@digital.ncsc.gov.uk' => 'NCSC Enhanced Web Check',
97104
'the national archives uk government web archive:' => 'UK Government National Archives',
98105
'google-site-verification' => 'Google Site Verification',
99106
'google-inspectiontool' => 'Google Inspection Tool',
@@ -115,17 +122,20 @@ public static function getApp(string $value, array $data = []) : array {
115122
'citoid' => 'Wikimedia Citoid',
116123
'censysinspect' => 'Censys Inspect',
117124
'googledocs' => 'Google Docs',
118-
'user-agent: seolyt' => 'SEOlyt'
125+
'user-agent: seolyt' => 'SEOlyt',
126+
'bytespider' => 'ByteDance Spider',
127+
'spider-feedback@bytedance.com' => 'ByteDance Spider'
119128
];
120129

121130
$lower = \mb_strtolower($parts[0]);
122131
return \array_merge([
123132
'type' => 'robot',
124-
'category' => $category[$lower] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper'),
125133
'app' => $apps[$lower] ?? $parts[0],
126134
'appname' => $parts[0],
127135
'appversion' => empty($parts[1]) ? null : $parts[1]
128-
], $data);
136+
], $data, [
137+
'category' => $category[$lower] ?? $data['category'] ?? (\mb_stripos($value, 'crawl') !== false || \mb_stripos($value, 'bot') !== false ? 'crawler' : 'scraper')
138+
]);
129139
}
130140
return [];
131141
}
@@ -150,18 +160,7 @@ public static function get() : array {
150160
]
151161
)),
152162
'crawler' => function (string $value) : array {
153-
$parts = \explode('/', $value, 2);
154-
$map = [
155-
'baiduspider' => 'search',
156-
'haosouspider' => 'search',
157-
'yisouspider' => 'search',
158-
'360spider' => 'search',
159-
'sogou web spider' => 'search',
160-
'bytespider' => 'search',
161-
];
162-
return self::getApp($value, [
163-
'category' => $map[\mb_strtolower($parts[0])] ?? 'crawler'
164-
]);
163+
return self::getApp($value, ['category' => 'crawler']);
165164
},
166165
'monitor' => fn (string $value) : array => self::getApp($value, ['category' => 'monitor']),
167166
'scraper' => fn (string $value) : array => self::getApp($value, ['category' => 'scraper']),
@@ -253,6 +252,7 @@ public static function get() : array {
253252
'Uptime/' => new props('start', $fn['monitor']),
254253
'HostTracker/' => new props('start', $fn['monitor']),
255254
'NCSC Web Check feedback.webcheck@digital.ncsc.gov.uk' => new props('exact', $fn['monitor']),
255+
'Enhanced WebCheck feedback@digital.ncsc.gov.uk' => new props('exact', $fn['monitor']),
256256
'Pingdom.com' => new props('start', function (string $value) : array {
257257
$version = \explode('_', \trim($value, '_'));
258258
return [

tests/crawlersTest.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,8 @@ public function testSearch() : void {
317317
'Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; https://zhanzhang.toutiao.com/)' => [
318318
'string' => 'Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; https://zhanzhang.toutiao.com/)',
319319
'type' => 'robot',
320-
'category' => 'search',
321-
'app' => 'Bytespider',
320+
'category' => 'crawler',
321+
'app' => 'ByteDance Spider',
322322
'appname' => 'Bytespider',
323323
'url' => 'https://zhanzhang.toutiao.com/',
324324
'platform' => 'Android',

0 commit comments

Comments
 (0)