From d14755a03945576166e3b2ad1e2b95d8bf951397 Mon Sep 17 00:00:00 2001 From: Jesse Donat Date: Wed, 14 May 2025 13:21:02 -0500 Subject: [PATCH 1/5] More modern mastodon --- tests/user_agents.dist.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/user_agents.dist.json b/tests/user_agents.dist.json index ea5c1d6..3132067 100644 --- a/tests/user_agents.dist.json +++ b/tests/user_agents.dist.json @@ -1519,6 +1519,11 @@ "browser": "Mastodon", "version": "4.3.0" }, + "Mastodon\/4.3.8 (http.rb\/5.2.0; +https:\/\/jorijn.dev\/)": { + "platform": null, + "browser": "Mastodon", + "version": "4.3.8" + }, "msnbot-media\/1.1 (+http:\/\/search.msn.com\/msnbot.htm)": { "platform": null, "browser": "msnbot-media", From cbe0665e3fbd30e83ea01974e6e0e6905675cb5f Mon Sep 17 00:00:00 2001 From: Jesse Donat Date: Sun, 11 May 2025 09:00:29 -0500 Subject: [PATCH 2/5] Add support and tests for many more bots --- bin/constant_generator.php | 42 ++++- bin/user_agent_sorter.php | 6 +- src/UserAgent/Browsers.php | 115 +++++++----- src/UserAgentParser.php | 30 ++- tests/user_agents.dist.json | 365 ++++++++++++++++++++++++++++++++++++ 5 files changed, 501 insertions(+), 57 deletions(-) diff --git a/bin/constant_generator.php b/bin/constant_generator.php index ffd1420..698104f 100644 --- a/bin/constant_generator.php +++ b/bin/constant_generator.php @@ -52,9 +52,49 @@ } } +$browserExclusions = [ + 'ADKERNELTOPICCRAWLER', + 'AHREFSBOT', + 'AMAZONADBOT', + 'AMAZONBOT', + 'AWARIOBOT', + 'BAIDUSPIDER_RENDER', + 'BARKROWLER', + 'BLEXBOT', + 'BRAVEBOT', + 'CENSYSINSPECT', + 'COCCOCBOT_IMAGE', + 'COCCOCBOT_WEB', + 'DATAFORSEOBOT', + 'DOTBOT', + 'DVBOT', + 'EV_CRAWLER', + 'FLIPBOARDPROXY', + 'HEEXYBOT', + 'HUBSPOT_DOMAIN_CHECK', + 'LEIKIBOT', + 'MINIFLUX', + 'MODATSCANNER', + 'MOJEEKBOT', + 'PERPLEXITY_USER', + 'PERPLEXITYBOT', + 'PETALBOT', + 'PROXIMIC', + 'SEMRUSHBOT', + 'SEMRUSHBOT_SI', + 'SEZNAMBOT', + 'SMTBOT', + 'SPIDERLING', + 'SURDOTLYBOT', + 'YETI' +]; + $browserBody = "{$header}namespace donatj\UserAgent;\n\ninterface Browsers {\n\n"; $maxKey = max(array_map('strlen', array_keys($browsers))); foreach( $browsers as $const => $val ) { + if( in_array($const, $browserExclusions, true) ) { + continue; + } $browserBody .= sprintf("\tconst %-{$maxKey}s = %s;\n", $const, var_export(key($val), true)); } $browserBody .= "\n}\n\n"; @@ -74,4 +114,4 @@ $platformBody .= "\n}\n\n"; file_put_contents(__DIR__ . '/../src/UserAgent/Browsers.php', $browserBody); -file_put_contents(__DIR__ . '/../src/UserAgent/Platforms.php', $platformBody); \ No newline at end of file +file_put_contents(__DIR__ . '/../src/UserAgent/Platforms.php', $platformBody); diff --git a/bin/user_agent_sorter.php b/bin/user_agent_sorter.php index f52d107..0211bb1 100644 --- a/bin/user_agent_sorter.php +++ b/bin/user_agent_sorter.php @@ -5,6 +5,10 @@ $jsonfile = __DIR__ . '/../Tests/user_agents.dist.json'; $uas = json_decode(file_get_contents($jsonfile), true); +if( $uas === null ) { + echo "Failed to decode JSON\n"; + die(1); +} foreach( $uas as $key => &$val ) { $val['key'] = $key; @@ -97,4 +101,4 @@ function compare_version( $a, $b ) { } return $value; -} \ No newline at end of file +} diff --git a/src/UserAgent/Browsers.php b/src/UserAgent/Browsers.php index 301230f..c80b58c 100644 --- a/src/UserAgent/Browsers.php +++ b/src/UserAgent/Browsers.php @@ -6,58 +6,69 @@ interface Browsers { - const ADSBOT_GOOGLE = 'AdsBot-Google'; - const ANDROID_BROWSER = 'Android Browser'; - const APPLEBOT = 'Applebot'; - const BAIDUSPIDER = 'Baiduspider'; - const BINGBOT = 'bingbot'; - const BLACKBERRY_BROWSER = 'BlackBerry Browser'; - const BROWSER = 'Browser'; - const BUNJALLOO = 'Bunjalloo'; - const CAMINO = 'Camino'; - const CHATGPT_USER = 'ChatGPT-User'; - const CHROME = 'Chrome'; - const CURL = 'curl'; - const EDGE = 'Edge'; - const FACEBOOKEXTERNALHIT = 'facebookexternalhit'; - const FEEDVALIDATOR = 'FeedValidator'; - const FIREFOX = 'Firefox'; - const GOOGLEBOT = 'Googlebot'; - const GOOGLEBOT_IMAGE = 'Googlebot-Image'; - const GOOGLEBOT_VIDEO = 'Googlebot-Video'; - const GPTBOT = 'GPTBot'; - const HEADLESSCHROME = 'HeadlessChrome'; - const IEMOBILE = 'IEMobile'; - const IMESSAGEBOT = 'iMessageBot'; - const KINDLE = 'Kindle'; - const LYNX = 'Lynx'; - const MASTODON = 'Mastodon'; - const MIDORI = 'Midori'; - const MIUIBROWSER = 'MiuiBrowser'; - const MSIE = 'MSIE'; - const MSNBOT_MEDIA = 'msnbot-media'; - const NETFRONT = 'NetFront'; - const NINTENDOBROWSER = 'NintendoBrowser'; - const OAI_SEARCHBOT = 'OAI-SearchBot'; - const OCULUSBROWSER = 'OculusBrowser'; - const OPERA = 'Opera'; - const PUFFIN = 'Puffin'; - const SAFARI = 'Safari'; - const SAILFISHBROWSER = 'SailfishBrowser'; - const SAMSUNGBROWSER = 'SamsungBrowser'; - const SILK = 'Silk'; - const SLACKBOT = 'Slackbot'; - const TELEGRAMBOT = 'TelegramBot'; - const TIZENBROWSER = 'TizenBrowser'; - const TWITTERBOT = 'Twitterbot'; - const UC_BROWSER = 'UC Browser'; - const VALVE_STEAM_TENFOOT = 'Valve Steam Tenfoot'; - const VIVALDI = 'Vivaldi'; - const WGET = 'Wget'; - const WHALE = 'Whale'; - const WORDPRESS = 'WordPress'; - const YANDEX = 'Yandex'; - const YANDEXBOT = 'YandexBot'; + const ADSBOT_GOOGLE = 'AdsBot-Google'; + const ANDROID_BROWSER = 'Android Browser'; + const APPLEBOT = 'Applebot'; + const ARCHIVE_ORG_BOT = 'archive.org_bot'; + const BAIDUSPIDER = 'Baiduspider'; + const BINGBOT = 'bingbot'; + const BLACKBERRY_BROWSER = 'BlackBerry Browser'; + const BROWSER = 'Browser'; + const BUNJALLOO = 'Bunjalloo'; + const CAMINO = 'Camino'; + const CHATGPT_USER = 'ChatGPT-User'; + const CHROME = 'Chrome'; + const CURL = 'curl'; + const DISCORDBOT = 'Discordbot'; + const EDGE = 'Edge'; + const FACEBOOKEXTERNALHIT = 'facebookexternalhit'; + const FEEDVALIDATOR = 'FeedValidator'; + const FIREFOX = 'Firefox'; + const GOOGLEBOT = 'Googlebot'; + const GOOGLEBOT_IMAGE = 'Googlebot-Image'; + const GOOGLEBOT_VIDEO = 'Googlebot-Video'; + const GOOGLE_READ_ALOUD = 'Google-Read-Aloud'; + const GOOGLE_SAFETY = 'Google-Safety'; + const GPTBOT = 'GPTBot'; + const HEADLESSCHROME = 'HeadlessChrome'; + const IEMOBILE = 'IEMobile'; + const IMESSAGEBOT = 'iMessageBot'; + const KINDLE = 'Kindle'; + const LYNX = 'Lynx'; + const MASTODON = 'Mastodon'; + const MIDORI = 'Midori'; + const MIUIBROWSER = 'MiuiBrowser'; + const MSIE = 'MSIE'; + const MSNBOT_MEDIA = 'msnbot-media'; + const NETFRONT = 'NetFront'; + const NINTENDOBROWSER = 'NintendoBrowser'; + const OAI_SEARCHBOT = 'OAI-SearchBot'; + const OCULUSBROWSER = 'OculusBrowser'; + const OPERA = 'Opera'; + const PINTERESTBOT = 'Pinterestbot'; + const PUFFIN = 'Puffin'; + const SAFARI = 'Safari'; + const SAILFISHBROWSER = 'SailfishBrowser'; + const SAMSUNGBROWSER = 'SamsungBrowser'; + const SILK = 'Silk'; + const SLACKBOT = 'Slackbot'; + const TELEGRAMBOT = 'TelegramBot'; + const TIZENBROWSER = 'TizenBrowser'; + const TWITTERBOT = 'Twitterbot'; + const UC_BROWSER = 'UC Browser'; + const VALVE_STEAM_TENFOOT = 'Valve Steam Tenfoot'; + const VIVALDI = 'Vivaldi'; + const WELLKNOWNBOT = 'WellKnownBot'; + const WGET = 'Wget'; + const WHALE = 'Whale'; + const WORDPRESS = 'WordPress'; + const WPBOT = 'wpbot'; + const YANDEX = 'Yandex'; + const YANDEXBOT = 'YandexBot'; + const YANDEXIMAGES = 'YandexImages'; + const YANDEXMOBILEBOT = 'YandexMobileBot'; + const YANDEXRCA = 'YandexRCA'; + const YANDEXUSERPROXY = 'YandexUserproxy'; } diff --git a/src/UserAgentParser.php b/src/UserAgentParser.php index 15ce032..ee488ea 100644 --- a/src/UserAgentParser.php +++ b/src/UserAgentParser.php @@ -96,7 +96,7 @@ function parse_user_agent( $u_agent = null ) { } preg_match_all(<<<'REGEX' -%(?PCamino|Kindle(\ Fire)?|Firefox|Iceweasel|IceCat|Safari|MSIE|Trident|AppleWebKit| +%(?P.)?(?PCamino|Kindle(\ Fire)?|Firefox|Iceweasel|IceCat|Safari|MSIE|Trident|AppleWebKit| TizenBrowser|(?:Headless)?Chrome|YaBrowser|Vivaldi|IEMobile|Opera|OPR|Silk|Midori|(?-i:Edge)|EdgA?|CriOS|UCBrowser|Puffin| OculusBrowser|SamsungBrowser|SailfishBrowser|XiaoMi/MiuiBrowser|YaApp_Android|Whale| Baiduspider|Applebot|Facebot|Googlebot|YandexBot|bingbot|Lynx|Version|Wget|curl|ChatGPT-User|GPTBot|OAI-SearchBot| @@ -109,11 +109,35 @@ function parse_user_agent( $u_agent = null ) { , $u_agent, $result); // If nothing matched, return null (to avoid undefined index errors) + $quickReturn = false; if( !isset($result[BROWSER][0], $result[BROWSER_VERSION][0]) ) { - if( preg_match('%^(?!Mozilla)(?P[A-Z0-9\-]+)([/ :](?P[0-9A-Z.]+))?%ix', $u_agent, $result) ) { - return [ PLATFORM => $platform ?: null, BROWSER => $result[BROWSER], BROWSER_VERSION => empty($result[BROWSER_VERSION]) ? null : $result[BROWSER_VERSION] ]; + if( preg_match('%^(?!Mozilla)(?P[A-Z0-9\-]+)([/ :](?P[0-9A-Z.]+))?%ix', $u_agent, $g_result) ) { + return [ PLATFORM => $platform, BROWSER => $g_result[BROWSER], BROWSER_VERSION => empty($g_result[BROWSER_VERSION]) ? null : $g_result[BROWSER_VERSION] ]; } + $quickReturn = true; + } + + if( + ( + empty($result[BROWSER][0]) + || ($result['prev'][0] !== '') + ) // if we caught a browser, and it's the first part of the string, skip + && preg_match(<<<'REGEX' +%[(;]\s*(?P[^(/;]+) +(?:[:/ ]v?(?P[0-9A-Z.]+)[^;)\s]*)? +;?(?:\s*robot;)?\s*\+https?:%x +REGEX + , $u_agent, $bot_result) + ) { + return [ + PLATFORM => $platform, + BROWSER => trim($bot_result['browser']), + BROWSER_VERSION => empty($bot_result['version']) ? null : $bot_result['version'], + ]; + } + + if( $quickReturn ) { return $return; } diff --git a/tests/user_agents.dist.json b/tests/user_agents.dist.json index 3132067..5b05fd5 100644 --- a/tests/user_agents.dist.json +++ b/tests/user_agents.dist.json @@ -9,6 +9,11 @@ "browser": "Applebot", "version": "0.1" }, + "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit\/605.1.15 (KHTML, like Gecko) Version\/17.4 Safari\/605.1.15 (Applebot\/0.1; +http:\/\/www.apple.com\/go\/applebot)": { + "platform": "Macintosh", + "browser": "Applebot", + "version": "0.1" + }, "Mozilla\/5.0 (Macintosh; U; Intel Mac OS X; en; rv:1.8.1.11) Gecko\/20071128 Camino\/1.5.4": { "platform": "Macintosh", "browser": "Camino", @@ -114,6 +119,11 @@ "browser": "Chrome", "version": "100.0.4664.110" }, + "Mozilla\/5.0 (Windows NT 6.3;compatible; DVbot\/1.0; +http:\/\/www.doubleverify.com)": { + "platform": "Windows", + "browser": "DVbot", + "version": "1.0" + }, "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/79.0.3915.0 Safari\/537.36 Edg\/79.0.287.0": { "platform": "Macintosh", "browser": "Edge", @@ -289,6 +299,11 @@ "browser": "Firefox", "version": "118.0" }, + "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko\/20100101 Firefox\/49.0 (FlipboardProxy\/1.2; +http:\/\/flipboard.com\/browserproxy)": { + "platform": "Macintosh", + "browser": "FlipboardProxy", + "version": "1.2" + }, "Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) HeadlessChrome\/64.0.3282.119 Safari\/537.36": { "platform": "Linux", "browser": "HeadlessChrome", @@ -309,6 +324,11 @@ "browser": "iMessageBot", "version": null }, + "Mozilla\/5.0 (Windows NT 6.3;compatible; Leikibot\/1.0; +http:\/\/www.leiki.com)": { + "platform": "Windows", + "browser": "Leikibot", + "version": "1.0" + }, "Mozilla\/5.0 (X11; U; Linux i686; de-DE; rv:1.8.1.9) Gecko\/20071103 Midori\/0.0.10": { "platform": "Linux", "browser": "Midori", @@ -414,6 +434,11 @@ "browser": "MSIE", "version": "11.0" }, + "Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/131.0.0.0 Safari\/537.36; compatible; OAI-SearchBot\/1.0; +https:\/\/openai.com\/searchbot": { + "platform": "Macintosh", + "browser": "OAI-SearchBot", + "version": "1.0" + }, "Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/537.36 (KHTML, like Gecko) OculusBrowser\/5.5.10.137831012 SamsungBrowser\/4.0 Chrome\/66.0.3359.203 Safari\/537.36": { "platform": "Linux", "browser": "OculusBrowser", @@ -664,6 +689,11 @@ "browser": "Yandex", "version": "22.3.0" }, + "Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko; compatible; Yeti\/1.1; +https:\/\/naver.me\/spd) Chrome\/127.0.0.0 Safari\/537.36": { + "platform": "Windows", + "browser": "Yeti", + "version": "1.1" + }, "Mozilla\/5.0 (Linux; U; Android 2.2; en-us; SGH-T959 Build\/FROYO) AppleWebKit\/533.1 (KHTML, like Gecko) Version\/4.0 Mobile Safari\/533.1": { "platform": "Android", "browser": "Android Browser", @@ -699,6 +729,11 @@ "browser": "Android Browser", "version": "534.30" }, + "Mozilla\/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit\/534.46 (KHTML,like Gecko) Version\/5.1 Mobile Safari\/10600.6.3 (compatible; Baiduspider\/2.0; +http:\/\/www.baidu.com\/search\/spider.html)": { + "platform": "Android", + "browser": "Baiduspider", + "version": "2.0" + }, "Mozilla\/5.0 (Linux; Android 4.2.2; de-at; SAMSUNG GT-I9195\/I9195XXUAMF6 Build\/JDQ39) AppleWebKit\/535.19 (KHTML, like Gecko) Version\/1.0 Chrome\/18.0.1025.308 Mobile Safari\/535.19": { "platform": "Android", "browser": "Chrome", @@ -764,11 +799,61 @@ "browser": "Firefox", "version": "84.0" }, + "Mozilla\/5.0 (Linux; Android 7.0; SM-G930V Build\/NRD90M) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/59.0.3071.125 Mobile Safari\/537.36 (compatible; Google-Read-Aloud; +https:\/\/support.google.com\/webmasters\/answer\/1061943)": { + "platform": "Android", + "browser": "Google-Read-Aloud", + "version": null + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/41.0.2272.96 Mobile Safari\/537.36 (compatible; Google-Safety; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Google-Safety", + "version": null + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/135.0.7049.114 Mobile Safari\/537.36 (compatible; Google-Safety; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Google-Safety", + "version": null + }, "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/90.0.4430.97 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { "platform": "Android", "browser": "Googlebot", "version": "2.1" }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/99.0.4844.84 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/117.0.0.0 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/134.0.6998.165 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/135.0.7049.95 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/135.0.7049.114 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/136.0.7103.59 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, + "Mozilla\/5.0 (Linux; Android 6.0.1; Nexus 5X Build\/MMB29P) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/136.0.7103.92 Mobile Safari\/537.36 (compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html)": { + "platform": "Android", + "browser": "Googlebot", + "version": "2.1" + }, "Mozilla\/5.0 (Linux; U; Android 8.1.0; en-gb; Redmi Note 5 Build\/OPM1.171019.011)\nAppleWebKit\/537.36 (KHTML, like Gecko) Version\/4.0 Chrome\/61.0.3163.128 Mobile\nSafari\/537.36 XiaoMi\/MiuiBrowser\/10.1.2": { "platform": "Android", "browser": "MiuiBrowser", @@ -799,6 +884,11 @@ "browser": "Opera", "version": "63.3.3216.58675" }, + "Mozilla\/5.0 (Linux; Android 7.0;) AppleWebKit\/537.36 (KHTML, like Gecko) Mobile Safari\/537.36 (compatible; PetalBot;+https:\/\/webmaster.petalsearch.com\/site\/petalbot)": { + "platform": "Android", + "browser": "PetalBot", + "version": null + }, "Mozilla\/5.0 (X11; U; Linux x86_64; en-gb) AppleWebKit\/534.35 (KHTML, like Gecko) Chrome\/11.0.696.65 Safari\/534.35 Puffin\/2.9174AP": { "platform": "Android", "browser": "Puffin", @@ -994,6 +1084,16 @@ "browser": "Applebot", "version": "0.1" }, + "Mozilla\/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit\/605.1.15 (KHTML, like Gecko) Version\/17.4.1 Mobile\/15E148 Safari\/604.1 (Applebot\/0.1; +http:\/\/www.apple.com\/go\/applebot)": { + "platform": "iPhone", + "browser": "Applebot", + "version": "0.1" + }, + "Mozilla\/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit\/601.1.46 (KHTML, like Gecko) Version\/9.0 Mobile\/13B143 Safari\/601.1 (compatible; Baiduspider-render\/2.0; +http:\/\/www.baidu.com\/search\/spider.html)": { + "platform": "iPhone", + "browser": "Baiduspider-render", + "version": "2.0" + }, "Mozilla\/5.0 (iPhone; U; CPU iPhone OS 5_1_1 like Mac OS X; en) AppleWebKit\/534.46.0 (KHTML, like Gecko) CriOS\/19.0.1084.60 Mobile\/9B206 Safari\/7534.48.3": { "platform": "iPhone", "browser": "Chrome", @@ -1049,6 +1149,16 @@ "browser": "Safari", "version": "16.6" }, + "Mozilla\/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit\/536.26 (KHTML, like Gecko) Version\/6.0 Mobile\/10A5376e Safari\/8536.25 (compatible; SMTBot\/1.0; +http:\/\/www.similartech.com\/smtbot)": { + "platform": "iPhone", + "browser": "SMTBot", + "version": "1.0" + }, + "Mozilla\/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit\/605.1.15 (KHTML, like Gecko) Version\/15.4 Mobile\/15E148 Safari\/604.1 (compatible; YandexMobileBot\/3.0; +http:\/\/yandex.com\/bots)": { + "platform": "iPhone", + "browser": "YandexMobileBot", + "version": "3.0" + }, "Mozilla\/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit\/420.1 (KHTML, like Gecko) Version\/3.0 Mobile\/3A101a Safari\/419.3": { "platform": "iPod", "browser": "Safari", @@ -1434,31 +1544,141 @@ "browser": "MSIE", "version": "10.0" }, + "Mozilla\/5.0 (compatible; AdkernelTopicCrawler\/1.0; +http:\/\/adkernel.com\/robot\/)": { + "platform": null, + "browser": "AdkernelTopicCrawler", + "version": "1.0" + }, "AdsBot-Google (+http:\/\/www.google.com\/adsbot.html)": { "platform": null, "browser": "AdsBot-Google", "version": null }, + "Mozilla\/5.0 (compatible; AhrefsBot\/7.0; +http:\/\/ahrefs.com\/robot\/)": { + "platform": null, + "browser": "AhrefsBot", + "version": "7.0" + }, + "Mozilla\/5.0 (compatible; AmazonAdBot\/1.0; +https:\/\/adbot.amazon.com)": { + "platform": null, + "browser": "AmazonAdBot", + "version": "1.0" + }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; Amazonbot\/0.1; +https:\/\/developer.amazon.com\/support\/amazonbot) Chrome\/119.0.6045.214 Safari\/537.36": { + "platform": null, + "browser": "Amazonbot", + "version": "0.1" + }, + "Mozilla\/5.0 (compatible; archive.org_bot +http:\/\/archive.org\/details\/archive.org_bot) Zeno\/2a1713b warc\/v0.8.77": { + "platform": null, + "browser": "archive.org_bot", + "version": null + }, + "Mozilla\/5.0 (compatible; AwarioBot\/1.0; +https:\/\/awario.com\/bots.html)": { + "platform": null, + "browser": "AwarioBot", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; Baiduspider\/2.0; +http:\/\/www.baidu.com\/search\/spider.html": { + "platform": null, + "browser": "Baiduspider", + "version": "2.0" + }, "Mozilla\/5.0 (compatible; Baiduspider\/2.0; +http:\/\/www.baidu.com\/search\/spider.html)": { "platform": null, "browser": "Baiduspider", "version": "2.0" }, + "Mozilla\/5.0 (compatible; Baiduspider-render\/2.0; +http:\/\/www.baidu.com\/search\/spider.html)": { + "platform": null, + "browser": "Baiduspider-render", + "version": "2.0" + }, + "Mozilla\/5.0 (compatible; Barkrowler\/0.9; +https:\/\/babbar.tech\/crawler)": { + "platform": null, + "browser": "Barkrowler", + "version": "0.9" + }, "Mozilla\/5.0 (compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm)": { "platform": null, "browser": "bingbot", "version": "2.0" }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm) Chrome\/100.0.4896.127 Safari\/537.36": { + "platform": null, + "browser": "bingbot", + "version": "2.0" + }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm) Chrome\/112.0.0.0 Safari\/537.36": { + "platform": null, + "browser": "bingbot", + "version": "2.0" + }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; bingbot\/2.0; +http:\/\/www.bing.com\/bingbot.htm) Chrome\/116.0.1938.76 Safari": { + "platform": null, + "browser": "bingbot", + "version": "2.0" + }, + "Mozilla\/5.0 (compatible; BLEXBot\/1.0; +https:\/\/help.seranking.com\/en\/blex-crawler)": { + "platform": null, + "browser": "BLEXBot", + "version": "1.0" + }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; Bravebot\/1.0; +https:\/\/search.brave.com\/help\/brave-search-crawler) Chrome\/W.X.Y.Z Safari\/537.36": { + "platform": null, + "browser": "Bravebot", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; CensysInspect\/1.1; +https:\/\/about.censys.io\/)": { + "platform": null, + "browser": "CensysInspect", + "version": "1.1" + }, "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko); compatible; ChatGPT-User\/1.0; +https:\/\/openai.com\/bot": { "platform": null, "browser": "ChatGPT-User", "version": "1.0" }, + "Mozilla\/5.0 (compatible; coccocbot-image\/1.0; +http:\/\/help.coccoc.com\/searchengine)": { + "platform": null, + "browser": "coccocbot-image", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; coccocbot-web\/1.0; +http:\/\/help.coccoc.com\/searchengine)": { + "platform": null, + "browser": "coccocbot-web", + "version": "1.0" + }, "curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8r zlib\/1.2.3": { "platform": null, "browser": "curl", "version": "7.19.7" }, + "Mozilla\/5.0 (compatible; DataForSeoBot\/1.0; +https:\/\/dataforseo.com\/dataforseo-bot)": { + "platform": null, + "browser": "DataForSeoBot", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; Discordbot\/2.0; +https:\/\/discordapp.com)": { + "platform": null, + "browser": "Discordbot", + "version": "2.0" + }, + "Mozilla\/5.0 (compatible; Discordbot\/2.0; +https:\/\/discordapp.com),gzip(gfe)": { + "platform": null, + "browser": "Discordbot", + "version": "2.0" + }, + "Mozilla\/5.0 (compatible; DotBot\/1.2; +https:\/\/opensiteexplorer.org\/dotbot; help@moz.com)": { + "platform": null, + "browser": "DotBot", + "version": "1.2" + }, + "Mozilla\/5.0 (compatible; ev-crawler\/1.0; +https:\/\/headline.com\/legal\/crawler)": { + "platform": null, + "browser": "ev-crawler", + "version": "1.0" + }, "facebookexternalhit\/1.1": { "platform": null, "browser": "facebookexternalhit", @@ -1489,6 +1709,11 @@ "browser": "Googlebot", "version": "2.1" }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; Googlebot\/2.1; +http:\/\/www.google.com\/bot.html) Chrome\/135.0.7049.114 Safari\/537.36": { + "platform": null, + "browser": "Googlebot", + "version": "2.1" + }, "Googlebot-Image\/1.0": { "platform": null, "browser": "Googlebot-Image", @@ -1499,11 +1724,31 @@ "browser": "Googlebot-Video", "version": "1.0" }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; GPTBot\/1.0; +https:\/\/openai.com\/gptbot)": { + "platform": null, + "browser": "GPTBot", + "version": "1.0" + }, "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko); compatible; GPTBot\/1.1; +https:\/\/openai.com\/gptbot": { "platform": null, "browser": "GPTBot", "version": "1.1" }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; GPTBot\/1.2; +https:\/\/openai.com\/gptbot)": { + "platform": null, + "browser": "GPTBot", + "version": "1.2" + }, + "Mozilla\/5.0 (compatible; Heexybot\/1.0; +https:\/\/heexy.org\/bot)": { + "platform": null, + "browser": "Heexybot", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; HubSpot Crawler; HubSpot Domain check; +https:\/\/www.hubspot.com)": { + "platform": null, + "browser": "HubSpot Domain check", + "version": null + }, "Lynx\/2.8.6rel.4 libwww-FM\/2.14 SSL-MM\/1.4.1 OpenSSL\/0.9.7l Lynxlet\/0.7.0": { "platform": null, "browser": "Lynx", @@ -1524,6 +1769,41 @@ "browser": "Mastodon", "version": "4.3.8" }, + "Mozilla\/5.0 (compatible; Miniflux\/2.2.0; +https:\/\/miniflux.app)": { + "platform": null, + "browser": "Miniflux", + "version": "2.2.0" + }, + "Mozilla\/5.0 (compatible; Miniflux\/2.2.6; +https:\/\/miniflux.app)": { + "platform": null, + "browser": "Miniflux", + "version": "2.2.6" + }, + "Mozilla\/5.0 (compatible; Miniflux\/2.2.7; +https:\/\/miniflux.app)": { + "platform": null, + "browser": "Miniflux", + "version": "2.2.7" + }, + "Mozilla\/5.0 (compatible; Miniflux\/2.2.8; +https:\/\/miniflux.app)": { + "platform": null, + "browser": "Miniflux", + "version": "2.2.8" + }, + "Mozilla\/5.0 (compatible; Miniflux\/v2.2.8; +https:\/\/miniflux.app)": { + "platform": null, + "browser": "Miniflux", + "version": "2.2.8" + }, + "Mozilla\/5.0 (compatible; ModatScanner\/1.0; +https:\/\/modat.io\/)": { + "platform": null, + "browser": "ModatScanner", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; MojeekBot\/0.11; +https:\/\/www.mojeek.com\/bot.html)": { + "platform": null, + "browser": "MojeekBot", + "version": "0.11" + }, "msnbot-media\/1.1 (+http:\/\/search.msn.com\/msnbot.htm)": { "platform": null, "browser": "msnbot-media", @@ -1534,11 +1814,56 @@ "browser": "OAI-SearchBot", "version": "1.0" }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; Perplexity-User\/1.0; +https:\/\/perplexity.ai\/perplexity-user)": { + "platform": null, + "browser": "Perplexity-User", + "version": "1.0" + }, + "Mozilla\/5.0 AppleWebKit\/537.36 (KHTML, like Gecko; compatible; PerplexityBot\/1.0; +https:\/\/perplexity.ai\/perplexitybot)": { + "platform": null, + "browser": "PerplexityBot", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; Pinterestbot\/1.0; +http:\/\/www.pinterest.com\/bot.html)": { + "platform": null, + "browser": "Pinterestbot", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; proximic; +https:\/\/www.comscore.com\/Web-Crawler)": { + "platform": null, + "browser": "proximic", + "version": null + }, + "Mozilla\/5.0 (compatible; SemrushBot; +http:\/\/www.semrush.com\/bot.html)": { + "platform": null, + "browser": "SemrushBot", + "version": null + }, + "Mozilla\/5.0 (compatible; SemrushBot-SI\/0.97; +http:\/\/www.semrush.com\/bot.html)": { + "platform": null, + "browser": "SemrushBot-SI", + "version": "0.97" + }, + "Mozilla\/5.0 (compatible; SeznamBot\/4.0; +https:\/\/o-seznam.cz\/napoveda\/vyhledavani\/en\/seznambot-crawler\/)": { + "platform": null, + "browser": "SeznamBot", + "version": "4.0" + }, "Slackbot 1.0 (+https:\/\/api.slack.com\/robots)": { "platform": null, "browser": "Slackbot", "version": "1.0" }, + "Mozilla\/5.0 (compatible; SpiderLing; +https:\/\/nlp.fi.muni.cz\/projects\/biwec\/)": { + "platform": null, + "browser": "SpiderLing", + "version": null + }, + "Mozilla\/5.0 (compatible; SurdotlyBot\/1.0; +http:\/\/sur.ly\/bot.html)": { + "platform": null, + "browser": "SurdotlyBot", + "version": "1.0" + }, "TelegramBot (like TwitterBot)": { "platform": null, "browser": "TelegramBot", @@ -1549,14 +1874,54 @@ "browser": "Twitterbot", "version": "1.0" }, + "Mozilla\/5.0 (compatible; WellKnownBot\/0.1; +https:\/\/well-known.dev\/about\/#bot)": { + "platform": null, + "browser": "WellKnownBot", + "version": "0.1" + }, "WordPress\/3.7.1; http:\/\/wordpress.com": { "platform": null, "browser": "WordPress", "version": "3.7.1" }, + "Mozilla\/5.0 (compatible; wpbot\/1.3; +https:\/\/forms.gle\/ajBaxygz9jSR8p8G9)": { + "platform": null, + "browser": "wpbot", + "version": "1.3" + }, "Mozilla\/5.0 (compatible; YandexBot\/3.0; +http:\/\/yandex.com\/bots)": { "platform": null, "browser": "YandexBot", "version": "3.0" + }, + "Mozilla\/5.0 (compatible; YandexBot\/3.0; +http:\/\/yandex.com\/bots) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/108.0.0.0": { + "platform": null, + "browser": "YandexBot", + "version": "3.0" + }, + "Mozilla\/5.0 (compatible; YandexBot\/3.0; +http:\/\/yandex.com\/bots) YTranslate": { + "platform": null, + "browser": "YandexBot", + "version": "3.0" + }, + "Mozilla\/5.0 (compatible; YandexImages\/3.0; +http:\/\/yandex.com\/bots)": { + "platform": null, + "browser": "YandexImages", + "version": "3.0" + }, + "Mozilla\/5.0 (compatible; YandexRCA\/1.0; +http:\/\/yandex.com\/bots)": { + "platform": null, + "browser": "YandexRCA", + "version": "1.0" + }, + "Mozilla\/5.0 (compatible; YandexUserproxy; robot; +http:\/\/yandex.com\/bots)": { + "platform": null, + "browser": "YandexUserproxy", + "version": null + }, + "Mozilla\/5.0 (compatible; Yeti\/1.1; +https:\/\/naver.me\/spd)": { + "platform": null, + "browser": "Yeti", + "version": "1.1" } } From 8c8b368000ae8dc64e6edfc80b236df9fd9bb22e Mon Sep 17 00:00:00 2001 From: Jesse Donat Date: Wed, 14 May 2025 16:58:14 -0500 Subject: [PATCH 3/5] Simplify the bot dance --- src/UserAgentParser.php | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/UserAgentParser.php b/src/UserAgentParser.php index ee488ea..c66d803 100644 --- a/src/UserAgentParser.php +++ b/src/UserAgentParser.php @@ -109,20 +109,16 @@ function parse_user_agent( $u_agent = null ) { , $u_agent, $result); // If nothing matched, return null (to avoid undefined index errors) - $quickReturn = false; - if( !isset($result[BROWSER][0], $result[BROWSER_VERSION][0]) ) { - if( preg_match('%^(?!Mozilla)(?P[A-Z0-9\-]+)([/ :](?P[0-9A-Z.]+))?%ix', $u_agent, $g_result) ) { - return [ PLATFORM => $platform, BROWSER => $g_result[BROWSER], BROWSER_VERSION => empty($g_result[BROWSER_VERSION]) ? null : $g_result[BROWSER_VERSION] ]; - } - - $quickReturn = true; + if( !isset($result[BROWSER][0], $result[BROWSER_VERSION][0]) + && preg_match('%^(?!Mozilla)(?P[A-Z0-9\-]+)([/ :](?P[0-9A-Z.]+))?%ix', $u_agent, $g_result) ) { + return [ PLATFORM => $platform, BROWSER => $g_result[BROWSER], BROWSER_VERSION => empty($g_result[BROWSER_VERSION]) ? null : $g_result[BROWSER_VERSION] ]; } if( ( empty($result[BROWSER][0]) || ($result['prev'][0] !== '') - ) // if we caught a browser, and it's the first part of the string, skip + ) && preg_match(<<<'REGEX' %[(;]\s*(?P[^(/;]+) (?:[:/ ]v?(?P[0-9A-Z.]+)[^;)\s]*)? @@ -137,7 +133,7 @@ function parse_user_agent( $u_agent = null ) { ]; } - if( $quickReturn ) { + if( !isset($result[BROWSER][0], $result[BROWSER_VERSION][0]) ) { return $return; } From 11f241fb0a59244e0cc7ad7bf330ac2f1ca0d4b4 Mon Sep 17 00:00:00 2001 From: Jesse Donat Date: Wed, 14 May 2025 17:02:03 -0500 Subject: [PATCH 4/5] Simplify regex a little --- src/UserAgentParser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/UserAgentParser.php b/src/UserAgentParser.php index c66d803..ada87dc 100644 --- a/src/UserAgentParser.php +++ b/src/UserAgentParser.php @@ -99,7 +99,7 @@ function parse_user_agent( $u_agent = null ) { %(?P.)?(?PCamino|Kindle(\ Fire)?|Firefox|Iceweasel|IceCat|Safari|MSIE|Trident|AppleWebKit| TizenBrowser|(?:Headless)?Chrome|YaBrowser|Vivaldi|IEMobile|Opera|OPR|Silk|Midori|(?-i:Edge)|EdgA?|CriOS|UCBrowser|Puffin| OculusBrowser|SamsungBrowser|SailfishBrowser|XiaoMi/MiuiBrowser|YaApp_Android|Whale| -Baiduspider|Applebot|Facebot|Googlebot|YandexBot|bingbot|Lynx|Version|Wget|curl|ChatGPT-User|GPTBot|OAI-SearchBot| +Applebot|Facebot|Googlebot|YandexBot|bingbot|Lynx|Version| Valve\ Steam\ Tenfoot|Mastodon| NintendoBrowser|PLAYSTATION\ (?:\d|Vita)+) \)?;? From e482d473da4beee8eddd4cc9a24471c6e7de2b93 Mon Sep 17 00:00:00 2001 From: Jesse Donat Date: Wed, 14 May 2025 21:20:33 -0500 Subject: [PATCH 5/5] Update Browser Constants --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ef756e..0e643c7 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,7 @@ Predefined helper constants from `donatj\UserAgent\Browsers` | `Browsers::ADSBOT_GOOGLE` | AdsBot-Google | | `Browsers::ANDROID_BROWSER` | Android Browser | | `Browsers::APPLEBOT` | Applebot | +| `Browsers::ARCHIVE_ORG_BOT` | archive.org_bot | | `Browsers::BAIDUSPIDER` | Baiduspider | | `Browsers::BINGBOT` | bingbot | | `Browsers::BLACKBERRY_BROWSER` | BlackBerry Browser | @@ -172,6 +173,7 @@ Predefined helper constants from `donatj\UserAgent\Browsers` | `Browsers::CHATGPT_USER` | ChatGPT-User | | `Browsers::CHROME` | Chrome | | `Browsers::CURL` | curl | +| `Browsers::DISCORDBOT` | Discordbot | | `Browsers::EDGE` | Edge | | `Browsers::FACEBOOKEXTERNALHIT` | facebookexternalhit | | `Browsers::FEEDVALIDATOR` | FeedValidator | @@ -179,6 +181,8 @@ Predefined helper constants from `donatj\UserAgent\Browsers` | `Browsers::GOOGLEBOT` | Googlebot | | `Browsers::GOOGLEBOT_IMAGE` | Googlebot-Image | | `Browsers::GOOGLEBOT_VIDEO` | Googlebot-Video | +| `Browsers::GOOGLE_READ_ALOUD` | Google-Read-Aloud | +| `Browsers::GOOGLE_SAFETY` | Google-Safety | | `Browsers::GPTBOT` | GPTBot | | `Browsers::HEADLESSCHROME` | HeadlessChrome | | `Browsers::IEMOBILE` | IEMobile | @@ -195,6 +199,7 @@ Predefined helper constants from `donatj\UserAgent\Browsers` | `Browsers::OAI_SEARCHBOT` | OAI-SearchBot | | `Browsers::OCULUSBROWSER` | OculusBrowser | | `Browsers::OPERA` | Opera | +| `Browsers::PINTERESTBOT` | Pinterestbot | | `Browsers::PUFFIN` | Puffin | | `Browsers::SAFARI` | Safari | | `Browsers::SAILFISHBROWSER` | SailfishBrowser | @@ -207,10 +212,16 @@ Predefined helper constants from `donatj\UserAgent\Browsers` | `Browsers::UC_BROWSER` | UC Browser | | `Browsers::VALVE_STEAM_TENFOOT` | Valve Steam Tenfoot | | `Browsers::VIVALDI` | Vivaldi | +| `Browsers::WELLKNOWNBOT` | WellKnownBot | | `Browsers::WGET` | Wget | | `Browsers::WHALE` | Whale | | `Browsers::WORDPRESS` | WordPress | +| `Browsers::WPBOT` | wpbot | | `Browsers::YANDEX` | Yandex | -| `Browsers::YANDEXBOT` | YandexBot | +| `Browsers::YANDEXBOT` | YandexBot | +| `Browsers::YANDEXIMAGES` | YandexImages | +| `Browsers::YANDEXMOBILEBOT` | YandexMobileBot | +| `Browsers::YANDEXRCA` | YandexRCA | +| `Browsers::YANDEXUSERPROXY` | YandexUserproxy | More information is available at [Donat Studios](https://donatstudios.com/PHP-Parser-HTTP_USER_AGENT). \ No newline at end of file