 */

 use Friendica\App;
+use Friendica\Core\Config;
 use Friendica\Core\Hook;
 use Friendica\Core\System;
 use Jaybizzle\CrawlerDetect\CrawlerDetect;
+use Friendica\Core\Logger;

 require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';

@@ -24,9 +26,64 @@ function blockbot_uninstall() {
 }

 function blockbot_init_1(App $a) {
+	if (empty($_SERVER['HTTP_USER_AGENT'])) {
+		return;
+	}
+
+	$logdata = ['agent' => $_SERVER['HTTP_USER_AGENT'], 'uri' => $_SERVER['REQUEST_URI']];
+
+	// List of known crawlers.
+	$agents = ['SemrushBot', 's~feedly-nikon3', 'Qwantify/Bleriot/', 'ltx71', 'Sogou web spider/',
+		'Diffbot/', 'Twitterbot/', 'YisouSpider', 'evc-batch/', 'LivelapBot/', 'TrendsmapResolver/',
+		'PaperLiBot/', 'Nuzzel', 'um-LN/', 'Google Favicon', 'Datanyze', 'BLEXBot/', '360Spider',
+		'adscanner/', 'HeadlessChrome', 'wpif', 'startmebot/', 'Googlebot/', 'Applebot/',
+		'facebookexternalhit/', 'GoogleImageProxy', 'bingbot/', 'heritrix/', 'ldspider',
+		'AwarioRssBot/', 'Zabbix', 'TweetmemeBot/', 'dcrawl/', 'PhantomJS/', 'Googlebot-Image/',
+		'CrowdTanglebot/', 'Mediapartners-Google', 'Baiduspider/', 'datagnionbot',
+		'MegaIndex.ru/', 'SMUrlExpander', 'Hatena-Favicon/', 'Wappalyzer', 'FlipboardProxy/',
+		'NetcraftSurveyAgent/', 'Dataprovider.com', 'SMTBot/', 'Nimbostratus-Bot/',
+		'DuckDuckGo-Favicons-Bot/', 'IndieWebCards/', 'proximic', 'netEstate NE Crawler',
+		'AhrefsBot/', 'YandexBot/', 'Exabot/', 'Mediumbot-MetaTagFetcher/', 'WhatsApp/',
+		'TelegramBot', 'SurdotlyBot/', 'BingPreview/', 'SabsimBot/', 'CCBot/', 'WbSrch/',
+		'DuckDuckBot-Https/', 'HTTP Banner Detection', 'YandexImages/', 'archive.org_bot',
+		'ArchiveTeam ArchiveBot/', 'yacybot', 'https://developers.google.com/+/web/snippet/',
+		'Scrapy/', 'github-camo', 'MJ12bot/', 'DotBot/', 'Pinterestbot/', 'Jooblebot/',
+		'Cliqzbot/', 'YaK/', 'Mediatoolkitbot'];
+
+	foreach ($agents as $agent) {
+		if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) {
+			System::httpExit(403, 'Bots are not allowed');
+		}
+	}
+
+	// This switch here is only meant for developers who want to add more bots to the list above, it is not safe for production.
+	if (!Config::get('blockbot', 'training')) {
+		return;
+	}
+
 	$crawlerDetect = new CrawlerDetect();

-	if ($crawlerDetect->isCrawler()) {
-		System::httpExit(403, 'Bots are not allowed');
+	if (!$crawlerDetect->isCrawler()) {
+		Logger::debug('Good user agent detected', $logdata);
+		return;
+	}
+
+	// List of strings found in known "good" agents that would otherwise be false positives.
+	$agents = ['fediverse.network crawler', 'Active_Pods_CheckBot_3.0', 'Social-Relay/',
+		'curl', 'zgrab', 'Go-http-client', 'curb', 'github.com', 'reqwest', 'Feedly/',
+		'Python-urllib/', 'Liferea/', 'aiohttp/', 'WordPress.com Reader', 'hackney/',
+		'Faraday v', 'okhttp', 'UniversalFeedParser', 'PixelFedBot', 'python-requests',
+		'WordPress/', 'http.rb/', 'Apache-HttpClient/', 'WordPress.com;', 'Pleroma',
+		'Dispatch/', 'Ruby', 'Uptimebot/', 'Java/', 'libwww-perl/', 'Mastodon/',
+		'lua-resty-http/', 'Test Certificate Info'];
+
+	foreach ($agents as $agent) {
+		if (stristr($_SERVER['HTTP_USER_AGENT'], $agent)) {
+			Logger::notice('False positive', $logdata);
+			return;
+		}
 	}
+
+	Logger::info('Blocked bot', $logdata);
+	System::httpExit(403, 'Bots are not allowed');
 }
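For orientation, below is a minimal wiring sketch, not part of this commit: it shows how blockbot_init_1() gets invoked on each request and how the training switch used above can be flipped on a developer instance. It assumes the Hook::register() and Config::set() static helpers from the Friendica core namespaces already imported in the hunk; the actual install/uninstall bodies live outside the lines shown here.

// Sketch only: assumed wiring, not shown in this diff.
use Friendica\Core\Config;
use Friendica\Core\Hook;

function blockbot_install() {
	// The 'init_1' hook fires early in every request, so the user-agent checks in
	// blockbot_init_1() run before any content is served.
	// Assumed signature: Hook::register($hookname, $file, $function).
	Hook::register('init_1', __DIR__ . '/blockbot.php', 'blockbot_init_1');
}

// Developer-only toggle read back by Config::get('blockbot', 'training') above;
// per the comment in the hunk, it is not safe for production.
Config::set('blockbot', 'training', true);

With training enabled, an agent that CrawlerDetect flags but that is absent from both hard-coded lists ends up in the 'Blocked bot' log entry, which is the raw material for extending the crawler list.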