• nginx配置文件应对网站攻击采集垃圾蜘蛛的方法总结


    # Block junk spiders: reject any request whose User-Agent contains one of
    # the tokens below (case-insensitive substring match) with 403.
    # Exact duplicate alternatives were removed and literal dots are escaped.
    # `return` ends rewrite processing, so no `break` is needed after it.
    # NOTE(review): the list also contains Googlebot/Bingbot and very generic
    # tokens (Test, null, Java, Center, Dark, package) - confirm that blocking
    # those clients is intended for this site.
    if ($http_user_agent ~* "CheckMarkNetwork|Synapse|Bingbot|Googlebot|Nimbostratus-Bot|Dark|scraper|LMAO|Hakai|Gemini|Wappalyzer|masscan|crawler4j|Mappy|Center|eright|aiohttp|MauiBot|Crawler|researchscan|Dispatch|AlphaBot|Census|ips-agent|NetcraftSurveyAgent|ToutiaoSpider|EasyHttp|Iframely|sysscan|fasthttp|muhstik|DeuSu|mstshash|HTTP_Request|ExtLinksBot|package|SafeDNSBot|CPython|SiteExplorer|SSH|MegaIndex|BUbiNG|CCBot|NetTrack|Digincore|aiHitBot|SurdotlyBot|null|SemrushBot|Test|Copied|ltx71|Nmap|DotBot|AdsBot|InetURL|Pcore-HTTP|PocketParser|Wotbox|newspaper|DnyzBot|redback|PiplBot|SMTBot|WinHTTP|Auto Spider 1\.0|GrabNet|TurnitinBot|Go-Ahead-Got-It|Download Demon|Go!Zilla|GetWeb!|GetRight|libwww-perl|Cliqzbot|MailChimp|Dataprovider|XoviBot|linkdexbot|SeznamBot|Qwantify|spbot|evc-batch|zgrab|Go-http-client|FeedDemon|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|CoolpadWebkit|Java|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|HttpClient|MJ12bot|EasouSpider|LinkpadBot|Ezooms|YoudaoBot|YandexBot|Rogerbot|exabot|ia_archiver|Teoma|gigabot|DOCOMO Sprider|Sosospider|Yahoo! Slurp China|Yahoo! Slurp|MSNBot|MSNot-media|FlightDeckReports Bot|Bytespider|Mail\.RU_Bot") {
        return 403;
    }
    # Block attack / content-scraping tools by User-Agent (case-insensitive).
    # NOTE(review): the original pattern was cut off after "ApacheBench|" and
    # had no closing quote, parenthesis or body. Only the tokens that survived
    # are kept here; restore the missing ones from the upstream list if known.
    # The dangling "|" is dropped on purpose: an empty alternative would match
    # every request. Parentheses and the dot are escaped so the JCE token is
    # matched literally.
    if ($http_user_agent ~* "FeedDemon|BOT/0\.1 \(BOT for JCE\)|CrawlDaddy|Java|Jullo|Feedly|UniversalFeedParser|ApacheBench") {
        return 403;
    }
    # Filter request URIs: copy the raw request URI (path plus query string)
    # into $URL and answer 403 when it contains any keyword below
    # (case-insensitive substring match).
    # NOTE(review): several keywords are very generic (static, file, user,
    # data, help, photo, sitemap, feed) - verify they do not collide with
    # legitimate paths on this site.
    set $URL $request_uri;
    if ($URL ~* "member|plus|base|data|dede|public|plug|Vote|tool|feed|components|skin|tinyMCE|version|sysimage|wp-content|wp-admin|static|common|face|shell|swfupload|utility|convert|sitemap|siteserver|BackupDB|file|user|system|upimg|install|wap|multiupload|ewebeditor|office|wallet|backup|bitcoin|maccms|vendor|apply|bjebhgm|photo|module|external|Analytics|tools|subdomains|notes|md5|ckeditor|bbs|ajax|zhuitanyun|logbaak|help|weki|dxyylc|Somnus|manage|J4H7eFjWoBa3bO6U|SiteFiles|dowds|source|ucenter|phpcms|language|TeatchClass|taglib|sql|allowurl|shitan|root|wp-login|houtai|admin001|htadmin|clock2|webadmin") {
        # `return` stops processing immediately; nothing may follow it.
        return 403;
    }
    # Deny requests whose path ends in one of these extensions
    # (case-insensitive). The leading dot is escaped so that only a real
    # ".ext" suffix matches; unescaped it matched any character, so a path
    # such as "/grasp" was rejected as ".asp".
    # NOTE(review): "xml" also blocks sitemap.xml / RSS feeds - confirm.
    location ~* \.(asp|xml|jsp|aspx|dev|ewebeditor|sql|xsl|asmx|htaccess|ini|env|git|project|cgi|md5|ajax\.js|swf|tpl\.php)$ {
        return 403;
    }
    # Allow GET only: every other request method is answered with 403.
    # This includes HEAD and POST, so forms and HEAD-based health checks
    # are rejected as well.
    if ($request_method != GET) {
        return 403;
    }
    
    # Reject selected request methods (case-insensitive substring match on
    # the method name) with 403.
    # Redundant whenever the GET-only rule is enabled in the same context.
    if ($request_method ~* "HEAD|DELETE|OPTIONS|POST") {
        # Processing ends at `return`; no further directive is reachable.
        return 403;
    }
    # Reject command-line download tools: any User-Agent containing
    # "wget" or "curl" (case-insensitive) gets 403.
    if ($http_user_agent ~* "Wget|Curl") {
        return 403;
    }
    # Reject a set of crawling tools / libraries identified by User-Agent
    # substring (case-insensitive), including one specific referral URL.
    if ($http_user_agent ~* "crawl|curb|git|Wtrace|Scrapy|python|http://www.snsbianpofanghu.com/") {
        # `return` terminates request processing with the given status.
        return 403;
    }
    # Deny downloads of archive / backup files (case-insensitive).
    # The dot is escaped so only a real ".ext" suffix matches; unescaped it
    # matched any character (e.g. "/prizes/bigzip" was rejected as ".zip").
    # "tar.gz" is not listed separately because the "gz" alternative
    # already covers it.
    location ~* \.(tgz|bak|zip|rar|tar|gz|bz2|xz)$ {
        return 403;
    }
    # Incomplete User-Agent: a UA that is exactly "Mozilla", "Mozilla/5.0"
    # or "Mozilla/4.0" with nothing after it is almost never a real browser,
    # so reject it. One anchored, case-sensitive regex covers the three
    # exact values.
    if ($http_user_agent ~ "^Mozilla(/[45]\.0)?$") {
        return 403;
    }

      #禁空 UA
      if ($http_user_agent ~* ^$) {
      return 403;
      break;
      }

    # Blocked IP addresses and networks, one rule per line.
    deny 113.92.157.0/24;
    deny 223.199.0.0/16;
    deny 192.74.225.105;
    #限制指定蜘蛛访问频次
    
    # Shared-memory zone "one" (100 MB), keyed by $anti_spider and capped at
    # 30 requests per minute per key. Requests whose key is empty are not
    # counted by limit_req.
    limit_req_zone $anti_spider zone=one:100m rate=30r/m;
    # Enforce the zone: up to 5 excess requests are served immediately
    # (nodelay); anything beyond that is rejected.
    limit_req zone=one burst=5 nodelay;
    
    server
    {
    
    # Key used by the "one" rate-limit zone. It defaults to an empty string
    # so that ordinary clients are not rate limited and nginx does not log
    # "using uninitialized variable" for every request that is not a spider.
    set $anti_spider "";
    # For the listed spiders the key becomes the User-Agent string, so each
    # distinct spider UA shares one rate-limit bucket.
    if ($http_user_agent ~* "Sogou web spider|YisouSpider") {
        set $anti_spider $http_user_agent;
    }
    #限制所有ip访问频次
    
    # Per-client rate limit: zone "allips" (100 MB), keyed by the binary
    # client address and capped at 30 requests per minute per IP.
    # NOTE(review): limit_req_zone is only valid in the http context - make
    # sure this line is not pasted inside a server block.
    limit_req_zone $binary_remote_addr zone=allips:100m rate=30r/m;
    # Apply the zone, letting bursts of up to 5 extra requests through
    # without delay.
    limit_req zone=allips burst=5 nodelay;
  • 相关阅读:
    爬虫开发9.scrapy框架之递归解析和post请求
    爬虫开发7.scrapy框架简介和基础应用
    爬虫开发6.selenium和PhantomJS处理网页动态加载数据的爬取
    爬虫开发4.三种数据解析方式
    Gym–101061A Cards(有待更新)
    GYM 101061 I. Playing with strings(有待更新)
    HDU2072 单词数
    HDU2057 A + B Again(十六进制加法运算)
    HDU2056 Rectangles
    CodeForces 992C Nastya and a Wardrobe
  • 原文地址:https://www.cnblogs.com/xinlvtian/p/12901886.html
Copyright © 2020-2023  润新知