• python爬取google搜索结果,配合sqlmap做sql注入检测


      前段时间试了半自动sql注入检测的小程序:https://www.cnblogs.com/theseventhson/p/13755588.html  原理很简单:先在百度用关键词爬取目标url后保存在txt文件;再开启sqlmap的api调用服务,用python脚本把爬取的url推送到sqlmap的api实现批量检测sql注入的目的;从实际的效果来看,sql注入的检测效果还不错,存在注入的url还挺多的;但从百度爬取目标url的效果就很差了,很多不带关键词的站点也被展示在结果页面,这些页面让sqlmap去检测,有sql注入的非常少,导致整体的效率并不高;今天参考别人的框架代码重新写了一个爬取google搜索结果的爬虫,源码如下:

    from gevent import monkey;monkey.patch_all()
    from bs4 import BeautifulSoup
    import chardet
    import os
    import urllib.parse
    import re
    import random
    from demo import fenpei
    import configparser
    import requests
    import gevent
    import re
    
    class Google_Disk(object):
        """Download Google result pages and save the result links they contain.

        Settings come from ``Config.ini`` in the working directory; an optional
        ``iplist.txt`` (one ``host:port`` per line) supplies proxies to pick from
        at random for each request. Call ``search()`` to download the pages and
        then ``chuli()`` to extract the links and append them to the output file.
        """

        # Anchor markup from which each result URL is captured.
        _RESULT_LINK = re.compile(r'<div class="yuRUbf"><a href="(.+?)" ping="')

        def __init__(self):
            """Load the configuration and the optional proxy list.

            Raises:
                SystemExit: if ``Config.ini`` does not exist in the working directory.
                KeyError: if the ``[config]`` section or one of its options is missing.
            """
            self.conf = {}
            self.iplist = []
            self.calc = 0
            self.html = b''
            self.cookies = {}
            # NOTE(review): hard-coded session cookie. None of the methods of this
            # class reference cookie/cookies/headers/host; they are kept only so the
            # attributes stay available. Consider moving this out of the source.
            self.cookie = 'HSID=AnOingRydX5d2psm6; SSID=ADt9T-YUVJhcGL4qL; APISID=wJEaAiaIyzvEaudB/AcoN5lpzTLnX5Reo_; SAPISID=f7PURACCKCHWwSNN/AzvNr8jk9DaahBOjn; CONSENT=YES+CN.zh-CN+20170611-09-0; SID=BQd-7E64xr8N2KPkSozUAhhUGA1yC2pOm44rxZeltI5oyZczMhTQXcaLdnFMy6KuYM7CVQ.; _ga=GA1.1.1066659943.1561908462; _gcl_au=1.1.1103150496.1563265661; ANID=AHWqTUkF83QBPYbfQq0kmzf1KcFRM9zsr6E6DzhE_HothF5Y28xI_VdxHrB1fMar; SEARCH_SAMESITE=CgQIzY0B; GOOGLE_ABUSE_EXEMPTION=ID=becbf893a4904d44:TM=1566184449:C=r:IP=47.75.69.236-:S=APGng0se1h0QgE8PglXBZJi1H6W3jRYdzw; NID=188=I04uuKTsGOjSp5c3G9QzFnfHqsL7ZQE3t9FdHLq25aPPiAHLfdWBsh3j3v14esoRRMVNXV6Pg8WXsqliJ8c7G46efNs-16lEr8ZZn6Fvz0GzYcw6wzcJ78OWUOuiz0K8W63M0zuBNTUDDmzVBxiud788TjTvbI5CZurTIcD6z2TTwQ_TuoGvjP2cuutFWcs5C8_11nk35jERGC2_A2UPda-AtI2mnVspSF5NNpawFUwW8PgQpxM; DV=oylrE6tRiwhOECBuCtWvdH13M-J_yhYIrTZO_A7m2wIAAABsoyqeic4gCwEAAFj9N_RUZyHkUQAAAA; 1P_JAR=2019-8-19-3; SIDCC=AN0-TYtz7HmrYpB6Cyw9ogysPbuDr2AY0pBl89HytGxEBiBr2lsZ4ceFMNWkG4Efolz2ihLVoMth'
            for pair in self.cookie.split(';'):
                key, value = pair.split('=', 1)
                # Pairs are separated by "; ", so every key after the first would
                # otherwise start with a blank.
                self.cookies[key.strip()] = value

            self.headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
            self.host = 'https://www.google.tw/search?'

            if not os.path.exists('Config.ini'):
                print('[-] 找不到配置文件')
                # exit() is only injected by the site module; raise directly instead.
                raise SystemExit(1)

            print('[~] 读取配置文件')
            # Interpolation is disabled so a literal '%' in the search grammar
            # is read as-is instead of raising an interpolation error.
            config = configparser.ConfigParser(interpolation=None)
            config.read("Config.ini", encoding='utf-8')
            section = config['config']
            self.conf['proxy'] = section['proxy']
            self.conf['save'] = section['save_name']
            self.conf['search'] = section['search_grammar']
            self.conf['page'] = section['page']
            self.conf['sleep'] = section['sleep']
            print('[+] 读取完成')

            if os.path.exists('iplist.txt'):
                print('[~] 检测到iplist.txt,采用每次请求随机抽取一个IP')
                with open('iplist.txt', 'r') as handle:
                    # Blank lines are dropped so an empty proxy is never chosen.
                    self.iplist = [line.strip() for line in handle if line.strip()]

            # Report what search() will really use: the list, or the single
            # configured proxy when the list is absent or empty.
            proxy = self.iplist or self.conf['proxy']

            print('[config] 代理设置:{}'.format(proxy))
            print('[config] 搜索语法:{}'.format(self.conf['search']))
            print('[config] 抓取的页数:{}'.format(self.conf['page']))
            print('[config] 保存文件名:{}'.format(self.conf['save']))

        def search(self):
            """Download each configured result page and append it to ``self.html``.

            A page whose body contains ``302 Moved`` is skipped with a message.
            A failed request is reported and the loop moves on to the next page.
            """
            for index in range(int(self.conf['page'])):
                start = index * 10
                proxy = random.choice(self.iplist) if self.iplist else self.conf['proxy']
                try:
                    body = fenpei(proxy=proxy, search=self.conf['search'], page=start, sleep=self.conf['sleep'])
                except Exception as err:
                    # Best effort: one failing page must not abort the whole crawl.
                    print("in search exception---------------------------------------------------------------------------------------------\n")
                    print(err)
                else:
                    if b'302 Moved' in body:
                        print('[-] Google又要你输验证码啦...')
                    else:
                        self.html += body

        def chuli(self):
            """Extract the result links from ``self.html``, print and save them.

            Links are appended, one per line, to the file named by
            ``self.conf['save']``. A failure to write is reported, not raised.
            """
            from html import unescape

            page = self.html.decode('utf-8', errors='replace')
            # href values are HTML-escaped; unescaping keeps the '&' between query
            # parameters instead of deleting it and merging the parameters.
            links = [unescape(link) for link in self._RESULT_LINK.findall(page)]
            if not links:
                return
            try:
                with open(self.conf['save'], 'a', encoding='utf-8') as out:
                    for url in links:
                        print(url)
                        print(url, file=out)
            except OSError as err:
                print('[-] 保存结果失败: {}'.format(err))
    
    if __name__ == '__main__':
        # Script entry point: download the result pages, then extract and save the links.
        crawler = Google_Disk()
        crawler.search()
        crawler.chuli()

      demo.py: 常用的功能函数封装;这里有个关键点:一般情况下,我只检测国内的站点,所以在url中需要添加lr=lang_zh-CN&tbs=lr%3Alang_1zh-CN参数,完整的url如下:

    url='https://{}/search?lr=lang_zh-CN&tbs=lr%3Alang_1zh-CN&q={}&btnG=Search&gbv=10&start={}'.format(domains,search,page)
    import random
    import requests
    import time
    from bs4 import BeautifulSoup
    from http.cookiejar import LWPCookieJar
    from urllib.request import Request, urlopen
    from urllib.parse import quote_plus, urlparse, parse_qs
    
    def read():
        """Yield each non-blank line of ``user_agents.txt`` without surrounding whitespace.

        The file is read from the working directory as UTF-8 and is closed once
        the generator is exhausted or discarded.
        """
        with open('user_agents.txt', 'r', encoding='utf-8') as handle:
            for line in handle:
                entry = line.strip()
                # Skip blank lines so an empty user agent can never be selected.
                if entry:
                    yield entry
    
    def reads():
        """Yield each non-blank line of ``domain.txt`` without surrounding whitespace.

        The file is read from the working directory as UTF-8 and is closed once
        the generator is exhausted or discarded.
        """
        with open('domain.txt', 'r', encoding='utf-8') as handle:
            for line in handle:
                entry = line.strip()
                # Skip blank lines so an empty host can never be selected.
                if entry:
                    yield entry
    
    
    def fenpei(proxy,search,page,sleep):
        """Request one Google result page and return the raw response body.

        A user agent and a Google host are picked at random from the entries
        yielded by ``read()`` and ``reads()``.

        Args:
            proxy: ``host:port`` of the proxy used for both http and https requests.
            search: search expression; it is URL-encoded before being sent.
            page: value of the ``start`` query parameter.
            sleep: seconds to wait before sending the request (passed to ``int()``).

        Returns:
            The response body as bytes. Redirects are not followed.

        Raises:
            IndexError: if no user agent or no host is available to choose from.
        """
        user_agents = list(read())
        google_searchs = list(reads())

        time.sleep(int(sleep))
        # NOTE(review): the 'https' entry uses an https:// proxy URL; a proxy that
        # only speaks plain HTTP presumably needs http:// here - confirm against
        # the proxy actually in use.
        proxies = {'http':'http://{}'.format(proxy),'https':'https://{}'.format(proxy)}
        domain = random.choice(google_searchs)
        headers = {'user-agent':random.choice(user_agents),'Content-type':"text/html;charset=utf-8"}
        # Encode the expression so blanks, '&', '#' or '+' in it cannot break
        # or truncate the query string.
        url = 'https://{}/search?lr=lang_zh-CN&tbs=lr%3Alang_1zh-CN&q={}&btnG=Search&gbv=10&start={}'.format(domain, quote_plus(search), page)
        # Certificate verification is switched off below, so silence the warning
        # urllib3 would otherwise emit for it.
        requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
        response = requests.get(url=url,headers=headers,allow_redirects=False,verify=False,proxies=proxies,timeout=30)
        return response.content

      所有文件如下:py文件就是上面的python脚本

        

    •  Config.ini:爬虫的配置文件,分别是代理服务器的ip和端口、爬取结果保存的文件、搜索的关键词、爬取google搜索结果的页面数、每次爬取间隔时间(防止被google要求验证)
    [config]
    proxy=127.0.0.1:12639
    save_name=save.txt
    search_grammar=inurl:php?id=
    page=10
    sleep=5
    •  domain.txt: google在全球各地的分站,可以多尝试不同的分站,避免被要求验证

      

    •  user_agents.txt:不同的操作系统、浏览器有不同的user-agent,这里可以来回切换,避免被要求验证;经过尝试,不同的user-agent会导致google返回不同的结果,给后续的url抽取带来困难,我这里暂时固定用这个:Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36
    •  iplist.txt:代理服务器的列表,格式为ip:port
    • save.txt:爬取结果保存的文档,都包含了我们自己设置的关键词,比百度精准太多了;

      

       从sqlmap跑的结果来看,存在sql注入的url蛮多的(爬取的50个url中,检测出sql注入的有11个,比例已经超过了20%,保存在同目录的injection文件中),怪不得sql注入的漏洞在OWASP排名长期靠前(另一个是xss,近期用xray、awvs扫出来的高危漏洞超过一半都是xss)

            

  • 相关阅读:
    Bootstrap的datetimepicker插件使用
    值得关注几个博客
    Windows下安装和配置tomca(免安装版本)
    AC自动机
    反向输出链表
    替换空格
    二维数组中的查找
    windows下配置nginx+php环境
    HTTP协议详解
    leetcode.3Sum
  • 原文地址:https://www.cnblogs.com/theseventhson/p/13773885.html
Copyright © 2020-2023  润新知