• googleapi | google爬虫


    google api获取及使用

    1 生成KEY
    访问该地址:https://developers.google.com/custom-search/v1/overview?hl=en_US点击“Get a KEY”,此处需要登录谷歌账号,以及注册谷歌云账号并创建一个project。

    2 这个key可以从谷歌云控制台中看到,建议加上应用限制和API限制,防止泄露后被滥用。

    3 生成CX
    https://programmablesearchengine.google.com/cse/create/new
    cx 是 Google 可编程搜索引擎(Programmable Search Engine)的 id 标识,在此处 新增搜索引擎 可以获取。这里可以指定要搜索的网站,比如说我只希望通过该 API 搜索出来的网站是 shodan.io,谷歌语法里面相当于 site:shodan.io,可以这么设置


    爬虫脚本

    import requests
    import time
    import random
    import re
    import os
    
    
    def check_response(method, resp):
        """
        检查响应 输出非正常响应返回json的信息
    
        :param method: 请求方法
        :param resp: 响应体
        :return: 是否正常响应
        """
        if resp.status_code == 200 and resp.content:
            return True
        content_type = resp.headers.get('Content-Type')
        if content_type and 'json' in content_type and resp.content:
            try:
                msg = resp.json()
            except Exception as e:
                pass
            else:
                pass
        return False
    
    
    def match_subdomains(domain, html, distinct=True, fuzzy=True):
        """
        Use regexp to match subdomains
    
        :param  str domain: main domain
        :param  str html: response html text
        :param  bool distinct: deduplicate results or not (default True)
        :param  bool fuzzy: fuzzy match subdomain or not (default True)
        :return set/list: result set or list
        """
        if fuzzy:
            regexp = r'(?:[a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?\.){0,}' \
                     + domain.replace('.', r'\.')
            result = re.findall(regexp, html, re.I)
            if not result:
                return set()
            deal = map(lambda s: s.lower(), result)
            if distinct:
                return set(deal)
            else:
                return list(deal)
        else:
            regexp = r'(?:\>|\"|\'|\=|\,)(?:http\:\/\/|https\:\/\/)?' \
                     r'(?:[a-z0-9](?:[a-z0-9\-]{0,61}[a-z0-9])?\.){0,}' \
                     + domain.replace('.', r'\.')
            result = re.findall(regexp, html, re.I)
        if not result:
            return set()
        regexp = r'(?:http://|https://)'
        deal = map(lambda s: re.sub(regexp, '', s[1:].lower()), result)
        if distinct:
            return set(deal)
        else:
            return list(deal)
    
    
    class Module(object):
        def __init__(self):
            self.module = 'Module'
            self.source = 'BaseModule'
            self.cookie = None
            self.header = dict()
            self.proxy = None
            self.delay = 1  # 请求睡眠时延
            self.timeout = (13, 27)  # 请求超时时间
            self.verify = False  # 请求SSL验证
            self.domain = str()  # 当前进行子域名收集的主域
            self.subdomains = set()  # 存放发现的子域
            self.infos = dict()  # 存放子域有关信息
            self.results = list()  # 存放模块结果
            self.start = time.time()  # 模块开始执行时间
            self.end = None  # 模块结束执行时间
            self.elapse = None  # 模块执行耗时
    
        def have_api(self, *apis):
            """
            Simply check whether the api information configure or not
    
            :param  apis: apis set
            :return bool: check result
            """
            if not all(apis):
                return False
            return True
    
        def begin(self):
            """
            begin log
            """
            pass
    
        def finish(self):
            """
            finish log
            """
            self.end = time.time()
            self.elapse = round(self.end - self.start, 1)
            pass
    
        def head(self, url, params=None, check=True, **kwargs):
            """
            Custom head request
    
            :param str  url: request url
            :param dict params: request parameters
            :param bool check: check response
            :param kwargs: other params
            :return: response object
            """
            session = requests.Session()
            session.trust_env = False
            try:
                resp = session.head(url,
                                    params=params,
                                    cookies=self.cookie,
                                    headers=self.header,
                                    proxies=self.proxy,
                                    timeout=self.timeout,
                                    verify=self.verify,
                                    **kwargs)
            except Exception as e:
                pass
                return None
            if not check:
                return resp
            if check_response('HEAD', resp):
                return resp
            return None
    
        def get(self, url, params=None, check=True, ignore=False, raise_error=False, **kwargs):
            """
            Custom get request
    
            :param str  url: request url
            :param dict params: request parameters
            :param bool check: check response
            :param bool ignore: ignore error
            :param bool raise_error: raise error or not
            :param kwargs: other params
            :return: response object
            """
            session = requests.Session()
            session.trust_env = False
            level = 'ERROR'
            if ignore:
                level = 'DEBUG'
            try:
                resp = session.get(url,
                                   params=params,
                                   cookies=self.cookie,
                                   headers=self.header,
                                   proxies=self.proxy,
                                   timeout=self.timeout,
                                   verify=self.verify,
                                   **kwargs)
            except Exception as e:
                if raise_error:
                    if isinstance(e, requests.exceptions.ConnectTimeout):
                        raise e
                return None
            if not check:
                return resp
            if check_response('GET', resp):
                return resp
            print('xx')
            print(resp.text)
            return None
    
        def post(self, url, data=None, check=True, **kwargs):
            """
            Custom post request
    
            :param str  url: request url
            :param dict data: request data
            :param bool check: check response
            :param kwargs: other params
            :return: response object
            """
            session = requests.Session()
            session.trust_env = False
            try:
                resp = session.post(url,
                                    data=data,
                                    cookies=self.cookie,
                                    headers=self.header,
                                    proxies=self.proxy,
                                    timeout=self.timeout,
                                    verify=self.verify,
                                    **kwargs)
            except Exception as e:
                print(e)
                return None
            if not check:
                return resp
            if check_response('POST', resp):
                return resp
            return None
    
        def delete(self, url, check=True, **kwargs):
            """
            Custom delete request
    
            :param str  url: request url
            :param bool check: check response
            :param kwargs: other params
            :return: response object
            """
            session = requests.Session()
            session.trust_env = False
            try:
                resp = session.delete(url,
                                      cookies=self.cookie,
                                      headers=self.header,
                                      proxies=self.proxy,
                                      timeout=self.timeout,
                                      verify=self.verify,
                                      **kwargs)
            except Exception as e:
                print(e)
                return None
            if not check:
                return resp
            if check_response('DELETE', resp):
                return resp
            return None
    
        def get_header(self):
            """
            Get request header
    
            :return: header
            """
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0)',}
            if isinstance(headers, dict):
                self.header = headers
                return headers
            return self.header
    
        def get_proxy(self, module):
            """
            Get proxy
    
            :param str module: module name
            :return: proxy
            """
            pass
    
        def match_subdomains(self, resp, distinct=True, fuzzy=True):
            if not resp:
                return set()
            elif isinstance(resp, str):
                return match_subdomains(self.domain, resp, distinct, fuzzy)
            elif hasattr(resp, 'text'):
                return match_subdomains(self.domain, resp.text, distinct, fuzzy)
            else:
                return set()
    
        def collect_subdomains(self, resp):
            subdomains = self.match_subdomains(resp)
            self.subdomains.update(subdomains)
            return self.subdomains
    
        def save_json(self):
            """
            Save the results of each module as a json file
    
            :return bool: whether saved successfully
            """
            pass
    
        def gen_result(self):
            """
            Generate results
            """
            if not len(self.subdomains):  # 该模块一个子域都没有发现的情况
                result = {'id': None,
                          'alive': None,
                          'request': None,
                          'resolve': None,
                          'url': None,
                          'subdomain': None,
                          'port': None,
                          'level': None,
                          'cname': None,
                          'ip': None,
                          'public': None,
                          'cdn': None,
                          'status': None,
                          'reason': None,
                          'title': None,
                          'banner': None,
                          'header': None,
                          'history': None,
                          'response': None,
                          'ip_times': None,
                          'cname_times': None,
                          'ttl': None,
                          'cidr': None,
                          'asn': None,
                          'org': None,
                          'addr': None,
                          'isp': None,
                          'resolver': None,
                          'module': self.module,
                          'source': self.source,
                          'elapse': self.elapse,
                          'find': None}
                self.results.append(result)
            else:
                for subdomain in self.subdomains:
                    url = 'http://' + subdomain
                    level = subdomain.count('.') - self.domain.count('.')
                    info = self.infos.get(subdomain)
                    if info is None:
                        info = dict()
                    cname = info.get('cname')
                    ip = info.get('ip')
                    ip_times = info.get('ip_times')
                    cname_times = info.get('cname_times')
                    ttl = info.get('ttl')
                    if isinstance(cname, list):
                        cname = ','.join(cname)
                        ip = ','.join(ip)
                        ip_times = ','.join([str(num) for num in ip_times])
                        cname_times = ','.join([str(num) for num in cname_times])
                        ttl = ','.join([str(num) for num in ttl])
                    result = {'id': None,
                              'alive': info.get('alive'),
                              'request': info.get('request'),
                              'resolve': info.get('resolve'),
                              'url': url,
                              'subdomain': subdomain,
                              'port': 80,
                              'level': level,
                              'cname': cname,
                              'ip': ip,
                              'public': info.get('public'),
                              'cdn': info.get('cdn'),
                              'status': None,
                              'reason': info.get('reason'),
                              'title': None,
                              'banner': None,
                              'header': None,
                              'history': None,
                              'response': None,
                              'ip_times': ip_times,
                              'cname_times': cname_times,
                              'ttl': ttl,
                              'cidr': info.get('cidr'),
                              'asn': info.get('asn'),
                              'org': info.get('org'),
                              'addr': info.get('addr'),
                              'isp': info.get('isp'),
                              'resolver': info.get('resolver'),
                              'module': self.module,
                              'source': self.source,
                              'elapse': self.elapse,
                              'find': len(self.subdomains)}
                    self.results.append(result)
    
        def save_db(self):
            """
            Save module results into the database
            """
            pass
    
    
    class Search(Module):
        """
        Search base class
        """
        def __init__(self):
            Module.__init__(self)
            self.page_num = 0  # 要显示搜索起始条数
            self.per_page_num = 50  # 每页显示搜索条数
            self.recursive_search = False
            self.recursive_times = 2
            self.full_search = False
    
        @staticmethod
        def filter(domain, subdomain):
            """
            生成搜索过滤语句
            使用搜索引擎支持的-site:语法过滤掉搜索页面较多的子域以发现新域
    
            :param str domain: 域名
            :param set subdomain: 子域名集合
            :return: 过滤语句
            :rtype: str
            """
            common_subnames = {'i', 'w', 'm', 'en', 'us', 'zh', 'w3', 'app', 'bbs',
                               'web', 'www', 'job', 'docs', 'news', 'blog', 'data',
                               'help', 'live', 'mall', 'blogs', 'files', 'forum',
                               'store', 'mobile'}
            statements_list = []
            subdomains_temp = set(map(lambda x: x + '.' + domain, common_subnames))
            subdomains_temp = list(subdomain.intersection(subdomains_temp))
            for i in range(0, len(subdomains_temp), 2):  # 同时排除2个子域
                statements_list.append(''.join(set(map(lambda s: ' -site:' + s,
                                                       subdomains_temp[i:i + 2]))))
            return statements_list
    
        def match_location(self, url):
            """
            匹配跳转之后的url
            针对部分搜索引擎(如百度搜索)搜索展示url时有显示不全的情况
            此函数会向每条结果的链接发送head请求获取响应头的location值并做子域匹配
    
            :param str url: 展示结果的url链接
            :return: 匹配的子域
            :rtype set
            """
            resp = self.head(url, check=False, allow_redirects=False)
            if not resp:
                return set()
            location = resp.headers.get('location')
            if not location:
                return set()
            return set(self.match_subdomains(location))
    
        def check_subdomains(self, subdomains):
            """
            检查搜索出的子域结果是否满足条件
    
            :param subdomains: 子域结果
            :return:
            """
            if not subdomains:
                # 搜索没有发现子域名则停止搜索
                return False
            if not self.full_search and subdomains.issubset(self.subdomains):
                # 在全搜索过程中发现搜索出的结果有完全重复的结果就停止搜索
                return False
            return True
    
        def recursive_subdomain(self):
            # 递归搜索下一层的子域
            # 从1开始是之前已经做过1层子域搜索了,当前实际递归层数是layer+1
            for layer_num in range(1, self.recursive_times):
                for subdomain in self.subdomains:
                    # 进行下一层子域搜索的限制条件
                    count = subdomain.count('.') - self.domain.count('.')
                    if count == layer_num:
                        yield subdomain
    
    
    def export(name, domain, content):
        file = os.path.join(str(name)+'-' + str(domain) + '.txt')
        if not os.path.exists('result'):
            os.mkdir('result')
    
        with open('result/' + file, 'a+', encoding='utf-8') as f:
            if isinstance(content, (list)):
                content = '\n'.join(content)
    
            f.write(str(content))
            f.write('\n')
    
    
    class Google(Search):
        def __init__(self, domain):
            Search.__init__(self)
            self.domain = domain
            self.module = 'Search'
            self.source = 'GoogleAPISearch'
            self.addr = 'https://www.googleapis.com/customsearch/v1'
            self.delay = 1
            self.key = 'AIzqPnTT3LR4tSuPJP2WDkOQUdxC7wOs'
            self.id = 'xxx'
            self.per_page_num = 10  # 每次只能请求10个结果
    
        def search(self, filtered_subdomain=''):
            """
            发送搜索请求并做子域匹配
    
            :param str domain: 域名
            :param str filtered_subdomain: 过滤的子域
            """
            self.page_num = 1
            data = []
            while True:
                word = 'site:' + self.domain + filtered_subdomain
                time.sleep(self.delay)
                self.header = self.get_header()
                params = {'key': self.key, 'cx': self.id,
                          'q': word, #'fields': 'items/link',
                          'start': self.page_num, 'num': self.per_page_num}
                resp = self.get(self.addr, params)
                print(resp.text)
                data.append(resp.text)
                subdomains = self.match_subdomains(resp)
                if not self.check_subdomains(subdomains):
                    break
                self.subdomains.update(subdomains)
                self.page_num += self.per_page_num
                if self.page_num > 100:  # 免费的API只能查询前100条结果
                    break
            if data:
                export(self.__class__.__name__, self.domain, data)
    
    
    '''
    替换476和477行 api-key
    输出方法在455行,455往上基本不需要管
    免费用户单次只能查询到100条数据,如果有会员,注释504行那儿
    爬取站点需要在google后台添加站点
    '''
    if __name__ == '__main__':
        file = "targets.txt"       # 此处添加文件名
        with open(file, 'r', encoding='utf-8') as f:
            for domain in f.readlines():
                google = Google(domain.strip())
                google.search()
    
  • 相关阅读:
    Value '0000-00-00' can not be represented as java.sql.Date
    mysql建表设置两个默认CURRENT_TIMESTAMP的技巧
    PowerDesigner 的mysql PDM 的COMMENT注释
    tomcat配置及优化
    Tomcat7调优及JVM性能优化for Linux环境
    maven混淆Java代码
    通过Maven配置测试环境和开发环境连接不同的数据库
    删除电脑中用强制删除不能删除的bat命令脚本
    在Linux中设置共享目录
    ftp以及smb的配置
  • 原文地址:https://www.cnblogs.com/zongdeiqianxing/p/16019258.html
Copyright © 2020-2023  润新知