• Testing the proxy-pool crawler source code - Python3WebSpider


    Using attributes generated by a metaclass

    Source:
    https://github.com/Python3WebSpider/ProxyPool/blob/master/proxypool/crawler.py

    The key point is the use of a metaclass:
    when the crawler class is created, the metaclass scans the class namespace and collects the crawl methods (every attribute whose name contains the crawl_ prefix) into a list attribute, so that they can be called one after another. The goal is that supporting a new site only requires adding another crawl method; nothing else in the class needs to change. A small sketch of how the generated attributes are consumed follows the excerpt below.

    Code excerpt:

    
    class ProxyMetaclass(type):
        def __new__(cls, name, bases, attrs):
            # attrs is the namespace of the class being created; collect the
            # names of all crawl methods (anything containing 'crawl_') into
            # __CrawlFunc__ and record how many there are.
            count = 0
            attrs['__CrawlFunc__'] = []
            for k, v in attrs.items():
                if 'crawl_' in k:
                    attrs['__CrawlFunc__'].append(k)
                    count += 1
            attrs['__CrawlFuncCount__'] = count
            return type.__new__(cls, name, bases, attrs)
    
    
    class Crawler(object, metaclass=ProxyMetaclass):
        def get_proxies(self, callback):
            # callback is the name of a crawl method collected by the metaclass;
            # eval builds and runs the call, e.g. eval("self.crawl_daili66()").
            proxies = []
            for proxy in eval("self.{}()".format(callback)):
                print('成功获取到代理', proxy)
                proxies.append(proxy)
            return proxies
           
        def crawl_daili66(self, page_count=4):
            """
            Crawl the 66ip.cn free proxy list.
            :param page_count: number of pages to crawl
            :return: proxies in ip:port form
            """
            start_url = 'http://www.66ip.cn/{}.html'
            urls = [start_url.format(page) for page in range(1, page_count + 1)]
            for url in urls:
                print('Crawling', url)
                html = get_page(url)
                if html:
                    doc = pq(html)
                    trs = doc('.containerbox table tr:gt(0)').items()
                    for tr in trs:
                        ip = tr.find('td:nth-child(1)').text()
                        port = tr.find('td:nth-child(2)').text()
                        yield ':'.join([ip, port])
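
    After the metaclass has run, __CrawlFunc__ and __CrawlFuncCount__ are ordinary attributes of the Crawler class. The sketch below shows one way to consume them; it uses getattr instead of the eval call in get_proxies (both resolve to the same bound method), and the helper name iterate_crawl_funcs is made up for illustration:

    def iterate_crawl_funcs(crawler):
        # __CrawlFunc__ holds the method names collected by ProxyMetaclass,
        # so each newly added crawl_* method is picked up automatically.
        for name in crawler.__CrawlFunc__:
            method = getattr(crawler, name)   # same method eval("self.{}()") would call
            print('calling', name)
            for proxy in method():
                print('got proxy', proxy)

    # Usage (needs a working get_page; see the note after the test script):
    # iterate_crawl_funcs(Crawler())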
    
    

    Test script

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # @Time    : 12/19/19 4:10 PM
    # @Author  : yon
    # @Email   : @qq.com
    # @File    : test
    
    
    import json
    import re
    from pyquery import PyQuery as pq
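    # Note: get_page, which the crawl methods below rely on, is a page-fetching
    # helper from the original project and is not imported here. The calls that
    # need it are commented out in Getter.run, so this metaclass test runs
    # without it; a hypothetical stand-in is sketched after the script.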
    
    
    class ProxyMetaclass(type):
        def __new__(cls, name, bases, attrs):
            count = 0
            attrs['__CrawlFunc__'] = []
            for k, v in attrs.items():
                print("打印k")
                print(k)
                print("打印v")
                print(v)
                if 'crawl_' in k:
                    attrs['__CrawlFunc__'].append(k)
                    count += 1
            attrs['__CrawlFuncCount__'] = count
            return type.__new__(cls, name, bases, attrs)
    
    
    class Crawler(object, metaclass=ProxyMetaclass):
        def get_proxies(self, callback):
            proxies = []
            for proxy in eval("self.{}()".format(callback)):
                print('成功获取到代理', proxy)
                proxies.append(proxy)
            return proxies
    
        def crawl_daili66(self, page_count=4):
            """
            Crawl the 66ip.cn free proxy list.
            :param page_count: number of pages to crawl
            :return: proxies in ip:port form
            """
            start_url = 'http://www.66ip.cn/{}.html'
            urls = [start_url.format(page) for page in range(1, page_count + 1)]
            for url in urls:
                print('Crawling', url)
                html = get_page(url)
                if html:
                    doc = pq(html)
                    trs = doc('.containerbox table tr:gt(0)').items()
                    for tr in trs:
                        ip = tr.find('td:nth-child(1)').text()
                        port = tr.find('td:nth-child(2)').text()
                        yield ':'.join([ip, port])
    
        def crawl_ip3366(self):
            for page in range(1, 4):
                start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
                html = get_page(start_url)
                ip_address = re.compile(r'<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                # \s* matches the whitespace and line breaks between the tags
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')
    
        def crawl_kuaidaili(self):
            for i in range(1, 4):
                start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
                html = get_page(start_url)
                if html:
                    ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                    re_ip_address = ip_address.findall(html)
                    port = re.compile('<td data-title="PORT">(.*?)</td>')
                    re_port = port.findall(html)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')
    
        def crawl_xicidaili(self):
            for i in range(1, 3):
                start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                    'Host': 'www.xicidaili.com',
                    'Referer': 'http://www.xicidaili.com/nn/3',
                    'Upgrade-Insecure-Requests': '1',
                }
                html = get_page(start_url, options=headers)
                if html:
                    find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                    trs = find_trs.findall(html)
                    for tr in trs:
                        find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                        re_ip_address = find_ip.findall(tr)
                        find_port = re.compile(r'<td>(\d+)</td>')
                        re_port = find_port.findall(tr)
                        for address, port in zip(re_ip_address, re_port):
                            address_port = address + ':' + port
                            yield address_port.replace(' ', '')
    
        def crawl_ip3366(self):
            # NOTE: this second definition of crawl_ip3366 replaces the one
            # above, which is why the name appears only once in __CrawlFunc__.
            for i in range(1, 4):
                start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
                html = get_page(start_url)
                if html:
                    find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                    trs = find_tr.findall(html)
                    for s in range(1, len(trs)):
                        find_ip = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
                        re_ip_address = find_ip.findall(trs[s])
                        find_port = re.compile(r'<td>(\d+)</td>')
                        re_port = find_port.findall(trs[s])
                        for address, port in zip(re_ip_address, re_port):
                            address_port = address + ':' + port
                            yield address_port.replace(' ', '')
    
        def crawl_iphai(self):
            start_url = 'http://www.iphai.com/'
            html = get_page(start_url)
            if html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(html)
                for s in range(1, len(trs)):
                    find_ip = re.compile(r'<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile(r'<td>\s+(\d+)\s+</td>', re.S)
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')
    
        def crawl_data5u(self):
            start_url = 'http://www.data5u.com/free/gngn/index.shtml'
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
                'Host': 'www.data5u.com',
                'Referer': 'http://www.data5u.com/free/index.shtml',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
            }
            html = get_page(start_url, options=headers)
            if html:
                ip_address = re.compile(r'<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')
    
    
    class Getter():
        def __init__(self):
            self.crawler = Crawler()
    
        def run(self):
            print('获取器开始执行')
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                print(callback_label)
                callback = self.crawler.__CrawlFunc__[callback_label]
                print(callback)
                # In the full project the following lines would fetch the
                # proxies and store them via a Redis client; they stay
                # commented out so this test only exercises the metaclass.
                # proxies = self.crawler.get_proxies(callback)
                # sys.stdout.flush()
                # for proxy in proxies:
                #     self.redis.add(proxy)
    
    
    if __name__ == '__main__':
        get = Getter()
        get.run()
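
    The test above never fetches a page because get_proxies is not called. To actually exercise the crawl methods, the script would also need the original project's get_page helper or a stand-in. A minimal, hypothetical requests-based sketch (the signature simply mirrors how the crawl methods call it above):

    import requests


    def get_page(url, options=None):
        """Hypothetical stand-in for the project's get_page helper: fetch a URL
        and return its text, or None on failure."""
        headers = {'User-Agent': 'Mozilla/5.0'}
        if options:
            headers.update(options)
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass
        return None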
    
    

    Test output

    /home/baixiaoxu/PycharmProjects/pytthon-tt/venv/bin/python /home/baixiaoxu/PycharmProjects/pytthon-tt/proxypool/test.py
    打印k
    __module__
    打印v
    __main__
    打印k
    __qualname__
    打印v
    Crawler
    打印k
    get_proxies
    打印v
    <function Crawler.get_proxies at 0x7f905ca5a598>
    打印k
    crawl_daili66
    打印v
    <function Crawler.crawl_daili66 at 0x7f905ca5a620>
    打印k
    crawl_ip3366
    打印v
    <function Crawler.crawl_ip3366 at 0x7f905ca5a840>
    打印k
    crawl_kuaidaili
    打印v
    <function Crawler.crawl_kuaidaili at 0x7f905ca5a730>
    打印k
    crawl_xicidaili
    打印v
    <function Crawler.crawl_xicidaili at 0x7f905ca5a7b8>
    打印k
    crawl_iphai
    打印v
    <function Crawler.crawl_iphai at 0x7f905ca5a6a8>
    打印k
    crawl_data5u
    打印v
    <function Crawler.crawl_data5u at 0x7f905ca5a8c8>
    打印k
    __CrawlFunc__
    打印v
    ['crawl_daili66', 'crawl_ip3366', 'crawl_kuaidaili', 'crawl_xicidaili', 'crawl_iphai', 'crawl_data5u']
    获取器开始执行
    0
    crawl_daili66
    1
    crawl_ip3366
    2
    crawl_kuaidaili
    3
    crawl_xicidaili
    4
    crawl_iphai
    5
    crawl_data5u
    
    Process finished with exit code 0
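
    Note that the output lists six crawl functions and shows crawl_ip3366 only once, even though the test class defines it twice: a class body is collected into a dict-like namespace, so the second definition replaces the first (keeping the key's original position) before the metaclass ever sees attrs. A tiny illustration of the same effect:

    class Demo:
        def crawl_a(self):
            return 'first'

        def crawl_a(self):      # replaces the definition above
            return 'second'


    print([k for k in Demo.__dict__ if k.startswith('crawl_')])  # ['crawl_a']
    print(Demo().crawl_a())                                      # second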
    
    
    
  • Original article: https://www.cnblogs.com/g2thend/p/12069968.html