• Common Scrapy configurations


    Setting a random User-Agent

    # Install: pip install fake-useragent
    # middlewares.py
    from fake_useragent import UserAgent

    class RandomUserAgentMiddleware(object):
        # Set a random User-Agent on every request
        def __init__(self, crawler):
            super(RandomUserAgentMiddleware, self).__init__()
            self.ua = UserAgent()

        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def process_request(self, request, spider):
            request.headers.setdefault("User-Agent", self.ua.random)
    # ...
    # settings.py: enable RandomUserAgentMiddleware
    DOWNLOADER_MIDDLEWARES = {
        'yourProjectName.middlewares.RandomUserAgentMiddleware': 312,
    }
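
    To make sure the stock header never slips through, you can also disable Scrapy's built-in UserAgentMiddleware in the same dict. A minimal settings.py sketch (the project name is a placeholder):

    DOWNLOADER_MIDDLEWARES = {
        'yourProjectName.middlewares.RandomUserAgentMiddleware': 312,
        # The built-in middleware is switched off so only the random one runs
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }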
    

    Setting a proxy

    # This example uses the Abuyun tunnel proxy
    # middlewares.py
    import base64

    # Proxy server
    proxyServer = "http://http-dyn.abuyun.com:9020"

    # Tunnel authentication credentials
    proxyUser = "your proxyUser"
    proxyPass = "your proxyPass"

    proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((proxyUser + ":" + proxyPass), "ascii")).decode("utf8")

    class ProxyMiddleware(object):
        def process_request(self, request, spider):
            request.meta["proxy"] = proxyServer
            request.headers["Proxy-Authorization"] = proxyAuth

    # ...
    # settings.py: enable ProxyMiddleware
    DOWNLOADER_MIDDLEWARES = {
         'yourProjectName.middlewares.ProxyMiddleware': 100,
    }
    
    

    Downloading images

    # pipelines.py
    import re

    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline

    class ImagePipeline(ImagesPipeline):
        # Add header fields here if the target site checks them
        headers = {}

        def get_media_requests(self, item, info):
            # Download every image URL; if the field holds a single URL
            # instead of a list, yield one Request without the loop
            for image_url in item['imgurl']:
                # Copy the dict so the shared class attribute is not mutated
                headers = dict(self.headers)
                headers['Referer'] = item['from_url']
                # meta carries spider data down to file_path below
                yield Request(image_url, headers=headers, meta={'foldername': item['foldername'], 'imgname': item['imgname']})

        # Rename saved files; without this override the file name is a URL hash
        def file_path(self, request, response=None, info=None, *, item=None):
            pic_format = request.url.split(".")[-1]
            imgname = request.meta['imgname']
            # Gallery name passed along via meta
            foldername = request.meta['foldername']
            # Strip characters Windows forbids in file names
            foldername = re.sub(r'[\\/:*?"<>|]', '', foldername)
            return f'{foldername}/{imgname}.{pic_format}'

    # ...
    # settings.py: enable ImagePipeline (pipelines go in ITEM_PIPELINES,
    # not DOWNLOADER_MIDDLEWARES)
    ITEM_PIPELINES = {
         'yourProjectName.pipelines.ImagePipeline': 200,
    }
    # ImagesPipeline also needs a storage root and Pillow (pip install pillow)
    IMAGES_STORE = './images'
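
    For reference, a minimal sketch of the item this pipeline expects. The field names match the code above; the class name is made up:

    # items.py -- hypothetical item with the fields ImagePipeline reads
    import scrapy

    class GalleryItem(scrapy.Item):
        imgurl = scrapy.Field()      # list of image URLs
        from_url = scrapy.Field()    # page the images came from, used as Referer
        foldername = scrapy.Field()  # gallery name, becomes the subdirectory
        imgname = scrapy.Field()     # file name without the extension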
    

    Asynchronous writes to MySQL

    # pipelines.py
    import pymysql
    from twisted.enterprise import adbapi

    # These constants must be defined in settings.py
    from yourProjectName.settings import MYSQL_DBNAME, MYSQL_HOST, MYSQL_PASSWORD, MYSQL_PORT, MYSQL_USER


    class MysqlTwistedPipeline(object):
        """
        Write items to MySQL asynchronously via Twisted's adbapi pool
        """
        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            dbparms = dict(
                host=MYSQL_HOST,
                db=MYSQL_DBNAME,
                user=MYSQL_USER,
                passwd=MYSQL_PASSWORD,
                port=MYSQL_PORT,
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
                use_unicode=False,
            )
            dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
            return cls(dbpool)

        def process_item(self, item, spider):
            query = self.dbpool.runInteraction(self.do_insert, item)
            # Failures arrive on the Deferred, so attach an errback, not a callback
            query.addErrback(self.handle_error)
            return item

        def handle_error(self, failure):
            print("MysqlTwistedPipeline error is:", failure)

        def do_insert(self, cursor, item):
            insert_sql = """insert_sql"""
            # Some item fields may need escaping with pymysql.escape_string();
            # parameterized %s placeholders are usually the safer choice
            cursor.execute(insert_sql, ())

    # ...
    # settings.py: enable MysqlTwistedPipeline (pipelines go in ITEM_PIPELINES)
    ITEM_PIPELINES = {
         'yourProjectName.pipelines.MysqlTwistedPipeline': 200,
    }
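
    To make do_insert concrete, here is a hypothetical version for a table article(title, url); the table and columns are made up, so adapt the SQL and the parameter tuple to your own schema:

        def do_insert(self, cursor, item):
            insert_sql = """
                INSERT INTO article (title, url)
                VALUES (%s, %s)
            """
            # Parameterized %s placeholders let the driver escape values safely
            cursor.execute(insert_sql, (item['title'], item['url']))

    The MYSQL_* constants imported above are assumed to live in settings.py, for example:

    MYSQL_HOST = '127.0.0.1'
    MYSQL_PORT = 3306
    MYSQL_DBNAME = 'your_db'
    MYSQL_USER = 'your_user'
    MYSQL_PASSWORD = 'your_password'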
    
  • Original post: https://www.cnblogs.com/zhangxuel1ang/p/13174463.html