• scrapy常用配置


    设置随机请求头

    # 安装 pip install fake-useragent
    # middleware.py
    from fake_useragent import UserAgent


    class RandomUserAgentMiddlware(object):
        """Downloader middleware that puts a random User-Agent on each request."""

        def __init__(self, crawler):
            super(RandomUserAgentMiddlware, self).__init__()
            # fake-useragent returns a fresh realistic UA string every time
            # the .random property is read.
            self.ua = UserAgent()

        @classmethod
        def from_crawler(cls, crawler):
            # Scrapy factory hook: build the middleware from the crawler.
            return cls(crawler)

        def process_request(self, request, spider):
            # setdefault: only fill in the UA if the spider did not set one.
            request.headers.setdefault("User-Agent", self.ua.random)
    # ...
    # settings.py: enable RandomUserAgentMiddlware
    DOWNLOADER_MIDDLEWARES = {
        "yourProjectName.middlewares.RandomUserAgentMiddlware": 312,
    }
    

    设置代理

    # 这里使用的是阿布云隧道代理
    # middleware.py
    import base64

    # Proxy endpoint (Abuyun dynamic tunnel proxy)
    proxyServer = "http://http-dyn.abuyun.com:9020"

    # Tunnel credentials
    proxyUser = "your proxyUser"
    proxyPass = "your proxyPass"

    # HTTP Basic auth (RFC 7617) uses the *standard* Base64 alphabet.
    # urlsafe_b64encode emits '-'/'_' instead of '+'/'/' and would corrupt
    # credentials whose encoding contains bytes 62/63, so use b64encode.
    proxyAuth = "Basic " + base64.b64encode(
        (proxyUser + ":" + proxyPass).encode("ascii")
    ).decode("utf8")


    class ProxyMiddleware(object):
        """Downloader middleware routing every request through the tunnel proxy."""

        def process_request(self, request, spider):
            # Point the request at the proxy and attach the auth header.
            request.meta["proxy"] = proxyServer
            request.headers["Proxy-Authorization"] = proxyAuth
    
    # ...
    # settings.py: enable ProxyMiddleware
    # (the original comment wrongly said RandomUserAgentMiddlware — this
    # entry registers the proxy middleware)
    DOWNLOADER_MIDDLEWARES = {
        'yourProjectName.middlewares.ProxyMiddleware': 100,
    }
    
    

    图片下载

    # pipelines.py
    import re

    from scrapy import Request
    from scrapy.pipelines.images import ImagesPipeline


    class ImagePipeline(ImagesPipeline):
        """Image pipeline that adds a Referer header per item and stores files
        as <foldername>/<imgname>.<ext> instead of the default hash names."""

        # Extra headers for image requests; add entries here if the site
        # rejects requests without them.  NOTE: the original code used a set
        # literal containing a docstring ({"""..."""}), which made the later
        # headers['Referer'] assignment a TypeError — this must be a dict.
        headers = {}

        def get_media_requests(self, item, info):
            # One download request per image URL; if the item carries a single
            # URL instead of a collection, yield it directly without the loop.
            for image_url in item['imgurl']:
                # Copy so we never mutate the class-level dict shared by all items.
                headers = dict(self.headers)
                headers['Referer'] = item['from_url']
                # meta carries the spider-provided values down to file_path()
                yield Request(image_url, headers=headers,
                              meta={'foldername': item['foldername'], 'imgname': item['imgname']})

        def file_path(self, request, response=None, info=None):
            """Build the storage path; without this override names are hashes."""
            pic_format = request.url.split(".")[-1]
            imgname = request.meta['imgname']
            # Album/folder name received through meta
            foldername = request.meta['foldername']
            # Strip characters Windows forbids in file names
            foldername = re.sub(r'[?\*|“<>:/]', '', foldername)
            return fr'{foldername}/{imgname}.{pic_format}'
       
    # ...
    # settings.py: enable ImagePipeline.  Item pipelines are registered in
    # ITEM_PIPELINES — the original snippet wrongly used
    # DOWNLOADER_MIDDLEWARES, under which a pipeline is never invoked.
    ITEM_PIPELINES = {
        'yourProjectName.pipelines.ImagePipeline': 200,
    }
    

    异步写入MySQL

    # pipelines.py
    import pymysql
    from twisted.enterprise import adbapi

    from yourProjectName.settings import MYSQL_DBNAME, MYSQL_HOST, MYSQL_PASSWORD, MYSQL_PORT, MYSQL_USER


    class MysqlTwistedPipeline(object):
        """
        Write items to MySQL asynchronously via Twisted's adbapi thread pool.
        """

        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, setting):
            # Connection parameters for pymysql; adbapi forwards them verbatim.
            dbparms = dict(
                host=MYSQL_HOST,
                db=MYSQL_DBNAME,
                user=MYSQL_USER,
                passwd=MYSQL_PASSWORD,
                port=MYSQL_PORT,
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
                # NOTE(review): use_unicode=False together with charset='utf8'
                # makes pymysql return bytes, not str — confirm this is intended.
                use_unicode=False,
            )
            dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
            return cls(dbpool)

        def process_item(self, item, spider):
            query = self.dbpool.runInteraction(self.do_insert, item)
            # Errors must be attached with addErrback — the original used
            # addCallback, which only fires on *success*, so every insert
            # failure was silently dropped.
            query.addErrback(self.handle_error)
            # Return the item so later pipelines still receive it.
            return item

        def handle_error(self, failure):
            # Log the failure; adbapi has already rolled the interaction back.
            print("MysqlTwistedPipeline error is :", failure)

        def do_insert(self, cursor, item):
            # Replace with a real INSERT statement and parameter tuple;
            # parameterized queries avoid SQL injection (pymysql.escape_string
            # is only for fields that cannot be parameterized).
            insert_sql = """insert_sql"""
            cursor.execute(insert_sql, ())
    
    # ...
    # settings.py: enable MysqlTwistedPipeline (the original comment named
    # ImagePipeline by mistake).  Item pipelines belong in ITEM_PIPELINES,
    # not DOWNLOADER_MIDDLEWARES.
    ITEM_PIPELINES = {
        'yourProjectName.pipelines.MysqlTwistedPipeline': 200,
    }
    
  • 相关阅读:
    JavaScript学习总结(十一)——Object类详解
    在mysql命令行下执行sql文件
    canal HA配置
    canal 常用配置
    canal 监控数据库表 快速使用
    HashMap 因子对性能的影响
    JVM 调优
    JVM jstat 详解
    Tomcat 异常关闭排查
    Mysql canal 监控数据变化
  • 原文地址:https://www.cnblogs.com/zhangxuel1ang/p/13174463.html
Copyright © 2020-2023  润新知