• Scrapy distributed crawling with synchronous database writes


    Spider file

    The spider inherits from RedisCrawlSpider to crawl the entire site.

    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from houseBjPro.items import HousebjproItem
    from scrapy_redis.spiders import RedisCrawlSpider
    import re
    
    """
    Crawl second-hand housing listings for Beijing from 58.com (58同城).
    """
    class HousebjSpider(RedisCrawlSpider):
        name = 'houseBj'
        # allowed_domains = ['bj.58.com']
        # start_urls = ['https://bj.58.com/ershoufang/']
        # name of the scheduler queue in Redis; seed it with the start URL to begin crawling
        redis_key = 'houseBjurl'
    
        rules = (
            Rule(LinkExtractor(allow=r'ershoufang/pn\d+/'), callback='parse_item', follow=True),  # follow pagination links
        )
    
        def parse_item(self, response):
            house_list = response.xpath('//ul[@class="house-list-wrap"]/li')  # list of house entries on the page
            for h_item in house_list:
                sum_price = h_item.xpath('./div[@class="price"]/p[1]/b/text()').extract_first()
                unit_price = h_item.xpath('./div[@class="price"]/p[2]/text()').extract_first()
                title = h_item.xpath('./div[@class="list-info"]/h2/a/text()').extract_first()
                house_info = h_item.xpath('./div[@class="list-info"]/p[1]//text()').extract()
                house_loc = h_item.xpath('./div[@class="list-info"]/p[2]/span//text()').extract()
                # collapse whitespace runs into commas and drop dashes from the joined text
                house_info = re.sub(r'\s+', ',', "".join(house_info).replace('-', '').strip())
                house_loc = re.sub(r'\s+', ',', "".join(house_loc).replace('-', '').strip())
                item = HousebjproItem()
                print(sum_price)
                print(unit_price)
                print(house_loc)
                print(title)
                print(house_info)
    
                item['sum_price'] = sum_price  # total price
                item['unit_price'] = unit_price  # price per square meter
                item['title'] = title  # listing title
                item['house_info'] = house_info  # layout and floor-area info
                item['house_loc'] = house_loc  # location info
                yield item
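
    Because redis_key = 'houseBjurl', the spider sits idle until a start URL is pushed onto that Redis list. A minimal sketch of seeding the queue with redis-py (assuming Redis is reachable at the REDIS_HOST/REDIS_PORT configured in the settings below):

    import redis

    # connect to the same Redis instance the spiders read from (see REDIS_HOST / REDIS_PORT)
    r = redis.Redis(host='127.0.0.1', port=6379)
    # scrapy-redis pops start URLs from the list named by redis_key
    r.lpush('houseBjurl', 'https://bj.58.com/ershoufang/')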

    Pipeline file

    Write the items to MySQL synchronously.

    import pymysql
    
    
    class HousebjproPipeline(object):
        """
        Write items to the database synchronously.
        """
    
        def __init__(self, conn):
            self.conn = conn
            self.cursor = self.conn.cursor()
    
        @classmethod
        def from_settings(cls, settings):
    
            # pull the connection parameters out of settings and build a dict for pymysql
            dbparms = dict(
                host=settings["MYSQL_HOST"],
                db=settings["MYSQL_DBNAME"],
                port=3306,
                user=settings["MYSQL_USER"],
                password=settings["MYSQL_PASSWORD"],
                charset='utf8mb4',
                # return result rows as dictionaries
                cursorclass=pymysql.cursors.DictCursor,
                # treat strings as Unicode
                use_unicode=True
            )
    
            conn = pymysql.connect(**dbparms)
    
            return cls(conn)
    
        def process_item(self, item, spider):
            insert_sql = """
                INSERT IGNORE INTO tongcheng_content(title, sum_price, unit_price, house_info, house_loc)
                VALUES (%s, %s, %s, %s, %s)
            """
            try:
                self.cursor.execute(insert_sql, (item["title"], item["sum_price"], item["unit_price"],
                                                 item["house_info"], item["house_loc"]))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
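
    The pipeline assumes the tongcheng_content table already exists. A hedged sketch of creating it with pymysql, using the MYSQL_* placeholders from the settings below; the column types and the unique key on title are assumptions (a unique index is what lets INSERT IGNORE skip duplicate rows):

    import pymysql

    ddl = """
        CREATE TABLE IF NOT EXISTS tongcheng_content (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            sum_price VARCHAR(32),
            unit_price VARCHAR(64),
            house_info VARCHAR(255),
            house_loc VARCHAR(255),
            UNIQUE KEY uk_title (title)
        ) DEFAULT CHARSET = utf8mb4
    """

    conn = pymysql.connect(host='49.233.xx.xx', user='xx', password='xx',
                           db='spider_db', charset='utf8mb4')
    with conn.cursor() as cursor:
        cursor.execute(ddl)   # create the table the pipeline writes into
    conn.commit()
    conn.close()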

    Middleware file

    Rotate the User-Agent with a UA pool.

    from scrapy import signals
    from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
    import random
    
    # UA pool: a dedicated downloader middleware class that rotates the User-Agent,
    # built on top of Scrapy's UserAgentMiddleware
    class RandomUserAgent(UserAgentMiddleware):
    
        def process_request(self, request, spider):
            # pick a random UA string from the pool
            ua = random.choice(user_agent_list)
            # write it into the intercepted request's headers
            request.headers.setdefault('User-Agent', ua)
    
    # Swap in a random proxy IP for each intercepted request
    # (left commented out here; a completed sketch appears after the UA list below)
    # class Proxy(object):
    #     def process_request(self, request, spider):
    #         # check the scheme of the intercepted request's URL (http or https)
    #         # request.url looks like: http://www.xxx.com
    #         h = request.url.split(':')[0]  # request scheme
    #         if h == 'https':
    #             ip = random.choice(PROXY_https)
    #             request.meta['proxy'] = 'https://' + ip
    
    
    
    # PROXY_https = [
    #     '120.83.49.90:9000',
    #     '95.189.112.214:35508',
    # ]
    
    user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
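
    The commented-out Proxy middleware above only handles the https branch. Below is a minimal sketch of a complete version; the PROXY_http list and its value are hypothetical placeholders, while the https entries reuse the ones from the commented list:

    import random

    PROXY_https = ['120.83.49.90:9000', '95.189.112.214:35508']
    PROXY_http = ['1.2.3.4:8080']  # hypothetical placeholder proxies for plain-HTTP requests

    class Proxy(object):
        def process_request(self, request, spider):
            # pick the proxy pool that matches the request's scheme
            scheme = request.url.split(':')[0]
            if scheme == 'https':
                request.meta['proxy'] = 'https://' + random.choice(PROXY_https)
            else:
                request.meta['proxy'] = 'http://' + random.choice(PROXY_http)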

    Settings file

    BOT_NAME = 'houseBjPro'
    
    SPIDER_MODULES = ['houseBjPro.spiders']
    NEWSPIDER_MODULE = 'houseBjPro.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'  # default User-Agent to disguise the request
    # Obey robots.txt rules
    # ROBOTSTXT_OBEY = True
    ROBOTSTXT_OBEY = False  # do not obey robots.txt
    # only show log messages at ERROR level
    LOG_LEVEL = 'ERROR'
    
    # enable AutoThrottle to limit the crawl rate
    AUTOTHROTTLE_ENABLED = True
    # initial download delay
    AUTOTHROTTLE_START_DELAY = 5
    # maximum download delay under high latency
    AUTOTHROTTLE_MAX_DELAY = 60
    # average number of requests Scrapy sends in parallel to each remote server
    AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # base delay between downloads
    DOWNLOAD_DELAY = 3
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # DOWNLOAD_DELAY = 1
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}
    
    # Enable or disable spider middlewares
    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'houseBjPro.middlewares.HousebjproSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    
    DOWNLOADER_MIDDLEWARES = {
        'houseBjPro.middlewares.RandomUserAgent': 542,
    }
    # Enable or disable extensions
    # See https://doc.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    ITEM_PIPELINES = {
        'houseBjPro.pipelines.HousebjproPipeline': 300,
        'scrapy_redis.pipelines.RedisPipeline': 400,
    }
    
    REDIS_HOST = '127.0.0.1'  # address of the shared Redis server; 127.0.0.1 only works when Redis runs on this machine
    REDIS_PORT = 6379
    REDIS_ENCODING = 'utf-8'
    
    MYSQL_HOST = '49.233.xx.xx'
    MYSQL_DBNAME = 'spider_db'
    MYSQL_USER = 'xx'
    MYSQL_PASSWORD = 'xx'
    
    
    
    # Dedup filter backed by Redis: request fingerprints are stored in a Redis set, so de-duplication is shared and persistent
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # use the scheduler that ships with scrapy-redis
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # whether the scheduler state persists: if True, the request queue and fingerprint set in Redis are kept when the crawl ends; if False they are cleared
    SCHEDULER_PERSIST = True
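
    With these settings, scrapy-redis keeps its state in Redis under keys derived from the spider name; by default these are 'houseBj:requests' (the shared request queue, a sorted set for the default priority queue), 'houseBj:dupefilter' (the fingerprint set) and, through RedisPipeline, 'houseBj:items'. A small sketch for inspecting that state from any machine, assuming redis-py and the default key names:

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379)

    print(r.zcard('houseBj:requests'))    # pending requests (kept after the run because SCHEDULER_PERSIST = True)
    print(r.scard('houseBj:dupefilter'))  # request fingerprints used for de-duplication
    print(r.llen('houseBj:items'))        # serialized items pushed by scrapy_redis.pipelines.RedisPipeline
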
• Original post: https://www.cnblogs.com/xiao-apple36/p/12764936.html