• Scrapy: automatically crawl the latest Danke Apartment (蛋壳公寓) listings and store them in a SQL database


    Use Scrapy to crawl rental listings from Danke Apartment (蛋壳公寓), taking Beijing as an example. Target URL: https://www.dankegongyu.com/room/bj

    Approach

    New listings always show up on the first page, so the idea is to crawl the first page at a fixed interval and pick up whatever is new.

    Use the set data structure in Redis: after each crawl, store the crawled URLs in Redis.

    For every request, compare the request URL against the URLs already in Redis. If the URL is already there, the listing has been seen before and the request is ignored; if it is not, the listing is new, so crawl it and add its URL to Redis.

    Inspecting the page source shows that the site is a static page. First collect the URL of every listing on the first page, then request each URL to open the detail page; all fields are extracted from the detail page.
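
    A minimal sketch of the set-based dedup idea, assuming a Redis server reachable on localhost (db=3, the key name dk_url and the MD5 hashing mirror the middleware shown later; the sample URL is made up):

    import hashlib
    import redis

    # Assumes a local Redis server; db=3 and the key 'dk_url' mirror the middleware below
    r = redis.StrictRedis(host='localhost', port=6379, db=3)

    def is_new_url(url):
        """Return True the first time a URL is seen, False on every later call."""
        url_md5 = hashlib.md5(url.encode()).hexdigest()
        # sadd returns 1 when the member was added (new URL), 0 when it already existed
        return r.sadd('dk_url', url_md5) == 1

    print(is_new_url('https://www.dankegongyu.com/room/1234567.html'))  # True  (first time)
    print(is_new_url('https://www.dankegongyu.com/room/1234567.html'))  # False (already seen)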

    Implementation

    Define the item fields

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    class DankeItem(scrapy.Item):
        """
        Fields to be scraped
        """
        # Data source (spider name)
        source = scrapy.Field()
        # Crawl time (UTC)
        utc_time = scrapy.Field()
    
        # Room name
        room_name = scrapy.Field()
        # Rent
        room_money = scrapy.Field()
        # Room area
        room_area = scrapy.Field()
        # Room number
        room_numb = scrapy.Field()
        # Layout
        room_type = scrapy.Field()
        # Rental type (whole flat / shared)
        rent_type = scrapy.Field()
        # Floor
        room_floor = scrapy.Field()
        # District
        room_loca = scrapy.Field()
        # Residential complex
        estate_name = scrapy.Field()
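
    A scrapy.Item is filled and read like a dict, which is what the pipeline shown later relies on when it calls dict(item). A quick illustration with made-up values:

    # Items behave like dicts; dict(item) converts to a plain dict
    item = DankeItem()
    item['room_name'] = '某小区 3居室-01卧'
    item['room_money'] = '2460'
    print(item['room_money'])   # 2460
    print(dict(item))           # {'room_name': '某小区 3居室-01卧', 'room_money': '2460'}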

    Write the spider

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from danke.items import DankeItem
    
    
    class DankeSpider(CrawlSpider):
    
        # Spider name
        name = 'dkgy3'
    
        # Domains the spider is allowed to crawl
        allowed_domains = ['dankegongyu.com']
    
        custom_settings = {'DOWNLOAD_DELAY': 0.2}
    
        # Start URL (the Beijing listing page described above)
        start_urls = ['https://www.dankegongyu.com/room/bj']
    
    
        # rules attribute
        rules = (
    
            # Rule for detail-page links; links found on detail pages are not followed further
            Rule(LinkExtractor(allow=r'https://www\.dankegongyu\.com/room/\d+'), callback='parse_detail', follow=False),
        )
    
        def parse_detail(self, response):
            """
            Parse the detail page
            :param response:
            :return:
            """
            node_list = response.xpath('//div[@class="room-detail-right"]')
            for node in node_list:
                item = DankeItem()
    
                # Room name
                room_name = node.xpath('./div/h1/text()')
                item['room_name'] = room_name.extract_first()
    
                # Rent
                room_money = node.xpath('./div[@class="room-price"]/div/span').xpath('string(.)').extract_first()
    
                # Some listings show a discounted first-month rent, whose markup differs
                # from the normal price, so a second XPath is needed as a fallback
                if room_money:
                    item['room_money'] = room_money
                else:
                    room_money = node.xpath('./div[@class="room-price hot"]/div/div[@class="room-price-num"]/text()').extract_first()
                    item['room_money'] = room_money
                    print(room_money)
    
                # Room area (the label text looks like "面积：xx", so split on the full-width colon)
                room_area = node.xpath('./*/div[@class="room-detail-box"]/div[1]/label/text()').extract_first().split('：')[-1]
                item['room_area'] = room_area
    
                # Room number
                room_numb = node.xpath('./*/div[@class="room-detail-box"]/div[2]/label/text()').extract_first().split('：')[-1]
                item['room_numb'] = room_numb
    
                # Layout
                room_type = node.xpath('./*/div[@class="room-detail-box"]/div[3]/label/text()').extract_first().split('：')[-1]
                item['room_type'] = room_type
    
                # Rental type (whole flat / shared)
                rent_type = node.xpath('./*/div[@class="room-detail-box"]/div[3]/label/b/text()').extract_first().split('：')[-1]
                item['rent_type'] = rent_type
    
                # Floor
                room_floor = node.xpath('./div[@class="room-list-box"]/div[2]/div[2]').xpath('string(.)').extract_first().split('：')[-1]
                item['room_floor'] = room_floor
    
                # District
                room_loca = node.xpath('./div[@class="room-list-box"]/div[2]/div[3]/label/div/a[1]/text()').extract_first()
                item['room_loca'] = room_loca
    
                # Residential complex
                estate_name = node.xpath('./div[@class="room-list-box"]/div[2]/div[3]/label/div/a[3]/text()').extract_first()
                item['estate_name'] = estate_name
    
                yield item
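
    The XPaths above are tied to the layout of Danke's detail page. The extraction pattern they rely on, xpath('string(.)') to flatten nested text plus split('：') to keep the value after a "label：value" pair, can be tried in isolation with the sketch below; the HTML is a simplified stand-in, not the site's real markup:

    from scrapy import Selector

    # Simplified stand-in for a detail page; the real markup is more complex
    html = '''
    <div class="room-detail-box">
        <div><label>面积：12.3㎡</label></div>
        <div><label>编号：100012345</label></div>
    </div>
    <div class="room-floor">楼层：<b>3</b>/6层</div>
    '''

    sel = Selector(text=html)

    # Plain text node: take the label text and keep what follows the full-width colon
    area = sel.xpath('//div[@class="room-detail-box"]/div[1]/label/text()').extract_first().split('：')[-1]

    # string(.) concatenates all descendant text, so the nested <b> does not break extraction
    floor = sel.xpath('//div[@class="room-floor"]').xpath('string(.)').extract_first().split('：')[-1]

    print(area)    # 12.3㎡
    print(floor)   # 3/6层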

    Write the downloader middlewares

    The downloader middlewares implement two things: attaching a random User-Agent header to each request, and recording crawled URLs in Redis.

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
    import time
    import random
    import hashlib
    import redis
    from scrapy.exceptions import IgnoreRequest
    from danke.settings import USER_AGENTS as ua
    
    class DankeSpiderMiddleware(object):
        def process_request(self, request, spider):
            """
            Attach a randomly chosen User-Agent header to every request
            :param request:
            :param spider:
            :return:
            """
            user_agent = random.choice(ua)
            request.headers['User-Agent'] = user_agent
    
    class DankeRedisMiddleware(object):
        """
        Store the URL of every listing on the first page in a Redis set,
        so the same listing is never crawled twice
        """
        # Connect to Redis
        def __init__(self):
            self.redis = redis.StrictRedis(host='39.106.116.21', port=6379, db=3)
    
        def process_request(self, request, spider):
    
            # Only detail-page links are recorded in Redis
            if request.url.endswith(".html"):
                # MD5 of the detail-page URL
                url_md5 = hashlib.md5(request.url.encode()).hexdigest()
    
                # sadd returns 1 if the URL was newly added, 0 if it was already in the set
                result = self.redis.sadd('dk_url', url_md5)
    
                # 0 means the link has been crawled before, so ignore this request
                if not result:
                    raise IgnoreRequest
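
    Neither middleware runs unless it is enabled in settings.py. A sketch of the relevant settings, assuming the classes live in danke/middlewares.py and the pipelines below in danke/pipelines.py; the priority numbers and User-Agent strings are only examples:

    # settings.py (sketch) -- module paths, priorities and UA strings are assumptions
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    ]

    DOWNLOADER_MIDDLEWARES = {
        'danke.middlewares.DankeSpiderMiddleware': 543,
        'danke.middlewares.DankeRedisMiddleware': 544,
    }

    ITEM_PIPELINES = {
        'danke.pipelines.DankeSourcePipeline': 300,
        'danke.pipelines.DankePipeline': 301,
    }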

    Store the data

    # -*- coding: utf-8 -*-
    
    from datetime import datetime
    import pymysql
    
    class DankeSourcePipeline(object):
        def process_item(self, item, spider):
            item['source'] = spider.name
            item['utc_time'] = str(datetime.utcnow())
            return item
    
    class DankePipeline(object):
    
        def __init__(self):
    
            self.conn = pymysql.connect(
                host='39.106.116.21',
                port=3306,
                database='***',
                user='***',
                password='****',
                charset='utf8'
            )
            # Create a cursor
            self.cursor = self.conn.cursor()
    
        def process_item(self, item, spider):
    
            sql = ("insert into result_latest(标题, 租金, 面积, "
                   "编号, 户型, 出租方式, 楼层, "
                   "区域, 楼盘, 抓取时间, 数据来源)"
                   "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    
            item = dict(item)
    
            data = [
                    item['room_name'],
                    item['room_money'],
                    item['room_area'],
                    item['room_numb'],
                    item['room_type'],
                    item['rent_type'],
                    item['room_floor'],
                    item['room_loca'],
                    item['estate_name'],
                    item['utc_time'],
                    item['source'],
                    ]
            self.cursor.execute(sql, data)
            # Commit the insert
            self.conn.commit()
    
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
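
    The INSERT above assumes that a result_latest table with these Chinese column names already exists. A one-off creation sketch, kept in Python to match the rest of the project; the column types and sizes are guesses, not taken from the original project:

    # One-off table creation (sketch). Column names match the INSERT above;
    # the VARCHAR sizes are assumptions.
    import pymysql

    conn = pymysql.connect(host='39.106.116.21', port=3306, database='***',
                           user='***', password='****', charset='utf8')
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS result_latest (
                id INT AUTO_INCREMENT PRIMARY KEY,
                标题 VARCHAR(255),
                租金 VARCHAR(64),
                面积 VARCHAR(64),
                编号 VARCHAR(64),
                户型 VARCHAR(64),
                出租方式 VARCHAR(32),
                楼层 VARCHAR(64),
                区域 VARCHAR(128),
                楼盘 VARCHAR(128),
                抓取时间 VARCHAR(64),
                数据来源 VARCHAR(64)
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
    conn.close()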

    Automate the crawl

    import os
    import time
    
    while True:
        """
        Re-crawl every 20*60*60 seconds (20 hours) to keep the data up to date.
        os.system launches a fresh Scrapy process on each pass, which sidesteps the
        fact that the Twisted reactor cannot be restarted within a single process.
        """
        os.system("scrapy crawl dkgy3")
        time.sleep(20*60*60)
    
    
    # Alternative way to start the spider from a script (runs only once per process):
    # from scrapy import cmdline
    # cmdline.execute("scrapy crawl dkgy3".split())

    Full code

    See: https://github.com/zInPython/danke

  • Original post: https://www.cnblogs.com/pythoner6833/p/9157431.html