• [Scrapy in Practice] Scraping Anjuke Guangzhou New Property Data


    Requirement: scrape the data under Anjuke - Guangzhou - New Properties, down to a number of fields on each property's detail page.

    Difficulty: property types vary widely (residential, villa, mixed commercial-residential, shops, office buildings), and different types use different field names. Even within a single type such as residential, listings fall into different states, for example off-plan on sale, completed and on sale, awaiting sale, and final units; the other types behave similarly. So the fields cannot be fixed in advance.

    Solution: two approaches came to mind. First, define no fields in items.py and let the spider detect the fields as it crawls (i.e. keep whatever fields a page happens to have), then return a dict and store that. Second, write separate extraction rules for each kind of page, which is clearly not worth it. I went with the first approach. If you have other ideas, feel free to share.

    Target site: http://gz.fang.anjuke.com/ (the property listings under this page).

    Example property detail URL: http://gz.fang.anjuke.com/loupan/canshu-298205.html?from=loupan_tab

    Now for the Scrapy code. The project setup steps are skipped.

    1. count.py

    # -*- coding: utf-8 -*-
    __author__ = 'Oscar_Yang'
    """
        count.py: monitor how many documents the crawl has written to MongoDB.
    """
    import time
    import pymongo

    client = pymongo.MongoClient("localhost", 27017)
    db = client["SCRAPY_anjuke_gz"]
    sheet = db["anjuke_doc1"]

    # print the current document count every 3 seconds
    while True:
        print(sheet.find().count())
        print("____________________________________")
        time.sleep(3)
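
    A small caveat: on pymongo 4.x the Cursor.count() call used above has been removed. A sketch of the same loop against the newer API (count_documents has been available since pymongo 3.7):

    # variant of the loop above for newer pymongo releases
    while True:
        print(sheet.count_documents({}))  # counts all documents in the collection
        print("____________________________________")
        time.sleep(3)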

    1 """
    2     entrypoint.py
    3 """
    4 from scrapy.cmdline import execute
    5 execute(['scrapy', 'crawl', 'anjuke_gz'])

    Then settings.py. The MONGODB_* keys are custom settings that the pipeline reads later; robots.txt is ignored and HTTP caching is enabled.

    # -*- coding: utf-8 -*-
    """
        settings.py
    """

    # Scrapy settings for anjuke_gz project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

    BOT_NAME = 'anjuke_gz'

    SPIDER_MODULES = ['anjuke_gz.spiders']
    NEWSPIDER_MODULE = 'anjuke_gz.spiders'

    # custom MongoDB settings, read by the pipeline
    MONGODB_HOST = "127.0.0.1"
    MONGODB_PORT = 27017
    MONGODB_DBNAME = "SCRAPY_anjuke_gz"
    MONGODB_DOCNAME = "anjuke_doc1"

    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'anjuke_gz (+http://www.yourdomain.com)'

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False

    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    #DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #}

    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'anjuke_gz.middlewares.AnjukeGzSpiderMiddleware': 543,
    #}

    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'anjuke_gz.middlewares.MyCustomDownloaderMiddleware': 543,
    #}

    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'anjuke_gz.pipelines.AnjukeGzPipeline': 300,
    }

    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False

    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    HTTPCACHE_ENABLED = True
    HTTPCACHE_EXPIRATION_SECS = 0
    HTTPCACHE_DIR = 'httpcache'
    HTTPCACHE_IGNORE_HTTP_CODES = []
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

    Next comes items.py. Since no fields are defined, it stays as the default generated code.

    # -*- coding: utf-8 -*-

    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html

    import scrapy


    class AnjukeGzItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        pass
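
    Since Scrapy happily accepts plain dicts from the spider, leaving items.py untouched works fine. As an aside, a possible alternative (just a sketch, not used in this project) is an Item subclass that registers fields on the fly, so the Item machinery can still be used even though the field names are only known at crawl time:

    import scrapy


    class FlexibleItem(scrapy.Item):
        """Hypothetical item that accepts arbitrary field names at assignment time."""

        def __setitem__(self, key, value):
            if key not in self.fields:
                # register the unknown field on the fly
                # (note: fields is shared at class level across instances)
                self.fields[key] = scrapy.Field()
            super().__setitem__(key, value)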

    Next is pipelines.py, where the MongoDB connection is configured from the settings above.

    # -*- coding: utf-8 -*-

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


    import pymongo
    from scrapy.conf import settings


    class AnjukeGzPipeline(object):
        def __init__(self):
            # read the MongoDB connection details defined in settings.py
            host = settings["MONGODB_HOST"]
            port = settings["MONGODB_PORT"]
            dbname = settings["MONGODB_DBNAME"]
            client = pymongo.MongoClient(host=host, port=port)
            tdb = client[dbname]
            self.post = tdb[settings["MONGODB_DOCNAME"]]

        def process_item(self, item, spider):
            # the spider yields plain dicts, so insert them as-is
            info = dict(item)
            self.post.insert(info)
            return item
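
    Note that from scrapy.conf import settings and pymongo's insert() are both deprecated in newer releases. A rough sketch of the same pipeline written against current Scrapy and pymongo APIs (same MONGODB_* setting names as above; a sketch, not the original code):

    import pymongo


    class AnjukeGzPipeline(object):
        @classmethod
        def from_crawler(cls, crawler):
            # pull the custom MONGODB_* keys from the project settings
            return cls(
                host=crawler.settings.get("MONGODB_HOST"),
                port=crawler.settings.getint("MONGODB_PORT"),
                dbname=crawler.settings.get("MONGODB_DBNAME"),
                docname=crawler.settings.get("MONGODB_DOCNAME"),
            )

        def __init__(self, host, port, dbname, docname):
            self.client = pymongo.MongoClient(host=host, port=port)
            self.post = self.client[dbname][docname]

        def process_item(self, item, spider):
            self.post.insert_one(dict(item))  # insert_one replaces the deprecated insert
            return item

        def close_spider(self, spider):
            self.client.close()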

    Finally, the main part: the spider itself.

    """
        Spider for the Anjuke Guangzhou new-property listings: walk the listing
        pages, fetch each property's parameter page, and collect whatever fields it has.
    """
    import re

    import requests
    import scrapy
    from bs4 import BeautifulSoup


    class Myspider(scrapy.Spider):
        name = 'anjuke_gz'
        allowed_domains = ['gz.fang.anjuke.com']
        # listing pages p0 .. p38
        start_urls = ["http://gz.fang.anjuke.com/loupan/all/p{}/".format(i) for i in range(39)]

        def parse(self, response):
            soup = BeautifulSoup(response.text, "lxml")
            content = soup.find_all(class_="items-name")  # one link element per property on the listing page
            for item in content:
                code = item["href"].split("/")[-1][:6]
                # build the parameter (detail) page URL for this property
                real_href = "http://gz.fang.anjuke.com/loupan/canshu-{}.html?from=loupan_tab".format(code)
                res = requests.get(real_href)  # fetched synchronously, outside Scrapy's scheduler
                detail = BeautifulSoup(res.text, "lxml")
                # field labels come from <div class="name">, values from the matching "des" elements
                names = re.findall(r'<div class="name">(.*?)</div>', str(detail))
                values = detail.find_all(class_="des")
                data = {"url": real_href}
                for name, value in zip(names, values):
                    data[name] = value.text.strip()
                yield data  # a plain dict with whatever fields this page happens to have
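
    One caveat with the spider above: requests.get() runs synchronously inside parse(), so every detail page blocks Scrapy's event loop and the crawl loses most of its concurrency. A possible non-blocking variant (a sketch only; the MyspiderAsync and parse_detail names are mine, and the CSS classes are assumed to be the same as above) lets Scrapy schedule the detail pages itself:

    import re

    import scrapy
    from bs4 import BeautifulSoup


    class MyspiderAsync(scrapy.Spider):
        name = 'anjuke_gz_async'
        allowed_domains = ['gz.fang.anjuke.com']
        start_urls = ["http://gz.fang.anjuke.com/loupan/all/p{}/".format(i) for i in range(39)]

        def parse(self, response):
            for item in BeautifulSoup(response.text, "lxml").find_all(class_="items-name"):
                code = item["href"].split("/")[-1][:6]
                url = "http://gz.fang.anjuke.com/loupan/canshu-{}.html?from=loupan_tab".format(code)
                # let Scrapy schedule the detail page instead of blocking on requests.get
                yield scrapy.Request(url, callback=self.parse_detail)

        def parse_detail(self, response):
            detail = BeautifulSoup(response.text, "lxml")
            names = re.findall(r'<div class="name">(.*?)</div>', str(detail))
            values = detail.find_all(class_="des")
            data = {"url": response.url}
            for name, value in zip(names, values):
                data[name] = value.text.strip()
            yield data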

    As for how the data ends up in MongoDB:

      Because a single extraction rule covers all the different page layouts, the spider cannot target specific fields, so the data in the collection still needs cleaning before it can be analysed.

    Querying MongoDB from Python and then working on the results with pandas should make that cleaning fairly painless.
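
    For example, a minimal sketch of pulling the whole collection into a pandas DataFrame (column names will be whatever field labels the detail pages used, and pandas fills missing fields with NaN):

    import pandas as pd
    import pymongo

    client = pymongo.MongoClient("localhost", 27017)
    docs = client["SCRAPY_anjuke_gz"]["anjuke_doc1"].find({}, {"_id": 0})  # drop the ObjectId column
    df = pd.DataFrame(list(docs))
    print(df.shape)     # rows = properties scraped, columns = union of all field names
    print(df.columns)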

  • Original article: https://www.cnblogs.com/coskaka/p/6165520.html