• Configuring scrapy-splash + Python to crawl hospital information (using scrapy-splash)


    Beijing Alice Gynecology Hospital (http://fuke.fuke120.com/)

    First, let's walk through setting up Splash.

    1. Install the scrapy-splash library with pip

    pip install scrapy-splash

    2. Next we need another handy tool: Docker

    Docker download: https://www.docker.com/community-edition#/windows

    3. After installing Docker, start it and pull the Splash image

    docker pull scrapinghub/splash

    4. Run Splash with Docker

    docker run -p 8050:8050 scrapinghub/splash (once it is running, open http://192.168.99.100:8050 in a browser to check that Splash came up correctly)
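
    Before wiring Splash into Scrapy, it is worth confirming that the container answers requests. Below is a quick sanity check of Splash's HTTP API (a sketch: it assumes the Docker Toolbox address above and uses the requests library, which is not otherwise part of this project):

    # sanity check: ask Splash's render.html endpoint to render a page
    import requests

    resp = requests.get(
        'http://192.168.99.100:8050/render.html',
        params={'url': 'http://fuke.fuke120.com/', 'wait': 0.5},
    )
    print(resp.status_code)  # 200 means Splash rendered the page
    print(len(resp.text))    # length of the rendered HTML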

    5. Configure settings.py

    SPLASH_URL = 'http://192.168.99.100:8050' (this is the single biggest pitfall: the IP really is 192.168.99.100, the default address of the Docker Toolbox VM on Windows, not your own machine's IP; I kept using my own IP and it never worked)
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    }

    SPIDER_MIDDLEWARES = {
        'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    }

    DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

    HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

    ROBOTSTXT_OBEY = True (note: some sites can be crawled with this left as True, but on others the robots.txt rules will block the spider's requests, and you need to change it to False)
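
    With the settings above in place, any spider in the project can push its requests through Splash. A minimal sketch to confirm the wiring works (the spider name here is hypothetical, not part of the original project):

    import scrapy
    from scrapy_splash import SplashRequest


    class CheckSplashSpider(scrapy.Spider):
        name = "check_splash"  # hypothetical throwaway spider

        def start_requests(self):
            # args={'wait': 0.5} gives the page half a second to run its JS
            yield SplashRequest("http://fuke.fuke120.com/", self.parse,
                                args={"wait": 0.5})

        def parse(self, response):
            self.logger.info("rendered %d bytes from %s",
                             len(response.body), response.url)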

     The spider file, 1.py

    # -*- coding: utf-8 -*-
    import pymongo
    import redis
    import scrapy
    from scrapy.selector import HtmlXPathSelector  # old-style (pre-1.0) Scrapy selector API

    # MongoDB: database Health, collection Healthclass (top-level categories)
    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.Health
    collection = db.Healthclass

    # Redis list that hands the collected category URLs to the next spider
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0  # running counter for the top-level categories


    class healthcareClassSpider(scrapy.Spider):
        name = "HealthCare"
        allowed_domains = ["fuke120.com"]  # domains the spider may visit
        start_urls = [
            "http://fuke.fuke120.com/",
        ]

        # parse() is called back for every page that has been downloaded
        def parse(self, response):
            global ii
            hxs = HtmlXPathSelector(response)
            hx = hxs.select('//div[@id="allsort"]/div[@class="item"]/span/a')
            hx1 = hxs.select('//div[@id="allsort"]/div[@class="item born"]/span/a')
            for secItem in hx:
                ii += 1
                url = secItem.select("@href").extract()
                c = "http://fuke.fuke120.com" + url[0]
                name = secItem.select("text()").extract()
                print(c)
                print(name)
                # save the category to Mongo, then queue "<id>,<url>,<counter>" in Redis
                classid = collection.insert({'healthclass': name, 'pid': None})
                healthurl = '%s,%s,%s' % (classid, c, ii)
                r.lpush('healthclassurl', healthurl)
            for secItem1 in hx1:
                url = secItem1.select("@href").extract()
                c1 = "http://fuke.fuke120.com" + url[0]
                name1 = secItem1.select("text()").extract()
                print(c1)
                print(name1)
                classid = collection.insert({'healthclass': name1, 'pid': None})
                healthurl = '%s,%s,%s' % (classid, c1, 0)
                r.lpush('healthclassurl', healthurl)
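
    After a run of 1.py, the top-level category URLs sit in the Redis list 'healthclassurl'. A quick way to inspect what was queued (a sketch reusing the same connection parameters as above):

    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    for item in r.lrange('healthclassurl', 0, -1):
        # each entry has the form "<mongo id>,<category url>,<counter>"
        print(item.decode('utf-8'))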
    

      2.py

    # -*- coding: utf-8 -*-
    import pymongo
    import redis
    import scrapy
    from bson.objectid import ObjectId
    from scrapy.selector import HtmlXPathSelector

    # MongoDB: database Health, collection Diseaseclass (second-level categories)
    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.Health
    collection = db.Diseaseclass

    # Redis holds the 'healthclassurl' queue that 1.py filled
    r = redis.Redis(host='192.168.60.112', port=6379, db=0, charset='utf-8')


    class healthcareClassSpider(scrapy.Spider):
    
        name = "HealthCare1"
        allowed_domains = ["fuke120.com"]  # domains the spider may visit
        dict = {}
        start_urls = []
    
        def __init__(self):
            # read back every "<id>,<url>,<num>" entry that 1.py queued
            a = r.lrange('healthclassurl', 0, -1)
    
            for item in a:
                healthurl = bytes.decode(item)
                arr = healthurl.split(',')
                healthcareClassSpider.start_urls.append(arr[1])
    
                num = arr[2]
                pid = arr[0]
                url = arr[1]
                self.dict[url] = {"pid": pid, "num": num}

        def parse(self, response):
            nameInfo = self.dict[response.url]
            pid1 = nameInfo['pid']
            pid = ObjectId(pid1)  # rebuild the Mongo ObjectId from its hex string
            num = nameInfo['num']
            hxs = HtmlXPathSelector(response)
            hx = hxs.select('//div[@class="x_con02_2"]/div[@class="x_con02_3"]/ul/li/p/a')
            for secItem in hx:
                url = secItem.select("@href").extract()
                url = "http://fuke.fuke120.com"+url[0]
                name = secItem.select("text()").extract()
                print(url)
                print(name)
                classid = collection.insert({'Diseaseclass': name, 'pid': pid})
                diseaseclassurl = '%s,%s,%s' % (classid, url, pid)
                r.lpush('diseaseclassurl', diseaseclassurl)
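
    Note how the Mongo _id survives the round trip through Redis: '%s' flattens it to its 24-character hex string, and ObjectId() rebuilds it on the consumer side, which is what parse() does with the pid read from the queue. A tiny standalone illustration:

    from bson.objectid import ObjectId

    oid = ObjectId()           # a fresh id, like the one collection.insert returns
    s = '%s' % oid             # serialized to a 24-character hex string for Redis
    assert ObjectId(s) == oid  # the consumer rebuilds an equal ObjectId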
    

      3.py

    # -*- coding: utf-8 -*-
    import pymongo
    import redis
    import scrapy
    from bson.objectid import ObjectId
    from scrapy.selector import HtmlXPathSelector
    from scrapy_splash import SplashRequest

    # MongoDB: database Health, collection Treatclass (third-level categories)
    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.Health
    collection = db.Treatclass

    # Redis holds the 'diseaseclassurl' queue that 2.py filled
    r = redis.Redis(host='192.168.60.112', port=6379, db=0, charset='utf-8')
    
    
    class healthcareClassSpider(scrapy.Spider):
    
        name = "HealthCare2"
        allowed_domains = ["fuke120.com"]  # domains the spider may visit
        dict = {}
        start_urls = []
    
        def __init__(self):
            # read back every "<id>,<url>,<pid>" entry that 2.py queued
            a = r.lrange('diseaseclassurl', 0, -1)
    
            for item in a:
                healthurl = bytes.decode(item)
                arr = healthurl.split(',')
                healthcareClassSpider.start_urls.append(arr[1])
    
                num = arr[2]
                pid = arr[0]
                url = arr[1]
                self.dict[url] = {"pid": pid, "num": num}
    
        def start_requests(self):
            # send every queued URL through Splash so JavaScript is executed;
            # 'wait' gives the page 0.5 s to finish rendering
            for url in self.start_urls:
                yield SplashRequest(url, self.parse, args={'wait': 0.5})

        def parse(self, response):
            # scrapy-splash maps response.url back to the URL that was
            # originally requested, so it still works as the dict key here
            nameInfo = self.dict[response.url]
            pid1 = nameInfo['pid']
            pid = ObjectId(pid1)  # rebuild the Mongo ObjectId from its hex string
            num = nameInfo['num']
            print(num)
            print(pid)
            hxs = HtmlXPathSelector(response)
            hx = hxs.select('//div[@class="dh01"]/ul[@class="ul_bg01"]/li/a')
            for secItem in hx:
                url = secItem.select("@href").extract()
                c = "http://fuke.fuke120.com" + url[0]
                name = secItem.select("text()").extract()
                print(c)
                print(name)
                classid = collection.insert({'Treatclass': name, 'pid': pid})
                treatclassurl = '%s,%s,%s' % (classid, c, pid)
                r.lpush('treatclassurl', treatclassurl)
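
    If a fixed wait is not enough (some pages only fill in content after scrolling or extra requests), scrapy-splash can also run a Lua script through Splash's execute endpoint. A sketch of the same request driven by a script (not used in the original spiders):

    from scrapy_splash import SplashRequest

    # minimal Lua script: load the page, wait, return the rendered HTML
    lua_script = """
    function main(splash, args)
        splash:go(args.url)
        splash:wait(1.0)
        return splash:html()
    end
    """

    # inside start_requests(), in place of the plain SplashRequest:
    # yield SplashRequest(url, self.parse, endpoint='execute',
    #                     args={'lua_source': lua_script})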
    

    Done. The real point of this whole exercise was putting scrapy-splash to work.
