• Scrapy: crawling data with PhantomJS


    Environment: python 2.7 + scrapy + selenium + PhantomJS

    Content: testing scrapy + PhantomJS

    Crawl target: pages where additional content is loaded via JS ("load more")

    Approach: enable the downloader middleware in the settings file + modify the process_request function (add the PhantomJS logic inside it)

    Step 1:

    settings.py

    DOWNLOADER_MIDDLEWARES = {
        'dbdm.middlewares.DbdmSpiderMiddleware': 543,
    }

    The module and class names will differ from project to project; that does not matter.
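
    The class referenced in that setting has to exist in your project's middlewares.py. A minimal skeleton, assuming the class name from the settings entry above (only process_request matters for this technique; returning None tells Scrapy to download the request with its default downloader):

    # middlewares.py -- skeleton only; process_request is filled in below in Step 2
    class DbdmSpiderMiddleware(object):

        @classmethod
        def process_request(cls, request, spider):
            # returning None lets Scrapy handle the request normally
            return None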

    Step 2:

    ----------PhantomJS enabled by default

    middlewares.py

    At the top of the file you need to import selenium (plus time and HtmlResponse, which the code below uses):

    import time
    from selenium import webdriver
    from scrapy.http import HtmlResponse
    # ........ some code omitted
    @classmethod
    def process_request(cls, request, spider):
        #if request.meta.has_key('PhantomJS'):
        # use a raw string so backslashes in the Windows path are not treated as escapes
        driver = webdriver.PhantomJS(r'E:\p_python\Scripts\phantomjs\bin\phantomjs.exe')
        driver.get(request.url)
        if request.url == 'https://movie.douban.com/tag':
            driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/div[1]/ul[1]/li[5]/span').click()
            time.sleep(5)
            if driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a'):
                click_more(driver)
        content = driver.page_source.encode('utf-8')
        #print content
        #file = open(path.join(d, '1.txt'), 'w')
        #file.write(content)
        #file.close()
        driver.quit()
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

    def click_more(driver, i=1):
        # recursively click the "load more" link until it can no longer be found
        driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a').click()
        print str(i) + '  click'
        time.sleep(5)
        i = i + 1
        try:
            more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
            if more_btn:
                click_more(driver, i)
        except:
            print 'click Over!!'
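
    click_more above calls itself once per click, so a page with very many "load more" clicks could in principle hit Python's recursion limit. A loop-based sketch of the same idea, using the same XPath, would be:

    def click_more_loop(driver, wait=5):
        # keep clicking the "load more" link until it can no longer be found
        i = 1
        while True:
            try:
                more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
            except Exception:
                print 'click Over!!'
                break
            more_btn.click()
            print str(i) + '  click'
            time.sleep(wait)
            i = i + 1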

    The above is only test code; adapt it to your own project. As written, it launches PhantomJS for every URL by default; you can add a check to control when it is used.

    -----------Enable PhantomJS only when needed

    Check the value of a key in request.meta:

    
    
    As above, selenium must be imported at the top of the file (plus HtmlResponse, used below):

    from selenium import webdriver
    from scrapy.http import HtmlResponse
    # ........ some code omitted

    @classmethod
    def process_request(cls, request, spider):
        if request.meta.has_key('PhantomJS'):
            driver = webdriver.PhantomJS(r'E:\p_python\Scripts\phantomjs\bin\phantomjs.exe')
            driver.get(request.url)
            content = driver.page_source.encode('utf-8')
            driver.quit()
            return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

    The key's value is set in the spider file.

    import scrapy
    from scrapy import Request
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from phantomjs_test.items import PhantomscrapyItem
    
    class PhantomjsTestSpider(CrawlSpider):
        name = 'phantomjs_test'
        allowed_domains = ['book.com']
        start_urls = ['http://book.com/']
        #all_urls = []   # deduplication does not seem to be needed
        rules = (
            ### get all of the pagination list pages
            # (the callback must not be named parse, because CrawlSpider uses parse internally)
            Rule(LinkExtractor(allow=r'/story/p/[2-9]*'), callback='parse_page', follow=True),
            ### get all of the detail pages inside them
            #Rule(LinkExtractor(allow=r'/detail/p/[2-9]*'), callback='parse_item', follow=True),
        )
    
        ### collect all article URLs from a pagination page
        def parse_page(self, response):
            url_list = response.xpath('//a/@href').extract()
            for url in url_list:
                request = Request(url=url, callback=self.parse_item, dont_filter=True)
                request.meta['PhantomJS'] = True
                yield request
    
        def parse_item(self, response):
            item = PhantomscrapyItem()
            #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
            #i['name'] = response.xpath('//div[@id="name"]').extract()
            #i['description'] = response.xpath('//div[@id="description"]').extract()
            item['bookName'] = response.xpath()   # XPath left blank in the original; fill in the selector for your target page
            items = []
            items.append(item)
            return items
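
    The spider imports PhantomscrapyItem from phantomjs_test.items, which is not shown in the original post. A minimal definition that matches the single bookName field used in parse_item would be:

    # items.py -- minimal item assumed from the spider code above
    import scrapy

    class PhantomscrapyItem(scrapy.Item):
        bookName = scrapy.Field()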

    That is the difference between launching PhantomJS for every request and launching it only when a condition is met; pick whichever suits the pages you crawl. The code still needs more polish before it is convenient to use.
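
    Putting the two variants together: check request.meta inside process_request and return None when the key is absent, so Scrapy falls back to its normal downloader for ordinary pages. A sketch under those assumptions (the PhantomJS path is machine-specific, as above):

    # middlewares.py -- render with PhantomJS only when the spider asks for it
    from selenium import webdriver
    from scrapy.http import HtmlResponse

    class DbdmSpiderMiddleware(object):

        def process_request(self, request, spider):
            if not request.meta.get('PhantomJS'):
                return None   # let Scrapy download this request normally
            driver = webdriver.PhantomJS(r'E:\p_python\Scripts\phantomjs\bin\phantomjs.exe')
            try:
                driver.get(request.url)
                content = driver.page_source.encode('utf-8')
            finally:
                driver.quit()
            return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)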
