• Scraping Baidu Tieba with scrapy + lxml.etree


    Analysis: Scrapy's built-in XPath selectors return nothing for the thread list (Tieba appears to serve
       that part of the page inside HTML comments), so that approach does not work here.
       Instead, use a regex (re) to match out the block containing all the <li> tags, which holds everything we need to extract.
       Then turn that snippet into an 'lxml.etree._Element' via resultTree = lxml.etree.HTML(articleBody)
       and extract the fields with resultTree.xpath().
       Note: this xpath() belongs to lxml and is not the same as Scrapy's, as sketched below.
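
    A minimal sketch of that difference (the snippet below is a made-up placeholder, not real Tieba markup):
    lxml's xpath() returns plain strings and Element objects directly, while Scrapy's response.xpath()
    returns Selector objects that still need .extract() / .get().

    import lxml.etree

    snippet = '<li class="j_thread_list" data-field=\'{"id": 1}\'>demo</li>'
    tree = lxml.etree.HTML(snippet)           # parse the fragment into an lxml element tree
    print(tree.xpath('//li/@data-field'))     # lxml returns a plain list: ['{"id": 1}']
    # inside a Scrapy callback the equivalent would be:
    #     response.xpath('//li/@data-field').extract()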

    # -*- coding: utf-8 -*-
    import scrapy
    from ..settings import MAX_PAGE
    from ..items import TiebaBaiduItem
    import re
    import lxml.etree
    import json
    
    
    class TiebaSpider(scrapy.Spider):
    
        name = 'tieba'
        allowed_domains = ['tieba.baidu.com']
        start_urls = ['https://tieba.baidu.com/f?kw=%E9%83%91%E5%AE%B8&ie=utf-8&pn={}'.format(str(page * 50)) for page in range(MAX_PAGE + 1)]
    
        def parse(self, response):
    
            # Key step: regex out the useful chunk of HTML, i.e. the section that contains all the <li> thread entries
            articleBodyRe = re.search('<ul id="thread_list" class="threadlist_bright j_threadlist_bright">(.*?)<div class="thread_list_bottom clearfix">', response.text, re.DOTALL)
            articleBody = ''
            if articleBodyRe:
                articleBody = articleBodyRe.group(1)
            # lxml.etree.HTML(articleBody) turns the snippet into an element tree, then xpath() extracts from it
            # note: this is lxml's xpath(), which behaves slightly differently from Scrapy's selector xpath
            resultTree = lxml.etree.HTML(articleBody)
    
            articleList = resultTree.xpath('//li[contains(@class,"j_thread_list")]')
            for articleElem in articleList:
                articleInfo = {}
                data_field = articleElem.xpath("@data-field")[0]
                dataFieldJson = json.loads(data_field)
                articleInfo['id'] = dataFieldJson['id']
                articleInfo['author'] = dataFieldJson['author_name']
                articleInfo['title'] = articleElem.xpath(".//div[@class='t_con cleafix']//a/@title")[0]
                articleInfo['href'] = articleElem.xpath(".//div[@class='t_con cleafix']//a/@href")[0]
                # follow the thread link; ?see_lz=1 asks Tieba for only the original poster's posts
                yield response.follow(
                    url=articleInfo['href'] + "?see_lz=1",
                    meta={'dont_redirect': True, 'articleInfo': articleInfo},
                    callback=self.parseArticleDetail,
                    errback=self.errorHandle
                )
    
        def parseArticleDetail(self, response):
            print(
                f"parseArticleDetail: statusCode = {response.status}, url = {response.url}")
            contentLst = response.xpath(
                "//div[contains(@id, 'post_content')]//text()").extract()
            imgHrefLst = response.xpath(
                "//div[contains(@id, 'post_content')]//img/@src").extract()
            dateLst = response.xpath(
                "//div[contains(@class, 'post_content_firstfloor')]//span[@class='tail-info']/text()").extract()
            content = ''
            for contentElem in contentLst:
                # collapse newlines and spaces, joining all text nodes into one comma-separated string
                content += contentElem.replace('\n', ',').replace(' ', '').strip()
                content += ', '
            print(f"content = {content}")
            print(f"imgHrefLst = {imgHrefLst}")
            articleInfo = response.meta['articleInfo']
            articleItem = TiebaBaiduItem()
            articleItem['item_type'] = 'articleDetail'
            articleItem['_id'] = articleInfo['id']
            articleItem['title'] = articleInfo['title']
            articleItem['author'] = articleInfo['author']
            articleItem['content'] = content
            articleItem['fromUrl'] = response.url
            articleItem['picHrefLst'] = imgHrefLst
            articleItem['date'] = dateLst[1]
            yield articleItem
    
        # request error handling: print it, write it to a file, or store it in a database
        def errorHandle(self, failure):
            print(f"request error: {failure.value.response}")
    



       

  • Original post: https://www.cnblogs.com/hyxailj/p/9156547.html