• Web Scraping Basics: the Scrapy Framework in Practice (Sina, Baidu Baike, Douban) (Part 12)


    I Crawling Sina News

    1 Crawling Sina news (full-site crawl)

    Project setup and startup

    scrapy startproject sina
    cd sina
    scrapy genspider mysina http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml
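
    These two commands are expected to produce roughly the following layout (a sketch of Scrapy's default project template; exact files can vary slightly by Scrapy version):

    sina/
        scrapy.cfg
        sina/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                mysina.py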
    

    2 Project settings.py configuration

    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {
       'sina.pipelines.SinaPipeline': 300,
    }
    

    3 Launcher script start.py

    import scrapy.cmdline
    def main():
        # -o  ['json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle']
        scrapy.cmdline.execute(['scrapy','crawl','mysina'])
        
    if __name__ == '__main__':
        main()
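
    The comment above lists the feed formats that the -o flag supports; to export items while crawling, the command list can simply be extended with it. A minimal variant of start.py (news.json is an arbitrary output filename):

    import scrapy.cmdline

    def main():
        # same launcher, but also dump every yielded item into news.json
        scrapy.cmdline.execute(['scrapy', 'crawl', 'mysina', '-o', 'news.json'])

    if __name__ == '__main__':
        main()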
    

    4 Target item definition (items.py)

    import scrapy
    
    class SinaItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        newsTitle = scrapy.Field()
        newsUrl = scrapy.Field()
        newsTime = scrapy.Field()
        content = scrapy.Field()
    
    

    5 Spider logic in mysina.py

    import scrapy
    import requests
    from lxml import etree
    from sina import items
    from scrapy.spiders import CrawlSpider,Rule  # CrawlSpider: adds rules for following links
    from scrapy.linkextractors import LinkExtractor  # extracts links from responses

    class MysinaSpider(CrawlSpider): # inherits CrawlSpider, so the callback must not be named parse to avoid clashing with the built-in
        name = 'mysina'
        allowed_domains = ['sina.com.cn']
        start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']
        '''
        Rule parameters: link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=identity
        Some LinkExtractor parameters: allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=()

        allow=(regex) URLs to follow, deny=(regex) URLs to exclude
        callback=the callback method name
        follow=whether to keep following links found on matched pages (True means follow)
        (see the standalone LinkExtractor sketch after this spider)
        '''
        rules = [Rule(LinkExtractor(allow=(r'index_(\d+)\.shtml',)), callback='getParse', follow=True)]
    
        def getParse(self, response): # renamed parse callback
            newsList = response.xpath("//ul[@class='list_009']/li")
            for news in newsList:
    
                item = items.SinaItem() # instantiate the item
                newsTitle = news.xpath('./a/text()')[0].extract()
                newsUrl = news.xpath('./a/@href')[0].extract()
                newsTime = news.xpath('./span/text()')[0].extract()
                content = self.getContent(newsUrl)
    
                item['newsTitle'] = newsTitle
                item['newsUrl'] = newsUrl
                item['newsTime'] = newsTime
                item['content'] = content
                yield item
    
        def getContent(self,url):
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
            }
            response = requests.get(url,headers=headers).content.decode('utf-8','ignore')   # .content returns the raw bytes
            mytree = etree.HTML(response)
            contentList = mytree.xpath("//div[@class='article']//text()")
            print(contentList)
            content = ''
            for c in contentList:
                # strip() removes leading/trailing whitespace (spaces and newlines by default) or a given character sequence
                content += c.strip().replace('\n','')  # concatenate the pieces so content holds the whole article
            return content
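
    The allow pattern r'index_(\d+)\.shtml' used in the rule above can be checked in isolation. The sketch below feeds LinkExtractor an illustrative page body (not fetched from Sina) and shows that only the paging link matching the pattern is extracted:

    from scrapy.http import HtmlResponse
    from scrapy.linkextractors import LinkExtractor

    # illustrative body: one paging link that matches the rule and one detail link that does not
    body = b'<a href="index_3.shtml">page 3</a> <a href="news_detail.shtml">detail</a>'
    response = HtmlResponse(url='http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml',
                            body=body, encoding='utf-8')

    extractor = LinkExtractor(allow=(r'index_(\d+)\.shtml',))
    for link in extractor.extract_links(response):
        print(link.url)   # only .../gdxw1/index_3.shtml is printed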
    

    Method 2: mysina.py can also build the detail request with Scrapy itself

    # -*- coding: utf-8 -*-
    import scrapy
    import requests
    from lxml import etree
    from sina import items
    
    from scrapy.spiders import CrawlSpider,Rule  # CrawlSpider: adds rules for following links
    from scrapy.linkextractors import LinkExtractor  # extracts links from responses
    
    class MysinaSpider(CrawlSpider):
        name = 'mysina'
        allowed_domains = ['sina.com.cn']
        start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']
        rules = [Rule(LinkExtractor(allow=(r'index_(\d+)\.shtml',)), callback='getParse', follow=True)]
    
        def getParse(self, response):
    
            newsList = response.xpath("//ul[@class='list_009']/li")
            for news in newsList:
    
                newsTitle = news.xpath('./a/text()')[0].extract()
                newsUrl = news.xpath('./a/@href')[0].extract()
                newsTime = news.xpath('./span/text()')[0].extract()
        
                #build the request with Scrapy's Request (instead of the requests library)
                request = scrapy.Request(newsUrl, callback=self.getMetaContent) # callback is getMetaContent
                #pass the extracted fields along via meta
                request.meta['newsTitle'] = newsTitle
                request.meta['newsUrl'] = newsUrl
                request.meta['newsTime'] = newsTime
                yield request
    
        def getMetaContent(self, response):
            '''
            getMetaContent receives the response for the request yielded above
            '''
            contentList = response.xpath("//div[@class='article']//text()")
            content = ''
            for c in contentList:
                content += c.extract().strip()
            item = items.SinaItem()
            # copy the meta fields carried on the response into the item
            item['newsTitle'] = response.meta['newsTitle']
            item['newsUrl'] = response.meta['newsUrl']
            item['newsTime'] = response.meta['newsTime']
            item['content'] = content
            yield item
    

    6 Pipeline storage in pipelines.py

    import pymysql
    
    class SinaPipeline(object):
        def __init__(self):
            self.conn = None
            self.cursor = None
    
        def open_spider(self,spider):
            self.conn = pymysql.connect(host='111.230.169.xxx',user='root',password='xxx',database='sina', port=3306,charset='utf8') # open the MySQL connection
            self.cursor = self.conn.cursor()  # create a database cursor
    
        def process_item(self, item, spider):
            sql = 'insert into sina_news(newsTitle,newsUrl,newsTime,content) VALUES (%r,%r,%r,%r)'%(item['newsTitle'], item['newsUrl'], item['newsTime'], item['content'])  # builds the SQL via %r string formatting; method 2 below uses parameterized queries instead
            self.cursor.execute(sql)  # execute the SQL
            self.conn.commit()  # commit the transaction
            return item
    
        def close_spider(self,spider):
            self.cursor.close() # close the cursor
            self.conn.close()
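
    The pipeline above assumes a sina_news table already exists in the sina database. A minimal creation sketch (the column types and lengths are assumptions; adjust them to the real data):

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', user='root', password='xxx', database='sina',
                           port=3306, charset='utf8')
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS sina_news (
            id INT AUTO_INCREMENT PRIMARY KEY,
            newsTitle VARCHAR(255),
            newsUrl VARCHAR(255),
            newsTime VARCHAR(64),
            content TEXT
        ) DEFAULT CHARSET=utf8
    """)
    conn.commit()
    cur.close()
    conn.close()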
    

    Method 2: pipelines.py with a quicker way to build the SQL statement

    import pymysql
    
    class DemoPipeline(object):
    
        def __init__(self):
            self.conn = None
            self.cur = None
    
        def open_spider(self, spider):
            self.conn = pymysql.connect(
                host='127.0.0.1',
                port=3306,
                user='root',
                password='123456',
                db='fate',
                charset='utf8')
            self.cur = self.conn.cursor()
    
        def process_item(self, item, spider):
            cols, values = zip(*item.items())  # unzip the item into column names and values
            sql = "INSERT INTO `%s` (%s) VALUES (%s)" % \
                  (
                      'sina_news',
                      ','.join(cols),
                      ','.join(['%s'] * len(values))
                  )
            self.cur.execute(sql, values) # execute the SQL; the driver fills the %s placeholders with values
            self.conn.commit()
            return item
    
        def close_spider(self, spider):
            self.cur.close()
            self.conn.close()
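
    To see what zip(*item.items()) produces, here is a small standalone run (a plain dict stands in for the scrapy.Item; with a real Item the column order may differ, but cols and values always stay paired):

    item = {'newsTitle': 't1', 'newsUrl': 'http://example.com/1.shtml', 'newsTime': '07-08 10:00', 'content': '...'}
    cols, values = zip(*item.items())
    # cols   -> ('newsTitle', 'newsUrl', 'newsTime', 'content')
    # values -> ('t1', 'http://example.com/1.shtml', '07-08 10:00', '...')
    sql = "INSERT INTO `%s` (%s) VALUES (%s)" % ('sina_news', ','.join(cols), ','.join(['%s'] * len(values)))
    print(sql)   # INSERT INTO `sina_news` (newsTitle,newsUrl,newsTime,content) VALUES (%s,%s,%s,%s)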
    

    II Crawling Baidu Baike Entries

    1 Crawling Baike entries

    Project setup and startup

    scrapy startproject baike
    cd baike
    scrapy genspider mybaike baike.baidu.com/item/Python/407313
    

    2 Project settings.py configuration

    ROBOTSTXT_OBEY = False
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    ITEM_PIPELINES = {
       'baike.pipelines.BaikePipeline': 300,
    }
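
    If these headers should only apply to one spider rather than to the whole project, Scrapy also accepts a per-spider custom_settings attribute. A minimal sketch (the spider name here is hypothetical, not part of this project):

    import scrapy

    class HeaderDemoSpider(scrapy.Spider):
        name = 'header_demo'   # hypothetical spider, for illustration only
        start_urls = ['https://baike.baidu.com/item/Python/407313']

        # overrides the values from settings.py for this spider only
        custom_settings = {
            'DEFAULT_REQUEST_HEADERS': {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            },
        }

        def parse(self, response):
            self.logger.info('fetched %s with status %s', response.url, response.status)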
    

    3 Launcher script start.py

    import scrapy.cmdline
    def main():
        # -o  ['json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle']
        scrapy.cmdline.execute(['scrapy','crawl','mybaike'])
        
    if __name__ == '__main__':
        main()
    

    4 Target item definition (items.py)

    import scrapy
    
    class BaikeItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        level1Title = scrapy.Field()
        level2Title = scrapy.Field()
        content = scrapy.Field()
    

    5 Spider logic in mybaike.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.spiders import CrawlSpider,Rule
    from scrapy.linkextractors import LinkExtractor
    from baike.items import BaikeItem
    
    class MybaikeSpider(CrawlSpider):
        name = 'mybaike'
        allowed_domains = ['baike.baidu.com']
        start_urls = ['https://baike.baidu.com/item/Python/407313']
    
        rules = [Rule(LinkExtractor(allow=('item/(.*)')),callback='getParse',follow=True)]
    
        def getParse(self, response):
            level1Title = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()")[0].extract()
            level2Title = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()")
            if len(level2Title) != 0:
                level2Title = level2Title[0].extract()
            else:
                level2Title = '待编辑'
            contentList = response.xpath("//div[@class='lemma-summary']//text()")
            content = ''
            for c in contentList:
                content += c.extract()
            item = BaikeItem()
            item['level1Title'] = level1Title
            item['level2Title'] = level2Title
            item['content'] = content
            yield item
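
    As an aside, the len() check on level2Title can also be written with extract_first and a default value. A standalone sketch of that behaviour (illustrative HTML, not fetched from Baidu Baike):

    from scrapy.selector import Selector

    # an entry page with a title but no <h2> subtitle
    html = "<dd class='lemmaWgt-lemmaTitle-title'><h1>Python</h1></dd>"
    sel = Selector(text=html)

    # extract_first returns the default when the XPath matches nothing
    level2Title = sel.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()").extract_first(default='待编辑')
    print(level2Title)   # 待编辑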
    

    6 Pipeline storage in pipelines.py

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import pymysql
    
    class BaikePipeline(object):
        def __init__(self):
            self.conn = None
            self.cursor = None
    
        def open_spider(self, spider):
            # open the connection
            self.conn = pymysql.connect(host='111.230.169.107', user='root', password="20111673",
                                        database='baike', port=3306,
                                        charset='utf8')
            # create a cursor
            self.cursor = self.conn.cursor()
    
        def process_item(self, item, spider):
    
            cols, values = zip(*item.items())
    
            # backtick-quoted table name
            sql = "INSERT INTO `%s`(%s) VALUES (%s)" % \
                  ('baike', ','.join(cols), ','.join(['%s'] * len(values)))

            self.cursor.execute(sql, values)
            self.conn.commit()
    
            return item
    
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
    
    

    III Crawling Douban Movies

    1 Douban movie Top 250 chart

    Project setup and startup

    scrapy startproject douban
    cd douban
    scrapy genspider mydouban movie.douban.com/top250
    

    2 Project settings.py configuration

    ROBOTSTXT_OBEY = False
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
       "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
    }
    ITEM_PIPELINES = {
       'douban.pipelines.DoubanPipeline': 300,
    }
    

    3 Launcher script start.py

    import scrapy.cmdline
    def main():
        # -o  ['json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle']
        scrapy.cmdline.execute(['scrapy','crawl','mydouban'])
        
    if __name__ == '__main__':
        main()
    

    4 Target item definition (items.py)

    import scrapy
    
    class DoubanItem(scrapy.Item):
        # define the fields for your item here like:
        name = scrapy.Field()
        movieInfo = scrapy.Field()
        star = scrapy.Field()
        quote = scrapy.Field()
    

    5 Spider logic in mydouban.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    from douban.items import DoubanItem
    
    class MydoubanSpider(scrapy.Spider):
        name = 'mydouban'
        url = ['https://movie.douban.com/top250']
        start_urls = {'https://movie.douban.com/top250'} # method 1: just set start_urls
    
        '''# method 2: override start_requests and attach headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
        }
    
        def start_requests(self):
            url = 'https://movie.douban.com/top250'
            yield Request(url, headers=self.headers)
        '''
    
        def parse(self, response):
            item = DoubanItem()
            movies = response.xpath('//ol[@class="grid_view"]/li')
    
            for movie in movies:
                item['name'] = movie.xpath(".//div[@class='pic']/a/img/@alt").extract()[0]
                item['movieInfo'] = movie.xpath(".//div[@class='info']/div[@class='bd']/p/text()").extract()[0].strip()
                item['star'] = movie.xpath(".//div[@class='info']/div[@class='bd']/div[@class='star']/span[2]/text()").extract()[0]
            item['quote'] = movie.xpath('.//div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]  # captures the ratings count from the text 'N人评价'
                yield item
    
            next_url = response.xpath('//span[@class="next"]/a/@href').extract() # link to the next page
            if next_url:
                next_url = 'https://movie.douban.com/top250' + next_url[0]
                yield Request(next_url,callback=self.parse)  # recurse: parse handles the next page
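
    The .re() call above pulls the ratings count out of the text 'N人评价'. A standalone sketch of that selector call (illustrative HTML, not fetched from Douban):

    from scrapy.selector import Selector

    # illustrative fragment of one movie's star block
    html = '<div class="star"><span class="rating_num">9.7</span><span>2104395人评价</span></div>'
    sel = Selector(text=html)

    # .re() applies the regex to each matched text node and returns the captured groups
    print(sel.xpath('//div[@class="star"]/span/text()').re(r'(\d+)人评价'))   # ['2104395']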
    
    

    6 Pipeline storage in pipelines.py

    # -*- coding: utf-8 -*-
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    import pymysql
    
    class DoubanPipeline(object):
        def __init__(self):
            self.conn = pymysql.connect(host='111.230.169.107', port=3306, user= 'root', passwd = 'xxx', database = 'douban',charset = 'utf8')
            self.cursor = self.conn.cursor()
            self.cursor.execute("truncate table Movie")   #此处设置每开启就清空
            self.conn.commit()
    
        def process_item(self, item, spider):
            try:
                self.cursor.execute("insert into Movie (name,movieInfo,star,quote) VALUES (%s,%s,%s,%s)",(item['name'], item['movieInfo'], item['star'], item['quote']))
                self.conn.commit()
    
            except pymysql.Error:
                print("Error%s,%s,%s,%s" % (item['name'], item['movieInfo'], item['star'], item['quote']))
            return item
        def close_spider(self, spider):
            self.cursor.close()
            self.conn.close()
    
  • Original post: https://www.cnblogs.com/why957/p/9280370.html