• 第4章 scrapy爬取知名技术文章网站(2)


    4-8~9 编写spider爬取jobbole的所有文章

    # -*- coding: utf-8 -*-
    import re
    import scrapy
    import datetime
    from scrapy.http import Request
    from urllib import parse
    '''如果是py2 那就是import urlparse'''
    
    from g0xukr.ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
    from g0xukr.ArticleSpider.utils.common import get_md5
    
    class JobboleSpider(scrapy.Spider):
        """Spider that walks the jobbole all-posts list pages and scrapes every article."""

        name = "jobbole"
        allowed_domains = ["python.jobbole.com"]
        start_urls = ['http://python.jobbole.com/all-posts/']

        def parse(self, response):
            """Schedule every article on a list page, then follow the next list page.

            1. Extract each article URL (with its cover image URL) from the list page
               and hand it to Scrapy with ``parse_detail`` as the callback.
            2. Extract the next-page URL and hand it to Scrapy with ``parse`` as the
               callback.
            """
            # Every article on the list page: link plus thumbnail.
            post_nodes = response.css("#archive .floated-thumb .post-thumb a")
            for post_node in post_nodes:
                image_url = post_node.css("img::attr(src)").extract_first("")
                post_url = post_node.css("::attr(href)").extract_first("")
                # urljoin completes relative links against the current page URL.
                yield Request(
                    url=parse.urljoin(response.url, post_url),
                    meta={"front_image_url": image_url},
                    callback=self.parse_detail,
                )

            # Follow the pagination link, if there is one.
            next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
            if next_url:
                yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

        @staticmethod
        def _extract_int(text):
            """Return the first run of digits in ``text`` as an int, or 0 if there is none."""
            found = re.search(r"\d+", text)
            return int(found.group()) if found else 0

        def parse_detail(self, response):
            """Extract the article fields from a detail page and yield a JobBoleArticleItem.

            Missing nodes yield empty strings / zero counts instead of raising
            IndexError, so one malformed page does not abort the callback.
            """
            article_item = JobBoleArticleItem()

            # Cover image URL passed along from the list page through Request.meta.
            front_image_url = response.meta.get("front_image_url", "")
            title = response.css(".entry-header h1::text").extract_first("")
            raw_date = response.css("p.entry-meta-hide-on-mobile::text").extract_first("")
            raw_date = raw_date.strip().replace("·", "").strip()

            # All three counters are stored as ints (0 when the page shows no number).
            praise_nums = self._extract_int(
                response.css(".vote-post-up h10::text").extract_first(""))
            fav_nums = self._extract_int(
                response.css(".bookmark-btn::text").extract_first(""))
            comment_nums = self._extract_int(
                response.css("a[href='#article-comment'] span::text").extract_first(""))

            content = response.css("div.entry").extract_first("")

            # The tag links also contain the "N 评论" comment link; drop it.
            tag_list = response.css("p.entry-meta-hide-on-mobile a::text").extract()
            tag_list = [element for element in tag_list if not element.strip().endswith("评论")]
            tags = ",".join(tag_list)

            try:
                create_date = datetime.datetime.strptime(raw_date, "%Y/%m/%d").date()
            except ValueError:
                # Unparseable or missing date: fall back to the crawl date.
                create_date = datetime.datetime.now().date()

            article_item["url_object_id"] = get_md5(response.url)
            article_item["title"] = title
            article_item["url"] = response.url
            article_item["create_date"] = create_date
            # Stored as a list because the images pipeline iterates over this field.
            article_item["front_image_url"] = [front_image_url]
            article_item["praise_nums"] = praise_nums
            article_item["comment_nums"] = comment_nums
            article_item["fav_nums"] = fav_nums
            article_item["tags"] = tags
            article_item["content"] = content
            yield article_item
    

    4-10~12 items设计

    一些零散的知识点:

    1.通过Request的meta把值从parse传递给回调函数parse_detail(最终保存到item中)

    #例如:
    yield Request(url=parse.urljoin(response.url, post_url), meta={"front_image_url":image_url}, callback=self.parse_detail)
    

    2.extract_first('')使用

    extract_first('') 比 extract()[0] 好用:后者有风险,如果结果为空就会抛出 IndexError;而前者在结果为空时返回传入的默认值 ''(空字符串),所以更好用。

    3.response.meta.get()用法

    response.meta.get('front_image_url','') 第一个参数是自己在meta中定义的键名,第二个参数是键不存在时返回的默认值(这里是空字符串),这样即使键不存在也不会抛异常

    4.scrapy自动下载图片pipelines

    # Register Scrapy's ImagesPipeline (automatic image download) with value 1.
    ITEM_PIPELINES = {
         'scrapy.pipelines.images.ImagesPipeline': 1,
    }
    

    配置:

    import os
    # Name of the item field that holds the image URLs; the field's value must be a list.
    IMAGES_URLS_FIELD = 'front_image_url'
    # Resolve the directory of this settings file so the path also works on other machines.
    project_dir = os.path.abspath(os.path.dirname(__file__))
    # Images are stored in a folder next to settings.py.
    IMAGES_STORE = os.path.join(project_dir, '存储图片文件名称')
    # For custom behavior, create a class in pipelines.py that inherits from
    # ImagesPipeline and override the relevant methods.
    

    5.哈希摘要算法(MD5),输出固定长度

    python3模版:

    def get_md5(url):
        """Return the hexadecimal MD5 digest of ``url``.

        ``str`` input is encoded as UTF-8 first because hashlib only accepts
        bytes; ``bytes`` input is hashed as-is. The digest always has 32 hex
        characters, whatever the input length.
        """
        import hashlib  # imported here so the function is self-contained

        if isinstance(url, str):
            url = url.encode("utf-8")
        return hashlib.md5(url).hexdigest()
    

    python2模版:

    # -*- coding:utf-8 -*-
    import hashlib
    def get_md5(url='123'):
        """Return the hex MD5 digest of ``url`` (Python 2 template; defaults to '123')."""
        digest = hashlib.md5(url)
        return digest.hexdigest()
    

    4-13 数据表设计和保存item到json文件

    模版:

    import codecs
    import json
    from scrapy.exporters import JsonItemExporter
    class JsonWithEncodingPipeline(object):
        """Item pipeline that exports items to ``article.json``, one JSON object per line."""

        def __init__(self):
            # codecs.open with an explicit encoding so the file is written as UTF-8.
            self.file = codecs.open('article.json', 'w', encoding="utf-8")

        def process_item(self, item, spider):
            """Write ``item`` as one JSON line and return it unchanged."""
            # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
            lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
            self.file.write(lines)
            return item

        def close_spider(self, spider):
            """Close the output file; this is the hook name Scrapy calls on pipelines."""
            self.file.close()

        # Backward-compatible alias for the original method name.
        spider_closed = close_spider
    
    
    class JsonExporterPipleline(object):
        """Item pipeline that exports items to ``articleexport.json`` via Scrapy's JsonItemExporter."""

        def __init__(self):
            # The exporter writes bytes, hence binary mode.
            self.file = open('articleexport.json', 'wb')
            self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
            self.exporter.start_exporting()

        def close_spider(self, spider):
            """Finish the export and close the output file when the spider closes."""
            self.exporter.finish_exporting()
            self.file.close()

        def process_item(self, item, spider):
            """Export ``item`` and return it so later pipelines still receive it."""
            self.exporter.export_item(item)
            return item
    

    4-14~15 通过pipeline保存数据到mysql

    模版:

    pip install mysqlclient(mysqlclient 是 MySQL 的一个驱动;下面的示例代码使用的是 pymysql,需要 pip install pymysql)

    import pymysql
    import pymysql.cursors	
    class MysqlPipeline(object):
        """Item pipeline that writes articles to MySQL synchronously (one commit per item)."""

        def __init__(self):
            # Keyword arguments: PyMySQL 1.0+ no longer accepts positional connect() args.
            # NOTE(review): host and credentials are hard-coded; consider moving them to settings.
            self.conn = pymysql.connect(
                host='192.168.0.106',
                user='root',
                password='root',
                database='article_spider',
                charset="utf8",
                use_unicode=True,
            )
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            """Insert ``item`` into ``jobbole_article`` and return it for later pipelines."""
            insert_sql = """
                insert into jobbole_article(title, url, create_date, fav_nums)
                VALUES (%s, %s, %s, %s)
            """
            # Values are passed separately so the driver does the escaping.
            self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
            self.conn.commit()
            return item
    
    
    from twisted.enterprise import adbapi
    class MysqlTwistedPipline(object):
        """Item pipeline that inserts items into MySQL asynchronously through a Twisted pool."""

        def __init__(self, dbpool):
            self.dbpool = dbpool

        @classmethod
        def from_settings(cls, settings):
            """Build the pipeline from settings.

            Reads MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWORD.
            """
            dbparms = dict(
                host=settings["MYSQL_HOST"],
                db=settings["MYSQL_DBNAME"],
                user=settings["MYSQL_USER"],
                passwd=settings["MYSQL_PASSWORD"],
                charset='utf8',
                cursorclass=pymysql.cursors.DictCursor,
                use_unicode=True,
            )
            # The driver name must match the module that provides the cursor class above.
            dbpool = adbapi.ConnectionPool("pymysql", **dbparms)

            return cls(dbpool)

        def process_item(self, item, spider):
            """Queue the insert on the connection pool and return the item immediately."""
            query = self.dbpool.runInteraction(self.do_insert, item)
            # Report failures of the asynchronous insert.
            query.addErrback(self.handle_error, item, spider)
            return item

        def handle_error(self, failure, item, spider):
            """Print the failure raised by an asynchronous insert."""
            print(failure)

        def do_insert(self, cursor, item):
            """Run the insert; each item type supplies its own SQL via ``get_insert_sql()``."""
            insert_sql, params = item.get_insert_sql()
            print(insert_sql, params)
            cursor.execute(insert_sql, params)
    

    4-16~17 scrapy item loader机制

    模版:

    scrapy item loader机制,便于以后的维护

    items.py文件中

    import datetime
    import re
    
    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import MapCompose, TakeFirst, Join
    
    from utils.common import extract_num
    from settings import SQL_DATETIME_FORMAT, SQL_DATE_FORMAT
    from w3lib.html import remove_tags
    def add_jobbole(value):
        """Return ``value`` with the suffix "-bobby" appended."""
        suffix = "-bobby"
        return value + suffix
    
    
    def date_convert(value):
        """Parse a ``YYYY/MM/DD`` string into a ``datetime.date``.

        Surrounding whitespace and the "·" separator that accompanies the date
        in the scraped text are removed before parsing. If the value cannot be
        parsed (or is not a string), today's date is returned instead.
        """
        try:
            cleaned = value.strip().replace("·", "").strip()
            return datetime.datetime.strptime(cleaned, "%Y/%m/%d").date()
        except (AttributeError, TypeError, ValueError):
            # Best-effort fallback: use the crawl date.
            return datetime.datetime.now().date()
    
    
    def get_nums(value):
        """Return the first run of digits in ``value`` as an int, or 0 if there is none."""
        # Raw-string \d+ : the pattern must match digits, not the literal letter "d".
        match_re = re.search(r"\d+", value)
        return int(match_re.group()) if match_re else 0
    
    def return_value(value):
        """Identity function: hand back ``value`` untouched."""
        unchanged = value
        return unchanged
    
    
    def remove_comment_tags(value):
        """Blank out the comment-count entry that is extracted together with the tags."""
        return "" if "评论" in value else value
    
    class ArticleItemLoader(ItemLoader):
        # Custom item loader: TakeFirst() is the default output processor for every field.
        default_output_processor = TakeFirst()
    
    
    class JobBoleArticleItem(scrapy.Item):
        """Article item; it also builds its own MySQL insert statement."""

        title = scrapy.Field()
        create_date = scrapy.Field(
            input_processor=MapCompose(date_convert),
        )
        url = scrapy.Field()
        url_object_id = scrapy.Field()
        # The identity output processor replaces the loader default so the value stays a list.
        front_image_url = scrapy.Field(
            output_processor=MapCompose(return_value)
        )
        front_image_path = scrapy.Field()
        praise_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        comment_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        fav_nums = scrapy.Field(
            input_processor=MapCompose(get_nums)
        )
        # Drop the comment-count entry, then join the remaining tags with commas.
        tags = scrapy.Field(
            input_processor=MapCompose(remove_comment_tags),
            output_processor=Join(",")
        )
        content = scrapy.Field()

        def get_insert_sql(self):
            """Return ``(insert_sql, params)`` for inserting this item into ``jobbole_article``.

            On a duplicate key the stored content and favourite count are refreshed
            from the new values. Unset image fields are written as empty strings.
            """
            insert_sql = """
                insert into jobbole_article(title, url, create_date, fav_nums, front_image_url, front_image_path,
                praise_nums, comment_nums, tags, content)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE content=VALUES(content), fav_nums=VALUES(fav_nums)
            """

            # front_image_url is a list; only the first URL is stored.
            image_urls = self.get("front_image_url")
            front_image_url = image_urls[0] if image_urls else ""

            params = (
                self["title"], self["url"], self["create_date"], self["fav_nums"],
                front_image_url, self.get("front_image_path", ""),
                self["praise_nums"], self["comment_nums"],
                self["tags"], self["content"],
            )
            return insert_sql, params
    

    spider.py文件中部分代码

    def parse_detail(self, response):
        """Populate a JobBoleArticleItem through ArticleItemLoader and yield it.

        Only the selectors are declared here; cleaning and conversion happen in
        the processors attached to the item's fields.
        """
        # Cover image URL passed along from the list page through Request.meta.
        front_image_url = response.meta.get("front_image_url", "")

        item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        item_loader.add_css("title", ".entry-header h1::text")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
        item_loader.add_value("front_image_url", [front_image_url])
        item_loader.add_css("praise_nums", ".vote-post-up h10::text")
        item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
        item_loader.add_css("fav_nums", ".bookmark-btn::text")
        item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
        item_loader.add_css("content", "div.entry")

        # load_item() runs the processors and returns the populated item.
        yield item_loader.load_item()
    

    作者:今孝

    出处:http://www.cnblogs.com/jinxiao-pu/p/6721848.html

    本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接。

  • 相关阅读:
    javascript提升复习
    关于加解密的
    java动态代理汇总
    ActiveMQ 使用
    16年上半年小结,下半年计划
    多线程之ReentrantReadWriteLock
    xml转换之
    2015-03 月份学习总结 分类: 学习总结 2015-04-01 20:25 87人阅读 评论(0) 收藏
    2015-03 月份学习总结 分类: 学习总结 2015-04-01 20:25 88人阅读 评论(0) 收藏
    IBM Rational AppScan 无法记录登录序列 分类: 数据安全 2015-03-18 16:46 157人阅读 评论(0) 收藏
  • 原文地址:https://www.cnblogs.com/jinxiao-pu/p/6721848.html
Copyright © 2020-2023  润新知