• scrapy-jobbole伯乐案例


    settings.py 配置项目管道

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # The integer is the pipeline's order value: pipelines with a smaller
    # number run earlier (i.e. have higher priority).
    ITEM_PIPELINES = {'bole.pipelines.BolePipeline': 300}

    jobbole.py

    import scrapy
    from bole.items import BoleItem
    
    class JobboleSpider(scrapy.Spider):
        """Crawl the jobbole.com listing pages and yield one BoleItem per article.

        Listing pages are enumerated by page number in ``start_requests``
        (rather than by following a "next page" link); every article link found
        on a listing page is requested and handed to ``parse_detail``.
        """

        name = 'jobbole'
        allowed_domains = ['jobbole.com']

        # URL pattern of the paginated listing; ``{}`` is the 1-based page number.
        list_url_template = 'http://www.jobbole.com/caijing/gsyw/index_{}.html'
        # Prefix that article file names are appended to when building detail URLs.
        detail_url_prefix = 'http://www.jobbole.com/caijing/gsyw/'
        # Number of the last listing page to request (inclusive).
        last_page = 32

        def start_requests(self):
            """Yield one request per listing page, from page 1 to ``last_page``."""
            # int() keeps this working when the value is overridden with a string,
            # e.g. through a spider argument on the command line.
            for page in range(1, int(self.last_page) + 1):
                yield scrapy.Request(
                    url=self.list_url_template.format(page),
                    callback=self.parse,
                )

        def parse(self, response):
            """Yield a detail-page request for every article link on a listing page."""
            hrefs = response.xpath(
                '//div[@class="list-item"]/div[@class="img"]/a/@href'
            ).extract()
            for href in hrefs:
                # Only the last path segment of the link is kept and re-anchored
                # on the section prefix.
                detail_url = self.detail_url_prefix + href.split('/')[-1]
                yield scrapy.Request(url=detail_url, callback=self.parse_detail)

        def parse_detail(self, response):
            """Extract title, publish date and URL from an article page.

            Yields a single ``BoleItem``. ``p_time`` is the first
            whitespace-separated token of the date text, or ``None`` when the
            page has no date text, so a missing node no longer raises
            ``AttributeError``.
            """
            title = response.xpath(
                '//div[@class="article-head"]/h1/text()'
            ).extract_first()
            raw_date = response.xpath(
                '//div[@class="about"]/div[@class="date"]/span[1]/text()'
            ).extract_first()

            # extract_first() returns None when nothing matches; guard before splitting.
            date_parts = raw_date.split() if raw_date else []
            p_time = date_parts[0] if date_parts else None

            item = BoleItem()
            item['title'] = title
            item['p_time'] = p_time
            item['article_url'] = response.url
            yield item

    items.py

    import scrapy
    
    
    class BoleItem(scrapy.Item):
        """Scrapy item holding the three values collected for one article."""
        # Article title text.
        title = scrapy.Field()
        # Publication date of the article.
        p_time = scrapy.Field()
        # URL of the article page.
        article_url = scrapy.Field()

    bole_mysql.py

    """
    CREATE TABLE bole_data(
        id int primary key auto_increment,
        title varchar(100),
        p_time date,
        article_url varchar(100)) default charset=utf8mb4;
    """
    import pymysql
    
    
    class BoleMysql(object):
        """Small helper that owns one pymysql connection and cursor for inserts.

        The connection is opened on construction and released by ``close()``
        (also invoked from ``__del__`` as a fallback).
        """

        def __init__(self, host='127.0.0.1', user='root', passwd='510520',
                     db='pachong', charset='utf8mb4'):
            """Connect to the database and create a cursor.

            The defaults reproduce the original hard-coded settings.
            NOTE(review): credentials are better supplied from configuration
            than kept as source defaults.
            """
            # Set first so close()/__del__ stay safe if connect() raises.
            self.conn = None
            self.cursor = None
            self.conn = pymysql.connect(host=host, user=user, passwd=passwd,
                                        db=db, charset=charset)
            self.cursor = self.conn.cursor()

        def execute_insert_sql(self, sql, bole_data):
            """Execute a parameterized statement and commit it.

            :param sql: statement containing ``%s`` placeholders.
            :param bole_data: sequence of values bound to the placeholders.
            :raises Exception: whatever the driver raised; the transaction is
                rolled back before the error is re-raised.
            """
            try:
                self.cursor.execute(sql, bole_data)
                self.conn.commit()
            except Exception:
                # Do not leave a half-finished transaction on the connection.
                self.conn.rollback()
                raise

        def close(self):
            """Close the cursor and the connection; safe to call more than once."""
            cursor = getattr(self, 'cursor', None)
            conn = getattr(self, 'conn', None)
            # Clear the references first so a repeated call is a no-op.
            self.cursor = None
            self.conn = None
            try:
                if cursor is not None:
                    cursor.close()
            finally:
                if conn is not None:
                    conn.close()

        def __del__(self):
            # Fallback cleanup for callers that never call close().
            self.close()
    
    
    if __name__ == '__main__':
        # Manual smoke test: push one hand-written row through BoleMysql.
        sample_row = ('花好月圆夜', '2020-12-18', 'https://www.baidu.com')
        statement = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)"
        db = BoleMysql()
        db.execute_insert_sql(statement, sample_row)

    pipelines.py

    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    from project_01.shujuku.bole_mysql import BoleMysql
    
    
    class BolePipeline:
        """Item pipeline that stores every scraped item as a row in ``bole_data``."""

        # Parameterized insert; values are bound in the order title, p_time, article_url.
        _INSERT_SQL = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)"

        def __init__(self):
            # One database helper is created per pipeline instance.
            self.bole_mysql = BoleMysql()

        def process_item(self, item, spider):
            """Insert the item's three fields and pass the item on unchanged."""
            row = (item['title'], item['p_time'], item['article_url'])
            self.bole_mysql.execute_insert_sql(self._INSERT_SQL, row)
            return item

    run_jobbole.py

    from scrapy.cmdline import execute
    
    # Start the 'jobbole' spider by passing the command-line words to Scrapy
    # (same argv as 'scrapy crawl jobbole'.split()).
    execute(['scrapy', 'crawl', 'jobbole'])
  • 相关阅读:
    【PHPStorm使用手册】如何设置字体大小?
    Django——admin组件简单入门
    cookie与session
    Django
    练习题
    线程理论之大白话
    队列
    初识gevent模块
    Python第三方模块安装
    Python标准模块_concurrent.futures
  • 原文地址:https://www.cnblogs.com/glz666/p/14190065.html
Copyright © 2020-2023  润新知