settings.py (configure the item pipeline)
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 300 is the priority: the smaller the number, the higher the priority
    'bole.pipelines.BolePipeline': 300,
}
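When more than one pipeline is enabled, Scrapy passes every item through each of them in ascending order of this number. A minimal sketch, assuming a hypothetical BoleDedupePipeline that is not part of this project:

ITEM_PIPELINES = {
    # hypothetical second pipeline: 200 < 300, so it would see each item before BolePipeline
    'bole.pipelines.BoleDedupePipeline': 200,
    'bole.pipelines.BolePipeline': 300,
}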
jobbole.py
import scrapy
from bole.items import BoleItem


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['jobbole.com']
    # start_urls = ['http://www.jobbole.com/caijing/gsyw/']

    def start_requests(self):
        # The list pages are numbered index_1.html ... index_32.html
        base_url = 'http://www.jobbole.com/caijing/gsyw/index_{}.html'
        for i in range(1, 33):
            url = base_url.format(i)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Collect the link of every article on the list page and follow it
        href_list = response.xpath('//div[@class="list-item"]/div[@class="img"]/a/@href').extract()
        for href in href_list:
            href = "http://www.jobbole.com/caijing/gsyw/" + href.split('/')[-1]
            detail_request = scrapy.Request(url=href, callback=self.parse_detail)
            yield detail_request
        # Alternative: follow the "next page" link instead of generating all URLs in start_requests
        # next_page_url = response.xpath("//div[@id='layui-laypage-1']/a[@class='a1']/@href").extract()[1]
        # if next_page_url:
        #     next_page_url = 'http://www.jobbole.com/' + next_page_url
        #     yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_detail(self, response):
        # Extract the title and publication date from the article page
        article_url = response.url
        title = response.xpath('//div[@class="article-head"]/h1/text()').extract_first()
        p_time = response.xpath('//div[@class="about"]/div[@class="date"]/span[1]/text()').extract_first().split(' ')[0]
        item = BoleItem()
        item['title'] = title
        item['p_time'] = p_time
        item['article_url'] = article_url
        yield item
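The two XPath expressions can be checked against the live pages before running the full crawl; a sketch using scrapy shell (the exact output depends on the site's current markup):

scrapy shell "http://www.jobbole.com/caijing/gsyw/index_1.html"
>>> response.xpath('//div[@class="list-item"]/div[@class="img"]/a/@href').extract()
>>> # on an article page: response.xpath('//div[@class="article-head"]/h1/text()').extract_first()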
items.py
import scrapy


class BoleItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()        # article title
    p_time = scrapy.Field()       # publication date
    article_url = scrapy.Field()  # article URL
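A BoleItem behaves like a dict, but only the three declared fields can be assigned; an illustrative check (values are made up):

from bole.items import BoleItem

item = BoleItem(title='测试', p_time='2020-12-18')
item['article_url'] = 'https://www.baidu.com'
print(dict(item))       # all three declared fields are set
# item['author'] = 'x'  # would raise KeyError: 'author' is not a declared field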
bole_mysql.py
""" CREATE TABLE bole_data( id int primary key auto_increment, title varchar(100), p_time date, article_url varchar(100)) default charset=utf8mb4; """ import pymysql class BoleMysql(object): # 初始化就是连接数据库 def __init__(self): self.conn = pymysql.connect(host='127.0.0.1', user='root', passwd='510520', db='pachong', charset='utf8mb4') self.cursor = self.conn.cursor() def execute_insert_sql(self, sql, bole_data): self.cursor.execute(sql, bole_data) self.conn.commit() def __del__(self): self.cursor.close() self.conn.close() if __name__ == '__main__': bole = BoleMysql() insert_sql = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)" data = ('花好月圆夜', '2020-12-18', 'https://www.baidu.com') bole.execute_insert_sql(insert_sql, data)
pipelines.py
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

from project_01.shujuku.bole_mysql import BoleMysql


class BolePipeline:
    def __init__(self):
        # Open the MySQL connection once when the pipeline is created
        self.bole_mysql = BoleMysql()

    def process_item(self, item, spider):
        # Insert every scraped item into the bole_data table
        title = item['title']
        p_time = item['p_time']
        article_url = item['article_url']
        insert_sql = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)"
        data = (title, p_time, article_url)
        self.bole_mysql.execute_insert_sql(insert_sql, data)
        return item
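Scrapy also lets a pipeline acquire and release resources through its documented open_spider/close_spider hooks instead of __init__/__del__; an alternative sketch with the same behaviour:

from project_01.shujuku.bole_mysql import BoleMysql


class BolePipeline:
    def open_spider(self, spider):
        # connect when the crawl starts
        self.bole_mysql = BoleMysql()

    def close_spider(self, spider):
        # dropping the reference triggers BoleMysql.__del__, closing cursor and connection
        del self.bole_mysql

    def process_item(self, item, spider):
        insert_sql = "INSERT INTO bole_data(title, p_time, article_url) VALUES(%s, %s, %s)"
        self.bole_mysql.execute_insert_sql(insert_sql, (item['title'], item['p_time'], item['article_url']))
        return item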
run_jobbole.py
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl jobbole" from the command line
# execute(['scrapy', 'crawl', 'jobbole'])
execute('scrapy crawl jobbole'.split())
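The crawl can also be started from a script with Scrapy's CrawlerProcess, which loads settings.py (and therefore ITEM_PIPELINES) explicitly; a sketch assuming the spider module path is bole.spiders.jobbole:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from bole.spiders.jobbole import JobboleSpider  # assumed module path

process = CrawlerProcess(get_project_settings())  # picks up ITEM_PIPELINES from settings.py
process.crawl(JobboleSpider)
process.start()  # blocks until the crawl finishes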