Create the project

```shell
# Create a new project
$ scrapy startproject jianshu
# Enter the project directory
$ cd jianshu
# Generate a spider from the CrawlSpider template
$ scrapy genspider -t crawl jianshu_spider jianshu.com
```
items.py

```python
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    article_id = scrapy.Field()
    origin_url = scrapy.Field()
    author = scrapy.Field()
    avatar = scrapy.Field()
    pub_time = scrapy.Field()
```
jianshu_spider.py

```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from jianshu.items import ArticleItem


class JianshuSpiderSpider(CrawlSpider):
    name = 'jianshu_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    # Article URLs look like /p/<12 hex chars>; follow them and keep crawling.
    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'),
             callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='title']/text()").get()
        content = response.xpath("//div[@class='show-content-free']").get()
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        author = response.xpath("//div[@class='info']/span/a/text()").get()
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()
        # Strip the query string, then take the last path segment as the id.
        article_id = response.url.split("?")[0].split("/")[-1]
        origin_url = response.url
        item = ArticleItem(
            title=title,
            content=content,
            avatar=avatar,
            pub_time=pub_time,
            article_id=article_id,
            origin_url=origin_url,
            author=author
        )
        yield item
```
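Like many sites, jianshu.com tends to reject requests carrying Scrapy's default User-Agent, so before running the spider it is worth adjusting the settings. A minimal sketch of the relevant settings.py entries; the header values and delay here are assumptions, tune them for your own run:

```python
# settings.py (excerpt) -- illustrative values, adjust as needed
ROBOTSTXT_OBEY = False   # jianshu's robots.txt disallows generic crawlers
DOWNLOAD_DELAY = 1       # an assumed, conservative politeness delay

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    # Any reasonably current browser UA string works; this one is an example.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
}
```

With those in place, the crawl is started with `scrapy crawl jianshu_spider`.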
Synchronous MySQL inserts

```python
import pymysql


class JianshuPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'port': 3306,
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'],
                                       item['author'], item['avatar'],
                                       item['pub_time'], item['origin_url'],
                                       item['article_id']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        # Build the INSERT statement once and cache it.
        if not self._sql:
            self._sql = """
                insert into article(title, content, author, avatar,
                    pub_time, origin_url, article_id)
                values (%s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
```
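Both pipelines assume an `article` table already exists in the `jianshu` database. The original does not give the schema, so the sketch below is an assumed layout whose columns simply match the seven inserted fields; run it once before crawling and adjust the types to taste:

```python
import pymysql

# Assumed schema -- the column types are guesses that fit the scraped data.
CREATE_TABLE_SQL = """
create table if not exists article (
    id int primary key auto_increment,
    title varchar(255),
    content longtext,
    author varchar(64),
    avatar varchar(255),
    pub_time varchar(32),
    origin_url varchar(255),
    article_id varchar(32)
) default charset=utf8
"""

conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                       database='jianshu', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(CREATE_TABLE_SQL)
conn.commit()
conn.close()
```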
Asynchronous MySQL inserts

```python
from pymysql import cursors
from twisted.enterprise import adbapi


class JianshuTwistedPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'user': 'root',
            'password': '123456',
            'database': 'jianshu',
            'port': 3306,
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # adbapi runs the blocking pymysql calls in a thread pool,
        # so inserts no longer block the crawl.
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        self._sql = None

    @property
    def sql(self):
        # Build the INSERT statement once and cache it.
        if not self._sql:
            self._sql = """
                insert into article(title, content, author, avatar,
                    pub_time, origin_url, article_id)
                values (%s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql

    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item  # return the item so later pipelines still receive it

    def insert_item(self, cursor, item):
        cursor.execute(self.sql, (item['title'], item['content'],
                                  item['author'], item['avatar'],
                                  item['pub_time'], item['origin_url'],
                                  item['article_id']))

    def handle_error(self, error, item, spider):
        print('=' * 10 + 'error' + '=' * 10)
        print(error)
        print('=' * 10 + 'error' + '=' * 10)
```
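Neither pipeline runs until it is registered in settings.py. A minimal sketch, assuming both classes live in the `jianshu/pipelines.py` module that `startproject` generates; enable exactly one of the two:

```python
# settings.py (excerpt) -- pick either the sync or the async pipeline
ITEM_PIPELINES = {
    # 'jianshu.pipelines.JianshuPipeline': 300,       # synchronous version
    'jianshu.pipelines.JianshuTwistedPipeline': 300,  # asynchronous version
}
```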