import scrapy

from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    """Spider that scrapes joke author/content pairs from qiushibaike.com/text."""

    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath parsing is recommended here because the framework integrates
        # an xpath interface directly on the response object.
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # extract() returns a list of strings (index [0] for the first);
            # extract_first() is equivalent to extract()[0] but returns None
            # instead of raising IndexError when nothing matches.
            # author = div.xpath('./div/a[2]/h2/text()').extract()[0]
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # 1. Store the parsed values (author and content) on an item;
            #    the fields must be declared on QiubaiproItem.
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # 2. Hand the item to the pipeline (see the pipelines module).
            yield item
# Pipeline-based storage: persists items to a local text file.
class QiubaiproPipeline(object):
    # File handle, opened once per crawl in open_spider.
    fp = None

    def open_spider(self, spider):
        # Called exactly once, when the spider starts.
        print('开始爬虫')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Persist one item handed over by the spider.

        Called once for every item the spider yields. Returns the item so
        any later pipelines still receive it.
        """
        # extract_first() in the spider may produce None when an xpath does
        # not match; fall back to '' so the concatenation below cannot
        # raise TypeError.
        author = item['author'] or ''
        content = item['content'] or ''
        self.fp.write(author + ':' + content + ' ')
        return item

    def close_spider(self, spider):
        # Called exactly once, when the spider finishes. Guard against a
        # crawl where open_spider never ran (fp still None).
        print('爬虫结束')
        if self.fp is not None:
            self.fp.close()
# Pipeline that persists items to a MySQL database.
import pymysql


class QiubaiproPipeline(object):
    # Connection and cursor, created once per crawl in open_spider.
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Called once at spider start: open the connection and a single
        # reusable cursor (the database 'qiubai' and its table must exist).
        print('开始爬虫')
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='qiubai')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into the qiubai table and pass the item on."""
        # Parameterized query: the scraped text is untrusted input, so the
        # SQL string must never be built with %-interpolation (SQL
        # injection and quote-breakage); the driver escapes the values.
        sql = 'insert into qiubai values(%s,%s)'
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            # Best-effort storage: log the error and undo the failed insert,
            # but keep the crawl running.
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Called once at spider end. Guard against a crawl where
        # open_spider never ran (cursor/conn still None).
        print('爬虫结束')
        if self.cursor is not None:
            self.cursor.close()  # close the cursor object
        if self.conn is not None:
            self.conn.close()  # close the connection object
# Pipeline that persists items to a Redis list.
import json

import redis


class QiubaiproPipeline(object):
    # Redis connection, created once per crawl in open_spider.
    conn = None

    def open_spider(self, spider):
        # Called once at spider start: connect to the local Redis server.
        print('开始爬虫')
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push one item onto the Redis list named 'data' and pass it on."""
        # redis-py can only store bytes/str/int/float — pushing a raw dict
        # raises DataError — so the record is serialized to JSON first.
        # (Avoid naming the local 'dict': it shadows the builtin.)
        record = {
            'author': item['author'],
            'content': item['content'],
        }
        self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
        return item