• pyspider 数据存储到mongoDB中


    1、新建文件config.json,内容如下,文件放在pyspider文件夹下,路径为C:\Windows\System32\cmd.exe

    {
        "taskdb": "mongodb+taskdb://127.0.0.1:27017/pyspider_taskdb",
        "projectdb": "mongodb+projectdb://127.0.0.1:27017/pyspider_projectdb",
        "resultdb": "mongodb+resultdb://127.0.0.1:27017/pyspider_resultdb",
        "message_queue": "redis://127.0.0.1:6379/0",
        "webui": {
            "port": 5000
        }
    }

    2、安装redis,在redis文件夹下启动终端,运行命令启动redis服务端
    E:\redis>redis-server.exe redis.windows.conf
    redis默认16个数据库,db0,db1...db15,上述文件选择index为0的db数据库

    若想启动客户端,运行命令如下,set,get为测试
    E:\redis>redis-cli.exe -h 127.0.0.1 -p 6379
    127.0.0.1:6379> set myKey abc
    OK
    127.0.0.1:6379> get myKey
    "abc"
    127.0.0.1:6379>


    3、安装mongoDB,建文件夹db,并配置到mongoDB里去(文件夹不建也行)
    在bin文件夹下运行命令

    E:\MongoDB\Server\4.0\bin>mongod.exe --dbpath data\db

    在客户端运行一些查询命令


    show dbs
    查看有哪些数据库
    db
    查看当前数据库
    use dbname
    使用dbname数据库作为当前数据库
    show tables / show collections
    查看当前数据库下的表或集合,都指一个意思
    db.website.find()
    查看当前数据库下website集合的数据内容
    db.website.find().count()
    查看website表里数据总数

    4、启动redis,启动mongoDB后,启动pyspider,并把新加的配置文件配置进去
    D:\Python\Python36\Lib\site-packages\pyspider>pyspider --config config.json


    5、发现需要安装第三方模块
    pip install redis
    pip install pymongo


    6、在项目里重载函数on_result

    import pymongo
    def on_result(self, result):
        """Store one scraped product in the MongoDB collection ``website``.

        This hook is invoked for every crawled link; only ``detail_page``
        returns a non-empty result, so empty results are ignored.

        NOTE(review): this override does not call the base class
        implementation -- confirm whether the default result pipeline
        should still run as well.

        Args:
            result: dict returned by a page callback. It must contain the
                keys listed in ``fields`` below, otherwise ``KeyError`` is
                raised.
        """
        if not result:
            return

        fields = (
            'originalLink',
            'productName',
            'price',
            'productDescription',
            'category1',
            'category2',
            'category3',
            'images',
        )
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        try:
            # Database to write to; it may also be the one named in config.json.
            db = client['pyspider_projectdb']
            # Collection (the MongoDB equivalent of a table).
            coll = db['website']
            data = {field: result[field] for field in fields}
            # Collection.insert() is deprecated and removed in PyMongo 4;
            # insert_one() is the supported single-document call.
            data_id = coll.insert_one(data).inserted_id
            print(data_id)
        finally:
            # A client is created per result, so release its connections.
            client.close()

     7、完整代码

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-11-08 09:56:40
    # Project: product
    
    from pyspider.libs.base_handler import *
    import re
    import base64
    import os
    import urllib
    import urllib.request
    import requests
    import json
    import pymongo
    import uuid
    
    class Handler(BaseHandler):
        """pyspider handler that crawls zhe800.com products into MongoDB.

        Flow: ``on_start`` seeds the home page, ``index_page`` follows product
        and brand links, ``detail_page`` extracts one product (including its
        images, base64-encoded) and ``on_result`` inserts that product into
        the ``productzhe`` collection of the ``pyspider_projectdb`` database.
        """

        # Directory where product images are downloaded before being encoded.
        # Raw strings keep the backslashes literal ('\t' would otherwise be a tab).
        IMAGE_DIR = r'D:\pythonlianxi\testimg'
        # Text file to which every extracted product dict is appended.
        RESULT_FILE = r'D:\pythonlianxi\zhe800.txt'

        # Keys copied from a result dict into the MongoDB document.
        RESULT_FIELDS = (
            'id',
            'originalLink',
            'productName',
            'price',
            'productDescription',
            'category1',
            'category2',
            'category3',
            'images',
        )

        crawl_config = {
            "headers": {
                "User-Agent": "BaiDuSpider",
            }
        }

        def default(self, obj):
            """Return ``obj`` decoded as UTF-8 when it is ``bytes``.

            Any other type is passed to ``json.JSONEncoder.default``.
            Not called anywhere in this class.
            """
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

        @every(minutes=24 * 60)
        def on_start(self):
            """Seed the crawl with the zhe800 home page."""
            self.crawl('https://www.zhe800.com/', callback=self.index_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            """Queue product pages for extraction and brand pages for further listing."""
            for each in response.doc('a[href^="http"]').items():
                href = each.attr.href
                if re.match(r'https://shop\.zhe800\.com/products/.+', href):
                    next_callback = self.detail_page
                elif re.match(r'https://brand\.zhe800\.com/.+', href):
                    next_callback = self.index_page
                else:
                    continue
                self.crawl(href, callback=next_callback, validate_cert=False,
                           connect_timeout=50, timeout=200)

        @config(priority=2)
        def detail_page(self, response):
            """Extract one product from a detail page.

            Returns:
                A dict with the keys ``id``, ``originalLink``, ``productName``,
                ``price``, ``productDescription``, ``category1``..``category3``
                and ``images``; ``None`` when the page has no ``<h1>`` text.
            """
            product_name = response.doc('h1').text()
            if not product_name:
                return

            images = self._download_images(response)

            # When several breadcrumb elements match, the last one wins.
            breadcrumb = ''
            for each in response.doc('aside[class="pos area"]').items():
                breadcrumb = each.text()
            category1, category2, category3 = self._split_categories(breadcrumb)

            price = self._parse_price(
                response.doc('strong[class="red js_price_st"]>I').text())

            description = ''.join(
                des.attr.title
                for des in response.doc('ul[class="list12 clear"]>LI').items()
                if des.attr.title
            )

            result = {
                'id': uuid.uuid4().hex,
                'originalLink': response.url,
                'productName': product_name,
                'price': price,
                'productDescription': description,
                'category1': category1,
                'category2': category2,
                'category3': category3,
                'images': images,
            }

            # Explicit UTF-8 so non-ASCII product text cannot fail on the
            # platform's default encoding.
            with open(self.RESULT_FILE, 'a', encoding='utf-8') as f:
                f.write(str(result) + '\n')

            return result

        def on_result(self, result):
            """Insert a non-empty result into MongoDB.

            Empty results (callbacks that returned nothing) are ignored.
            Raises ``KeyError`` if ``result`` lacks one of ``RESULT_FIELDS``.
            """
            if not result:
                return
            print(result)
            client = pymongo.MongoClient(host='127.0.0.1', port=27017)
            try:
                coll = client['pyspider_projectdb']['productzhe']
                data = {field: result[field] for field in self.RESULT_FIELDS}
                # Collection.insert() is deprecated and removed in PyMongo 4.
                coll.insert_one(data)
            finally:
                # A client is created per result, so release its connections.
                client.close()

        def _download_images(self, response):
            """Download the page's .jpg product images and return them base64-encoded.

            Files are saved as ``1.jpg``, ``2.jpg``, ... in ``IMAGE_DIR``; the
            numbering restarts for every page, so files from earlier pages are
            overwritten.

            Returns:
                A list of ``{'id': int, 'base64': str}`` dicts in page order.
            """
            os.makedirs(self.IMAGE_DIR, exist_ok=True)
            images = []
            for img in response.doc('div[class="deteilpic l"]>UL>LI>A>IMG').items():
                src = img.attr.src
                # Skip <img> tags without a src as well as non-jpg sources.
                if not src or not re.match(r'.+?\.jpg', src):
                    continue
                image_id = len(images) + 1
                image_path = os.path.join(self.IMAGE_DIR, '{0}.jpg'.format(image_id))
                urllib.request.urlretrieve(src, image_path)
                with open(image_path, 'rb') as f:
                    encoded = base64.b64encode(f.read()).decode()
                images.append({'id': image_id, 'base64': encoded})
            return images

        @staticmethod
        def _split_categories(breadcrumb):
            """Split a ``'a > b > c > d'`` breadcrumb into its 2nd to 4th parts.

            A missing level is replaced by the placeholder ``'category<N>'``.
            """
            parts = breadcrumb.split(' > ')
            return tuple(
                parts[level] if len(parts) > level else 'category{0}'.format(level)
                for level in (1, 2, 3)
            )

        @staticmethod
        def _parse_price(text):
            """Return the price in ``text`` as a float, or ``0.0`` if none is found.

            If the whole text is not a number, the first number embedded in it
            is used instead.
            """
            try:
                return float(text)
            except ValueError:
                match = re.search(r'[0-9]*\.?[0-9]+', text)
                return float(match.group()) if match else 0.0
  • 相关阅读:
    矩阵乘法运算test
    c字符数组转整型【c语言复习1】
    (转载)JavaScript的那些书
    数据结构 排序算法
    (转载)给自己降降级你会发现一片广阔的天空
    Cocos2D简介
    JRE not compatible with workspace .class file compatibility: 1.7
    水晶报表问题,请高手指教。
    控件开发该如何入门?
    cnblogsDottext的FTB不生效,只是显示一个textarea标记,为什么呢?
  • 原文地址:https://www.cnblogs.com/lely/p/10148107.html
Copyright © 2020-2023  润新知