• Python Scrapy 爬虫存储数据库的方法（带去重步骤）


    import hashlib
    import json
    import os
    import random
    import time

    import pymongo
    import pymysql
    import requests
    
    # MongoDB handle: collection 'dn' of database 'cs'. MongoClient() is called
    # with no arguments, so pymongo's default connection settings apply.
    _mongo_client = pymongo.MongoClient()
    db = _mongo_client['cs']['dn']

    # MySQL handle: one connection to database 'cs' and a single cursor created
    # from it; both are module-level and shared by the pipeline below.
    db1 = pymysql.connect(
        user='root',
        password='root',
        db='cs',
        charset='utf8',
    )
    cursor = db1.cursor()
    
    class CsdnPipeline(object):
        """Scrapy item pipeline that stores each distinct item once.

        For every item not seen before in this run, the pipeline downloads the
        images listed under ``item['content_img']``, replaces that field with
        the local file paths (or a placeholder string when there are none),
        inserts the item into the module-level MongoDB collection ``db`` and
        writes its JSON form into the ``data`` column of MySQL table ``dn1``.
        """

        # Directory the downloaded images are written to. Raw string on purpose:
        # in a normal literal "\34" and "\t" are escape sequences and a trailing
        # backslash would escape the closing quote.
        IMG_DIR = r'F:\34\tu'

        # Seconds to wait for an image download; without a timeout one stalled
        # server would block the pipeline indefinitely.
        IMG_TIMEOUT = 30

        def __init__(self):
            # Content fingerprints of the items already stored during this run.
            self.set = set()

        def process_item(self, item, spider):
            """Store ``item`` unless an item with identical content was stored.

            Args:
                item: mapping-like scraped item; must contain ``content_img``
                    (an iterable of image URLs, possibly empty).
                spider: the spider that produced the item (unused).

            Returns:
                The same ``item`` object. For a new item its ``content_img``
                field has been replaced by a list of local image paths, or by
                the placeholder string when there were no images. A duplicate
                is returned untouched and is not stored again.
            """
            # Fingerprint BEFORE mutating content_img, so a later item with the
            # same scraped content maps to the same fingerprint.
            fingerprint = self._fingerprint(item)
            if fingerprint in self.set:
                print('已经存在')
                return item

            image_urls = item['content_img']
            if image_urls:
                item['content_img'] = [
                    self._download_image(url) for url in image_urls
                ]
            else:
                item['content_img'] = '暂无图片'

            # Separate dict copies: pymongo adds an ``_id`` ObjectId to the
            # document it inserts, which json.dumps could not serialize.
            db.insert_one(dict(item))
            data = json.dumps(dict(item))
            # Parameterized query: the driver escapes quotes and backslashes in
            # the JSON text instead of them being spliced into the SQL string.
            cursor.execute("insert into dn1(`data`) VALUES (%s)", (data,))
            db1.commit()

            # Remember the item only after both stores succeeded.
            self.set.add(fingerprint)
            return item

        @staticmethod
        def _fingerprint(item):
            """Return a hex digest identifying the content of ``item``."""
            # sort_keys makes the digest independent of field order;
            # default=str is used only here, so values JSON cannot encode
            # natively still contribute to the fingerprint.
            canonical = json.dumps(dict(item), sort_keys=True, default=str)
            return hashlib.sha1(canonical.encode('utf-8')).hexdigest()

        def _download_image(self, url):
            """Download ``url`` into ``IMG_DIR`` and return the local path."""
            # File name: fractional part of the current time followed by a
            # random integer, then '.jpg'.
            file_name = '{}{}.jpg'.format(
                str(time.time()).split('.')[1],
                random.randrange(1, 9999999999999999999999999),
            )
            img_path = os.path.join(self.IMG_DIR, file_name)
            img_source = requests.get(url, timeout=self.IMG_TIMEOUT).content
            # 'with' closes the file even if the write fails.
            with open(img_path, 'wb') as img_file:
                img_file.write(img_source)
            return img_path
  • 相关阅读:
    Redis做为缓存的几个问题
    Python——操作smb文件服务器(上传和下载)
    Python——assert、isinstance的用法
    centos7-修改默认python为3
    mqtt
    如何编译生成 mkfs.ubifs、ubinize 工具
    2020-2笔记
    2020-1笔记
    C语言中getopt()和getopt_long()函数的用法
    buildroot
  • 原文地址:https://www.cnblogs.com/duanlinxiao/p/9851206.html
Copyright © 2020-2023  润新知