import pymongo
import click
# Basic database connection settings.
db_configs = {
    "type": "mongo",  # backend kind; not read in the visible part of this file
    "host": "127.0.0.1",
    "port": "27017",  # kept as a string; it is interpolated into the URI
    "user": "",  # empty user/password means "connect without auth"
    "password": "",
    "db_name": "spider",
}
class Mongo:
    """Maintenance helper around one MongoDB database.

    Connection settings are read from the module-level ``db_configs`` dict.
    Every method takes the target collection name as ``col`` and defaults to
    ``"dianping_seed_data"``.
    """

    def __init__(self):
        """Build the client and select the configured database.

        The client is created with ``connect=False``, so no connection is
        attempted here; connection/auth errors surface on first use.
        """
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # The config key is "password"; the previous lookup of "passwd" always
        # returned None, so authentication was silently skipped.
        self.password = db_configs.get("password")

        client_options = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() was removed in PyMongo 4; credentials
            # are passed to the client instead. authSource keeps the same
            # authentication database the old call used (the target db).
            client_options.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )
        self.client = pymongo.MongoClient(
            f'mongodb://{self.host}:{self.port}', **client_options
        )
        self.db = self.client[self.db_name]

    def reset_status(self, col="dianping_seed_data"):
        """Set ``status`` back to 0 on every document whose status is 1 or 3."""
        self.db[col].update_many(
            {'$or': [{'status': 1}, {'status': 3}]},
            {'$set': {"status": 0}},
        )

    def reset_all_status(self, col="dianping_seed_data"):
        """Set ``status`` to 0 on every document in the collection."""
        self.db[col].update_many({}, {'$set': {"status": 0}})

    def add_index(self, col="dianping_seed_data"):
        """Create an ascending index on ``status``.

        status: 0 = initial, 1 = download started, 2 = download finished.
        (Status 3 is also reset by ``reset_status``; its meaning is not
        documented in this file.)
        """
        # The index must NOT be unique: many documents share the same status
        # (reset_all_status sets all of them to 0), so a unique index would
        # fail to build or reject the updates.
        self.db[col].create_index([('status', pymongo.ASCENDING)])

    def get_index(self, col="dianping_seed_data"):
        """Print every index defined on the collection."""
        for index in self.db[col].list_indexes():
            print(index)

    def find_duplicate(self, col="dianping_seed_data"):
        """Group documents by ``url`` and store the duplicated groups in ``result``.

        Each document written to the ``result`` collection holds the grouped
        url (``_id``), the ``_id`` values of the documents sharing it
        (``_id_list``), their distinct ``status`` values and the group size
        (``count``). ``$out`` replaces any existing ``result`` collection.

        :param col: name of the collection to scan
        :return: list of the group documents written to ``result``
        """
        pipeline = [
            {'$group': {
                '_id': {'url': "$url"},
                '_id_list': {'$addToSet': "$_id"},  # _ids of the grouped documents
                'status': {'$addToSet': "$status"},  # distinct statuses in the group
                'count': {'$sum': 1},
            }},
            # Only groups with more than one document are duplicates.
            {'$match': {'count': {'$gt': 1}}},
            {'$out': 'result'},
        ]
        # A pipeline ending in $out returns an empty cursor, so the groups
        # are read back from the output collection.
        self.db[col].aggregate(pipeline, allowDiskUse=True)
        duplicates = list(self.db.result.find())
        for item in duplicates:
            print(item)
        return duplicates

    def delete_dup(self, col="dianping_seed_data"):
        """Delete all but one document of every group in ``result``.

        Reads the groups from the ``result`` collection, keeps the first
        ``_id`` of each ``_id_list`` and deletes the others from ``col``.
        ``result`` is dropped once all groups were processed; on error the
        problem is printed and ``result`` is left in place.
        """
        try:
            for group in self.db.result.find():
                # Keep one document per group; tolerate a missing _id_list.
                surplus_ids = (group.get("_id_list") or [])[1:]
                if surplus_ids:
                    self.db[col].delete_many({'_id': {'$in': surplus_ids}})
            self.db.result.drop()
        except Exception as e:
            print("删除的时候出现问题", e.args)
# Command-line entry point: each option selects one maintenance action on the
# default collection of a fresh Mongo() helper.
#   --s  "all": reset every status to 0; "two": reset statuses 1 and 3 to 0
#   --i  "a": create the status index;   "g": print the existing indexes
#   --d  "d": delete duplicates listed in `result`; "f": find duplicates
# Unrecognised values are ignored. (Kept as comments rather than a docstring
# so the click --help output is unchanged.)
@click.command()
@click.option('--s', type=str, default="two", help="状态:all表示全部重置为0,two:表示重置状态为1、3的重置为0")
@click.option('--i', type=str, default="a", help="a:增加索引 g:获取索引")
@click.option('--d', type=str, default="f", help="d:删除 f:查询并生成聚合之后的结果")
def run(s, i, d):
    m = Mongo()
    if s:
        print("获取参数为:", s)
        if s == "all":
            print("所有数据状态重置为0:", s)
            m.reset_all_status()
        elif s == "two":
            print("部分数据状态重置为0:", s)
            # This branch previously only printed and never reset anything.
            m.reset_status()
    if i == "a":
        m.add_index()
    elif i == "g":
        m.get_index()
    if d == "d":
        m.delete_dup()
    elif d == "f":
        # Was `elif i == "f"`, which made this branch unreachable whenever
        # --i held one of its documented values.
        m.find_duplicate()
if __name__ == '__main__':
    # Running the file as a script removes the duplicates currently listed in
    # the `result` collection.
    # NOTE(review): the click command `run` is not invoked here - confirm
    # whether the CLI was meant to be the entry point.
    Mongo().delete_dup()