Using bulk_write can speed MongoDB up considerably compared with updating or inserting documents one at a time, because many operations are sent to the server in a single round trip instead of one round trip per document.
import time

import pymongo
from pymongo.operations import UpdateOne

tic = time.time()
mongoclient = pymongo.MongoClient(host="mongodb3.xxx.com", port=27017)
MongoDB = mongoclient["xxx"]
collection = MongoDB.resume_test

# Fields every document should carry; we only need the key names.
document = {
    "Duplicate": '',
    "Skill": '',
    "SourceURL": '',
}
document = set(document)


def bulk_write(requests, collection, last_one=False):
    # Flush the buffered operations once enough have accumulated, or
    # unconditionally on the final call. Guard against an empty list:
    # collection.bulk_write() raises InvalidOperation if given no operations.
    if requests and (len(requests) >= 10000 or last_one):
        collection.bulk_write(requests)
        return []
    return requests


requests = []
for index, data in enumerate(collection.find({})):
    if index % 10000 == 0:
        print(index)  # progress marker every 10000 documents

    doc_id = data['_id']  # renamed from `id` to avoid shadowing the builtin
    # Keys the document is missing; difference() accepts the dict directly,
    # since iterating a dict yields its keys.
    add = {el: '' for el in document.difference(data)}
    if add:
        requests.append(UpdateOne({'_id': doc_id}, {'$set': add}))
    requests = bulk_write(requests, collection)

# Flush whatever remains after the loop.
requests = bulk_write(requests, collection, last_one=True)

toc = time.time()
print(f'finished, time cost: {toc - tic}')
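For reference, the one-at-a-time version this is being compared against could look like the sketch below (same host, database, collection, and field names as above, which are assumptions carried over from the snippet): each update_one() is its own round trip to the server, which is exactly the cost that bulk_write amortizes.

import time

import pymongo

mongoclient = pymongo.MongoClient(host="mongodb3.xxx.com", port=27017)
collection = mongoclient["xxx"].resume_test
fields = {"Duplicate", "Skill", "SourceURL"}

tic = time.time()
for data in collection.find({}):
    # Same per-document logic as above, but each update is a separate request.
    add = {el: '' for el in fields.difference(data)}
    if add:
        collection.update_one({'_id': data['_id']}, {'$set': add})
print(f'one-by-one finished, time cost: {time.time() - tic}')

If the order of the updates does not matter, passing ordered=False to bulk_write lets the server apply the batch without stopping at the first failed operation, which can buy a further speedup.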