1. Queues and threads
import MySQLdb
import MySQLdb.cursors
import queue
import threading

def update_line_thread():
    # each worker thread keeps its own MySQL connection and cursor
    connection = MySQLdb.connect(host='xx.xx.xx.xx', port=3306, user='', passwd='',
                                 db='db_name', charset='utf8mb4', connect_timeout=100000,
                                 cursorclass=MySQLdb.cursors.SSCursor, autocommit=1)
    cursor = connection.cursor()
    sql = 'UPDATE table SET col_name=%s WHERE id=%s'
    while True:
        stroke, Id = q.get()
        keys = (stroke, Id)
        try:
            cursor.execute(sql, keys)
        except Exception as e:
            print(e, Id)
        q.task_done()

q = queue.Queue()
for i in range(10):
    # daemon threads exit automatically once the main thread finishes
    t = threading.Thread(target=update_line_thread, daemon=True)
    t.start()

# id2stroke is an existing {id: value} mapping prepared elsewhere
for ID in id2stroke:
    q.put((id2stroke[ID], ID))
q.join()
2. Process pool and thread pool
IO-bound work  -> multiprocessing.pool.ThreadPool
CPU-bound work -> multiprocessing.Pool
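A minimal sketch of this rule of thumb (io_task and cpu_task below are stand-in workloads, not part of the original example): the same map call runs on threads to overlap IO waiting, and on processes for raw computation.

import time
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool

def io_task(i):
    # stand-in for an IO-bound job (network / disk wait); threads overlap the waiting
    time.sleep(0.1)
    return i

def cpu_task(n):
    # stand-in for a CPU-bound job; separate processes sidestep the GIL
    return sum(x * x for x in range(n))

if __name__ == '__main__':
    with ThreadPool(8) as tp:      # IO-bound work -> multiprocessing.pool.ThreadPool
        print(tp.map(io_task, range(8)))
    with Pool(4) as pool:          # CPU-bound work -> multiprocessing.Pool
        print(pool.map(cpu_task, [200000] * 4))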
import MySQLdb
import MySQLdb.cursors
import pymongo
from multiprocessing.pool import Pool, ThreadPool

def gen_doc(paper_list):
    # fetch the three parts of each paper concurrently (IO-bound) in one thread pool
    with ThreadPool(7) as tp:
        p1 = tp.apply_async(get_paper_info, (paper_list, ))
        p2 = tp.apply_async(get_paper_field, (paper_list, ))
        p3 = tp.apply_async(get_paper_author, (paper_list, ))
        paper_info = p1.get()
        paper_field = p2.get()
        paper_author = p3.get()
    for line in paper_info:
        # build the real document here; it must contain '_id' for replace_one below
        yield {"key": "value"}

def update_batch(paper_list):
    conn = pymongo.MongoClient('xx.xx.xx.xx', 27017, username="", password="").db_name.collection_name
    for doc in gen_doc(paper_list):
        # conn.delete_one({'_id': doc['_id']})
        conn.replace_one({'_id': doc['_id']}, doc, upsert=True)

def update_by_list(paper_list):
    paper_list.sort()
    batch_size = 1000
    batch_list = [paper_list[begin:begin+batch_size] for begin in range(0, len(paper_list), batch_size)]
    print("Total batch num: ", len(batch_list))
    with Pool(25) as pool:
        # pool.map(update_batch, batch_list, chunksize=200)
        for idx, _ in enumerate(pool.imap_unordered(update_batch, batch_list)):
            if idx % 100 == 0:
                print(idx)
3. concurrent.futures
The module provides two classes, ThreadPoolExecutor and ProcessPoolExecutor. They are high-level abstractions over threading and multiprocessing that expose a unified interface, making it very convenient to implement asynchronous calls.
Under the hood, concurrent.futures still uses threading and multiprocessing; it is another layer wrapped on top of them with a redesigned architecture, so it can be a little slower. The architecture is more complex, but the interface is simpler.
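A minimal sketch of that unified interface (not from the original post; work is a stand-in function): the submit / as_completed pattern below is the same for both executors, so switching between threads and processes is a one-line change.

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed

def work(n):
    # any picklable top-level callable; kept trivial here
    return n * n

if __name__ == '__main__':
    # swap in ProcessPoolExecutor(max_workers=4) here for CPU-bound work
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(work, n): n for n in range(10)}
        for fut in as_completed(futures):
            print(futures[fut], '->', fut.result())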
# A simple way to do multiprocessing in Python:
from multiprocessing import Pool

def calculate(number):
    return number

if __name__ == '__main__':
    pool = Pool()
    result = pool.map(calculate, range(4))
# An alternative, futures-based implementation:
from concurrent.futures import ProcessPoolExecutor

def calculate(number):
    return number

with ProcessPoolExecutor() as executor:
    result = executor.map(calculate, range(4))
Python 3.5 added a chunksize argument to ProcessPoolExecutor.map. It addresses the problem that ProcessPoolExecutor always passed items from the iterable to the children one at a time, which increased the IPC overhead and badly degraded performance on large iterables.
multiprocessing.Pool.map outperforms ProcessPoolExecutor.map. Note that the performance difference is very small per work item,
so you'll probably only notice a large performance difference if you're using map on a very large iterable. The reason for the
performance difference is that multiprocessing.Pool will batch the iterable passed to map into chunks, and then pass the chunks
to the worker processes, which reduces the overhead of IPC between the parent and children. ProcessPoolExecutor always passes
one item from the iterable at a time to the children, which can lead to much slower performance with large iterables, due to the
increased IPC overhead. The good news is this issue will be fixed in Python 3.5, as a chunksize keyword argument has been added
to ProcessPoolExecutor.map, which can be used to specify a larger chunk size if you know you're dealing with large iterables.
See this bug (http://bugs.python.org/issue11271) for more info.
# coding=utf-8
import time
from multiprocessing.pool import Pool
from concurrent.futures import as_completed, ProcessPoolExecutor

# naive recursive fib is deliberately CPU-heavy; keep the range modest or the run never finishes
NUMBERS = range(1, 33)

def fib(n):
    if n <= 2:
        return 1
    return fib(n-1) + fib(n-2)

print('multiprocessing.pool.Pool: ')
start = time.time()
l = []
pool = Pool(5)
for num, result in zip(NUMBERS, pool.map(fib, NUMBERS)):
    l.append(result)
print(len(l))
print('COST: {}'.format(time.time() - start))

print('ProcessPoolExecutor without chunksize: ')
start = time.time()
l = []
with ProcessPoolExecutor(max_workers=5) as executor:
    for num, result in zip(NUMBERS, executor.map(fib, NUMBERS)):
        l.append(result)
print(len(l))
print('COST: {}'.format(time.time() - start))

print('ProcessPoolExecutor with chunksize: ')
start = time.time()
l = []
with ProcessPoolExecutor(max_workers=5) as executor:
    # match multiprocessing.Pool's default chunksize heuristic
    chunksize, extra = divmod(len(NUMBERS), executor._max_workers * 4)
    if extra:
        chunksize += 1
    for num, result in zip(NUMBERS, executor.map(fib, NUMBERS, chunksize=chunksize)):
        l.append(result)
print(len(l))
print('COST: {}'.format(time.time() - start))