1、线程之间相互独立,使用多线程缩短执行时间,下面是简单爬虫实例:自动抓取网站内容(threading模块)
import threading,time,requests
def downHtml(url, name):
    """Download ``url`` and save the raw response body to ``<name>.html``.

    Args:
        url: Address passed to ``requests.get``.
        name: Output file name without the ``.html`` suffix.
    """
    content = requests.get(url).content
    # The context manager closes the file even if the write raises.
    with open(name + '.html', 'wb') as f:
        f.write(content)


# [name, url] pairs: the name becomes the output file name.
urls = [
    ['baidu', 'http://www.baidu.com'],
    ['sogou', 'http://www.sogou.com'],
    ['xinliang', 'http://www.sina.com'],
]

# Sequential version, kept for timing comparison:
# start_time = time.time()
# for url in urls:
#     downHtml(url[1], url[0])
# end_time = time.time()
# print(end_time - start_time)

# Multithreaded version.
threads = []  # every started thread, so they can be joined below
start_time = time.time()
for url in urls:  # one thread per site
    t = threading.Thread(target=downHtml, args=(url[1], url[0]))
    t.start()
    threads.append(t)

# Wait for each download thread in turn; the main thread only continues
# once all of them have finished.
for t in threads:
    t.join()

end_time = time.time()
print(end_time - start_time)
2、setDaemon(True)把该线程对象设置为守护线程(Python 3.10 起 setDaemon 已弃用,推荐写成 t.daemon = True 或 Thread(..., daemon=True)),一旦主线程结束,守护线程立刻结束,不管是否执行完
def pz():
    """Sleep for two seconds, then print a marker line."""
    time.sleep(2)
    print('守护线程打印')


threads = []
for i in range(50):
    # daemon=True replaces the deprecated setDaemon(True). Daemon threads are
    # abandoned when the main thread exits, so only the main thread's line is
    # printed and pz's line is not.
    t = threading.Thread(target=pz, daemon=True)
    t.start()
    threads.append(t)

# Joining here would make the main thread wait for every child, which
# defeats the purpose of marking them as daemon threads.
# for t in threads:
#     t.join()

print('主线程打印')
3、多线程执行的函数要想获取结果,不能用return,可以写到list里面
import threading

# Shared list that the worker threads write their results into; a thread
# target's return value is discarded, so results are collected here instead.
res = []


def lida(x, y):
    """Append ``x + y`` to the module-level ``res`` list."""
    res.append(x + y)


workers = []
for i in range(5):
    t = threading.Thread(target=lida, args=(i, i))
    t.start()
    workers.append(t)

# Wait for every worker before reading res; without the join the print below
# could run while some threads have not appended their result yet.
for t in workers:
    t.join()

print(res)
4、from threading import Lock线程锁,加锁是因为多个线程同时修改同一个数据的时候,有可能导致数据不正确。注意:python3 并不会自动加锁,GIL 不能保证 num += 1 这类"读取-修改-写回"操作是原子的,修改共享数据时仍然应该显式加锁
import threading
from threading import Lock

num = 0
lock = Lock()  # one lock guarding every update of num


def run():
    """Increment the module-level counter ``num`` by one while holding ``lock``."""
    global num
    # The with-statement acquires the lock and always releases it on exit,
    # even if the guarded statement raises.
    with lock:
        num += 1


lis = []
for i in range(5):
    t = threading.Thread(target=run)
    t.start()
    lis.append(t)

for t in lis:
    t.join()

print('over', num)  # prints: over 5
5、多线程,是不能利用多核CPU的,如果想利用多核CPU的话,就得使用多进程,multiprocessing
import multiprocessing
import threading
import time


def run2():
    """Thread body: sleep two seconds, then print a marker line."""
    time.sleep(2)
    print('这个是多线程启动的')


def run():
    """Process body: sleep two seconds, then start five threads running ``run2``."""
    time.sleep(2)
    for i in range(5):
        t = threading.Thread(target=run2)
        t.start()


if __name__ == '__main__':
    processes = []
    for i in range(5):
        # Each process runs run(), which in turn starts the run2 threads.
        # NOTE(review): the original passed target=run2 and left run() unused;
        # run2's message says it is started by a thread, so run is assumed to
        # be the intended target. Confirm.
        p = multiprocessing.Process(target=run)
        p.start()
        processes.append(p)

    # Wait for all child processes to finish.
    for p in processes:
        p.join()
6、线程池(threadpool模块):存放线程的池子,线程可以重复使用,花费时间更少,效率更高
import threadpool
import time


def say(num):
    """Print a greeting for ``num``, then sleep for two seconds."""
    print("Hello ",num)
    time.sleep(2)


# One argument per work request; each item is passed to say() as its single
# parameter.
res = list(range(101))

# Create a thread pool; 10 is the number of worker threads in it.
pool = threadpool.ThreadPool(10)

# Build the work requests from the callable and the list of arguments.
reqs = threadpool.makeRequests(say, res)

# Hand every request to the pool; this is the point where work is scheduled.
for request in reqs:
    pool.putRequest(request)

# Wait until the pool has finished all submitted requests.
pool.wait()
7、自己封装的线程池
import threadpool


class MyPool(object):
    """Small wrapper that runs one function over a list of inputs in a thread pool.

    Args:
        func: Callable executed once per item of ``data``.
        size: Number of worker threads passed to ``threadpool.ThreadPool``.
        data: Iterable of arguments, one per call; ``None`` means no work.
    """

    def __init__(self, func, size=20, data=None):
        self.func = func
        self.size = size
        self.data = data

    def pool(self):
        """Run ``func`` for every item in ``data`` and wait for completion."""
        # With the default data=None there is nothing to run; passing None on
        # to makeRequests would fail, so return early instead.
        if self.data is None:
            return
        pool = threadpool.ThreadPool(self.size)
        # Build the requests: one per item in data.
        reqs = threadpool.makeRequests(self.func, self.data)
        # Plain loop rather than a list comprehension: putRequest is called
        # for its side effect only.
        for req in reqs:
            pool.putRequest(req)
        pool.wait()  # block until every request has been processed


def down(num):
    """Print ``num``; sample task for the pool."""
    print(num)


my = MyPool(func=down, data=[1, 2, 3, 4, 5, 6, 7])
my.pool()