一、进程
1. 进程间数据不共享,示例如下:
import multiprocessing
import threading

# Module-level list. Each worker process that touches it ends up with its
# own separate copy, so appends made in one process are invisible to others.
data_list = []


def task(arg):
    """Append ``arg`` to the module-level ``data_list`` and print the list."""
    data_list.append(arg)
    print(data_list)


def run():
    """Start ten worker processes, each calling ``task`` with its index."""
    for index in range(10):
        # For comparison with threads, swap in:
        #     threading.Thread(target=task, args=(index,))
        worker = multiprocessing.Process(target=task, args=(index,))
        worker.start()


if __name__ == '__main__':
    run()
2. 常用功能
import multiprocessing
import time


def task(arg):
    """Sleep for two seconds, then print ``arg``."""
    time.sleep(2)
    print(arg)


def run():
    """Run two worker processes one after another, waiting on each via join()."""
    print("111111")
    first = multiprocessing.Process(target=task, args=(1,))
    first.start()
    first.join(timeout=9)  # wait for the process to finish, at most 9 seconds

    print("222222")
    second = multiprocessing.Process(target=task, args=(2,))
    second.start()
    second.join()  # no timeout: block until the process has finished

    print("333333")


if __name__ == '__main__':
    run()
import multiprocessing
import time


def task(arg):
    """Sleep for two seconds, then print ``arg``."""
    time.sleep(2)
    print(arg)


def run():
    """Start one non-daemon and one daemon process to contrast the two."""
    print("111111")
    p1 = multiprocessing.Process(target=task, args=(1,))
    # Non-daemon (the default): the main process waits for p1 to finish.
    p1.daemon = False
    p1.start()

    print("222222")
    p2 = multiprocessing.Process(target=task, args=(2,))
    # Daemon: the main process does not wait for p2 to finish.
    # (The original spelled this ``Ture``, which raised NameError.)
    p2.daemon = True
    p2.start()

    print("333333")


if __name__ == '__main__':
    run()
import multiprocessing
import time


def task(arg):
    """Sleep for two seconds, then print this process's name and id."""
    time.sleep(2)
    current = multiprocessing.current_process()  # the current process object
    # ``ident`` and ``pid`` are both read here as the process id.
    print(current.name, current.ident, current.pid)


def run():
    """Start two processes that have been given explicit names."""
    print('111111111')
    first = multiprocessing.Process(target=task, args=(1,))
    first.name = 'pp1'  # name this process "pp1"
    first.start()

    print('222222222')
    second = multiprocessing.Process(target=task, args=(2,))
    second.name = 'pp2'  # name this process "pp2"
    second.start()

    print('333333333')


if __name__ == '__main__':
    run()
3. 继承(面向对象)方式创建进程
# The original snippet used ``multiprocessing`` without importing it.
import multiprocessing


class MyProcess(multiprocessing.Process):
    """Process subclass whose work is defined by overriding ``run``."""

    def run(self):
        """Print the current process object."""
        print("当前进程", multiprocessing.current_process())


def run():
    """Create and start two ``MyProcess`` instances."""
    p1 = MyProcess()
    p1.start()

    p2 = MyProcess()
    p2.start()


if __name__ == '__main__':
    run()
二、数据共享(内存级别)
1. Queue
import multiprocessing

q = multiprocessing.Queue()


def task(arg, q):
    """Put ``arg`` on the queue ``q``."""
    q.put(arg)


def run():
    """Start ten producer processes and print the ten values they enqueue."""
    worker_count = 10
    workers = []
    for i in range(worker_count):
        p = multiprocessing.Process(target=task, args=(i, q))
        p.start()
        workers.append(p)

    # Each worker puts exactly one item, so read exactly that many.
    # An unbounded ``while True: q.get()`` blocks forever after the last item.
    for _ in range(worker_count):
        print(q.get())

    for p in workers:
        p.join()


# Guard the entry point: under the "spawn" start method every child
# re-imports this module, and an unguarded run() would start processes again.
if __name__ == '__main__':
    run()
import multiprocessing


def task(arg, q):
    """Put ``arg`` on the queue ``q``."""
    q.put(arg)


if __name__ == '__main__':
    worker_count = 10
    q = multiprocessing.Queue()

    workers = []
    for i in range(worker_count):
        p = multiprocessing.Process(target=task, args=(i, q))
        p.start()
        workers.append(p)

    # Each worker puts exactly one item, so read exactly that many.
    # An unbounded ``while True: q.get()`` blocks forever after the last item.
    for _ in range(worker_count):
        print(q.get())

    for p in workers:
        p.join()
2. Manager
import multiprocessing


def task(arg, dic):
    """Store 100 under key ``arg`` in the shared mapping ``dic``."""
    dic[arg] = 100


def run():
    """Fill a manager-backed dict from ten processes and print its values."""
    # The manager starts a helper process, so it is created here (reached
    # only through the __main__ guard) instead of at import time, where
    # every spawned child would try to start one as well.
    with multiprocessing.Manager() as m:
        dic = m.dict()

        workers = [
            multiprocessing.Process(target=task, args=(i, dic))
            for i in range(10)
        ]
        for p in workers:
            p.start()

        # Wait for every worker instead of relying on the user pressing
        # Enter late enough for the writes to have happened.
        for p in workers:
            p.join()

        print(dic.values())


if __name__ == '__main__':
    run()
import multiprocessing
import time


def task(arg, dic):
    """Sleep for two seconds, then store 100 under key ``arg`` in ``dic``."""
    time.sleep(2)
    dic[arg] = 100


if __name__ == '__main__':
    m = multiprocessing.Manager()
    dic = m.dict()

    process_list = []
    for i in range(10):
        p = multiprocessing.Process(target=task, args=(i, dic))
        p.start()
        process_list.append(p)

    # join() blocks until each worker has exited. The original polled
    # is_alive() in a tight ``while True`` loop, spinning a CPU core the
    # whole time it waited.
    for p in process_list:
        p.join()

    print(dic)
三、进程锁
进程锁和线程锁的种类和用法一样,具体使用参考线程锁,下面是进程锁RLock的示例:
import time
import multiprocessing

# Other synchronisation primitives are used the same way as their thread
# counterparts: Lock(), BoundedSemaphore(), Condition(), Event().


def task(arg, lock):
    """Print a banner, then sleep and print ``arg`` while holding ``lock``."""
    print('鬼子来了')
    # ``with`` releases the lock even if the body raises.
    with lock:
        time.sleep(2)
        print(arg)


if __name__ == '__main__':
    # Create the lock once and pass it to the children. A module-level lock
    # is re-created when a spawned child re-imports this module, so the
    # children would not actually share it.
    lock = multiprocessing.RLock()

    p1 = multiprocessing.Process(target=task, args=(1, lock))
    p1.start()

    p2 = multiprocessing.Process(target=task, args=(2, lock))
    p2.start()
问题:为什么要加进程锁?
进程本身是安全的,因为进程间的数据是隔离的,是不共享的,所以一般不用加锁。
但是当进程之间实现了数据共享(例如通过 Queue、Manager),可以相互通信后,进程就不再是安全的,这时候需要为一段代码加上锁来实现进程安全,即保证同一时刻只有一个进程操作共享数据;
四、进程池
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


def task(arg):
    """Sleep for two seconds, then print ``arg``."""
    time.sleep(2)
    print(arg)


if __name__ == '__main__':
    # Using the executor as a context manager waits for all submitted tasks
    # and shuts the pool down when the block exits.
    with ProcessPoolExecutor(5) as pool:
        for i in range(10):
            pool.submit(task, i)
五、初识爬虫(requests模块和bs4(beautifulsoup)模块)
1. 安装
pip3 install requests
pip3 install beautifulsoup4
2. 安装出现的问题
找不到内部指令?
方式一:
D:\python3.70\Scripts\pip3 install requests
方式二:
把D:\python3.70\Scripts添加到系统的环境变量
方式三(前两种都不行再用这一种):
在pycharm中的Settings中的Project中的interpreter中安装(不推荐--只能在windows中用)
3. 示例代码
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

# requests simulates a browser sending a request. Internally it roughly does:
#     sk = socket.socket()
#     sk.connect(...)      # open a socket connection to chouti
#     sk.sendall('...')
#     sk.recv(...)


def task(url):
    """Download one listing page and print each item's title and link.

    Args:
        url: Address of the listing page to fetch.
    """
    print(url)
    r1 = requests.get(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
        },
        # Without a timeout a stalled server would block this worker forever.
        timeout=10,
    )

    # Parse the downloaded text.
    soup = BeautifulSoup(r1.text, 'html.parser')

    content_list = soup.find('div', attrs={'id': 'content-list'})
    if content_list is None:
        # find() returns None when the container is missing. Report it here,
        # because an AttributeError raised inside a pool worker would only be
        # stored on the (ignored) future and never shown.
        print('no content-list found:', url)
        return

    for item in content_list.find_all('div', attrs={'class': 'item'}):
        link = item.find('a')
        if link is None:
            continue  # item without a link: nothing to print
        title = link.text.strip()
        target_url = link.get('href')
        print(title, target_url)


def run():
    """Fetch listing pages 1 through 49 using a pool of five threads."""
    # The context manager waits for all tasks and shuts the pool down.
    with ThreadPoolExecutor(5) as pool:
        for i in range(1, 50):
            pool.submit(task, 'https://dig.chouti.com/all/hot/recent/%s' % i)


if __name__ == '__main__':
    run()
4. 分析
a. 以上示例 进程和线程哪个好?
线程好,因为socket属于IO请求,不占用CPU,所以用多线程既节省资源又提高效率;
b. requests模块模拟浏览器发送请求
requests.get() 的本质:
- 创建socket客户端
- 连接 [阻塞]
- 发送请求
- 接收响应 [阻塞]
- 断开连接
c. 线程池和进程池