python爬虫多线程编程

#使用了线程库
import threading
from queue import  Queue
from bs4 import BeautifulSoup
import  json
import requests
class ThreadCrawl(threading.Thread):
    """Worker thread that downloads listing pages and queues the responses.

    Each worker repeatedly takes a page number from ``pageQueue``, requests
    the corresponding URL and puts the resulting ``requests`` response object
    on ``dataQueue``. The loop keeps running until the module-level
    ``CRAWL_EXIT`` flag becomes true.

    Args:
        threadNmae: Label printed when the worker starts and stops
            (spelling kept as-is because callers use this name).
        pageQueue: Queue of page numbers still to be fetched.
        dataQueue: Queue that receives one response object per fetched page.
    """

    def __init__(self,threadNmae,pageQueue,dataQueue):
        super().__init__()
        self.threadNmae = threadNmae
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}

    def run(self):
        """Fetch pages until ``CRAWL_EXIT`` is set, then return."""
        # Local import: the file header only imports Queue from this module.
        from queue import Empty

        print("启动"+self.threadNmae)
        while not CRAWL_EXIT:
            try:
                # Block briefly instead of spinning: with a timeout the call
                # sleeps while the queue is empty and raises Empty afterwards,
                # which lets the loop re-check the exit flag.
                page = self.pageQueue.get(timeout=0.5)
            except Empty:
                continue

            url = "https://www.qiushibaike.com/8hr/page/"+str(page)+"/"
            try:
                # A timeout keeps one stalled connection from hanging the
                # worker (and therefore the final join) forever.
                content = requests.get(url, headers=self.headers, timeout=10)
            except requests.RequestException as exc:
                # Report the failure instead of silently dropping the page.
                print(self.threadNmae+" 抓取失败: "+url+" ("+str(exc)+")")
                continue
            self.dataQueue.put(content)
        print("结束"+self.threadNmae)
# Shutdown flag polled by ThreadCrawl.run(); main() sets it to True once the
# page queue has been drained.
CRAWL_EXIT=False
# Not read anywhere in the visible code; presumably the equivalent flag for a
# parsing thread — TODO confirm.
PARSE_EXIT=False
def main():
    """Queue pages 1-10, crawl them with three worker threads, then shut down.

    The page queue is filled completely before any worker starts. Once the
    queue has been drained, the module-level ``CRAWL_EXIT`` flag is set so
    the workers leave their loops, and every worker is joined.
    """
    global CRAWL_EXIT
    # Local import: only needed for the polling delay below.
    import time

    # Page queue with room for the 10 page numbers (FIFO).
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Unbounded queue that collects the downloaded responses.
    dataQueue = Queue()
    # Display names of the three crawl workers.
    crawList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    # Started worker threads, kept so they can be joined later.
    threadcrawl = []
    for threadNmae in crawList:
        thread = ThreadCrawl(threadNmae, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Wait until every page number has been taken; sleep between checks so
    # the main thread does not burn CPU while the workers download.
    while not pageQueue.empty():
        time.sleep(0.1)

    # Tell the workers to stop after their current request.
    CRAWL_EXIT = True
    print("Queue为空")
    for thread in threadcrawl:
        thread.join()
        print("joining...............")
# Start the crawl only when this file is executed as a script.
if __name__=="__main__":
    main()

相关阅读:
【Statistics】均值
【Datastage】在win10安装Datastage 8.7
【Linux】行首、行尾添加字符串
【Linux】替换文本中的字符
【Python 数据分析】通过gensim进行文本相似度分析
【Python 数据分析】jieba文本挖掘
异或运算法则
关于计算机中的《补码》，公式：-n=~n+1 引伸：~n=-n-1
Base64编码解码
位运算之——按位与（&）操作——（快速取模算法）

原文地址：https://www.cnblogs.com/c-x-a/p/8027281.html