# Libraries used:
# - time, requests, lxml, queue, threading
# Purpose: multithreaded spider for www.budejie.com
import time
import requests
from lxml import etree
from queue import Queue
import threading
class bsSpider:
    """Multithreaded spider for www.budejie.com.

    Fetcher threads pull page URLs off ``urlQueue``, download the HTML and
    push it onto ``resQueue``; parser threads pull HTML off ``resQueue``,
    extract the post text via XPath and print it.
    """

    def __init__(self):
        self.baseUrl = "http://www.budejie.com/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}
        # Queue of page URLs waiting to be fetched
        self.urlQueue = Queue()
        # Queue of downloaded HTML pages waiting to be parsed
        self.resQueue = Queue()

    def getUrl(self):
        """Fill the URL queue with pages 1 through 20."""
        for pNumber in range(1, 21):
            url = self.baseUrl + str(pNumber)
            self.urlQueue.put(url)

    def getHtml(self):
        """Fetcher worker: take a URL, download it, queue the HTML for parsing.

        Runs forever; intended to be started as a daemon thread.
        """
        while True:
            # 1. Take the next URL off the queue (blocks until one is available)
            url = self.urlQueue.get()
            # 2. Download it; timeout so a stalled connection cannot hang
            #    this worker (and therefore urlQueue.join()) forever
            res = requests.get(url, headers=self.headers, timeout=10)
            res.encoding = "utf-8"
            html = res.text
            self.resQueue.put(html)
            # Mark this URL as fully processed so urlQueue.join() can return
            self.urlQueue.task_done()

    def getText(self):
        """Parser worker: take HTML off the queue, extract and print post text.

        Runs forever; intended to be started as a daemon thread.
        """
        while True:
            html = self.resQueue.get()
            parseHtml = etree.HTML(html)
            r_list = parseHtml.xpath('//div[@class="j-r-list-c-desc"]/a/text()')
            for r in r_list:
                # Bug fix: the original had a string literal broken across two
                # source lines (a syntax error); the intent was a trailing
                # newline after each post.
                print(r + "\n")
            self.resQueue.task_done()

    def run(self):
        """Start fetcher and parser threads, then wait for both queues to drain."""
        # List to hold all worker threads
        thList = []
        # Populate the URL queue up front
        self.getUrl()
        # Create the fetcher threads
        for i in range(10):
            thRes = threading.Thread(target=self.getHtml)
            thList.append(thRes)
        # Create the parser threads
        for i in range(3):
            thParse = threading.Thread(target=self.getText)
            thList.append(thParse)
        # Start all workers; daemon=True because the workers loop forever, so
        # the process must be allowed to exit once the queues are drained.
        for th in thList:
            th.daemon = True  # setDaemon(True) is deprecated since Python 3.10
            th.start()
        # Block until every queued item has been fetched and parsed
        self.urlQueue.join()
        self.resQueue.join()
if __name__ == '__main__':
    # Run the crawl and report how many seconds it took.
    started = time.time()
    crawler = bsSpider()
    crawler.run()
    elapsed = time.time() - started
    print(elapsed)