#-*-coding:utf-8-*- import urllib import urllib2 import re import json import threading import requests from lxml import etree from time import sleep,ctime from Queue import Queue import lxml from bs4 import BeautifulSoup from HTMLParser import HTMLParser from itertools import product class Get_Html_Pthread(threading.Thread): def __init__(self,threadid,que): threading.Thread.__init__(self) self.threadid = threadid self.que = que def run(self): self.gethtml() def gethtml(self): while True: if self.que.empty(): break else: page = self.que.get() print 'qiushibaike spider No'+ str(self.threadid) + 'page = '+ str(page) url = 'https://www.qiushibaike.com/hot/page/'+str(page)+ '/' print url headers = { 'User_agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'If-None-Match': '56abab9faecd14ce1ba95132d661a82db1466c94'} num_try = 4 while num_try > 0: num_try -= 1 try: content = requests.get(url, headers=headers) data_que.put(content.text) break except Exception, e: print 'qiushi_spider', e if num_try > 0: print 'timeout:' + url class Get_Message_Pthread(threading.Thread): def __init__(self,threadid,que,lock,f): threading.Thread.__init__(self) self.threadid = threadid self.lock = lock self.que = que self.f = f def run(self): global total,exitFlag_Parser while exitFlag_Parser == False: try: html = self.que.get(False) if not html: pass self.getmessage(html) self.que.task_done() except: pass def getmessage(self,html1): global total try: html = etree.HTML(html1) result = html.xpath('//div[contains(@id,"qiushi_tag")]') for each in result: comment_res = each.xpath('.//span')[0].text name = each.xpath('.//h2')[0].text resultq = { 'author':name, 'phrase':comment_res, } print resultq with self.lock: self.f.write(json.dumps(resultq, ensure_ascii=False).encode('utf-8') + " ") except Exception,e: print 'paeser_data',e with self.lock: total += 1 data_que = Queue() lock = threading.Lock() exitFlag_Parser = False total = 0 def main(): output = open('Phrase.json', 'a') pageque = Queue(60) for page in range(1,11): pageque.put(page) gethtmlpthread = [] List = [0,1,2,3,4,5] for threadid in range(5): thread = Get_Html_Pthread(threadid,pageque) thread.start() gethtmlpthread.append(thread) getmessagepthread = [] for threadid in range(5): thread = Get_Message_Pthread(threadid,data_que,lock,output) thread.start() getmessagepthread.append(thread) while not pageque.empty(): pass for t in gethtmlpthread: t.join() while not data_que.empty(): pass for t in gethtmlpthread: t.join() with lock: output.close() if __name__ == '__main__': global total main() print 'total'+ str(total)