bug-bug-bug

#-*-coding:utf-8-*-
import json
import re
import threading
import urllib
import urllib2
from HTMLParser import HTMLParser
from Queue import Empty
from Queue import Queue
from itertools import product
from time import sleep,ctime

import lxml
import requests
from bs4 import BeautifulSoup
from lxml import etree

class Get_Html_Pthread(threading.Thread):
    """Worker thread that downloads listing pages.

    Page numbers are taken from ``que``; the text of every page that is
    fetched successfully is put on the module-level ``data_que``.
    """

    # Attempts made for one page before it is reported as failed.
    MAX_TRIES = 4
    # Seconds to wait on the server; without a timeout a stalled connection
    # would block the worker (and everything joining it) indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self,threadid,que):
        threading.Thread.__init__(self)
        self.threadid = threadid
        self.que = que

    def run(self):
        self.gethtml()

    def gethtml(self):
        """Fetch pages until the page queue is exhausted."""
        headers = {
            # The header name is 'User-Agent'; the former 'User_agent' key was
            # sent as an unknown header, leaving the default agent in place.
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            # 'br' is not advertised: the response body is only guaranteed to
            # be decoded automatically for gzip and deflate.
            'Accept-Encoding': 'gzip, deflate',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            # No hard-coded 'If-None-Match': a stale validator can be answered
            # with "304 Not Modified" and an empty body.
        }
        while True:
            # A non-blocking get() replaces the empty()/get() pair: another
            # worker could take the last page between the two calls and the
            # blocking get() would then wait forever.
            try:
                page = self.que.get(False)
            except Empty:
                break
            print('qiushibaike spider No'+ str(self.threadid) + 'page = '+ str(page))
            url = 'https://www.qiushibaike.com/hot/page/'+str(page)+ '/'
            print(url)
            for _ in range(self.MAX_TRIES):
                try:
                    content = requests.get(url, headers=headers,
                                           timeout=self.REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    print('qiushi_spider %s' % (e,))
                    continue
                data_que.put(content.text)
                break
            else:
                # Reached only when no attempt succeeded (the loop never hit
                # `break`); the old check reported the opposite case.
                print('timeout:' + url)


class Get_Message_Pthread(threading.Thread):
    """Worker thread that parses downloaded pages and stores the posts.

    HTML strings are taken from ``que``; every post found is written to the
    file object ``f`` as one JSON line. ``lock`` serialises the file writes
    and the updates of the module-level ``total`` counter.
    """

    # Seconds to block on the queue before re-checking the exit flag.
    POLL_INTERVAL = 0.5

    def __init__(self,threadid,que,lock,f):
        threading.Thread.__init__(self)
        self.threadid = threadid
        self.lock = lock
        self.que = que
        self.f = f

    def run(self):
        """Process queued pages until the module-level exitFlag_Parser is set."""
        while not exitFlag_Parser:
            # A blocking get() with a timeout keeps the thread idle while the
            # queue is empty instead of spinning, yet still lets it notice
            # the exit flag promptly.
            try:
                html = self.que.get(True, self.POLL_INTERVAL)
            except Empty:
                continue
            try:
                self.getmessage(html)
            finally:
                # Always balance the get() so Queue.join() cannot hang.
                self.que.task_done()

    def getmessage(self,html1):
        """Extract author/phrase pairs from one page and append them to the file.

        Errors are reported and swallowed so that one bad page does not stop
        the worker; ``total`` is incremented once per call either way.
        """
        global total
        try:
            html = etree.HTML(html1)
            result = html.xpath('//div[contains(@id,"qiushi_tag")]')
            for each in result:
                try:
                    comment_res = each.xpath('.//span')[0].text
                    name = each.xpath('.//h2')[0].text
                except IndexError:
                    # A post lacking a <span> or <h2> is skipped rather than
                    # discarding the rest of the page.
                    continue
                resultq = {
                    'author':name,
                    'phrase':comment_res,
                }
                print(resultq)
                line = json.dumps(resultq, ensure_ascii=False).encode('utf-8') + "\n"
                with self.lock:
                    self.f.write(line)

        except Exception as e:
            print('paeser_data %s' % (e,))

        with self.lock:
            total += 1

# Shared state used by the worker threads and by main().
# HTML text of fetched pages: filled by Get_Html_Pthread.gethtml and handed
# to the Get_Message_Pthread workers in main().
data_que = Queue()
# Passed to every Get_Message_Pthread; held while writing to the output file
# and while updating `total`.
lock = threading.Lock()
# Get_Message_Pthread.run keeps looping for as long as this is False.
exitFlag_Parser = False
# Incremented once per Get_Message_Pthread.getmessage call.
total = 0
def main():
    """Fetch pages 1-10 with five downloader threads and parse them with five
    parser threads, appending the results to 'Phrase.json'.

    Shutdown order matters: the downloaders are joined first, then the data
    queue is drained, then the parsers are told to stop and are joined, and
    only then is the output file closed.
    """
    global exitFlag_Parser
    # Reset in case main() is called again within the same process.
    exitFlag_Parser = False

    pageque = Queue(60)
    for page in range(1,11):
        pageque.put(page)

    gethtmlpthread = []
    getmessagepthread = []
    with open('Phrase.json', 'a') as output:
        try:
            for threadid in range(5):
                thread = Get_Html_Pthread(threadid,pageque)
                thread.start()
                gethtmlpthread.append(thread)

            for threadid in range(5):
                thread = Get_Message_Pthread(threadid,data_que,lock,output)
                thread.start()
                getmessagepthread.append(thread)

            # Once every downloader has returned, nothing more will be put
            # on data_que.
            for t in gethtmlpthread:
                t.join()

            # join() returns only after task_done() has been called for every
            # queued page, i.e. after parsing has finished; polling empty()
            # would return while pages are still being processed.
            data_que.join()
        finally:
            # The parsers loop until this flag is set; without it they never
            # return and the process cannot exit.
            exitFlag_Parser = True
            for t in getmessagepthread:
                t.join()
            # The file is closed by the `with` block only after all writers
            # have stopped.

if __name__ == '__main__':
    # `total` is a module-level name, so it is read directly here; a
    # `global` statement has no effect outside a function.
    main()
    print('total' + str(total))

相关阅读:
使用SQL Server Management Studio 创建数据库备份作业
 ClickOnce 获取客户端发布版本号
 在C#用HttpWebRequest中发送GET/HTTP/HTTPS请求
 找不到方法:“Void System.Data.Objects.ObjectContextOptions.set_UseConsistentNullReferenceBehavior(Boolean)”
常用操作类
 数据库命名规范
 expression动态构成
 C# 获得当前方法和方法调用链的方法
 EF架构封装类
 基于微软企业库的分层代码框架
原文地址：https://www.cnblogs.com/chenyang920/p/7663024.html