python 多线程抓取动态数据

网上关于利用多线程抓取动态数据的教程不少，但大多过于繁杂——难道就不能精简再精简吗？！

不多解释，直接上代码（基于 Python 2，依赖 selenium + Chrome、BeautifulSoup 和 lxml），基本上还是很好懂的。

#!/usr/bin/env python
# coding=utf-8

import urllib2
import re,sys
from bs4 import BeautifulSoup
from selenium import webdriver
import threading
import time
reload(sys)
sys.setdefaultencoding("utf-8")

# Work list shared by every spider thread: seeded with a few Baike entry
# pages; workers pop URLs from the front and append newly found links.
queue = [
    "http://baike.baidu.com/view/8332.htm",
    "http://baike.baidu.com/view/145819.htm",
    "http://baike.baidu.com/view/643415.htm",
    "http://baike.baidu.com/view/157424.htm",
    "http://baike.baidu.com/view/149759.htm",
]

# Deduplication sets consulted by the spider threads (URLs and entry words).
crawled_url, crawled_word = set(), set()

# Shared counter of recorded pages, incremented by the spider threads.
cnt = 0

class BaikeSpider(threading.Thread):
    """Worker thread that crawls Baidu Baike entries through a real browser.

    Every worker owns one Chrome instance and one output file named
    ``baike_words_<name>.txt``.  All workers share the module-level
    ``queue`` (pending URLs), ``crawled_url`` / ``crawled_word``
    (deduplication sets) and ``cnt`` (number of recorded pages).

    For each page the worker writes one tab-separated line:
    ``word<TAB>view count<TAB>vote count<TAB>url``.

    A worker stops as soon as it finds the shared queue empty, and then
    closes its output file and quits its browser.
    """

    # Links worth following: Baike entry pages, i.e. /view/<id> or
    # /subview/<id>[/<id>].  Raw string so that ``\d`` really means "digit".
    _ENTRY_URL_RE = re.compile(
        r"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+(/\d+)?")

    # Hrefs shorter than this are discarded without further inspection.
    _MIN_HREF_LEN = 8

    # Serialises check-and-update of the shared dedup sets and the counter.
    _state_lock = threading.Lock()

    def __init__(self, name):
        """Create the worker, its browser and its output file.

        Args:
            name: Identifier of the worker; converted to ``str`` and used
                both as the thread name and in the output file name.
        """
        threading.Thread.__init__(self)
        self.name = str(name)

        self.browser = webdriver.Chrome()

        # Each worker writes the data it scrapes to its own file.
        self.fw = open("baike_words_" + self.name + ".txt", "wb")

    @classmethod
    def _is_entry_link(cls, href):
        """Return True if *href* looks like a link to a Baike entry page."""
        if len(href) < cls._MIN_HREF_LEN or "javascript" in href:
            return False
        return cls._ENTRY_URL_RE.search(href) is not None

    def _crawl_page(self, url):
        """Scrape one entry page, record it and enqueue the links it contains.

        Args:
            url: Address of the Baike entry page to process.

        Returns:
            True if a line was written for this page, False if the page's
            word or URL had already been recorded.

        Raises:
            Exception: Whatever the browser, the HTTP fetch or the parser
                raise (for example when an expected element is missing).
        """
        self.browser.get(url)
        # Sleep 0.5s to give the dynamically loaded data time to arrive.
        time.sleep(0.5)

        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            response.close()
        links = BeautifulSoup(html, "lxml").find_all("a")

        vote = self.browser.find_element_by_class_name("vote-count").text
        view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text
        word = self.browser.title.split(u"_")[0]

        # Check and mark under one lock so two workers cannot both record
        # the same word or URL.
        with self._state_lock:
            if word in crawled_word or url in crawled_url:
                return False
            crawled_word.add(word)
            crawled_url.add(url)

        for link in links:
            href = link.get("href")
            if href and self._is_entry_link(href) and href not in crawled_url:
                queue.append(href)

        linedata = "\t".join((word, view, vote, url)) + "\n"
        # The file is opened in binary mode, so encode explicitly.
        self.fw.write(linedata.encode("utf-8"))
        return True

    def _release_resources(self):
        """Close the output file and quit the browser (safe to call twice)."""
        self.fw.close()
        browser, self.browser = self.browser, None
        if browser is not None:
            browser.quit()

    def run(self):
        """Process URLs from the shared queue until it is empty."""
        global cnt

        try:
            while True:
                # pop() is atomic; a separate emptiness test could race with
                # another worker taking the last URL.
                try:
                    url = queue.pop(0)
                except IndexError:
                    break

                # Cheap skip before paying for a page load.
                if url in crawled_url:
                    continue

                try:
                    recorded = self._crawl_page(url)
                except Exception as e:
                    # Best effort: report the failure and move on.
                    # Single-argument print(...) gives the same output under
                    # both the print statement and the print function.
                    print("error %s" % (e,))
                    continue

                if not recorded:
                    continue

                with self._state_lock:
                    cnt += 1
                    done = cnt
                print("%s %s len %s" % (done, self.name, len(queue)))
        finally:
            self._release_resources()

    def __exit__(self, *exc_info):
        """Release the worker's resources; kept for backward compatibility."""
        self._release_resources()

if __name__ == '__main__':
    # Start five spider threads, named 0 through 4.
    for worker_id in range(5):
        BaikeSpider(worker_id).start()

每天一小步，人生一大步！Good luck~

相关阅读:
ISCC 2018——write up
图的存储结构（十字链表、邻接多重表、边集数组）
图的存储结构
 树莓派（Raspberry Pi 3b）安装kali linux 2.0
树莓派3B kali2.0 启用SSH进行远程登录
 VS+VAssistX自动添加注释
 libtiff库使用
 word采用尾注进行参考文献排版的一些问题
 vs2008安装opencv2.4.6
Altera CYCLONE III FPGA BGA布线
原文地址：https://www.cnblogs.com/jkmiao/p/5073727.html