• python 多线程抓取动态数据


    网上关于利用多线程抓取动态数据的教程不少,但大多过于繁杂——难道就不能精简再精简吗?

    不多解释,直接上代码,基本上还是很好懂的。

    #!/usr/bin/env python
    # coding=utf-8
    
    import urllib2
    import re,sys
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import threading
    import time
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Work list shared by all spider threads, seeded with five Baike entry
    # pages; workers pop URLs from the front and append newly found links.
    queue = list(map(
        "http://baike.baidu.com/view/{0}.htm".format,
        (8332, 145819, 643415, 157424, 149759),
    ))

    # De-duplication sets consulted by the spiders before recording an entry.
    crawled_url, crawled_word = set(), set()

    # Running total of entries processed successfully, across all threads.
    cnt = 0
    
    class BaikeSpider(threading.Thread):
        """Worker thread that crawls Baidu Baike entries through a real browser.

        Each worker drives its own Chrome instance, takes URLs from the
        module-level ``queue`` and appends one tab-separated record per entry
        (word, page views, votes, URL) to ``baike_words_<name>.txt``.
        Links to further entries found on a page are pushed back onto
        ``queue``.  ``crawled_url`` / ``crawled_word`` are used to skip
        entries that have already been recorded, and ``cnt`` counts the
        records written by all workers together.
        """

        # Serialises updates to the module-level crawl state (queue,
        # crawled_url, crawled_word, cnt), which every worker mutates.
        _state_lock = threading.Lock()

        # Entry pages look like .../view/<digits> or .../subview/<digits>[/<digits>].
        # Raw string so that ``\d`` reaches the regex engine intact.
        _ENTRY_URL_RE = re.compile(
            r"baike.baidu.com/view/\d+|baike.baidu.com/subview/\d+(/\d+)?")

        def __init__(self, name):
            """Create the worker.

            :param name: worker identifier; converted to ``str`` and used both
                as the thread name and in the output file name.
            """
            threading.Thread.__init__(self)
            self.name = str(name)

            self.browser = webdriver.Chrome()
            # Every worker writes its records to its own file.
            self.fw = open("baike_words_" + self.name + ".txt", "wb")

        @classmethod
        def _is_entry_link(cls, href):
            """Return True if *href* points at a Baike entry page worth queueing."""
            if not href or len(href) < 8 or re.search(u"javascript", href):
                return False
            return cls._ENTRY_URL_RE.search(href) is not None

        def _crawl(self, url):
            """Load *url*, queue its entry links and write its record.

            :returns: True if a record was written, False if the entry was
                skipped because its word or URL had been recorded before.
            """
            self.browser.get(url)
            # Sleep 0.5s to wait for the page data to load.
            time.sleep(0.5)
            links = BeautifulSoup(urllib2.urlopen(url).read(), "lxml").find_all("a")
            vote = self.browser.find_element_by_class_name("vote-count").text
            view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text
            word = self.browser.title.split(u"_")[0]

            with self._state_lock:
                if word in crawled_word or url in crawled_url:
                    return False
                # Mark the entry before scanning its links so that a link
                # back to the page itself is not queued again.
                crawled_url.add(url)
                crawled_word.add(word)
                for link in links:
                    href = link.get("href")
                    if self._is_entry_link(href) and href not in crawled_url:
                        queue.append(href)

            # One record per line: word, views, votes, url.
            # NOTE(review): the published listing shows plain spaces as
            # separators; "\t" / "\n" are assumed to be the escapes that were
            # stripped from it -- confirm against the original script.
            linedata = word + "\t" + view + "\t" + vote + "\t" + url + "\n"
            # The file is opened in binary mode, so encode explicitly instead
            # of relying on the process-wide default encoding.
            self.fw.write(linedata.encode("utf-8"))
            return True

        def run(self):
            """Process URLs from ``queue`` until it is empty, then clean up."""
            global cnt

            try:
                while True:
                    try:
                        url = queue.pop(0)
                    except IndexError:
                        # Another worker took the last URL between checks.
                        break

                    try:
                        if not self._crawl(url):
                            continue
                    except Exception as e:
                        # Best effort: report the failure and move on to the
                        # next URL rather than killing the worker.
                        print("error %s" % (e,))
                        continue

                    with self._state_lock:
                        cnt += 1
                        done = cnt
                    print("%s %s len %s" % (done, self.name, len(queue)))
            finally:
                # Always release the output file and the Chrome instance.
                try:
                    self.fw.close()
                finally:
                    self.browser.quit()

        def __exit__(self, *exc_info):
            """Close the output file.

            Kept for backward compatibility; ``run`` already closes the file
            when the worker finishes, and closing it twice is harmless.
            """
            self.fw.close()


    if __name__ == '__main__':
        # Start 5 crawler threads.
        for i in range(5):
            t = BaikeSpider(i)
            t.start()
    每天一小步,人生一大步!Good luck~
  • 相关阅读:
    ISCC 2018——write up
    图的存储结构(十字链表、邻接多重表、边集数组)
    图的存储结构
    树梅派(Raspberry Pi 3b)安装kali linux 2.0
    树梅派3B kali2.0 启用SSH进行远程登录
    VS+VAssistX自动添加注释
    libtiff库使用
    word采用尾注进行参考文献排版的一些问题
    vs2008安装opencv2.4.6
    Altera CYCLONE III FPGA BGA布线
  • 原文地址:https://www.cnblogs.com/jkmiao/p/5073727.html
Copyright © 2020-2023  润新知