• Python: crawling the Jandan "meizi" pictures page by page and saving them locally (plain version and multi-threaded version)


      I wanted to test how efficient a multi-threaded Python crawler really is, so I wrote both a multi-threaded and a plain single-threaded version of the same crawler and timed them. To my surprise, the single-threaded version turned out to be faster than the multi-threaded one, and I don't understand why. There seem to be two possibilities: either my multi-threaded crawler is written badly, or Python's support for multithreading is limited.

      I'm leaving the code here for now and hope to work out the answer later.
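
      A rough way to compare the two versions is simply to time the whole run. A minimal, untested sketch, assuming it replaces the main(905, 906) call at the bottom of either script:

    from time import time

    start = time()
    main(905, 906)                      # run either the plain or the threaded crawler
    print 'total time:', time() - start, 'seconds'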

      Single-threaded version of the crawler:

      

    #coding=utf-8
    # Python 2: force UTF-8 as the default string encoding
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')

    import requests
    import re
    import os
    import urllib
    from time import ctime

    class crawl_girls(object):
        """Download every picture on one jandan.net/ooxx page into a per-page folder."""
        def __init__(self, url, pagenum):
            self.url = url
            self.pagenum = pagenum
            self.content = ""
            self.img_urls = []
            self.img_names = []

        def getContent(self):
            # fetch the raw HTML of the page
            try:
                imgs_html = requests.get(self.url)
                self.content = imgs_html.content
            except requests.exceptions.RequestException as e:
                print e

        def getImgNames(self):
            # the comment ids are reused as local file names
            img_names_patt = r'<li id="comment-(.+?)">'
            self.img_names = re.findall(img_names_patt, self.content)

        def getImgUrls(self):
            # image URLs sit inside <p><img src="..."> tags
            img_urls_patt = r'<p><img src="(.+?)"'
            self.img_urls = re.findall(img_urls_patt, self.content)

        def start_download(self):
            self.getContent()
            self.getImgNames()
            self.getImgUrls()

            # save into ./<pagenum>/ under the current working directory
            curr_path = os.getcwd()
            curr_path = curr_path.replace('\\', '/')   # normalize Windows separators
            curr_path = curr_path + '/'
            file_dir = curr_path + str(self.pagenum) + '/'
            os.mkdir(file_dir)

            for name_url in zip(self.img_names, self.img_urls):
                pic_name = name_url[1][-4:]            # file extension, e.g. '.jpg'
                file_path = file_dir + name_url[0] + pic_name
                print 'starting at', ctime()
                urllib.urlretrieve(name_url[1], file_path)   # download one by one
                print 'finished at', ctime()


    def main(page_start, page_end):
        page = r'http://jandan.net/ooxx/page-1#comments'
        for pagenum in range(page_start, page_end+1):
            # substitute the real page number into the URL template
            url = page.replace('1', str(pagenum))
            print url
            girls = crawl_girls(url, pagenum)
            girls.start_download()

        print "all Done"

    if __name__ == '__main__':
        main(905, 906)

      Multi-threaded version of the crawler:

      

    #coding=utf-8
    # Python 2: force UTF-8 as the default string encoding
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')

    import requests
    import re
    import os
    from threading import Thread
    import urllib
    from time import ctime

    class crawl_girls(object):
        """Download every picture on one jandan.net/ooxx page into a per-page folder."""
        def __init__(self, url, pagenum):
            self.url = url
            self.pagenum = pagenum
            self.content = ""
            self.img_urls = []
            self.img_names = []

        def getContent(self):
            # fetch the raw HTML of the page
            try:
                imgs_html = requests.get(self.url)
                self.content = imgs_html.content
            except requests.exceptions.RequestException as e:
                print e

        def getImgNames(self):
            # the comment ids are reused as local file names
            img_names_patt = r'<li id="comment-(.+?)">'
            self.img_names = re.findall(img_names_patt, self.content)

        def getImgUrls(self):
            # image URLs sit inside <p><img src="..."> tags
            img_urls_patt = r'<p><img src="(.+?)"'
            self.img_urls = re.findall(img_urls_patt, self.content)

        def start_thread(self):
            self.getContent()
            self.getImgNames()
            self.getImgUrls()

            # save into ./<pagenum>/ under the current working directory
            curr_path = os.getcwd()
            curr_path = curr_path.replace('\\', '/')   # normalize Windows separators
            curr_path = curr_path + '/'
            file_dir = curr_path + str(self.pagenum) + '/'
            os.mkdir(file_dir)

            for name_url in zip(self.img_names, self.img_urls):
                pic_name = name_url[1][-4:]            # file extension, e.g. '.jpg'
                file_path = file_dir + name_url[0] + pic_name
                print 'start download', file_path
                print 'starting at', ctime()
                # start one download thread, then wait for it before the next one
                thread = download_threads(name_url[1], file_path)
                thread.start()
                thread.join()
                print 'finished at', ctime()

    class download_threads(Thread):
        """One thread that downloads a single image to a local path."""
        def __init__(self, url, path):
            Thread.__init__(self)
            self.url = url
            self.path = path

        def run(self):
            urllib.urlretrieve(self.url, self.path)

    def main(page_start, page_end):
        page = r'http://jandan.net/ooxx/page-1#comments'
        for pagenum in range(page_start, page_end+1):
            # substitute the real page number into the URL template
            url = page.replace('1', str(pagenum))
            print url
            girls = crawl_girls(url, pagenum)
            girls.start_thread()

        print "all Done"

    if __name__ == '__main__':
        main(905, 906)
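
      A note on the loop in start_thread() above: each download_threads instance is started and then immediately joined, so the next download cannot begin until the previous one has finished. The threaded version therefore still fetches one image at a time, and additionally pays the cost of creating a thread per image, which by itself could explain why it is no faster than the plain version. A minimal, untested sketch of how the same loop could let the downloads overlap, reusing the download_threads class defined above:

    # hypothetical replacement for the download loop inside start_thread():
    # start every thread first, join them all afterwards
    threads = []
    for name_url in zip(self.img_names, self.img_urls):
        pic_name = name_url[1][-4:]              # file extension, e.g. '.jpg'
        file_path = file_dir + name_url[0] + pic_name
        thread = download_threads(name_url[1], file_path)
        thread.start()                           # start it, but do not wait yet
        threads.append(thread)

    for thread in threads:
        thread.join()                            # wait for all downloads at the end

      Because urlretrieve spends most of its time blocked on network I/O, during which the interpreter lock is released, threads started this way can actually overlap their downloads; trying a variant like this would be one way to tell whether the slowdown comes from the crawler code rather than from Python's threading itself.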

      

  • Original post: https://www.cnblogs.com/lkprof/p/3267039.html