• Python 爬取煎蛋网妹子图片


     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # @Date    : 2017-08-24 10:17:28
     4 # @Author  : EnderZhou (zptxwd@gmail.com)
     5 # @Link    : http://www.cnblogs.com/enderzhou/
     6 # @Version : $Id$
     7 
     8 import requests
     9 from bs4 import BeautifulSoup as bs
    10 import threading
    11 import Queue
    12 import urllib
    13 
    14 class jiandan_ooxx(threading.Thread):
    15     def __init__(self,queue):
    16         threading.Thread.__init__(self)
    17         self._queue = queue
    18 
    19     def run(self):
    20         while not self._queue.empty():
    21             url = self._queue.get_nowait()
    22             self.spider(url)
    23 
    24     def spider(self,url):
    25         r = requests.get(url = url)
    26         soup = bs(r.content,'html.parser')
    27         imges = soup.find_all(name='img',attrs={})
    28         lists = []
    29         for i in imges:
    30             if 'border' in str(i):
    31                 continue
    32             elif 'onload' in str(i):
    33                 lists.append(i['org_src'])
    34                 print i['org_src']
    35                 img = 'http:' + i['org_src']
    36             else:
    37                 lists.append(i['src'])
    38                 print i['src']
    39                 img = 'http:' + i['src']
    40             name = img.split('/')[-1]
    41             urllib.urlretrieve(img,filename=name)
    42 
    43 def main(number):
    44     url = 'http://jandan.net/ooxx/page-'
    45     headers = {}
    46     queue = Queue.Queue()
    47 
    48     # 此处由最新页面开始爬取,默认爬取最新10页的图片,把number-11改成0即可爬取全部页面的图片。
    49     for i in xrange(number,number-11,-1):
    50         queue.put(url+str(i))
    51     threads = []
    52     thread_count = 10
    53 
    54     for i in range(thread_count):
    55         threads.append(jiandan_ooxx(queue))
    56 
    57     for t in threads:
    58         t.start()
    59     for t in threads:
    60         t.join()
    61 
    62 if __name__ == '__main__':
    63     # 获取最新页码并传入main函数
    64     r = requests.get('http://jandan.net/ooxx')
    65     soup = bs(r.content,'html.parser')
    66     string = soup.find_all(name='span',attrs={'class':'current-comment-page'})
    67     number = int(string[1].string[1:-1]) 
    68     main(number)
  • 相关阅读:
    谁是你心目中最优秀的ajax框架
    23种设计模式(1):单例模式
    23种设计模式(8):观察者模式
    设计模式六大原则(3):依赖倒置原则
    23种设计模式(2):工厂方法模式
    oracle中给表和字段添加注释
    单例模式讨论篇:单例模式与垃圾回收
    设计模式六大原则(6):开闭原则
    mysql命名锦集
    23种设计模式(3):抽象工厂模式
  • 原文地址:https://www.cnblogs.com/enderzhou/p/7422441.html
Copyright © 2020-2023  润新知