Python - Scraping Meizitu Images (Single-threaded and Multi-threaded Versions)


    I. Reference Article

        Python爬虫之——爬取妹子图片 (a Python crawler for scraping Meizitu images)

        The code in the article above is explained very clearly, and my basic approach is the same. The code in this post only adds some exception handling and improves the log output; I am writing it up mainly as a note for future reference. The changes are as follows:

    1. The added exception handling is called out separately in the code below (highlighted in red in the original post).

    2. The multi-threaded version uses the multiprocessing library and calls freeze_support() at the start of main(); otherwise creating worker processes can fail at runtime after the script is packaged into an exe (see the sketch after this list).

    3. The multi-threaded version also adds a command-line option for the number of workers.
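
    For reference, here is a minimal, self-contained sketch of that multiprocessing pattern: freeze_support() guarding a Pool whose size is taken from the command line. The work() function and its arguments are placeholders for illustration only, not the scraper's real Download() routine shown further below.

    #coding=utf-8
    import sys
    from multiprocessing import Pool, freeze_support

    def work(n):
        # stand-in task; the real scraper downloads one gallery here
        return n * n

    if __name__ == '__main__':
        freeze_support()  # needed on Windows when the script is frozen into an exe
        count = int(sys.argv[1]) if len(sys.argv) >= 2 else 1  # pool size from argv
        with Pool(count) as pool:
            results = pool.map(work, range(10))
        print(results)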

    II. Single-threaded Version

    #coding=utf-8
    import requests
    from bs4 import BeautifulSoup
    import os

    all_url = 'http://www.mzitu.com'

    # HTTP request headers
    Hostreferer = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Referer': 'http://www.mzitu.com'
    }
    # This Referer defeats the image host's hotlink protection
    Picreferer = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Referer': 'http://i.meizitu.net'
    }

    start_html = requests.get(all_url, headers=Hostreferer)

    # Save location
    path = os.getcwd() + '/mzitu/'

    # Find the highest list-page number
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text

    same_url = 'http://www.mzitu.com/page/'
    for n in range(1, int(max_page) + 1):  # walk every list page
        ul = same_url + str(n)
        start_html = requests.get(ul, headers=Hostreferer)
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for a in all_a:  # every gallery on this page
            title = a.get_text()  # extract the gallery title
            if title != '':
                print("Preparing to scrape: " + title)

                # Windows cannot create a directory whose name contains '?'
                if os.path.exists(path + title.strip().replace('?', '')):
                    # print('Directory already exists')
                    flag = 1
                else:
                    os.makedirs(path + title.strip().replace('?', ''))
                    flag = 0
                os.chdir(path + title.strip().replace('?', ''))
                href = a['href']
                html = requests.get(href, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                pic_max = mess.find_all('span')
                pic_max = pic_max[10].text  # number of pictures in this gallery
                if flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max):
                    print('Already saved, skipping')
                    continue
                for num in range(1, int(pic_max) + 1):  # every picture of this gallery
                    pic = href + '/' + str(num)
                    html = requests.get(pic, headers=Hostreferer)
                    mess = BeautifulSoup(html.text, "html.parser")
                    pic_url = mess.find('img', alt=title)
                    # Some pages have no matching <img> or no src attribute,
                    # which used to raise an exception, so filter them out here
                    if pic_url is None or 'src' not in pic_url.attrs:
                        continue
                    print(pic_url['src'])
                    html = requests.get(pic_url['src'], headers=Picreferer)
                    file_name = pic_url['src'].split('/')[-1]
                    with open(file_name, 'wb') as f:
                        f.write(html.content)
                print('Done')
        print('Page', n, 'finished')

    III. Multi-threaded Version

    #coding=utf-8
    import requests
    from bs4 import BeautifulSoup
    import os
    from multiprocessing import Pool
    from multiprocessing import freeze_support
    import sys

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
        'Referer': 'http://www.mzitu.com'
    }
    Picreferer = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Referer': 'http://i.meizitu.net'
    }

    def find_MaxPage():
        all_url = 'http://www.mzitu.com'
        start_html = requests.get(all_url, headers=header)
        # Find the highest list-page number
        soup = BeautifulSoup(start_html.text, "html.parser")
        page = soup.find_all('a', class_='page-numbers')
        max_page = page[-2].text
        return max_page

    def Download(href, title, path):
        html = requests.get(href, headers=header)
        soup = BeautifulSoup(html.text, 'html.parser')
        pic_max = soup.find_all('span')
        pic_max = pic_max[10].text  # number of pictures in this gallery
        if (os.path.exists(path + title.strip().replace('?', ''))
                and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
            print('Already downloaded, moving on to the next gallery: ' + title)
            return 1
        print(f"Found {pic_max} pictures, preparing: " + title)
        os.makedirs(path + title.strip().replace('?', ''), exist_ok=True)
        os.chdir(path + title.strip().replace('?', ''))
        for num in range(1, int(pic_max) + 1):
            pic = href + '/' + str(num)
            html = requests.get(pic, headers=header)
            mess = BeautifulSoup(html.text, "html.parser")
            pic_url = mess.find('img', alt=title)
            # Some pages have no matching <img> or no src attribute,
            # which used to raise an exception, so filter them out here
            if pic_url is None or 'src' not in pic_url.attrs:
                continue
            print(f"{title}: {pic_url['src']}")
            html = requests.get(pic_url['src'], headers=header)
            file_name = pic_url['src'].split('/')[-1]
            with open(file_name, 'wb') as f:
                f.write(html.content)
        print('Gallery finished: ' + title)

    if __name__ == '__main__':
        freeze_support()  # prevents process creation from failing when packaged as an exe

        # Number of worker processes in the pool
        count = 1
        if len(sys.argv) >= 2:
            count = int(sys.argv[1])

        pool = Pool(count)
        print(f'Initialized download pool with {count} worker process(es)')

        # Save location
        path = os.getcwd() + '/mzitu_mutil/'
        max_page = find_MaxPage()  # total number of list pages to crawl
        print(f'Found {max_page} pages of galleries, please wait for the downloads to finish')
        same_url = 'http://www.mzitu.com/page/'

        for n in range(1, int(max_page) + 1):
            each_url = same_url + str(n)
            start_html = requests.get(each_url, headers=header)  # request one list page of galleries
            soup = BeautifulSoup(start_html.text, "html.parser")
            all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
            for a in all_a:  # every gallery on this page
                title = a.get_text()  # extract the gallery title
                if title != '':
                    href = a['href']  # link to the gallery
                    pool.apply_async(Download, args=(href, title, path))
        pool.close()
        pool.join()
        print('All downloads finished')

    IV. Resource Download

      Download address: Python爬取妹子图-单线程和多线程版本 (source code for the single-threaded and multi-threaded versions)

    Reprint notice: unless otherwise stated, all articles on this site are original and copyrighted; when reposting, please credit 朝十晚八.
