• 爬取图片(二)


    源码:

     1 import requests
     2 from lxml import etree
     3 import os
     4 
     5 
     # Collect the gallery links listed on one index page.
     def get_url(page, headers):
         """Return ``(url, name)`` tuples for the galleries on index page *page*.

         Args:
             page: Number of the index page, substituted into
                 ``http://www.mzitu.com/page/<page>/``.
             headers: Mapping of HTTP headers sent with the request.

         Returns:
             A list of ``(gallery_url, gallery_title)`` tuples in page order.
             List items that carry no titled ``span/a`` link are skipped.

         Raises:
             requests.RequestException: If the request times out, fails, or
                 the server answers with an HTTP error status.
         """
         index_url = 'http://www.mzitu.com/page/{}/'.format(page)
         # A timeout keeps an unresponsive server from hanging the crawl forever.
         response = requests.get(index_url, headers=headers, timeout=10)
         response.raise_for_status()

         html_ele = etree.HTML(response.text)
         if html_ele is None:
             # lxml yields None for an empty document; there is nothing to list.
             return []

         url_tuple_list = []
         for ele in html_ele.xpath('//ul[@id="pins"]/li'):
             hrefs = ele.xpath('./span/a/@href')
             links = ele.xpath('./span/a')
             # Some <li> entries may not be galleries (presumably ads or
             # placeholders); skip them instead of failing with IndexError.
             if not hrefs or not links or not links[0].text:
                 continue
             url_tuple_list.append((hrefs[0], links[0].text))
         return url_tuple_list
    19 
    20 
    # Download every image of one gallery.
    def get_pics(url, headers, name):
        """Download all images of the gallery at *url* into ``www.mzitu.com/<name>/``.

        Args:
            url: Gallery URL; picture page N is fetched from ``url + '/NN'``
                (page number zero-padded to two digits).
            headers: Mapping of HTTP headers sent with every request.
            name: Gallery title, used as the directory name. Characters that
                are not allowed in file names are replaced by ``_``.

        Raises:
            requests.RequestException: If a page request times out, fails, or
                returns an HTTP error status.
            IndexError: If the gallery page has fewer than two page-navigation
                spans, so the page count cannot be read.
        """
        request_timeout = 10  # seconds; avoids hanging on a stalled connection

        # The title is scraped text, so neutralise path separators and other
        # characters that are invalid in directory names on common platforms.
        safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name).strip()
        dirs_name = 'www.mzitu.com/' + safe_name
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(dirs_name, exist_ok=True)

        # Read the highest picture-page number from the pager.
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        html_ele = etree.HTML(response.text)
        # The second-to-last pager span holds the page count (the last one is
        # presumably the "next page" link -- confirm against the live markup).
        max_page = int(html_ele.xpath('//div[@class="pagenavi"]/a/span')[-2].text)

        # Visit each picture page and store its image.
        for page in range(1, max_page + 1):
            url_page = '{}/{:02d}'.format(url, page)
            response = requests.get(url_page, headers=headers, timeout=request_timeout)
            response.raise_for_status()
            html_ele = etree.HTML(response.text)
            if html_ele is None:
                continue
            pic_urls = html_ele.xpath('//div[@class="main-image"]/p/a/img/@src')
            if not pic_urls:
                # No image on this page; keep going with the remaining pages.
                continue
            pic_url = pic_urls[0]

            filename = dirs_name + '/' + pic_url.split('/')[-1]
            # Check before downloading so already-saved images cost no request.
            if os.path.exists(filename):
                continue

            pic_response = requests.get(pic_url, headers=headers, timeout=request_timeout)
            if not pic_response.ok:
                # Do not store an error page under an image file name.
                continue
            with open(filename, 'wb') as f:
                f.write(pic_response.content)
            print(filename)
    50 
    51 
    if __name__ == '__main__':
        # Headers sent with every request (index pages, gallery pages, images).
        headers = {
            "Referer": "http://www.mzitu.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        }
        # Crawl index pages 1 and 2, downloading each listed gallery in turn.
        for page in (1, 2):
            for url, name in get_url(page, headers):
                get_pics(url, headers, name)
  • 相关阅读:
    (5.13)mysql高可用系列——1主3从复制(SSL)
    sql server凭据
    (5.3.3)数据库迁移——迁移一个实例到底需要注意哪些方面?
    provider: Shared Memory Provider, error: 0
    (5.3.2)数据库迁移——SSIS包批量导出
    (4.35)sql server清理过期文件【转】
    Linux学习笔记(16)Linux前后台进程切换(fg/bg/jobs/ctrl+z)
    导入导出维护计划
    C++的那些事:你真的了解引用吗
    为什么构造函数不能是虚函数
  • 原文地址:https://www.cnblogs.com/zhxd-python/p/9501304.html
Copyright © 2020-2023  润新知