• umei-spider


    umei-spider

    复制代码
     1 #!/usr/bin/python3
     2 
     3 import requests
     4 from bs4 import BeautifulSoup
     5 from contextlib import closing
     6 import time
     7 import uuid
     8 
     9 
    class SevenOneSixZero:
        """Crawler that downloads the thumbnails shown on umei.cc tag listing pages.

        Typical use: build the listing-page URLs with ``get_url_list``, assign one
        of them to ``url``, collect the thumbnail sources with
        ``get_img_src_list`` and hand them to ``download_img``.
        """

        # Listing pages differ only by their page number.
        PAGE_URL_TEMPLATE = 'http://www.umei.cc/tags/xiezhen_{}.htm'

        def __init__(self):
            """Set up the default listing page, an empty URL queue and the request headers."""
            self.photo_id = []  # not populated by any method of this class
            self.url = self.PAGE_URL_TEMPLATE.format(1)
            self.urls = []
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
            }

        def get_ids(self):
            """Request the current listing page and discard the response.

            Placeholder: nothing is parsed or stored yet.
            """
            requests.get(url=self.url, headers=self.headers)

        def get_page_content(self, url):
            """Fetch ``url`` and return its body decoded with the detected encoding.

            ``Response.encoding`` is guessed from the HTTP headers only (falling
            back to ISO-8859-1 when a text response declares no charset), while
            ``Response.apparent_encoding`` is detected from the body itself.
            ``requests.utils.get_encodings_from_content`` is a further option that
            reads the charset declared inside the page.

            :param url: address of the page to download
            :return: the page source as ``str``
            """
            res = requests.get(url, headers=self.headers)
            # Decode the raw bytes once with the detected encoding. Re-encoding
            # ``res.text`` with ``res.encoding`` fails when that encoding is None
            # and is lossy when the first decode had to replace characters.
            res.encoding = res.apparent_encoding
            return res.text

        def get_img(self):
            """Return the ``<img>`` tags of the thumbnails on the page at ``self.url``."""
            page = self.get_page_content(self.url)
            soup = BeautifulSoup(page, 'lxml')
            return soup.select('body > div.wrap > div.TypeList > ul > li > a > img')

        def get_img_src_list(self):
            """Return one ``{'src': ...}`` dict per thumbnail on the current page."""
            return [{'src': img.get('src')} for img in self.get_img()]

        def download_img(self, img_list, save_dir='D:/python/imgs'):
            """Download every image in ``img_list`` into ``save_dir``.

            Files are named with a fresh UUID and a ``.jpg`` suffix. Entries
            without a ``src`` and responses with an HTTP error status are skipped
            instead of aborting the whole run or saving an error page as an image.

            :param img_list: iterable of dicts holding the image address under 'src'
            :param save_dir: target directory, created when it does not exist
            :return: None
            """
            import os  # local import keeps this method self-contained

            os.makedirs(save_dir, exist_ok=True)
            saved = 0
            for img_dic in img_list:
                src = img_dic.get('src')
                if not src:
                    continue
                # ``headers`` must be passed by keyword: the second positional
                # argument of ``requests.get`` is ``params`` (the query string).
                # ``stream=True`` lets ``iter_content`` read the body in chunks.
                res = requests.get(src, headers=self.headers, stream=True)
                with closing(res) as r:
                    if not r.ok:
                        print('下载失败(HTTP {}),已跳过:{}'.format(r.status_code, src))
                        continue
                    name = uuid.uuid1()
                    path = os.path.join(save_dir, '{}.jpg'.format(name))
                    # The name is unique, so a plain truncating write is enough.
                    with open(path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024):
                            if chunk:
                                f.write(chunk)
                    saved += 1
                    print('成功下载第{}张图:{}.jpg'.format(saved, name))

        def get_url_list(self, start, end):
            """Append the listing-page URLs for pages ``start``..``end - 1`` to ``self.urls``.

            The list accumulates across calls; the same list object is returned.

            :param start: first page number (inclusive)
            :param end: last page number (exclusive)
            :return: ``self.urls`` after the new URLs have been appended
            """
            self.urls.extend(
                self.PAGE_URL_TEMPLATE.format(page) for page in range(start, end)
            )
            return self.urls
    79 
    80 
    if __name__ == '__main__':
        # Crawl listing pages 22 through 29: point the spider at each page in
        # turn, collect its thumbnail sources and download them.
        spider = SevenOneSixZero()
        for page_url in spider.get_url_list(22, 30):
            spider.url = page_url
            spider.download_img(spider.get_img_src_list())
    复制代码
  • 相关阅读:
    浏览器环境下JavaScript脚本加载与执行探析之代码执行顺序
    DOM扩展:DOM API的进一步增强[总结篇-下]
    DOM扩展:DOM API的进一步增强[总结篇-上]
    Network 第九篇
    Network 第八篇 – 动态路由-OSPF
    Network 第七篇
    Network 第六篇
    Network 第五篇
    Network 第四篇
    Network 第三篇
  • 原文地址:https://www.cnblogs.com/valorchang/p/11475840.html
Copyright © 2020-2023  润新知