• python爬取站长之家植物图片


     1 from lxml import etree
     2 from urllib import request
     3 import urllib.parse
     4 import time
     5 import os
     6 
     7 
     8 def handle_request(url,page):
     9     if page == 1:
    10         url = url.format('')
    11     else:
    12         url = url.format('_'+str(page))
    13     headers = {
    14         "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    15     }
    16     request = urllib.request.Request(url=url,headers=headers)
    17 
    18     return request
    19 
    20 def download_img(image_src):
    21     dirpath = r'G:/untitled/zhiwu'
    22     if not os.path.exists(dirpath):
    23         os.mkdir(dirpath)
    24     # 文件名
    25     filename= os.path.basename(image_src)
    26     # 文件路径
    27     filepath = os.path.join(dirpath, filename)
    28     # 发送请求保存图片
    29     headers = {
    30         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    31     }
    32     request = urllib.request.Request(url=image_src, headers=headers)
    33     response = urllib.request.urlopen(request)
    34     print(response)
    35     with open(filepath,'wb') as fp:
    36         fp.write(response.read())
    37 
    38 def parse_content(content):
    39     # 解析内容,获取图片
    40     tree = etree.HTML(content)
    41     image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src')
    42     for image_src1 in image_list:
    43         image_src = image_src1.split('/')[-1]
    44         download_img(image_src)
    45 
    46 def main():
    47     url = 'http://sc.chinaz.com/tupian/huadetupian{}.html'
    48     start_page = int(input('请输入起始页码:'))
    49     end_page = int(input('请输入结束页码:'))
    50     for page in range(start_page, end_page + 1):
    51         request = handle_request(url, page)
    52         content = urllib.request.urlopen(request).read().decode()
    53         parse_content(content)
    54         time.sleep(1)
    55 
    56 
    57 if __name__ == '__main__':
    58     main()
  • 相关阅读:
    Triangle
    Pascal's Triangle II
    Pascal's Triangle
    Populating Next Right Pointers in Each Node II
    Populating Next Right Pointers in Each Node
    [c++]this指针理解
    [oracle]一个最简单的oracle存储过程"proc_helloworld"
    Oracle 的 INSERT ALL和INSERT FIRST
    Linux2.6 内核的 Initrd 机制解析
    /boot/grub/menu.lst详解
  • 原文地址:https://www.cnblogs.com/erlchixiha/p/11805319.html
Copyright © 2020-2023  润新知