• Python: scraping all the wallpapers from a wallpaper site
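The script below first builds the five theme-page URLs of the "xiaoqingxin" (fresh-style) section of win4000.com, collects every picture-series link found on those pages, then walks each series page by page and saves the full-size images into per-series folders under a tmp directory.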


     import requests as r
     from bs4 import BeautifulSoup
     import os

     base_url = "http://www.win4000.com"  # site root
     theme_base_url = "http://www.win4000.com/zt/xiaoqingxin_"
     # Build the link for every theme page with a list comprehension
     theme_url_list = [theme_base_url + str(x) + ".html" for x in range(1, 6)]

     # List of picture-series links
     series_url_list = []

     # Fake the User-Agent so requests look like a normal browser
     headers = {
         "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0"
     }

     # Collect all picture-series links on one theme page
     def get_series_url_lists(url, headers):
         resp = r.get(url, headers=headers)
         if resp is not None:
             result = resp.text
             bs = BeautifulSoup(result, "html.parser")
             ul = bs.find("div", attrs={"class": "tab_tj"})
             a_s = ul.find_all("a")
             for a in a_s:
                 series_url_list.append(a.get("href"))

     # Root folder to save into
     save_root_dir = os.path.join(os.getcwd(), "tmp/")

     # Fetch every picture of one series
     def fetch_all_series_pic(url, headers):
         cur_page = 1
         while True:
             current_url = url
             if cur_page > 1:
                 # Page 2 of "xxx.html" lives at "xxx_2.html", and so on
                 current_url = url.replace(".html", "_" + str(cur_page) + ".html")
             resp = r.get(current_url, headers=headers)
             # Leave the loop once the HTTP status code signals a missing page
             if resp.status_code == 404:
                 break
             if resp is not None:
                 bs = BeautifulSoup(resp.text, "lxml")
                 # Use the page title as the folder name
                 title_name = bs.find("div", attrs={"class": "ptitle"}).h1.text
                 save_dir = os.path.join(save_root_dir, title_name)
                 if not os.path.exists(save_dir):
                     os.makedirs(save_dir)
                 # Pick the image nodes with a CSS selector
                 imgs = bs.select("img.pic-large")
                 for img in imgs:
                     download_pic(img.attrs.get("src"), save_dir)
                 cur_page += 1

     # Download one picture
     def download_pic(url, path):
         print("Downloading image: " + url)
         try:
             # Split the URL on "/" and take the last element as the file name
             pic_name = url.split("/")[-1]
             # .content returns binary data, .text returns Unicode (str);
             # an image is a binary file
             img_resp = r.get(url).content
             with open(os.path.join(path, pic_name), "wb+") as f:
                 f.write(img_resp)
         except Exception as reason:
             print(str(reason))

     if __name__ == "__main__":
         for url in theme_url_list:
             get_series_url_lists(url, headers)
         for url in series_url_list:
             fetch_all_series_pic(url, headers)
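One caveat: requests.get never returns None; on a network failure it raises an exception, so the resp is not None checks above never actually catch anything. A more defensive fetch helper could look like the sketch below (safe_get and the 10-second timeout are my additions, not part of the original script):

     # Sketch (not in the original script): wrap r.get so that both network
     # errors and HTTP error codes come back as None instead of an exception
     def safe_get(url, headers):
         try:
             resp = r.get(url, headers=headers, timeout=10)  # assumed timeout
             resp.raise_for_status()  # raises for 4xx/5xx responses
             return resp
         except r.RequestException as reason:
             print(str(reason))
             return None

With this helper, the 404 check in the pagination loop would become a plain "if resp is None: break". To run the script itself, the third-party packages requests, beautifulsoup4, and lxml must be installed, since the code uses both the html.parser and lxml backends of BeautifulSoup.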
• Original article: https://www.cnblogs.com/zoutingrong/p/13739719.html