import os

import requests as r
from bs4 import BeautifulSoup

# Site root
base_url = "http://www.win4000.com"
theme_base_url = "http://www.win4000.com/zt/xiaoqingxin_"
# Build the URL of every listing page with a list comprehension
theme_url_list = [theme_base_url + str(x) + ".html" for x in range(1, 6)]

# Album (photo-set) URLs collected from the listing pages
series_url_list = []

# Spoof a browser User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0"
}

# Collect every album URL on one listing page
def get_series_url_lists(url, headers):
    resp = r.get(url, headers=headers)
    if resp is not None:
        result = resp.text
        bs = BeautifulSoup(result, "html.parser")
        ul = bs.find("div", attrs={"class": "tab_tj"})
        a_s = ul.find_all("a")
        for a in a_s:
            series_url_list.append(a.get("href"))

# Root folder for downloads
save_root_dir = os.path.join(os.getcwd(), "tmp/")

# Fetch every picture in one album
def fetch_all_series_pic(url, headers):
    cur_page = 1
    while True:
        current_url = url
        # Page 2 onwards uses URLs of the form xxx_2.html, xxx_3.html, ...
        if cur_page > 1:
            current_url = url.replace(".html", "_" + str(cur_page) + ".html")
        resp = r.get(current_url, headers=headers)
        # An HTTP 404 means we have run past the last page, so stop
        if resp.status_code == 404:
            break
        if resp is not None:
            bs = BeautifulSoup(resp.text, "lxml")
            # Use the album title as the folder name
            title_name = bs.find("div", attrs={"class": "ptitle"}).h1.text
            save_dir = os.path.join(save_root_dir, title_name)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            # Select the picture nodes with a CSS selector
            imgs = bs.select("img.pic-large")
            for img in imgs:
                download_pic(img.attrs.get("src"), save_dir)
        cur_page += 1

# Download one picture
def download_pic(url, path):
    print("Downloading picture: " + url)
    try:
        # Split the URL on "/" and take the last element as the file name
        pic_name = url.split("/")[-1]
        # .content returns raw bytes, .text returns decoded str;
        # pictures are binary data, so use .content
        img_resp = r.get(url).content
        with open(os.path.join(path, pic_name), "wb") as f:
            f.write(img_resp)
    except Exception as reason:
        print(str(reason))

if __name__ == "__main__":
    for url in theme_url_list:
        get_series_url_lists(url, headers)
    for url in series_url_list:
        fetch_all_series_pic(url, headers)
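
For reference, the pagination that fetch_all_series_pic depends on is plain string surgery on the album URL. A minimal sketch of that rule, run on its own; the album link below is a hypothetical placeholder, not one taken from the site:

# Sketch of the pagination rule assumed by fetch_all_series_pic:
# page 1 keeps the album URL, page n (n > 1) inserts _n before .html.
album = "http://www.win4000.com/meinv000000.html"  # hypothetical album URL
pages = [album] + [album.replace(".html", "_" + str(n) + ".html") for n in range(2, 4)]
print(pages)
# ['http://www.win4000.com/meinv000000.html',
#  'http://www.win4000.com/meinv000000_2.html',
#  'http://www.win4000.com/meinv000000_3.html']

The crawler never computes a page count up front; it simply keeps generating the next URL in this sequence until the server answers 404.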