This article is reposted from https://blog.csdn.net/baidu_35085676/article/details/68958267
I ran the code from the article myself; the parsing is done mainly with BeautifulSoup. When you run it, you may hit `TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.` I suspect this is the site's anti-scraping mechanism kicking in; you can try switching to a different IP address.
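If you do run into that timeout, one mitigation worth trying (my own sketch, not part of the original post) is to give requests an explicit timeout and retry a few times, optionally through a proxy so the request goes out from a different IP. The `get_with_retry` helper and the proxy address below are placeholder assumptions:

```python
import requests

# Hypothetical proxy endpoint -- substitute your own; not part of the original script
PROXIES = {'http': 'http://127.0.0.1:8080'}

def get_with_retry(url, headers, retries=3, use_proxy=False):
    """Fetch a URL with an explicit timeout, retrying a few times before giving up."""
    for attempt in range(retries):
        try:
            return requests.get(
                url,
                headers=headers,
                timeout=10,  # fail fast instead of hanging until WinError 10060
                proxies=PROXIES if use_proxy else None,
            )
        except requests.exceptions.RequestException as e:
            print('Attempt {} failed: {}'.format(attempt + 1, e))
    return None
```

Any of the `requests.get` calls in the script below could be routed through a helper like this.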
```python
import requests
from bs4 import BeautifulSoup
import os
import time

all_url = 'http://www.mzitu.com'

# HTTP request headers
Hostreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://www.mzitu.com'
}
Picreferer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': 'http://i.meizitu.net'
}
# The Referer above defeats the image hotlink protection

# Save location
path = 'E:/pythonFile/meititu/mei/'
# Progress record file
data = 'E:/pythonFile/meititu/mei/.data'

# Read the saved progress record
def get_log(file):
    page = 1
    line = 0
    try:
        with open(file, 'r') as f:
            l = f.readline()
            page, line = [int(i) for i in l.split('|')]
    except Exception as e:
        print(e)
        print('Failed to read the record; starting from the beginning')
    return page, line

# Save the progress record
def put_log(file, page, line):
    try:
        with open(file, "w") as f:
            f.write('{}|{}'.format(page, line))
    except Exception as e:
        print('Failed to save the record: [{}]'.format(e))

# Find the largest list-page number
def find_max_page():
    start_html = requests.get(all_url, headers=Hostreferer)
    soup = BeautifulSoup(start_html.text, "html.parser")
    page = soup.find_all('a', class_='page-numbers')
    max_page = page[-2].text
    max_page = int(max_page)
    return max_page

if __name__ == "__main__":
    same_url = 'http://www.mzitu.com/page/'
    max_page = find_max_page()
    page, line = get_log(data)
    print('Resuming cache from page {}, entry {}'.format(page, line))
    for n in range(page, int(max_page) + 1):
        ul = same_url + str(n)
        start_html = requests.get(ul, headers=Hostreferer)
        soup = BeautifulSoup(start_html.text, "html.parser")
        all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
        for lines in range(line, len(all_a)):
            a = all_a[lines]
            title = a.get_text()  # extract the link text
            if(title != ''):
                print("About to scrape: " + title)
                # Windows cannot create a directory whose name contains '?'
                if(os.path.exists(path + title.strip().replace('?', ''))):
                    # print('Directory already exists')
                    flag = 1
                else:
                    os.makedirs(path + title.strip().replace('?', ''))
                    flag = 0
                os.chdir(path + title.strip().replace('?', ''))
                href = a['href']
                html = requests.get(href, headers=Hostreferer)
                mess = BeautifulSoup(html.text, "html.parser")
                # The max picture count sits in the span at index 6 of the div with class='pagenavi'
                pic_max = mess.find("div", class_='pagenavi').find_all('span')
                print(pic_max)
                print(len(pic_max))  # check which span holds the max count; the page layout may change
                pic_max = pic_max[6].text  # maximum picture count
                print(pic_max)
                if(flag == 1 and len(os.listdir(path + title.strip().replace('?', ''))) >= int(pic_max)):
                    print('Already fully saved, skipping')
                    continue
                for num in range(1, int(pic_max) + 1):
                    while True:
                        pic = href + '/' + str(num)
                        html = requests.get(pic, headers=Hostreferer)
                        mess = BeautifulSoup(html.text, "html.parser")
                        pic_url = mess.find('img', alt=title)
                        if(pic_url):
                            break
                    # print(pic_url['src'])
                    html = requests.get(pic_url['src'], headers=Picreferer)
                    file_name = pic_url['src'].split(r'/')[-1]
                    f = open(file_name, 'wb')
                    f.write(html.content)
                    f.close()
                put_log(data, n, lines)
                time.sleep(0.5)
        print('Page', n, 'done')
        line = 0
        time.sleep(10)
```