• Scraping images from Mzitu (妹子图)


    This post is reproduced from https://blog.csdn.net/baidu_35085676/article/details/68958267

    I ran the code from that post myself; the parsing is done mainly with BeautifulSoup. Running it, you may hit TimeoutError: [WinError 10060] ("A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond"). My guess is that this is the site's anti-crawler mechanism kicking in; one thing to try is switching to a different IP address.
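    If you do hit that timeout, one workaround is to give every request an explicit timeout, retry a few times, and optionally route through a proxy to change your IP. Below is a minimal sketch; get_with_retry is a hypothetical helper, not part of the original post:

        import time

        import requests

        def get_with_retry(url, headers, retries=3, timeout=10, proxies=None):
            """GET a URL, retrying on timeouts and connection resets.

            proxies is an optional requests-style proxy dict, e.g.
            {'http': 'http://127.0.0.1:8080'}, for switching IPs.
            (Hypothetical helper, not part of the original script.)
            """
            for attempt in range(1, retries + 1):
                try:
                    return requests.get(url, headers=headers,
                                        timeout=timeout, proxies=proxies)
                except (requests.exceptions.Timeout,
                        requests.exceptions.ConnectionError) as e:
                    print('Attempt {} failed: {}'.format(attempt, e))
                    time.sleep(2 * attempt)  # back off before retrying
            raise RuntimeError('giving up on ' + url)

    Replacing the bare requests.get(...) calls in the script below with this helper keeps a single stalled connection from killing the whole run.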

        import requests
        from bs4 import BeautifulSoup
        import os
        import time

        all_url = 'http://www.mzitu.com'
        # HTTP request headers
        Hostreferer = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
            'Referer': 'http://www.mzitu.com'
        }
        # This Referer defeats the site's hotlink protection on images
        Picreferer = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
            'Referer': 'http://i.meizitu.net'
        }
        # Save directory
        path = 'E:/pythonFile/meititu/mei/'
        # Progress-log file
        data = 'E:/pythonFile/meititu/mei/.data'

        # Read the saved progress: the page and line to resume from
        def get_log(file):
            page = 1
            line = 0
            try:
                with open(file, 'r') as f:
                    l = f.readline()
                    page, line = [int(i) for i in l.split('|')]
            except Exception as e:
                print(e)
                print('Failed to read the progress log, starting from the beginning')
            return page, line

        # Save progress
        def put_log(file, page, line):
            try:
                with open(file, 'w') as f:
                    f.write('{}|{}'.format(page, line))
            except Exception as e:
                print('Failed to save progress: [{}]'.format(e))

        # Find the largest page number on the front page
        def find_max_page():
            start_html = requests.get(all_url, headers=Hostreferer)
            soup = BeautifulSoup(start_html.text, "html.parser")
            page = soup.find_all('a', class_='page-numbers')
            max_page = int(page[-2].text)
            return max_page

        if __name__ == "__main__":
            same_url = 'http://www.mzitu.com/page/'
            max_page = find_max_page()
            page, line = get_log(data)
            print('Resuming from page {}, line {}'.format(page, line))
            for n in range(page, max_page + 1):
                ul = same_url + str(n)
                start_html = requests.get(ul, headers=Hostreferer)
                soup = BeautifulSoup(start_html.text, "html.parser")
                all_a = soup.find('div', class_='postlist').find_all('a', target='_blank')
                for lines in range(line, len(all_a)):
                    a = all_a[lines]
                    title = a.get_text()  # extract the album title
                    if title != '':
                        print('About to scrape: ' + title)
                        # Windows cannot create directories containing '?'
                        dir_name = path + title.strip().replace('?', '')
                        if os.path.exists(dir_name):
                            # print('directory already exists')
                            flag = 1
                        else:
                            os.makedirs(dir_name)
                            flag = 0
                        os.chdir(dir_name)
                        href = a['href']
                        html = requests.get(href, headers=Hostreferer)
                        mess = BeautifulSoup(html.text, "html.parser")
                        # The max page number sits in the 7th span inside the div with class 'pagenavi'
                        pic_max = mess.find('div', class_='pagenavi').find_all('span')
                        print(pic_max)
                        print(len(pic_max))  # check which span holds the max page; the layout may change
                        pic_max = pic_max[6].text  # max page number
                        print(pic_max)
                        if flag == 1 and len(os.listdir(dir_name)) >= int(pic_max):
                            print('Already fully saved, skipping')
                            continue
                        for num in range(1, int(pic_max) + 1):
                            # Retry until the page actually contains the image tag;
                            # anti-crawler responses may be missing it
                            while True:
                                pic = href + '/' + str(num)
                                html = requests.get(pic, headers=Hostreferer)
                                mess = BeautifulSoup(html.text, "html.parser")
                                pic_url = mess.find('img', alt=title)
                                if pic_url:
                                    break
                            # print(pic_url['src'])
                            html = requests.get(pic_url['src'], headers=Picreferer)
                            file_name = pic_url['src'].split('/')[-1]
                            with open(file_name, 'wb') as f:
                                f.write(html.content)
                        put_log(data, n, lines)
                        time.sleep(0.5)
                print('Page', n, 'done')
                line = 0
                time.sleep(10)