The project is split into three files:
-config: shared constants and settings
-proxy_pool: scrapes and validates proxies
-get_mzitu: crawls the pages and downloads the images
A Redis server must be installed before running anything: https://redis.io/download
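Besides the Redis server itself, the code imports the redis, requests, beautifulsoup4, and lxml packages (all installable with pip). A minimal sketch for confirming the server is reachable before a run, using the same connection parameters the config file assumes (adjust host/port/password if your setup differs):

```python
from redis import StrictRedis

# Same parameters as CONN in config; change them if your Redis differs
conn = StrictRedis(host='localhost', port=6379, db=0, password='')
print(conn.ping())  # True means the proxy pool has somewhere to store proxies
```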
The config file
```python
# config.py

from redis import StrictRedis

# User-Agent entries; one is chosen at random per request
USER_AGENTS = [
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0",
]

# Referer entries accepted by the target site
REFERER = [
    'https://www.mzitu.com/',
    'https://www.mzitu.com/215027',
    'https://www.mzitu.com/201236',
]

# Redis connection shared by the other modules
CONN = StrictRedis(host='localhost', port=6379, db=0, password='')
```
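For illustration, this is roughly how the other two files draw on these lists to build per-request headers. Note that the downloader below actually hard-codes the Referer to the site root, so sampling REFERER at random, as shown here, is just one way the list could be used:

```python
import random
from config import USER_AGENTS, REFERER

# Randomizing headers per request makes the traffic look less uniform
header = {
    'User-Agent': random.choice(USER_AGENTS),
    'Referer': random.choice(REFERER),
}
print(header)
```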
The proxy_pool file
```python
# proxy_pool.py

import re
import random
import traceback

import requests
from requests import ConnectionError

from config import *


def headers():
    # Build request headers with a random User-Agent
    header = {
        'User-Agent': random.choice(USER_AGENTS),
    }
    return header


def get_page(url):
    print('pool is fetching the page')
    try:
        header = headers()
        res = requests.get(url, headers=header)
        if res.status_code == 200:
            return res.text
        else:
            return get_page(url)
    except ConnectionError:
        return get_page(url)


def get_proxy_list():
    print('scraping the proxy list')
    base_url = 'https://www.xicidaili.com/wt/'
    page_n = random.randint(100, 2700)
    url = base_url + str(page_n)
    print(url)
    html = get_page(url)
    try:
        # First pass: grab the table row for each Chinese proxy
        pattern = 'alt="Cn" /></td>([\d\D]*?)</tr>'
        root = re.findall(pattern, html)
        list_ip = []
        # Second pass: pull the IP and port cells out of each row
        for i in range(len(root)):
            key = re.findall('<td>([\d\D]*?)</td>', root[i])
            list_ip.append(key[0] + ':' + key[1])
        print(list_ip)
        return list_ip
    except Exception:
        print('failed to parse the proxy list')
        traceback.print_exc()


def check_proxy():
    print('checking proxies')
    list_ip = get_proxy_list()
    test_url = 'https://www.mzitu.com'
    for i in list_ip:
        print(i)
        proxy_dic = {'http': i}
        try:
            res = requests.get(test_url, proxies=proxy_dic)
            if res.status_code == 200:
                save_2_redis(i)
        except ConnectionError:
            pass


def save_2_redis(proxy):
    print('saving %s' % proxy)
    # Key on the IP so duplicates from later runs overwrite each other
    CONN.set(proxy.split(':')[0], proxy)


def get_proxy():
    print('pool handing out a proxy')
    # Refill the pool when it runs low, then pop a random proxy
    if len(CONN.keys('*')) <= 3:
        check_proxy()
    key = CONN.randomkey()
    r = CONN.get(key)
    CONN.delete(key)
    print(str(r, encoding='utf-8'))
    return str(r, encoding='utf-8')


def main():
    get_proxy()


if __name__ == '__main__':
    main()
```
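For reference, this is roughly how a consumer uses the pool. The timeout is an addition here, not part of the original flow: without it, a dead proxy blocks the whole crawl, so treat it as a suggested hardening:

```python
import requests
from proxy_pool import get_proxy

proxy = get_proxy()                       # e.g. '123.56.78.90:8080'
proxies = {'http': 'http://' + proxy}

# timeout is an addition: a dead proxy would otherwise hang indefinitely
res = requests.get('https://www.mzitu.com', proxies=proxies, timeout=10)
print(res.status_code)
```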
The get_mzitu file
```python
# get_mzitu.py

import os
import random

import requests
from requests import ConnectionError
from bs4 import BeautifulSoup

from config import *
from proxy_pool import get_proxy


def headers():
    # Plain headers with a random User-Agent
    header = {
        'User-Agent': random.choice(USER_AGENTS),
    }
    return header


def referer_headers():
    # Image requests are rejected without a Referer from the site itself
    referer_header = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': 'https://www.mzitu.com/',
    }
    return referer_header


def get_proxy_page(url, proxy_dic=None):
    # Fetch a page through the current proxy, or grab a fresh proxy first
    if proxy_dic:
        header = headers()
        res = requests.get(url, headers=header, proxies=proxy_dic)
        return res.text, proxy_dic
    else:
        try:
            header = headers()
            proxy_dic = create_proxy_dic()
            res = requests.get(url, headers=header, proxies=proxy_dic)
            return res.text, proxy_dic
        except ConnectionError:
            return get_proxy_page(url)


def create_proxy_dic():
    # Wrap a proxy from the pool in the dict format requests expects
    proxy = 'http://' + str(get_proxy())
    proxy_dic = {
        'http': proxy,
    }
    return proxy_dic


def get_all_girls(url):
    print('fetching the url of every album')
    html, proxy_dic = get_proxy_page(url, None)
    # Build the soup for the index page
    soup = BeautifulSoup(html, 'html.parser')
    # Every 'a' tag under class 'archives' links to one album
    total_info = soup.find(class_='archives').find_all('a')
    # Walk the 'a' tags and collect each 'href'
    all_list = []
    for girls_info in total_info:
        link_url = girls_info['href']
        all_list.append(link_url)
    print(all_list, proxy_dic)
    return all_list, proxy_dic


def get_girl_all_page(all_list, proxy_dic):
    for url in all_list:
        html, proxy_dic = get_proxy_page(url, proxy_dic)
        soup = BeautifulSoup(html, 'lxml')
        # The 'span' inside the second-to-last 'a' of class 'pagenavi' holds the page count
        max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
        title = soup.find(class_='main-title').string
        # Visit every detail page and collect the 'src' of its 'img' tag
        header = referer_headers()
        pic_url_list = []
        for i in range(int(max_page)):
            page_url = url + "/%s" % (i + 1)
            pic_url, proxy_dic = append_img_url(page_url, header, proxy_dic)
            pic_url_list.append(pic_url)
        download_pic(title, pic_url_list, proxy_dic)


def append_img_url(page_url, header, proxy_dic=None):
    try:
        res = requests.get(page_url, headers=header, proxies=proxy_dic)
        if res.status_code == 200:
            pic_url = get_img_url(res)
            print(pic_url, proxy_dic)
            return pic_url, proxy_dic
        else:
            # The proxy went stale; swap in a fresh one and retry
            proxy_dic = create_proxy_dic()
            res = requests.get(page_url, headers=header, proxies=proxy_dic)
            if res.status_code == 200:
                pic_url = get_img_url(res)
                return pic_url, proxy_dic
            else:
                return append_img_url(page_url, header, proxy_dic=None)
    except ConnectionError:
        return append_img_url(page_url, header, proxy_dic=None)


def get_img_url(res):
    html = res.text
    soup = BeautifulSoup(html, 'lxml')
    pic_url = soup.find('img').get('src')
    return pic_url


def download_pic(title, pic_url_list, proxy_dic=None):
    print('download_pic')
    # One folder per album, named after its title
    os.makedirs(title, exist_ok=True)
    header = referer_headers()
    # Sequence number used for file names
    j = 1
    # Download each image
    for item in pic_url_list:
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            try:
                img_res = requests.get(item, headers=header, proxies=proxy_dic)
                if img_res.status_code == 200:
                    f.write(img_res.content)
                else:
                    proxy_dic = create_proxy_dic()
                    img_res = requests.get(item, headers=header, proxies=proxy_dic)
                    f.write(img_res.content)
            except ConnectionError:
                proxy_dic = create_proxy_dic()
                img_res = requests.get(item, headers=header, proxies=proxy_dic)
                f.write(img_res.content)
        j += 1


if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    all_list, proxy_dic = get_all_girls(url)
    get_girl_all_page(all_list, proxy_dic)
```
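Running the script as-is crawls every album on the site. For a first test it may be easier to slice the list, along these lines (the [:3] limit is purely illustrative):

```python
if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    all_list, proxy_dic = get_all_girls(url)
    # Crawl only the first three albums while testing; drop the slice for a full run
    get_girl_all_page(all_list[:3], proxy_dic)
```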