• Crawling mzitu through a proxy pool


    The project is split into three files:

    -config holds shared settings and configuration

    -proxy_pool scrapes and validates free proxies

    -get_mzitu crawls the pages and downloads the images

    A Redis server must be installed and running before use: https://redis.io/download
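
    Once the server is up, you can sanity-check the connection from Python (a minimal sketch; it assumes the redis-py package is installed and the server listens on the default port with no password):

    from redis import StrictRedis

    conn = StrictRedis(host='localhost', port=6379, db=0, password='')
    print(conn.ping())  # True when the server is reachable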

    The config file

    # Needed for the CONN object below
    from redis import StrictRedis

    # Pool of User-Agent strings to rotate through
    USER_AGENTS = [
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0"
    ]
    # Referer entries (sample pages on the target site)
    REFERER = [
        'https://www.mzitu.com/',
        'https://www.mzitu.com/215027',
        'https://www.mzitu.com/201236',
    ]
    # Shared connection to the local Redis server (db 0, no password)
    CONN = StrictRedis(host='localhost',port=6379,db=0,password='')
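
    Note that the REFERER list is never actually read by the crawler below: get_mzitu hardcodes 'https://www.mzitu.com/' as its Referer. If you wanted to rotate it the same way USER_AGENTS is rotated, a minimal sketch would be:

    import random
    from config import USER_AGENTS, REFERER

    def random_referer_headers():
        # Pick both the User-Agent and the Referer at random
        return {
            'User-Agent': random.choice(USER_AGENTS),
            'Referer': random.choice(REFERER),
        }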
    

    The proxy_pool file

    import re
    import requests
    import random
    import traceback
    from config import *
    from redis import StrictRedis
    from requests import ConnectionError
    
    def headers():
        # Build a request header with a randomly chosen User-Agent
        header = {
            'User-Agent': random.choice(USER_AGENTS),
        }
        return header
    
    def get_page(url):
        print('pool: fetching page')
        try:
            header = headers()
            res = requests.get(url, headers=header)
            if res.status_code == 200:
                return res.text
            else:
                # Retry on a bad status; note this recursion is unbounded
                return get_page(url)
        except ConnectionError:
            return get_page(url)
    
    def get_proxy_list():
        print('Scraping a proxy list page')
        base_url = 'https://www.xicidaili.com/wt/'
        page_n = random.randint(100, 2700)
        url = base_url + str(page_n)
        print(url)
        html = get_page(url)
        try:
            # First pass: grab everything between the country-flag cell and the row end
            pattern = r'alt="Cn" /></td>([\d\D]*?)</tr>'
            root = re.findall(pattern, html)
            list_ip = []
            # Second pass: pull the individual <td> fields out of each row
            for i in range(len(root)):
                key = re.findall(r'<td>([\d\D]*?)</td>', root[i])
                # key[0] is the IP, key[1] the port; key[3] would be the scheme
                list_ip.append(key[0] + ':' + key[1])
            print(list_ip)
            return list_ip
        except Exception:
            print('Failed to parse the proxy addresses')
            traceback.print_exc()

    def check_proxy():
        print('Validating proxies')
        list_ip = get_proxy_list()
        # Test each proxy against the target site itself
        url_test = 'https://www.mzitu.com'
        for i in list_ip:
            print(i)
            proxy_dic = {'http': i}
            try:
                res = requests.get(url_test, proxies=proxy_dic)
                if res.status_code == 200:
                    save_2_redis(i)
            except ConnectionError:
                pass
    
    def save_2_redis(proxy):
        print('Saving %s' % proxy)
        conn = StrictRedis(host='localhost', port=6379, db=0, password='')
        # Key on the IP so the same host is only stored once
        conn.set(proxy.split(':')[0], proxy)
    
    def get_proxy():
        print('pool: fetching a proxy')
        # Replenish the pool when it runs low, then pop a random proxy
        if len(CONN.keys('*')) <= 3:
            check_proxy()
            return get_proxy()
        else:
            key = CONN.randomkey()
            r = CONN.get(key)
            CONN.delete(key)
            print(str(r, encoding='utf-8'))
            return str(r, encoding='utf-8')
    
    def main():
        get_proxy()
    
    if __name__ == '__main__':
        main()
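
    To see what the two regular expressions in get_proxy_list extract, here is a self-contained test against a hypothetical table row shaped like xicidaili's markup (the sample HTML is an assumption reconstructed from the patterns themselves, not captured from the live site):

    import re

    # Hypothetical row modelled on the patterns used above
    sample = ('<tr><td><img src="flag.png" alt="Cn" /></td>'
              '<td>123.45.67.89</td><td>8080</td>'
              '<td>anonymous</td><td>HTTP</td></tr>')

    rows = re.findall(r'alt="Cn" /></td>([\d\D]*?)</tr>', sample)
    for row in rows:
        fields = re.findall(r'<td>([\d\D]*?)</td>', row)
        print(fields)                       # ['123.45.67.89', '8080', 'anonymous', 'HTTP']
        print(fields[0] + ':' + fields[1])  # 123.45.67.89:8080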
    

    The get_mzitu file

    import os
    import requests
    import random
    from config import *
    from proxy_pool import get_proxy
    from requests import ConnectionError
    from bs4 import BeautifulSoup
    
    def headers():
        # Same rotating User-Agent header as in proxy_pool
        header = {
            'User-Agent': random.choice(USER_AGENTS),
        }
        return header
    
    def referer_headers():
        # mzitu checks the Referer on image requests (anti-hotlinking), so send one
        referer_header = {
            'User-Agent': random.choice(USER_AGENTS),
            'Referer': 'https://www.mzitu.com/',
        }
        return referer_header
    
    def get_proxy_page(url, proxy_dic=None):
        # Reuse the supplied proxy if there is one, otherwise pull a fresh one from the pool
        if proxy_dic:
            header = headers()
            res = requests.get(url, headers=header, proxies=proxy_dic)
            return res.text, proxy_dic
        else:
            try:
                header = headers()
                proxy_dic = create_proxy_dic()
                res = requests.get(url, headers=header, proxies=proxy_dic)
                return res.text, proxy_dic
            except ConnectionError:
                return get_proxy_page(url)

    def create_proxy_dic():
        # Wrap a pool proxy in the dict format that requests expects
        proxy = 'http://' + str(get_proxy())
        proxy_dic = {
            'http': proxy,
        }
        return proxy_dic
    
    def get_all_girls(url):
        print('Fetching the full gallery index')
        html, proxy_dic = get_proxy_page(url, None)
        # Build the soup for the index page
        soup = BeautifulSoup(html, 'html.parser')
        # Collect every 'a' tag under class_='archives'
        total_info = soup.find(class_='archives').find_all('a')
        # Walk the 'a' tags and read each 'href' value
        all_list = []
        for girls_info in total_info:
            link_url = girls_info['href']
            all_list.append(link_url)
        print(all_list, proxy_dic)
        return all_list, proxy_dic

    def get_girl_all_page(all_list, proxy_dic):
        for url in all_list:
            html, proxy_dic = get_proxy_page(url, proxy_dic)
            soup = BeautifulSoup(html, 'lxml')
            # The 'span' inside the second-to-last 'a' tag of class_='pagenavi' holds the page count
            max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
            title = soup.find(class_='main-title').string
            # Visit every detail page and collect the 'src' of its 'img' tag
            header = referer_headers()
            pic_url_list = []
            for i in range(int(max_page)):
                page_url = url + "/%s" % (i + 1)
                pic_url, proxy_dic = append_img_url(page_url, header, proxy_dic)
                pic_url_list.append(pic_url)
            download_Pic(title, pic_url_list, proxy_dic)

    def append_img_url(page_url, header, proxy_dic=None):
        try:
            res = requests.get(page_url, headers=header, proxies=proxy_dic)
            if res.status_code == 200:
                pic_url = get_img_url(res)
                print(pic_url, proxy_dic)
                return pic_url, proxy_dic
            else:
                # The current proxy was refused; swap in a fresh one and retry
                proxy_dic = create_proxy_dic()
                res = requests.get(page_url, headers=header, proxies=proxy_dic)
                if res.status_code == 200:
                    pic_url = get_img_url(res)
                    return pic_url, proxy_dic
                else:
                    return append_img_url(page_url, header, proxy_dic=None)
        except ConnectionError:
            return append_img_url(page_url, header, proxy_dic=None)

    def get_img_url(res):
        # The first 'img' tag on the detail page is the full-size picture
        html = res.text
        soup = BeautifulSoup(html, 'lxml')
        pic_url = soup.find('img').get('src')
        return pic_url

    def download_Pic(title, pic_url_list, proxy_dic=None):
        print('download_pic')
        # Create a folder named after the gallery title
        if not os.path.exists(title):
            os.mkdir(title)
        header = referer_headers()
        # Running number used in the file names
        j = 1
        # Download every picture in the list
        for item in pic_url_list:
            # File path: <title>/<number>.jpg
            filename = '%s/%s.jpg' % (title, str(j))
            print('downloading....%s : NO.%s' % (title, str(j)))
            with open(filename, 'wb') as f:
                try:
                    img_res = requests.get(item, headers=header, proxies=proxy_dic)
                    if img_res.status_code == 200:
                        f.write(img_res.content)
                    else:
                        # Bad proxy: fetch a fresh one and retry once
                        proxy_dic = create_proxy_dic()
                        img_res = requests.get(item, headers=header, proxies=proxy_dic)
                        f.write(img_res.content)
                except ConnectionError:
                    proxy_dic = create_proxy_dic()
                    img_res = requests.get(item, headers=header, proxies=proxy_dic)
                    f.write(img_res.content)
            j += 1
    
    if __name__ == '__main__':
        url = 'https://www.mzitu.com/all'
        all_list, proxy_dic = get_all_girls(url)
        get_girl_all_page(all_list, proxy_dic)
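
    A typical run order is to seed the pool first and then start the crawler. The loop below keeps validating proxies until Redis holds a handful of them (a usage sketch; the threshold of 10 is an arbitrary choice, not something the original code enforces):

    from config import CONN
    from proxy_pool import check_proxy

    # Keep scraping and validating until the pool has a working margin
    while len(CONN.keys('*')) < 10:
        check_proxy()

    # then run:  python get_mzitu.py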
    

      
