• python3 爬煎蛋ooxx妹子图


    import re
    import urllib.request
    import random
    import os
    import http.server
    import http.client
    from urllib.error import URLError, HTTPError
    import urllib.parse
    proxy = []  # shared pool of "ip:port" proxy strings, filled by get_proxy()
    
    
    def change_proxy():      # install a urllib opener that routes HTTP through a random proxy
        """Pick a random entry from the module-level *proxy* pool and make it
        the process-wide HTTP proxy via urllib.request.install_opener()."""
        chosen = random.choice(proxy)
        handler = urllib.request.ProxyHandler({"http": chosen})
        opener = urllib.request.build_opener(handler)
        opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36')]
        urllib.request.install_opener(opener)
        print("代理IP: %s" % chosen)
    
    def url_open(url):     # fetch a URL, rotating to a new proxy on failure (max 5 tries)
        """Fetch *url* and return the raw response body as bytes.

        On any network/HTTP error a fresh proxy is installed via
        change_proxy() and the request is retried, up to 5 attempts.

        Raises:
            OSError / http.client.HTTPException: re-raised after the
            5th consecutive failure.
        """
        count = 0
        while True:
            count += 1
            try:
                response = urllib.request.urlopen(url)
                return response.read()
            # urllib.error.URLError subclasses OSError, and BadStatusLine /
            # IncompleteRead subclass http.client.HTTPException, so these two
            # cover all three handlers the original listed separately.
            except (OSError, http.client.HTTPException) as e:
                print("链接出问题了,智能切换新的代理IP\n出错的问题是:" + str(e))
                # BUG FIX: original compared the int counter to the string "5",
                # so the retry cap never triggered and the loop could spin forever.
                if count >= 5:
                    print("已经失败了5次,程序退出,重新执行")
                    raise
                change_proxy()
    
    def get_pagenum(url):    # read the current page number off the jandan ooxx index
        """Return the current comment-page number as a digit string.

        The number is published as e.g.
        ``<span class="current-comment-page">[2305]</span>``.
        """
        html = url_open(url).decode("utf-8")
        # BUG FIX: the original pattern lost its backslashes in transcription
        # (r'<spansclass=...[d{4}]') and could never match.  A capture group
        # also removes the need for a second regex pass; \d+ instead of d{4}
        # keeps working if the page count grows past four digits.
        num_re = re.compile(r'<span class="current-comment-page">\[(\d+)\]</span>')
        match = num_re.search(html)
        return match.group(1)
    
    def get_imgurl(url):    # collect the image URLs on one jandan page
        """Return a list of protocol-relative image URLs (``//wx1.../x.jpg``)
        scraped from the page at *url*."""
        html = url_open(url).decode("utf-8")
        # BUG FIX: the original used two passes with unescaped dots
        # (r'<img src="//ww.*.jpg' then r'//ww.+.jpg'); a single pattern with
        # a capture group and a properly escaped extension extracts the URL
        # directly and non-greedily.
        jpg_re = re.compile(r'<img src="(//ww[^"]*?\.jpg)"')
        return jpg_re.findall(html)
    
    def save_img(img):   # download each image URL into the current directory
        """Download every protocol-relative URL in *img*, naming each file
        after the last path component of its URL."""
        for index, each in enumerate(img, start=1):
            filename = each.split('/')[-1]
            with open(filename, 'wb') as fh:
                fh.write(url_open("http:%s" % each))
                print("下载本页的第%s张图片,名称为%s" % (index, filename))
    
    
    def get_proxy():     # scrape proxy IPs from xicidaili.com into the proxy pool
        """Scrape the xicidaili.com front page and append every HTTP
        (not HTTPS) proxy to the module-level *proxy* list as "ip:port".

        Returns:
            The shared *proxy* list.
        """
        head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
        req = urllib.request.Request(url="http://www.xicidaili.com",headers=head)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        # BUG FIX: the original pattern lost every backslash in transcription
        # (r'<trsclass=', bare 's+'), so it never matched a row.  Restored
        # escapes, plus capture groups for the three cells actually used
        # (ip, port, protocol) — this replaces the original's triple
        # split()/split(">")/split("<") juggling per row.
        row_re = re.compile(r'''<tr\sclass=.+>\s+
                                <td\s.+</td>\s+
                                <td>(.+)</td>\s+        # ip
                                <td>(.+)</td>\s+        # port
                                <td>.+</td>\s+
                                <td\s.+?</td>\s+
                                <td>(.+)</td>\s+        # protocol: HTTP / HTTPS
                                <td>.+</td>\s+
                                <td>.+</td>\s+
                                </tr>
                                ''',re.VERBOSE)
        for ip, port, protocol in row_re.findall(html):
            if protocol == "HTTP":
                proxy.append(ip + ":" + port)
        return proxy
    
    def download(dir,url):
        """Download images from the 10 pages preceding the current one.

        Args:
            dir: directory to create (if needed) and chdir into; images are
                 saved there by save_img().
            url: base URL of the ooxx index, e.g. "http://jandan.net/ooxx/".
        """
        # Both branches of the original if/else called os.chdir — merged.
        if not os.path.isdir(dir):
            os.mkdir(dir)
        os.chdir(dir)
        # (removed the no-op `url = url` and the unused `saveimg` binding)
        page_num = int(get_pagenum(url))
        for _ in range(10):
            page_num -= 1
            pageurl = url + "page-" + str(page_num) + "#comments"
            imgurl = get_imgurl(pageurl)
            print("下载第%s页图片" % page_num)
            save_img(imgurl)
    
    if __name__ == "__main__":
        get_proxy()
        change_proxy()
        dir = "ooxx"
        url = "http://jandan.net/ooxx/"
        download(dir,url)
  • 相关阅读:
    Quartz_理解2
    Quartz_理解1
    Java监控常用工具 .
    DB2函数大全
    cvs上传复制项目
    PLSQL DEVELOPER 使用的一些技巧【转】
    webservice_模拟报文测试
    Myeclipse插件将wsdl生成java客户端代码
    利用 UltraEdit 重新排版 XML 结构数据
    uoj164. 【清华集训2015】V 统计
  • 原文地址:https://www.cnblogs.com/jonnter/p/7725219.html
Copyright © 2020-2023  润新知