• python实现简单爬虫抓取图片


    最近在学习 Python。众所周知,Python 在网络爬虫方面有着广泛的应用。下面是一个利用 Python 抓取网络图片的简单程序,可以批量下载某个网站最新更新的图片;代码中还给出了使用代理 IP 的示例写法(默认未启用)。

    import urllib.request
    import os
    import random
    def url_open(url, proxies=None):
        """Fetch ``url`` and return the raw response body as bytes.

        Args:
            url: Address to request.
            proxies: Optional sequence of ``'host:port'`` HTTP proxy addresses.
                When given and non-empty, one entry is picked at random for
                this request so successive requests can leave from different
                IPs. When omitted or empty, urllib's default opener is used.

        Returns:
            The undecoded response body.

        Raises:
            urllib.error.URLError: Propagated from urllib when the request fails.
        """
        req = urllib.request.Request(url)
        # Send a browser-like User-Agent so the request looks like a regular visitor.
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0')

        if proxies:
            # Build a per-call opener instead of install_opener(), so choosing a
            # proxy here does not change the process-wide default opener.
            proxy_support = urllib.request.ProxyHandler({'http': random.choice(list(proxies))})
            fetch = urllib.request.build_opener(proxy_support).open
        else:
            fetch = urllib.request.urlopen

        # The context manager closes the connection even if read() raises.
        with fetch(req) as response:
            return response.read()
    def get_page(url):
        """Return the current comment-page number found on ``url``, as a string.

        The page is downloaded with ``url_open``, decoded as UTF-8 and searched
        for the ``current-comment-page`` marker. The returned text starts 23
        characters after the beginning of the marker (the 20-character marker
        plus 3 more characters, presumably the ``">[`` that follows it in the
        markup) and ends just before the next ``]``.

        Args:
            url: Address of the page to inspect.

        Returns:
            The extracted page-number text.

        Raises:
            ValueError: If the marker or the closing ``]`` is not present.
        """
        html = url_open(url).decode('utf-8')

        marker_pos = html.find('current-comment-page')
        if marker_pos == -1:
            # Without this check find() yields -1 and an unrelated slice of the
            # page would be returned silently.
            raise ValueError('current-comment-page marker not found in ' + url)

        start = marker_pos + 23
        end = html.find(']', start)
        if end == -1:
            raise ValueError('closing "]" after current-comment-page not found in ' + url)

        return html[start:end]
    
    
    def find_imgs(url):
        """Collect the ``.jpg`` image addresses referenced on the page at ``url``.

        The page is fetched with ``url_open`` and decoded as UTF-8. For every
        ``img src=`` occurrence, a ``.jpg`` suffix is looked for within the
        next 140 characters; when found, the text from 9 characters after the
        start of ``img src=`` up to and including ``.jpg`` is taken as the
        address. Addresses whose first character is not ``h`` get ``http:``
        prepended. Each address is printed before the list is returned.

        Args:
            url: Address of the page to scan.

        Returns:
            List of image address strings in page order.
        """
        tag = 'img src='
        page = url_open(url).decode('utf-8')
        img_addrs = []

        tag_pos = page.find(tag)
        while tag_pos >= 0:
            addr_start = tag_pos + 9
            ext_pos = page.find('.jpg', tag_pos, tag_pos + 140)
            if ext_pos == -1:
                # No .jpg close enough to this tag: resume the search right after it.
                resume_from = addr_start
            else:
                prefix = '' if page[addr_start] == 'h' else 'http:'
                img_addrs.append(prefix + page[addr_start:ext_pos + 4])
                resume_from = ext_pos

            tag_pos = page.find(tag, resume_from)

        for addr in img_addrs:
            print(addr+'我的打印')
        return img_addrs
    
    def save_imgs(folder, img_addrs):
        """Download every address in ``img_addrs`` into the current directory.

        Each file is named after the last ``/``-separated component of its
        address and is written in binary mode.

        Args:
            folder: Not used by this function; kept so existing callers keep
                working. Files land in the current working directory, which
                ``download_mm`` switches to the target folder before calling.
            img_addrs: Iterable of image address strings.
        """
        for each in img_addrs:
            filename = each.split('/')[-1]
            # Download before opening the file: a failed request then neither
            # leaves a zero-byte file behind nor truncates an existing one.
            img = url_open(each)
            with open(filename, 'wb') as f:
                f.write(img)
    
    
    
    def download_mm(folder='ooxx', pages=10):
        """Download the images from the pages preceding the current jandan.net/ooxx page.

        The target folder is created if needed and becomes the working
        directory (it is not switched back afterwards). The current page
        number is read with ``get_page``; the function then walks backwards,
        starting one page before the current one, and saves every image that
        ``find_imgs`` reports for each page.

        Args:
            folder: Directory to store the images in.
            pages: Maximum number of pages to walk back through.
        """
        # exist_ok lets the script be re-run to pick up newly posted images;
        # plain os.mkdir() raised FileExistsError on every run after the first.
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)

        url = "http://jandan.net/ooxx/"
        page_num = int(get_page(url))

        for _ in range(pages):
            page_num -= 1
            if page_num < 1:
                # Assumes page numbering starts at 1, so there is nothing
                # older to fetch once the counter drops below that.
                break
            page_url = url + 'page-' + str(page_num) + '#comments'
            img_addrs = find_imgs(page_url)
            save_imgs(folder, img_addrs)
    
    # Script entry point: run the downloader with its default folder and page count.
    if __name__ == "__main__":
        download_mm()

    完成

    运行结果

    结果

  • 相关阅读:
    POJ 2209
    POJ 2196
    POJ 2215
    POJ 2192
    POJ 2195
    POJ 2181
    POJ 2182
    POJ 2159
    POJ 2153
    字符设备驱动 —— 字符设备驱动框架
  • 原文地址:https://www.cnblogs.com/jjx2013/p/6223742.html
Copyright © 2020-2023  润新知