Python配合BeautifulSoup读取网络图片并保存在本地

本例为Python配合BeautifulSoup读取网络图片，并保存在本地。

BeautifulSoup可代替正则表达式，更好地解析HTML文本，获取其中的指定内容，如标签（Tag）、属性（Attribute）等。

# -*- coding: gbk -*-

import urllib
import urllib2
from bs4 import BeautifulSoup
import time
import re
import os,sys
import chardet

def req(url):
    """Fetch *url* with a browser-like User-Agent and return the response body.

    The body is also printed to stdout before it is returned.

    url -- address of the page to download.
    Returns the raw body exactly as read from the response.
    Errors raised by urllib2 while opening or reading are not caught here.
    """
    header = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    request = urllib2.Request(url, headers=header)
    # A Request object (needed to carry the custom header) has to be opened
    # with urllib2.urlopen(); urllib.urlopen() only accepts a URL string.
    response = urllib2.urlopen(request)
    try:
        data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    print(data)
    return data

def reqImg():
    """Download the picture linked on each of pages 1-19 of a fixed gallery.

    Page 1 is the gallery's base URL; pages 2-19 use the '_<n>.html' suffix.
    On every page the first anchor that wraps an <img> tag is located with a
    regular expression, the '.jpg' path is cut out of its src attribute and
    the file is saved into the local results directory under a
    '<timestamp><page>.jpg' name.

    Pages on which no matching link is found are reported and skipped.
    Network errors are not caught.
    """
    url = r'http://www.junmeng.com/tj/22376.html'
    # '\d*' matches the page number inside the link to the next page.
    patnLink = r'<a href=".*/tj/22376_\d*.html"><img src.+</a>'
    savedir = r'C:\Users\hp\Desktop\results'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    for i in range(1, 20):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.junmeng.com/tj/22376_%d.html' % i
        print(tempurl)
        data = urllib.urlopen(tempurl).read()
        if i == 19:
            # The final page is matched with a looser pattern -- presumably
            # because its picture no longer links to a following page.
            patnLink = r'<a href=.*><img src=.*</a>'
        imgLinks = re.findall(patnLink, data)
        if not imgLinks:
            # Nothing matched: skip this page rather than fail on imgLinks[0].
            print('no image link found in %s, skipped' % tempurl)
            continue
        link = imgLinks[0]
        # Start just past 'src=' and its opening quote, end right after '.jpg'.
        imgLink = link[link.find('src=') + 5:link.find('.jpg') + 4]
        print(imgLink)
        fullLink = r'http://www.junmeng.com%s' % imgLink
        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(fullLink, os.path.join(savedir, '%s%d.jpg' % (lct, i)))
        
def reqImg2():
    """Download the picture shown on each of pages 1-9 of ik6.com gallery 40569.

    Page 1 is 'index.html'; pages 2-9 are 'index_<n>.html'. Each page is
    parsed with BeautifulSoup, and the 'lazysrc' attribute of the first <img>
    inside the first <center> element is used as the picture address. Files
    are saved into the local results directory as '<timestamp><page>.jpg'.

    Pages without such an element or attribute are reported and skipped.
    Network errors are not caught.
    """
    url = r'http://www.ik6.com/meinv/40569/index.html'
    savedir = r'C:\Users\hp\Desktop\results'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    for i in range(1, 10):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.ik6.com/meinv/40569/index_%d.html' % i
        print(tempurl)
        data = urllib.urlopen(tempurl).read()
        page = BeautifulSoup(data)
        # Walk down step by step so that a page without the expected markup
        # is skipped instead of raising IndexError.
        centers = page.find_all('center')
        imgs = centers[0].find_all('img') if centers else []
        imgsrc = imgs[0].get('lazysrc') if imgs else None
        if not imgsrc:
            print('no image found in %s, skipped' % tempurl)
            continue
        print(imgsrc)
        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(imgsrc, os.path.join(savedir, '%s%d.jpg' % (lct, i)))

def reqImg3():
    """Download the picture shown on each of pages 1-9 of ik6.com gallery 40572.

    Page 1 is 'index.html'; pages 2-9 are 'index_<n>.html'. Each page is
    parsed with BeautifulSoup, and the 'lazysrc' attribute of the first <img>
    inside the first <center> element is used as the picture address. Files
    are saved into the local results directory as '<timestamp><page>.jpg'.

    Pages without such an element or attribute are reported and skipped.
    Network errors are not caught.
    """
    url = r'http://www.ik6.com/meinv/40572/index.html'
    savedir = r'C:\Users\hp\Desktop\results'
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    for i in range(1, 10):
        if i == 1:
            tempurl = url
        else:
            tempurl = 'http://www.ik6.com/meinv/40572/index_%d.html' % i
        print(tempurl)
        data = urllib.urlopen(tempurl).read()
        page = BeautifulSoup(data)
        # Walk down step by step so that a page without the expected markup
        # is skipped instead of raising IndexError.
        centers = page.find_all('center')
        imgs = centers[0].find_all('img') if centers else []
        imgsrc = imgs[0].get('lazysrc') if imgs else None
        if not imgsrc:
            print('no image found in %s, skipped' % tempurl)
            continue
        print(imgsrc)
        lct = time.strftime('%Y%m%d%H%M%S')
        urllib.urlretrieve(imgsrc, os.path.join(savedir, '%s%d.jpg' % (lct, i)))
        
def reqImg4(url, themecount, imgcount, savedir=r'C:\Users\hp\Desktop\result0128'):
    """Download up to *imgcount* pictures from the gallery that starts at *url*.

    url        -- address of the gallery's first page; following pages are
                  derived by inserting '_<n>' before the '.htm' extension.
    themecount -- number used as the first part of every saved file name.
    imgcount   -- number of pages (one picture each) to try.
    savedir    -- directory the pictures are written to; created if missing.

    Pictures are saved as '<themecount>_<page>.jpg'. The picture address is
    the 'lazysrc' attribute of the first <img> inside the first <center>
    element of each page.

    The function stops at the first page that cannot be processed (empty
    response, missing markup, or any error while downloading) after printing
    the reason; it never raises for those cases. Returns None.
    """
    if not os.path.exists(savedir):
        os.mkdir(savedir)
    # Template for page 2 onwards, e.g. '.../index.html' -> '.../index_%d.html'.
    newUrl = url[:url.rfind('.htm')] + '_%d.html'
    print(newUrl)
    for i in range(1, imgcount + 1):
        if i == 1:
            tempurl = url
        else:
            tempurl = newUrl % i
        print(tempurl)
        try:
            data = urllib.urlopen(tempurl).read()
            if not data:
                print('no response,exit')
                return
            page = BeautifulSoup(data)
            centers = page.find_all('center')
            if len(centers) == 0:
                print('response has no contents,exit')
                return
            imgsrc = centers[0].find_all('img')[0].get('lazysrc')
            if not imgsrc:
                # Without the attribute there is nothing to download.
                print('no image address in %s,exit' % tempurl)
                return
            print(imgsrc)
            urllib.urlretrieve(imgsrc, os.path.join(savedir, '%d_%d.jpg' % (themecount, i)))
        except Exception as e:
            # Still best-effort (stop on the first failure), but report why
            # instead of giving up silently.
            print('failed on %s: %s' % (tempurl, e))
            return

使用：

# Fetch two sample pages; req() prints each response body.
req('http://blog.csdn.net/suwei19870312/article/details/8148427')
req('http://www.taobao.com')
# Download the three hard-coded galleries.
reqImg()
reqImg2()
reqImg3()
# Walk 1000 consecutive gallery ids starting at 11170, trying up to 8 pages of each.
for i in range(1000):
    count = 11170 + i
    url = r'http://www.ik6.com/meinv/%d/index.html' % count
    # reqImg4 needs (url, themecount, imgcount); the gallery id is passed as
    # themecount so the saved file names differ between galleries.
    reqImg4(url, count, 8)

相关阅读:
阿里风控大脑关于大数据应用的探索与实践
 MaxCompute
Lyft 基于 Flink 的大规模准实时数据分析平台（附FFA大会视频）
基于MaxCompute InformationSchema进行血缘关系分析
 阿里经济体大数据平台的建设与思考
 厉害了！阿里云首批通过边缘云标准符合性测试
 边缘计算和“寒武纪”有什么关系？阿里云资深专家刘强如是说
 阿里云荣获最佳智能边缘计算技术创新平台
 阿里云杨敬宇：四层技术构建基于城市场景的边缘计算
 阿里云移动研发平台 EMAS 助力银行业打造测试中台，提升发版效能
原文地址：https://www.cnblogs.com/aaronhoo/p/5152335.html