• Python 爬虫2


    import urllib.request
    import os
    import re
    import time
    

    设置请求头(HTTP request headers)

    # Request headers shared by every fetch: spoof a desktop Chrome UA so
    # the site serves normal pages instead of blocking the script.
    head = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/50.0.2661.75 Safari/537.36'),
    }
    

    主要是用来查看测试是否可以访问网页,并取回源码

    def url_open(url):
        """Fetch *url* with the spoofed browser headers and return the raw bytes.

        Fix: the original never closed the HTTP response; the ``with`` block
        releases the connection deterministically instead of leaking it until
        garbage collection.
        """
        req = urllib.request.Request(url, headers=head)
        with urllib.request.urlopen(req) as response:
            return response.read()
    

    每一个车模有一个ID,每个ID在网站的中的图片数量不同,此函数主要是取回每个ID下有多少张图片

    def get_page_num(url):
        """Return the page-count digits for one model-ID page.

        The site embeds the total page count right after the marker
        ``<div class="content-page"><span class="page-ch">``; this fetches
        the page (GBK-encoded), takes the two characters after that marker
        and extracts the digits from them.

        Returns a list of digit strings (``re.findall`` result); empty list
        when the marker or digits are absent.
        """
        html = str(url_open(url).decode('gbk'))
        a = html.find(r'<div class="content-page"><span class="page-ch">')
        # The two characters immediately past the 49-char marker hold the count.
        b = html[a + 49:a + 51]
        # BUG FIX: the original pattern was r'd+', which matches the literal
        # letter "d" — digits need the escaped class r'\d+'.
        return re.findall(r'\d+', b)
    

    取回车模的ID号

    
    def find_ID_num(url):
        """Scrape *url* and return the model IDs found in its image links.

        Walks every ``src=`` attribute in the (gb2312-decoded) page, keeps
        only links of the form ``http://img1.mm131.com/pic/<id>/<n>.jpg``,
        and collects the ``<id>`` path component of each.
        """
        html = url_open(url).decode('gb2312')
        ids = []
        pos = html.find('src=')
        while pos != -1:
            end = html.find('.jpg', pos, pos + 255)
            if end == -1:
                # No .jpg within range: resume scanning just past this 'src='.
                end = pos + 5
            else:
                link = html[pos + 5:end + 4]
                if re.match('http://img1.mm131.com/pic/', link) is not None:
                    # Second-to-last path component is the model ID.
                    ids.append(link.split('/')[-2])
            pos = html.find('src=', end)
        return ids
    

    生成图片原地址,并存放在列表pic_addrs[]中

    
    def ID_web(url):
        """Build the full list of direct image URLs for every model at *url*.

        For each model ID found on the listing page, reads that model's page
        count and generates one image address per page, from page 2 up to the
        count (the site numbers its image files starting at 2).
        """
        pic_addrs = []
        for model_id in find_ID_num(url):
            detail_url = url + str(model_id) + '.html'
            page_num = get_page_num(detail_url)
            for page in range(2, int(page_num[0]) + 1):
                pic_addrs.append(
                    'http://img1.mm131.com/pic/' + str(model_id)
                    + '/' + str(page) + '.jpg')
        return pic_addrs
    

    访问图片原地址,生成本地文件并给图片命名

    
    def save_imgs(folder, pic_addrs):
        """Download every URL in *pic_addrs* into the current directory.

        File names are derived from the URL path ('<id>/<n>.jpg' becomes
        '<id>_<n>.jpg').  *folder* is accepted but unused: the caller has
        already chdir'd into it.  Sleeps 5s after each download to throttle
        requests against the server.
        """
        for addr in pic_addrs:
            name = addr.split('com/')[-1].replace('/', '_')
            with open(name, 'wb') as out:
                out.write(url_open(addr))
                time.sleep(5)
    

    主函数

    
    def download_mm(folder="OOXXmm1"):
        """Entry point: download all car-model images into *folder*.

        Fix: the original called ``os.chdir(folder)`` directly, which raises
        FileNotFoundError on a fresh run; the directory is now created first
        (``exist_ok=True`` keeps reruns working).  Working from inside the
        folder makes save_imgs write its files there.
        """
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        url = "http://www.mm131.com/chemo/"
        pic_addrs = ID_web(url)
        save_imgs(folder, pic_addrs)


    if __name__ == '__main__':
        download_mm()
        
    
  • 相关阅读:
    MyStreamRequestHandlerr
    SocketFromServer
    MyQMainWindowDemo
    MyQThread
    Nginx安装与配置
    nginx软件优化
    MySQL优化实施方案
    tomcat优化方向
    Tomcat优化方案
    Nginx和Tomcat优化
  • 原文地址:https://www.cnblogs.com/li-volleyball/p/5616699.html
Copyright © 2020-2023  润新知