• python 学习之爬虫练习


    通过学习python,写两个简单的爬虫,没用线程,本地抓取速度还不错,有些瑕疵就是抓的图片有些显示不出来,代码做个笔记记录下:

    # -*- coding:utf-8 -*-
    
    import re
    import urllib.request
    import os
    
    # Base URL of the pages to crawl; the loop at the bottom of this script
    # appends "<page number>.html" to it.
    url = "http://www.58pic.com/yuanchuang/0/day-"
    
    def getHtml(url, encoding='gbk'):
        """Download *url* and return the response body decoded as text.

        Args:
            url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.
            encoding: Codec used to decode the body. Defaults to ``'gbk'``,
                the value this script originally hard-coded.

        Returns:
            The decoded page content as ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid in *encoding*.
        """
        # The context manager closes the connection even when reading or
        # decoding fails; the original left the response open.
        with urllib.request.urlopen(url) as page:
            return page.read().decode(encoding)
    
    def getImg(html, num, base_dir=r"G:collect"):
        """Download every resource referenced by a ``src="..."`` attribute in *html*.

        Files are written to ``<base_dir>/<num>/<k>.jpg`` where *k* counts the
        successfully saved files starting at 0.

        Args:
            html: Page source to scan.
            num: Integer page number; used as the name of the output folder.
            base_dir: Directory under which the per-page folder is created.
                Defaults to the location this script originally hard-coded.

        Returns:
            None.
        """
        # NOTE: the pattern requires a space after the closing quote and matches
        # any src attribute (scripts, iframes, ...), not only images.
        imglist = re.findall(r'src="(.*?)" ', html)

        target_dir = os.path.join(base_dir, "%d" % num)
        # makedirs + exist_ok: create missing parents and allow re-runs, where
        # the original os.mkdir raised.
        os.makedirs(target_dir, exist_ok=True)

        saved = 0
        for imgurl in imglist:
            try:
                with urllib.request.urlopen(imgurl) as response:
                    data = response.read()
            except (urllib.request.URLError, ValueError) as exc:
                # Relative/malformed src values raise ValueError and failed
                # requests raise URLError; skip them instead of aborting the
                # whole page.
                print("skip %s: %s" % (imgurl, exc))
                continue
            # Open the file only after the download succeeded so that a failed
            # request never leaves an empty .jpg behind; "with" closes it.
            with open(os.path.join(target_dir, "%d.jpg" % saved), 'wb') as f:
                f.write(data)
            saved += 1
    
    # Crawl pages 1 through 9: build each page address from the base URL,
    # echo it, fetch the page and hand it to getImg together with the page
    # number. The value returned by getImg is printed as well.
    for i in range(1, 10):
        getUrl = "%s%d.html" % (url, i)
        print(getUrl)
        html = getHtml(getUrl)
        print(getImg(html, i))

    最终的结果如下图:

    在上面初步代码的基础上,优化得到加强版的爬虫代码:当链接状态异常时,捕获抛出的异常后再继续执行程序。代码如下:

    # -*- coding:utf-8 -*-
    
    import re
    import urllib.request
    import os
    
    # Base URL of the pages to crawl; the loop at the bottom of this script
    # appends "<numeric id>.html" to it.
    url = "http://www.58pic.com/psd/"
    
    def getHtml(url, encoding='gbk'):
        """Fetch *url* and return its body as decoded text.

        Args:
            url: Address to request via ``urllib.request.urlopen``.
            encoding: Codec for decoding the response bytes; ``'gbk'`` by
                default, matching the previously hard-coded value.

        Returns:
            The page content as ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body cannot be decoded with *encoding*.
        """
        # Use the response as a context manager so the connection is released
        # on every path; the original never closed it.
        with urllib.request.urlopen(url) as page:
            raw = page.read()
        return raw.decode(encoding)
    
    def getImg(html, num, base_dir=r"F:Py/collect"):
        """Save the ``show-area-pic`` JPEG(s) found in *html*, named by alt text.

        Each match is written to ``<base_dir>/<num>/<alt text>.jpg``. If the
        folder for *num* already exists the page is skipped and nothing is
        downloaded.

        Args:
            html: Page source to scan.
            num: Integer page id; used as the name of the output folder.
            base_dir: Directory under which the per-page folder is created.
                Defaults to the location this script originally hard-coded.

        Returns:
            None.

        Raises:
            urllib.error.URLError: If an image request fails (handled by the
                calling loop in this script).
        """
        # The dot before "jpg" is escaped so that only a real ".jpg" suffix
        # matches; the original unescaped "." accepted any character there.
        pattern = r'src="(.+?\.jpg)" class="show-area-pic" id="show-area-pic" alt="(.*?)"'
        imglist = re.findall(pattern, html)
        print(imglist)

        target_dir = os.path.join(base_dir, "%d" % num)
        # An existing folder means this page was handled before: skip it.
        if os.path.exists(target_dir):
            return
        # makedirs also creates missing parent folders (os.mkdir would raise).
        os.makedirs(target_dir)

        for img_url, title in imglist:
            # alt text may contain characters that are not valid in file names
            # (path separators, ':' ...); replace them so open() cannot fail or
            # write outside target_dir.
            safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
            with urllib.request.urlopen(img_url) as response:
                data = response.read()
            # Write only after a successful download; "with" closes the file.
            with open(os.path.join(target_dir, safe_title + ".jpg"), 'wb') as f:
                f.write(data)
                
    
    # Crawl the pages with ids 22797263 .. 22797665. A page whose request
    # fails is reported and skipped so the remaining ids are still processed.
    for i in range(22797263,22797666):
        getUrl = url+"%d.html" % i
        try:
            html = getHtml(getUrl)
            getImg(html,i)
        except urllib.request.URLError as e:
            # Only HTTPError (a URLError subclass) carries a status code; a
            # plain URLError (DNS failure, refused connection, ...) has just
            # a reason. Reading e.code unconditionally raised AttributeError
            # inside this handler and stopped the crawl.
            if hasattr(e, "code"):
                print(e.code)
            print(e.reason)
  • 相关阅读:
    承接小项目嵌入式linux相关开发、开源飞控相关开发、qt相关开发、无人机地面站相关开发
    转载 ardupilot 学习
    PX4 IO板启动流程
    PX4 FMU启动流程 1.nsh
    PX4 FMU启动流程 2. 一、 nsh_newconsole
    PX4 FMU启动流程 2. 二、 nsh_initscript
    Pixhawk源码快速阅读 02_进程间通信
    PX4 FMU [5] Loop
    Oracle 执行JOB程序自动存储数据
    C# 禁止非数字输入TextBoox
  • 原文地址:https://www.cnblogs.com/bieanju/p/5884781.html
Copyright © 2020-2023  润新知