• Stage6--Python简单爬虫


    正则表达式简单介绍

    正则表达式(regular expression)描述了一种字符串匹配的模式,可以用来检查一个串是否含有某种子串、将匹配的子串做替换或者从某个串中取出符合某个条件的子串等。

    小写字母 [a-z]
    数字 [0-9] 或 \d

    * 匹配前面的子表达式零次或多次
    + 匹配前面的子表达式一次或多次
    ? 匹配前面的子表达式零次或一次

    一个简单爬虫例子

    import re
    import urllib.request
    
    # Page whose HTML is downloaded and scanned for image links.
    url = "http://mall.csdn.net/coin"
    
    # Directory prefix for downloaded images; saveImg concatenates it directly
    # with the file name, so the trailing slash is required.
    savePath = "G:/QQData/"
    
    def getHtml(url):
        """Download *url* and return the response body decoded as UTF-8.

        Args:
            url: Address passed to ``urllib.request.urlopen``.

        Returns:
            The decoded page content as ``str``.

        Raises:
            urllib.error.URLError: If the resource cannot be opened.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # The context manager closes the connection even when reading or
        # decoding fails; the original left the response open.
        with urllib.request.urlopen(url) as page:
            return page.read().decode('utf-8')
    
    def getImg(html):
        """Return every ``http://img.bss.csdn.net/<digits>.jpg`` URL in *html*.

        Args:
            html: Page source to search.

        Returns:
            A list of the matching URL strings, in order of appearance.
        """
        # Dots are escaped so they match a literal "." only; unescaped, each
        # one would accept any character (e.g. "imgXbss" or "123Xjpg").
        imgre = re.compile(r'http://img\.bss\.csdn\.net/[0-9]+\.jpg')
        return imgre.findall(html)
    
    def saveImg(url):
        """Download the image at *url* and write it below ``savePath``.

        The file name comes from ``getFileName(url)``.

        Args:
            url: Address of the image to download.

        Raises:
            ValueError: If no file name can be derived from *url*.
            urllib.error.URLError: If the image cannot be downloaded.
            OSError: If the target file cannot be written.
        """
        filename = getFileName(url)
        if filename is None:
            # Fail before downloading, with a clearer error than the
            # "str + None" TypeError the concatenation would raise.
            raise ValueError("cannot derive a file name from URL: %r" % (url,))
        # Both the connection and the file are closed even if an error occurs.
        with urllib.request.urlopen(url) as conn, \
                open(savePath + filename, 'wb') as out:
            out.write(conn.read())
    
    def getFileName(url):
        """Extract a ``<name>.jpg`` file name from *url*.

        Args:
            url: Image address to inspect.

        Returns:
            The first substring made of lowercase letters/digits followed by
            ``.jpg``, or ``None`` when the URL contains no such substring.
        """
        # The dot is escaped so that only a real ".jpg" extension matches;
        # unescaped it accepted any character (e.g. "fooXjpg").
        matchObj = re.search(r'[a-z0-9]+\.jpg', url)
        return matchObj.group() if matchObj else None
    
    # Fetch the page, collect the image URLs, then print and download each one.
    html = str(getHtml(url))
    imgurls = getImg(html)

    for imgurl in imgurls:
        print(imgurl)
        saveImg(imgurl)
    

    这个例子很简单,里面的正则也简单到几乎没有难度,功能是爬取CSDN U币商城的图片,很容易看懂。

    可以自定义正则和URL的爬虫

    from tkinter import *
    import re
    import urllib.request
    
    
    
    # Directory prefix for downloaded images; it is concatenated directly with
    # the file name, so the trailing slash is required.
    savePath = "G:/QQData/"
    
    class ControllPanel(Frame):
        """Control panel of the crawler window.

        Builds a URL entry, a regex entry, a "verify regex" button, a "start
        crawling" button and a status bar packed at the bottom of the master
        window.
        """

        default_url = "http://www.xingyongshe.com/man/ddfeijibei"
        default_regix = r'<img src="(.+)&amp;w=228&amp;q=90&amp;type=jpg'
        # Widgets are created by the add_* methods and are None until then.
        # The status label was previously declared as ``__status_label``; name
        # mangling made that a different attribute from the
        # ``self.status_label`` every method reads, so the None checks could
        # raise AttributeError instead of working.
        status_label = None
        url_input = None
        regix_input = None

        def __init__(self, master):
            """Create the panel inside *master* and build all child widgets."""
            Frame.__init__(self, master, width=360, height=260)
            self.master = master
            self.pack()
            self.add_urlarea()
            self.add_regixarea()
            self.add_buttonarea()
            self.add_statusbar()

        def add_urlarea(self):
            """Add the labelled URL entry, pre-filled with ``default_url``."""
            urlAreaFrame = Frame(self)
            urlAreaFrame.pack(pady=20)
            label = Label(urlAreaFrame, text="网址:")
            label.pack(side=LEFT)
            self.url_input = Entry(urlAreaFrame, width=40)
            self.url_input.insert(0, ControllPanel.default_url)
            self.url_input.pack(side=RIGHT)

        def add_regixarea(self):
            """Add the labelled regex entry, pre-filled with ``default_regix``."""
            regixAreaFrame = Frame(self)
            regixAreaFrame.pack()
            label = Label(regixAreaFrame, text="正则:")
            label.pack(side=LEFT)
            self.regix_input = Entry(regixAreaFrame, width=40)
            # Note: the class attribute is read through ``self`` here, whereas
            # add_urlarea reads it through the class name.
            self.regix_input.insert(0, self.default_regix)
            self.regix_input.pack(side=RIGHT)

        def add_buttonarea(self):
            """Add the "verify regex" and "start crawling" buttons."""
            buttonAreaFrame = Frame(self)
            buttonAreaFrame.pack(pady=40)
            verify_button = Button(buttonAreaFrame, text="验证正则", command=self.verifyRegix)
            verify_button.pack(padx=50, side=LEFT)
            start_button = Button(buttonAreaFrame, text="开始爬取", command=self.startRun)
            start_button.pack(padx=50, side=RIGHT)

        def verifyRegix(self):
            """Fetch the entered URL and list what the entered regex matches.

            The number of matches is shown in the status bar and each match is
            printed.

            Returns:
                The list of matches; an empty list when the input widgets have
                not been created yet or nothing matched.
            """
            if self.url_input is None or self.regix_input is None:
                # Return an empty list (not None) so startRun can iterate it.
                return []
            url = self.url_input.get()
            regix = self.regix_input.get()
            imgurls = RunFunction(url, regix).verifyRegix()
            # Always refresh the status so a zero-match result does not leave
            # a stale message from an earlier run on screen.
            self.setStatus("有%d张图片可以爬取", len(imgurls))
            for imgurl in imgurls:
                print(imgurl)
            return imgurls

        def startRun(self):
            """Download every image matched by verifyRegix and report the count."""
            imgurls = self.verifyRegix()
            size = RunFunction(None, None).runTask(imgurls)
            # savePath is passed as an argument rather than spliced into the
            # format string, so a "%" in the path cannot break the formatting.
            self.setStatus("爬取了%d张图片放在了%s", size, savePath)

        def add_statusbar(self):
            """Add the status label at the bottom of the master window."""
            statusBarFrame = Frame(self.master)
            statusBarFrame.pack(side=BOTTOM, fill=X)
            self.status_label = Label(statusBarFrame, bd=1, relief=SUNKEN, anchor=W)
            self.status_label.pack(fill=X)

        def setStatus(self, format, *args):
            """Show ``format % args`` in the status bar.

            Does nothing when the status bar has not been created yet.
            """
            if self.status_label is None:
                return
            self.status_label.config(text=format % args)
            self.status_label.update_idletasks()

        def clearStatus(self):
            """Blank the status bar; does nothing if it does not exist yet."""
            if self.status_label is None:
                return
            self.status_label.config(text="")
            self.status_label.update_idletasks()
    
    
    
    
    class RunFunction:
        """Fetch a page, extract image URLs with a regex and save the images."""

        def __init__(self, url, regix):
            """Remember the page *url* and the regex *regix* used by verifyRegix."""
            self.url = url
            self.regix = regix

        def getHtml(self, url):
            """Download *url* and return its body decoded as UTF-8."""
            # The context manager closes the connection even on failure.
            with urllib.request.urlopen(url) as page:
                return page.read().decode('utf-8')

        def getImg(self, html, reg):
            """Return ``re.findall`` results of pattern *reg* applied to *html*."""
            return re.compile(reg).findall(html)

        def saveImg(self, url):
            """Download the image at *url* and write it below ``savePath``.

            Raises:
                ValueError: If no file name can be derived from *url*.
            """
            filename = self.getFileName(url)
            if filename is None:
                # Fail before downloading, with a clearer error than the
                # "str + None" TypeError the concatenation would raise.
                raise ValueError("cannot derive a file name from URL: %r" % (url,))
            with urllib.request.urlopen(url) as conn, \
                    open(savePath + filename, 'wb') as out:
                out.write(conn.read())

        def getFileName(self, url):
            """Return the first ``<lowercase/digits>.jpg`` substring of *url*, or None."""
            # The dot is escaped so only a literal ".jpg" extension matches.
            matchObj = re.search(r'[a-z0-9]+\.jpg', url)
            return matchObj.group() if matchObj else None

        def verifyRegix(self):
            """Fetch ``self.url`` and return what ``self.regix`` matches in it."""
            html = self.getHtml(self.url)
            return self.getImg(html, self.regix)

        def runTask(self, imgurls=None):
            """Save every image in *imgurls* and return how many were saved.

            Args:
                imgurls: URLs to download. When omitted, the URLs are obtained
                    from ``verifyRegix()``. This merges the two ``runTask``
                    definitions of the original: the second silently replaced
                    the first, which also called ``verifyRegix`` with
                    arguments it does not accept.

            Returns:
                The number of URLs processed.
            """
            if imgurls is None:
                imgurls = self.verifyRegix()
            for imgurl in imgurls:
                self.saveImg(imgurl)
            return len(imgurls)
    
    
    
    root = Tk()
    root.title("爬虫管理窗口")

    # Center the window on the screen.
    # NOTE(review): root.maxsize() is used as the screen size here; presumably
    # its default is close to the screen dimensions -- confirm.
    scnWidth, scnHeight = root.maxsize()
    tmpcnf = '%dx%d+%d+%d'%(308, 101, (scnWidth-308)/2, (scnHeight-101)/2)
    root.geometry(tmpcnf)

    root.maxsize(600, 300)
    root.minsize(360, 220)
    #root.resizable(False, False)  # keep the window size fixed

    controllPanel = ControllPanel(root)
    controllPanel.setStatus("等待爬取……")


    root.mainloop()
    # When the user closes the window, Tk has already destroyed the
    # application by the time mainloop() returns, and a second destroy()
    # raises TclError.  destroy() is still needed if the loop ended via quit().
    try:
        root.destroy()
    except TclError:
        pass
    

    这里写图片描述

  • 相关阅读:
    NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.
    QOpenGLShaderProgram: could not create shader program
    ubuntu loading initial ramdisk 卡住
    ubuntu 下pip install sasl报错fatal error: sasl/sasl.h: No such file or directory
    ImportError: No module named managers
    python docker 多进程提供 稳定tensorflow gpu 线上服务
    侧脸生成正脸概论与精析(一)Global and Local Perception GAN
    pytorch安装 caffe2 安装:git 慢 caffe2 cannot find -lopencv_dep_cudart ,undefined reference to 'pthread_create'
    undefined symbol: PyFPE_jbuf
    Metasploit后渗透模块开发
  • 原文地址:https://www.cnblogs.com/lanzhi/p/6468422.html
Copyright © 2020-2023  润新知