• Python web-scraping notes (2: scraping the Douban Top 250, writing to a database, data visualization)


    1. Scraping the Douban Top 250 data
    #coding:utf-8
    import urllib.request
    import urllib.parse
    import urllib.error
    import re
    from bs4 import BeautifulSoup
    import xlwt
    """
    urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)
    data:字节流编码格式(可以用urllib.parse.urlencode()和bytes()方法将参数转化为字节流编码格式的内容)。如果要使用data参数,则请求方式为POST。
    origin_req_host:指定请求方的host名称或者ip地址
    unverifiable:设置网页是否需要验证,默认是False,这个参数一般也不用设置。
    method:method是一个字符串,用来指定请求使用的方法,比如GET,POST和PUT等。
    """
    # url = 'https://www.douban.com'
    # header = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"}
    # #class bytes([source[, encoding[, errors]]])
    # content = bytes(urllib.parse.urlencode({"name":"wang"}),encoding="utf-8")
    # response = urllib.request.Request(url=url,data=content,headers=header,method="POST")
    # result = urllib.request.urlopen(response)
    # print(result.read().decode("utf-8"))
    
    # Regular expressions
    #  <a href="https://movie.douban.com/subject/1306249/" class="">
    # The r"" prefix creates a raw string, so backslashes are not treated as escape characters.
    findLink = re.compile(r'<a href="(.*?)">')
    #<img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
    findImg = re.compile(r'<img .*src="(.*?)".*>',re.S)  # re.S makes "." match newlines as well
    findName = re.compile(r'<span class="title">(.*)</span>')
    findGrade = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    findNum = re.compile(r'<span>(\d*)人评价</span>')
    findInq = re.compile(r'<span class="inq">(.*)</span>')
    findOther = re.compile(r'<p class="">(.*?)</p>',re.S)
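    # A quick sanity check of findName against a hand-written fragment of the page
    # source (the sample string is illustrative, not fetched from Douban):
    # sample = ('<span class="title">肖申克的救赎</span>\n'
    #           '<span class="title">\xa0/\xa0The Shawshank Redemption</span>')
    # re.findall(findName, sample)
    # -> ['肖申克的救赎', '\xa0/\xa0The Shawshank Redemption']
    # Two matches when the entry has a foreign title, one otherwise -- which is why
    # getData checks len(name) == 2 below.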
    def main():
        #1. Scrape the pages and parse the data
        baseurl = 'https://movie.douban.com/top250?start='
        dataList = getData(baseurl)
        #2. Save the data
        savePath = ".\\top250.xls"
        saveData(dataList,savePath)
    
    # Scrape the pages
    def getData(url):
        dataList = []
        for i in range(0,10):
            baseurl = url+str(i*25)
            html = askUrl(baseurl)
            #2. Parse the data item by item
            soup = BeautifulSoup(html,"html.parser")
            for item in soup.find_all("div",class_ = "item"):
                item = str(item)
                data = [] # holds all the information for one movie
                # link to the movie's detail page
                link = re.findall(findLink,item)[0]
                data.append(link)
                # link to the movie poster image
                img = re.findall(findImg,item)[0]
                data.append(img)
                # movie title
                name = re.findall(findName, item)
                if len(name) == 2:
                    cname = name[0]
                    data.append(cname)
                    oname = name[1].replace("/","").replace("\xa0","")
                    data.append(oname)
                else:
                    cname = name[0]
                    data.append(cname)
                    data.append(" ")
                # movie rating
                grade = re.findall(findGrade,item)[0]
                data.append(grade)
                # number of ratings
                number = re.findall(findNum,item)[0]
                data.append(number)
                # one-line quote / summary
                introduction = re.findall(findInq,item)
                if len(introduction) != 0:
                    intro = introduction[0].replace("。"," ") # strip the trailing Chinese full stop
                    data.append(intro)
                else:
                    data.append(" ")
                # other movie info (year / country / genre)
                otherInfo = re.findall(findOther,item)[0]
                # re.sub(pattern, repl, string, count=0, flags=0)
                otherInfo = re.sub(r'<br(\s+)?/>(\s)?'," ",otherInfo)
                otherInfo = re.sub("/"," ",otherInfo)
                data.append(otherInfo.strip())
                dataList.append(data)
        return dataList
    
    def saveData(dataList,savePath):
        # Create a workbook and set its encoding
        workbook = xlwt.Workbook(encoding="utf-8")
        # Create a worksheet
        worksheet = workbook.add_sheet("豆瓣电影top250",cell_overwrite_ok=True)
        setList = ("地址链接","图片链接","电影中文名","电影外文名","评分","评价人数","简介","电影其他信息")
        # header row
        for i in range(0,8):
            worksheet.write(0,i,setList[i])
        for i in range(0,250):
            data = dataList[i]
            for j in range(0,8):
                worksheet.write(i+1,j,data[j])
        # save the workbook
        workbook.save(savePath)
    # Fetch the full content of the given page
    def askUrl(url):
        header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"}
        response = urllib.request.Request(url,headers=header)
        try:
            result = urllib.request.urlopen(response)
            html = result.read().decode("utf-8") # the page content
        except urllib.error.URLError as e:
            # URLError exposes code (for HTTP errors) and reason
            html = ""
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
        return html
    if __name__ == "__main__":
        main()

    2. Writing the scraped data to a database

    #coding:utf-8
    from bs4 import BeautifulSoup # used in place of pure regex to pick the relevant tags out of the page source
    import urllib.parse
    import urllib.request
    import urllib.error
    import re
    import mysql.sqlExecute as sqlExecute  # custom MySQL helper module (a sketch is given after this section's code)
    
    # Regular expressions
    #  <a href="https://movie.douban.com/subject/1306249/" class="">
    # The r"" prefix creates a raw string, so backslashes are not treated as escape characters.
    findLink = re.compile(r'<a href="(.*?)">')
    #<img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp" class="">
    findImg = re.compile(r'<img .*src="(.*?)".*>',re.S)  # re.S makes "." match newlines as well
    findName = re.compile(r'<span class="title">(.*)</span>')
    findGrade = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    findNum = re.compile(r'<span>(\d*)人评价</span>')
    findInq = re.compile(r'<span class="inq">(.*)</span>')
    findOther = re.compile(r'<p class="">(.*?)</p>',re.S)
    
    def main():
        # the base url; getData appends the page offset and calls askUrl per page
        url = "https://movie.douban.com/top250?start="
        # 1. fetch and parse the pages into a list of tuples
        dataList = getData(url)
        # 2. save the data to the database
        result = saveData(dataList)
        print(result)
    
    # Step 1: fetch the content of the given page
    def askUrl(url):
        # request headers
        header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.9231 SLBChan/30'
        }
        response = urllib.request.Request(url, headers=header)
        try:
            result = urllib.request.urlopen(response)
            html = result.read().decode("utf-8")
            '''HTTPError attributes:
                code
                the HTTP status code; the value corresponds to an entry in the
                http.server.BaseHTTPRequestHandler.responses dict
                reason
                usually a string explaining the cause of this error
                headers
                the HTTP response headers of the request that caused the HTTPError
            '''
        except urllib.error.URLError as e:
            # HTTPError is a subclass of URLError
            # attributes: code, reason, headers
            html = ""
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)
        return html
    
    
    # Step 2: scrape each page, parse it, and build the data as a list of tuples: [(1,2,3),(4,5,6)]
    def getData(url):
        dataList = []
        for i in range(0,10):
            baseUrl = url + str(i*25)
            html = askUrl(baseUrl)
            # parse the data item by item
            soup = BeautifulSoup(html,'html.parser')
            for item in soup.find_all('div',class_ = 'item'):
                data = () # holds one movie's details as a tuple
                # print(type(item))  -> bs4.element.Tag
                item = str(item)
                # link to the movie's detail page
                link = re.findall(findLink, item)[0]
                data = data + (str(link),)
                # link to the movie poster image
                img = re.findall(findImg, item)[0]
                data = data + (str(img),)
                # movie title
                name = re.findall(findName, item)
                if len(name) == 2:
                    cname = name[0]
                    data = data + (str(cname),)
                    oname = name[1].replace("/", "").replace("\xa0", "")
                    data = data + (str(oname),)
                else:
                    cname = name[0]
                    data = data + (str(cname),)
                    data = data + (" ",)
                # movie rating
                grade = re.findall(findGrade, item)[0]
                data = data + (str(grade),)
                # number of ratings
                number = re.findall(findNum, item)[0]
                data = data + (str(number),)
                # one-line quote / summary
                introduction = re.findall(findInq, item)
                if len(introduction) != 0:
                    intro = introduction[0].replace("。", " ")  # strip the trailing Chinese full stop
                    data = data + (str(intro),)
                else:
                    data = data + (" ",)
                # other movie info (year / country / genre)
                otherInfo = re.findall(findOther, item)[0]
                # re.sub(pattern, repl, string, count=0, flags=0)
                otherInfo = re.sub(r'<br(\s+)?/>(\s)?', " ", otherInfo)
                otherInfo = re.sub("/", " ", otherInfo)
                data = data + ((otherInfo.strip()),)
                dataList.append(data)
        return dataList
    
    
    # Step 3: save the data to the database
    def saveData(dataList):
        sqlDb = sqlExecute.MysqlDb()
        sql = "insert into movie_top(movie_link,movie_pic,movie_name,movie_foreign,movie_grade,movie_comment_num,movie_intro,movie_other) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        result = sqlDb.execute_sql(sql,dataList)
        return result
    
    
    if __name__ == "__main__":
        main()
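
    The mysql.sqlExecute module used above is a custom helper and is not shown in the
    post. Below is a minimal sketch of what its MysqlDb class might look like, assuming
    pymysql; the connection settings and the table definition are assumptions for
    illustration, not the post's actual code.

    #coding:utf-8
    import pymysql

    # Assumed table, matching the INSERT statement in saveData:
    # CREATE TABLE movie_top (
    #     id INT PRIMARY KEY AUTO_INCREMENT,
    #     movie_link VARCHAR(255), movie_pic VARCHAR(255),
    #     movie_name VARCHAR(100), movie_foreign VARCHAR(100),
    #     movie_grade VARCHAR(10), movie_comment_num VARCHAR(20),
    #     movie_intro VARCHAR(500), movie_other VARCHAR(500)
    # );
    class MysqlDb:
        def __init__(self):
            # placeholder credentials; adjust to your environment
            self.conn = pymysql.connect(host="localhost", user="root",
                                        password="123456", db="douban",
                                        charset="utf8mb4")

        def execute_sql(self, sql, dataList):
            # executemany runs the INSERT once per tuple in dataList
            try:
                with self.conn.cursor() as cursor:
                    count = cursor.executemany(sql, dataList)
                self.conn.commit()
                return count
            except pymysql.MySQLError as e:
                self.conn.rollback()
                return e
            finally:
                self.conn.close()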

    3. Data visualization (see the example below)

    from pyecharts.charts import Bar
    bar = Bar()
    bar.add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
    bar.add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
    # render generates a local HTML file; by default it writes render.html in the current directory
    # you can also pass a path, e.g. bar.render("mycharts.html")
    bar.render()
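
    The snippet above is the stock pyecharts demo. As a sketch of charting the scraped
    data itself, one could read the top250.xls file produced in part 1 (the column
    indices follow the header row written by saveData there):

    import xlrd  # xlrd reads the .xls files that xlwt writes
    from pyecharts.charts import Bar

    wb = xlrd.open_workbook("top250.xls")
    sheet = wb.sheet_by_index(0)
    # rows 1-10 hold the top 10 movies; column 2 is the Chinese title, column 4 the rating
    names = [sheet.cell_value(i, 2) for i in range(1, 11)]
    grades = [float(sheet.cell_value(i, 4)) for i in range(1, 11)]

    bar = Bar()
    bar.add_xaxis(names)
    bar.add_yaxis("豆瓣评分", grades)
    bar.render("top250_ratings.html")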

     Final data: (screenshot of the scraped results omitted)

  • Original post: https://www.cnblogs.com/Horsonce/p/16798643.html