• 01爬取豆瓣网电影数据进行numpy的练习


    """
    level 2:
    10、案例:编写爬虫爬取豆瓣电影排行榜(电影名称,评分),保存为csv文件
    a、用numpy加载csv数据
    b、把评分列转换成float64类型
    c、计算电影的平均评分
    d、求评分最高的电影
    e、求评分在9分以上的电影
    """

    import requests
    from lxml import etree
    import csv
    import numpy as np
    def getHtml():
        """Scrape the Douban movie chart and save (title, grade) rows as CSV.

        Requests https://movie.douban.com/chart, extracts the movie titles and
        the rating strings with XPath, prints both lists, and writes the pairs
        to ./doubandianying.csv below a "title,grade" header row.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.RequestException: on connection failure or timeout.
        """
        url = 'https://movie.douban.com/chart'
        # NOTE(review): the Cookie is a hard-coded browser session; it should
        # be loaded from configuration/environment instead of living in source.
        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
            'Cookie':'ll="118237"; bid=kQ4wCGaUHxM; dbcl2="198098900:5Dr+gGK65ck"; ck=u-be; _pk_id.100001.4cf6=842ffa65a9a6b8b3.1560771548.1.1560771681.1560771548.; _pk_ses.100001.4cf6=*; __yadk_uid=ACadYi5zL218X3UjCuwIiXTk7lThAmup; __utma=30149280.26375845.1560771555.1560771555.1560771555.1; __utmb=30149280.2.10.1560771555; __utmc=30149280; __utmz=30149280.1560771555.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=223695111.1679117071.1560771555.1560771555.1560771555.1; __utmb=223695111.0.10.1560771555; __utmc=223695111; __utmz=223695111.1560771555.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); push_noty_num=0; push_doumail_num=0; _vwo_uuid_v2=D0397C64E4418CF03F84A9F99DED3AE28|9c841c774e9ad1066dc8a2ca931d9a9a; __utmt=1; __utmv=30149280.19809'
        }

        # 1. Request the page source. The timeout keeps a stalled connection
        #    from hanging forever; raise_for_status() stops an error page from
        #    being parsed silently into an empty CSV.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        strHtml = response.text

        # Page analysis, titles:
        #   <a class="nbg" href="https://movie.douban.com/subject/27060077/" title="...">
        # Page analysis, ratings:
        #   <span class="rating_nums">8.9</span>

        # 2. Extract the data
        html = etree.HTML(strHtml)
        # Movie titles
        titles = html.xpath('//tr[@class="item"]//a/@title')
        print(titles)
        # Ratings
        grades = html.xpath('//span[@class="rating_nums"]/text()')
        print(grades)

        # 3. Pair every title with its rating.
        # NOTE(review): the two lists are extracted independently, so zip()
        # assumes every listed movie has a rating; a missing rating would shift
        # the pairing - confirm against the page structure.
        rows = zip(titles, grades)

        # 4. Save as CSV. newline='' is required by the csv module so that it
        #    controls line endings itself (otherwise rows end in \r\r\n on Windows).
        with open('./doubandianying.csv', 'w', newline='') as f:
            csv_f = csv.writer(f)
            # Header row
            csv_f.writerow(["title","grade"])
            # Data rows
            csv_f.writerows(rows)

    def loadTxt(filePath='./doubandianying.csv'):
        """Load the title and grade columns of the movie CSV as strings.

        Args:
            filePath: path of the comma-separated file to read; defaults to
                the file written by the scraper step.

        Returns:
            A NumPy string array holding columns 0 and 1 of every line after
            the header row.
        """
        return np.loadtxt(
            filePath,
            delimiter=',',
            dtype=str,
            usecols=(0, 1),
            skiprows=1,  # skip the "title,grade" header line
        )

    def chage(filePath='./doubandianying.csv'):
        """Load the grade column of the movie CSV and convert it to float64.

        Args:
            filePath: path of the comma-separated file to read; defaults to
                the file written by the scraper step.

        Returns:
            A 1-D float64 NumPy array with one grade per data row.

        Raises:
            ValueError: if a grade cell cannot be parsed as a number.
        """
        res = np.loadtxt(
            filePath,
            delimiter=',',
            dtype=str,
            usecols=1,
            skiprows=1,  # skip the "title,grade" header line
            ndmin=1,     # keep a 1-D array even when there is only one data row
        )
        # np.float was a deprecated alias removed in NumPy 1.24; use the
        # explicit float64 type instead.
        return res.astype(np.float64)

    def mean(gradeFloat):
        """Return the arithmetic mean of the grades, computed with np.mean."""
        return np.mean(gradeFloat)


    def movie(gradeFloat, filePath='./doubandianying.csv'):
        """Return the title of the movie with the highest grade.

        Args:
            gradeFloat: numeric grades, in the same order as the data rows of
                the CSV file.
            filePath: path of the comma-separated file holding the titles in
                column 0; defaults to the file written by the scraper step.

        Returns:
            The title at the position of the first maximum in gradeFloat.

        Raises:
            ValueError: if gradeFloat is empty (raised by np.argmax).
        """
        index = np.argmax(gradeFloat)
        title = np.loadtxt(
            filePath,
            delimiter=',',
            dtype=str,
            usecols=0,
            skiprows=1,  # skip the "title,grade" header line
            ndmin=1,     # a single data row would otherwise give a 0-d array that cannot be indexed
        )
        return title[index]

    def movies(gradeFloat):
        """Print and return the grades that are strictly greater than 9.

        Args:
            gradeFloat: array-like of numeric grades.

        Returns:
            A NumPy array with the grades above 9 (empty when there are none).
            Only the scores are selected here, not the movie titles.
        """
        # asarray lets plain lists be filtered with a boolean mask as well.
        grades = np.asarray(gradeFloat)
        res = grades[grades > 9]
        print(res)
        # Return the selection so the caller's assignment does not receive None.
        return res

    if __name__ == '__main__':
        # 1. Scrape the data and write the CSV file
        getHtml()

        # 2. Load the data
        lt = loadTxt()
        print(lt)

        # 3. Convert the grade column to floating point
        gradeFloat = chage()
        print(gradeFloat)
        print(type(gradeFloat))

        # 4. Average grade of the movies
        gradeMean = mean(gradeFloat)
        print(gradeMean)

        # 5. Movie with the highest grade
        movieFirst = movie(gradeFloat)
        print(movieFirst)

        # 6. Grades above 9 (there were none when this was written).
        #    movies() prints its selection itself; the result is not assigned
        #    to the name `movies`, which would overwrite the function.
        movies(gradeFloat)
    人生苦短,我用python!
  • 相关阅读:
    返回到上一页的html代码的几种写法
    记一次网站服务器内存占用过多问题
    rpm命令数据库修复日志
    Linux vmstat命令实战详解
    innodb的innodb_buffer_pool_size和MyISAM的key_buffer_size
    mysql
    如何查看linux系统下的各种日志文件 linux 系统日志的分析大全
    /var/lock/subsys作用
    CentOS目录结构详解
    MySQL体系结构
  • 原文地址:https://www.cnblogs.com/YangQingHong/p/11043422.html
Copyright © 2020-2023  润新知