• 乐高积木数据处理


    乐高积木数据处理

    素材链接

    1.导入模块

    import pandas as pd
    import numpy as np
    import jieba 
    import time
    
    from pyecharts.charts import Bar,Line,Map,Page,Pie
    from pyecharts import options as opts
    from pyecharts.globals import SymbolType
    from pyecharts.faker import Faker
    

    2.读取数据

    # Load the LEGO Taobao dataset. The raw string keeps the Windows path
    # separators literal instead of treating them as escape sequences.
    df_tb = pd.read_csv(r'F:\Python数据分析课程\python数据处理\Pandas练习\数据分析项目练习\legao3225\乐高淘宝数据.csv')
    # Preview the first five rows
    df_tb.head()
    

    image-20201013084044939

    3.查看数据集的信息

    # Print the DataFrame summary (columns, non-null counts, dtypes)
    df_tb.info()
    

    image-20201013084145268

    4.数据处理

    # Drop fully duplicated rows
    df_tb.drop_duplicates(inplace=True)
    # Keep only rows whose purchase_num contains '人付款'. na=False treats
    # missing values as non-matching, so the boolean mask never contains NaN
    # (a mask with NaN makes the row selection raise).
    df_tb = df_tb[df_tb['purchase_num'].str.contains('人付款', na=False)]

    # Rebuild a contiguous index after rows were removed
    df_tb = df_tb.reset_index(drop=True)
    # Re-check the dataset summary
    df_tb.info()
    

    image-20201013084309899

    # Convert purchase_num to int by extracting the first run of digits.
    # The pattern must be r'(\d+)'; a bare '(d+)' matches the letter "d",
    # not digits. expand=False returns a Series rather than a DataFrame.
    # NOTE(review): a value such as '1.5万人付款' would yield 1 -- confirm the
    # data contains no abbreviated counts.
    df_tb['purchase_num'] = df_tb['purchase_num'].str.extract(r'(\d+)', expand=False).astype('int')
    # Sales amount = unit price * number of buyers
    df_tb['sales_volume'] = df_tb['price'] * df_tb['purchase_num']

    # Add a province column: the first space-separated token of location
    df_tb['province'] = df_tb['location'].str.split(' ').str[0]
    df_tb.head()
    

    image-20201013084807507

    数据可视化

    1. 乐高销量排名top10店铺-条形图
    2. 乐高产地数量排名top10-条形图
    3. 国内各省份乐高销量分布-地图
    4. 价格分布-条形图
    5. 不同价格区间的销售额表现-饼图
    6. 商品标题词云图-词云图

    1.乐高销量排名top10店铺-条形图

    # Sum purchase_num per shop, order from largest to smallest and keep
    # the first ten shops.
    shop_top10 = (
        df_tb.groupby('shop_name')['purchase_num']
        .sum()
        .sort_values(ascending=False)
        .head(10)
    )
    shop_top10
    

    image-20201013091655643

    # Bar chart of the ten shops with the highest purchase totals.
    # For a larger canvas, build the chart with
    # Bar(init_opts=opts.InitOpts(width='1350px', height='750px')).
    bar1 = (
        Bar()
        .add_xaxis(shop_top10.index.tolist())
        .add_yaxis('', shop_top10.values.tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts(title='乐高销量排名Top10淘宝店铺'),
            # Rotate the x-axis labels by -15 degrees
            xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-15)),
            visualmap_opts=opts.VisualMapOpts(max_=28669),
        )
    )
    bar1.render_notebook()
    

    image-20201013210347740

    2. 乐高产地数量排名top10

    # Count listings per province (most frequent first) and keep the first ten
    province_top10 = df_tb['province'].value_counts().head(10)
    province_top10
    

    image-20201013210440405

    # Bar chart of the ten provinces with the most listings
    bar2 = Bar()
    bar2.add_xaxis(province_top10.index.tolist())
    bar2.add_yaxis('', province_top10.values.tolist())
    bar2.set_global_opts(
        # title_opts takes a TitleOpts; the colour scale belongs in
        # visualmap_opts (the original passed VisualMapOpts as the title).
        title_opts=opts.TitleOpts(title='乐高产地数量排名Top10'),
        visualmap_opts=opts.VisualMapOpts(max_=1000),
    )
    bar2.render_notebook()
    

    image-20201013210512103

    3. 国内各省份乐高销量分布图

    # Total purchase_num per province, largest first
    province_num = (
        df_tb.groupby('province')['purchase_num']
        .sum()
        .sort_values(ascending=False)
    )

    # Show the ten provinces with the highest totals
    province_num.head(10)
    

    image-20201013210555134

    # China map coloured by the per-province purchase totals
    map1 = (
        Map()
        .add(
            "",
            # One [province, total] pair per province
            [[name, total] for name, total in zip(province_num.index.tolist(),
                                                  province_num.values.tolist())],
            maptype='china',
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title='国内各产地乐高销量分布图'),
            visualmap_opts=opts.VisualMapOpts(max_=172277),
        )
    )
    map1.render_notebook()
    

    image-20201013210630471

    4.天猫乐高价格分布

    # Price bin edges in yuan.
    # NOTE(review): prices above 8888 fall outside every bin and become NaN,
    # although the last label reads "1000 and above" -- confirm this is intended.
    cut_bins = [0, 50, 100, 200, 300, 500, 1000, 8888]
    # Labels 'low~high元' for every bounded bin, then the open-ended last one
    cut_labels = [f'{low}~{high}元' for low, high in zip(cut_bins[:-2], cut_bins[1:-1])]
    cut_labels.append('1000元以上')

    # Assign each listing to its price bin and count listings per bin
    price_cut = pd.cut(df_tb['price'], bins=cut_bins, labels=cut_labels)
    price_num = price_cut.value_counts()
    price_num
    

    image-20201013210713804

    # Bar chart of the number of listings in each price bin.
    # Labels and counts come from cut_labels / price_num rather than a
    # hand-copied list (the copied labels had '50~100' without '元', and the
    # copied counts go stale whenever the data changes).
    bar3 = Bar()
    bar3.add_xaxis(list(cut_labels))
    # reindex puts the counts in bin order instead of value_counts' order
    bar3.add_yaxis('', price_num.reindex(cut_labels).tolist())
    bar3.set_global_opts(title_opts=opts.TitleOpts(title='不同价格区间的商品数量'),
                         visualmap_opts=opts.VisualMapOpts(max_=900))
    bar3.render_notebook()
    

    image-20201013210738823

    5.不同价格区间的销售额整体表现

    # Store each listing's price bin on the frame
    df_tb['price_cut'] = price_cut

    # Total sales amount per price bin
    cut_purchase = (
        df_tb.groupby('price_cut')['sales_volume']
        .sum()
    )
    cut_purchase
    

    image-20201013210814013

    # One [price bin, total sales amount] pair per bin
    data_pair = [
        [label, amount]
        for label, amount in zip(cut_purchase.index.tolist(), cut_purchase.values.tolist())
    ]
    # Ring-shaped pie chart of sales amount by price bin
    piel = (
        Pie()
        .add('', data_pair, radius=['35%', '60%'])
        .set_global_opts(
            title_opts=opts.TitleOpts(title='不同价格区间的销售额整体表现'),
            legend_opts=opts.LegendOpts(orient='vertical', pos_top='15%', pos_left='2%'),
        )
        # Label each slice as "name:percentage%"
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{d}%"))
        .set_colors(['#EF9050', '#3B7BA9', '#6FB27C', '#FFAF34', '#D7BFD7', '#00BFFE', '#7FFFAA'])
    )
    piel.render_notebook()
    

    image-20201013210851311

    6.商品标题词云图

    def get_cut_words(content_series,
                      stop_words_path=r"F:\Python数据分析课程\python数据处理\Pandas练习\数据分析项目练习\legao3225\cn_stopwords.txt"):
        """Segment a Series of strings into words with jieba and filter them.

        Args:
            content_series: pandas Series of strings; the values are joined
                with '。' and segmented as one text.
            stop_words_path: UTF-8 text file with one stop word per line.
                Defaults to the path the notebook originally hard-coded.

        Returns:
            list of str: segmented words that are at least two characters
            long and are not listed in the stop-word file.
        """
        # Load the stop-word list; a set gives O(1) membership tests below
        with open(stop_words_path, 'r', encoding='utf-8') as f:
            stop_words = {line.strip() for line in f}

        # Register domain terms with jieba's dictionary
        for word in ('乐高', '悟空小侠', '大颗粒', '小颗粒'):
            jieba.add_word(word)

        # Segment the joined text (cut_all=False selects jieba's default mode)
        words = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)

        # Drop stop words and single-character tokens
        return [w for w in words if len(w) >= 2 and w not in stop_words]
    # Segment the listing titles and preview the first ten words
    text = get_cut_words(df_tb['goods_name'])
    text[:10]
    

    image-20201013210945413

    import stylecloud
    from IPython.display import Image
    # Render the word cloud of title words to a PNG, then display it inline
    wordcloud_png = '淘宝乐高标题词云图.png'
    stylecloud.gen_stylecloud(
        text=' '.join(text),
        collocations=False,
        # Raw string with the Windows path separators restored (the original
        # literal had lost its backslashes and pointed at no real file).
        font_path=r'F:\Python数据分析课程\python数据处理\Pandas练习\数据分析项目练习\legao3225\simhei.ttf',
        icon_name='fas fa-plane',
        background_color='pink',
        size=768,
        output_name=wordcloud_png
    )
    Image(filename=wordcloud_png)
    

    image-20201013211326615

  • 相关阅读:
    dedecms 标签的基本用法
    修改config.php配置
    截取字符
    preg_replace 方法
    php过滤HTML标签、属性等正则表达式汇总
    各种正则验证
    解决问题 “You don't have permission to access /index.html on this server.”
    zend frameword 基本语法
    创建zend framework 项目要注意的
    PHP中级篇 Apache配置httpd-vhosts虚拟主机总结及注意事项[OK]
  • 原文地址:https://www.cnblogs.com/James-221/p/13811400.html
Copyright © 2020-2023  润新知