• python数据处理与机器学习


    提纲

    numpy:

    # genfromtxt: load a delimited text file into an ndarray
    import numpy as np
    #genfromtxtdata=np.genfromtxt("genfromtxtdata")
    #print(help(np.genfromtxt))
    
    # A matrix is built from a list of lists
    matrix=np.array([[12,12],[12,12],[1,13]])
    print(matrix)
    # np.array coerces every element to one common dtype (float here, because of 4.0)
    dataa=np.array([1,2,4.0,1])
    # Slicing
    
    # Boolean tests
    #datab=dataa
    # A comparison yields an array of True/False values
    # Select the elements equal to a given value
    #booldata=(datab==1)
    #print(datab[booldata])
    # Select the rows whose second column equals 13
    boolmatrix =(matrix[:,1]==13)
    print(matrix[boolmatrix,:])
    # & and | combine boolean masks (and / or)
    
    # Type conversion: astype returns a new array; the result is discarded here
    dataa.astype(float)
    
    # Extreme values: minimum of the array (result not stored)
    dataa.min()
    # Sum along an axis; axis=1 sums each row (result not stored)
    matrix.sum(axis=1)
    
    #np.zeros((3,4)) -> the shape is passed as a tuple
    #np.arange(15).reshape(3,5)
    #np.random.random() -> lives in the np.random module; samples from [0, 1)
    #np.linspace(0,2*np.pi,100) -> evenly spaced values
    #np.exp()
    
    # Arithmetic is element-wise for arrays of the same shape; with a scalar
    # operand (e.g. A-1) the scalar is applied to every element
    A=np.array([[1,2],[1,1]])
    B=np.array([[1,2],[1,1]])
    print(A*B)# element-wise product
    print(A.dot(B))# matrix product
    print(np.dot(A,B))
    
    # Array manipulation
    # np.floor rounds down, so a and b hold whole-number floats
    a=np.floor(10*np.random.random((3,4)))
    b=np.floor(10*np.random.random((3,4)))
    # Flatten the matrix into a 1-D vector
    print(a)
    print(a.ravel())
    
    # Concatenation (horizontal / vertical)
    #print(np.hstack((a,b)))
    #print(np.vstack((a,b)))
    # Splitting (NOTE: vsplit into 2 equal parts needs an even row count; a has 3 rows)
    #print(np.hsplit(a,2))
    #print(np.vsplit(a,2))
    
    # Copying: plain assignment only binds a second name to the same array
    b=a
    b.shape=4,3
    # Reshaping through b reshapes a as well
    print(a)
    # a and b have the same id, i.e. they are the same object
    print(id(a),id(b))
    # Shallow copy
    # c is a different object from a but shares its data: writing through c
    # changes a, while reshaping c leaves a's shape untouched
    c=a.view()
    c.shape=2,6
    c[1,1]=11
    print(a.shape)
    print(a)
    # Deep copy
    # d owns its own data and is independent of a
    d=a.copy()
    
    # Indexing
    # Row index of the maximum within each column
    intt=a.argmax(axis=0)
    print(intt)
    # Repeat an array: tile a 2 times vertically and 3 times horizontally
    a=np.arange(1,20,10)
    b=np.tile(a,(2,3))
    print(b)
    # Sorting
    a=np.array([[1,2,3],[3,2,1]])
    # argsort returns the indices that would sort each row in ascending order;
    # it is taken before a is sorted in place
    j=np.argsort(a)
    a.sort(axis=1)
    print(j)
    print(a)

     pandas:

    import os
    import pandas as pd
    import numpy as np
    # Current working directory. os.getcwd() is used instead of the IPython
    # magic `%pwd` so the cell is valid Python both inside and outside a notebook.
    current_path = os.getcwd()
    print(current_path)
    #food_info=pd.read_csv("food_info.csv")
    #DataFrame数据类型
    #print(type(food_info))
    #print(food_info.dtypes)
    
    #food_info.head()
    #food_info.tail(4)
    #print(food_info.columns)
    #print(food_info.shape)
    
    #索引与计算
    #print(food_info.loc[0])
    #传入一个list->多列
    #print(food_info[["NDB_No","Shrt_Desc"]])
    #column_list=food_info.columns.tolist()
    #print(column_list)
    
    ##数据预处理
    #food_info.sort_values("NDB_No",inplace=True)
    ##排序后缺失值会被放到最后
    ##从小到大排序
    #print(food_info["NDB_No"])
    ##从大到小
    #food_info.sort_values("NDB_No",inplace=True,ascending=False)
    #print(food_info["NDB_No"])
    
    # Load the Titanic training data
    titanic_train_info=pd.read_csv("titanic_train.csv")
    #print(titanic_train_info.head())
    #age=titanic_train_info["Age"]
    #print(age.loc[0:10])
    #age_is_null=pd.isnull(age)
    #print(age_is_null)
    #age_null_true=age[age_is_null]
    #age_null_count=len(age_null_true)
    #print(age_null_count)
    # Mean with the missing values removed
    #age_null_false=titanic_train_info["Age"][age_is_null==False]
    #average_age=sum(age_null_false)/len(age_null_false)
    #average_age1=titanic_train_info["Age"].mean()
    #print(average_age,average_age1)
    
    # Pivot tables
    # index = grouping column, values = column(s) to aggregate, aggfunc = aggregation
    # The mean is the default aggregation
    #passager_survival=titanic_train_info.pivot_table(index="Pclass",values="Survived",aggfunc=np.mean)
    #print(passager_survival)
    #passager_age=titanic_train_info.pivot_table(index="Pclass",values="Age",aggfunc=np.mean)
    #print(passager_age)
    #port_stats=titanic_train_info.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum)
    #print(port_stats)
    ## Drop the rows with missing values
    # This binding was commented out although the statements below use the name,
    # which raised a NameError. dropna returns a new frame, so an alias suffices.
    titanic_train_info1=titanic_train_info
    drop_na_columns=titanic_train_info1.dropna(axis=0,subset=["Age","Sex"])
    drop_na_columns.head()
    
    # Look up a single value by row label and column name
    row_index_83_age=titanic_train_info1.loc[83,"Age"]
    print(row_index_83_age)
    
    # Custom functions
    #titanic_train_info1.apply(function_name)
    # The Series structure
    import pandas as pd
    score_csv=pd.read_csv("fandango_score_comparison.csv")
    # Selecting one column of the DataFrame
    series_FILM=score_csv["FILM"]
    #print(type(series_FILM))
    
    from pandas import Series
    # .values exposes the column's data as an array
    film_names=series_FILM.values
    #print(type(film_names))
    series_rt=score_csv["RottenTomatoes"]
    #print(series_rt)
    rt_scores=series_rt.values
    print(rt_scores)
    # Use the film names as the index
    series_customer=Series(rt_scores,index=film_names)
    # Bare expressions: they only display a value when run as notebook cells
    series_customer["Minions (2015)"]
    series_customer[5:10]

    matplotlib:

    # Line chart
    import pandas as pd
    unrate=pd.read_csv("UNRATE.csv")
    # Parse the DATE column into datetimes
    unrate["DATE"]=pd.to_datetime(unrate["DATE"])
    #print(unrate.head(12))
    
    import matplotlib.pyplot as plt
    #first_twelve=unrate[0:100]
    #plt.plot(first_twelve["DATE"],first_twelve["VALUE"])
    #plt.xticks(rotation=45)
    #plt.xlabel("month")
    #plt.ylabel("rate")
    #plt.title("失业率")
    #plt.show()
    
    # Subplots: add_subplot(rows, cols, position)
    #fig=plt.figure()
    #ax1=fig.add_subplot(4,3,1)
    #ax2=fig.add_subplot(4,3,2)
    #ax2=fig.add_subplot(4,3,6)
    
    import numpy as np
    #fig=plt.figure(figsize=(10,6))
    #ax1=fig.add_subplot(2,1,1)
    #ax2=fig.add_subplot(2,1,2)
    #ax1.plot(np.random.randint(1,5,5),np.arange(5))
    #ax2.plot(np.arange(10)*3,np.arange(10))
    #plt.show()
    
    # Month number extracted from each date, used as the x axis below
    unrate["Month"]=unrate["DATE"].dt.month
    #fig=plt.figure(figsize=(6,3))
    #plt.plot(unrate[0:12]["Month"],unrate[0:12]["VALUE"],c="red")
    #plt.plot(unrate[12:24]["Month"],unrate[12:24]["VALUE"],c="blue")
    fig=plt.figure(figsize=(10,5))
    colors=["red","blue","green","orange","black"]
    # One line per consecutive 12-row slice, labelled 1948..1952
    # (assumes the rows are monthly and start in January 1948 - TODO confirm)
    for i in range(5):
        start_index=i*12
        end_index=(i+1)*12
        subset=unrate[start_index:end_index]
        label=str(1948+i)
        plt.plot(subset["Month"],subset["VALUE"],c=colors[i],label=label)
    plt.legend(loc="best")
    plt.show()
    
    # Bar chart
    import pandas as pd
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
    norm_reviews = reviews[cols]
    #print(norm_reviews[:1])
    
    
    import matplotlib.pyplot as plt
    from numpy import arange
    # Axes.bar() needs the bar positions and the bar heights; the third
    # positional argument used below is the bar width.
    num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue', 'Fandango_Stars']
    # DataFrame.ix was removed in pandas 1.0. .loc selects the same cell here:
    # row label 0 and the listed column labels.
    bar_heights = norm_reviews.loc[0, num_cols].values
    bar_positions = arange(5) + 0.75
    tick_positions = range(1,6)
    fig, ax = plt.subplots()
    
    ax.bar(bar_positions, bar_heights, 0.5)
    # Horizontal variant
    # NOTE(review): bar and barh are both drawn on the same axes here, and the
    # tick labels below are set on the x axis only; presumably just one of the
    # two calls is meant to be active at a time - confirm.
    ax.barh(bar_positions, bar_heights, 0.5)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(num_cols, rotation=45)
    
    ax.set_xlabel('Rating Source')
    ax.set_ylabel('Average Rating')
    ax.set_title('Average User Rating For Avengers: Age of Ultron (2015)')
    plt.show()
    
    # Scatter plot
    #Let's look at a plot that can help us visualize many points.
    # A figure is created first, then two stacked subplots are added to it
    # (2 rows, 1 column).
    fig = plt.figure(figsize=(10,5))
    ax1 = fig.add_subplot(2,1,1)
    ax2 = fig.add_subplot(2,1,2)
    # Top: Fandango on x, Rotten Tomatoes on y
    ax1.scatter(norm_reviews['Fandango_Ratingvalue'], norm_reviews['RT_user_norm'])
    ax1.set_xlabel('Fandango')
    ax1.set_ylabel('Rotten Tomatoes')
    # Bottom: the same data with the axes swapped
    ax2.scatter(norm_reviews['RT_user_norm'], norm_reviews['Fandango_Ratingvalue'])
    ax2.set_xlabel('Rotten Tomatoes')
    ax2.set_ylabel('Fandango')
    plt.show()
    # Histogram
    import pandas as pd
    import matplotlib.pyplot as plt
    reviews = pd.read_csv('fandango_scores.csv')
    cols = ['FILM', 'RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    norm_reviews = reviews[cols]
    #print(norm_reviews[:5])
    # Count how often each rating value occurs
    fandango_distribution = norm_reviews['Fandango_Ratingvalue'].value_counts()
    # Order the counts by rating value (the index), ascending
    fandango_distribution = fandango_distribution.sort_index()
    imdb_distribution = norm_reviews['IMDB_norm'].value_counts()
    imdb_distribution = imdb_distribution.sort_index()
    #print(fandango_distribution)
    #print(imdb_distribution)
    fig, ax = plt.subplots()
    #ax.hist(norm_reviews['Fandango_Ratingvalue'])
    # bins sets the number of bins, range restricts the value interval
    # (both histograms below are drawn onto the same axes)
    ax.hist(norm_reviews['Fandango_Ratingvalue'],bins=20)
    ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(4, 5),bins=20)
    ax.set_ylim(0,20)
    # Box plot (quartiles), one box per rating column
    num_cols = ['RT_user_norm', 'Metacritic_user_nom', 'IMDB_norm', 'Fandango_Ratingvalue']
    fig, ax = plt.subplots()
    ax.boxplot(norm_reviews[num_cols].values)
    ax.set_xticklabels(num_cols, rotation=90)
    ax.set_ylim(0,5)
    plt.show()
    # Plot styling details
    import pandas as pd
    import matplotlib.pyplot as plt
    # Add your code here.
    # NOTE(review): women_degrees is not defined anywhere in the visible code;
    # it must be loaded (presumably with pd.read_csv) before this cell - confirm.
    fig, ax = plt.subplots()
    ax.plot(women_degrees['Year'], women_degrees['Biology'], label='Women')
    # The men's share is the complement of the women's percentage
    ax.plot(women_degrees['Year'], 100-women_degrees['Biology'], label='Men')
    # Hide the tick marks. tick_params expects booleans; the legacy string "off"
    # is truthy in current matplotlib and would leave the ticks visible.
    ax.tick_params(bottom=False, top=False, left=False, right=False)
    ax.set_title('Percentage of Biology Degrees Awarded By Gender')
    ax.legend(loc="upper right")
    major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
    fig = plt.figure(figsize=(12, 12))
    
    #for sp in range(0,4):
    #    ax = fig.add_subplot(2,2,sp+1)
    #    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c='blue', label='Women')
    #    ax.plot(women_degrees['Year'], 100-women_degrees[major_cats[sp]], c='green', label='Men')
    #    # Add your code here.
    #
    ## Calling pyplot.legend() here will add the legend to the last subplot that was created.
    #plt.legend(loc='upper right')
    #plt.show()
    
    major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
    fig = plt.figure(figsize=(12, 12))
    
    # One subplot per major on a 2x2 grid.
    # NOTE(review): women_degrees is not defined in the visible code - confirm
    # it is loaded before this cell runs.
    for sp, major in enumerate(major_cats):
        ax = fig.add_subplot(2,2,sp+1)
        ax.plot(women_degrees['Year'], women_degrees[major], c='blue', label='Women')
        ax.plot(women_degrees['Year'], 100-women_degrees[major], c='green', label='Men')
        # Remove the box drawn around the axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0,100)
        ax.set_title(major)
        # Booleans are required here; the string "off" is truthy and keeps the ticks
        ax.tick_params(bottom=False, top=False, left=False, right=False)
    # Calling pyplot.legend() here will add the legend to the last subplot that was created.
    plt.legend(loc='upper right')
    plt.show()
    
    
    #Setting Line Width
    # RGB colours expressed as 0..1 fractions, as matplotlib expects for tuples
    cb_dark_blue = (0/255, 107/255, 164/255)
    cb_orange = (255/255, 128/255, 14/255)
    
    fig = plt.figure(figsize=(12, 12))
    
    # NOTE(review): women_degrees and major_cats come from earlier cells.
    for sp, major in enumerate(major_cats):
        ax = fig.add_subplot(2,2,sp+1)
        # Set the line width when specifying how each line should look.
        ax.plot(women_degrees['Year'], women_degrees[major], c=cb_dark_blue, label='Women', linewidth=10)
        ax.plot(women_degrees['Year'], 100-women_degrees[major], c=cb_orange, label='Men', linewidth=10)
        # Remove the box drawn around the axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0,100)
        ax.set_title(major)
        # Booleans are required here; the string "off" is truthy and keeps the ticks
        ax.tick_params(bottom=False, top=False, left=False, right=False)
    
    plt.legend(loc='upper right')
    plt.show()
    
    stem_cats = ['Engineering', 'Computer Science', 'Psychology', 'Biology', 'Physical Sciences', 'Math and Statistics']
    fig = plt.figure(figsize=(18, 3))
    # One subplot per STEM category, all in a single row.
    # NOTE(review): women_degrees, cb_dark_blue and cb_orange come from earlier cells.
    for sp, category in enumerate(stem_cats):
        ax = fig.add_subplot(1,6,sp+1)
        ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100-women_degrees[category], c=cb_orange, label='Men', linewidth=3)
        # Remove the box drawn around the axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0,100)
        ax.set_title(category)
        # Booleans are required here; the string "off" is truthy and keeps the ticks
        ax.tick_params(bottom=False, top=False, left=False, right=False)
        
        # Label the lines directly on the first and last subplot instead of a legend
        if sp == 0:
            ax.text(2005, 87, 'Men')
            ax.text(2002, 8, 'Women')
        elif sp == 5:
            ax.text(2005, 62, 'Men')
            ax.text(2001, 35, 'Women')
    plt.show()

    seaborn:

    #seaborn风格模板
    import seaborn as sns
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import numpy as np
    %matplotlib inline
    def sinplot(flip=1):
        """Draw six phase-shifted sine curves with pyplot.

        Curve k (k = 1..6) is sin(x + 0.5*k) scaled by (7 - k) and by ``flip``,
        sampled at 100 evenly spaced points between 0 and 14.
        """
        xs = np.linspace(0, 14, 100)
        for k in range(1, 7):
            wave = np.sin(xs + k * 0.5)
            plt.plot(xs, wave * (7 - k) * flip)
    # Seaborn's default look (there are five built-in style themes)
    #sns.set()
    #sinplot()
    #sns.set_style("whitegrid")
    #sns.set_style("dark")
    #sns.set_style("white")
    #sns.set_style("ticks")
    #data=np.random.normal(size=(20,6))+np.arange(6)/2
    #sns.boxplot(data=data)
    # Remove the top and right spines
    #sns.despine()
    #sns.despine(offset=10)
    #sns.despine(left=True)
    # Everything drawn inside the with block uses that style only
    #with sns.axes_style("darkgrid"):
    #    plt.subplot(211)
    #    sinplot()
    #plt.subplot(212)
    #sinplot(-1)
    ## Overall layout: style plus plotting context
    sns.set_style("whitegrid")
    sns.set_context("paper",font_scale=2.5,rc=({"lines.linewidth":4.5}))# other contexts: poster/notebook
    plt.figure(figsize=(8,6))
    sinplot()
    # 颜色(离散型与连续型)
    
    >颜色很重要
    >color_palette()能传入任何matplot所支持的颜色
    >color_palette()不写参数则默认颜色
    >set_palette()设置所有图的颜色
    
    # Categorical palettes
    # The default plotting colours
    current_palette=sns.color_palette()
    sns.palplot(current_palette)
    # The hls colour space; 8 colours are requested here
    sns.palplot(sns.color_palette("hls",8))
    # Apply a palette to plotted data
    fig=plt.figure(figsize=(10,6))
    data=np.random.normal(size=(20,6))+np.arange(6)/2
    sns.boxplot(data=data,palette=sns.color_palette("hls",8))
    # Adjust the palette's lightness and saturation
    # NOTE(review): in hls_palette, l is lightness and s is saturation, while h
    # is the starting hue - s=.9 was probably intended below; confirm.
    #fig=plt.figure(figsize=(10,6))
    #sns.palplot(sns.hls_palette(8,l=.2,h=.9))
    #sns.boxplot(data=data,palette=sns.hls_palette(8,l=.2,h=.9))
    
    # Colours that come in pairs
    sns.palplot(sns.color_palette("Paired",8))
    
    使用xkcd来命名颜色
    xkcd通过众包的方式为随机RGB颜色命名,得到了954个命名颜色,可以随时通过xkcd_rgb字典调用
    
    # Three lines coloured by xkcd colour names looked up in sns.xkcd_rgb
    plt.plot([0,1],[0,1],sns.xkcd_rgb["pale red"],lw=3)
    plt.plot([0,1],[0,2],sns.xkcd_rgb["medium green"],lw=3)
    plt.plot([0,1],[0,3],sns.xkcd_rgb["denim blue"],lw=3)
    # Sequential palettes
    # The colour changes gradually, e.g. to encode how important a value is
    sns.palplot(sns.color_palette("Blues"))
    # The _r suffix reverses the palette (dark to light)
    sns.palplot(sns.color_palette("Blues_r"))
    # Cubehelix palettes
    sns.palplot(sns.color_palette("cubehelix",8))
    sns.palplot(sns.cubehelix_palette(8,start=.5,rot=-0.75))
    # Palettes that vary only the shade of a single colour
    sns.palplot(sns.light_palette("green"))
    sns.palplot(sns.dark_palette("purple"))
    # 300 samples from a 2-D normal with negatively correlated components;
    # .T splits the (300, 2) sample array into the x and y coordinates
    x,y=np.random.multivariate_normal([0,0],[[1,-.5],[-.5,1]],size=300).T
    #plt.scatter(x,y)
    fig=plt.figure(figsize=(10,6))
    # as_cmap=True returns a colormap object rather than a list of colours
    pal=sns.dark_palette("green",as_cmap=True)
    # Bivariate density plot. x and y are passed by keyword because seaborn
    # 0.12+ only accepts them that way (the keyword form needs seaborn >= 0.11).
    sns.kdeplot(x=x,y=y,cmap=pal)
  • 相关阅读:
    elastic search配置ik分词及pinyin分词使搜索同时支持中文和拼音搜索
    centos如何配置本地yum源
    PostGIS 3.0.1编译安装
    CentOS7安装GeoServer
    PostgreSQL v12.0在Centos7手动安装
    CentOS7.3镜像下载
    CentOS7各个版本镜像下载地址
    ElasticSerach7.6.0拼音分词器安装和使用
    uDig配图与GeoServer添加Style
    清除SQL2005数据库日志的方法
  • 原文地址:https://www.cnblogs.com/janghe/p/8013000.html
Copyright © 2020-2023  润新知