• pandas学习之DataFrame结构titanic示例


    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import pandas as pd
    import numpy as np
    # Load the Titanic training set, then preview its first rows and its shape.
    titanic_survival = pd.read_csv("titanic_train.csv")
    print(titanic_survival.head())
    print(titanic_survival.shape)

    # --- Missing values in the "Age" column ---
    age = titanic_survival["Age"]
    # .loc slices by label and includes both endpoints, so labels 0 through 10
    # are printed.
    print(age.loc[0:10])
    # Boolean mask: True wherever the age is missing.
    age_is_null = age.isnull()
    print(age_is_null)
    # The entries (index label plus NaN value) whose age is missing.
    age_null_true = age[age_is_null]
    print(age_null_true)
    # Number of passengers with a missing age.
    age_null_count = len(age_null_true)
    print(age_null_count)

    # Missing values have to be excluded before averaging by hand ...
    good_age = age[~age_is_null]
    print(good_age)
    print(sum(good_age) / len(good_age))
    # ... whereas Series.mean() skips them by default and gives the average directly.
    print(age.mean())
    # Average ticket fare for each passenger class, keyed by class number.
    passenger_class = [1, 2, 3]
    fares_by_class = {
        this_class: titanic_survival.loc[
            titanic_survival["Pclass"] == this_class, "Fare"
        ].mean()
        for this_class in passenger_class
    }
    print(fares_by_class)
    # Quick group statistics with pivot_table.
    # Aggregations are given by name ("mean", "sum") rather than as NumPy
    # callables: the result is the same, and recent pandas releases emit a
    # FutureWarning when np.mean / np.sum are passed as aggfunc.

    # Mean of "Survived" per passenger class, i.e. the survival rate per class.
    passenger_survival = titanic_survival.pivot_table(
        index="Pclass", values="Survived", aggfunc="mean"
    )
    print(passenger_survival)

    # Mean age per passenger class; with no aggfunc given, pivot_table averages.
    passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
    print(passenger_age)

    # Total fare and number of survivors per port of embarkation.
    port_starts = titanic_survival.pivot_table(
        index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
    )
    print(port_starts)
    # Drop every row that has a missing value in "Age" or "Sex".
    new_titanic_survival = titanic_survival.dropna(subset=["Age", "Sex"])
    print(new_titanic_survival)

    # Look up a single cell by row label and column name.
    row_index_83_age = titanic_survival.loc[83, "Age"]
    print(row_index_83_age)

    # Sort by age, oldest first, and show the first ten rows of the result.
    age_sort = titanic_survival.sort_values(by="Age", ascending=False)
    print(age_sort.head(10))
    # Discard the shuffled index and renumber the rows from 0.
    titanic_reindexed = age_sort.reset_index(drop=True)
    print(titanic_reindexed.loc[0:10])
    # The same two steps written as one chained expression.
    age_sort = titanic_survival.sort_values(by="Age", ascending=False).reset_index(drop=True)
    #自定义函数 :apply()函数
    def hundredth_row(column):
        """Return the entry of ``column`` stored under index label 99.

        Meant to be passed to ``DataFrame.apply`` so that it is called once
        per column.
        """
        return column.loc[99]
    # Call the helper once per column (axis=0) to collect the row labelled 99.
    # NOTE: this assignment rebinds the name ``hundredth_row`` from the function
    # to its result, so the function is no longer callable under that name.
    hundredth_row = titanic_survival.apply(hundredth_row, axis=0)
    print(hundredth_row)
    
    def not_null_count(column):
        """Return how many entries of ``column`` are missing (null/NaN).

        Despite its name, this counts the missing values, not the present ones.
        """
        missing_values = column[pd.isnull(column)]
        return len(missing_values)
    # Number of missing values in each column (the helper runs once per column).
    column_null_coumt = titanic_survival.apply(not_null_count, axis=0)
    print(column_null_coumt)
    
    def generate_age_label(row):
        """Classify one passenger row by its "Age" value.

        Returns "unknown" when the age is missing, "minor" when it is below 18,
        and "adult" otherwise.
        """
        passenger_age = row["Age"]
        if pd.isnull(passenger_age):
            return "unknown"
        return "minor" if passenger_age < 18 else "adult"
    # Label every passenger row (axis=1 passes one row at a time to the helper).
    age_labels = titanic_survival.apply(generate_age_label, axis=1)
    print(age_labels)
    # Store the labels as a new column, then average "Survived" within each
    # label to get the survival rate per age group.
    titanic_survival["age_labels"] = age_labels
    age_group_survival = titanic_survival.pivot_table(
        index="age_labels", values="Survived", aggfunc="mean"
    )
    print(age_group_survival)
    

      

  • 相关阅读:
    MyEclipse启动时,报错Error:could not open`E:\Program Files\Java\JAVA\lib\amd64\jvm.cfg'
    换JDK以后,MyEclipse无法启动,报错:Failed to load the JNI...
    ORA-12514
    java语言
    基本数据类型
    C# 计算时间间隔,两个时间差(年月日时分秒)
    Java的Stream流
    Java的Lambda表达式和函数式接口
    Java中的Log
    Oracle的触发器Trigger
  • 原文地址:https://www.cnblogs.com/lifengwu/p/9816088.html
Copyright © 2020-2023  润新知