#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Pandas walk-through on the Titanic training set.

Run as a script, this loads ``titanic_train.csv`` from the working directory
and prints, in order: a preview of the data, missing-value statistics for the
``Age`` column, mean fares per passenger class, several pivot tables, the
effect of ``dropna``, label-based lookups, sorting / re-indexing, and three
``DataFrame.apply`` examples.
"""
import numpy as np  # noqa: F401 -- unused below now; kept in case other code needs it
import pandas as pd

DEFAULT_CSV_PATH = "titanic_train.csv"


def hundredth_row(column):
    """Return the item of *column* stored under index label 99.

    With the default 0-based index this is the hundredth entry.

    Raises:
        KeyError: if *column* has no label 99.
    """
    return column.loc[99]


def not_null_count(column):
    """Return how many entries of *column* are missing (null / NaN).

    NOTE: despite its name, this counts the *null* values, not the non-null
    ones. The name is kept so existing references keep working.
    """
    return int(pd.isnull(column).sum())


def generate_age_label(row):
    """Classify one passenger row by its ``"Age"`` value.

    Returns ``"unknown"`` when the age is missing, ``"minor"`` when it is
    below 18, and ``"adult"`` otherwise.
    """
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"


def main(csv_path=DEFAULT_CSV_PATH):
    """Load the Titanic CSV at *csv_path* and print every exploration step."""
    titanic_survival = pd.read_csv(csv_path)
    print(titanic_survival.head())
    print(titanic_survival.shape)

    # --- Missing values in the Age column ---------------------------------
    age = titanic_survival["Age"]
    # .loc slices by label and includes both ends: labels 0..10, i.e. 11 rows.
    print(age.loc[0:10])
    age_is_null = pd.isnull(age)  # boolean mask, True where the age is missing
    print(age_is_null)
    age_null_true = age[age_is_null]  # only the entries whose age is missing
    print(age_null_true)
    age_null_count = len(age_null_true)
    print(age_null_count)  # number of passengers with a missing age

    # A hand-written mean only works once the missing values are filtered out.
    good_age = age[~age_is_null]
    print(good_age)  # only the known ages
    # Guarded so a column with no known ages gives NaN (as .mean() does)
    # instead of raising ZeroDivisionError.
    manual_mean = sum(good_age) / len(good_age) if len(good_age) else float("nan")
    print(manual_mean)
    # Series.mean() skips missing values on its own.
    print(titanic_survival["Age"].mean())

    # --- Mean fare for each passenger class -------------------------------
    passenger_class = [1, 2, 3]
    fares_by_class = {
        this_class: titanic_survival.loc[
            titanic_survival["Pclass"] == this_class, "Fare"
        ].mean()
        for this_class in passenger_class
    }
    print(fares_by_class)

    # --- Quick statistics with pivot_table --------------------------------
    # String aggregator names are used instead of np.mean / np.sum: pandas
    # deprecated passing the NumPy callables and warns about them.
    passenger_survival = titanic_survival.pivot_table(
        index="Pclass", values="Survived", aggfunc="mean"
    )
    print(passenger_survival)  # mean survival rate per Pclass

    # aggfunc is omitted here, so pivot_table falls back to the mean.
    passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
    print(passenger_age)  # mean age per Pclass

    # Total fare and number of survivors per port of embarkation.
    port_stats = titanic_survival.pivot_table(
        index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
    )
    print(port_stats)

    # --- Dropping missing values with dropna() ----------------------------
    # Any row with a missing Age or Sex is discarded.
    new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
    print(new_titanic_survival)

    # --- Label-based lookup, sorting and re-indexing ----------------------
    # .loc looks a value up by row label and column name; a label that does
    # not exist raises KeyError.
    row_index_83_age = titanic_survival.loc[83, "Age"]
    print(row_index_83_age)

    age_sort = titanic_survival.sort_values("Age", ascending=False)
    print(age_sort.head(10))  # ten oldest passengers, original labels kept
    titanic_reindexed = age_sort.reset_index(drop=True)  # renumber rows from 0
    print(titanic_reindexed.loc[0:10])  # labels 0..10 inclusive -> 11 rows
    # The two steps above can also be chained:
    #   titanic_survival.sort_values("Age", ascending=False).reset_index(drop=True)

    # --- Custom functions with apply() ------------------------------------
    # The result gets its own name so the hundredth_row function is not
    # overwritten by the Series it returns.
    hundredth_row_values = titanic_survival.apply(hundredth_row)
    print(hundredth_row_values)  # the row stored under label 99

    # Number of missing values in every column.
    column_null_count = titanic_survival.apply(not_null_count)
    print(column_null_count)

    # axis=1 hands each row (not each column) to the function.
    age_labels = titanic_survival.apply(generate_age_label, axis=1)
    print(age_labels)
    titanic_survival["age_labels"] = age_labels

    # Mean survival rate for each age group.
    age_group_survival = titanic_survival.pivot_table(
        index="age_labels", values="Survived"
    )
    print(age_group_survival)


if __name__ == "__main__":
    main()