In [1]:
import warnings
import math
import pandas as pd
import numpy as np
import matplotlib
# Notebook-wide configuration: warning filter, pandas display limits and
# matplotlib styling/fonts.

# Silence every warning so the rendered cell outputs stay compact.
warnings.filterwarnings('ignore')

# Let pandas show up to 100 rows / 100 columns and 500 characters per cell.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 500

# `InteractiveShell.magic()` is deprecated; `run_line_magic` is the supported
# programmatic equivalent of the `%matplotlib inline` line magic.
get_ipython().run_line_magic('matplotlib', 'inline')
matplotlib.style.use('ggplot')
from matplotlib import pyplot as plt

# Use SimHei as the sans-serif font and turn off the Unicode minus glyph on
# axes (presumably so CJK labels and negative ticks render; confirm that the
# font is installed on the target machine).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Reusable 14pt font handle loaded from the 'simsun.ttc' font file.
myfont = matplotlib.font_manager.FontProperties(fname=u'simsun.ttc', size=14)
In [15]:
# Build a synthetic sample of people. A dedicated, seeded generator makes the
# frame identical on every Restart & Run All instead of changing per run.
RANDOM_SEED = 42
N_ROWS = 100
rng = np.random.RandomState(RANDOM_SEED)

# Every column is drawn uniformly from the half-open integer range [low, high).
data = pd.DataFrame({
    'age': rng.randint(15, 100, N_ROWS),
    'height': rng.randint(140, 180, N_ROWS),
    'weight': rng.randint(40, 80, N_ROWS),
    'gender': rng.randint(0, 2, N_ROWS),  # numeric code: 0 or 1
    'salary': rng.randint(3000, 30000, N_ROWS),
})
data.head()
Out[15]:
In [16]:
# Translate the numeric gender codes into text labels. Values that are not a
# known code (for example labels left by an earlier run of this cell) pass
# through unchanged, so re-running the cell no longer turns the column into NaN.
# NOTE(review): the labels mix singular 'man' with plural 'women'; confirm
# that this is intended.
GENDER_LABELS = {0: 'man', 1: 'women'}
data['gender'] = data['gender'].map(lambda code: GENDER_LABELS.get(code, code))
data.head()
Out[16]:
In [28]:
# GroupBy object: the rows of `data` partitioned by their 'gender' value.
group = data.groupby(by='gender', as_index=False)
# Display the first (key, sub-frame) pair yielded when iterating the groups.
next(iter(group))
Out[28]:
In [25]:
# agg: reduce every gender group to one row holding its mean age and height.
mean_columns = ['age', 'height']
group.agg({column: 'mean' for column in mean_columns})
Out[25]:
In [33]:
# transform: compute the mean age per gender and broadcast it back onto the
# member rows, so the result lines up with `data` row for row.
gender_mean_age = group['age'].transform('mean')
data['avg_age'] = gender_mean_age
data.head()
Out[33]:
In [35]:
# apply
def oldest(x):
    """Return the row with the highest age in a frame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing an 'age' column (one group when used with
        ``groupby(...).apply``).

    Returns
    -------
    pandas.Series
        The row of ``x`` with the largest 'age' value.
    """
    ranked = x.sort_values(by='age', ascending=False)
    # After a descending sort the oldest person is the FIRST row; the previous
    # `iloc[-1]` picked the last row, i.e. the youngest.
    return ranked.iloc[0, :]
# Run `oldest` once per gender group and display the combined result.
oldest_by_gender = group.apply(oldest)
oldest_by_gender
Out[35]:
In [53]:
def age_level(age):
    """Classify an age into a bracket label.

    Returns 'young' for ages below 30, 'middle' for ages from 30 up to (but
    not including) 60, and 'senior' otherwise.
    """
    if age < 30:
        return 'young'
    if age < 60:
        return 'middle'
    return 'senior'
# Label every row with the age bracket returned by `age_level`.
age_levels = data['age'].map(age_level)
data['level'] = age_levels
data.head()
Out[53]:
In [68]:
# Group percentages, step 1: count the rows (non-null ages) in every
# (gender, level) combination.
age_dist = data.groupby(['gender', 'level'])[['age']].count()
age_dist
Out[68]:
In [69]:
# gender_pcts: share of each age level within its gender, so the values for
# one gender add up to 1.
# The counts are divided by the per-gender total broadcast with
# transform('sum'). This keeps the original (gender, level) index and avoids
# calling float() on a one-element Series, which recent pandas deprecates.
gender_totals = age_dist.groupby(level=0).transform('sum')
age_dist / gender_totals
Out[69]:
In [70]:
# Share of each gender within an age level, so the values for one level add
# up to 1. The index level is selected by its name, 'level'.
# Dividing by the total broadcast with transform('sum') keeps the original
# index and avoids calling float() on a one-element Series.
level_totals = age_dist.groupby(level='level').transform('sum')
age_dist / level_totals
Out[70]:
In [64]:
# Share of each gender within an age level, selecting the index level by
# position: level 1 is the second grouping key of `age_dist`.
# Dividing by the total broadcast with transform('sum') keeps the original
# index and avoids calling float() on a one-element Series.
level_totals = age_dist.groupby(level=1).transform('sum')
age_dist / level_totals
Out[64]: