如何得到按列分组的dataframe的平均值和标准差
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 2, 'col2': np.random.randint(0,15,6), 'col3': np.random.randint(0, 15, 6)}) print(df) # 按列col1分组后的平均值 df_grouped_mean = df.groupby(['col1']).mean() print(df_grouped_mean) # 按列col1分组后的标准差 df_grouped_std = df.groupby(['col1']).mean() print(df_grouped_std) #> col1 col2 col3 0 apple 2 14 1 banana 11 8 2 orange 8 10 3 apple 5 2 4 banana 6 12 5 orange 11 13 #> col2 col3 col1 apple 3.5 8.0 banana 8.5 10.0 orange 9.5 11.5 #> col2 col3 col1 apple 3.5 8.0 banana 8.5 10.0 orange 9.5 11.5
- 如何得到按列分组后另一列的第n大的值
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 2, 'taste': np.random.rand(6), 'price': np.random.randint(0, 15, 6)}) print(df) # teste列按fruit分组 df_grpd = df['taste'].groupby(df.fruit) # teste列中banana元素的信息 x=df_grpd.get_group('banana') # 排序并找第2大的值 s = x.sort_values().iloc[-2] print(s) #> fruit taste price 0 apple 0.521990 7 1 banana 0.640444 0 2 orange 0.460509 9 3 apple 0.818963 4 4 banana 0.646138 7 5 orange 0.917056 12 #> 0.6404436436085967
- 如何计算分组dataframe的平均值,并将分组列保留为另一列
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 2, 'rating': np.random.rand(6), 'price': np.random.randint(0, 15, 6)}) # 按fruit分组后,price列的平均值,并将分组置为一列 out = df.groupby('fruit', as_index=False)['price'].mean() print(out) #> fruit price 0 apple 4.0 1 banana 6.5 2 orange 11.0 33.如何获取两列值元素相等的位置(并非索引) df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 3), 'fruit2': np.random.choice(['apple', 'orange', 'banana'], 3)}) print(df) # 获取两列元素相等的行 np.where(df.fruit1 == df.fruit2) #> fruit1 fruit2 0 apple banana 1 apple apple 2 orange apple #> (array([1], dtype=int64),)
- 如何创建指定列偏移后的新列
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), columns = list('abcd')) # 创建往下偏移后的列 df['a_lag1'] = df['a'].shift(1) # 创建往上偏移后的列 df['b_lead1'] = df['b'].shift(-1) print(df) #> a b c d a_lag1 b_lead1 0 29 90 43 24 NaN 36.0 1 94 36 67 66 29.0 76.0 2 81 76 44 49 94.0 97.0 3 55 97 10 74 81.0 43.0 4 32 43 62 62 55.0 NaN
- 如何获得dataframe中单一值的频数
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), columns = list('abcd')) # 统计元素值的个数 pd.value_counts(df.values.ravel()) #> 9 3 7 3 3 3 1 3 6 2 5 2 4 2 8 1 2 1 dtype: int64
-
df = pd.DataFrame(["STD, City State", "33, Kolkata West Bengal", "44, Chennai Tamil Nadu", "40, Hyderabad Telengana", "80, Bangalore Karnataka"], columns=['row']) print(df) # expand=True表示以分割符把字符串分成两列 df_out = df.row.str.split(',|\t', expand=True) # 获取新的列 new_header = df_out.iloc[0] # 重新赋值 df_out = df_out[1:] df_out.columns = new_header print(df_out) #> row 0 STD, City State 1 33, Kolkata West Bengal 2 44, Chennai Tamil Nadu 3 40, Hyderabad Telengana 4 80, Bangalore Karnataka #> 0 STD City State 1 33 Kolkata West Bengal 2 44 Chennai Tamil Nadu 3 40 Hyderabad Telengana 4 80 Bangalore Karnataka
- 如何构建多级索引的dataframe
我们利用元组(Tuple)构建多级索引,然后定义dataframe.# 如何构建多级索引的dataframe # 先通过元组方式构建多级索引 import numpy as np outside = ['A','A','A','B','B','B'] inside =[1,2,3,1,2,3] my_index = list(zip(outside,inside)) # my_index # 转化为pd格式的索引 my_index = pd.MultiIndex.from_tuples(my_index) # my_index # 构建多级索引dataframe df = pd.DataFrame(np.random.randn(6,2),index =my_index,columns=['fea1','fea2']) df 多索引dataframe结果: 获取多索引dataframe的数据: df.loc['A'].iloc[1] #> fea1 -0.794461 fea2 0.882104 Name: 2, dtype: float64 df.loc['A'].iloc[1]['fea1'] #> -0.7944609970323794