• pandas常用语法


    常见的一些pandas的使用方法

    import pandas as pd
    
    # Load food_info.csv into a DataFrame, then show its Python type and the dtype of each column.
    food_info = pd.read_csv('food_info.csv')
    print(type(food_info))
    print(food_info.dtypes)
    # print(help(pd.read_csv))
    
    # Show the first rows. head() with no argument returns 5 rows;
    # food_info.head(3) would return only the first 3.
    print(food_info.head())
    first_rows = food_info.head()
    print(first_rows)
    # NOTE(review): the result of tail(3) is discarded, so nothing is shown when this runs as a script.
    food_info.tail(3)
    
    # Show the column labels and the (rows, columns) shape.
    print(food_info.columns)
    print(food_info.shape)
    
    # Select rows by index label with loc[]: a single label returns that row.
    print(food_info.loc[0])
    # A label slice also works; with loc both end points (3 and 6) are included.
    print(food_info.loc[3:6])
    # Return a DataFrame containing the rows at index 2, 5, 10
    two_five_ten = [2, 5, 10]
    print(food_info.loc[two_five_ten])
    
    # Select a single column by name.
    ndb_col = food_info['NDB_No']
    print(ndb_col)
    # col_name = 'NDB_No'
    # print(food_info[col_name])
    # Passing a list of names selects several columns at once.
    columns = ['Zinc_(mg)', 'Copper_(mg)']
    print(food_info[columns])
    
    # Pick out the columns whose name ends with '(g)' and show the first
    # three rows of that sub-table.
    col_names = food_info.columns.tolist()
    print(col_names)

    gram_columns = [name for name in col_names if name.endswith('(g)')]
    gram_df = food_info[gram_columns]
    print(gram_df.head(3))
    
    # Arithmetic between a column and a scalar is applied to every element.
    iron_mg = food_info['Iron_(mg)']
    print(iron_mg)
    div_1000 = iron_mg / 1000
    print(div_1000)

    # Arithmetic between two columns of the same length works position by position.
    energy_kcal = food_info['Energ_Kcal']
    water_energy = food_info['Water_(g)'] * energy_kcal
    print(water_energy)

    # Assigning a Series under a new key adds it to the table as a new column.
    iron_grams = iron_mg / 1000
    food_info['Iron_(g)'] = iron_grams
    print(food_info.shape)

    # Common reductions: divide every calorie value by the column maximum
    # and store the normalised values as another column.
    max_calories = energy_kcal.max()
    normalized_calories = energy_kcal / max_calories
    food_info["Normalized_cal"] = normalized_calories

    # Sort the table in place by sodium content, ascending first ...
    food_info.sort_values(by='Sodium_(mg)', ascending=True, inplace=True)
    print(food_info['Sodium_(mg)'].head(20))
    # ... then descending.
    food_info.sort_values(by='Sodium_(mg)', ascending=False, inplace=True)
    print(food_info['Sodium_(mg)'].head(20))
    
    import pandas as pd
    import numpy as np
    
    titanic_survival = pd.read_csv('titanic_train.csv')
    print(titanic_survival.head())

    # Find the missing values in the Age column.
    age = titanic_survival['Age']
    print(age.loc[0:10])
    age_is_null = age.isnull()
    print(age_is_null)
    age_null_true = age[age_is_null]
    print(age_null_true)

    age_null_count = len(age_null_true)
    print(age_null_count)

    # Naive mean over the whole column: if the column contains NaN,
    # the sum (and therefore this mean) is NaN as well.
    mean_age = sum(age) / len(age)
    print(mean_age)

    # Keep only the rows whose age is present and average those instead.
    good_ages = age[~age_is_null]
    print(good_ages)
    correct_mean_age = sum(good_ages) / len(good_ages)
    print(correct_mean_age)

    # The same figure can be obtained directly with .mean().
    correct_mean_age = age.mean()
    print(correct_mean_age)
    
    # Mean fare for each passenger class, obtained by masking the rows of
    # one class at a time and averaging their 'Fare' values.
    passenger_class = [1, 2, 3]
    fares_by_class = {
        this_class: titanic_survival.loc[titanic_survival['Pclass'] == this_class, 'Fare'].mean()
        for this_class in passenger_class
    }

    print(fares_by_class)
    
    # Quick grouped statistics with pivot_table:
    #   index   -- the column whose values define the groups
    #   values  -- the column(s) to aggregate inside each group
    #   aggfunc -- the aggregation to apply
    # The aggregation is given by name ('mean', 'sum') rather than as a NumPy
    # callable (np.mean, np.sum): recent pandas versions emit a FutureWarning
    # for the callable form, and the string form gives the same result.
    passenger_survival = titanic_survival.pivot_table(index='Pclass', values='Survived', aggfunc='mean')
    print(passenger_survival)

    # NOTE: this rebinds `passenger_class` to the pivot result (a DataFrame).
    passenger_class = titanic_survival.pivot_table(index='Pclass', values='Fare', aggfunc='mean')
    print(passenger_class)

    passenger_age = titanic_survival.pivot_table(index='Pclass', values='Age', aggfunc='mean')
    print(passenger_age)

    # Several value columns can be aggregated at once; here totals per embarkation port.
    port_stats = titanic_survival.pivot_table(index='Embarked', values=['Fare', 'Survived'], aggfunc='sum')
    print(port_stats)
    
    # Drop every column that contains at least one missing value.
    drop_na_columns = titanic_survival.dropna(axis='columns')
    print(drop_na_columns)
    # Drop the rows (samples) whose 'Age' or 'Sex' is missing.
    new_titanic_survival = titanic_survival.dropna(axis='index', subset=['Age', 'Sex'])
    print(new_titanic_survival)

    # Read one specific cell: the 'Age' of the row labelled 83.
    row_index_83_age = titanic_survival['Age'].loc[83]
    print(row_index_83_age)

    # Sort by age, oldest first; the rows keep their original index labels.
    new_titanic_survival = titanic_survival.sort_values(by='Age', ascending=False)
    print(new_titanic_survival.iloc[0:10])

    # Replace the shuffled labels with a fresh 0..n-1 index (the old one is discarded).
    titanic_reindexed = new_titanic_survival.reset_index(drop=True)
    print('------')
    print(titanic_reindexed.loc[0:10])
    
    # pandas 自定义函数 apply
    def hundredth_row(column):
        """Return the entry of ``column`` stored under index label 99.

        With a default 0-based index this is the hundredth item.
        """
        return column.loc[99]
    
    # Apply the function to the DataFrame (column by column, the default for apply)
    # and print the collected results.
    # NOTE(review): the assignment rebinds the name `hundredth_row` from the function
    # to its result, so the function cannot be called again after this line.
    hundredth_row = titanic_survival.apply(hundredth_row)
    print(hundredth_row)
    
    def not_null_count(column):
        """Return the number of missing entries in ``column``.

        NOTE: despite the name, this counts the null entries, not the
        non-null ones.
        """
        return len(column[pd.isnull(column)])
    
    # Run not_null_count on every column; the result maps each column to its number of null entries.
    column_null_count = titanic_survival.apply(not_null_count)
    print(column_null_count)
    
    # 改变仓位等级
    def which_class(row):
        """Translate the numeric ``row['Pclass']`` into a readable label.

        Returns 'Unknown' when the value is missing, the matching class name
        for 1, 2 or 3, and None for any other value.
        """
        pclass = row['Pclass']
        if pd.isnull(pclass):
            return 'Unknown'

        class_labels = (
            (1, 'First Class'),
            (2, 'Second Class'),
            (3, 'Third Class'),
        )
        for number, label in class_labels:
            if pclass == number:
                return label
        return None
    # axis=1 passes each row (rather than each column) to which_class.
    classes = titanic_survival.apply(which_class, axis=1)
    print(classes)
    
    # 年龄离散化
    def is_minor(row):
        """Return True when ``row['Age']`` is below 18, otherwise False.

        A NaN age compares as not below 18 and therefore yields False.
        """
        return True if row['Age'] < 18 else False
    
    # Evaluate is_minor for each row (axis=1) and print the resulting True/False values.
    minors = titanic_survival.apply(is_minor, axis=1)
    print(minors)
    
    def generate_age_label(row):
        """Bucket ``row['Age']`` into 'Unknown' (missing), 'minor' (< 18) or 'adult'."""
        age = row['Age']
        if pd.isnull(age):
            return 'Unknown'
        return 'minor' if age < 18 else 'adult'
    # Label every row (axis=1) with its age group.
    age_label = titanic_survival.apply(generate_age_label, axis=1)
    print(age_label)
    
    # Store the labels as a new column and group by it; with no aggfunc given,
    # pivot_table uses its default aggregation (the mean) on 'Survived'.
    titanic_survival['age_labels'] = age_label
    age_group_survival = titanic_survival.pivot_table(index='age_labels', values='Survived')
    print(age_group_survival)
    
    # The Series structure
    # A DataFrame is a collection of Series objects.
    # A Series object can hold many data types.
    import pandas as pd
    # Load fandango_scores.csv; selecting a single column yields a Series (its type is printed below).
    fangango = pd.read_csv("fandango_scores.csv")
    series_film = fangango['FILM']
    print(type(series_film))
    # Slicing a Series with [0:5] returns its first five entries.
    print(series_film[0:5])
    series_rt = fangango['RottenTomatoes']
    print(series_rt[0:5])
    
    # 改变Series索引
    from pandas import Series
    
    # .values exposes the data held by a Series as an array; its type is printed below.
    film_names = series_film.values
    print(type(film_names))
    print(film_names)
    print('------')
    rt_scores = series_rt.values
    print(rt_scores)
    
    # Build a Series of scores that is indexed by film title instead of by integers.
    series_custom = Series(rt_scores, index=film_names)
    # Entries can now be looked up by title ...
    print(series_custom[['Minions (2015)', 'Max (2015)']])
    # ... while an integer slice still selects by position.
    fiveten = series_custom[5:10]
    print(fiveten)
    
    # Sorting, variant 1: sort the index labels by hand and reindex the Series in that order.
    original_index = series_custom.index.tolist()
    print(original_index)
    sorted_index = sorted(original_index)
    sort_by_index = series_custom.reindex(sorted_index)
    print(sort_by_index)
    
    # Sorting, variant 2: use the built-in helpers to sort by index label or by value.
    sc2 = series_custom.sort_index()
    sc3 = series_custom.sort_values()
    print(sc2[0:10])
    print(sc3[0:10])
    
    # Series 相加 和常见函数
    import numpy as np
    # NumPy functions accept a Series directly; adding the Series to itself doubles every value.
    print(np.add(series_custom, series_custom))
    # NOTE(review): the results of the next two calls are discarded, so nothing is
    # shown when this runs as a script.
    np.sin(series_custom)
    np.max(series_custom)
    
    # Conditional expressions: comparing a Series with a scalar yields a boolean
    # Series, which can then be used as a mask to keep only the matching entries.
    print(series_custom > 50)
    # The filter uses the same threshold (50) as the mask printed above and as the
    # variable name promises; the earlier code filtered on 80 by mistake.
    series_greater_than_50 = series_custom[series_custom > 50]
    print(series_greater_than_50)
    
    # Series that share the same index can be combined directly; values are matched by label.
    # (`rt_citics` holds the RottenTomatoes column, `rt_users` the RottenTomatoes_User column.)
    rt_citics = Series(fangango['RottenTomatoes'].values, index=fangango['FILM'])
    rt_users = Series(fangango['RottenTomatoes_User'].values, index=fangango['FILM'])
    rt_mean = (rt_citics + rt_users) / 2
    print(rt_mean)
    
    # A DataFrame can be given a custom index too; drop=False keeps 'FILM' as a regular column as well.
    fangango_films = fangango.set_index('FILM', drop=False)
    print(fangango_films)
    
    # With a non-numeric index, rows can still be sliced by label, either with
    # plain [] or with loc.
    print(fangango_films['Avengers: Age of Ultron (2015)':'Hot Tub Time Machine 2 (2015)'])
    print(fangango_films.loc['Avengers: Age of Ultron (2015)':'Hot Tub Time Machine 2 (2015)'])
    
    # Use apply with an anonymous function (lambda).
    # First inspect the dtype of every column ...
    types = fangango_films.dtypes
    print(types)
    # ... keep the labels of the columns whose dtype is float64 and select just those columns ...
    float_columns = types[types.values == 'float64'].index
    float_df = fangango_films[float_columns]
    print(float_df)
    # ... then compute np.std for each of the selected columns.
    rt_mt_user = float_df.apply(lambda x: np.std(x))
    print(rt_mt_user)
    
  • 相关阅读:
    shelve模块和xml模块
    time模块,random模块和shutil模块
    包的使用
    目录开发规范
    redis 初步认识四(redis锁,防并发)
    redis 初步认识三(设置登录密码)
    redis 初步认识二(c#调用redis)
    微信小程序 初步认识一(微信运动步数)
    redis 初步认识一(下载安装redis)
    c# 7.0 6.0 新语法
  • 原文地址:https://www.cnblogs.com/jly1/p/12990790.html
Copyright © 2020-2023  润新知