• 数据分析处理库pandas


    老唐数据分析机器学习
    #
    # pandas_1
    # Load a CSV file into a DataFrame, inspect its dtypes, and select rows and
    # columns. The commented-out lines are alternative calls to experiment with.

    import pandas

    food_info = pandas.read_csv("food_info.csv")
    #print(type(food_info))
    print(food_info.dtypes)
    '''
    NDB_No               int64
    Shrt_Desc           object
    Water_(g)          float64
    Energ_Kcal           int64
    Protein_(g)        float64
    Lipid_Tot_(g)      float64
    Ash_(g)            float64
    Carbohydrt_(g)     float64
    Fiber_TD_(g)       float64
    Sugar_Tot_(g)      float64
    Calcium_(mg)       float64
    Iron_(mg)          float64
    Magnesium_(mg)     float64
    Phosphorus_(mg)    float64
    Potassium_(mg)     float64
    Sodium_(mg)        float64
    Zinc_(mg)          float64
    Copper_(mg)        float64
    Manganese_(mg)     float64
    Selenium_(mcg)     float64
    Vit_C_(mg)         float64
    Thiamin_(mg)       float64
    Riboflavin_(mg)    float64
    Niacin_(mg)        float64
    Vit_B6_(mg)        float64
    Vit_B12_(mcg)      float64
    Vit_A_IU           float64
    Vit_A_RAE          float64
    Vit_E_(mg)         float64
    Vit_D_mcg          float64
    Vit_D_IU           float64
    Vit_K_(mcg)        float64
    FA_Sat_(g)         float64
    FA_Mono_(g)        float64
    FA_Poly_(g)        float64
    Cholestrl_(mg)     float64
    dtype: object
    '''

    #first_rows = food_info.head()
    #print(first_rows)
    #print(food_info.head(3))
    #print(food_info.columns)
    #print(food_info.shape)

    # pandas uses zero-indexing.
    # Series object representing the row at index 0.
    #print(food_info.loc[0])
    # Series object representing the seventh row.
    #food_info.loc[6]
    # Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
    #food_info.loc[8620]

    # The object dtype is equivalent to a string in Python.
    # object   - For string values
    # int      - For integer values
    # float    - For float values
    # datetime - For time values
    # bool     - For Boolean values
    #print(food_info.dtypes)

    # Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
    #food_info.loc[3:6]

    # Returns a DataFrame containing the rows at indexes 2, 5, and 10.
    # Either of the following approaches will work.
    # Method 1
    #two_five_ten = [2,5,10]
    #food_info.loc[two_five_ten]
    # Method 2
    #food_info.loc[[2,5,10]]

    # Series object representing the "NDB_No" column.
    #ndb_col = food_info["NDB_No"]
    #print(ndb_col)
    # Alternatively, you can access a column by passing in a string variable.
    #col_name = "NDB_No"
    #ndb_col = food_info[col_name]

    #columns = ["Zinc_(mg)", "Copper_(mg)"]
    #zinc_copper = food_info[columns]
    #print(zinc_copper)
    # Skipping the assignment.
    #zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]

    #print(food_info.columns)
    #print(food_info.head(2))
    col_names = food_info.columns.tolist()
    #print(col_names)

    # Keep only the columns whose name ends with "(g)", i.e. measured in grams.
    gram_columns = [c for c in col_names if c.endswith("(g)")]
    gram_df = food_info[gram_columns]
    print(gram_df.head(3))
    '''
       Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)
    0      15.87         0.85          81.11     2.11            0.06
    1      15.87         0.85          81.11     2.11            0.06
    2       0.24         0.28          99.48     0.00            0.00

       Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)
    0           0.0           0.06      51.368       21.021        3.043
    1           0.0           0.06      50.489       23.426        3.012
    2           0.0           0.00      61.924       28.732        3.694
    '''
    # pandas_2
    
    import pandas
    food_info = pandas.read_csv("food_info.csv")
    col_names = food_info.columns.tolist()
    print(col_names)
    print(food_info.head(3))
    '''
    ['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']
       NDB_No                 Shrt_Desc  Water_(g)  Energ_Kcal  Protein_(g)  
    0    1001          BUTTER WITH SALT      15.87         717         0.85   
    1    1002  BUTTER WHIPPED WITH SALT      15.87         717         0.85   
    2    1003      BUTTER OIL ANHYDROUS       0.24         876         0.28   
    
       Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  Fiber_TD_(g)  Sugar_Tot_(g)  ...  
    0          81.11     2.11            0.06           0.0           0.06  ...   
    1          81.11     2.11            0.06           0.0           0.06  ...   
    2          99.48     0.00            0.00           0.0           0.00  ...   
    
       Vit_A_IU  Vit_A_RAE  Vit_E_(mg)  Vit_D_mcg  Vit_D_IU  Vit_K_(mcg)  
    0    2499.0      684.0        2.32        1.5      60.0          7.0   
    1    2499.0      684.0        2.32        1.5      60.0          7.0   
    2    3069.0      840.0        2.80        1.8      73.0          8.6   
    
       FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  Cholestrl_(mg)  
    0      51.368       21.021        3.043           215.0  
    1      50.489       23.426        3.012           219.0  
    2      61.924       28.732        3.694           256.0  
    
    [3 rows x 36 columns]
    '''
    
    #print food_info["Iron_(mg)"]
    #div_1000 = food_info["Iron_(mg)"] / 1000
    #print div_1000
    # Adds 100 to each value in the column and returns a Series object.
    #add_100 = food_info["Iron_(mg)"] + 100
    
    # Subtracts 100 from each value in the column and returns a Series object.
    #sub_100 = food_info["Iron_(mg)"] - 100
    
    # Multiplies each value in the column by 2 and returns a Series object.
    #mult_2 = food_info["Iron_(mg)"]*2
    
    
    # Arithmetic between two columns is applied element-wise: the first value of
    # one column is combined with the first value of the other, and so on.
    water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
    # Convert iron from milligrams to grams and store it as a new column.
    iron_grams = food_info["Iron_(mg)"] / 1000
    food_info["Iron_(g)"] = iron_grams
    
    #Score=2×(Protein_(g))−0.75×(Lipid_Tot_(g))
    weighted_protein = food_info["Protein_(g)"] * 2
    weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
    initial_rating = weighted_protein + weighted_fat
    
    # the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79
    #For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result, 
    #due to the scale of the values
    # The largest value in the "Energ_Kcal" column.
    max_calories = food_info["Energ_Kcal"].max()
    # Divide the values in "Energ_Kcal" by the largest value.
    normalized_calories = food_info["Energ_Kcal"] / max_calories
    normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
    normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
    food_info["Normalized_Protein"] = normalized_protein
    food_info["Normalized_Fat"] = normalized_fat
    
    #By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
    # Sorts the DataFrame in-place, rather than returning a new DataFrame.
    #print food_info["Sodium_(mg)"]
    food_info.sort_values("Sodium_(mg)", inplace=True)
    print (food_info["Sodium_(mg)"])
    #Sorts by descending order, rather than ascending.
    food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
    print (food_info["Sodium_(mg)"])
    '''
    760     0.0
    758     0.0
    405     0.0
    761     0.0
    2269    0.0
           ... 
    8184    NaN
    8185    NaN
    8195    NaN
    8251    NaN
    8267    NaN
    Name: Sodium_(mg), Length: 8618, dtype: float64
    276     38758.0
    5814    27360.0
    6192    26050.0
    1242    26000.0
    1245    24000.0
             ...   
    8184        NaN
    8185        NaN
    8195        NaN
    8251        NaN
    8267        NaN
    Name: Sodium_(mg), Length: 8618, dtype: float64
    '''
    # pandas_3
    
    import pandas as pd
    import numpy as np
    titanic_survival = pd.read_csv("titanic_train.csv")
    titanic_survival.head()
    
    #The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
    #we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
    age = titanic_survival["Age"]
    # print(age.loc[0:10])
    age_is_null = pd.isnull(age)
    # print (age_is_null)
    age_null_true = age[age_is_null]
    print (age_null_true)
    age_null_count = len(age_null_true)
    print(age_null_count) 
    '''
    5     NaN
    17    NaN
    19    NaN
    26    NaN
    28    NaN
           ..
    859   NaN
    863   NaN
    868   NaN
    878   NaN
    888   NaN
    Name: Age, Length: 177, dtype: float64
    177
    '''
    
    #The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
    mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
    print (mean_age)
    '''
    nan
    '''
    
    # We have to filter out the missing values before we calculate the mean.
    # `~` inverts the boolean mask, so only rows whose age is present are kept.
    good_ages = titanic_survival["Age"][~age_is_null]
    #print(good_ages)
    correct_mean_age = sum(good_ages) / len(good_ages)
    print(correct_mean_age)
    '''
    29.69911764705882
    '''
    
    # missing data is so common that many pandas methods automatically filter for it
    correct_mean_age = titanic_survival["Age"].mean()
    print(correct_mean_age)
    '''
    29.69911764705882
    '''
    
    # Mean fare for each passenger class, keyed by the class number.
    passenger_classes = [1, 2, 3]
    fares_by_class = {
        # Select the "Fare" values of the rows in this class, then average them.
        pclass: titanic_survival.loc[titanic_survival["Pclass"] == pclass, "Fare"].mean()
        for pclass in passenger_classes
    }
    print (fares_by_class)
    '''
    {1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
    '''
    
    # index tells the method which column to group by
    # values is the column that we want to apply the calculation to
    # aggfunc specifies the calculation we want to perform; the aggregation is
    # named by string ("mean") because recent pandas versions warn when a NumPy
    # callable such as np.mean is passed here.
    passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
    print (passenger_survival)
    '''
              Survived
    Pclass          
    1       0.629630
    2       0.472826
    3       0.242363
    '''
    
    passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
    print(passenger_age)
    '''
                  Age
    Pclass           
    1       38.233441
    2       29.877630
    3       25.140620
    '''    
    
    # Total fare and number of survivors per port of embarkation. The aggregation
    # is named by string ("sum") because recent pandas versions warn when a NumPy
    # callable such as np.sum is passed as aggfunc.
    port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc="sum")
    print(port_stats)
    '''
                    Fare  Survived
    Embarked                      
    C         10072.2962        93
    Q          1022.2543        30
    S         17439.3988       217
    '''
    
    #specifying axis=1 or axis='columns' will drop any columns that have null values
    drop_na_columns = titanic_survival.dropna(axis=1)
    new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
    
    print (drop_na_columns.shape)
    '''
    (891, 9)
    '''
    
    # .loc[row_label, column_label] returns the single value at that position.
    row_index_83_age = titanic_survival.loc[83,"Age"]
    # Named after the row label actually read (766).
    row_index_766_pclass = titanic_survival.loc[766,"Pclass"]
    print(row_index_83_age)
    print(row_index_766_pclass)
    '''
    28.0
    1
    '''
    
    # Sort the passengers by age in descending order; the rows keep their
    # original index labels, as the first printout shows.
    new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
    print (new_titanic_survival[0:10])
    titanic_reindexed = new_titanic_survival.reset_index(drop=True) # drop=True discards the old index and generates a new one
    print(titanic_reindexed.iloc[0:10])
    '''
         PassengerId  Survived  Pclass                                  Name  
    630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
    851          852         0       3                   Svensson, Mr. Johan   
    493          494         0       1               Artagaveytia, Mr. Ramon   
    96            97         0       1             Goldschmidt, Mr. George B   
    116          117         0       3                  Connors, Mr. Patrick   
    672          673         0       2           Mitchell, Mr. Henry Michael   
    745          746         0       1          Crosby, Capt. Edward Gifford   
    33            34         0       2                 Wheadon, Mr. Edward H   
    54            55         0       1        Ostby, Mr. Engelhart Cornelius   
    280          281         0       3                      Duane, Mr. Frank   
    
          Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
    630  male  80.0      0      0       27042  30.0000   A23        S  
    851  male  74.0      0      0      347060   7.7750   NaN        S  
    493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
    96   male  71.0      0      0    PC 17754  34.6542    A5        C  
    116  male  70.5      0      0      370369   7.7500   NaN        Q  
    672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
    745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
    33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
    54   male  65.0      0      1      113509  61.9792   B30        C  
    280  male  65.0      0      0      336439   7.7500   NaN        Q  
       PassengerId  Survived  Pclass                                  Name   Sex  
    0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
    1          852         0       3                   Svensson, Mr. Johan  male   
    2          494         0       1               Artagaveytia, Mr. Ramon  male   
    3           97         0       1             Goldschmidt, Mr. George B  male   
    4          117         0       3                  Connors, Mr. Patrick  male   
    5          673         0       2           Mitchell, Mr. Henry Michael  male   
    6          746         0       1          Crosby, Capt. Edward Gifford  male   
    7           34         0       2                 Wheadon, Mr. Edward H  male   
    8           55         0       1        Ostby, Mr. Engelhart Cornelius  male   
    9          281         0       3                      Duane, Mr. Frank  male   
    
        Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
    0  80.0      0      0       27042  30.0000   A23        S  
    1  74.0      0      0      347060   7.7750   NaN        S  
    2  71.0      0      0    PC 17609  49.5042   NaN        C  
    3  71.0      0      0    PC 17754  34.6542    A5        C  
    4  70.5      0      0      370369   7.7500   NaN        Q  
    5  70.0      0      0  C.A. 24580  10.5000   NaN        S  
    6  70.0      1      1   WE/P 5735  71.0000   B22        S  
    7  66.0      0      0  C.A. 24579  10.5000   NaN        S  
    8  65.0      0      1      113509  61.9792   B30        C  
    9  65.0      0      0      336439   7.7500   NaN        Q  
    '''
    
    # This function returns the hundredth item from a series
    def hundredth_row(column):
        """Return the hundredth item of ``column``.

        The item is looked up by position (``.iloc[99]``), not by index label,
        so ``column`` must hold at least 100 items.
        """
        return column.iloc[99]
    
    # Return the hundredth item from each column. The result gets its own name
    # so that it does not rebind (and thereby hide) the hundredth_row function.
    hundredth_row_values = titanic_survival.apply(hundredth_row)
    print (hundredth_row_values)
    '''
    PassengerId                  100
    Survived                       0
    Pclass                         2
    Name           Kantor, Mr. Sinai
    Sex                         male
    Age                           34
    SibSp                          1
    Parch                          0
    Ticket                    244367
    Fare                          26
    Cabin                        NaN
    Embarked                       S
    dtype: object
    '''
    
    # Count the missing (null) values in each column.
    def not_null_count(column):
        """Return the number of null entries in ``column``.

        Note: despite its name, this function counts the entries that ARE null,
        not the ones that are not.
        """
        null_mask = pd.isnull(column)
        # Each True in the mask marks one missing value, so summing counts them.
        return int(null_mask.sum())
    
    column_null_count = titanic_survival.apply(not_null_count)
    print (column_null_count)
    '''
    PassengerId      0
    Survived         0
    Pclass           0
    Name             0
    Sex              0
    Age            177
    SibSp            0
    Parch            0
    Ticket           0
    Fare             0
    Cabin          687
    Embarked         2
    age_labels       0
    dtype: int64
    '''
    
    # len(titanic_survival[pd.isnull(titanic_survival)])
    # titanic_survival
    
    #By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
    def which_class(row):
        """Translate the ``Pclass`` value of ``row`` into a readable label.

        Returns "Unknown" when the value is null, "First Class" /
        "Second Class" / "Third Class" for 1 / 2 / 3, and None for any
        other value.
        """
        pclass = row['Pclass']
        if pd.isnull(pclass):
            return "Unknown"
        class_labels = (
            (1, "First Class"),
            (2, "Second Class"),
            (3, "Third Class"),
        )
        for number, label in class_labels:
            if pclass == number:
                return label
        return None
    
    classes = titanic_survival.apply(which_class, axis=1)
    print (classes)
    '''
    0       Third Class
    1       First Class
    2       Third Class
    3       First Class
    4       Third Class
               ...     
    886    Second Class
    887     First Class
    888     Third Class
    889     First Class
    890     Third Class
    Length: 891, dtype: object
    '''
    
    def is_minor(row):
        """Return True when the ``Age`` value of ``row`` is below 18, else False.

        A missing age (NaN) compares as not-less-than 18 and so yields False.
        """
        return bool(row["Age"] < 18)
    
    minors = titanic_survival.apply(is_minor, axis=1)
    #print minors
    # Discretization: turn the numeric age into a categorical label.
    def generate_age_label(row):
        """Return "unknown", "minor" or "adult" for the ``Age`` value of ``row``.

        A null age gives "unknown"; an age below 18 gives "minor"; any other
        age gives "adult".
        """
        age = row["Age"]
        if pd.isnull(age):
            return "unknown"
        return "minor" if age < 18 else "adult"
    
    age_labels = titanic_survival.apply(generate_age_label, axis=1)
    print (age_labels)
    '''
    0        adult
    1        adult
    2        adult
    3        adult
    4        adult
            ...   
    886      adult
    887      adult
    888    unknown
    889      adult
    890      adult
    Length: 891, dtype: object
    '''
    
    titanic_survival['age_labels'] = age_labels
    age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
    print(age_group_survival)
    '''
                Survived
    age_labels          
    adult       0.381032
    minor       0.539823
    unknown     0.293785
    '''
    # pandas_4
    
    #Series (collection of values)
    #DataFrame (collection of Series objects)
    #Panel (collection of DataFrame objects) - removed in pandas 0.25; use a DataFrame with a MultiIndex instead
    
    #A Series object can hold many data types, including
    #float - for representing float values
    #int - for representing integer values
    #bool - for representing Boolean values
    #datetime64[ns] - for representing date & time, without time-zone
    #datetime64[ns, tz] - for representing date & time, with time-zone
    #timedelta[ns] - for representing differences in dates & times (seconds, minutes, etc.)
    #category - for representing categorical values
    #object - for representing String values
    
    #FILM - film name
    #RottenTomatoes - Rotten Tomatoes critics average score
    #RottenTomatoes_User - Rotten Tomatoes user average score
    #RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
    #RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
    #Metacritic - Metacritic critics average score
    #Metacritic_User - Metacritic user average score
    
    import pandas as pd
    fandango = pd.read_csv('fandango_score_comparison.csv')
    series_film = fandango['FILM']
    print(type(series_film))
    print('=========================')
    print(series_film[0:5])
    print('=========================')
    series_rt = fandango['RottenTomatoes']
    print (series_rt[0:5])
    '''
    <class 'pandas.core.series.Series'>
    =========================
    0    Avengers: Age of Ultron (2015)
    1                 Cinderella (2015)
    2                    Ant-Man (2015)
    3            Do You Believe? (2015)
    4     Hot Tub Time Machine 2 (2015)
    Name: FILM, dtype: object
    =========================
    0    74
    1    85
    2    80
    3    18
    4    14
    Name: RottenTomatoes, dtype: int64
    '''
    
    fandango.head()
    '''
        FILM    RottenTomatoes    RottenTomatoes_User    Metacritic    Metacritic_User    IMDB    Fandango_Stars    Fandango_Ratingvalue    RT_norm    RT_user_norm    ...    IMDB_norm    RT_norm_round    RT_user_norm_round    Metacritic_norm_round    Metacritic_user_norm_round    IMDB_norm_round    Metacritic_user_vote_count    IMDB_user_vote_count    Fandango_votes    Fandango_Difference
    0    Avengers: Age of Ultron (2015)    74    86    66    7.1    7.8    5.0    4.5    3.70    4.3    ...    3.90    3.5    4.5    3.5    3.5    4.0    1330    271107    14846    0.5
    1    Cinderella (2015)    85    80    67    7.5    7.1    5.0    4.5    4.25    4.0    ...    3.55    4.5    4.0    3.5    4.0    3.5    249    65709    12640    0.5
    2    Ant-Man (2015)    80    90    64    8.1    7.8    5.0    4.5    4.00    4.5    ...    3.90    4.0    4.5    3.0    4.0    4.0    627    103660    12055    0.5
    3    Do You Believe? (2015)    18    84    22    4.7    5.4    5.0    4.5    0.90    4.2    ...    2.70    1.0    4.0    1.0    2.5    2.5    31    3136    1793    0.5
    4    Hot Tub Time Machine 2 (2015)    14    28    29    3.4    5.1    3.5    3.0    0.70    1.4    ...    2.55    0.5    1.5    1.5    1.5    2.5    88    19560    1021    0.5
    5 rows × 22 columns
    '''
    
    # fandango.loc[[0,1],['FILM','RottenTomatoes']]
    # fandango.FILM[0]
    fandango.iloc[1,2]
    '''
    80
    '''
    
    # Import the Series object from pandas
    from pandas import Series
    
    film_names = series_film.values
    print (type(film_names))
    # print (film_names)
    #print film_names
    rt_scores = series_rt.values
    #print (rt_scores)
    series_custom = Series(rt_scores , index=film_names)
    series_custom[['Minions (2015)', 'Leviathan (2014)']]
    '''
    <class 'numpy.ndarray'>
    Minions (2015)      54
    Leviathan (2014)    99
    dtype: int64
    '''
    
    # Integer (positional) indexing is also available on a Series that has a
    # string index.
    series_custom = Series(rt_scores , index=film_names)
    print(series_custom[['Minions (2015)', 'Leviathan (2014)']])
    # .iloc makes it explicit that 5:10 selects by position, not by label.
    fiveten = series_custom.iloc[5:10]
    print(fiveten)
    '''
    Minions (2015)      54
    Leviathan (2014)    99
    dtype: int64
    The Water Diviner (2015)        63
    Irrational Man (2015)           42
    Top Five (2014)                 86
    Shaun the Sheep Movie (2015)    99
    Love & Mercy (2015)             89
    dtype: int64
    '''
    
    original_index = series_custom.index.tolist()
    # print(original_index)
    sorted_index = sorted(original_index)
    sorted_by_index = series_custom.reindex(sorted_index)
    print (sorted_by_index)
    '''
    '71 (2015)                          97
    5 Flights Up (2015)                 52
    A Little Chaos (2015)               40
    A Most Violent Year (2014)          90
    About Elly (2015)                   97
                                        ..
    What We Do in the Shadows (2015)    96
    When Marnie Was There (2015)        89
    While We're Young (2015)            83
    Wild Tales (2014)                   96
    Woman in Gold (2015)                52
    Length: 146, dtype: int64
    '''
    
    sc2 = series_custom.sort_index()
    sc3 = series_custom.sort_values()
    #print(sc2[0:10])
    print(sc3[0:10])
    '''
    Paul Blart: Mall Cop 2 (2015)     5
    Hitman: Agent 47 (2015)           7
    Hot Pursuit (2015)                8
    Fantastic Four (2015)             9
    Taken 3 (2015)                    9
    The Boy Next Door (2015)         10
    The Loft (2015)                  11
    Unfinished Business (2015)       11
    Mortdecai (2015)                 12
    Seventh Son (2015)               12
    dtype: int64
    '''
    
    #The values in a Series object are treated as an ndarray, the core data type in NumPy
    import numpy as np
    # Add each value with each other
    print (np.add(series_custom, series_custom))
    # Apply sine function to each value
    np.sin(series_custom)
    # Return the highest value (will return a single value not a Series)
    np.max(series_custom)
    '''
    Avengers: Age of Ultron (2015)               148
    Cinderella (2015)                            170
    Ant-Man (2015)                               160
    Do You Believe? (2015)                        36
    Hot Tub Time Machine 2 (2015)                 28
                                                ... 
    Mr. Holmes (2015)                            174
    '71 (2015)                                   194
    Two Days, One Night (2014)                   194
    Gett: The Trial of Viviane Amsalem (2015)    200
    Kumiko, The Treasure Hunter (2015)           174
    Length: 146, dtype: int64
    
    100
    '''
    
    #will actually return a Series object with a boolean value for each film
    series_custom > 50
    series_greater_than_50 = series_custom[series_custom > 50]
    
    criteria_one = series_custom > 50
    criteria_two = series_custom < 75
    both_criteria = series_custom[criteria_one & criteria_two]
    print(both_criteria)
    '''
    Avengers: Age of Ultron (2015)                                            74
    The Water Diviner (2015)                                                  63
    Unbroken (2014)                                                           51
    Southpaw (2015)                                                           59
    Insidious: Chapter 3 (2015)                                               59
    The Man From U.N.C.L.E. (2015)                                            68
    Run All Night (2015)                                                      60
    5 Flights Up (2015)                                                       52
    Welcome to Me (2015)                                                      71
    Saint Laurent (2015)                                                      51
    Maps to the Stars (2015)                                                  60
    Pitch Perfect 2 (2015)                                                    67
    The Age of Adaline (2015)                                                 54
    The DUFF (2015)                                                           71
    Ricki and the Flash (2015)                                                64
    Unfriended (2015)                                                         60
    American Sniper (2015)                                                    72
    The Hobbit: The Battle of the Five Armies (2014)                          61
    Paper Towns (2015)                                                        55
    Big Eyes (2014)                                                           72
    Maggie (2015)                                                             54
    Focus (2015)                                                              57
    The Second Best Exotic Marigold Hotel (2015)                              62
    The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015)    67
    Escobar: Paradise Lost (2015)                                             52
    Into the Woods (2014)                                                     71
    Inherent Vice (2014)                                                      73
    Magic Mike XXL (2015)                                                     62
    Woman in Gold (2015)                                                      52
    The Last Five Years (2015)                                                60
    Jurassic World (2015)                                                     71
    Minions (2015)                                                            54
    Spare Parts (2015)                                                        52
    dtype: int64
    '''
    
    #data alignment same index
    rt_critics = Series(fandango['RottenTomatoes'].values, index=fandango['FILM'])
    rt_users = Series(fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
    rt_mean = (rt_critics + rt_users)/2
    
    print(rt_mean)
    '''
    FILM
    Avengers: Age of Ultron (2015)               80.0
    Cinderella (2015)                            82.5
    Ant-Man (2015)                               85.0
    Do You Believe? (2015)                       51.0
    Hot Tub Time Machine 2 (2015)                21.0
                                                 ... 
    Mr. Holmes (2015)                            82.5
    '71 (2015)                                   89.5
    Two Days, One Night (2014)                   87.5
    Gett: The Trial of Viviane Amsalem (2015)    90.5
    Kumiko, The Treasure Hunter (2015)           75.0
    Length: 146, dtype: float64
    '''
    # pandas_5
    
    import pandas as pd
    
    #set_index() returns a new DataFrame that is indexed by the values in the specified column.
    #By default that column is dropped from the DataFrame;
    #passing drop=False, as done here, keeps the FILM column as a regular column too.
    fandango = pd.read_csv('fandango_score_comparison.csv')
    print (type(fandango))
    fandango_films = fandango.set_index('FILM', drop=False)
    #print(fandango_films.index)
    '''
    <class 'pandas.core.frame.DataFrame'>
    '''
    
    # Slice using either bracket notation or loc[]
    fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
    fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
    
    # Specific movie
    fandango_films.loc['Kumiko, The Treasure Hunter (2015)']
    
    # Selecting list of movies
    movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
    fandango_films.loc[movies]
    
    #When selecting multiple rows, a DataFrame is returned, 
    #but when selecting an individual row, a Series object is returned instead
    '''
        FILM    RottenTomatoes    RottenTomatoes_User    Metacritic    Metacritic_User    IMDB    Fandango_Stars    Fandango_Ratingvalue    RT_norm    RT_user_norm    ...    IMDB_norm    RT_norm_round    RT_user_norm_round    Metacritic_norm_round    Metacritic_user_norm_round    IMDB_norm_round    Metacritic_user_vote_count    IMDB_user_vote_count    Fandango_votes    Fandango_Difference
    FILM                                                                                    
    Kumiko, The Treasure Hunter (2015)    Kumiko, The Treasure Hunter (2015)    87    63    68    6.4    6.7    3.5    3.5    4.35    3.15    ...    3.35    4.5    3.0    3.5    3.0    3.5    19    5289    41    0.0
    Do You Believe? (2015)    Do You Believe? (2015)    18    84    22    4.7    5.4    5.0    4.5    0.90    4.20    ...    2.70    1.0    4.0    1.0    2.5    2.5    31    3136    1793    0.5
    Ant-Man (2015)    Ant-Man (2015)    80    90    64    8.1    7.8    5.0    4.5    4.00    4.50    ...    3.90    4.0    4.5    3.0    4.0    4.0    627    103660    12055    0.5
    3 rows × 22 columns
    '''
    
    #The apply() method in Pandas allows us to specify Python logic
    #The apply() method requires you to pass in a vectorized operation 
    #that can be applied over each Series object.
    import numpy as np
    
    # returns the data types as a Series
    types = fandango_films.dtypes
    #print types
    # filter data types to just floats, index attributes returns just column names
    float_columns = types[types.values == 'float64'].index
    # use bracket notation to filter columns to just float columns
    float_df = fandango_films[float_columns]
    #print float_df
    # `x` is a Series object representing a column
    deviations = float_df.apply(lambda x: np.std(x))
    
    print(deviations)
    '''
    Metacritic_User               1.505529
    IMDB                          0.955447
    Fandango_Stars                0.538532
    Fandango_Ratingvalue          0.501106
    RT_norm                       1.503265
    RT_user_norm                  0.997787
    Metacritic_norm               0.972522
    Metacritic_user_nom           0.752765
    IMDB_norm                     0.477723
    RT_norm_round                 1.509404
    RT_user_norm_round            1.003559
    Metacritic_norm_round         0.987561
    Metacritic_user_norm_round    0.785412
    IMDB_norm_round               0.501043
    Fandango_Difference           0.152141
    dtype: float64
    '''
    
    rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
    rt_mt_user.apply(lambda x: np.std(x), axis=1)
    '''
    FILM
    Avengers: Age of Ultron (2015)               0.375
    Cinderella (2015)                            0.125
    Ant-Man (2015)                               0.225
    Do You Believe? (2015)                       0.925
    Hot Tub Time Machine 2 (2015)                0.150
                                                 ...  
    Mr. Holmes (2015)                            0.025
    '71 (2015)                                   0.175
    Two Days, One Night (2014)                   0.250
    Gett: The Trial of Viviane Amsalem (2015)    0.200
    Kumiko, The Treasure Hunter (2015)           0.025
    Length: 146, dtype: float64
    '''
  • 相关阅读:
    打印二叉树和为某一值的路径
    顺时针打印数组
    算术表达式
    堆内存与栈内存详解
    【腾讯校招在线考试附加题】将一个10进制数转换为四位定长的36进制数
    反转链表
    记录github出错及解决方案
    centos7操作防火墙
    无法在web.xml或使用此应用程序部署的jar文件中解析绝对uri:[http://java.sun.com/jsp/jstl/core]解决方法
    MyBatis联表查询——别名方式
  • 原文地址:https://www.cnblogs.com/LXL616/p/12036696.html
Copyright © 2020-2023  润新知