• 速战速决 Python python 第三方库(pandas): DataFrame连接,去重,空值处理


    速战速决 Python https://github.com/webabcd/PythonSample
    作者 webabcd

    速战速决 Python - python 第三方库(pandas): DataFrame连接,去重,空值处理

    示例如下:

    thirdLib/pandas/sample3.py

    # pandas
    #   DataFrame 连接:append(), concat(), merge(), join()
    #   DataFrame 去重
    #   DataFrame 空值处理
    
    import pandas as pd
    
    data1 = {
        "name": ["zhao", "qian"],
        "age": [40, 25], 
        "city": ["beijing ", "beijing"],
        "gender": ["M", "F"]
    } 
    a = pd.DataFrame(data=data1) 
    data2 = { 
        "name": ["qian", "sun"],
        "age": [25, 22], 
        "city": ["beijing", "shanghai"],
        "salary": [100, 150]
    } 
    b = pd.DataFrame(data=data2) 
    print(a)
    '''
       name  age      city gender
    0  zhao   40  beijing       M
    1  qian   25   beijing      F
    '''
    print(b)
    '''
       name  age      city  salary
    0  qian   25   beijing     100
    1   sun   22  shanghai     150
    '''
    
    # append() 在 DataFrame 数据的末尾追加指定的 DataFrame 数据
    print(a.append(b))
    '''
       name  age      city gender  salary
    0  zhao   40  beijing       M     NaN
    1  qian   25   beijing      F     NaN
    0  qian   25   beijing    NaN   100.0
    1   sun   22  shanghai    NaN   150.0
    '''
    
    
    # drop_duplicates() 去重
    #   subset - 根据指定的字段去重
    #   keep - first保留第一条重复数据,last保留最后一条重复数据,False删除全部重复数据
    #   inplace - 是否直接修改原对象
    #     False 默认值,原对象不变,返回数据修改后的副本
    #     True 直接修改原对象,返回值为 None
    print(a.append(b).drop_duplicates(subset=['age','city'], keep='first', inplace=False))
    '''
       name  age      city gender  salary
    0  zhao   40  beijing       M     NaN
    1  qian   25   beijing      F     NaN
    1   sun   22  shanghai    NaN   150.0
    '''
    
    
    # concat() 拼接多个 DataFrame 数据
    print(pd.concat([a, b]))
    '''
       name  age      city gender  salary
    0  zhao   40  beijing       M     NaN
    1  qian   25   beijing      F     NaN
    0  qian   25   beijing    NaN   100.0
    1   sun   22  shanghai    NaN   150.0
    '''
    
    # concat() 拼接多个 DataFrame 数据
    #   ignore_index=True 重建索引
    print(pd.concat([a, b], ignore_index=True))
    '''
       name  age      city gender  salary
    0  zhao   40  beijing       M     NaN
    1  qian   25   beijing      F     NaN
    2  qian   25   beijing    NaN   100.0
    3   sun   22  shanghai    NaN   150.0
    '''
    
    # concat() 拼接多个 DataFrame 数据
    #   join='outer' 默认值,各方列名不相同的也集成到一起
    #   join='inner' 各方列名不相同的就舍弃,只保留列名相同的数据
    print(pd.concat([a, b], ignore_index=True, join='inner'))
    '''
       name  age      city
    0  zhao   40  beijing
    1  qian   25   beijing
    2  qian   25   beijing
    3   sun   22  shanghai
    '''
    
    # concat() 拼接多个 DataFrame 数据
    #   axis=1 横向拼接
    print(pd.concat([a, b], axis=1))
    '''
       name  age      city gender  name  age      city  salary
    0  zhao   40  beijing       M  qian   25   beijing     100
    1  qian   25   beijing      F   sun   22  shanghai     150
    '''
    
    
    # merge() 整合 2 个 DataFrame 数据
    #   on='name' 以 name 列为关联关键字整 2 个 DataFrame 数据
    #   如果 2 个 DataFrame 数据的关联关键字不相同,则类似这么设置 left_on="name1", right_on="name2"
    print(pd.merge(a, b, on='name'))
    '''
       name  age_x   city_x gender  age_y   city_y  salary
    0  qian     25  beijing      F     25  beijing     100
    '''
    
    # merge() 整合 2 个 DataFrame 数据
    #   how='inner' 默认值,只整合两边都存在的数据
    #   how='outer' 整合两边的全部数据
    #   how='left' 只整合左边存在的数据
    #   how='right' 只整合右边存在的数据
    print(pd.merge(a, b, on='name', how='outer'))
    '''
       name  age_x    city_x gender  age_y    city_y  salary
    0  zhao   40.0  beijing       M    NaN       NaN     NaN
    1  qian   25.0   beijing      F   25.0   beijing   100.0
    2   sun    NaN       NaN    NaN   22.0  shanghai   150.0
    '''
    
    # merge() 整合 2 个 DataFrame 数据
    #   suffixes - 相同列名整合时,为列名加上指定的后缀,以便区分列是来自哪个 DataFrame 的
    print(pd.merge(a, b, on='name', how='outer', suffixes=("_left", "_right")))
    '''
       name  age_left city_left gender  age_right city_right  salary
    0  zhao      40.0  beijing       M        NaN        NaN     NaN
    1  qian      25.0   beijing      F       25.0    beijing   100.0
    2   sun       NaN       NaN    NaN       22.0   shanghai   150.0
    '''
    
    
    # join() 整合 2 个 DataFrame 数据
    #   相当于 how='left' 的 merge()
    print(a.join(b.set_index("name"), on="name", lsuffix="_left", rsuffix='_right'))
    '''
       name  age_left city_left gender  age_right city_right  salary
    0  zhao        40  beijing       M        NaN        NaN     NaN
    1  qian        25   beijing      F       25.0    beijing   100.0
    '''
    
    
    
    # 以下用于说明空值如何处理
    index3 = ['zhao', 'qian', 'sun', 'li', 'zhou']
    data3 = {
        "age": [None, None, 22, 28, 28],
        "city": ['beijing', 'beijing', None, 'beijing', 'shanghai']
    }
    c = pd.DataFrame(data=data3, index=index3)
    print(c)
    '''
    NaN 数字类型的空值(来自 numpy 的 nan),None 其他类型的空值
           age      city
    zhao   NaN   beijing
    qian   NaN   beijing
    sun   22.0      None
    li    28.0   beijing
    zhou  28.0  shanghai
    '''
    
    # isnull() 判断数据是否有空值
    # notnull() 判断数据是否没有空值
    print(c.isnull())
    '''
            age   city
    zhao   True  False
    qian   True  False
    sun   False   True
    li    False  False
    zhou  False  False
    '''
    
    # 获取指定列没有空值的数据
    print(c[c.age.notnull()])
    '''
           age      city
    sun   22.0      None
    li    28.0   beijing
    zhou  28.0  shanghai
    '''
    
    # dropna() 删除空值数据
    #   how="any" 有一个字段空值,则整行删除
    #   how="all" 所有字段都空值,才整行删除
    #   subset=["column1", "column2"] 只从指定的列中查询
    print(c.dropna(how="any"))
    '''
           age      city
    li    28.0   beijing
    zhou  28.0  shanghai
    '''
    print(c.dropna(how="any", subset=["age"]))
    '''
           age      city
    sun   22.0      None
    li    28.0   beijing
    zhou  28.0  shanghai
    '''
    
    # 为指定的字段中的空值填充一个指定的值
    #   inplace=False 默认值,原对象不变,返回数据修改后的副本
    #   inplace=True 直接修改原对象,返回值为 None
    print(c.fillna({'age':0,'city':'unknown'}, inplace=True)) # None
    print(c)
    '''
           age      city
    zhao   0.0   beijing
    qian   0.0   beijing
    sun   22.0   unknown
    li    28.0   beijing
    zhou  28.0  shanghai
    '''
    

    速战速决 Python https://github.com/webabcd/PythonSample
    作者 webabcd

  • 相关阅读:
    linux安装nexus
    k8s批量删除资源
    spug 一个好用运维管理平台
    SpringMvc几种数据转化方法
    Esxi通过U盘启动
    LobHandler和LobCreator
    Spring的NamedParameterJdbcTemplate
    Spring中的JdbaTemplate
    Rufus
    UltraISO 注册码
  • 原文地址:https://www.cnblogs.com/webabcd/p/python_thirdLib_pandas_sample3.html
Copyright © 2020-2023  润新知