• Pandas库常用函数和操作


    1. DataFrame 处理缺失值  dropna()

    df2.dropna(axis=0, how='any', subset=[u'ToC'], inplace=True)

    把在ToC列有缺失值的行去掉

    补充:还可以用df.fillna()来把缺失值替换为某个特殊标记

    df = df.fillna("missing")  # 用字符串替代
    df = df.fillna(df.mean())  # 用均值或者其它描述性统计值替代

    2. 根据某维度计算重复的行   duplicated()value_counts()

    print df.duplicated(['name']).value_counts()  # 如果不指定列,默认会判断所有列
    """
    输出:
    False    11118
    True       664
    表示有664行是重复的
    """

    duplicated()方法返回一个布尔型的Series,显示各行是否为重复行,非重复行显示为False,重复行显示为True

    value_counts()方法统计数组或序列所有元素出现次数,对某一列统计可以直接用df.column_name.value_counts()

    3. 去重  drop_duplicates()

    df.drop_duplicates(['name'], keep='last', inplace=True)
    """
    keep : {‘first’, ‘last’, False}, default ‘first’
        first : Drop duplicates except for the first occurrence.
        last : Drop duplicates except for the last occurrence.
        False : Drop all duplicates.
    """

    4. 拼接

    (1) 拼接列 merge()

    result = pd.merge(left, right, on='name', how='inner')
    """
    其它参数:
    pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False)
    
    Examples
    >>> A              >>> B
        lkey value         rkey value
    0   foo  1         0   foo  5
    1   bar  2         1   bar  6
    2   baz  3         2   qux  7
    3   foo  4         3   bar  8
    
    >>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
       lkey  value_x  rkey  value_y
    0  foo   1        foo   5
    1  foo   4        foo   5
    2  bar   2        bar   6
    3  bar   2        bar   8
    4  baz   3        NaN   NaN
    5  NaN   NaN      qux   7
    """

    其它参考:Merge, join, and concatenate

    (2) 拼接行 

    def concat_by_row(data_dir, fout):
        dfs = []
        for filename in os.listdir(data_dir):
            dfs.append(pd.read_excel(os.path.join(data_dir, filename)))
            print(dfs[-1].shape, filename)
        df = pd.concat(dfs, axis=0, ignore_index=True)  # axis=0按行拼接;axis=1按列拼接
        print(df.shape)
        df.to_excel(fout, index=False)

    5. 找出在某一特定维度为空值的所有行

    bool_arr = df.name.notnull()
    print bool_arr.value_counts()
    for idx, value in bool_arr.iteritems():
        if not value:
            print '
    ', idx, value
            print df.iloc[idx]

    6. 指定dataframe的维度及顺序; 保存数据csv文件

    res = {'name':[], 'buss':[], 'label':[]}
    with codecs.open(fname, encoding='utf8') as fr:
        for idx, line in enumerate(fr):
            item = json.loads(line)
            res['name'].append(item['name'])        
            res['buss'].append(item['buss']) 
            res['label'].append(item['label'])
    df = pd.DataFrame(res, columns=['name', 'buss', 'label'])
    df.to_csv('data/xxx.csv', index=False, encoding='utf-8')

    7. 保存到文件

    7.1 读写excel/csv格式文件

    import pandas as pd
    
    def dataframe_read_and_write(fin, fout):  # 读取fin文件,添加一列"新应答"
        df = pd.read_excel(fin)
        # df = read_csv(fin, encoding='utf-8')
        print df.head()
    
        fields = [u"序号", u"问题描述", u"原始应答", u"新应答"]
        data_out = defaultdict(list)
    
        for idx, row in df.iterrows():
            try:
                row = row.to_dict()
                new_answer = "xxxxxx"
                for field in fields[:-1]:
                    data_out[field].append(row[field])
                data_out[fields[-1]].append(new_answer)
            except Exception as error:
                print "Error line", idx, error
    
        df_out = pd.DataFrame(data_out, columns=fields)
        df_out.to_excel(fout, sheet_name="Sheet1", index=False, header=True)
        # df_out.to_csv(fout, index=False, encoding="utf-8")
    
    
    if __name__ == '__main__':
        dataframe_read_and_write(fin="data/tmp.xlsx", fout="data/tmp_out.xlsx")

    7.2 将多张DataFrame表写入到同一个excel文件的不同sheet中

    import pandas as pd
    writer = pd.ExcelWriter('foo.xlsx')
    df.to_excel(writer, 'Data 0')
    df.to_excel(writer, 'Data 1')
    writer.save()

    7.3 向一个已经存在的excel文件中写入一张新sheet;如果文件不存在则创建一个新文件再写入

    import pandas
    from openpyxl import load_workbook
    
    
    def add_new_sheet(df, fout, sheet_name='Sheet1', columns=None):
        if fout and os.path.exists(fout):
            book = load_workbook(fout)
            writer = pd.ExcelWriter(fout, engine='openpyxl')
            writer.book = book
            writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
        else:
            writer = pd.ExcelWriter(fout)
        df.to_excel(writer, sheet_name=sheet_name, columns=columns, index=False)
        writer.save()
        
        
    add_new_sheet(df, fout='Masterfile.xlsx', sheet_name="Main", columns=['Diff1', 'Diff2'])    

    参考:官方解决方案https://github.com/pandas-dev/pandas/issues/3441

    7.4 读取excel文本中的多个sheet

    import xlrd
    
    workbook = xlrd.open_workbook(fin)
    for sheet in workbook.sheets():
        df = pd.read_excel(fin, sheet_name=sheet.name, index_col=None)

    8. 排序

    def sort_dataframe(df, fields_to_sort, fout=None):
        df = df.sort_values(by=fields_to_sort, ascending=True)
        if fout:
            df.to_excel(fout, index=False)
        return df
    
    df = pd.read_excel(data_file)
    sort_dataframe(df, fields_to_sort=["column_A", "column_B"], fout=data_file)
    
    df = pd.read_excel(data_file)  # note: index改变,需要从文件重新读取,才会是有序的,后面遍历df的时候才不会出问题
    print(df.head(10))

    9. 轴标签重命名 df.rename()(列重命名、行index重命名)

    >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    >>> df.rename(index=str, columns={"A": "a", "B": "c"})
       a  c
    0  1  4
    1  2  5
    2  3  6
    
    >>> df.rename(index=str, columns={"A": "a", "C": "c"})
       a  B
    0  1  4
    1  2  5
    2  3  6
    
    Using axis-style parameters
    
    >>> df.rename(str.lower, axis='columns')
       a  b
    0  1  4
    1  2  5
    2  3  6
    
    >>> df.rename({1: 2, 2: 4}, axis='index')
       A  B
    0  1  4
    2  2  5
    4  3  6

    参数说明:

    Signature: df.rename(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None)
    Docstring:
    Alter axes labels.
    
    Function / dict values must be unique (1-to-1). Labels not contained in
    a dict / Series will be left as-is. Extra labels listed don't throw an
    error.
    
    See the :ref:`user guide <basics.rename>` for more.
    
    Parameters
    ----------
    mapper, index, columns : dict-like or function, optional
        dict-like or functions transformations to apply to
        that axis' values. Use either ``mapper`` and ``axis`` to
        specify the axis to target with ``mapper``, or ``index`` and
        ``columns``.
    axis : int or str, optional
        Axis to target with ``mapper``. Can be either the axis name
        ('index', 'columns') or number (0, 1). The default is 'index'.
    copy : boolean, default True
        Also copy underlying data
    inplace : boolean, default False
        Whether to return a new DataFrame. If True then value of copy is
        ignored.
    level : int or level name, default None
        In case of a MultiIndex, only rename labels in the specified
        level.
    
    Returns
    -------
    renamed : DataFrame
    
    See Also
    --------
    pandas.DataFrame.rename_axis
    df.rename()参数说明

    10. 数据选取,修改,切片

    10.1 loc

    在知道列名字的情况下,df.loc[index,column] 选取指定行,列的数据

    # df.loc[index, column_name],选取指定行和列的数据
    df.loc[0,'name'] # 'Snow'
    df.loc[0:2, ['name','age']]          #选取第0行到第2行,name列和age列的数据, 注意这里的行选取是包含下标的。
    df.loc[[2,3],['name','age']]          #选取指定的第2行和第3行,name和age列的数据
    df.loc[df['gender']=='M','name']      #选取gender列是M,name列的数据
    df.loc[df['gender']=='M',['name','age']] #选取gender列是M,name和age列的数据

    10.2 iloc

    在column name特别长或者index是时间序列等各种不方便输入的情况下,可以用iloc (i = index), iloc完全用数字来定位 iloc[row_index, column_index]

    df.iloc[0,0]        #第0行第0列的数据,'Snow'
    df.iloc[1,2]        #第1行第2列的数据,32
    df.iloc[[1,3],0:2]    #第1行和第3行,从第0列到第2列(不包含第2列)的数据
    df.iloc[1:3,[1,2]    #第1行到第3行(不包含第3行),第1列和第2列的数据

    更多参考:

    https://blog.csdn.net/yoonhee/article/details/76168253

    11. 判断某个cell是否为空

    if str(line["col_a"]).strip() == "nan":
        pass 

    12. Dataframe值替换

    df["col_a"] = df["col_a"].replace({"b": "C", "e": "G"})

    更多参考:https://jingyan.baidu.com/article/454316ab4d0e64f7a6c03a41.html

    13. Dataframe筛选数据

    df2 = df[df["col_a"] == "cc"]  # 等于某个值
    df3 = df[df["col_a"].isin(["bb", "cc", "ee"])]  # 取值在某个范围内

    更多参考:https://jingyan.baidu.com/article/0eb457e508b6d303f0a90572.html

    14. 其它常用操作

        # df = pd.read_csv("../../data/data_part1.txt", sep="$")
        df = pd.read_csv("data/data_part1.csv", low_memory=False)
    
        # 数据概览
        df.info()
        df.describe()  # ==> 只显示float型维度的[count, mean, std, min]等统计信息, 例如0108, 3816, 2453, 0112, 2428, 2304
    
        # 数据查看
        df.head(n=5)  # 查看开头几行, 默认n=5
        df.tail(n=5)  # 查看末尾几行, 默认n=5
        df.shape  # 查看行列维度
        df.columns  # 查看列名和列数
        df.dtypes     # 查看数据类型   ==> 可以看到哪些维度的数值是object型/float型
        df["0108"].hist()  # 查看变量分布
        df["0108"].unique()  # 查看有哪些取值
        df["0108"].value_counts()  # 查看这一列的值统计
    
        # 缺失值统计
        df.isnull().sum()  # 查看每一列缺失值情况
        df["n_null"] = df.isnull().sum(axis=1)  # 查看每一行缺失值情况
    
        # 缺失值填充
        mode_df = df.fillna(df.mode().iloc[0], inplace=True)  # 用众数填充
        median_df = df.fillna(df.median())  # 用中位数填充
        df["0108"][df.vid.isnull()] = "0"  # 对某一列填充

    15. 遇到的问题和解决方法

    15.1  df.to_excel(fout) 报错"openpyxl.utils.exceptions.IllegalCharacterError"

    (step 1)  pip install xlsxwriter 

    (step 2) df.to_excel(fout, engine="xlsxwriter") 

    15.2  保存文件时报错"UserWarning: Ignoring URL 'http://www.xxxxxxx' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS"

    writer = pd.ExcelWriter(fout, engine="xlsxwriter", options={'strings_to_urls': False})
    df.to_excel(writer, index=False)
    writer.save()

    参考:

    https://blog.csdn.net/wangquannuaa/article/details/43988719

    https://blog.csdn.net/wangquannuaa/article/details/43984095

  • 相关阅读:
    Nmon 性能:分析 AIX 和 Linux 性能的免费工具
    libvirt(virsh命令总结)
    Linux之shell编程基础
    Shell之sed命令
    linux shell基础
    ubuntu apt-get 遇到的问题
    JavaScript之面向对象学九(原型式继承和寄生式继承)
    JavaScript之apply()和call()的区别
    JavaScript之面向对象学习八(继承)
    JavaScript之面向对象学习七(动态原型模式、寄生构造函数模式、稳妥构造函数模式创建自定义类型)
  • 原文地址:https://www.cnblogs.com/bymo/p/7154476.html
Copyright © 2020-2023  润新知