• pandas dataframe 过滤——apply最灵活!!!


    按照某特定string字段长度过滤:

    import pandas as pd
    
    # Read the sample file, then coerce both columns to text so that the
    # .str accessor can measure the length of every value.
    df = pd.read_csv('filex.csv')
    for column in ('A', 'B'):
        df[column] = df[column].astype('str')

    # Keep only the rows in which A and B are both exactly 10 characters long.
    a_is_ten = df['A'].str.len().eq(10)
    b_is_ten = df['B'].str.len().eq(10)
    mask = a_is_ten & b_is_ten
    df = df[mask]
    print(df)
    

      

    Applied to filex.csv:

    A,B
    123,abc
    1234,abcd
    1234567890,abcdefghij
    

    The code above prints:

                A           B
    2  1234567890  abcdefghij

    或者是:

    # Build a small demo frame; every column, including "age", holds strings.
    data = {
        "names": ["Alice", "Zac", "Anna", "O"],
        "cars": ["Civic", "BMW", "Mitsubishi", "Benz"],
        "age": ["1", "4", "2", "0"],
    }

    df = pd.DataFrame(data)
    """
    df:
      age        cars  names
    0   1       Civic  Alice
    1   4         BMW    Zac
    2   2  Mitsubishi   Anna
    3   0        Benz      O
    Then:
    """

    # One boolean Series per condition, each evaluated element-wise via apply().
    long_name = df['names'].apply(lambda name: len(name) > 1)
    car_has_i = df['cars'].apply(lambda car: "i" in car)
    under_two = df['age'].apply(lambda age: int(age) < 2)

    # A row is selected only when all three conditions hold.
    df[long_name & car_has_i & under_two]
    """
    We will have :
      age   cars  names
    0   1  Civic  Alice
    """
    

      

    最灵活的是用apply:

    def load_metadata(dir_name):
        """Yield one filtered metadata DataFrame per file found under ``dir_name``.

        The directory tree is walked with ``os.walk``. Every file is parsed as a
        ``^``-delimited CSV restricted to the ``MetaIndex`` columns named below,
        and only the rows accepted by ``metadata_parse_filter`` are kept.

        Args:
            dir_name: Root directory to walk.

        Yields:
            pandas.DataFrame: The rows of a single file that passed the filter.
        """
        columns_name_list = [
            "M_METADATA_ID_INDEX",
            "M_SRC_IP_INDEX",
            "M_DST_IP_INDEX",
            "M_SRC_PORT_INDEX",
            "M_DST_PORT_INDEX",
            "M_PROTOCOL_INDEX",
            "M_HEADER_H",
            "M_PAYLOAD_H",
            "M_TCP_FLAG_H",
            "M_FLOW_FIRST_PKT_TIME",
            "M_FLOW_LAST_PKT_TIME",
            "M_OCTET_DELTA_COUNT_FROM_TOTAL_LEN",
        ]
        # Every column label is spelled exactly like the MetaIndex attribute that
        # holds its position, so the index list is derived rather than repeated.
        # NOTE(review): pandas ignores the order of ``usecols``; the labels line
        # up with the selected columns only if these positions are ascending in
        # the order listed above -- confirm against MetaIndex.
        columns_index_list = [getattr(MetaIndex, name) for name in columns_name_list]

        def metadata_parse_filter(row):
            """Return True when ``row`` passes every sanity check, else False."""
            try:
                # Presumably 6 is the IP protocol number for TCP -- confirm.
                if row['M_PROTOCOL_INDEX'] != 6:
                    return False
                if len(row['M_HEADER_H']) < 2 or len(row['M_PAYLOAD_H']) < 2:
                    return False
                if not is_l34_tcp_metadata(row['M_METADATA_ID_INDEX']):
                    return False

                # Both time fields are split on '-' into two integers; the
                # second one is treated as the reverse-direction value.
                first_time = row['M_FLOW_FIRST_PKT_TIME'].split('-')
                last_time = row['M_FLOW_LAST_PKT_TIME'].split('-')

                flow_first_pkt_time = int(first_time[0])
                rev_flow_first_pkt_time = int(first_time[1])
                flow_last_pkt_time = int(last_time[0])
                rev_flow_last_pkt_time = int(last_time[1])

                # In each direction the first packet must not come after the last.
                return (flow_first_pkt_time <= flow_last_pkt_time
                        and rev_flow_first_pkt_time <= rev_flow_last_pkt_time)
            except Exception:
                # Deliberately best-effort: a row that cannot be parsed (missing
                # value, non-string field, malformed time, ...) is rejected
                # instead of aborting the whole file.
                return False

        for root, _dirs, files in os.walk(dir_name):
            for filename in files:
                file_path = os.path.join(root, filename)
                # on_bad_lines='warn' (pandas >= 1.3) replaces the former
                # error_bad_lines=False / warn_bad_lines=True pair, which was
                # removed in pandas 2.0: malformed lines are skipped with a warning.
                df = pd.read_csv(
                    file_path,
                    delimiter='^',
                    usecols=columns_index_list,
                    names=columns_name_list,
                    encoding='utf-8',
                    on_bad_lines='warn',
                    header=0,
                    lineterminator="\n",
                )
                filter_df = df.loc[df.apply(metadata_parse_filter, axis=1)]
                yield filter_df
    

     直接按照row过滤! 

  • 相关阅读:
    Flex 学习笔记------组件和视图
    Flex 学习笔记------基于LZMA的文件压缩与上传
    Flex 学习笔记------FLACC & Crossbridge
    Flex 学习笔记------全局事件
    Flex 学习笔记------对象的深层拷贝
    Flex 学习笔记------as 与 js 的通信
    Flex 学习笔记------Local Shared Object 和 Custom Class
    Flex 学习笔记------读取Jpeg图片的width,height和colorSpace
    翻译:eval() 不是魔鬼,只是易被误解
    翻译:javascript 内存管理
  • 原文地址:https://www.cnblogs.com/bonelee/p/9927567.html
Copyright © 2020-2023  润新知