"""Demonstrate detecting and removing duplicate rows in a pandas DataFrame.

The script builds a small two-column frame containing repeated rows, then
prints the result of ``duplicated`` and several ``drop_duplicates`` calls.
The output each ``print`` is expected to produce is recorded in the comment
block that follows it.
"""
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Sample data: rows 1, 4 and 6 repeat the row immediately before them.
data = DataFrame(
    {
        "k1": ["one"] * 3 + ["two"] * 4,
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
print(data)
#     k1  k2
# 0  one   1
# 1  one   1
# 2  one   2
# 3  two   3
# 4  two   3
# 5  two   4
# 6  two   4

# duplicated() returns a boolean Series marking, for each row, whether an
# identical row has already appeared earlier in the frame.
print(data.duplicated())
# 0    False
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# 6     True
# dtype: bool

# drop_duplicates() returns the frame with the rows flagged above removed.
print(data.drop_duplicates())
#     k1  k2
# 0  one   1
# 2  one   2
# 3  two   3
# 5  two   4

# By default every column takes part in the comparison; pass a subset of
# columns to judge duplicates on those columns only.
# By default the first occurrence of each value combination is kept; pass
# keep='last' to keep the last occurrence instead.
print(data.drop_duplicates(subset=["k1"]))
#     k1  k2
# 0  one   1
# 3  two   3

print(data.drop_duplicates(subset=["k1"], keep="last"))
#     k1  k2
# 2  one   2
# 6  two   4