• 实战1-数据清理


    # 1. Load the CSV file into a DataFrame and preview the first 3 rows
    import pandas as pd
    import numpy as np
    df = pd.read_csv('E:/Code/python/region/static/house_details_zz_2.csv', encoding='UTF-8')
    df.head(3)
    
    # 2. Check for missing values (info() lists the non-null count of each column)
    df.info()
    
    # 3. Check for duplicated rows (tally of the True/False flags from duplicated())
    df.duplicated().value_counts()
    
    # 4. Remove rows whose 'lj_number' repeats an earlier row (first occurrence is kept).
    # drop_duplicates returns a new DataFrame rather than modifying df, so the
    # result has to be bound back to df for the de-duplication to take effect.
    df = df.drop_duplicates(['lj_number'])
    
    # 5、(正态分布)
    def outRange(S):
        """Return the entries of S lying more than 3 standard deviations from its mean.

        This is the "3-sigma" outlier rule, which is meaningful when the data
        is roughly normally distributed.

        Parameters:
            S: a pandas Series of numeric values.

        Returns:
            A Series holding only the outlying values, with their original
            index labels. Empty when no value falls outside the band.
        """
        mean = S.mean()
        std = S.std()
        # Strict inequalities: values exactly on the 3-sigma boundary are kept.
        is_outlier = (S < mean - 3 * std) | (S > mean + 3 * std)
        return S[is_outlier]
    # Apply the outlier check to the deal_price column and display the result
    outier = outRange(df['deal_price'])
    outier
    
    # 6. Distinct values of region_1_name
    df['region_1_name'].unique()
    
    # 7. Count the missing values in each column
    df.isnull().sum()
    
    # 8. Convert deal_time to a datetime column
    import datetime
    # string -> datetime64[ns]
    df['deal_time'] = pd.to_datetime(df['deal_time'])
    # Display the deal_time column
    df['deal_time']
    
    # 9. Check for missing values again
    df.info()
    
    # 10. Print every through_nums entry that holds the "暂无数据" ("no data yet")
    # placeholder, to confirm the column contains non-numeric values.
    for row in df['through_nums']:
        if row == "暂无数据":
            print(row)
    
    # 11. Delete rows
    # df[df['through_nums'].isin(["暂无数据"])] # select the rows whose ['through_nums'] equals ["暂无数据"]
    # df2 = df[~df['through_nums'].isin(["暂无数据"])] # negated: select the rows whose ['through_nums'] does not equal ["暂无数据"]
    df.drop(df[df['through_nums'].isin(["暂无数据"])].index,inplace=True) # drop, by index, the rows containing ["暂无数据"]
    # df.head(5)
    df.head(3)
    
    # 12. object -> int
    # to_numeric
    df['through_nums'] = pd.to_numeric(df['through_nums'])
    
    # 13. Convert finish_age from object to float.
    # errors='coerce' tells to_numeric to turn any value it cannot parse
    # (e.g. the "未知" placeholder) into NaN instead of raising.
    df['finish_age'] = pd.to_numeric(df['finish_age'], errors='coerce')
    # Fill missing values with the mean of their column. The means must come
    # from df itself; numeric_only=True keeps the text and datetime columns out
    # of the mean computation, which would otherwise raise on recent pandas.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    # df.isnull().sum()
    df.info()
    
    # 14. Drop the rows whose finish_age is missing
    # (the dropna call below is commented out, so this step only prints info())
    # df.dropna(subset = ['finish_age'],inplace = True)
    df.info()
    
    # 15. Convert list_time to datetime; errors='coerce' is passed so unparseable
    # values do not raise. Then count the missing values in each column.
    df['list_time'] = pd.to_datetime(df['list_time'],errors='coerce')
    df.isnull().sum()
  • 相关阅读:
    蓝牙的HFP协议笔记
    23种设计模式
    读QT5.7源码(三)Q_OBJECT 和QMetaObject
    实现私有化(Pimpl) --- QT常见的设计模式
    蓝牙Profile的概念和常见种类(转)
    git分支合并
    git log的常见用法
    QThread详解
    git查看某个文件的修改历史
    因为代理原因导致的NotSerializableException
  • 原文地址:https://www.cnblogs.com/mrfanqie/p/shizhan_20210219.html
Copyright © 2020-2023  润新知