"""Exploratory cleaning of a house-deal CSV export with pandas.

The numbered steps load the CSV, inspect missing and duplicated values,
flag price outliers with a three-sigma rule, and convert the text columns
``deal_time``, ``through_nums``, ``finish_age`` and ``list_time`` to
datetime / numeric dtypes.
"""
import csv
import datetime

import numpy as np
import pandas as pd

CSV_PATH = 'E:/Code/python/region/static/house_details_zz_2.csv'


def outRange(S):
    """Return the values of *S* lying more than three standard deviations from its mean.

    Args:
        S: A numeric ``pandas.Series``.

    Returns:
        The sub-Series of *S* (original index labels preserved) whose values
        are strictly below ``mean - 3 * std`` or strictly above
        ``mean + 3 * std``. ``std`` is whatever ``Series.std()`` returns.
    """
    center = S.mean()
    spread = 3 * S.std()
    blidx = (S < center - spread) | (S > center + spread)
    return S[blidx]


# The pipeline is guarded so that importing this file (for example to reuse
# ``outRange``) does not try to read the hard-coded CSV path.
# Bare expressions below are inspection steps; they display a value only in
# an interactive session and print nothing when the file runs as a script.
if __name__ == "__main__":
    # 1. Load the data.
    df = pd.read_csv(CSV_PATH, encoding='UTF-8')
    df.head(3)

    # 2. Check for missing values.
    df.info()

    # 3. Check for duplicated rows.
    df.duplicated().value_counts()

    # 4. De-duplicate on a subset of columns.
    # NOTE(review): the de-duplicated frame is not assigned back, so ``df``
    # still contains the duplicate rows afterwards - confirm whether this
    # step was meant to be applied or only viewed.
    df.drop_duplicates(['lj_number'])

    # 5. Outliers in deal_price (assuming a normal distribution).
    outier = outRange(df['deal_price'])
    outier

    # 6. Distinct values of region_1_name.
    df['region_1_name'].unique()

    # 7. Count missing values per column.
    df.isnull().sum()

    # 8. Convert deal_time from string to datetime64[ns], then look at it.
    df['deal_time'] = pd.to_datetime(df['deal_time'])
    df['deal_time']

    # 9. Check for missing values again.
    df.info()

    # 10. Print every through_nums entry that holds the "no data" placeholder.
    for row in df['through_nums']:
        if row == "暂无数据":
            print(row)

    # 11. Delete rows.
    # Alternatives that were tried and left disabled:
    #   df[df['through_nums'].isin(["暂无数据"])]          # select placeholder rows
    #   df2 = df[~df['through_nums'].isin(["暂无数据"])]   # negate: keep the others
    # Drop, by index, the rows whose through_nums is the placeholder.
    df.drop(df[df['through_nums'].isin(["暂无数据"])].index, inplace=True)
    df.head(3)

    # 12. through_nums: object -> int via to_numeric.
    df['through_nums'] = pd.to_numeric(df['through_nums'])

    # 13. finish_age: object -> float.
    # errors='coerce' turns any value that cannot be parsed into NaN.
    df['finish_age'] = pd.to_numeric(df['finish_age'], errors='coerce')
    # Replace missing values with the column mean. The original referenced an
    # undefined name ``data`` here; numeric_only=True restricts the means to
    # numeric columns so text/datetime columns do not make mean() fail.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    df.info()

    # 14. Delete rows whose finish_age is missing (left disabled in the
    # original; step 13 already fills those gaps with the mean).
    # df.dropna(subset=['finish_age'], inplace=True)
    df.info()

    # 15. Convert list_time to datetime; unparseable values become NaT.
    df['list_time'] = pd.to_datetime(df['list_time'], errors='coerce')
    df.isnull().sum()