数据清洗和数据预处理

摘要：

内容：

　　我的 GitHub 源代码：https://github.com/Tongzhenguo/Python-Project/blob/master/learntoscikit/preprocessing/demo.py

  1 # coding=utf-8
  2 __author__ = 'arachis'
  3 
  4 import numpy as np
  5 from sklearn import preprocessing
  6 
  7 """
  8     缺失值处理（填充负值，填充中值，填充众数，剔除，单独作为一个特征）
  9 """
 10 
 11 ##直接使用pandas 中的异常值处理
 12 import pandas as pd
 13 import numpy as np
 14 dates = pd.date_range('20130101', periods=6)
 15 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
 16 df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
 17 # df1.fillna(-1)  ##填充负值
 18 df1.dropna() ## 剔除
 19 
 20 
 21 """
 22     异常值处理（剔除）
 23 """
 24 
 25 
 26 """
 27     z-score:均值为0，方差为1(标准化)(基于列向量)
 28 """
 29 X_train = np.array([[ 1., -1.,  2.],
 30            [ 2.,  0.,  0.],
 31            [ 0.,  1., -1.]])
 32 X_test = np.array([[ -3., -1.,  4.]])
 33 X_scaled = preprocessing.scale(X_train)
 34 print  X_scaled
 35 
 36 #Scaled data has zero mean and unit variance:
 37 print X_scaled.mean(axis=0)
 38 print X_scaled.std(axis=0)
 39 
 40 #Scaler
 41 scaler = preprocessing.StandardScaler().fit(X_train)
 42 print scaler.transform(X_train)
 43 print scaler.transform(X_test)
 44 
 45 print scaler.mean_
 46 print scaler.scale_
 47 
 48 """
 49     min-max score：映射到区间[0,1]（最小-最大规范化）(基于列向量)
 50 """
 51 scaler = preprocessing.MinMaxScaler()
 52 print scaler.fit_transform(X_train)
 53 print scaler.transform(X_test) #新的数据可能会不在[0,1]区间内
 54 
 55 
 56 """
 57     规范化（Normalization）（归一化）(基于行向量)
 58 """
 59 normalizer = preprocessing.Normalizer(norm='l2')
 60 print normalizer.fit_transform(X_train)
 61 print normalizer.fit_transform(X_test)
 62 
 63 
 64 """
 65     二值化（Binarization）
 66 """
 67 #给定阈值，将特征转换为0/1
 68 binarizer = preprocessing.Binarizer(threshold=1.1)
 69 print binarizer.transform(X_train)
 70 print binarizer.transform(X_test)
 71 
 72 
 73 """
 74     类别特征编码（Encoding categorical features）
 75 """
 76 #知道各个类别的数目，可通过n_values指定
 77 enc = preprocessing.OneHotEncoder()
 78 print enc.fit([[1, 2, 3], [0, 2, 0]])
 79 print enc.transform([[1, 0, 0]]).toarray()
 80 
 81 
 82 """
 83     标签编码（Label encoding）
 84 """
 85 #非数值型转化为数值型
 86 le = preprocessing.LabelEncoder()
 87 le.fit(["paris", "paris", "tokyo", "amsterdam"])
 88 print le.transform(["tokyo", "tokyo", "paris"])
 89 
 90 
 91 """
 92     生成多项式特征（Generating polynomial features）
 93 """
 94 # （x1,x2） => (1,x1,x2,x1^2,x1*x2,x2^2)
 95 from sklearn.preprocessing import PolynomialFeatures
 96 X = np.arange(6).reshape(3, 2)
 97 poly = PolynomialFeatures(2)
 98 print poly.fit_transform(X)
 99 
100 
101 
"""
    Removing features with low variance
"""
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# fit_transform returns the reduced data; print it so this section shows its
# result like the other sections of the script do.
print(sel.fit_transform(X))

相关阅读:
about java
nginx+tomcat
sed
百度推送
 线程及更新ui线程的简单理解
 测试异常检测的Bugly使用
 轮播图带加点，带时间自动轮播加手动轮播
 自定义listView与scrollView使用
 tabLayout加viewPager的实现
 网络获取图片列表
原文地址：https://www.cnblogs.com/arachis/p/preprocessing.html