• 数据清洗和数据预处理


    摘要:

    内容:

      我的github 源代码:https://github.com/Tongzhenguo/Python-Project/blob/master/learntoscikit/preprocessing/demo.py

      # coding=utf-8
      """Data cleaning and preprocessing demo built on pandas and scikit-learn.

      Run as a script: every section prints its result. All print calls take a
      single parenthesised argument, so the output is the same under Python 2
      and Python 3.
      """
      __author__ = 'arachis'

      import numpy as np
      import pandas as pd
      from sklearn import preprocessing
      from sklearn.feature_selection import VarianceThreshold
      from sklearn.preprocessing import PolynomialFeatures

      # ---------------------------------------------------------------------
      # Missing-value handling
      # Common strategies: fill with a negative sentinel, fill with the median,
      # fill with the mode, drop the rows, or keep "is missing" as a feature.
      # ---------------------------------------------------------------------
      # Use the missing-value helpers that pandas provides directly.
      dates = pd.date_range('20130101', periods=6)
      df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
      # Reindexing with the extra column "E" introduces NaN values to clean up.
      df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
      # Alternative strategy: df1.fillna(-1) fills the gaps with a negative value.
      # dropna() returns a new frame instead of modifying df1, so print the
      # result. Column "E" is entirely NaN here, hence every row is dropped.
      print(df1.dropna())

      # ---------------------------------------------------------------------
      # Outlier handling (removal)
      # NOTE: no example is implemented for this section.
      # ---------------------------------------------------------------------

      # ---------------------------------------------------------------------
      # z-score standardization: zero mean, unit variance (per column)
      # ---------------------------------------------------------------------
      X_train = np.array([[1., -1., 2.],
                          [2., 0., 0.],
                          [0., 1., -1.]])
      X_test = np.array([[-3., -1., 4.]])
      X_scaled = preprocessing.scale(X_train)
      print(X_scaled)

      # Scaled data has zero mean and unit variance.
      print(X_scaled.mean(axis=0))
      print(X_scaled.std(axis=0))

      # A fitted scaler remembers the training statistics, so the very same
      # transformation can be applied to unseen data later.
      scaler = preprocessing.StandardScaler().fit(X_train)
      print(scaler.transform(X_train))
      print(scaler.transform(X_test))

      print(scaler.mean_)
      print(scaler.scale_)

      # ---------------------------------------------------------------------
      # Min-max scaling: map each column onto the interval [0, 1]
      # ---------------------------------------------------------------------
      scaler = preprocessing.MinMaxScaler()
      print(scaler.fit_transform(X_train))
      # New data may fall outside [0, 1] because the range comes from X_train.
      print(scaler.transform(X_test))

      # ---------------------------------------------------------------------
      # Normalization: scale each sample (row) to unit norm
      # ---------------------------------------------------------------------
      normalizer = preprocessing.Normalizer(norm='l2')
      print(normalizer.fit_transform(X_train))
      # Only transform the test data; there is no reason to refit on it.
      print(normalizer.transform(X_test))

      # ---------------------------------------------------------------------
      # Binarization: map features to 0/1 using a threshold
      # ---------------------------------------------------------------------
      binarizer = preprocessing.Binarizer(threshold=1.1)
      print(binarizer.transform(X_train))
      print(binarizer.transform(X_test))

      # ---------------------------------------------------------------------
      # Encoding categorical features (one-hot)
      # ---------------------------------------------------------------------
      # The categories of every column are learned from the data given to fit().
      # handle_unknown='ignore' encodes a value that was not seen during fit
      # (the 0 in the second column below) as all zeros instead of raising
      # ValueError.
      enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
      print(enc.fit([[1, 2, 3], [0, 2, 0]]))
      print(enc.transform([[1, 0, 0]]).toarray())

      # ---------------------------------------------------------------------
      # Label encoding: convert non-numeric labels to integers
      # ---------------------------------------------------------------------
      le = preprocessing.LabelEncoder()
      le.fit(["paris", "paris", "tokyo", "amsterdam"])
      print(le.transform(["tokyo", "tokyo", "paris"]))

      # ---------------------------------------------------------------------
      # Generating polynomial features
      # (x1, x2) => (1, x1, x2, x1^2, x1*x2, x2^2)
      # ---------------------------------------------------------------------
      X = np.arange(6).reshape(3, 2)
      poly = PolynomialFeatures(2)
      print(poly.fit_transform(X))

      # ---------------------------------------------------------------------
      # Removing features with low variance
      # ---------------------------------------------------------------------
      sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
      # Print the selected columns; the original discarded this result.
      print(sel.fit_transform(X))
  • 相关阅读:
    about java
    nginx+tomcat
    sed
    百度推送
    线程及更新ui线程的简单理解
    测试异常检测的Bugly使用
    轮播图带加点,带时间自动轮播加手动轮播
    自定义listView与scrollView使用
    tabLayout加viewPager的实现
    网络获取图片列表
  • 原文地址:https://www.cnblogs.com/arachis/p/preprocessing.html
Copyright © 2020-2023  润新知