• sklearn中实现标准化、归一化


    这里记录下特征抽取、归一化、标准化、缺失值处理以及降维等内容:

    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    try:
        # Imputer only exists in scikit-learn < 0.22 (sklearn.impute.SimpleImputer
        # replaces it); tolerate its absence so the other demos stay importable.
        from sklearn.preprocessing import Imputer
    except ImportError:
        Imputer = None
    
    def dictvec():
        """Demonstrate dict feature extraction with ``DictVectorizer``.

        The same three city/temperature records are vectorized twice, first
        with ``sparse=False`` and then with ``sparse=True``, and each result is
        printed. Finally the feature names learned by the vectorizer are
        printed.

        Returns:
            None. Everything is written to stdout.
        """
        records = [
            {'city': '北京', 'temperature': 100},
            {'city': '上海', 'temperature': 60},
            {'city': '深圳', 'temperature': 30},
        ]

        # sparse defaults to True to save memory; False requests a dense result.
        dense_vectorizer = DictVectorizer(sparse=False)
        print(dense_vectorizer.fit_transform(records))

        print("===============================")
        print("sparse默认为True,返回如下:")

        sparse_vectorizer = DictVectorizer(sparse=True)
        print(sparse_vectorizer.fit_transform(records))

        print("===============================")
        print("字典类别数据:")
        # get_feature_names() was removed in scikit-learn 1.2. Prefer its
        # replacement and fall back only on releases that predate it (< 1.0).
        # tolist() turns the returned array into a plain list of str so the
        # printed form matches what get_feature_names() used to show.
        if hasattr(sparse_vectorizer, "get_feature_names_out"):
            feature_names = sparse_vectorizer.get_feature_names_out().tolist()
        else:
            feature_names = sparse_vectorizer.get_feature_names()
        print(feature_names)
    
    def countvec():
        """Demonstrate bag-of-words text features with ``CountVectorizer``.

        Two short English sentences are vectorized. The raw (sparse) result,
        its dense ``toarray()`` form and the learned vocabulary are printed in
        turn.

        Returns:
            None. Everything is written to stdout.
        """
        corpus = [
            "life is short,i like python life",
            "life is too long,i dislike python",
        ]
        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(corpus)

        print("===============================")
        print("sparse默认为True,返回如下:")
        print(counts)

        print("===============================")
        print("CountVectorizer没有sparse参数,按照如下方式操作:")
        # There is no sparse= switch here, so densify the result explicitly.
        print(counts.toarray())

        print("===============================")
        print("获得类别名:")
        # get_feature_names() was removed in scikit-learn 1.2. Prefer its
        # replacement and fall back only on releases that predate it (< 1.0).
        # tolist() keeps the printed form a plain list of str.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out().tolist()
        else:
            feature_names = vectorizer.get_feature_names()
        print(feature_names)
    
    # Min-max normalization
    def mm():
        """Rescale a 3x4 sample matrix with ``MinMaxScaler`` and print it.

        Returns:
            None. The scaled matrix is written to stdout.
        """
        samples = [[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]]

        # Pass feature_range=(low, high), e.g. (2, 3), to change the target
        # interval; the default is 0-1.
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(samples)

        print("===============================")
        print("归一化,返回如下:")
        print(scaled)
    
    # Standardization
    def ss():
        """Standardize a 3x3 sample matrix with ``StandardScaler`` and print it.

        Returns:
            None. The standardized matrix is written to stdout.
        """
        samples = [
            [1., -1., 3.],
            [2., 4., 2.],
            [4., 6., -1.],
        ]
        scaler = StandardScaler()
        standardized = scaler.fit_transform(samples)

        print("===============================")
        print("标准化,返回如下:")
        print(standardized)
        
    # Missing values
    def im():
        """Fill the missing entry of a small matrix with its column mean.

        Uses ``sklearn.impute.SimpleImputer`` when it is available and falls
        back to the legacy ``Imputer`` on scikit-learn releases that predate it.

        Returns:
            None. The imputed matrix is written to stdout.
        """
        samples = [[1, 2], [np.nan, 3], [7, 6]]

        try:
            # Imputer was removed from sklearn.preprocessing in scikit-learn
            # 0.22; SimpleImputer (added in 0.20) is its replacement.
            from sklearn.impute import SimpleImputer
        except ImportError:
            # Legacy API: axis=0 selects column-wise imputation.
            imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        else:
            # SimpleImputer always imputes per column (no axis argument) and
            # takes np.nan itself rather than the string 'NaN' as the marker.
            imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

        filled = imputer.fit_transform(samples)

        print("===============================")
        print("缺失值处理,返回如下:")
        print(filled)
     
        
    # Dimensionality reduction
    def var():
        """Feature selection: drop low-variance features and print the result.

        Applies ``VarianceThreshold(threshold=0)`` to a 3x4 sample matrix.

        Returns:
            None. The reduced matrix is written to stdout.
        """
        samples = [
            [0, 2, 0, 3],
            [0, 1, 4, 3],
            [0, 1, 1, 3],
        ]
        selector = VarianceThreshold(threshold=0)
        reduced = selector.fit_transform(samples)

        print("===============================")
        print("删除低方差降维,返回如下:")
        print(reduced)
    
    # Principal component analysis (PCA)
    def pca():
        """Reduce a 3x4 sample matrix with PCA and print the projection.

        Returns:
            None. The projected matrix is written to stdout.
        """
        samples = [[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]]

        # n_components=0.9 asks PCA to retain 90% of the information.
        reducer = PCA(n_components=0.9)
        projected = reducer.fit_transform(samples)

        print("===============================")
        print("主成分分析降维,返回如下:")
        print(projected)
    
    
    # Script entry point: runs one demo at a time. Only countvec() is enabled;
    # uncomment any of the other calls below to run that demo instead.
    if __name__ == "__main__":
    #     dictvec()
        countvec()
    #     mm()
    #     ss()
    #     im()
    #     var()
    #     pca()
  • 相关阅读:
    linux tomcat 突然验证码出不来
    使用open live writer客户端写博客
    创建自己的maven模板
    Dynamic Web Module 3.0 requires Java 1.6 or newer
    win10 操作配置备忘
    Maven使用
    ORA-12514: TNS:listener does not currently know of service …
    PlantUML——4.实例演示1
    C语言基础(一)
    Linux系统挂载FAT32的U盘
  • 原文地址:https://www.cnblogs.com/yunxiaofei/p/11117069.html
Copyright © 2020-2023  润新知