• Python feature engineering


    # Feature engineering
    # 1-1 Feature selection with sklearn
    # Filter method: variance-threshold filtering
    import numpy as np
    from numpy import array
    from sklearn.feature_selection import VarianceThreshold
    x=[[0,0,1],[0,1,0],[1,0,0],[0,1,1],[0,1,0],[0,1,1]]
    sel=VarianceThreshold(threshold=(.8*(1-.8))) # the first column is 0 in more than 80% of the samples, so this feature is dropped from the output
    print(sel.fit_transform(x))
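    # Added illustration, not in the original code: inspect what the filter did.
    # variances_ holds the per-feature variances and get_support() the boolean keep-mask.
    print(sel.variances_)
    print(sel.get_support())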
    # Chi-squared test: select the 2 best features
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    from scipy.stats import pearsonr
    iris=load_iris()
    x,y=iris.data,iris.target
    print(x.shape)
    x_new=SelectKBest(chi2,k=2).fit_transform(x,y)
    print(x_new.shape)
    print(np.hstack([x,x_new]))
    '''
    # Feature selection using the correlation coefficient
    #x_new=SelectKBest(lambda X, Y: array(list(map(lambda x: pearsonr(x, Y), X.T))).T, k=2).fit_transform(x,y)
    #print(x_new.shape)
    #print(np.hstack([x,x_new]))
    # Correlation measured with mutual information (MIC)
    from sklearn.feature_selection import SelectKBest
    from minepy import MINE
    # MINE is not written as a score function, so wrap it in mic(), which returns a
    # 2-tuple whose second item is a fixed p-value of 0.5
    def mic(x, y):
        m = MINE()
        m.compute_score(x, y)
        return (m.mic(), 0.5)
    # Select the K best features and return the reduced data
    SelectKBest(lambda X, Y: array(list(map(lambda x: mic(x, Y), X.T))).T, k=2).fit_transform(iris.data, iris.target)
    '''
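    # The commented-out block above depends on the third-party minepy package; a minimal
    # sketch of the same idea (mutual-information-based selection) using only sklearn is
    # shown here. This is an added illustration, not part of the original code.
    from sklearn.feature_selection import mutual_info_classif
    x_mi=SelectKBest(mutual_info_classif,k=2).fit_transform(iris.data,iris.target)
    print(x_mi.shape)  # 2 features chosen by estimated mutual information with the target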

    # Wrapper method: model-based feature selection - recursive feature elimination (RFE)
    # Pick an estimator and keep the feature subset on which it performs best; commonly used estimators include random forests, support vector machines and k-nearest neighbours
    import matplotlib.pyplot as plt
    from sklearn.svm import SVC
    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.datasets import make_classification
    # Create a synthetic classification dataset: 1000 samples, 8 classes, 25 features
    x,y=make_classification(n_samples=1000,n_features=25,n_informative=3,n_redundant=2,
                            n_repeated=0,n_classes=8,n_clusters_per_class=1,random_state=0)
    svc=SVC(kernel="linear")
    rfec=RFECV(estimator=svc,step=1,cv=StratifiedKFold(2),scoring="accuracy")
    rfec.fit(x,y)
    print("Optimal number of features: %d" % rfec.n_features_)
    print(rfec.ranking_) # ranking of each feature: selected features get rank 1, the rest are numbered in order of elimination
    print(rfec.support_)
    plt.figure()
    plt.xlabel("number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1,len(rfec.grid_scores_)+1),rfec.grid_scores_)
    plt.ylim([0,1])
    plt.show()
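    # Added sketch (assumes the rfec object fitted above): reduce the data to the selected
    # columns; rfec.transform(x) keeps the same features flagged True in rfec.support_
    x_reduced=rfec.transform(x)
    print(x_reduced.shape)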

    # Embedded method: penalty-based feature selection (rarely used)
    from sklearn.svm import LinearSVC
    from sklearn.datasets import load_iris
    from sklearn.feature_selection import SelectFromModel
    iris=load_iris()
    x,y=iris.data,iris.target
    print("原来数据的特征维度为:",x.shape)
    lsvc=LinearSVC(C=0.01,penalty="l1",dual=False)
    lsvc.fit(x,y)
    model=SelectFromModel(lsvc,prefit=True)
    x_new=model.transform(x)
    print("l1惩罚项处理之后的数据维度为:",x_new.shape)
    #嵌入法之基于树模型的特征选择法
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.datasets import load_iris
    x,y=iris.data,iris.target
    print("原来数据的特征维度为:",x.shape)
    clf=ExtraTreesClassifier()
    clf.fit(x,y)
    print(clf.feature_importances_)
    model=SelectFromModel(clf,prefit=True)
    x_new=model.transform(x)
    print("新数据维度为:",x_new.shape)

    # 1-2 Feature transformation and feature extraction
    # Two ways to do one-hot encoding
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.datasets import load_iris
    iris=load_iris()
    print(OneHotEncoder().fit_transform(iris.target.reshape(-1,1)).toarray())
    # One-hot encoding with pandas
    import pandas as pd
    print(pd.get_dummies(iris.target))
    # Feature combination and dimensionality reduction: these are driven mainly by business understanding. When single features stop improving the model, combine the raw features into new ones -- guided by domain knowledge rather than random combinations (a minimal sketch follows).
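    # A minimal sketch of both ideas, added for illustration (not part of the original code):
    # polynomial feature crossing to combine raw features, and PCA to reduce dimensionality
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.decomposition import PCA
    x_cross=PolynomialFeatures(degree=2,interaction_only=True,include_bias=False).fit_transform(iris.data)
    print(x_cross.shape)  # the 4 raw features plus their pairwise interaction terms
    x_pca=PCA(n_components=2).fit_transform(iris.data)
    print(x_pca.shape)    # compressed to 2 principal components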

    # 2 Feature engineering exploration on the recruitment (Lagou) dataset
    import warnings
    warnings.filterwarnings("ignore")
    import numpy as np
    import pandas as pd
    # Load the data
    la=pd.read_csv("D:Byrbt2018StudyPython机器学习全流程项目实战精讲配套课件第六讲 特征工程lagou_data5.csv",encoding="gbk")
    print(la.head())
    # advantage and label contribute little, so they can be dropped at the end
    # One-hot encoding of the categorical variables
    # pandas one-hot method
    print(pd.get_dummies(la["city"].head()))
    '''
    # sklearn method
    # First label-encode the text column into integer codes
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import LabelEncoder
    la1=LabelEncoder()
    la1.fit(list(la["city"].values))
    la["city"]=la1.transform(list(la["city"].values))
    print(la["city"].head())
    # Then convert the integer codes to one-hot encoding
    df=OneHotEncoder().fit_transform(la["city"].values.reshape(-1,1)).toarray()
    print(df[:5])
    '''
    # One-hot encode each categorical feature of the recruitment data
    f=["city","industry","education","position_name","size","stage","work_year"]
    for i in f:
        temp=pd.get_dummies(la[i])
        la=pd.concat([la,temp],axis=1) # append the encoded columns
        la=la.drop([i],axis=1)         # drop the original column
    print(la.shape)
    # Drop the remaining unneeded features
    pd.options.display.max_columns=99
    la=la.drop(["advantage","label","position_detail","salary"],axis=1)
    print(la.shape)
    print(la.head())
    la1=la

    # Feature extraction from the free-text column: binary indicator columns for Python, Java, Excel, SQL, R and other keywords
    la=pd.read_csv("D:Byrbt2018StudyPython机器学习全流程项目实战精讲配套课件第六讲 特征工程lagou_data5.csv",encoding="gbk")
    la=la[["position_detail","salary"]]
    # Indicator column for Python
    for i,j in enumerate(la["position_detail"]):
        if "python" in j:
            la["position_detail"][i]=j.replace("python","Python")
    la["Python"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
        if "Python" in j:
            la["Python"][i] =1
        else:
            la["Python"][i] =0
    print(la["Python"].head())

    la["R"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
    if "R" in j:
    la["R"][i] =1
    else:
    la["R"][i] =0
    print(la["R"].value_counts())

    for i,j in enumerate(la["position_detail"]):
        if "sql" in j:
            la["position_detail"][i]=j.replace("sql","SQL")
    la["SQL"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
        if "SQL" in j:
            la["SQL"][i] =1
        else:
            la["SQL"][i] =0
    print(la["SQL"].value_counts())

    la["Excel"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
    if "Excel" in j:
    la["Excel"][i] =1
    else:
    la["Excel"][i] =0
    print(la["Excel"].value_counts())

    la["Java"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
    if "Java" in j:
    la["Java"][i] =1
    else:
    la["Java"][i] =0
    print(la["Java"].value_counts())

    for i,j in enumerate(la["position_detail"]):
        if "linux" in j:
            la["position_detail"][i]=j.replace("linux","Linux")
    la["Linux"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
        if "Linux" in j:
            la["Linux"][i] =1
        else:
            la["Linux"][i] =0
    print(la["Linux"].value_counts())

    la["C++"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
    if "C++" in j:
    la["C++"][i] =1
    else:
    la["C++"][i] =0
    print(la["C++"].value_counts())

    for i,j in enumerate(la["position_detail"]):
        if "spark" in j:
            la["position_detail"][i]=j.replace("spark","Spark")
    la["Spark"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
        if "Spark" in j:
            la["Spark"][i] =1
        else:
            la["Spark"][i] =0
    print(la["Spark"].value_counts())

    for i,j in enumerate(la["position_detail"]):
        if "tensorflow" in j:
            la["position_detail"][i]=j.replace("tensorflow","Tensorflow")
        if "TensorFlow" in j:
            la["position_detail"][i]=j.replace("TensorFlow","Tensorflow")
    la["Tensorflow"]=pd.Series()
    for i, j in enumerate(la["position_detail"]):
        if "Tensorflow" in j:
            la["Tensorflow"][i] =1
        else:
            la["Tensorflow"][i] =0
    print(la["Tensorflow"].value_counts())
    la=la.drop(["position_detail"],axis=1)
    print(la.head())

    la=pd.concat((la,la1),axis=1).reset_index(drop=True)
    print(la.head())
    print(la.shape)
