• day01-特征工程和文本特征提取


    
    # coding=utf-8
    # 特征抽取
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    import jieba
    
    
    # 字典特征抽取
    def dictvec():
        """Demonstrate feature extraction from a list of dicts.

        Fits a ``DictVectorizer`` on three sample records, then prints the
        generated feature names followed by the transformed data.

        Returns:
            None. The results are only printed.
        """
        # sparse=False asks for a dense array so the printed matrix is readable.
        vectorizer = DictVectorizer(sparse=False)
        records = [
            {"city": "北京", "hj": 100},
            {"city": "石家庄", "hj": 20},
            {"city": "邯郸", "hj": 50},
        ]
        data = vectorizer.fit_transform(records)
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
        # 1.2; prefer get_feature_names_out() and fall back on older releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            # Convert to plain str so the printed list looks the same as before.
            names = [str(name) for name in vectorizer.get_feature_names_out()]
        else:
            names = vectorizer.get_feature_names()
        print(names)
        print(data)
        return None
    
    # 文字特征抽取
    def countvec():
        """Demonstrate bag-of-words counting on two short English sentences.

        Fits a ``CountVectorizer`` on the sample sentences, then prints the
        learned vocabulary followed by the per-document count matrix.

        Returns:
            None. The results are only printed.
        """
        vectorizer = CountVectorizer()
        # NOTE(review): "pyhton" in the first sentence looks like a typo, but it
        # is kept as-is because it changes the resulting vocabulary.
        sentences = ["life is short,i like pyhton", "life is long,i dislike python"]
        data = vectorizer.fit_transform(sentences)
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
        # 1.2; prefer get_feature_names_out() and fall back on older releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            # Convert to plain str so the printed list looks the same as before.
            names = [str(name) for name in vectorizer.get_feature_names_out()]
        else:
            names = vectorizer.get_feature_names()
        print(names)
        # fit_transform returned an object with toarray(); densify it for display.
        print(data.toarray())
        return None
    
    # 汉字特征抽取
    def hanzivec():
        """Demonstrate bag-of-words counting on a Chinese paragraph.

        The paragraph is segmented with ``jieba.cut``, the segments are joined
        with spaces into a single document, and a ``CountVectorizer`` is fitted
        on it. The vocabulary and the count matrix are printed.

        Returns:
            None. The results are only printed.
        """
        text = "所以在乡村产业转型升级造成大量职工失业的不良影响之前,如何安顿好下岗职工便成了一个继续推进乡村产业转型升级和深化改革必须要解决的问题。我们团队准备调查下岗职工的情况,了解他们下岗与再就业中的经历,总结出他们再就业的经验,并整理出一些人仍然处于失业状态的原因。为不同层次的下岗人员提供可借鉴经验,同时帮助政府解决失业人员的再Score就业问题,为深化改革和乡村产业转型升级扫清障碍。".replace("Score", "")
        # Chinese text has no spaces between words, so segment it first and
        # re-join with spaces; join() consumes the segment iterable directly.
        segmented = " ".join(jieba.cut(text))
        vectorizer = CountVectorizer()
        # NOTE(review): per the scikit-learn docs the default token_pattern keeps
        # only tokens of two or more characters, so single-character words from
        # the segmentation are presumably dropped — confirm if that matters.
        data = vectorizer.fit_transform([segmented])
        # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
        # 1.2; prefer get_feature_names_out() and fall back on older releases.
        if hasattr(vectorizer, "get_feature_names_out"):
            # Convert to plain str so the printed list looks the same as before.
            names = [str(name) for name in vectorizer.get_feature_names_out()]
        else:
            names = vectorizer.get_feature_names()
        print(names)
        print(data.toarray())
        return None
    
    if __name__ == "__main__":
        # Script entry point: run only the dictionary-vectorizer demo.
        dictvec()
    
    
    

    特征工程主要是指在机器学习建模之前对数据进行处理,将特征值提取出来,方便后续使用。

  • 相关阅读:
    leetcode--Populating Next Right Pointers in Each Node II
    leetcode—Populating Next Right Pointers in Each Node
    Pascal's Triangle II
    leetcode—pascal triangle
    leetcode—triangle
    October 23rd, 2017 Week 43rd Monday
    October 22nd, 2017 Week 43rd Sunday
    October 21st 2017 Week 42nd Saturday
    October 20th 2017 Week 42nd Friday
    October 19th 2017 Week 42nd Thursday
  • 原文地址:https://www.cnblogs.com/wuren-best/p/14262923.html
Copyright © 2020-2023  润新知