• nltk_28Twitter情感分析模型


    sklearn实战-乳腺癌细胞数据挖掘(博客主亲自录制视频教程)

    https://study.163.com/course/introduction.htm?courseId=1005269003&utm_campaign=commission&utm_source=cp-400000000398149&utm_medium=share

    生产Twitter情感分析的模型,并保存数据为pickle,此过程可能要一个小时,所以下次调用数据就很简单了

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jan 12 10:44:19 2017
    
    @author: Administrator
    
    用于短评论分析-- Twitter
    
    保存后的"positive.txt","negative.txt"需要转码为utf-8
    在线转码网址
    http://www.esk365.com/tools/GB2312-UTF8.asp
    
    
    features=5000,准确率百分之60以上
    features=10000,准确率未记录(原文数值缺失)
    
    运行时间可能长达一个小时
    """
    
    import nltk
    import random
    import pickle
    from nltk.tokenize import word_tokenize
            
    # Load the labeled short-review corpora, one review per line.
    # The files must be UTF-8 encoded (see the note in the header docstring);
    # use context managers so the file handles are always closed.
    with open("positive.txt", "r", encoding="utf-8") as pos_file:
        short_pos = pos_file.read()
    with open("negative.txt", "r", encoding="utf-8") as neg_file:
        short_neg = neg_file.read()

    documents = []   # (review_text, "pos"/"neg") pairs
    all_words = []   # adjective tokens pooled from both corpora

    # NOTE: the (review, label) pairs are appended inside the POS-tagging
    # loops further down.  The original script also appended them here,
    # which put every document into `documents` twice; those duplicate
    # loops are removed.
    
    
    # POS-tag every review and keep only words whose tag starts with an
    # allowed letter: J is adjective, R is adverb, V is verb.
    #allowed_word_types = ["J","R","V"]  # 允许形容词/副词/动词类别
    allowed_word_types = ["J"]           # adjectives only


    def _collect(corpus, label):
        """Append (review, label) pairs to `documents` and harvest
        allowed-POS tokens (lower-cased) into `all_words`."""
        # The original code split on a string literal that the blog scrape
        # broke across two lines; it was meant to be a newline.
        for review in corpus.split('\n'):
            documents.append((review, label))
            tagged = nltk.pos_tag(word_tokenize(review))
            for word, tag in tagged:
                if tag[0] in allowed_word_types:
                    all_words.append(word.lower())


    _collect(short_pos, "pos")
    _collect(short_neg, "neg")
    
    # Persist the documents so later runs can skip the slow re-parsing step.
    with open("pickled_algos/documents.pickle", "wb") as save_documents:
        pickle.dump(documents, save_documents)


    # Rank tokens by frequency and keep the 5000 most common as features.
    # (The original took list(all_words.keys())[:5000], which is insertion
    # order, not frequency order; most_common keeps the informative words.)
    # 最好改成2万以上
    all_words = nltk.FreqDist(all_words)
    word_features = [word for word, _count in all_words.most_common(5000)]
    with open("pickled_algos/word_features5k.pickle", "wb") as save_word_features:
        pickle.dump(word_features, save_word_features)
    
    
    def find_features(document):
        """Return a {feature_word: bool} dict marking, for every word in the
        global `word_features` vocabulary, whether it occurs in *document*."""
        tokens = set(word_tokenize(document))
        return {feature: (feature in tokens) for feature in word_features}
    
    # Convert every (review, label) pair into a (feature_dict, label) pair.
    featuresets = [(find_features(rev), category) for (rev, category) in documents]

    random.shuffle(featuresets)
    print(len(featuresets))

    # Persist the feature sets too: sentiment_mod.py loads
    # pickled_algos/featuresets.pickle, but the original training script
    # never wrote it.
    with open("pickled_algos/featuresets.pickle", "wb") as save_featuresets:
        pickle.dump(featuresets, save_featuresets)

    training_set = featuresets[:10000]
    testing_set = featuresets[10000:]


    # Train and evaluate the Naive Bayes classifier.
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
    classifier.show_most_informative_features(15)

    # 保存分类器 — persist the trained classifier for reuse.
    with open("pickled_algos/originalnaivebayes5k.pickle", "wb") as save_classifier:
        pickle.dump(classifier, save_classifier)
    
     sentiment_mod.py
    # -*- coding: utf-8 -*-
    """
    Created on Thu Jan 12 16:47:51 2017
    
    @author: Administrator
    """
    
    #File: sentiment_mod.py
    
    import nltk
    import random
    import pickle
    from nltk.tokenize import word_tokenize
    
    # Restore the corpus and the 5k-word feature vocabulary written by the
    # training script; `with` guarantees the files are closed.
    with open("pickled_algos/documents.pickle", "rb") as documents_f:
        documents = pickle.load(documents_f)


    with open("pickled_algos/word_features5k.pickle", "rb") as word_features5k_f:
        word_features = pickle.load(word_features5k_f)
    
    
    def find_features(document):
        """Build the boolean feature dict for one review: each word from the
        loaded `word_features` vocabulary maps to whether it appears in
        *document*."""
        present = set(word_tokenize(document))
        features = {}
        for feature_word in word_features:
            features[feature_word] = feature_word in present
        return features
    
    
    # NOTE(review): the feature sets and the train/test split below are never
    # used by sentiment(); they only mirror the training script.  Kept for
    # backward compatibility of the module's attributes.
    with open("pickled_algos/featuresets.pickle", "rb") as featuresets_f:
        featuresets = pickle.load(featuresets_f)

    random.shuffle(featuresets)
    print(len(featuresets))

    testing_set = featuresets[10000:]
    training_set = featuresets[:10000]


    # Load the trained Naive Bayes classifier saved by the training script.
    with open("pickled_algos/originalnaivebayes5k.pickle", "rb") as open_file:
        classifier = pickle.load(open_file)
    
    
    
    def sentiment(text):
        """Classify *text* with the loaded classifier; returns the predicted
        label ("pos" or "neg", matching the training labels)."""
        return classifier.classify(find_features(text))
    

    测试

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jan 12 16:50:12 2017
    
    @author: Administrator
    """
    
    import sentiment_mod as s
    
    # Smoke-test the saved model on one clearly positive and one clearly
    # negative review; expected output is "pos" then "neg".
    print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
    print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))
    

    python风控评分卡建模和风控常识(博客主亲自录制视频教程)

  • 相关阅读:
    CGI(通用网关接口)
    PHP简介
    SEO搜索引擎优化/URL
    使用表单标签,与用户交互
    认识<img>标签,为网页插入图片
    使用mailto在网页中链接Email地址
    使用<a>标签,链接到另一个页面
    1037. Magic Coupon (25)
    1038. Recover the Smallest Number (30)
    1034. Head of a Gang (30) -string离散化 -map应用 -并查集
  • 原文地址:https://www.cnblogs.com/webRobot/p/6278919.html
Copyright © 2020-2023  润新知