• 机器学习实战2--时光网评论分析


    # coding: utf-8
    
    # In[74]:
    
    # Python 2 only: reload(sys) makes sys.setdefaultencoding available again
    # (the interpreter removes it during startup), so the process-wide default
    # string encoding can be switched to UTF-8 for the Chinese review text.
    # Keep these three statements in this order.
    import sys  
    reload(sys)  
    sys.setdefaultencoding('utf8')
    import graphlab
    # NOTE(review): datetime is not referenced anywhere in the visible script.
    import datetime
    
    
    # In[75]:
    
    # Set GraphLab's GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS runtime option to 4
    # (presumably the number of worker processes used for Python lambdas).
    graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)
    
    
    # In[76]:

    # Load the scraped review data into an SFrame.
    csv_path = 'data/commit.csv'
    products = graphlab.SFrame.read_csv(csv_path)


    # In[77]:

    products.head()


    # In[78]:

    # Text-analytics helper: build a word-count feature from the review text
    # held in the 'commits' column and store it as a new 'word_count' column.
    review_text = products['commits']
    products['word_count'] = graphlab.text_analytics.count_words(review_text)


    # In[79]:

    products.head()
    
    
    # In[80]:

    # Send GraphLab Canvas visualizations to the browser.
    graphlab.canvas.set_target('browser')


    # In[81]:

    # Visualize the Chinese movie-title column.
    movie_titles = products['movie_name_zh']
    movie_titles.show()


    # In[82]:

    # Select the reviews of "Wolf Warrior 2" (战狼2).
    is_wolf_warrior_2 = movie_titles == '战狼2'
    giraffe_reviews = products[is_wolf_warrior_2]


    # In[83]:

    # Number of reviews for that movie.
    len(giraffe_reviews)


    # In[84]:

    # Show the score distribution, treating the score as a categorical value.
    # Author's observation: 7 is the most common score for Wolf Warrior 2.
    wolf_warrior_scores = giraffe_reviews['userscore']
    wolf_warrior_scores.show(view='Categorical')
    
    
    # In[88]:

    # Prepare the scores for labelling:
    #   1. drop reviews whose score is the literal string 'None' (no rating),
    #   2. convert the remaining scores to floats,
    #   3. drop the 5-point reviews, which are treated as neutral.
    has_score = products['userscore'] != 'None'
    products = products[has_score]
    products['userscore'] = products['userscore'].astype(float)
    products = products[products['userscore'] != 5]


    # In[89]:

    # Label a review as positive when its score is at least 6.
    products['sentiment'] = products['userscore'] >= 6.0


    # In[90]:

    products.head()
    
    
    # In[91]:

    # Split into training and test sets (0.8 fraction to the first one);
    # the fixed seed keeps the split reproducible.
    train_data, test_data = products.random_split(.8, seed=0)


    # In[92]:

    # Fit a logistic-regression classifier that predicts 'sentiment' from the
    # 'word_count' feature; the test set is also passed as the validation set.
    classifier_options = {
        'target': 'sentiment',
        'features': ['word_count'],
        'validation_set': test_data,
    }
    sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                          **classifier_options)


    # In[93]:

    # Evaluate the sentiment model on the test set using the ROC curve
    # (receiver operating characteristic).
    sentiment_model.evaluate(test_data, metric='roc_curve')


    # In[94]:

    # Visualize the evaluation results.
    sentiment_model.show(view='Evaluation')
    
    
    # In[95]:

    # Add a 'predicted_sentiment' column holding the model's probability
    # output for each Wolf Warrior 2 review.
    predicted_probability = sentiment_model.predict(giraffe_reviews,
                                                    output_type='probability')
    giraffe_reviews['predicted_sentiment'] = predicted_probability


    # In[96]:

    giraffe_reviews.head()


    # In[97]:

    # Sort by predicted sentiment; ascending=False puts the highest values
    # first.
    giraffe_reviews = giraffe_reviews.sort('predicted_sentiment',
                                           ascending=False)


    # In[98]:

    giraffe_reviews.head()


    # In[110]:

    # Highest predicted sentiment.
    giraffe_reviews[0]


    # In[112]:

    # Third-highest predicted sentiment.
    giraffe_reviews[2]


    # In[113]:

    # Lowest predicted sentiment.
    giraffe_reviews[-1]


    # In[114]:

    # Second-lowest predicted sentiment.
    giraffe_reviews[-2]
    
    
    # In[117]:

    # Keep only the reviews whose score is the literal string 'None', i.e.
    # reviews that carry no rating. giraffe_reviews was sliced from products
    # before the score column was cast to float, so the string comparison
    # still applies here.
    has_no_score = giraffe_reviews['userscore'] == 'None'
    giraffe_reviews = giraffe_reviews[has_no_score]


    # In[119]:

    # Last rows of the remaining (unrated) reviews.
    giraffe_reviews.tail()
    
    
    # In[ ]:

    代码地址(附作业答案): https://github.com/RedheatWei/aiproject/tree/master/Machine%20Learning%20Specialization/week3

    爬虫地址: https://github.com/RedheatWei/mtime_commit

  • 相关阅读:
    datagrid
    SQL语句
    JavaScript事件
    DOM和BOM
    JavaScript基础知识
    css
    网络编程常识
    集合框架
    多线程常识
    面向对象常识
  • 原文地址:https://www.cnblogs.com/redheat/p/9263942.html
Copyright © 2020-2023  润新知