# coding=utf-8
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
def naivebayes():
    """Classify 20-newsgroups articles with TF-IDF features and multinomial naive Bayes.

    Downloads the full 20-newsgroups corpus, splits it 75/25 into train/test,
    learns a TF-IDF vocabulary on the training articles, fits a MultinomialNB
    classifier, and prints the learned feature names, the predicted classes for
    the test set, and the test accuracy.

    Returns:
        None. All results are printed to stdout.
    """
    # Fetch the complete corpus (downloads to the sklearn data cache on first use).
    news = fetch_20newsgroups(subset="all")
    # Hold out 25% of the articles for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        news.data, news.target, test_size=0.25
    )
    # Learn the TF-IDF vocabulary from the training articles only,
    # so no information leaks from the test set.
    tf = TfidfVectorizer()
    x_train = tf.fit_transform(x_train)
    # Transform the test articles with the *training* vocabulary so both
    # matrices share the same feature space.
    x_test = tf.transform(x_test)
    print("特征值为:")
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() is the supported replacement.
    print(tf.get_feature_names_out())
    # Multinomial NB suits non-negative, count-like features such as TF-IDF.
    mlt = MultinomialNB()
    mlt.fit(x_train, y_train)
    # NOTE: the original called x_test.toarray() here to print the dense
    # feature matrix; that materializes ~4700 x 100k+ float64 values (several
    # GB) and can exhaust memory, so the dense dump is intentionally dropped.
    print("预测的分类为:", mlt.predict(x_test))
    print("测试的准确率为:", mlt.score(x_test, y_test))
    return None
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    naivebayes()
# Naive Bayes computes, for each target class, the probability that the input
# belongs to that class, and predicts the class with the highest probability.
# This makes it well suited to text/article classification.