import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import datasets
# Semi-supervised estimator from sklearn.semi_supervised.
from sklearn.semi_supervised import LabelSpreading
# Confusion matrix and per-class classification report.
from sklearn.metrics import confusion_matrix, classification_report

# Load the handwritten-digits dataset.
digits = datasets.load_digits()
# Seeded random generator so the shuffle below is reproducible.
rng = np.random.RandomState(2)
# One index per sample in the dataset.
indices = np.arange(len(digits.data))
# Shuffle the indices in place.
rng.shuffle(indices)

# Keep only the first 340 shuffled samples.
X = digits.data[indices[:340]]
y = digits.target[indices[:340]]
images = digits.images[indices[:340]]

n_total_samples = len(y)  # 340
n_labeled_points = 40  # number of samples that keep their label

# Re-index the subset: positions 0..n_total_samples-1 within X / y / images.
indices = np.arange(n_total_samples)
# Everything after the first n_labeled_points samples is treated as unlabeled.
unlabeled_set = indices[n_labeled_points:]

# Work on a copy so the true labels in y stay available for evaluation.
y_train = np.copy(y)
# Mark the unlabeled samples with -1 (their true labels are hidden from the model).
y_train[unlabeled_set] = -1
# Semi-supervised learning with label spreading.
lp_model = LabelSpreading(gamma=.25, max_iter=20)
# Train on all samples; unlabeled ones carry -1 in y_train.
lp_model.fit(X, y_train)

# Labels the model assigned to the unlabeled samples.
predicted_labels = lp_model.transduction_[unlabeled_set]
# Ground-truth labels for the same samples.
true_labels = y[unlabeled_set]

cm = confusion_matrix(true_labels, predicted_labels, labels=lp_model.classes_)

print("Label Spreading model: %d labeled & %d unlabeled points (%d total)" %
      (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples))
print(classification_report(true_labels, predicted_labels))
print("Confusion matrix")
print(cm)

# Entropy of each sample's predicted label distribution, used as an
# uncertainty score. The distribution matrix is transposed so that the
# entropy is taken per sample (stats.entropy reduces along axis 0).
pred_entropies = stats.entropy(lp_model.label_distributions_.T)

# Indices of the 10 samples with the highest entropy (argsort is ascending).
uncertainty_index = np.argsort(pred_entropies)[-10:]

# Plot the 10 most uncertain digits in a 2 x 5 grid.
f = plt.figure(figsize=(7, 5))
for index, image_index in enumerate(uncertainty_index):
    image = images[image_index]

    sub = f.add_subplot(2, 5, index + 1)
    sub.imshow(image, cmap=plt.cm.gray_r)
    # Hide the tick marks on this subplot.
    sub.set_xticks([])
    sub.set_yticks([])
    sub.set_title('predict: %i true: %i' % (
        lp_model.transduction_[image_index], y[image_index]))

# NOTE(review): the figure is never shown or saved in this script; add
# plt.show() (or f.savefig(...)) if it should be visible when run standalone.
# Official documentation: https://sklearn.apachecn.org/docs/master/15.html