#WSS未知异常预测第一种算法实现——kmeans改进版聚类算法
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
import pandas as pd
class kmeans_optimization:
    """Anomaly detector built on a two-cluster k-means with a distance refinement.

    The data is split into two clusters. If one cluster is much smaller than
    the other, it is reported as the anomalous set as a whole. Otherwise the
    cluster with the larger maximum distance to its own centre is inspected,
    and only the rows lying further than ``mean + n_sigma * std`` (of that
    cluster's own distances) from the centre are reported as anomalies.
    """

    def __init__(self, data, *, min_cluster_fraction=0.1, n_sigma=3,
                 n_init=100, random_state=None):
        """Store the data to inspect and the tuning parameters.

        Args:
            data: Two-dimensional array-like, one sample per row.
            min_cluster_fraction: A cluster holding fewer than this fraction
                of all rows is treated as anomalous as a whole.
            n_sigma: Number of standard deviations above the mean distance
                beyond which a row of the wide cluster is an anomaly.
            n_init: Number of k-means restarts (forwarded to ``KMeans``).
            random_state: Seed forwarded to ``KMeans`` for reproducible runs.
        """
        self.data = np.array(data)
        self.min_cluster_fraction = min_cluster_fraction
        self.n_sigma = n_sigma
        self.n_init = n_init
        self.random_state = random_state

    def run(self):
        """Cluster the data with k-means and split it into anomalous/normal rows.

        Returns:
            Tuple ``(errordata, normaldata, errorindex, normaldataindex)``;
            see ``_detect`` for the exact meaning of each element.
        """
        self.kmeans = KMeans(n_clusters=2, n_init=self.n_init,
                             random_state=self.random_state)
        self.kmeans.fit(self.data)
        return self._detect(self.kmeans.labels_, self.kmeans.cluster_centers_)

    def _detect(self, labels, centers):
        """Classify rows given a two-cluster labelling and its centres.

        Args:
            labels: One label (0 or 1) per row of ``self.data``.
            centers: Sequence of the two cluster centres, indexed by label.

        Returns:
            Tuple ``(errordata, normaldata, errorindex, normaldataindex)``.
            ``errordata`` rows correspond one-to-one to ``errorindex`` (row
            numbers in ``self.data``, ascending). ``normaldataindex`` is the
            ascending list of all remaining row numbers. ``normaldata`` holds
            the same rows, but when the distance refinement is used it is
            ordered as "tight cluster first, then the kept rows of the wide
            cluster", not by row number.
        """
        labels = np.asarray(labels)
        total = len(self.data)

        # Keep the original row numbers of each cluster so that indices never
        # have to be recovered by comparing row contents afterwards.
        rows1 = np.flatnonzero(labels == 0)
        rows2 = np.flatnonzero(labels == 1)
        self.r1 = self.data[rows1]
        self.r2 = self.data[rows2]
        self.kmeans_center = centers

        # Distance of every row to the centre of its own cluster.
        self.distance1 = cdist(self.r1, [centers[0]])
        self.distance2 = cdist(self.r2, [centers[1]])

        min_size = self.min_cluster_fraction * total
        if len(rows1) < min_size or len(rows2) < min_size:
            # One cluster is tiny: the clustering itself isolated the anomalies.
            if len(rows1) < len(rows2):
                error_rows = rows1
                self.errordata, self.normaldata = self.r1, self.r2
            else:
                error_rows = rows2
                self.errordata, self.normaldata = self.r2, self.r1
        else:
            # Comparable cluster sizes: look for outliers inside the cluster
            # with the larger maximum distance to its centre.
            if self.distance1.max() < self.distance2.max():
                wide_rows = rows2
                self.distance, self.distance_1 = self.distance2, self.distance1
                self.errordata1, self.normaldata1 = self.r2, self.r1
            else:
                wide_rows = rows1
                self.distance, self.distance_1 = self.distance1, self.distance2
                self.errordata1, self.normaldata1 = self.r1, self.r2

            # The spread must come from the same cluster as the mean.  Using
            # cluster 0 unconditionally made the threshold depend on the
            # arbitrary label numbering produced by k-means.
            self.threshold = (self.distance.mean()
                              + self.n_sigma * self.distance.std())

            is_outlier = self.distance.ravel() > self.threshold
            self.error_ind1 = np.flatnonzero(is_outlier).tolist()
            self.normal_ind1 = np.flatnonzero(~is_outlier).tolist()

            self.errordata = self.errordata1[is_outlier]      # anomalous rows
            self.normaldata2 = self.errordata1[~is_outlier]
            self.normaldata = np.vstack((self.normaldata1, self.normaldata2))
            error_rows = wide_rows[is_outlier]

        # Indices into self.data; a mask gives the complement in linear time.
        self.errorindex = error_rows.tolist()
        keep = np.ones(total, dtype=bool)
        keep[error_rows] = False
        self.index = list(range(total))
        self.normaldataindex = np.flatnonzero(keep).tolist()
        return (self.errordata, self.normaldata,
                self.errorindex, self.normaldataindex)
if __name__ == '__main__':
    # Synthetic sample: a large "normal" population plus three small groups
    # drawn around shifted means. It is only printed; the detector below is
    # run on the spreadsheet data instead.
    bulk = np.random.normal(1, 0.5, size=(20000, 69))
    print(bulk)
    shifted_a = np.random.normal(5, 0.5, size=(10, 69))
    print(shifted_a)
    shifted_b = np.random.normal(7, 1, size=(15, 69))
    shifted_c = np.random.normal(10, 1, size=(10, 69))
    synthetic = np.array(np.vstack((bulk, shifted_a, shifted_b, shifted_c)))
    print(synthetic)

    # Real input: the first 69 columns of the spreadsheet.
    frame = pd.read_excel("finaldata.xlsx")
    features = np.array(frame.iloc[:, :69])

    detector = kmeans_optimization(features)
    print(detector.run())