1-1.py
#algorithm-1:KNN
# Use sklearn's KNN algorithm to solve a regression problem
# Import the required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
#1-2 Load the data
boston=datasets.load_boston()
X=boston.data
y=boston.target
x=X[y<50.0]   # drop the samples whose target value is capped at 50.0
y=y[y<50.0]
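# Note (added): sklearn.datasets.load_boston was removed in scikit-learn 1.2.
# A minimal sketch of the same setup on the bundled California housing data
# (an alternative, not part of the original script) would be:
#   from sklearn.datasets import fetch_california_housing
#   housing = fetch_california_housing()
#   X, y = housing.data, housing.target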
#1-3 Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=666)
# Use sklearn's KNeighborsRegressor for the regression task
from sklearn.neighbors import KNeighborsRegressor
knn=KNeighborsRegressor()
knn.fit(x_train,y_train)
y_predict1=knn.predict(x_test)
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
print(mean_absolute_error(y_test,y_predict1))   # MAE
print(mean_squared_error(y_test,y_predict1))    # MSE
print(knn.score(x_test,y_test))                 # R^2 score
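# Added sketch: RMSE, the square root of MSE, is in the same units as the
# target and is often easier to interpret than MSE itself.
rmse=np.sqrt(mean_squared_error(y_test,y_predict1))
print(rmse)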
# Use grid search with cross-validation to determine the model's hyperparameters
param_grid=[
    {
        "weights":["uniform"],
        "n_neighbors":[i for i in range(1,11)]
    },
    {
        "weights":["distance"],
        "n_neighbors":[i for i in range(1,11)],
        "p":[i for i in range(1,6)]
    }
]
k=KNeighborsRegressor()
from sklearn.model_selection import GridSearchCV
# Define the grid search (constructor arguments: 1. the estimator, 2. the hyperparameter grid, 3. n_jobs for the number of parallel workers, -1 uses all cores, 4. verbose=2 prints the search progress)
grid_search=GridSearchCV(k,param_grid,verbose=1)
grid_search.fit(x_train,y_train)
print(grid_search.best_params_) # print the best hyperparameter combination
print(grid_search.best_score_) # print the best model's cross-validated score (R^2)
knn_best=grid_search.best_estimator_ # retrieve the best regressor
y_pre=knn_best.predict(x_test)
plt.plot(y_test)
plt.plot(y_pre)
plt.show()
DWT算法_test.py
# Implementation of the DTW (dynamic time warping) algorithm, with comparison demos
import numpy as np
import matplotlib.pyplot as plt
float_formatter = lambda x: "%.2f" % x
np.set_printoptions(formatter={'float_kind': float_formatter})
def TimeSeriesSimilarity(s1, s2):
    l1 = len(s1)
    l2 = len(s2)
    plt.plot(s1, "r", s2, "g")
    plt.show()
    s1 = (s1 - np.mean(s1)) / np.std(s1)
    s2 = (s2 - np.mean(s2)) / np.std(s2)
    paths = np.full((l1 + 1, l2 + 1), np.inf)  # initialize every cell to infinity
    paths[0, 0] = 0
    for i in range(l1):
        for j in range(l2):
            d = s1[i] - s2[j]
            cost = d ** 2
            paths[i + 1, j + 1] = cost + min(paths[i, j + 1], paths[i + 1, j], paths[i, j])
    paths = np.sqrt(paths)
    s = paths[l1, l2]
    return s, paths.T
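# Added sketch (not in the original script): recover the optimal warping path by
# backtracking through the accumulated cost matrix. It assumes the matrix is
# passed untransposed, i.e. with shape (l1+1, l2+1) as built inside the function.
def backtrack_path(paths):
    i, j = paths.shape[0] - 1, paths.shape[1] - 1
    route = [(i - 1, j - 1)]  # cell (i, j) aligns s1[i-1] with s2[j-1]
    while i > 1 or j > 1:
        # step to the cheapest of the three predecessor cells
        _, i, j = min((paths[i - 1, j - 1], i - 1, j - 1),
                      (paths[i - 1, j], i - 1, j),
                      (paths[i, j - 1], i, j - 1))
        route.append((i - 1, j - 1))
    return route[::-1]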
if __name__ == '__main__':
    s1 = [1, 2, 0, 1, 1, 2]
    # s2 = [1, 0, 1]  # earlier test series, superseded by the line below
    s2 = [3, 5, 1, 3, 3, 5]
    plt.plot(s1, "r", s2, "g")
    plt.show()
    print(s1, s2)
    distance, paths = TimeSeriesSimilarity(s1, s2)
    print(distance)
    s1 = np.array([1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1])
    s2 = np.array([0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2])
    s3 = np.array([0.8, 1.5, 0, 1.2, 0, 0, 0.6, 1, 1.2, 0, 0, 1, 0.2, 2.4, 0.5, 0.4])
    s4 = np.array([1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1.5])
    s5 = np.array([x + 1 for x in s1])
    distance, paths = TimeSeriesSimilarity(s1, s2)
    print(distance)
    distance, paths = TimeSeriesSimilarity(s1, s3)
    print(distance)
    distance, paths = TimeSeriesSimilarity(s1, s4)
    print(distance)
    distance, paths = TimeSeriesSimilarity(s1, s5)
    print(distance)
kmeans clustering analysis
# Unsupervised clustering to detect failure data
import xlwt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"]
plt.rcParams["axes.unicode_minus"]=False
data=pd.read_excel("C:/Users/y50014900/Desktop/过程测试_033GRR_IL-DM_20200601-20200709.xlsx")
print(data)
print(data.describe())
print(data.columns)
print(data.index)
print(len(data))
data["start_time"]=pd.to_datetime(data["start_time"])
df=data.sort_values(by=["start_time","condition"]) # sort_index(by=...) was removed from pandas; sort_values is the current API
print(df)
data1=pd.read_excel("C:/Users/y50014900/Desktop/过程测试_033GRR_IL-DM_20200601-20200709_1.xlsx")
print(df.values==data1.values)
df.index=range(len(df))
print(df.describe())
print(data.describe())
data=df
data4=data[data["condition"]==190.65]
data5=data[data["condition"]==193.65]
data6=data[data["condition"]==196.65]
print(len(data4),len(data5),len(data6))
index=[]
index1=[]
for i in range(len(data)-1):
    if data.loc[i,"condition"]==data.loc[i+1,"condition"]:
        print(i)
        index.append(i)
    if data.loc[i,"condition"]==193.65 and data.loc[i+1,"condition"]==190.65:
        print(i)
        index1.append(i-1)
        index1.append(i)
print(index)
print(index1)
d1=data.drop(index+index1)
data=d1
data.index=range(0,len(data))
print(len(data))
data4=data[data["condition"]==190.65]
data5=data[data["condition"]==193.65]
data6=data[data["condition"]==196.65]
print(len(data4),len(data5),len(data6))
print(data4.describe())
print(data5.describe())
print(data6.describe())
col=data.columns.tolist()
print(col)
c1=col[3:]
data4=data4.fillna(value={i:data4[i].mean() for i in c1}) # fill missing values with the mean of each column
data5=data5.fillna(value={i:data5[i].mean() for i in c1}) # fill data5/data6 the same way (as in the class version below); otherwise StandardScaler fails on NaNs
data6=data6.fillna(value={i:data6[i].mean() for i in c1})
print(data4.describe())
print(data5.describe())
print(data6.describe())
data4.index=range(len(data4))
data5.index=range(len(data5))
data6.index=range(len(data6))
# Output the final assembled data table
finaldata=pd.concat([data4[c1],data5[c1],data6[c1]],axis=1)
finaldata.index=range(len(finaldata))
print(finaldata.columns)
print(finaldata.head())
# Use the k-means algorithm for the final failure detection (failure = all ports exceed the limit on all three metrics)
scale1=StandardScaler().fit(finaldata)
finaldata1=scale1.transform(finaldata)
print(finaldata1)
## Z-score standardization
data_zs=(finaldata-finaldata.mean())/finaldata.std()
print(data_zs.values)
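# Added note, as a sketch: StandardScaler divides by the population standard
# deviation (ddof=0), while DataFrame.std() defaults to the sample standard
# deviation (ddof=1), so finaldata1 and data_zs differ slightly.
import numpy as np
v = finaldata.values
print(np.allclose(finaldata1, (v - v.mean(axis=0)) / v.std(axis=0)))  # expect True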
## Alternatively, min-max standardize with a hand-rolled function or the ready-made MinMaxScaler
scale2=MinMaxScaler().fit(finaldata)
finaldata2=scale2.transform(finaldata)
print(finaldata2)
# Define the clustering models and run the analysis
finaldata_1=finaldata.loc[:,["p1","p2"]] # duplicate column names: three frequency columns per port
kmeans=KMeans(n_clusters=2,random_state=123,max_iter=1000)
kmeans.fit(finaldata)
kmeans_1=KMeans(n_clusters=2,random_state=123)
kmeans_1.fit(finaldata_1)
kmeans1=KMeans(n_clusters=2,random_state=123)
kmeans1.fit(finaldata1)
finaldata_2=finaldata.loc[:,"p16"] # duplicate column names give a DataFrame with three frequency columns for p16
kmeans_2 = KMeans(n_clusters=2, random_state=123)
kmeans_2.fit(finaldata_2)
# Output the cluster labels from each clustering run
kmeans_pre_1=kmeans_1.labels_
kmeans_pre= kmeans.labels_ # fetch the cluster labels
kmeans_pre1=kmeans1.labels_
kmeans_pre2=kmeans_2.labels_
'''
kmeans_center= kmeans.cluster_centers_ # fetch the cluster centers
kmeans1_center=kmeans1.cluster_centers_
kmeans_inertia = kmeans.inertia_ # total within-cluster sum of squared distances
kmeans1_inertia = kmeans1.inertia_ # total within-cluster sum of squared distances
result=kmeans.predict([[5.6,2.8,4.9,2.0]])
result1=kmeans1.predict([[5.6,2.8,4.9,2.0]])
result2=kmeans2.predict([[5.6,2.8,4.9,2.0]])
## Small caveat: the data passed to predict must be standardized exactly like the training data.
print(result)
# Clustering evaluation metrics (compare the true labels with the predicted ones)
from sklearn.metrics import adjusted_rand_score,adjusted_mutual_info_score
print(adjusted_rand_score(kmeans_pre,final_resultdata["result1"]))
print(adjusted_mutual_info_score(kmeans_pre,final_resultdata["result1"]))
print(adjusted_rand_score([1,1,0,0,1,0],[0,0,1,1,1,0]))
print(adjusted_mutual_info_score([1,1,0,0,1,0],[0,0,1,1,0,1]))
'''
port=["p{}".format(i) for i in range(1,24)]
re=[]
z=[]
for i in port:
finaldata_3=finaldata.loc[:,i]
kmeans_3 = KMeans(n_clusters=2, random_state=123)
kmeans_3.fit(finaldata_3)
kmeans_pre3 = kmeans_3.labels_
ze=len(kmeans_pre3[kmeans_pre3==0])
re.append(kmeans_pre3)
z.append(ze)
print(re)
print(z)
print(kmeans_pre[:30])
print(kmeans_pre1[:30])
# Anomaly detection: every port exceeds the limit at all three frequencies
result1=[]
for i in range(len(finaldata)):
    j=1 if all(finaldata.loc[i,:]>3) else 0
    result1.append(j)
finaldata["result1"]=result1
print(finaldata["result1"])
# Anomaly detection: a single port exceeds the limit at all three frequencies
result2=[]
for i in range(len(finaldata)):
    j=0
    while j<len(c1):
        if all(finaldata.loc[i,c1[j]]>4.5): # duplicate column names: three values per port
            k=1
            break
        else:
            k=0
            j=j+1
    result2.append(k)
finaldata["result2"]=result2
print(len(finaldata[finaldata["result2"]==1]))
index=[i for (i,v) in enumerate(kmeans_pre) if v==0]
print(index)
index1=[i for (i,v) in enumerate(finaldata["result1"]) if v==1]
print(index1)
j=0
j1=0
for i in range(len(finaldata)):
    if kmeans_pre[i]==0 and finaldata["result1"][i]==1:
        j=j+1
for i in range(len(finaldata)):
    if kmeans_pre_1[i]==0 and finaldata["result1"][i]==1:
        j1=j1+1
print(j,j1)
print(len(kmeans_pre[kmeans_pre==0]))
total=len(finaldata[finaldata["result1"]==1]) # renamed from sum to avoid shadowing the built-in
print(total)
accuracy=j/total
accuracy1=j1/total
print(accuracy,accuracy1)
j=0
for i in range(len(finaldata)):
    if kmeans_pre2[i]==0 and finaldata["result2"][i]==1:
        j=j+1
print(j)
total=len(finaldata[finaldata["result2"]==1])
print(total)
accuracy=j/total
print(accuracy)
print(finaldata["p1"])
'''
#Use the k-means clustering method to analyze and output the different classes of data
from sklearn.cluster import KMeans
import numpy as np
x=np.random.normal(5,1,size=(1000,2))
print(x)
y=np.random.normal(10,2,size=(100,2))
print(y)
z=np.vstack((x,y))
print(z)
kmeans_3 = KMeans(n_clusters=2, random_state=123)
kmeans_3.fit(z)
kmeans_pre3 = kmeans_3.labels_
import matplotlib.pyplot as plt
r1=z[kmeans_pre3==0]
r2=z[kmeans_pre3==1]
print(r1)
print(r2)
plt.plot(r1[:,0],r1[:,1],'bo',r2[:,0],r2[:,1],'r*')
plt.show()
'''
# Summary demonstration of different clustering algorithms
# k-means clusters well when the groups' feature scales differ a lot; for groups whose features are close together the result is mediocre, but it still provides usable coverage
from sklearn.cluster import KMeans,DBSCAN
import numpy as np
from sklearn.manifold import TSNE
x=np.random.normal(5,1,size=(10000,2))
print(x)
y=np.random.normal(10,1,size=(100,2))
print(y)
z=np.vstack((x,y))
print(z)
kmeans_3 = KMeans(n_clusters=2, random_state=123)
kmeans_3.fit(z)
kmeans_pre3 = kmeans_3.labels_
import matplotlib.pyplot as plt
r1=z[kmeans_pre3==0]
r2=z[kmeans_pre3==1]
print(r1)
print(r2)
plt.plot(r1[:,0],r1[:,1],'bo',r2[:,0],r2[:,1],'r*')
plt.show()
'''
# Dimensionality reduction: project the data to 2-D for display and output
t=TSNE(learning_rate=1000)
t1=t.fit_transform(z)
print(t1)
print(t1.shape) # visualize the reduced data, now projected to two dimensions
plt.scatter(t1[:,0],t1[:,1])
plt.show()
d=DBSCAN()
d.fit(z)
d_pre=d.labels_
print(d_pre)
print(len(d_pre))
r1=z[d_pre==0]
r2=z[d_pre==1]
print(d_pre)
print(r1)
print(r2)
plt.plot(r1[:,0],r1[:,1],'bo',r2[:,0],r2[:,1],'r*')
plt.show()
'''
#feijiandu_test (unsupervised learning test)
from sklearn.datasets import load_iris
import xlwt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
iris=load_iris()
iris_data=iris['data']
iris_target=iris['target']
iris_names=iris['feature_names']
print("是骡子是马打印出来看看就知道了:
",'第一个',iris_data,'
','第二个',iris_target,'
','第三个',iris_names)
# In order: sepal length, sepal width, petal length, petal width.
## Z-score standardization (axis=0 so each feature is standardized separately)
data_zs=(iris_data-iris_data.mean(axis=0))/iris_data.std(axis=0)
## Alternatively, min-max standardize with a hand-rolled function or the ready-made MinMaxScaler
scale=MinMaxScaler().fit(iris_data)
iris_datascale=scale.transform(iris_data)
kmeans=KMeans(n_clusters=3,random_state=123).fit(iris_datascale)
result=kmeans.predict(scale.transform([[5.6,2.8,4.9,2.0]]))
## The point being predicted must go through the same standardization as the training data, hence scale.transform above.
print(result)
# Print a simple summary of the results
r1=pd.Series(kmeans.labels_).value_counts()
r2=pd.DataFrame(kmeans.cluster_centers_)
r=pd.concat([r2,r1],axis=1)
r.columns=list(iris_names)+['cluster size']
print(r)
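# Added sketch: the silhouette score is one quick way to sanity-check the
# choice of n_clusters (higher is better); illustration only.
from sklearn.metrics import silhouette_score
for k in range(2,7):
    labels=KMeans(n_clusters=k,random_state=123).fit_predict(iris_datascale)
    print(k,silhouette_score(iris_datascale,labels))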
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
tsne=TSNE(n_components=2,init='random',random_state=177).fit(iris_data)
df=pd.DataFrame(tsne.embedding_)
df['labels']=kmeans.labels_
df1=df[df['labels']==0]
df2=df[df['labels']==1]
df3=df[df['labels']==2]
fig=plt.figure(figsize=(9,6))
plt.plot(df1[0],df1[1],'bo',df2[0],df2[1],'r*',df3[0],df3[1],'gD')
plt.show()
fig=plt.figure(figsize=(9,6))
plt.plot(df1[0],df1[1],'bo',df2[0],df2[1],'r*')
plt.show()
# Building a machine-learning model on the failure/anomaly data
# Initial construction, testing, and analysis of a failure-prediction model -- machine-learning algorithms
# Prediction and display of the anomaly results for the process-test IL_AM/DM data -- using the idea of algorithm ensembling
#1-1 Import and try out the libraries - KNN algorithm
#1 - Import the labeled data containing the anomaly-detection results for display and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
finaldata=pd.read_excel("C:/Users/y50014900/Desktop/过程测试_033GRR10L4105623_20200601-20200708_IL_DM_异常检测分类结果.xlsx")
feature=["p1","p2","p3","p4","p5","p6","p7","p8","p9","p10","p11","p12","p13","p14","p15","p16","p17","p18","p19","p20","p21","p22","p23"]
#feature1=["p{}".format(x) for x in range(1,24)]
#print(feature1)
DM_target1=["DM1"]
DM_target2=["DM2"]
x=finaldata[feature]
x=np.array(x) # convert the input to a 2-D numpy array so the data format is uniform
y1=finaldata[DM_target1].values.ravel() # flatten the target column into a 1-D array to serve as the prediction target
y2=finaldata[DM_target2].values.ravel()
# The data is now in numpy form, ready to feed into the algorithms
#2 - Apply the machine-learning algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y1,test_size=0.2,random_state=666)
'''
# Standardize the data
from sklearn.preprocessing import StandardScaler # data standardization
s=StandardScaler()
s.fit(x_train)
x_train=s.transform(x_train)
x_test=s.transform(x_test)
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
y_predict=knn.predict(x_test)
'''
knn=KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train,y_train)
y_predict=knn.predict(x_test)
from sklearn.metrics import accuracy_score # overall model accuracy
from sklearn.metrics import confusion_matrix # confusion matrix
from sklearn.metrics import precision_score # precision
from sklearn.metrics import recall_score # recall
from sklearn.metrics import f1_score
print(knn.score(x_test,y_test)) # print the accuracy
print(accuracy_score(y_test,y_predict)) # print the accuracy
print(accuracy_score(y_test,y_predict,normalize=False)) # print the number of correct predictions
print(confusion_matrix(y_test,y_predict)) # print the confusion matrix
print(precision_score(y_test,y_predict)) # print the precision
print(recall_score(y_test,y_predict)) # print the recall
print(f1_score(y_test,y_predict)) # print the F1 score, a combined metric
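# Added sketch: classification_report prints precision, recall and F1 for each
# class in one call; shown here as a convenience.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict))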
#3 Use grid search to determine the model's hyperparameters, with cross-validation to pick the best combination
# Define the range of each hyperparameter in the search grid so the optimal combination can be found
param_grid=[
    {
        "weights":["uniform"],
        "n_neighbors":[i for i in range(1,11)]
    },
    {
        "weights":["distance"],
        "n_neighbors":[i for i in range(1,11)],
        "p":[i for i in range(1,6)]
    }
]
k=KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
# Define the grid search (constructor arguments: 1. the estimator, 2. the hyperparameter grid, 3. n_jobs for the number of parallel workers, -1 uses all cores, 4. verbose=2 prints the search progress)
grid_search=GridSearchCV(k,param_grid,cv=5,verbose=2)
grid_search.fit(x_train,y_train) # fit on the training split only, so the test set stays unseen for evaluation
print(grid_search.best_params_) # print the best hyperparameter combination
print(grid_search.best_score_) # print the best model's cross-validated accuracy
knn_best=grid_search.best_estimator_ # retrieve the best classifier
y_pre=knn_best.predict(x_test)
print(knn_best.score(x_test,y_test))
plt.scatter(range(len(y_test)),y_test,color="r",label="actual test labels")
plt.scatter(range(len(y_pre)),y_pre,color="g",label="predicted test labels")
plt.xlabel("time order of the data")
plt.ylabel("anomaly detection label")
plt.legend()
plt.show()
# Unsupervised clustering analysis test
#Use the k-means clustering method to analyze and output the different classes of data
from sklearn.cluster import KMeans
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"]
plt.rcParams["axes.unicode_minus"]=False
x=np.random.normal(1,0.5,size=(100000,2))
print(x)
y=np.random.normal(5,0.5,size=(10,2))
print(y)
y1=np.random.normal(7,1,(15,2))
y2=np.random.normal(10,1,(10,2))
z=np.vstack((x,y,y1,y2))
# y2=np.random.normal(7,1,(10,2)) # leftover reassignment after z was built; it had no effect
print(z)
plt.plot(z[:,0],z[:,1],'b*',label="all WSS data points")
plt.xlabel("p1_IL metric")
plt.ylabel("p2_IL metric")
plt.legend()
plt.show()
kmeans_3 = KMeans(n_clusters=2,n_init=100)
kmeans_3.fit(z)
kmeans_pre3 = kmeans_3.labels_
r1=z[kmeans_pre3==0]
r2=z[kmeans_pre3==1]
print(r1)
print(r2)
plt.plot(r1[:,0],r1[:,1],'bo',label="anomalous data points")
plt.plot(r2[:,0],r2[:,1],'r*',label="normal data points")
plt.xlabel("p1_IL metric")
plt.ylabel("p2_IL metric")
plt.legend()
plt.show()
kmeans_center= kmeans_3.cluster_centers_ # fetch the cluster centers
print(kmeans_center)
distance1=cdist(r1,[kmeans_center[0]])
distance2=cdist(r2,[kmeans_center[1]])
print(distance1)
print(distance2)
plt.plot(distance1,label="distances of anomalous points")
plt.plot(distance2,label="distances of normal points")
plt.xlabel("data point index")
plt.ylabel("distance to the cluster center")
plt.legend()
plt.show()
threshold1= distance1.mean() + 3 * distance1.std()
threshold2= distance2.mean() + 3 * distance2.std() # was distance1.std(), a copy-paste slip
if len(r1)<0.2*len(z) or len(r2)<0.2*len(z):
    if len(r1)<len(r2):
        errorpoint = r1
        normal = r2
    else:
        errorpoint = r2
        normal = r1
    plt.plot(r1[:, 0], r1[:, 1], 'bo', r2[:, 0], r2[:, 1], 'r*')
    plt.show()
else:
    if distance1.max()<distance2.max():
        distance=distance2
        r=r2
        n=r1
    else:
        distance=distance1
        r=r1
        n=r2
    threshold = distance.mean() + 7 * distance.std() # was distance1.std(), a copy-paste slip
    error=[]
    normal=[]
    for (i,v) in enumerate(distance):
        if v>threshold:
            error.append(i)
        else:
            normal.append(i)
    errorpoint=r[error]
    normalpoint=r[normal]
    normal=np.vstack((normalpoint,n))
plt.plot(errorpoint[:,0],errorpoint[:,1],'r*',label="anomalous data points")
plt.plot(normal[:,0],normal[:,1],'bo',label="normal data points")
plt.xlabel("p1_IL metric")
plt.ylabel("p2_IL metric")
plt.legend()
plt.show()
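# Added sketch: the thresholding above is the classic mean + k*sigma rule; a
# small reusable helper (illustration only, the names are ours):
def sigma_outliers(distances, k=3):
    d = np.asarray(distances).ravel()
    return d > d.mean() + k * d.std()  # boolean mask of points beyond mean + k*std
# e.g. mask = sigma_outliers(distance1); anomalous = r1[mask]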
#IL test code
# Main class: preprocess the process-test IL-DM data and output the anomaly-detection labels
# Import the relevant data-analysis packages
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"]
plt.rcParams["axes.unicode_minus"]=False
# Define the relevant class
class Process_IL_DM_LABELoutput:
    # Initializer: define the instance variables
    def __init__(self,datapath,number,time):
        self.data=pd.read_excel(datapath)
        self.number=number
        self.time=time
        self.col=self.data.columns.tolist()
    # Data preprocessing: clean the data
    # Sort the exported table by time (rows with the same timestamp are further sorted by the optical frequency of the measurement)
    def timesort(self):
        self.data["start_time"] = pd.to_datetime(self.data["start_time"])
        self.data=self.data.sort_values(by=["start_time", "condition"]) # sort_index(by=...) was removed from pandas
        self.data.index=range(len(self.data))
        print(self.data.describe())
        print(self.data)
    # Data-cleaning function: mainly removes duplicated measurements
    def dataclear(self):
        index = []
        index1 = []
        for i in range(len(self.data) - 1):
            if self.data.loc[i, "condition"] == self.data.loc[i + 1, "condition"]:
                print(i)
                index.append(i)
            if self.data.loc[i, "condition"] == 193.65 and self.data.loc[i + 1, "condition"] == 190.65:
                print(i)
                index1.append(i - 1)
                index1.append(i)
        print(index)
        print(index1)
        self.data = self.data.drop(index + index1)
        self.data.index = range(0, len(self.data))
        print(self.data)
    # Output the final clean data once preprocessing is done
    def finaldataoutput(self):
        self.c1 = self.col[3:] # column positions of the port metrics p1:p23
        data4 = self.data[self.data["condition"] == 190.65]
        data5 = self.data[self.data["condition"] == 193.65]
        data6 = self.data[self.data["condition"] == 196.65]
        data4 = data4.fillna(value={i: data4[i].mean() for i in self.c1}) # fill missing values with the mean of each column
        data5 = data5.fillna(value={i: data5[i].mean() for i in self.c1})
        data6 = data6.fillna(value={i: data6[i].mean() for i in self.c1})
        print(data4.describe())
        print(data5.describe())
        print(data6.describe())
        data4.index = range(len(data4))
        data5.index = range(len(data5))
        data6.index = range(len(data6))
        self.finaldata = pd.concat([data4[self.c1], data5[self.c1], data6[self.c1]], axis=1)
        print(self.finaldata.columns)
        print(self.finaldata.index)
    #1 - Anomaly detection: every port exceeds the limit at all three frequencies
    def IL_three_outlier_detection(self,threshold1):
        self.threshold1=threshold1
        self.result1 = []
        for i in range(len(self.finaldata)):
            j = 1 if all(self.finaldata.loc[i, :]>threshold1) else 0
            self.result1.append(j)
        self.finaldata["result1"] = self.result1
        print(self.finaldata["result1"])
    #2 - Anomaly detection: a single port exceeds the limit at all three frequencies
    def IL_one_outlier_detection(self,threshold2):
        self.threshold2 = threshold2
        self.result2 = []
        for i in range(len(self.finaldata)):
            j = 0
            while j < len(self.c1):
                if all(self.finaldata.loc[i, self.c1[j]] > threshold2): # duplicate column names: three values per port
                    k = 1
                    break
                else:
                    k = 0
                    j = j + 1
            self.result2.append(k)
        self.finaldata["result2"] = self.result2
        print(self.finaldata["result2"])
    # Plot the detected anomaly points
    def plot(self,result):
        plt.figure(1)
        plt.scatter(self.finaldata.index[self.finaldata[result]==1], self.finaldata[result][self.finaldata[result]==1],color="r",label="anomalous points")
        plt.scatter(self.finaldata.index[self.finaldata[result]==0],self.finaldata[result][self.finaldata[result]==0],color="g",label="normal points")
        plt.title("process test_"+self.number+"_IL_DM_"+result+"_anomaly detection labels")
        plt.xlabel("time order of the data_"+self.time)
        plt.ylabel("IL-DM anomaly label")
        plt.legend()
        plt.show()
    # Save the final processed data to a file
    def finaldata_save(self):
        #c2 = ["190.65-" + i for i in self.c1]
        #c3 = ["193.65-" + i for i in self.c1]
        #c4 = ["196.65-" + i for i in self.c1]
        #self.finaldata.columns=c2+c3+c4+["result1","result2"]
        c2 = self.col[:2]
        self.finaldata = pd.concat([self.data[c2], self.finaldata], axis=1)
        self.finaldata.to_excel("过程测试_"+self.number+"_"+self.time+"_IL-DM_异常检测分类结果.xlsx")
d=Process_IL_DM_LABELoutput('C:/Users/y50014900/Desktop/过程测试_033GRR_IL-DM_20200601-20200709.xlsx','033GRR10L4105623','20200601-20200708')
d.timesort()
d.dataclear()
d.finaldataoutput()
d.IL_three_outlier_detection(3)
d.IL_one_outlier_detection(4)
d.plot("result1")
d.plot("result2")
d.finaldata_save()
#PDL test code
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams["font.sans-serif"]=["SimHei"]
plt.rcParams["axes.unicode_minus"]=False
data=pd.read_excel("C:/Users/y50014900/Desktop/过程测试_033GRR_PDL-AM_2020-6-1_2020-7-8.xlsx")
print(data)
print(data.describe())
print(data.columns)
print(data.index)
print(len(data))
data["start_time"]=pd.to_datetime(data["start_time"])
df=data.sort_values(by=["start_time","condition"]) # sort_index(by=...) was removed from pandas; sort_values is the current API
print(df)
data=df
data.index=range(0,len(data))
data4=data[data["condition"]==190.65]
data5=data[data["condition"]==193.65]
data6=data[data["condition"]==196.65]
print(len(data4),len(data5),len(data6))
index=[]
index1=[]
for i in range(len(data)-1):
    if data.loc[i,"condition"]==data.loc[i+1,"condition"]:
        print(i)
        index.append(i)
    if data.loc[i,"condition"]==193.65 and data.loc[i+1,"condition"]==190.65:
        print(i)
        index1.append(i-1)
        index1.append(i)
print(index)
print(index1)
data=data.drop(index+index1)
data.index=range(0,len(data))
print(len(data))
data4=data[data["condition"]==190.65]
data5=data[data["condition"]==193.65]
data6=data[data["condition"]==196.65]
print(len(data4),len(data5),len(data6))
print(data4.describe())
print(data5.describe())
print(data6.describe())
data4.index=range(len(data4))
data5.index=range(len(data5))
data6.index=range(len(data6))
# Missing-value handling, option 1: fill the gaps with the column mean
'''
col=data4.columns.tolist()
print(col)
c1=col[3:]
data4 = data4.fillna(value={i: data4[i].mean() for i in c1}) # fill missing values with the mean of each column
data5 = data5.fillna(value={i: data5[i].mean() for i in c1})
data6 = data6.fillna(value={i: data6[i].mean() for i in c1})
print(data4.describe())
print(data5.describe())
print(data6.describe())
'''
# Missing-value handling, option 2: drop the rows that contain missing data (the corresponding time points are removed outright)
col=data4.columns.tolist()
print(col)
c1=col[3:]
finaldata=pd.concat([data4[c1],data5[c1],data6[c1]],axis=1)
finaldata.index=range(len(finaldata))
print(finaldata.columns)
print(len(finaldata))
finaldata=finaldata.dropna(axis=0)
print(len(finaldata))
finaldata.index=range(len(finaldata))
data1=finaldata.iloc[:,:23]
data2=finaldata.iloc[:,23:46]
data3=finaldata.iloc[:,46:69]
print(len(data1),len(data2),len(data3))
print(data1)
print(data2)
print(data3)
#1 - Single-port, three-frequency anomaly detection
result1=[]
for i in range(len(finaldata)):
    j=0
    while j<len(c1):
        if all(finaldata.loc[i,c1[j]]>0.17): # duplicate column names: three values per port
            k=1
            break
        else:
            k=0
            j=j+1
    result1.append(k)
finaldata["result1"]=result1
print(finaldata["result1"])
'''
#2 - Adjacent ports exceed the limit at one frequency, with the neighboring readings trending high
result2=[]
for i in range(len(finaldata)):
    j = 0
    while j < len(c1) - 1:
        if (any(finaldata.loc[i, c1[j]] > 0.2) and any(finaldata.loc[i, c1[j+1]] > 0.2)) and all(finaldata.loc[i, c1[j:j+2]] > 0.2 - 0.05):
            k = 1
            break
        else:
            k = 0
            j = j + 1
    result2.append(k)
finaldata["result2"] = result2
print(finaldata["result2"])
#3 - Adjacent ports exceed the limit at all three frequencies
result3=[]
for i in range(len(finaldata)):
    j=0
    while j<len(c1)-1:
        if all(finaldata.loc[i,c1[j]]>0.1) and all(finaldata.loc[i,c1[j+1]]>0.1):
            k=1
            break
        else:
            k=0
            j=j+1
    result3.append(k)
finaldata["result3"]=result3
print(finaldata["result3"])
#4-1 - Many ports exceed the limit at a single frequency (any one frequency over the limit counts; the frequencies need not match across ports)
result4=[]
number=15
for i in range(len(finaldata)):
    j=0
    m=0
    while j<len(c1):
        if any(finaldata.loc[i,c1[j]]>0.15):
            m+=1
        else:
            m=m
        j=j+1
    if m>number:
        k=1
    else:
        k=0
    result4.append(k)
finaldata["result4"]=result4
print(finaldata["result4"])
#4-2 - Many ports exceed the limit at the same frequency (the frequency must match: multiple ports over the limit at one shared frequency)
result4=[]
number=10 # how many ports count as "many"
for i in range(len(data1)):
    if ((data1.loc[i]>0.15).sum()>number or (data2.loc[i]>0.15).sum()>number) or (data3.loc[i]>0.15).sum()>number: # the threshold can be set per frequency line
        k=1
    else:
        k=0
    result4.append(k)
finaldata["result4_1"]=result4
print(finaldata["result4_1"])
#5 - The spectrum contains any line above 0.15 dB - detection of test problems
result5=[]
for i in range(len(finaldata)):
    j=0
    while j<len(c1):
        if any(finaldata.loc[i,c1[j]]>0.15):
            k=1
            break
        else:
            k=0
            j=j+1
    result5.append(k)
finaldata["result5"]=result5
print(finaldata["result5"])
print(finaldata.describe())
'''