tsne
数据不做预处理:
# coding: utf-8 import collections import numpy as np import os import pickle from sklearn.neighbors import NearestNeighbors import numpy as np from sklearn.manifold import TSNE
# ....... X = X+black_verify+white_verify+unknown_verify+bd_verify print black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels y = y+black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels print("ALL data check:") print("len of X:", len(X)) print("len of y:", len(y)) # print(unknown_verify) X_embedded = TSNE(n_components=2).fit_transform(X) with open("tsne_data_X.pkl", "wb") as f: pickle.dump([X_embedded, y], f)
import pickle from collections import Counter import numpy as np import matplotlib.pyplot as Plot def main(): with open("tsne_data_X.pkl", "rb") as f: [X_embedded, y] = pickle.load(f, encoding='iso-8859-1') print(len(X_embedded)) print(len(y)) print(X_embedded[:3]) print(y[:3]) i = 0 for l in y: if type(l) == type([]): raise Exception(str([i,y])) i+=1 print(Counter(y)) Y, labels = np.array(X_embedded), np.array(y) titles = ("white","black","black_verify_labels","white_verify_labels","unknown_verify_labels","bd_verify_labels") colors=['b', 'c', 'y', 'm', 'r', 'g', 'peru'] for i in range(0, 6): idx_1 = [i1 for i1 in range(len(labels)) if labels[i1]==i] flg1=Plot.scatter(Y[idx_1,0], Y[idx_1,1], 20,color=colors[i],label=titles[i]); Plot.legend() Plot.savefig('tsne.pdf') Plot.show() main()
数据做standard标准化处理
使用pca,不进行预处理:
使用standard scaler预处理,再做pca:
from sklearn import preprocessing scaler = preprocessing.StandardScaler().fit(X) #scaler = preprocessing.MinMaxScaler().fit(X) X = scaler.transform(X) print("standard X sample:", X[:3]) black_verify = scaler.transform(black_verify) print(black_verify) white_verify = scaler.transform(white_verify) print(white_verify) unknown_verify = scaler.transform(unknown_verify) print(unknown_verify) bd_verify = scaler.transform(bd_verify) print(bd_verify) #print black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels X = np.concatenate((X,black_verify,white_verify,unknown_verify,bd_verify)) #X = X+black_verify+white_verify+unknown_verify+bd_verify y = y+black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels print("ALL data check:") print("len of X:", len(X)) print("len of y:", len(y)) # print(unknown_verify) X_embedded = PCA(n_components=2).fit_transform(X) with open("pca_data_X_scaled.pkl", "wb") as f: pickle.dump([X_embedded, y], f)
最后效果:
最后使用自编码器来来降维:
代码:
X = np.concatenate((X,black_verify,white_verify,unknown_verify,bd_verify)) y = y+black_verify_labels+white_verify_labels+unknown_verify_labels+bd_verify_labels print("ALL data check:") print("len of X:", len(X)) print("len of y:", len(y)) # print(unknown_verify) ratio_of_train = 0.8 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=(1 - ratio_of_train)) # Building the encoder encoder = tflearn.input_data(shape=[None, 75]) encoder = tflearn.fully_connected(encoder, 64) encoder = tflearn.fully_connected(encoder, 2) # Building the decoder decoder = tflearn.fully_connected(encoder, 64) decoder = tflearn.fully_connected(decoder, 75, activation='sigmoid') # Regression, with mean square error net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.0001, loss='mean_square', metric=None) # Training the auto encoder model = tflearn.DNN(net, tensorboard_verbose=0) model.fit(X_train, X_train, n_epoch=200, validation_set=(X_test, X_test), run_id="auto_encoder", batch_size=1024) # Encoding X[0] for test print(" Test encoding of X[0]:") # New model, re-using the same session, for weights sharing encoding_model = tflearn.DNN(encoder, session=model.session) print(encoding_model.predict([X[0]])) X_embedded = encoding_model.predict(X) #TSNE(n_components=2).fit_transform(X) with open("tflearn_auto_enc_data_X_scaled.pkl", "wb") as f: pickle.dump([X_embedded, y], f)
如果是迭代次数不一样,则可能有一些差别,见下图,和上面的可能有些差别:
修改64为128: