1. Importing packages
The following style is not recommended, because it pollutes the namespace:
from numpy import *
Write it like this instead:
import numpy as np
2. NumPy assignment semantics
A plain assignment such as b = a makes a and b refer to the same object: changing a also changes b. It does not copy a's values into b.
The benefit is efficiency; NumPy avoids copying whenever it can.
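A minimal sketch of this behavior (the array values are made up for illustration):

import numpy as np

a = np.array([1, 2, 3])
b = a              # b and a refer to the same array object, no data is copied
b[0] = 99
print(a)           # [99  2  3] -- modifying b modified a as well
c = a.copy()       # an explicit copy creates an independent array
c[1] = -1
print(a)           # still [99  2  3] -- a is unaffected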
3. The NumPy clipping function
clip: np.clip trims the values of an array so they stay inside a given [min, max] range; a short sketch follows.
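A minimal sketch of np.clip (illustrative values):

import numpy as np

x = np.array([-5, 0, 3, 10])
print(np.clip(x, 0, 5))  # values are trimmed into the range [0, 5]: [0 0 3 5]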
4. Plotting a scatter chart
#!/usr/bin/python
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np

# Scatter plot
X = np.array([1, 2, 3, 4])
X_EN = np.array(['one', 'two', 'three', 'four'])
Y = np.array([2, 4, 6, 8])
plt.scatter(X, Y)
plt.xlabel("X")
plt.ylabel("Y")
plt.title("study")
plt.xticks(X, X_EN)  # label the x ticks with the English words
plt.grid()
plt.show()
5. Functions can be passed as arguments
For example:
def add(x, y, f):
    return f(x) + f(y)
Calling add(-3, -4, abs) returns abs(-3) + abs(-4) = 7.
6. Classifying iris flowers (1)
#!/usr/bin/python
# -*- coding: utf-8 -*-
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt
import numpy as np

data = load_iris()
features = data['data']
target = data['target']

# First visualize the relationship between two of the features
# setosa=0; versicolor=1; virginica=2
for t, marker, color in zip(xrange(3), ">ox", "rgb"):
    plt.scatter(features[target == t, 0], features[target == t, 3], marker=marker, c=color)
plt.show()

plength = features[:, 2]
is_setosa = (target == 0)
max_setosa = plength[is_setosa].max()       # maximum petal length of setosa
min_no_setosa = plength[~is_setosa].min()   # minimum petal length of the other two species
print max_setosa, ' ', min_no_setosa
# Separating the other two species is left for part (2)
7. Linear regression
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation

# Prepare the data set
Xtrain = numpy.array([[1], [2], [3], [6], [7], [8]])
Ytrain = numpy.array([[1], [2], [3], [4], [7], [8]])

# Train the model
linreg = LinearRegression()
linreg.fit(Xtrain, Ytrain)  # expects 2-D arrays
print linreg.intercept_  # intercept
print linreg.coef_       # coefficients

# Predict
y_pre = linreg.predict([[9]])
print 'Prediction for X=9: Y =', y_pre

# Model evaluation: mean squared error (square the errors, sum, then average)
print 'MSE', metrics.mean_squared_error(linreg.predict(Xtrain), Ytrain)
# Model evaluation: root mean squared error
print 'RMSE', numpy.sqrt(metrics.mean_squared_error(linreg.predict(Xtrain), Ytrain))

# Cross-validation (note the return order: X_train, X_test, y_train, y_test)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(Xtrain, Ytrain, test_size=0.3, random_state=0)
linreg.fit(x_train, y_train)
print 'Cross-validation result:'
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import cross_validation

# Prepare the data set (two features per sample)
Xtrain = numpy.array([[1, 2], [2, 2], [3, 3], [6, 5], [7, 4], [8, 9]])
Ytrain = numpy.array([[3], [4], [6], [11], [11], [17]])

# Train the model
linreg = LinearRegression()
linreg.fit(Xtrain, Ytrain)  # expects 2-D arrays
print linreg.intercept_  # intercept
print linreg.coef_       # coefficients

# Predict
y_pre = linreg.predict([[9, 8]])
print 'Prediction for X=[9, 8]: Y =', y_pre

# Model evaluation: mean squared error
print 'MSE', metrics.mean_squared_error(linreg.predict(Xtrain), Ytrain)
# Model evaluation: root mean squared error
print 'RMSE', numpy.sqrt(metrics.mean_squared_error(linreg.predict(Xtrain), Ytrain))
# Cross-validation
8. The TF-IDF algorithm
import math

def tfIdf(term, doc, docset):
    tf = float(doc.count(term)) / sum(a_doc.count(term) for a_doc in docset)
    idf = math.log(float(len(docset)) / len([a_doc for a_doc in docset if term in a_doc]))
    return tf * idf

a, abb, abc = ['a'], ['a', 'b', 'b'], ['a', 'b', 'c']
D = [a, abb, abc]
print tfIdf('a', a, D)
print tfIdf('b', a, D)
print tfIdf('a', abb, D)
print tfIdf('b', abb, D)
print tfIdf('b', abc, D)
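For comparison, a minimal sketch of the same toy corpus fed through scikit-learn's built-in TfidfVectorizer (not part of the original notes; its smoothing and normalization differ from the hand-rolled function above, and token_pattern is overridden so single-letter tokens are kept):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["a", "a b b", "a b c"]  # same toy corpus as above, as whitespace-separated strings
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")  # default pattern drops 1-character tokens
tfidf = vectorizer.fit_transform(docs)
print(sorted(vectorizer.vocabulary_))  # ['a', 'b', 'c']
print(tfidf.toarray())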
9. K-means clustering
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans

# X holds the sample features, y the cluster labels: 1000 samples with 2 features each,
# 4 blobs centered at [-1,-1], [0,0], [1,1], [2,2] with standard deviations [0.4, 0.2, 0.2, 0.2]
X, y = make_blobs(n_samples=1000, n_features=2,
                  centers=[[-1, -1], [0, 0], [1, 1], [2, 2]],
                  cluster_std=[0.4, 0.2, 0.2, 0.2], random_state=16)
# plt.scatter(X[:, 0], X[:, 1], marker='o')
# plt.show()

model = KMeans(n_clusters=6, random_state=9).fit(X)
pre_y = model.predict(X)
X1 = X[pre_y == 0]
X2 = X[pre_y == 1]
X3 = X[pre_y == 2]
X4 = X[pre_y == 3]
X5 = X[pre_y == 4]
X6 = X[pre_y == 5]
plt.scatter(X1[:, 0], X1[:, 1], marker='o')
plt.scatter(X2[:, 0], X2[:, 1], marker='*', color="r")
plt.scatter(X3[:, 0], X3[:, 1], marker='+')
plt.scatter(X4[:, 0], X4[:, 1], marker='.')
plt.scatter(X5[:, 0], X5[:, 1], marker='>')
plt.scatter(X6[:, 0], X6[:, 1], marker='^', color="g")
plt.show()
10. Latent Dirichlet Allocation topic model
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim import corpora, models, similarities

documents = ["Human machine interface for lab abc computer applications",
             "A survey of user opinion of computer system response time",
             "The EPS user interface management system",
             "System and human system engineering testing of EPS",
             "Relation of user perceived response time to error measurement",
             "The generation of random binary unordered trees",
             "The intersection graph of paths in trees",
             "Graph minors IV Widths of trees and well quasi ordering",
             "Graph minors A survey"]

# Remove stop words and tokenize
# Note: this stop-word list is only an example; real use needs a fuller one.
# For Chinese text, use the jieba segmenter: https://github.com/fxsjy/jieba
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# Remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
pprint(texts)

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # save the dictionary for later use
print(dictionary)
# Look up the id assigned to each word
print 'id2word:', dictionary.token2id

# Build the corpus
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store on disk for later use
print corpus

# Build the Latent Dirichlet Allocation model
model = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary)
print 'model:', model
topics = [model[c] for c in corpus]
print 'topic0', topics[0]
# To be continued
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import jieba.analyse
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


def pre_detail():
    # Segment the raw documents with jieba and write them out space-separated
    jieba.analyse.set_stop_words("./Data/stopWords")
    with open("./Data/push_candidate_10w") as f:
        documents = f.readlines()
    with open("./Data/nlp_test", mode='w') as f2:
        for document in documents:
            f2.write(" ".join(jieba.analyse.extract_tags(document, topK=None)) + " ")


if __name__ == '__main__':
    print ("LDA topic model")
    with open("./Data/nlp_test", mode='r') as train_file:
        words = train_file.readlines()
    cntVector = CountVectorizer()
    cntTf = cntVector.fit_transform(words)  # term-frequency vectors
    lda = LatentDirichletAllocation(n_topics=100)
    theme = lda.fit_transform(cntTf)
    paras = [''] * 100  # one accumulator string per topic
    with open("./Data/tmp", mode='r') as result_file:
        push = result_file.readlines()
    np.set_printoptions(threshold=np.nan)
    max_p = np.max(theme, axis=1)
    for i in range(0, max_p.size):
        # Append each document to the topic it is most strongly associated with
        paras[np.where(theme[i] == max_p[i])[0][0]] += str(push[i]).decode("utf-8")
    print str(paras).decode("unicode-escape").replace(
        "', u",
        ' ----------------------------------'
        '----------------------------------- ')
11. Saving and loading objects
import pickle

# These static helpers belong inside a utility class; elsewhere in these notes they
# are called as DataProcess.save_obj / DataProcess.load_obj.
@staticmethod
def save_obj(obj, file_name):
    with open(file_name, 'wb') as f:
        pickle.dump(obj, f)

@staticmethod
def load_obj(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f)
12. K-nearest-neighbors parameters
n_neighbors - how many nearest points to use
weights - how the neighbors are weighted; options: uniform (every neighbor counts equally), distance (weight by inverse distance), or a [callable] function
algorithm - the algorithm used to find the nearest neighbors ('auto', 'ball_tree', 'kd_tree', or 'brute'); a short sketch of these parameters follows this list
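A minimal sketch of these parameters on a made-up toy dataset (the feature values and parameter settings below are illustrative, not taken from the project code that follows):

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Toy data: two well-separated groups
X = np.array([[0, 0], [0, 1], [1, 0], [5, 5], [5, 6], [6, 5]])
y = np.array([0, 0, 0, 1, 1, 1])

# 3 nearest neighbors, weighted by inverse distance, found with a KD-tree
knn = KNeighborsClassifier(n_neighbors=3, weights="distance", algorithm="kd_tree")
knn.fit(X, y)
print(knn.predict([[0.5, 0.5], [5.5, 5.5]]))  # expected: [0 1]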
import json
import jieba
import math
from MsgIdentification.DataProcess import DataProcess
from sklearn.neighbors import KNeighborsClassifier


class CompositeModel:
    def __init__(self):
        super().__init__()
        self.X = DataProcess.load_obj("./data/model.pkl")
        self.labelList = DataProcess.load_obj("./data/labelList.pkl")
        self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
        self.knn = KNeighborsClassifier(n_neighbors=4, weights="distance")
        self.knn.fit(self.X, self.labelList)

    def predict(self, msgInfo: object) -> object:
        in_json = json.loads(msgInfo)
        msg = in_json['body']
        # if len(DataProcess.chinese_reg.findall(msg)) < 16:
        #     return False
        # length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
        # if length > 0.12 or length < 0.000001:
        #     return False
        # if len(DataProcess.rubish_reg.findall(msg)):
        #     return True
        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        # Drop single-character tokens
        dellist = []
        for word in seg_list:
            if len(word) == 1:
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = self.vectorizer.transform([new_sentence])
        return self.knn.predict(new_vec)
Testing the KNN model:
from MsgIdentification.CompositeModel import CompositeModel, DataProcess
import json
import datetime

if __name__ == '__main__':
    # file = open("./data/data")
    # msgInfos = file.readlines()
    data_train = DataProcess()
    data_train.dataToVector()
    msgInfos = DataProcess.load_obj("./data/test_data.pkl")
    model = CompositeModel()
    # Counters start slightly above zero to avoid division by zero
    i = 0.02
    right = 0
    effect_identify = 0
    critical_error = 0
    false_count = 0.01
    for msgInfo in msgInfos:
        in_json = json.loads(msgInfo)
        result = model.predict(msgInfo)
        label = in_json['spam']
        i += 1
        if not label:
            false_count += 1
        if result == label:
            right += 1
        if result:
            effect_identify += 1
        if result and label == False:
            critical_error += 1
    # Accuracy, non-spam flagged as spam, flagged ratio overall, flagged ratio relative to spam count
    print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ",
          effect_identify * 1.0 / i, " -- ", effect_identify * 1.0 / (i - false_count))
13. The package used for cross-validation
from sklearn import model_selection
or, for example:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from sklearn.model_selection import KFold
import numpy as np

cv = KFold(n_splits=3, shuffle=True)
X = np.asarray([10, 22, 32, 41, 35, 46, 57, 18, 59])
for train, test in cv.split(X):
    print(X[train])
    print(X[test])
    print('=========')
A helper that splits a data set into 2^n_fold equal parts:
from sklearn.model_selection import train_test_split


def dataset_split_n(X, Y, n_fold):
    """Split X and Y into 2**n_fold equal parts by repeatedly halving."""
    parent_x = [X]
    parent_y = [Y]
    child_x = []
    child_y = []
    for n in range(0, n_fold):
        child_x = []
        child_y = []
        for index in range(0, len(parent_x)):
            tmp_x1, tmp_x2, tmp_y1, tmp_y2 = train_test_split(parent_x[index], parent_y[index], test_size=0.5)
            child_x.append(tmp_x1)
            child_x.append(tmp_x2)
            child_y.append(tmp_y1)
            child_y.append(tmp_y2)
        parent_x = child_x
        parent_y = child_y
    return child_x, child_y
14. Naive Bayes model and test
from sklearn.naive_bayes import MultinomialNB
from MsgIdentification.DataProcess import DataProcess
import json
import jieba
import math


class Bayes:
    def __init__(self):
        super().__init__()
        self.X = DataProcess.load_obj("./data/model.pkl")
        self.labelList = DataProcess.load_obj("./data/labelList.pkl")
        self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
        self.clf = MultinomialNB()
        self.clf.fit(self.X, self.labelList)

    def predict(self, msgInfo):
        in_json = json.loads(msgInfo)
        msg = in_json['body']
        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        dellist = []
        for word in seg_list:
            if len(word) == 1:
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = self.vectorizer.transform([new_sentence])
        # Rule-based shortcuts before falling back to the classifier
        if len(DataProcess.chinese_reg.findall(msg)) < 16:
            return False
        length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
        if length > 0.12 or length < 0.000001:
            return False
        if len(DataProcess.rubish_reg.findall(msg)):
            return True
        return self.clf.predict(new_vec)


if __name__ == '__main__':
    # dataProcess = DataProcess()
    # dataProcess.dataToVector()
    bayes = Bayes()
    msgInfos = DataProcess.load_obj("./data/test_data.pkl")
    # Counters start slightly above zero to avoid division by zero
    i = 0.02
    right = 0
    effect_identify = 0
    critical_error = 0
    false_count = 0.01
    for msgInfo in msgInfos:
        result = bayes.predict(msgInfo)
        in_json = json.loads(msgInfo)
        label = in_json['spam']
        i += 1
        if not label:
            false_count += 1
        if result == label:
            right += 1
        if result:
            effect_identify += 1
        if result and label == False:
            critical_error += 1
    print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ",
          effect_identify * 1.0 / i, " -- ", effect_identify * 1.0 / (i - false_count))
15. Decision tree model and test
Decision trees roughly fall into three types: ID3, based on information entropy and information gain; C4.5, based on the information gain ratio; and those based on the Gini index (CART). A short scikit-learn sketch of the split criterion follows.
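As a minimal sketch of this distinction: scikit-learn's DecisionTreeClassifier exposes a criterion parameter that can be set to 'gini' or 'entropy' (it does not implement ID3/C4.5 exactly); the iris data below is only for illustration.

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
# Gini-based split criterion (the default)
gini_tree = DecisionTreeClassifier(criterion="gini").fit(data.data, data.target)
# Entropy / information-gain based split criterion
entropy_tree = DecisionTreeClassifier(criterion="entropy").fit(data.data, data.target)
print(gini_tree.score(data.data, data.target), entropy_tree.score(data.data, data.target))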
from sklearn import tree
from MsgIdentification.DataProcess import DataProcess
import json
import math
import jieba
import pydotplus
import graphviz


class DecisionTree:
    def __init__(self):
        super().__init__()
        self.clf = tree.DecisionTreeClassifier(max_depth=100, min_samples_split=5, min_samples_leaf=3)
        self.X = DataProcess.load_obj("./data/model.pkl")
        self.labelList = DataProcess.load_obj("./data/labelList.pkl")
        self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
        self.clf.fit(self.X, self.labelList)

    def predict(self, msgInfo):
        json_temp = json.loads(msgInfo)
        msg = json_temp['body']
        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        dellist = []
        for word in seg_list:
            if len(word) == 1:
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = self.vectorizer.transform([new_sentence])
        # Rule-based shortcuts before falling back to the classifier
        if len(DataProcess.chinese_reg.findall(msg)) < 16:
            return False
        length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
        if length > 0.12 or length < 0.000001:
            return False
        if len(DataProcess.rubish_reg.findall(msg)):
            return True
        return self.clf.predict(new_vec)

    def save_pdf(self):
        # Export the trained tree to a PDF via graphviz
        dot_data = tree.export_graphviz(self.clf, out_file=None)
        graph = pydotplus.graph_from_dot_data(dot_data)
        graph.write_pdf("./data/decisionTree.pdf")


if __name__ == '__main__':
    decisionTree = DecisionTree()
    decisionTree.save_pdf()
    msgInfos = DataProcess.load_obj("./data/test_data.pkl")
    # Counters start slightly above zero to avoid division by zero
    i = 0.02
    right = 0
    effect_identify = 0
    critical_error = 0
    false_count = 0.01
    for msgInfo in msgInfos:
        result = decisionTree.predict(msgInfo)
        in_json = json.loads(msgInfo)
        label = in_json['spam']
        i += 1
        if not label:
            false_count += 1
        if result == label:
            right += 1
        if result:
            effect_identify += 1
        if result and label == False:
            critical_error += 1
    print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ",
          effect_identify * 1.0 / i, " -- ", effect_identify * 1.0 / (i - false_count))
16. A neural network based on TensorFlow
import tensorflow as tf
import numpy as np
from MsgIdentification.DataProcess import DataProcess
import json
import jieba


def add_layer(inputs, in_size, out_size, activation_function=None):
    # Add one more layer and return the output of this layer
    Weights = tf.Variable(tf.random_normal([in_size, out_size]))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    if activation_function is None:
        outputs = Wx_plus_b
    else:
        outputs = activation_function(Wx_plus_b)
    return outputs


# 2. Define placeholder nodes to receive the input data
xs = tf.placeholder(tf.float32, [None, 3729])
ys = tf.placeholder(tf.float32, [None, 1])

l1 = add_layer(xs, 3729, 50, activation_function=tf.nn.relu)
# l2 = add_layer(l1, 50, 10, activation_function=None)
prediction = add_layer(l1, 50, 1, activation_function=None)

loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

init = tf.initialize_all_variables()
saver = tf.train.Saver()
sess = tf.Session()
# Nothing defined above is executed until sess.run is called
sess.run(init)

x = DataProcess.load_obj("./data/model.pkl")
label_list = DataProcess.load_obj("./data/labelList.pkl")
x_data = np.asarray(x.toarray())
y_data = np.asarray(label_list).reshape(444, 1)

saver.restore(sess, "./data/neuralNetwork.ckpt")
# for i in range(10000000):
#     # train_step and loss are ops built on placeholders, so feed the data in with feed_dict
#     sess.run(train_step, feed_dict={xs: x_data, ys: y_data})
#     if i % 10 == 0:  # to see the step improvement
#         now_loss = sess.run(loss, feed_dict={xs: x_data, ys: y_data})
#         print(now_loss)
#         if now_loss < 0.0001:
#             break
#     if i % 1000 == 0:
#         save_path = saver.save(sess, "./data/neuralNetwork.ckpt")
#         print("Save to path: ", save_path)

msgInfos = DataProcess.load_obj("./data/test_data.pkl")
vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
i = 0
for i in range(0, len(msgInfos) - 2, 50):
    msgInfo = msgInfos[i]
    in_json = json.loads(msgInfo)
    msg = in_json['body']
    label = in_json['spam']
    seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
    dellist = []
    for word in seg_list:
        if len(word) == 1:
            dellist.append(word)
    for word in dellist:
        seg_list.remove(word)
    new_sentence = " ".join(seg_list)
    new_vec = vectorizer.transform([new_sentence])
    print(sess.run(prediction, feed_dict={xs: np.asarray(new_vec.toarray())}) > 0.4, label)