• Python learning notes


    1. Importing packages

    The following style is not recommended, because it pollutes the namespace:

    from numpy import *

    Write it like this instead:

    import numpy as np

    2. Assignment in numpy

    A plain assignment such as b = a makes a and b refer to the same object: changing a also changes b. The values of a are not copied into a new array for b.

    The benefit is efficiency: numpy avoids copying data whenever it can, so a copy has to be requested explicitly.
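
    A minimal sketch of the difference (the array values are just an example):

    import numpy as np

    a = np.array([1, 2, 3])
    b = a              # b is the same object as a; no data is copied
    b[0] = 99
    print(a)           # [99  2  3] -- changing b changed a as well

    c = a.copy()       # an explicit copy owns its own data
    c[0] = 0
    print(a)           # [99  2  3] -- a is unaffected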

    3. numpy's clipping function

    clip - limits the values of an array to a given interval (np.clip or the ndarray method); see the example below.
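
    A short example of clip (numbers chosen only for illustration):

    import numpy as np

    a = np.arange(10)            # [0 1 2 ... 9]
    print(np.clip(a, 2, 7))      # values below 2 become 2, values above 7 become 7
    print(a.clip(2, 7))          # the ndarray method behaves the same way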

    4. Plotting a scatter plot

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import matplotlib.pyplot as plt
    import numpy as np
    
    # scatter plot
    X=np.array([1,2,3,4])
    X_EN=np.array(['one','two','three','four'])
    Y=np.array([2,4,6,8])
    plt.scatter(X,Y)
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.title("study")
    plt.xticks(X,X_EN)
    plt.grid()
    plt.show()

    5. Functions can be passed as arguments

    For example:

    def add(x,y,f):
        return f(x)+f(y)

    Calling add(-3,-4,abs) returns abs(-3) + abs(-4) = 7.

    6. Classifying iris flowers (1)

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    from sklearn.datasets import load_iris
    from matplotlib import pyplot as plt
    import numpy as np
    data = load_iris()
    features = data['data']
    target = data['target']
    #first visualize two of the features against each other; labels: setosa=0, versicolor=1, virginica=2
    for t,marker,color in zip(xrange(3),">ox","rgb"):
        plt.scatter(features[target==t,0],features[target==t,3],marker=marker,c=color)
    plt.show()
    
    plength=features[:,2]
    is_setosa=(target==0)
    max_setosa = plength[is_setosa].max()  # maximum petal length of setosa
    min_no_setosa=plength[~is_setosa].min()  # minimum petal length of the other two species
    print max_setosa,' ',min_no_setosa
    
    #distinguishing the other two species -- see the sketch below
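
    A possible continuation (a sketch added to these notes, continuing the script above, not part of the original):
    search every feature and every observed value for the single threshold that best separates virginica from versicolor.

    # restrict to the two remaining species and look for the best single-feature threshold
    features_rest = features[~is_setosa]
    labels_rest = target[~is_setosa]
    is_virginica = (labels_rest == 2)

    best_acc = -1.0
    best_fi, best_t = 0, 0.0
    for fi in range(features_rest.shape[1]):
        for t in features_rest[:, fi]:                 # try every observed value as a threshold
            pred = features_rest[:, fi] > t
            acc = (pred == is_virginica).mean()
            if acc > best_acc:
                best_acc, best_fi, best_t = acc, fi, t
    print("best feature=%d threshold=%.2f accuracy=%.3f" % (best_fi, best_t, best_acc))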

    7. Linear regression

    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import numpy
    from sklearn import metrics
    from sklearn.linear_model import LinearRegression
    from sklearn import cross_validation
    
    #prepare the data set
    Xtrain=numpy.array([[1],[2],[3],[6],[7],[8]])
    Ytrain=numpy.array([[1],[2],[3],[4],[7],[8]])
    #train the model
    linreg=LinearRegression()
    linreg.fit(Xtrain,Ytrain)  # both arguments are 2-D arrays
    
    print linreg.intercept_  # intercept
    print linreg.coef_  # coefficients
    
    #predict: the input must be a 2-D array, one row per sample
    y_pre=linreg.predict([[9]])
    print 'Prediction for X=9: Y=',y_pre
    
    #model evaluation: mean squared error (square the errors, sum them, take the mean)
    print 'MSE',metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain)
    #model evaluation: root mean squared error
    print 'RMSE',numpy.sqrt(metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain))
    #cross-validation: train_test_split returns X_train, X_test, y_train, y_test in that order
    x_train,x_test,y_train,y_test=cross_validation.train_test_split(Xtrain,Ytrain,test_size=0.3,random_state=0)
    linreg.fit(x_train,y_train)
    print 'Cross-validation result:'
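
    The original note stops here; a possible continuation (a sketch continuing the script above, not part of the
    original) evaluates the refit model on the held-out split:

    y_test_pre = linreg.predict(x_test)
    print 'Test MSE', metrics.mean_squared_error(y_test_pre, y_test)
    print 'Test RMSE', numpy.sqrt(metrics.mean_squared_error(y_test_pre, y_test))

    The same workflow also works with multi-dimensional input (two features per sample):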
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import numpy
    from sklearn import metrics
    from sklearn.linear_model import LinearRegression
    from sklearn import cross_validation
    
    #prepare the data set
    Xtrain=numpy.array([[1,2],[2,2],[3,3],[6,5],[7,4],[8,9]])
    Ytrain=numpy.array([[3],[4],[6],[11],[11],[17]])
    #train the model
    linreg=LinearRegression()
    linreg.fit(Xtrain,Ytrain)  # both arguments are 2-D arrays
    
    print linreg.intercept_  # intercept
    print linreg.coef_  # coefficients
    
    #predict: the input must be a 2-D array, one row per sample
    y_pre=linreg.predict([[9,8]])
    print 'Prediction for X=[9,8]: Y=',y_pre
    
    #model evaluation: mean squared error
    print 'MSE',metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain)
    #model evaluation: root mean squared error
    print 'RMSE',numpy.sqrt(metrics.mean_squared_error(linreg.predict(Xtrain),Ytrain))
    #cross-validation (same procedure as in the single-feature example above)

    8. The TF-IDF algorithm

    TF-IDF scores a term highly when it appears often in one document but rarely across the rest of the corpus; the function below computes the score of a single term.

    import scipy as sp
    import math
    def tfIdf(term,doc,docset):
        tf=float(doc.count(term))/sum(a_doc.count(term) for a_doc in docset)
        idf=math.log(float(len(docset))/len([a_doc for a_doc in docset if term in a_doc]))
        return tf*idf
    
    a,abb,abc=['a'],['a','b','b'],['a','b','c']
    D=[a,abb,abc]
    
    print tfIdf('a',a,D)
    print tfIdf('b',a,D)
    print tfIdf('a',abb,D)
    print tfIdf('b',abb,D)
    print tfIdf('b',abc,D)

    9. K-means clustering

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets.samples_generator import make_blobs
    from sklearn.cluster import KMeans
    # X holds the sample features and y the cluster labels: 1000 samples with 2 features each, 4 clusters centred at [-1,-1], [0,0], [1,1], [2,2], cluster std [0.4, 0.2, 0.2, 0.2]
    X, y = make_blobs(n_samples=1000, n_features=2, centers=[[-1,-1], [0,0], [1,1], [2,2]], cluster_std=[0.4, 0.2, 0.2, 0.2],
                      random_state =16)
    #plt.scatter(X[:, 0], X[:, 1], marker='o')
    #plt.show()
    
    model = KMeans(n_clusters=6,random_state=9).fit(X)  # note: asks for 6 clusters although the data was generated with 4
    pre_y=model.predict(X)
    X1=X[pre_y==0]
    X2=X[pre_y==1]
    X3=X[pre_y==2]
    X4=X[pre_y==3]
    X5=X[pre_y==4]
    X6=X[pre_y==5]
    plt.scatter(X1[:, 0], X1[:, 1], marker='o')
    plt.scatter(X2[:, 0], X2[:, 1], marker='*',color="r")
    plt.scatter(X3[:, 0], X3[:, 1], marker='+')
    plt.scatter(X4[:, 0], X4[:, 1], marker='.')
    plt.scatter(X5[:, 0], X5[:, 1], marker='>')
    plt.scatter(X6[:, 0], X6[:, 1], marker='^',color="g")
    plt.show()
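
    The script above asks KMeans for 6 clusters even though the data was generated with 4. A small added sketch
    (continuing the script above, not part of the original note) that compares cluster counts with the silhouette
    score; higher is better, so the generated data should score best around k=4:

    from sklearn.metrics import silhouette_score

    for k in range(2, 8):
        labels = KMeans(n_clusters=k, random_state=9).fit_predict(X)
        print("k=%d  silhouette=%.3f" % (k, silhouette_score(X, labels)))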

    10. Latent Dirichlet Allocation (LDA) topic model

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    from gensim import corpora,models,similarities
    documents = ["Human machine interface for lab abc computer applications",
                  "A survey of user opinion of computer system response time",
                  "The EPS user interface management system",
                  "System and human system engineering testing of EPS",
                  "Relation of user perceived response time to error measurement",
                  "The generation of random binary unordered trees",
                  "The intersection graph of paths in trees",
                  "Graph minors IV Widths of trees and well quasi ordering",
                  "Graph minors A survey"]
    
    # remove stop words and tokenize
    # note: this stop list is only an example; real use needs a fuller one
    #       for Chinese text, use the jieba tokenizer: https://github.com/fxsjy/jieba
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]
    
    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    
    from pprint import pprint   # pretty-printer
    pprint(texts)
    
    dictionary = corpora.Dictionary(texts)
    dictionary.save('/tmp/deerwester.dict') # save the dictionary for later reuse
    print(dictionary)
    #inspect the integer id assigned to each word
    print 'id2word:',dictionary.token2id
    #build the bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # persist to disk for later use
    print corpus
    #build the Latent Dirichlet Allocation model
    model = models.ldamodel.LdaModel(corpus,num_topics=2,id2word=dictionary)
    print 'model:',model
    topics=[model[c] for c in corpus]
    print 'topic0',topics[0]
    #to be continued -- see the sketch below
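
    The gensim script stops before inspecting the topics. A possible continuation (a sketch continuing the script
    above, not part of the original note):

    # print the top words of each of the two topics
    # (the exact return format of print_topics varies slightly across gensim versions)
    for topic in model.print_topics(num_topics=2, num_words=5):
        print(topic)

    A second implementation, based on scikit-learn and jieba: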
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    import numpy as np
    import jieba.analyse
    import sys
    
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    
    def pre_detail():
        jieba.analyse.set_stop_words("./Data/stopWords")
        with open("./Data/push_candidate_10w") as f:
            documents = f.readlines()
            with open("./Data/nlp_test", mode='w') as f2:
                for document in documents:
                    f2.write(" ".join(jieba.analyse.extract_tags(document, topK=None)) + "\n")
    
    
    if __name__ == '__main__':
        print ("LDA topic model")
        with open("./Data/nlp_test", mode='r') as train_file:
            words = train_file.readlines()
            cntVector = CountVectorizer()
            cntTf = cntVector.fit_transform(words)  # term-frequency vectors
            lda = LatentDirichletAllocation(n_topics=100)
            theme = lda.fit_transform(cntTf)
            paras = [''] * 100  # one empty accumulator string per topic (n_topics=100)
            with open("./Data/tmp", mode='r') as result_file:
                push = result_file.readlines()
                np.set_printoptions(threshold=np.nan)
                max_p = np.max(theme, axis=1)
                for i in range(0, max_p.size):
                    paras[np.where(theme[i] == max_p[i])[0][0]] += str(push[i]).decode("utf-8")
            print str(paras).decode("unicode-escape").replace("', u", '\n----------------------------------'
                                                                      '-----------------------------------\n')

    11. Saving and loading objects

    import pickle


    # fragment of the DataProcess helper class used throughout these notes
    class DataProcess:
        @staticmethod
        def save_obj(obj, file_name):
            with open(file_name, 'wb') as f:
                pickle.dump(obj, f)

        @staticmethod
        def load_obj(file_name):
            with open(file_name, 'rb') as f:
                return pickle.load(f)
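
    A quick usage sketch (the object and the file path are just examples; it assumes the methods above sit on the
    DataProcess class used elsewhere in these notes):

    data = {"spam": True, "body": "example message"}
    DataProcess.save_obj(data, "./data/example.pkl")
    restored = DataProcess.load_obj("./data/example.pkl")
    print(restored == data)   # True -- the object round-trips through pickle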

    12. K-nearest-neighbours (KNN) parameters

      n_neighbors - how many of the nearest points to use

      weights - how neighbours are weighted; options: 'uniform' (every neighbour counts equally), 'distance' (weight by inverse distance), or a callable that computes the weights

      algorithm - the algorithm used to find the nearest neighbours ('auto', 'ball_tree', 'kd_tree' or 'brute'); see the small sketch below
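
    A minimal sketch of these parameters on a toy data set (the values are made up for illustration; the real model
    used in these notes follows below):

    from sklearn.neighbors import KNeighborsClassifier

    X = [[0], [1], [2], [3]]
    y = [0, 0, 1, 1]
    knn = KNeighborsClassifier(n_neighbors=3, weights="distance", algorithm="auto")
    knn.fit(X, y)
    print(knn.predict([[1.2]]))   # the nearest, heavily weighted neighbours are class 0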

    import json
    import jieba
    import math
    from MsgIdentification.DataProcess import DataProcess
    from sklearn.neighbors import KNeighborsClassifier
    class CompositeModel:
    
        def __init__(self):
            super().__init__()
            self.X = DataProcess.load_obj("./data/model.pkl")
            self.labelList = DataProcess.load_obj("./data/labelList.pkl")
            self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
            self.knn = KNeighborsClassifier(n_neighbors=4,weights="distance")
            self.knn.fit(self.X,self.labelList)
    
        def predict(self, msgInfo: object) -> object:
            in_json = json.loads(msgInfo)
            msg = in_json['body']
            #if(len(DataProcess.chinese_reg.findall(msg))<16):
            #    return False
            #length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg),1.5)
            #if length > 0.12 or length < 0.000001:
            #    return False
            #if len(DataProcess.rubish_reg.findall(msg)):
            #    return True
    
            seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
            dellist = []
            for word in seg_list:
                if (len(word) == 1):
                    dellist.append(word)
            for word in dellist:
                seg_list.remove(word)
            new_sentence = " ".join(seg_list)
            new_vec = self.vectorizer.transform([new_sentence])
            return self.knn.predict(new_vec)

    Testing the KNN model:

    from MsgIdentification.CompositeModel import CompositeModel, DataProcess
    import json
    import datetime
    
    if __name__ == '__main__':
        # file = open("./data/data")
        # msgInfos = file.readlines()
        data_train = DataProcess()
        data_train.dataToVector()
        msgInfos = DataProcess.load_obj("./data/test_data.pkl")
        model = CompositeModel()
        i = 0.02            # started slightly above zero (presumably to avoid division by zero in the ratios below)
        right = 0
        effect_identify = 0
        critical_error = 0
        false_count = 0.01
        for msgInfo in msgInfos:
            in_json = json.loads(msgInfo)
            result = model.predict(msgInfo)
            label = in_json['spam']
    
            i += 1
            if not label:
                false_count += 1
    
            if result == label:
                right += 1
                if result:
                    effect_identify += 1
    
            if result and label == False:
                critical_error += 1
    
        print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ", effect_identify * 1.0 / i, " -- ",
              effect_identify * 1.0 / (i - false_count))

    13. Packages for cross-validation

      from sklearn import model_selection

      or

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    from sklearn.model_selection import KFold
    import numpy as np
    
    cv = KFold(n_splits=3, shuffle=True)
    X = np.asarray([10, 22, 32, 41, 35, 46, 57, 18, 59])
    for train, test in cv.split(X):
        print(X[train])
        print(X[test])
        print('=========')

    A function that splits X and Y into 2^n_fold equal parts:

    from sklearn.model_selection import train_test_split

    def dataset_split_n(X, Y, n_fold):
        """
        Split X and Y into 2^n_fold equal parts by repeated halving.
        :param X: feature array
        :param Y: label array
        :param n_fold: number of halving rounds
        :return: the list of X parts and the list of Y parts
        """
        parent_x = []
        parent_y = []
        child_x = []
        child_y = []
        parent_x.append(X)
        parent_y.append(Y)
        for n in range(0, n_fold):
            child_x = []
            child_y = []
            for index in range(0, len(parent_x)):
                tmp_x1, tmp_x2, tmp_y1, tmp_y2 = train_test_split(parent_x[index], parent_y[index], test_size=0.5)
                child_x.append(tmp_x1)
                child_x.append(tmp_x2)
                child_y.append(tmp_y1)
                child_y.append(tmp_y2)
            parent_x = child_x
            parent_y = child_y
        return child_x, child_y
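
    A small usage sketch of dataset_split_n (the arrays are just an example):

    import numpy as np

    X = np.arange(16).reshape(8, 2)     # 8 samples, 2 features
    Y = np.arange(8)
    x_parts, y_parts = dataset_split_n(X, Y, 2)
    print(len(x_parts))                 # 4 parts, i.e. 2^2
    print([len(p) for p in x_parts])    # each part holds 2 of the 8 samples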

    14. Naive Bayes model and test

    from sklearn.naive_bayes import MultinomialNB
    from MsgIdentification.DataProcess import DataProcess
    import json
    import jieba
    import math
    
    
    class Bayes:
        def __init__(self):
            super().__init__()
            self.X = DataProcess.load_obj("./data/model.pkl")
            self.labelList = DataProcess.load_obj("./data/labelList.pkl")
            self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
            self.clf = MultinomialNB()
            self.clf.fit(self.X, self.labelList)
    
        def predict(self, msgInfo):
            in_json = json.loads(msgInfo)
            msg = in_json['body']
    
            seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
            dellist = []
            for word in seg_list:
                if len(word) == 1:
                    dellist.append(word)
            for word in dellist:
                seg_list.remove(word)
            new_sentence = " ".join(seg_list)
            new_vec = self.vectorizer.transform([new_sentence])
            if len(DataProcess.chinese_reg.findall(msg)) < 16:
                return False
            length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
            if length > 0.12 or length < 0.000001:
                return False
            if len(DataProcess.rubish_reg.findall(msg)):
                return True
            return self.clf.predict(new_vec)
    
    
    if __name__ == '__main__':
        #dataProcess = DataProcess()
        #dataProcess.dataToVector()
        bayes = Bayes()
        msgInfos = DataProcess.load_obj("./data/test_data.pkl")
        i = 0.02
        right = 0
        effect_identify = 0
        critical_error = 0
        false_count = 0.01
        for msgInfo in msgInfos:
            result = bayes.predict(msgInfo)
            in_json = json.loads(msgInfo)
            label = in_json['spam']
            i += 1
            if not label:
                false_count += 1
    
            if result == label:
                right += 1
                if result:
                    effect_identify += 1
    
            if result and label == False:
                critical_error += 1
    
        print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ", effect_identify * 1.0 / i, " -- ",
              effect_identify * 1.0 / (i - false_count))

    15. Decision tree model and test

      Decision trees come in roughly three flavours: ID3 (based on entropy and information gain), C4.5 (based on the information gain ratio), and CART (based on the Gini index). scikit-learn's DecisionTreeClassifier, used below, implements an optimised version of CART.

    from sklearn import tree
    from MsgIdentification.DataProcess import DataProcess
    import json
    import math
    import jieba
    import pydotplus
    import graphviz
    
    
    class DecisionTree:
        def __init__(self):
            super().__init__()
            self.clf = tree.DecisionTreeClassifier(max_depth=100, min_samples_split=5, min_samples_leaf=3)
            self.X = DataProcess.load_obj("./data/model.pkl")
            self.labelList = DataProcess.load_obj("./data/labelList.pkl")
            self.vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
            self.clf.fit(self.X, self.labelList)
    
        def predict(self, msgInfo):
            json_temp = json.loads(msgInfo)
            msg = json_temp['body']
    
            seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
            dellist = []
            for word in seg_list:
                if len(word) == 1:
                    dellist.append(word)
            for word in dellist:
                seg_list.remove(word)
            new_sentence = " ".join(seg_list)
            new_vec = self.vectorizer.transform([new_sentence])
            if len(DataProcess.chinese_reg.findall(msg)) < 16:
                return False
            length = math.pow(len(DataProcess.chinese_reg.findall(msg)), 1) / math.pow(len(msg), 1.5)
            if length > 0.12 or length < 0.000001:
                return False
            if len(DataProcess.rubish_reg.findall(msg)):
                return True
            return self.clf.predict(new_vec)
    
        def save_pdf(self):
            dot_data = tree.export_graphviz(self.clf, out_file=None)
            graph = pydotplus.graph_from_dot_data(dot_data)
            graph.write_pdf("./data/decisionTree.pdf")
    
    
    if __name__ == '__main__':
        decisionTree = DecisionTree()
        decisionTree.save_pdf()
        msgInfos = DataProcess.load_obj("./data/test_data.pkl")
        i = 0.02
        right = 0
        effect_identify = 0
        critical_error = 0
        false_count = 0.01
        for msgInfo in msgInfos:
            result = decisionTree.predict(msgInfo)
            in_json = json.loads(msgInfo)
            label = in_json['spam']
            i += 1
            if not label:
                false_count += 1
    
            if result == label:
                right += 1
                if result:
                    effect_identify += 1
    
            if result and label == False:
                critical_error += 1
    
        print(right * 1.0 / i, " -- ", critical_error * 1.0 / false_count, " -- ", effect_identify * 1.0 / i, " -- ",
              effect_identify * 1.0 / (i - false_count))

    16. A neural network built with TensorFlow

    import tensorflow as tf
    import numpy as np
    from MsgIdentification.DataProcess import DataProcess
    import json
    import jieba
    
    
    def add_layer(inputs, in_size, out_size, activation_function=None):
        # add one more layer and return the output of this layer
        Weights = tf.Variable(tf.random_normal([in_size, out_size]))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
        Wx_plus_b = tf.matmul(inputs, Weights) + biases
        if activation_function is None:
            outputs = Wx_plus_b
        else:
            outputs = activation_function(Wx_plus_b)
        return outputs
    
    
    # define placeholders that will receive the input data
    xs = tf.placeholder(tf.float32, [None, 3729])
    ys = tf.placeholder(tf.float32, [None, 1])
    
    l1 = add_layer(xs, 3729, 50, activation_function=tf.nn.relu)
    #l2 = add_layer(l1, 50, 10, activation_function=None)
    prediction = add_layer(l1, 50, 1, activation_function=None)
    loss = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction),
                                        reduction_indices=[1]))
    train_step = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    init = tf.initialize_all_variables()
    saver = tf.train.Saver()
    sess = tf.Session()
    
    # nothing above performs any computation; it only runs when sess.run is called
    sess.run(init)
    x = DataProcess.load_obj("./data/model.pkl")
    label_list = DataProcess.load_obj("./data/labelList.pkl")
    x_data = np.asarray(x.toarray())
    y_data = np.asarray(label_list).reshape(444, 1)
    
    saver.restore(sess, "./data/neuralNetwork.ckpt")
    
    #for i in range(10000000):
    #    # training: train_step and loss are ops built from placeholders, so inputs must be supplied via feed_dict
    #    sess.run(train_step, feed_dict={xs: x_data, ys: y_data})
    #    if i % 10 == 0:  # to see the step improvement
    #        now_loss = sess.run(loss, feed_dict={xs: x_data, ys: y_data})
    #        print(now_loss)
    #        if now_loss < 0.0001:
    #            break
    #    if i % 1000 == 0:
    #        save_path = saver.save(sess, "./data/neuralNetwork.ckpt")
    #        print("Save to path: ", save_path)
    
    msgInfos = DataProcess.load_obj("./data/test_data.pkl")
    vectorizer = DataProcess.load_obj("./data/vectorizer.pkl")
    i = 0
    for i in range(0, len(msgInfos) - 2, 50):
        msgInfo = msgInfos[i]
        in_json = json.loads(msgInfo)
        msg = in_json['body']
        label = in_json['spam']
        seg_list = list(jieba.cut(DataProcess.especial_reg.sub("", msg), cut_all=False))
        dellist = []
        for word in seg_list:
            if (len(word) == 1):
                dellist.append(word)
        for word in dellist:
            seg_list.remove(word)
        new_sentence = " ".join(seg_list)
        new_vec = vectorizer.transform([new_sentence])
        print(sess.run(prediction, feed_dict={xs: np.asarray(new_vec.toarray())}) > 0.4, label)