• Detecting anomalous user command sequences with an LSTM classifier (a Naive Bayes approach in the style of spam filtering, treating the command sequence as the "mail", also works)


    By collecting the bash operation logs of a Linux server, we can train a model of a specific user's command habits and then flag operations that deviate from those habits as anomalous.

    We use the SEA dataset, which covers the behavior logs of 70-odd UNIX users; the data come from the commands recorded by the UNIX acct accounting mechanism. 15,000 commands were collected for each user; 50 users were randomly drawn from the user set as normal users, and simulated commands were randomly inserted into the remaining users' command blocks as insider-masquerader attack data. Each user's 15,000 commands are cut into 150 blocks of 100 commands each, of which the first 80 blocks form the training set and the remaining 70 the test set.
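
    As the title's note suggests, a Naive Bayes classifier in the spam-filtering style is a workable alternative: treat each 100-command block as a "document" and the masquerade label as "spam". Below is a minimal sketch with scikit-learn, assuming cmd_blocks and y are prepared exactly as in the LSTM script further down (do_nb and n_train are illustrative names, not part of the original code):

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    def do_nb(cmd_blocks, y, n_train=80):
        # Join each 100-command block into one whitespace-separated "document".
        docs = [" ".join(block) for block in cmd_blocks]
        # Commands such as '[' contain no word characters, so tokenize on whitespace.
        vectorizer = CountVectorizer(token_pattern=r"\S+")
        x = vectorizer.fit_transform(docs)
        clf = MultinomialNB()
        clf.fit(x[:n_train], y[:n_train])
        # Accuracy on the remaining blocks (the 70-block test split).
        print("NB test accuracy: %.4f" % clf.score(x[n_train:], y[n_train:]))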


    Sample of the dataset (one command per line):

    cpp
    sh
    xrdb
    cpp
    sh
    xrdb
    mkpts
    test
    stty
    hostname
    date
    echo
    [
    find
    chmod
    tty
    echo
    env
    echo
    sh
    userenv
    wait4wm
    xhost
    xsetroot
    reaper
    xmodmap
    sh
    [
    cat
    stty
    hostname
    date
    echo
    [
    find
    chmod
    tty
    echo
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    more
    sh
    launchef
    launchef
    sh
    9term
    sh
    launchef
    sh
    launchef
    hostname
    [
    cat
    stty
    hostname
    date
    echo
    [
    find
    chmod
    tty
    echo
    sh
    more
    sh
    more
    sh
    ex
    sendmail
    sendmail
    sh
    MediaMai
    sendmail
    sh
    rm
    MediaMai
    sh
    rm
    MediaMai
    launchef
    launchef
    sh
    sh
    more
    sh
    sh
    rm
    MediaMai
    netstat
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    netscape
    sh
    netscape
    more
    sh
    rm
    sh
    MediaMai
    =
    telnet
    tput
    netscape
    netscape
    netscape
    netscape
    netscape
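
    Complete LSTM training code: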
    
    # -*- coding:utf-8 -*-

    import numpy as np

    from nltk.probability import FreqDist
    from sklearn.neighbors import KNeighborsClassifier  # needed by the do_knn baseline

    import tflearn
    from tflearn.data_utils import to_categorical, pad_sequences  # pad_sequences kept for the commented-out padding step
    
    # number of command blocks used for training (the remaining 150-N blocks form the test set)
    N=80
    
    def load_user_cmd_new(filename):
        """Read one command per line and cut the stream into blocks of
        100 commands; also return the user's full command vocabulary."""
        cmd_list=[]
        dist=[]
        with open(filename) as f:
            i=0
            x=[]
            for line in f:
                line=line.strip('\n')
                x.append(line)
                dist.append(line)
                i+=1
                if i == 100:
                    cmd_list.append(x)
                    x=[]
                    i=0

        fdist = list(FreqDist(dist).keys())
        return cmd_list,fdist
    
    def load_user_cmd(filename):
        """Variant used by the KNN baseline: blocks of 100 commands plus the
        user's 50 most and 50 least frequent commands overall."""
        cmd_list=[]
        dist=[]
        with open(filename) as f:
            i=0
            x=[]
            for line in f:
                line=line.strip('\n')
                x.append(line)
                dist.append(line)
                i+=1
                if i == 100:
                    cmd_list.append(x)
                    x=[]
                    i=0

        # most_common() orders commands by frequency, high to low
        fdist = [cmd for cmd,_ in FreqDist(dist).most_common()]
        dist_max = set(fdist[0:50])
        dist_min = set(fdist[-50:])
        return cmd_list,dist_max,dist_min
    
    def get_user_cmd_feature(user_cmd_list,dist_max,dist_min):
        """Hand-crafted per-block features for the KNN baseline:
        f1 = number of distinct commands in the block; f2/f3 = overlap of the
        block's 10 most/least frequent commands with the user's global
        50 most/least frequent commands."""
        user_cmd_feature=[]
        for cmd_block in user_cmd_list:
            f1=len(set(cmd_block))
            fdist = [cmd for cmd,_ in FreqDist(cmd_block).most_common()]
            f2 = len(set(fdist[0:10]) & set(dist_max))
            f3 = len(set(fdist[-10:]) & set(dist_min))
            x=[f1,f2,f3]
            user_cmd_feature.append(x)
        return user_cmd_feature
    
    def get_user_cmd_feature_new(user_cmd_list,dist):
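        # One-hot encode each command against the vocabulary `dist`: every
        # 100-command block becomes a 100 x len(dist) matrix, the LSTM's input.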
        user_cmd_feature=[]
        for cmd_list in user_cmd_list:
            x=[]
            for cmd in  cmd_list:
                v = [0] * len(dist)
                for i in range(0, len(dist)):
                    if cmd == dist[i]:
                        v[i] = 1
                x.append(v)
            user_cmd_feature.append(x)
        return user_cmd_feature
    
    def get_label(filename,index=0):
        """label.txt has one row per command block (blocks 51-150) and one
        column per user; column `index` selects the user (index 6 -> User7)."""
        x=[]
        with open(filename) as f:
            for line in f:
                line=line.strip('\n')
                x.append( int(line.split()[index]))
        return x
    
    
    def do_knn(x_train,y_train,x_test,y_test):
        # KNN baseline on the hand-crafted 3-dimensional block features.
        neigh = KNeighborsClassifier(n_neighbors=3)
        neigh.fit(x_train, y_train)
        y_predict=neigh.predict(x_test)
        score = np.mean(y_test == y_predict) * 100
        print(score)
    
    
    def do_rnn(x_train,x_test,y_train,y_test):
        global n_words
        print("GET n_words embedding %d" % n_words)

        # Every block is already exactly 100 commands long, so no padding is needed:
        #x_train = pad_sequences(x_train, maxlen=100, value=0.)
        #x_test = pad_sequences(x_test, maxlen=100, value=0.)

        # Convert labels to binary (one-hot) vectors
        y_train = to_categorical(y_train, nb_classes=2)
        y_test = to_categorical(y_test, nb_classes=2)

        # Network building: each sample is a sequence of 100 one-hot command vectors
        net = tflearn.input_data(shape=[None, 100, n_words])
        net = tflearn.lstm(net, 10, return_seq=True)  # first LSTM layer feeds its full sequence to the second
        net = tflearn.lstm(net, 10)
        net = tflearn.fully_connected(net, 2, activation='softmax')
        net = tflearn.regression(net, optimizer='adam', learning_rate=0.1, name="output",
                                 loss='categorical_crossentropy')

        # Training
        model = tflearn.DNN(net, tensorboard_verbose=3)
        model.fit(x_train, y_train, validation_set=(x_test, y_test), show_metric=True,
                  batch_size=32, run_id="maidou")
    
    
    if __name__ == '__main__':
        user_cmd_list,dist=load_user_cmd_new("../data/MasqueradeDat/User7")
        #print("Dist:(%s)" % dist)
        n_words=len(dist)
        user_cmd_feature=get_user_cmd_feature_new(user_cmd_list,dist)

        # The first 50 blocks of every SEA user are clean by construction;
        # labels for blocks 51-150 come from column 6 of label.txt (User7).
        labels=get_label("../data/MasqueradeDat/label.txt",6)
        y=[0]*50+labels

        x_train=user_cmd_feature[0:N]
        y_train=y[0:N]

        x_test=user_cmd_feature[N:150]
        y_test=y[N:150]

        #print(x_train)

        do_rnn(x_train,x_test,y_train,y_test)
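
    To run the script you need tflearn on TensorFlow 1.x (tflearn does not support TensorFlow 2), with the SEA files User7 and label.txt placed under ../data/MasqueradeDat/; the masquerading-user data was originally distributed via Matthias Schonlau's homepage.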
    

    Result:

    Training Step: 30  | total loss: 0.10088 | time: 1.185s
    | Adam | epoch: 010 | loss: 0.10088 - acc: 0.9591 | val_loss: 0.18730 - val_acc: 0.9571 -- iter: 80/80
    --
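
    The accuracy above is tflearn's built-in validation metric. To see which test blocks the trained model actually flags, here is a short sketch that could be appended after model.fit() inside do_rnn (model.predict is the standard tflearn.DNN prediction call; variable names follow the script above):

    # Sketch: per-block predictions on the 70 test blocks (append inside do_rnn).
    pred = np.argmax(model.predict(x_test), axis=1)   # 1 = flagged as masquerade
    truth = np.argmax(y_test, axis=1)                 # y_test was one-hot encoded above
    print("test accuracy: %.4f" % np.mean(pred == truth))
    # Map test positions back to block numbers in the original 150-block stream.
    print("flagged blocks: %s" % (np.where(pred == 1)[0] + N).tolist())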
