• 【新人赛】阿里云恶意程序检测 -- 实践记录 11.24


    使用word2vec训练词向量

    使用word2vec无监督学习训练词向量,输入的是训练数据和测试数据,输出的是每个词的词向量,总共三百个词左右。

    求和:然后再将每行数据中的每个词的词向量加和,得到每行的词向量表示。

    除求和之外,还可以通过求平均、取众数或取最大值等方法得到每行的词向量表示。

    代码如下:

    import time
    import csv
    import pickle
    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import StratifiedKFold
    from sklearn.feature_extraction.text import CountVectorizer
    from gensim.models.word2vec import Word2Vec
    import warnings
    
    # Suppress every warning for the remainder of the script.
    warnings.filterwarnings('ignore')

    # Each pickle file holds two objects dumped back to back; they are read
    # in the order they were written.
    # NOTE(review): pickle.load can execute arbitrary code, so only load
    # files produced by a trusted preprocessing step.
    with open("security_train.csv.pkl", "rb") as train_pkl:
        labels, files = pickle.load(train_pkl), pickle.load(train_pkl)

    with open("security_test.csv.pkl", "rb") as test_pkl:
        file_names, outfiles = pickle.load(test_pkl), pickle.load(test_pkl)
    

    训练词向量模型的方法:

    def train_w2v_model(files, size, model, flag):
        """Train ``model`` on ``files`` in consecutive batches of ``size`` texts.

        Every text is tokenised by splitting on a single space. For each batch
        the vocabulary is built (or extended) and the model is then trained on
        that batch.

        Args:
            files: Indexable sequence of space-separated token strings.
            size: Number of texts per batch; must be at least 1.
            model: Object exposing ``build_vocab``, ``train``, ``corpus_count``
                and ``epochs`` (e.g. a gensim ``Word2Vec`` instance).
            flag: Truthy when ``model`` has no vocabulary yet, so the first
                batch creates it; falsy to extend an existing vocabulary.

        Returns:
            The same ``model`` object, after training.

        Raises:
            ValueError: If ``size`` is smaller than 1.
        """
        if size < 1:
            raise ValueError("size must be a positive integer")

        total = len(files)
        # Stepping by ``size`` covers the final, possibly shorter, batch and
        # never yields an empty one (the previous index arithmetic skipped the
        # tail of the data and fed an empty batch to the model instead).
        for batch, start in enumerate(range(0, total, size)):
            print("batch:", batch)
            stop = min(start + size, total)
            # Keep plain lists of tokens: texts have different lengths, so
            # they cannot be packed into a rectangular NumPy array.
            sentences = [files[i].split(' ') for i in range(start, stop)]

            if batch == 0 and flag:
                model.build_vocab(sentences)
            else:
                model.build_vocab(sentences, update=True)

            model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

        print("done.")
        return model
    
    # Train the word vectors: the vocabulary is created from the training
    # texts and then extended with the test texts, 1000 texts per batch.
    model = train_w2v_model(files, 1000, Word2Vec(), True)
    model = train_w2v_model(outfiles, 1000, model, False)
    model.save('./temp/w2cmodel_train_test')
    print(model)
    

    对每行数据求词向量之和的方法:

    def train_sum_vec(files, model, size=100):
        """Represent each text by the sum of the vectors of its words.

        Args:
            files: Iterable with a length, holding space-separated token strings.
            model: Either an object with a ``wv`` attribute mapping words to
                vectors (e.g. a gensim ``Word2Vec`` model) or such a mapping
                itself. A lookup of an unknown word must raise ``KeyError``.
            size: Dimensionality of the word vectors.

        Returns:
            A float array of shape ``(len(files), size)``; row ``i`` is the sum
            of the vectors of the known words in ``files[i]``. Words missing
            from the vocabulary contribute nothing. An empty ``files`` yields
            an array of shape ``(0, size)``.
        """
        # Read through ``wv`` when it exists: direct ``model[word]`` indexing
        # is not supported by every gensim release.
        vectors = getattr(model, "wv", model)

        summed = np.zeros((len(files), size))
        for i, text in enumerate(files):
            if i % 100 == 0:
                print(i)  # progress indicator
            for word in text.split(' '):
                try:
                    vec = vectors[word]
                except KeyError:
                    # Out-of-vocabulary word: skip it.
                    continue
                # The reshape rejects vectors whose length is not ``size``
                # instead of letting them broadcast silently.
                summed[i] += np.asarray(vec).reshape(size)

        return summed
    

    得到训练数据的词向量:

    # Build the summed word-vector matrix for the training texts (`files`).
    train_vec = train_sum_vec(files, model)
    # Persist the Word2Vec model and the training feature matrix.
    # Note: despite the 'train_test' in its name, the .npy file only holds
    # the rows computed from `files`.
    model.save('w2v_model.pkl')
    np.save('X_train_test_vec.npy', train_vec)
    print('done.')
    

    得到测试数据的词向量:

    # Build the same summed word-vector features for the test texts
    # (`outfiles`). Despite the 'y_' prefix, the saved file holds features,
    # not labels.
    test_vec = train_sum_vec(outfiles, model)
    np.save('y_test_vec.npy', test_vec)
    print('done.')
    

    xgboost训练:

    # Number of target classes; shared by the prediction buffers and the
    # booster configuration so the two cannot drift apart.
    num_class = 8

    # meta_train collects out-of-fold class probabilities for the training
    # rows; meta_test accumulates the test-set probabilities of every fold.
    meta_train = np.zeros(shape=(len(files), num_class))
    meta_test = np.zeros(shape=(len(outfiles), num_class))

    k = 10
    skf = StratifiedKFold(n_splits=k, random_state=42, shuffle=True)

    X_vector = np.load('X_train_test_vec.npy')
    y_vector = np.load('y_test_vec.npy')  # test-set features, not labels

    # The test matrix and the booster settings are identical for every fold,
    # so they are built once instead of once per fold.
    dout = xgb.DMatrix(y_vector)
    param = {'max_depth': 6, 'eta': 0.1, 'eval_metric': 'mlogloss', 'silent': 1, 'objective': 'multi:softprob',
             'num_class': num_class, 'subsample': 0.8, 'colsample_bytree': 0.85}
    num_round = 300  # maximum number of boosting rounds

    for i, (tr_ind, te_ind) in enumerate(skf.split(X_vector, labels)):
        # assumes `labels` supports index-array indexing (e.g. a NumPy
        # array) - TODO confirm against the preprocessing script
        X_train, X_train_label = X_vector[tr_ind], labels[tr_ind]
        X_val, X_val_label = X_vector[te_ind], labels[te_ind]

        print('FOLD: {}'.format(str(i)))
        print(len(tr_ind), len(te_ind))

        dtrain = xgb.DMatrix(X_train, label=X_train_label)
        dtest = xgb.DMatrix(X_val, X_val_label)

        # Keep the validation set last: per the xgboost documentation the
        # last entry of the evaluation list drives early stopping.
        evallist = [(dtrain, 'train'), (dtest, 'val')]
        bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50)

        pred_val = bst.predict(dtest)
        pred_test = bst.predict(dout)
        meta_train[te_ind] = pred_val
        meta_test += pred_test

    # Average the accumulated test predictions over the folds actually run
    # (previously a hard-coded 10.0 that had to be kept in sync with k).
    meta_test /= k
    
    # Store both prediction matrices in one timestamped pickle file:
    # meta_train first, meta_test second.
    result_stamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    with open("word2vec_result_{}.pkl".format(result_stamp), 'wb') as result_pkl:
        for matrix in (meta_train, meta_test):
            pickle.dump(matrix, result_pkl)
    
    # One output row per test file: the file id followed by that file's
    # row of class probabilities from meta_test.
    result = meta_test
    out = [[file_id] + result[row].tolist() for row, file_id in enumerate(file_names)]

    csv_name = "word2vec_10k_{}.csv".format(time.strftime("%Y-%m-%d-%H-%M-%S"))
    with open(csv_name, "w", newline='') as csvfile:
        writer = csv.writer(csvfile)

        # Header row first, then all prediction rows in a single call.
        writer.writerow(["file_id", "prob0", "prob1", "prob2", "prob3", "prob4", "prob5", "prob6", "prob7"])
        writer.writerows(out)
    

    使用词向量求和,提交到线上得到的结果为 0.725923。

    使用词向量的平均值,提交到线上的结果为 0.751533。

    数据增强后,线上结果为 0.711533。

  • 相关阅读:
    We7 2.7版:全拖拽建站 开源CMS
    We7 CMS 2.6RC2版本发布 开源CMS
    LINQ简易教程
    C# 引用 C# DLL
    ASP.NET中母版页与JavaScript控制的一点小问题
    LINQ连接远端数据库问题
    ASP.NET中自动生成XML文件并通过XSLT显示在网页中的方法
    【转载】常见逻辑错误
    因为压力大变得很郁闷的时候怎么办
    代码覆盖度C#代码监控工具NCover、Rational PureCoverage、BullseyeCoverage
  • 原文地址:https://www.cnblogs.com/yanqiang/p/11910317.html
Copyright © 2020-2023  润新知