• XGBoost 实例代码


     1 # -*- coding: utf-8 -*-
     2 import xgboost as xgb
     3 import csv
     4 import jieba
     5 jieba.load_userdict('wordDict.txt')
     6 import numpy as np
     7 from sklearn.feature_extraction.text import CountVectorizer
     8 from sklearn.feature_extraction.text import TfidfTransformer
     9  
    10  
    11 # 读取训练集
    12 def readtrain():
    13     with open('Train.csv', 'rb') as csvfile:
    14         reader = csv.reader(csvfile)
    15         column1 = [row for row in reader]
    16     content_train = [i[1] for i in column1[1:]] # 第一列为文本内容,并去除列名
    17     opinion_train = [i[2] for i in column1[1:]] # 第二列为类别,并去除列名
    18     print '训练集有 %s 条句子' % len(content_train)
    19     train = [content_train, opinion_train]
    20     return train
    21  
    22  
    23 # 将utf8的列表转换成unicode
    24 def changeListCode(b):
    25     a = []
    26     for i in b:
    27         a.append(i.decode('utf8'))
    28     return a
    29  
    30  
    31 # 对列表进行分词并用空格连接
    32 def segmentWord(cont):
    33     c = []
    34     for i in cont:
    35         a = list(jieba.cut(i))
    36         b = " ".join(a)
    37         c.append(b)
    38     return c
    39  
    40  
    41 # 类别用数字表示:pos:2,neu:1,neg:0
    42 def transLabel(labels):
    43     for i in range(len(labels)):
    44         if labels[i] == 'pos':
    45             labels[i] = 2
    46         elif labels[i] == 'neu':
    47             labels[i] = 1
    48         elif labels[i] == 'neg':
    49             labels[i] = 0
    50         else: print "label无效:",labels[i]
    51     return labels
    52  
    53  
    54 train = readtrain()
    55 content = segmentWord(train[0])
    56 opinion = transLabel(train[1])  # 需要用数字表示类别
    57 opinion = np.array(opinion)     # 需要numpy格式
    58  
    59  
    60 train_content = content[:7000]
    61 train_opinion = opinion[:7000]
    62 test_content = content[7000:]
    63 test_opinion = opinion[7000:]
    64  
    65  
    66 vectorizer = CountVectorizer()
    67 tfidftransformer = TfidfTransformer()
    68 tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(train_content))
    69 weight = tfidf.toarray()
    70 print tfidf.shape
    71 test_tfidf = tfidftransformer.transform(vectorizer.transform(test_content))
    72 test_weight = test_tfidf.toarray()
    73 print test_weight.shape
    74  
    75  
    76 dtrain = xgb.DMatrix(weight, label=train_opinion)
    77 dtest = xgb.DMatrix(test_weight, label=test_opinion)  # label可以不要,此处需要是为了测试效果
    78 param = {'max_depth':6, 'eta':0.5, 'eval_metric':'merror', 'silent':1, 'objective':'multi:softmax', 'num_class':3}  # 参数
    79 evallist  = [(dtrain,'train'), (dtest,'test')]  # 这步可以不要,用于测试效果
    80 num_round = 50  # 循环次数
    81 bst = xgb.train(param, dtrain, num_round, evallist)
    82 preds = bst.predict(dtest)
  • 相关阅读:
    Install Failed Insufficient Storage, 解决 ADB 安装APK失败问题
    finding-the-smallest-circle-that-encompasses-other-circles
    PyTorch for Semantic Segmentation
    Semantic-Segmentation-DL
    Awesome Semantic Segmentation
    应用于语义分割问题的深度学习技术综述
    JS打开浏览器
    Overcoming iOS HTML5 audio limitations
    Android APP Testing Tutorial with Automation Framework
    基于方向包围盒投影转换的轮廓线拼接算法
  • 原文地址:https://www.cnblogs.com/luozeng/p/9610505.html
Copyright © 2020-2023  润新知