1 # -*- coding: utf-8 -*- 2 import xgboost as xgb 3 import csv 4 import jieba 5 jieba.load_userdict('wordDict.txt') 6 import numpy as np 7 from sklearn.feature_extraction.text import CountVectorizer 8 from sklearn.feature_extraction.text import TfidfTransformer 9 10 11 # 读取训练集 12 def readtrain(): 13 with open('Train.csv', 'rb') as csvfile: 14 reader = csv.reader(csvfile) 15 column1 = [row for row in reader] 16 content_train = [i[1] for i in column1[1:]] # 第一列为文本内容,并去除列名 17 opinion_train = [i[2] for i in column1[1:]] # 第二列为类别,并去除列名 18 print '训练集有 %s 条句子' % len(content_train) 19 train = [content_train, opinion_train] 20 return train 21 22 23 # 将utf8的列表转换成unicode 24 def changeListCode(b): 25 a = [] 26 for i in b: 27 a.append(i.decode('utf8')) 28 return a 29 30 31 # 对列表进行分词并用空格连接 32 def segmentWord(cont): 33 c = [] 34 for i in cont: 35 a = list(jieba.cut(i)) 36 b = " ".join(a) 37 c.append(b) 38 return c 39 40 41 # 类别用数字表示:pos:2,neu:1,neg:0 42 def transLabel(labels): 43 for i in range(len(labels)): 44 if labels[i] == 'pos': 45 labels[i] = 2 46 elif labels[i] == 'neu': 47 labels[i] = 1 48 elif labels[i] == 'neg': 49 labels[i] = 0 50 else: print "label无效:",labels[i] 51 return labels 52 53 54 train = readtrain() 55 content = segmentWord(train[0]) 56 opinion = transLabel(train[1]) # 需要用数字表示类别 57 opinion = np.array(opinion) # 需要numpy格式 58 59 60 train_content = content[:7000] 61 train_opinion = opinion[:7000] 62 test_content = content[7000:] 63 test_opinion = opinion[7000:] 64 65 66 vectorizer = CountVectorizer() 67 tfidftransformer = TfidfTransformer() 68 tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(train_content)) 69 weight = tfidf.toarray() 70 print tfidf.shape 71 test_tfidf = tfidftransformer.transform(vectorizer.transform(test_content)) 72 test_weight = test_tfidf.toarray() 73 print test_weight.shape 74 75 76 dtrain = xgb.DMatrix(weight, label=train_opinion) 77 dtest = xgb.DMatrix(test_weight, label=test_opinion) # label可以不要,此处需要是为了测试效果 78 param = {'max_depth':6, 'eta':0.5, 'eval_metric':'merror', 'silent':1, 'objective':'multi:softmax', 'num_class':3} # 参数 79 evallist = [(dtrain,'train'), (dtest,'test')] # 这步可以不要,用于测试效果 80 num_round = 50 # 循环次数 81 bst = xgb.train(param, dtrain, num_round, evallist) 82 preds = bst.predict(dtest)