'''
Bag-of-n-grams + logistic regression sentiment classifier.

Sentiment label encoding:
0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive
'''
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Script: train a bag-of-n-grams logistic-regression model on train.tsv,
# predict a sentiment label for every row of test.tsv, and write the
# predictions to my_submission.csv.

# ---------------------------------- load data
# The inputs are .tsv files, so the delimiter must be a tab. Splitting on a
# space would cut phrases apart and leave the tab-joined header as a single
# column, so the 'Phrase' / 'Sentiment' lookups below could not work.
train_data = pd.read_csv('train.tsv', sep='\t')
print(train_data.head())  # preview of the training set
test_data = pd.read_csv('test.tsv', sep='\t')
print(test_data.head())  # preview of the test set

print('--------- 开始特征提取')
# ---------------------------------- feature extraction
vectorizer = CountVectorizer(
    ngram_range=(1, 3),  # unigram, bigram and trigram counts
    max_features=150000,
)
# read_csv parses blank / NA-like cells as NaN, and CountVectorizer raises on
# NaN documents, so missing phrases are replaced by empty strings.
corpus_train = train_data['Phrase'].fillna('')  # training corpus
corpus_test = test_data['Phrase'].fillna('')  # test corpus
# The vocabulary is built from both corpora (text only, no labels involved).
vectorizer.fit(pd.concat([corpus_train, corpus_test]))
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(list(feature_names[:10]))
X_train = vectorizer.transform(corpus_train)  # vectorize training phrases
X_test = vectorizer.transform(corpus_test)  # vectorize test phrases
y_train = list(train_data['Sentiment'])
print(type(X_train))
print(X_train[:1])
print(y_train[:5])

print('--------- 开始训练')
# ----------------------------------- training
# Very large iteration cap so the solver stops on convergence, not the limit.
model = LogisticRegression(max_iter=1000000)
model.fit(X_train, y_train)
y_test = model.predict(X_test)
# Constant-label baseline kept for reference: y_test = [4]*66292

print('--------- 开始输出')
# ------------------------------------ output
output = pd.DataFrame({
    'PhraseId': test_data.PhraseId,
    'Sentiment': y_test,
})
output.to_csv('my_submission.csv', index=False)  # write the submission file
print('ok')