import pandas as pd
class ProcessData(object):
    """Append ``<label> <sentence>`` training lines to the shared train file.

    Each public method reads one labelled corpus (a sheet of the Excel
    workbook at ``self.path``, or a plain-text file for ``get_fangshi``),
    removes whitespace from every label and sentence, and appends one line
    per sample to ``TRAIN_PATH``.
    """

    # Output file shared by every method; it is always opened in append mode.
    TRAIN_PATH = "../data/train.txt"
    # Plain-text corpus read by get_fangshi, one sentence per line.
    FANGSHI_PATH = "../data/new_data/还款方式语料.txt"
    # Label attached to every sentence read by get_fangshi.
    FANGSHI_LABEL = "支付宝微信号是多少"
    # Deletes spaces, tabs, carriage returns and line feeds in a single pass.
    _STRIP_TABLE = str.maketrans("", "", " \t\r\n")

    def __init__(self, path="../data/new_data/0520新增语义.xlsx"):
        """Remember which Excel workbook the sheet-based methods read.

        Args:
            path: Workbook location. Defaults to the path that used to be
                hard-coded, so ``ProcessData()`` behaves as before.
        """
        # Other workbooks previously configured here (presumably the inputs
        # for the other methods; confirm before switching):
        #   "../data/new_data/随机抽听_1.xls"
        #   "../data/new_data/无意义核对语料.xlsx"
        self.path = path

    @classmethod
    def _clean(cls, value):
        """Return *value* as text without whitespace, or None if it is missing."""
        # Empty Excel cells are read as NaN (a float), which has no string
        # methods; report them as missing instead of raising AttributeError.
        if pd.isna(value):
            return None
        return str(value).translate(cls._STRIP_TABLE)

    def _sheet_lines(self, sheet_name, sentence_col, label_col):
        """Read one sheet of ``self.path`` and build its training lines.

        Args:
            sheet_name: Name of the sheet to load.
            sentence_col: Column holding the sentences.
            label_col: Column holding the labels.

        Returns:
            A list of ``"<label> <sentence>\\n"`` strings, one per row in
            which both cells are present.
        """
        frame = pd.read_excel(self.path, sheet_name=sheet_name)
        lines = []
        for raw_sentence, raw_label in zip(frame[sentence_col], frame[label_col]):
            sentence = self._clean(raw_sentence)
            label = self._clean(raw_label)
            if sentence is None or label is None:
                # A row without a sentence or a label is not a usable sample.
                continue
            lines.append(label + " " + sentence + "\n")
        return lines

    def _append_lines(self, lines):
        """Append *lines* to ``TRAIN_PATH``, creating the file if needed."""
        with open(self.TRAIN_PATH, "a+", encoding="utf8") as f:
            f.writelines(lines)

    def write_suiji(self):
        """Load sheet "sheet1" (columns 语句 / 标注大类) without writing samples.

        NOTE(review): the write call in this method was commented out, so it
        only reads and cleans the rows. That behaviour is kept; re-enable the
        write deliberately if these samples should reach the training file.
        """
        self._sheet_lines("sheet1", "语句", "标注大类")
        # The output file was still opened in append mode (creating it when
        # absent) even though nothing was written; keep that side effect.
        self._append_lines([])

    def get_nomeans(self):
        """Append the samples of sheet "Sheet1" (columns 客户语句 / 语义小类)."""
        self._append_lines(self._sheet_lines("Sheet1", "客户语句", "语义小类"))

    def get_wenti(self):
        """Append the samples of sheet "Sheet1" (columns 客户话术 / 语义)."""
        self._append_lines(self._sheet_lines("Sheet1", "客户话术", "语义"))

    def get_0520(self):
        """Append the samples of sheet "Sheet1" (columns 客户语句 / 语义)."""
        self._append_lines(self._sheet_lines("Sheet1", "客户语句", "语义"))

    def get_fangshi(self):
        """Append every line of ``FANGSHI_PATH`` under ``FANGSHI_LABEL``.

        Whitespace is removed from each line of the text corpus before it is
        written as ``"<FANGSHI_LABEL> <sentence>\\n"``.
        """
        with open(self.FANGSHI_PATH, "r", encoding="utf8") as f:
            # Iterate the file directly instead of materialising readlines().
            lines = [
                self.FANGSHI_LABEL + " " + raw.translate(self._STRIP_TABLE) + "\n"
                for raw in f
            ]
        self._append_lines(lines)
if __name__ == '__main__':
    # Script entry point: build a ProcessData and run its get_fangshi step.
    processor = ProcessData()
    processor.get_fangshi()