参考:Word Embedding Tutorial: word2vec using Gensim [EXAMPLE]
参考:NLP入门(三)词形还原(Lemmatization)
参考:Implementing Word2Vec with Gensim Library in Python
文本预处理
- 分词
- 单词转化为小写字母
- 去除单词中的标点符号
- 去除单词中的数字
- 去除空字符
- 去掉停用词
- 去掉空的list
- 词形还原
首先导入必要的 libraries
# Standard library
import string  # string.punctuation holds the punctuation characters

# Third-party
import gensim
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop-word list.
# NOTE: presumably requires the NLTK "stopwords" corpus to be installed
# locally (e.g. via nltk.download('stopwords')) -- confirm in your setup.
stop = stopwords.words('english')

# Lemmatizer used later to reduce words to their base form.
wnl = WordNetLemmatizer()
加载数据并显示
# Sample intent data: each entry has a tag, example user patterns and
# canned responses. Only the "patterns" are used to build the corpus.
data = [
    {"tag": "welcome",
     "patterns": ["Hi", "How are you", "Is any one to talk?", "Hello",
                  "hi are you available"],
     "responses": ["Hello, thanks for contacting us",
                   "Good to see you here",
                   " Hi there, how may I assist you?"]},
    {"tag": "goodbye",
     "patterns": ["Bye", "See you later", "Goodbye",
                  "I will come back soon"],
     "responses": ["See you later, thanks for visiting",
                   "have a great day ahead",
                   "Wish you Come back again soon."]},
    {"tag": "thankful",
     "patterns": ["Thanks for helping me", "Thank your guidance",
                  "That's helpful and kind from you"],
     "responses": ["Happy to help!", "Any time!", "My pleasure",
                   "It is my duty to help you"]},
    {"tag": "hoursopening",
     "patterns": ["What hours are you open?", "Tell your opening time?",
                  "When are you open?", "Just your timing please"],
     "responses": ["We're open every day 8am-7pm",
                   "Our office hours are 8am-7pm every day",
                   "We open office at 8 am and close at 7 pm"]},
    {"tag": "payments",
     "patterns": ["Can I pay using credit card?",
                  " Can I pay using Mastercard?",
                  " Can I pay using cash only?"],
     "responses": ["We accept VISA, Mastercard and credit card",
                   "We accept credit card, debit cards and cash. "
                   "Please don’t worry"]},
]

# Build the corpus: one token list per pattern, split on single spaces.
# split(" ") is intentional: a leading space yields an empty-string token,
# which a later preprocessing step removes explicitly.
bigger_list = []
for intent in data:
    bigger_list.extend(pattern.split(" ") for pattern in intent['patterns'])
bigger_list
输出结果如下:
[['Hi'], ['How', 'are', 'you'], ['Is', 'any', 'one', 'to', 'talk?'], ['Hello'], ['hi', 'are', 'you', 'available'], ['Bye'], ['See', 'you', 'later'], ['Goodbye'], ['I', 'will', 'come', 'back', 'soon'], ['Thanks', 'for', 'helping', 'me'], ['Thank', 'your', 'guidance'], ["That's", 'helpful', 'and', 'kind', 'from', 'you'], ['What', 'hours', 'are', 'you', 'open?'], ['Tell', 'your', 'opening', 'time?'], ['When', 'are', 'you', 'open?'], ['Just', 'your', 'timing', 'please'], ['Can', 'I', 'pay', 'using', 'credit', 'card?'], ['', 'Can', 'I', 'pay', 'using', 'Mastercard?'], ['', 'Can', 'I', 'pay', 'using', 'cash', 'only?']]
将单词都转换为小写字母:
# Lower-case every token of every sentence.
bigger_list = [list(map(str.lower, sentence)) for sentence in bigger_list]
bigger_list
输出结果如下:
[['hi'], ['how', 'are', 'you'], ['is', 'any', 'one', 'to', 'talk?'], ['hello'], ['hi', 'are', 'you', 'available'], ['bye'], ['see', 'you', 'later'], ['goodbye'], ['i', 'will', 'come', 'back', 'soon'], ['thanks', 'for', 'helping', 'me'], ['thank', 'your', 'guidance'], ["that's", 'helpful', 'and', 'kind', 'from', 'you'], ['what', 'hours', 'are', 'you', 'open?'], ['tell', 'your', 'opening', 'time?'], ['when', 'are', 'you', 'open?'], ['just', 'your', 'timing', 'please'], ['can', 'i', 'pay', 'using', 'credit', 'card?'], ['', 'can', 'i', 'pay', 'using', 'mastercard?'], ['', 'can', 'i', 'pay', 'using', 'cash', 'only?']]
删除单词里面的标点符号
import string

# string.punctuation is a single string holding the punctuation characters.
# Deleting those characters from a token, e.g. 'alex?', leaves 'alex'.
_drop_punct = str.maketrans('', '', string.punctuation)

# Strip punctuation characters from inside every token.
bigger_list = [
    [token.translate(_drop_punct) for token in sentence]
    for sentence in bigger_list
]
bigger_list
输出结果如下:
[['hi'], ['how', 'are', 'you'], ['is', 'any', 'one', 'to', 'talk'], ['hello'], ['hi', 'are', 'you', 'available'], ['bye'], ['see', 'you', 'later'], ['goodbye'], ['i', 'will', 'come', 'back', 'soon'], ['thanks', 'for', 'helping', 'me'], ['thank', 'your', 'guidance'], ['thats', 'helpful', 'and', 'kind', 'from', 'you'], ['what', 'hours', 'are', 'you', 'open'], ['tell', 'your', 'opening', 'time'], ['when', 'are', 'you', 'open'], ['just', 'your', 'timing', 'please'], ['can', 'i', 'pay', 'using', 'credit', 'card'], ['', 'can', 'i', 'pay', 'using', 'mastercard'], ['', 'can', 'i', 'pay', 'using', 'cash', 'only']]
去掉空字符
# Discard empty-string tokens (left behind by leading spaces or by tokens
# that consisted only of punctuation).
bigger_list = [list(filter(None, sentence)) for sentence in bigger_list]
bigger_list
输出结果如下:
[['hi'], ['how', 'are', 'you'], ['is', 'any', 'one', 'to', 'talk'], ['hello'], ['hi', 'are', 'you', 'available'], ['bye'], ['see', 'you', 'later'], ['goodbye'], ['i', 'will', 'come', 'back', 'soon'], ['thanks', 'for', 'helping', 'me'], ['thank', 'your', 'guidance'], ['thats', 'helpful', 'and', 'kind', 'from', 'you'], ['what', 'hours', 'are', 'you', 'open'], ['tell', 'your', 'opening', 'time'], ['when', 'are', 'you', 'open'], ['just', 'your', 'timing', 'please'], ['can', 'i', 'pay', 'using', 'credit', 'card'], ['can', 'i', 'pay', 'using', 'mastercard'], ['can', 'i', 'pay', 'using', 'cash', 'only']]
去掉停用词
from nltk.corpus import stopwords

# English stop-word list (kept as a list under the name `stop`).
stop = stopwords.words('english')

# Build a set once so each membership test is O(1) instead of scanning the
# whole stop-word list for every token.
stop_set = set(stop)

# Remove stop words from every sentence; sentences made up only of stop
# words become empty lists and are dropped in a later step.
bigger_list = [[w for w in s if w not in stop_set] for s in bigger_list]
bigger_list
输出结果如下:
[['hi'], [], ['one', 'talk'], ['hello'], ['hi', 'available'], ['bye'], ['see', 'later'], ['goodbye'], ['come', 'back', 'soon'], ['thanks', 'helping'], ['thank', 'guidance'], ['thats', 'helpful', 'kind'], ['hours', 'open'], ['tell', 'opening', 'time'], ['open'], ['timing', 'please'], ['pay', 'using', 'credit', 'card'], ['pay', 'using', 'mastercard'], ['pay', 'using', 'cash']]
去掉空的 list
# Drop sentences that have no tokens left.
bigger_list = [sentence for sentence in bigger_list if sentence]
bigger_list
输出结果如下:
[['hi'], ['one', 'talk'], ['hello'], ['hi', 'available'], ['bye'], ['see', 'later'], ['goodbye'], ['come', 'back', 'soon'], ['thanks', 'helping'], ['thank', 'guidance'], ['thats', 'helpful', 'kind'], ['hours', 'open'], ['tell', 'opening', 'time'], ['open'], ['timing', 'please'], ['pay', 'using', 'credit', 'card'], ['pay', 'using', 'mastercard'], ['pay', 'using', 'cash']]
词形还原
from nltk.stem import WordNetLemmatizer

# Lemmatization: map each token to its base form (e.g. 'hours' -> 'hour').
wnl = WordNetLemmatizer()
bigger_list = [list(map(wnl.lemmatize, sentence)) for sentence in bigger_list]
bigger_list
输出结果如下:
[['hi'], ['one', 'talk'], ['hello'], ['hi', 'available'], ['bye'], ['see', 'later'], ['goodbye'], ['come', 'back', 'soon'], ['thanks', 'helping'], ['thank', 'guidance'], ['thats', 'helpful', 'kind'], ['hour', 'open'], ['tell', 'opening', 'time'], ['open'], ['timing', 'please'], ['pay', 'using', 'credit', 'card'], ['pay', 'using', 'mastercard'], ['pay', 'using', 'cash']]
模型训练并存储以及调用
# Train the model on the preprocessed corpus.
# min_count=1 keeps every word because the corpus is tiny.
# NOTE: gensim >= 4.0 names the dimensionality argument `vector_size`;
# on gensim 3.x the same argument is called `size`.
model = Word2Vec(bigger_list, min_count=1, vector_size=300, workers=4)

# Save the model (both files are written by the same save() call; only the
# file name differs).
model.save("word2vec.model")
model.save('word2vec.bin')

# Load the model back from disk.
model = Word2Vec.load('word2vec.bin')

# Vocabulary learned by the model.
# (gensim >= 4.0: `wv.key_to_index`; gensim 3.x exposed this as `wv.vocab`.)
list(model.wv.key_to_index)

# Vector for the word 'thanks'. Indexing `wv` replaces the deprecated
# `wv.word_vec('thanks')`.
model.wv['thanks']
word2vec API讲解
在gensim中,word2vec 相关的API都在包gensim.models.word2vec中。和算法有关的参数都在类gensim.models.word2vec.Word2Vec中。算法需要注意的参数有:
- sentences:我们要分析的语料,可以是一个列表,或者从文件中遍历读出(word2vec.LineSentence(filename) )。
- size:词向量的维度,默认值是100(注意:从 gensim 4.0 起该参数更名为 vector_size)。这个维度的取值一般与我们的语料的大小相关,如果是不大的语料,比如小于100M的文本语料,则使用默认值一般就可以了。如果是超大的语料,建议增大维度。
- window:即词向量上下文最大距离,window越大,则和某一词较远的词也会产生上下文关系。默认值为5,在实际使用中,可以根据实际的需求来动态调整这个window的大小。如果是小语料则这个值可以设的更小。对于一般的语料这个值推荐在[5,10]之间。
- sg:即我们的word2vec两个模型的选择了。如果是0, 则是CBOW模型;是1则是Skip-Gram模型;默认是0即CBOW模型。
- hs:即我们的word2vec两个解法的选择了。如果是1,则是Hierarchical Softmax;如果是0并且负采样个数negative大于0,则是Negative Sampling。默认是0即Negative Sampling。
- negative:即使用Negative Sampling时负采样的个数,默认是5。推荐在[3,10]之间。这个参数在我们的算法原理篇中标记为neg。
- cbow_mean:仅用于CBOW在做投影的时候,为0,则算法中的xw为上下文的词向量之和,为1则为上下文的词向量的平均值。在我们的原理篇中,是按照词向量的平均值来描述的。个人比较喜欢用平均值来表示xw,默认值也是1,不推荐修改默认值。
- min_count:需要计算词向量的最小词频。这个值可以去掉一些很生僻的低频词,默认是5。如果是小语料,可以调低这个值。
- iter:随机梯度下降法中迭代的最大次数,默认是5(注意:从 gensim 4.0 起该参数更名为 epochs)。对于大语料,可以增大这个值。
- alpha:在随机梯度下降法中迭代的初始步长。算法原理篇中标记为η,默认是0.025。
- min_alpha: 由于算法支持在迭代的过程中逐渐减小步长,min_alpha给出了最小的迭代步长值。随机梯度下降中每轮的迭代步长可以由iter,alpha, min_alpha一起得出。这部分由于不是word2vec算法的核心内容,因此在原理篇我们没有提到。
利用json和pandas处理
# Libraries used by this script
import json
import logging
import string

import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from textblob import Word

# Intent data in JSON format
json_file = 'intents.json'
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# English stop-word list, plus a set copy for O(1) membership tests
stop = stopwords.words('english')
stop_set = set(stop)

# One row per intent; merge each intent's pattern list into a single string
# (assumes every record has a "patterns" list of strings -- confirm against
# the JSON file).
df = pd.DataFrame(data)
df['patterns'] = df['patterns'].apply(', '.join)
print(df)

# Clean the text step by step.
# 1. lower-case every token
df['patterns'] = df['patterns'].apply(
    lambda text: ' '.join(tok.lower() for tok in text.split()))
# 2. drop tokens that consist solely of punctuation
df['patterns'] = df['patterns'].apply(
    lambda text: ' '.join(
        tok for tok in text.split() if tok not in string.punctuation))
# 3. strip every character that is neither a word character nor whitespace.
#    The pattern must be the regex [^\w\s]; regex=True is passed explicitly
#    because recent pandas versions treat the pattern as a literal string
#    by default.
df['patterns'] = df['patterns'].str.replace(r'[^\w\s]', '', regex=True)
# 4. drop purely numeric tokens
df['patterns'] = df['patterns'].apply(
    lambda text: ' '.join(tok for tok in text.split() if not tok.isdigit()))
# 5. drop stop words
df['patterns'] = df['patterns'].apply(
    lambda text: ' '.join(
        tok for tok in text.split() if tok not in stop_set))
# 6. lemmatize every remaining token
df['patterns'] = df['patterns'].apply(
    lambda text: ' '.join(Word(tok).lemmatize() for tok in text.split()))

# Build the outer list: one token list per intent. split() without an
# argument never yields empty-string tokens, even for an empty string.
bigger_list = [text.split() for text in df['patterns']]

# Structure of the data passed to Word2Vec
print("Data format for the overall list:", bigger_list)

# Train on the custom data.
# NOTE: gensim >= 4.0 names the dimensionality argument `vector_size`;
# on gensim 3.x the same argument is called `size`.
model = Word2Vec(bigger_list, min_count=1, vector_size=300, workers=4)