import re

from bs4 import BeautifulSoup

# Replace every character that is not an ASCII letter with a single space.
# NOTE(review): `data` is not defined in the visible part of this file;
# presumably it holds the raw text to clean - confirm against the caller.
preData = re.sub(r'[^a-zA-Z]', ' ', data)
# Two common ways to clean data
# Ordered (pattern, replacement) rules used by cleaner(). Order matters:
# every rule sees the output of the rules before it.
_CLEANER_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'#.', ''),       # '#' together with the character that follows it
        (r'\n', ''),       # newlines
        (r',', ''),        # commas
        (r'-', ' '),       # hyphens become word separators
        (r'\.', ''),       # literal full stops
        (r'\\', ' '),      # literal backslashes become word separators
        # NOTE(review): cannot match any more, because the rule above has
        # already replaced every backslash; kept to preserve the rule order.
        (r'\\x.+', ''),
        (r'\d', ''),       # digits
        (r'^_.', ''),      # leading underscore plus the character after it
        (r'_', ' '),       # remaining underscores become word separators
        (r'^ ', ''),       # a single leading space
        (r' $', ''),       # a single trailing space
        (r'\?', ''),       # question marks
        (r'é', ''),        # stray non-ASCII symbols
        (r'§', ''),
        (r'¦', ''),
        (r'æ', ''),
        # The two digit rules below are no-ops once r'\d' has run; they are
        # kept so the table maps one-to-one onto the original rule list.
        (r'\d+', ''),
        (r'(.*?)\d+(.*?)', ''),
    )
)


def cleaner(word):
    """Strip punctuation, digits and stray symbols from *word*, then lower-case it.

    The substitutions in ``_CLEANER_RULES`` are applied one after another.
    Hyphens, backslashes and inner underscores are turned into spaces, so the
    result may contain spaces even if the input did not.

    Args:
        word: The string to clean.

    Returns:
        The cleaned string in lower case (possibly empty).
    """
    for pattern, replacement in _CLEANER_RULES:
        word = pattern.sub(replacement, word)
    return word.lower()
def hashing(word):
    """Rewrite *word* through a fixed sequence of spelling substitutions.

    The rules run in order and each one works on the output of the previous
    one. NOTE(review): presumably the goal is to give variant spellings of
    the same word an identical key - confirm with the author.

    Args:
        word: The string to normalise; the patterns only target lower-case
            ASCII letters.

    Returns:
        The rewritten string (empty input yields an empty string).
    """
    # Vowel-cluster and word-ending rewrites.
    word = re.sub(r'ain$', r'ein', word)
    word = re.sub(r'ai', r'ae', word)
    word = re.sub(r'ay$', r'e', word)
    word = re.sub(r'ey$', r'e', word)
    word = re.sub(r'ie$', r'y', word)
    word = re.sub(r'^es', r'is', word)

    # Collapse repeated letters and fold 'u' into 'o'.
    word = re.sub(r'a+', r'a', word)
    word = re.sub(r'j+', r'j', word)
    word = re.sub(r'd+', r'd', word)
    word = re.sub(r'u', r'o', word)
    word = re.sub(r'o+', r'o', word)
    word = re.sub(r'ee+', r'i', word)

    # 'ar' is reduced to 'r' everywhere, unless the word itself starts with 'ar'.
    if not re.match(r'ar', word):
        word = re.sub(r'ar', r'r', word)

    word = re.sub(r'iy+', r'i', word)
    word = re.sub(r'ih+', r'eh', word)
    word = re.sub(r's+', r's', word)

    # 'y' becomes 'i' when some 'y' follows r, s or t and the word does not
    # end in 'y'. endswith() keeps this safe for an empty string.
    if re.search(r'[rst]y', word) and not word.endswith('y'):
        word = re.sub(r'y', r'i', word)

    # A final 'i' becomes 'y' when the word contains an 'i' preceded by any
    # lower-case letter other than 'a' or 's'.
    if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
        word = re.sub(r'i$', r'y', word)

    # Every 'h' is dropped when the word contains an 'h' preceded by any
    # lower-case letter other than 'b', 'd', 'k' or 'p'.
    if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
        word = re.sub(r'h', '', word)

    word = re.sub(r'k', r'q', word)
    return word
def array_cleaner(array):
    """Clean every sentence in *array* word by word.

    Each sentence is split on single spaces, every piece is passed through
    ``cleaner`` and the cleaned pieces are joined back with single spaces.

    Args:
        array: Iterable of sentence strings.

    Returns:
        A new list holding one cleaned sentence per input sentence, in the
        same order. An empty iterable yields an empty list.
    """
    return [
        ' '.join(cleaner(word) for word in sentence.split(' '))
        for sentence in array
    ]
# Run every training sentence through array_cleaner.
X_train = array_cleaner(X_train)

# Keep only the tokens that are not listed in `stopwords`.
words_notstop = [token for token in words if token not in stopwords]

# Normalise the 'review' column step by step: trim surrounding whitespace,
# drop words of three characters or fewer, then split on whitespace.
train_data['review'] = train_data['review'].str.strip()
train_data['review'] = train_data['review'].apply(
    lambda text: ' '.join([token for token in text.split() if len(token) > 3])
)
train_data['review'] = train_data['review'].str.split()

# Replace every token with its Porter stem.
from nltk.stem.porter import *
stemmer = PorterStemmer()
train_data['review'] = train_data['review'].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)