• Python Data Analysis 8 ----- Web Text Processing


    1. Remove HTML tags from the page, such as <br/>

    from bs4 import BeautifulSoup 
    preData=BeautifulSoup(data,'html.parser').get_text()
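    For example, a minimal sketch (the sample data string is invented for illustration) of how get_text() drops the markup:

    from bs4 import BeautifulSoup

    # hypothetical sample: a text fragment containing HTML tags
    data = "This movie was <b>great</b>.<br/>Highly recommended!"
    preData = BeautifulSoup(data, 'html.parser').get_text()
    print(preData)   # -> This movie was great.Highly recommended!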

    2. Remove punctuation and similar symbols with a regular expression.

    import re
    # replace everything in data except upper/lower-case letters with a space
    preData = re.sub(r'[^a-zA-Z]', ' ', data)
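    A short sketch of what this produces (the input string is made up):

    import re

    data = "It's great!!! 100% worth watching <3"   # hypothetical input
    preData = re.sub(r'[^a-zA-Z]', ' ', data)       # every non-letter becomes a space
    print(preData)   # only letters and spaces remain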

      Remove special characters:

    # two common ways to clean the data
    def cleaner(word):
        # strip markup fragments, punctuation, digits and stray escape sequences
        word = re.sub(r'#\.', '', word)
        word = re.sub(r'\n', '', word)
        word = re.sub(r',', '', word)
        word = re.sub(r'-', ' ', word)
        word = re.sub(r'\.', '', word)
        word = re.sub(r'\\', ' ', word)
        word = re.sub(r'\\x.+', '', word)
        word = re.sub(r'\d', '', word)
        word = re.sub(r'^_.', '', word)
        word = re.sub(r'_', ' ', word)
        word = re.sub(r'^ ', '', word)
        word = re.sub(r' $', '', word)
        word = re.sub(r'\?', '', word)
        word = re.sub(r'é', '', word)
        word = re.sub(r'§', '', word)
        word = re.sub(r'¦', '', word)
        word = re.sub(r'æ', '', word)
        word = re.sub(r'\d+', '', word)
        word = re.sub(r'(.*?)\d+(.*?)', '', word)
        return word.lower()
    # simple phonetic "hashing": collapse repeated letters and normalise common spelling variants
    def hashing(word):
        word = re.sub(r'ain$', r'ein', word)
        word = re.sub(r'ai', r'ae', word)
        word = re.sub(r'ay$', r'e', word)
        word = re.sub(r'ey$', r'e', word)
        word = re.sub(r'ie$', r'y', word)
        word = re.sub(r'^es', r'is', word)
        word = re.sub(r'a+', r'a', word)
        word = re.sub(r'j+', r'j', word)
        word = re.sub(r'd+', r'd', word)
        word = re.sub(r'u', r'o', word)
        word = re.sub(r'o+', r'o', word)
        word = re.sub(r'ee+', r'i', word)
        if not re.match(r'ar', word):
            word = re.sub(r'ar', r'r', word)
        word = re.sub(r'iy+', r'i', word)
        word = re.sub(r'ih+', r'eh', word)
        word = re.sub(r's+', r's', word)
        if re.search(r'[rst]y', word) and word[-1] != 'y':
            word = re.sub(r'y', r'i', word)
        if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
            word = re.sub(r'i$', r'y', word)
        if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
            word = re.sub(r'h', '', word)
        word = re.sub(r'k', r'q', word)
        return word
    
    # apply cleaner() word-by-word to every sentence in the input iterable
    def array_cleaner(array):
        X = []
        for sentence in array:
            clean_sentence = ''
            words = sentence.split(' ')
            for word in words:
                clean_sentence = clean_sentence +' '+ cleaner(word)
            X.append(clean_sentence)
        return X
    X_train = array_cleaner(X_train)
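    For instance, a small usage sketch of the helpers above (the sample sentences are invented for illustration):

    # hypothetical raw input
    raw = ['Loved the movie!!! 10/10', 'worst.film.ever - do NOT watch']
    cleaned = array_cleaner(raw)
    print(cleaned)                 # each word passed through cleaner() and lower-cased
    print(hashing('kitaab'))       # e.g. repeated letters collapsed ('aa' -> 'a')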

    3. Lowercase the text and split data on whitespace

    words=data.lower().split()
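    Continuing the example (assuming data already has tags and punctuation removed):

    data = "It s great worth watching"   # hypothetical cleaned text
    words = data.lower().split()
    print(words)   # ['it', 's', 'great', 'worth', 'watching']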

    4. Remove stopwords

    # a stopword list can be downloaded, e.g. with NLTK
    #nltk.download() 
    words_notstop=[w for w in words if w not in stopwords]
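    A minimal runnable sketch using NLTK's built-in English stopword list (the token list below is made up; a custom stopword file works the same way):

    import nltk
    nltk.download('stopwords')                    # only needed once
    from nltk.corpus import stopwords

    stops = set(stopwords.words('english'))
    words = ['it', 'is', 'a', 'great', 'movie']   # hypothetical token list
    words_notstop = [w for w in words if w not in stops]
    print(words_notstop)                          # ['great', 'movie']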

    5. Join all the words back into a single sentence

    sentence=' '.join(words)

     6. Strip leading and trailing whitespace

    train_data['review'] = train_data['review'].str.strip() 

    7. Remove short words: drop any word of three characters or fewer, e.g. meaningless tokens such as "haa" or "hi"

    ## remove short words
    train_data['review'] = train_data['review'].apply(lambda x:' '.join([w for w in x.split() if len(w) > 3]))
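    For example, on a toy DataFrame (the column name follows the snippet above; the reviews are made up):

    import pandas as pd

    train_data = pd.DataFrame({'review': ['hi this film was very good',
                                          'haa it was funny']})
    train_data['review'] = train_data['review'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    print(train_data['review'].tolist())   # ['this film very good', 'funny']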

    8. Tokenization

    ## tokenize
    train_data['review'] = train_data['review'].str.split()

    9. Stemming

    ## Stemming: a rule-based process that strips suffixes from words. For example, play, player, played, plays and playing are all variants of play.
    from nltk.stem.porter import *
    stemmer = PorterStemmer()
    train_data['review'] = train_data['review'].apply(lambda x: [stemmer.stem(i) for i in x])
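    A quick sketch of the stemmer on the variants mentioned above:

    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    for w in ['play', 'played', 'plays', 'playing']:
        print(w, '->', stemmer.stem(w))   # all four reduce to 'play'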
  • Original article: https://www.cnblogs.com/Lee-yl/p/9325995.html