• 【389】Implement N-grams using NLTK


    Ref: Natural Language Toolkit

    Ref: n-grams in python, four, five, six grams?

    Ref: "Elegant n-gram generation in Python"

    import nltk
    
    # Sample text; the triple-quoted literal intentionally spans two lines.
    sentence = """At eight o'clock on Thursday morning
    Arthur didn't feel very good."""
    
    # 1 gram: the individual tokens produced by the tokenizer.
    # NOTE: word_tokenize relies on NLTK tokenizer data being installed
    # (see nltk.download) -- verify for your NLTK version.
    tokens = nltk.word_tokenize(sentence)
    
    # The "\n" escapes must stay inside the string literals; a raw line break
    # inside a "..." literal is a syntax error.
    print("1 gram:\n", tokens, "\n")
    
    # 2 grams
    
    n = 2
    
    tokens_2 = nltk.ngrams(tokens, n)
    
    # Materialize the n-gram tuples into a list so they can be printed.
    print("2 grams:\n", list(tokens_2), "\n")
    
    # 3 grams
    
    n = 3
    
    tokens_3 = nltk.ngrams(tokens, n)
    
    print("3 grams:\n", list(tokens_3), "\n")
    
    # 4 grams
    
    n = 4
    
    tokens_4 = nltk.ngrams(tokens, n)
    
    print("4 grams:\n", list(tokens_4), "\n")
    
    outputs:
    1 gram:
     ['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.'] 
    
    2 grams:
     [('At', 'eight'), ('eight', "o'clock"), ("o'clock", 'on'), ('on', 'Thursday'), ('Thursday', 'morning'), ('morning', 'Arthur'), ('Arthur', 'did'), ('did', "n't"), ("n't", 'feel'), ('feel', 'very'), ('very', 'good'), ('good', '.')] 
    
    3 grams:
     [('At', 'eight', "o'clock"), ('eight', "o'clock", 'on'), ("o'clock", 'on', 'Thursday'), ('on', 'Thursday', 'morning'), ('Thursday', 'morning', 'Arthur'), ('morning', 'Arthur', 'did'), ('Arthur', 'did', "n't"), ('did', "n't", 'feel'), ("n't", 'feel', 'very'), ('feel', 'very', 'good'), ('very', 'good', '.')] 
    
    4 grams:
     [('At', 'eight', "o'clock", 'on'), ('eight', "o'clock", 'on', 'Thursday'), ("o'clock", 'on', 'Thursday', 'morning'), ('on', 'Thursday', 'morning', 'Arthur'), ('Thursday', 'morning', 'Arthur', 'did'), ('morning', 'Arthur', 'did', "n't"), ('Arthur', 'did', "n't", 'feel'), ('did', "n't", 'feel', 'very'), ("n't", 'feel', 'very', 'good'), ('feel', 'very', 'good', '.')] 
    

     Another way to format the output (each n-gram joined into a single string):

    import nltk
    
    # Sample text; the triple-quoted literal intentionally spans two lines.
    sentence = """At eight o'clock on Thursday morning
    Arthur didn't feel very good."""
    
    # 1 gram: the individual tokens produced by the tokenizer.
    # NOTE: word_tokenize relies on NLTK tokenizer data being installed
    # (see nltk.download) -- verify for your NLTK version.
    tokens = nltk.word_tokenize(sentence)
    
    # The "\n" escapes must stay inside the string literals; a raw line break
    # inside a "..." literal is a syntax error.
    print("1 gram:\n", tokens, "\n")
    
    # 2 grams
    
    n = 2
    
    tokens_2 = nltk.ngrams(tokens, n)
    
    # Each n-gram is a tuple of tokens; join it with spaces to get one string.
    print("2 grams:\n", [' '.join(gram) for gram in tokens_2], "\n")
    
    # 3 grams
    
    n = 3
    
    tokens_3 = nltk.ngrams(tokens, n)
    
    print("3 grams:\n", [' '.join(gram) for gram in tokens_3], "\n")
    
    # 4 grams
    
    n = 4
    
    tokens_4 = nltk.ngrams(tokens, n)
    
    print("4 grams:\n", [' '.join(gram) for gram in tokens_4], "\n")
    
    outputs:
    1 gram:
     ['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.'] 
    
    2 grams:
     ['At eight', "eight o'clock", "o'clock on", 'on Thursday', 'Thursday morning', 'morning Arthur', 'Arthur did', "did n't", "n't feel", 'feel very', 'very good', 'good .'] 
    
    3 grams:
     ["At eight o'clock", "eight o'clock on", "o'clock on Thursday", 'on Thursday morning', 'Thursday morning Arthur', 'morning Arthur did', "Arthur did n't", "did n't feel", "n't feel very", 'feel very good', 'very good .'] 
    
    4 grams:
     ["At eight o'clock on", "eight o'clock on Thursday", "o'clock on Thursday morning", 'on Thursday morning Arthur', 'Thursday morning Arthur did', "morning Arthur did n't", "Arthur did n't feel", "did n't feel very", "n't feel very good", 'feel very good .'] 
    

    获取一段文字中的大写字母开头的词组和单词

    import nltk
    from nltk.corpus import stopwords
    
    # Extract the words and phrases (1- to 3-grams) in which every token
    # starts with a letter that is not lowercase.
    a = "I am Alex Lee. I am from Denman Prospect and I love this place very much. We don't like apple. The big one is good."
    tokens = nltk.word_tokenize(a)
    caps = []
    for i in range(1, 4):
        for eles in nltk.ngrams(tokens, i):
            # Keep the n-gram only if every token begins with an alphabetic,
            # non-lowercase character (punctuation and lowercase words reject it).
            if all(tok[0].isalpha() and not tok[0].islower() for tok in eles):
                caps.append(' '.join(eles))
    
    # Drop duplicates; note that going through a set leaves the order arbitrary.
    caps = list(set(caps))
    # Load the stopword list once instead of once per candidate, and use a set
    # for constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    # Remove capitalized stopwords such as "I", "We" and "The".
    caps = [c for c in caps if c.lower() not in stop_words]
    print(caps)
    
    outputs:
    ['Denman', 'Prospect', 'Alex Lee', 'Lee', 'Alex', 'Denman Prospect']
    
  • 相关阅读:
    matlab学习(1)——sparse函数和full函数处理稀疏矩阵
    MFC学习(6)——以数组矩阵形式表示读取出来的BMP图像||将数组矩阵数据转成BMP图像
    opencv学习(5)——HOG算子
    图像处理MFC学习(7)——实现8*8数组的DCT、IDCT
    MFC学习(3)——WIDTHBYTES()每行象素所占的字节数目
    【iOS】The differences between Class Extension and Header File 类扩展与头文件的区别
    PAT算法题学习笔记
    【前端】require函数实现原理
    【前端】从输入URL到页面加载完成的过程中都发生了什么事情
    Photoshop学习笔记(待续)
  • 原文地址:https://www.cnblogs.com/alex-bn-lee/p/10614774.html
Copyright © 2020-2023  润新知