• spaCy 基本使用



    关于 spaCy

    官网:https://spacy.io

    相比 NLTK

    • spacy 速度更快
    • 可以和深度学习结合

    安装

    (base) $ python -m spacy download en
    

    import spacy
    
    

    加载英文模型,需要先安装模型 $ sudo python -m spacy download en

    如果显示 ✘ Couldn't link model to 'en'
    则注意执行命令前加上 sudo

    # Load the English model (spaCy 2.x shortcut name)
    # NOTE(review): spaCy 3+ removed the 'en' shortcut — the full package
    # name 'en_core_web_sm' is required there; confirm the installed version.
    nlp = spacy.load('en')
     
    doc = nlp('Start using Neo4j with built-in guides.Learn the basics of graph database technology ')
     
    type(doc) 
    # spacy.tokens.doc.Doc
    

    分词

    # Tokenization: one token per line
    for tok in doc:
        print(tok)
    
    '''
        Start
        using
        Neo4j
        with
        built
        -
        in
        guides
        .
        Learn
        the
        basics
        of
        graph
        database
        technology
    '''
    

    分句

    # Sentence segmentation: one sentence per line
    for sentence in doc.sents:
        print(sentence)
    
    '''
        Start using Neo4j within seconds, with built-in guides and sample datasets for popular use cases.
        Learn the basics of graph database technology, from building a data model to deploying a graph-powered application.
    '''
    

    词性

    
    # Part-of-speech tagging: token followed by its coarse POS tag
    for tok in doc:
        tag = tok.pos_
        print('{} {}'.format(tok, tag))
    '''
    
        Start VERB
        using VERB
        Neo4j PROPN
        within ADP
        seconds NOUN
        , PUNCT
        with ADP
        built VERB
        - PUNCT
        in ADP
        guides NOUN
        and CCONJ
        sample NOUN
        datasets NOUN
        for ADP
        popular ADJ
        use NOUN
        cases NOUN
        . PUNCT
        Learn VERB
        the DET
        basics NOUN
        of ADP
        graph PROPN
        database NOUN
        technology NOUN
        , PUNCT
        from ADP
        building VERB
        a DET
        data NOUN
        model NOUN
        to ADP
        deploying VERB
        a DET
        graph NOUN
        - PUNCT
        powered VERB
        application NOUN
        . PUNCT
    '''
    

    命名实体识别

    
    doc2 = nlp('arXiv is a free distribution service and an open-access archive for 1,812,439 scholarly articles. Materials on this site are not peer-reviewed by arXiv.')
     
    # Named-entity recognition: print each detected entity and its label
    for ent in doc2.ents:
        print('{}, {} '.format(ent, ent.label_))   # entity, entity type
    '''
    arXiv, ORG 
    1,812,439, CARDINAL 
    arXiv, ORG 
    '''
    
    from spacy import displacy
    displacy.render(doc2, style='ent')
    
    


    displacy.render(doc2, style='ent')
    
    displacy.render(doc2, style='dep')
    

    (此处原文为 displacy 依赖分析可视化截图,图片略)


    频次统计

    # 找到一本书中 人名出现频次
    
    def read_file(filename):
        """Return the entire contents of *filename* as one string.

        The encoding is pinned to UTF-8 so the result does not depend on
        the platform's default locale encoding (the original relied on it).
        """
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()
        
    def read_file_to_list(filename):
        """Return the contents of *filename* as a list of lines.

        Line terminators are kept (readlines semantics). UTF-8 is specified
        explicitly so decoding does not depend on the platform default.
        """
        with open(filename, 'r', encoding='utf-8') as file:
            return file.readlines()
     
    text = read_file('data/pride_and_prejudice.txt')
     
    precessed_text = nlp(text) # this call takes quite a long time on a whole book
     
    # Inspect the sentence count
    # items in sents are not strings but spacy.tokens.span.Span objects
    sents = [sent for sent in precessed_text.sents]
     
    print(len(sents))
    #  7153
    
    
    

    from collections import Counter
     
    def find_person(doc):
        """Return the 10 most frequent PERSON entities in *doc*.

        Result is a list of (lemma, count) pairs, most common first.

        Bug fixes vs. the original:
         - iterate over the `doc` parameter instead of the global
           `precessed_text`, which silently ignored the argument;
         - drop the unused `names` list;
         - remove the stray leading space before `def` (IndentationError).
        """
        counts = Counter()
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                counts[ent.lemma_] += 1
        return counts.most_common(10)
            
    find_person(precessed_text)
    '''
        [('Elizabeth', 600),
         ('Darcy', 355),
         ('Jane', 277),
         ('Bingley', 260),
         ('Bennet', 258),
         ('Collins', 166),
         ('Wickham', 108),
         ('Lizzy', 94),
         ('Gardiner', 90),
         ('Lady Catherine', 76)]
    '''
    
    

    恐怖袭击分析

    
     
    text2 = read_file_to_list('data/rand-terrorism-dataset.txt')
    
    text2[:5]
    '''
        ['CHILE.  An explosion from a single stick of dynamite went off on the patio of the Santiago Binational Center, causing $21,000 in damages.
    ',
         'ISRAEL.  Palestinian terrorists fired five mortar shells into the collective settlement at Masada, causing slight damage but no injuries.
    ',
         'GUATEMALA.  A bomb was thrown over the wall surrounding the U.S. Marines guards house in Guatemala City, causing damage but no injuries.
    ',
         'FRANCE.  Five French students bombed the Paris offices of   Chase Manhattan Bank before dawn.  Trans-World Airways and the Bank of America were also bombed.   They claimed to be protesting the U.S. involvement in the Vietnam war.
    ',
         'UNITED STATES - Unidentified anti-Castro Cubans attempted to bomb the Miami branch of the Spanish National Tourist Office.
    ']
    '''
    
    
    
    # Which terrorist groups carried out how many attacks in which countries
     
    nlp2 = [nlp(art)  for art in text2]
    # nlp2.sents[:3]
     
    # Only the well-known names below are used for the demonstration
    # NOTE(review): the spaced hyphens ('al - qaeda') presumably mirror
    # spaCy's tokenised lemma output — confirm against the model used.
    
    common_terrorist_groups = [
        'taliban', 
        'al - qaeda', 
        'hamas',  
        'fatah', 
        'plo', 
        'bilad al - rafidayn'
    ]
    
    common_locations = [
        'iraq',
        'baghdad', 
        'kirkuk', 
        'mosul', 
        'afghanistan', 
        'kabul',
        'basra', 
        'palestine', 
        'gaza', 
        'israel', 
        'istanbul', 
        'beirut', 
        'pakistan'
    ]
    
    from collections import defaultdict
     
    # Nested counter: group/person -> Counter of attack locations
    location_entity_dict = defaultdict(Counter)
     
    # Scan every article for (group, location) co-occurrences.
    # Bug fix: the reference lists above are all lower-case, while spaCy
    # lemmas keep their capitalisation (e.g. 'Taliban'), so the original
    # membership tests never matched and the dict always stayed empty —
    # exactly the empty defaultdict shown in the output below. Lower-casing
    # the lemmas before comparing fixes it.
    for art in nlp2:
        # terrorist groups or people
        art_terrorist_group = [ent.lemma_.lower() for ent in art.ents
                               if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
        # locations (geo-political entities)
        art_locations = [ent.lemma_.lower() for ent in art.ents if ent.label_ == 'GPE']
        
        # keep only the well-known names we care about
        terrorist_common = [ent for ent in art_terrorist_group if ent in common_terrorist_groups]
        locations_common = [ent for ent in art_locations if ent in common_locations]
        
        # count what each group did in each place
        for found_ent in terrorist_common:
            for found_loc in locations_common:
                location_entity_dict[found_ent][found_loc] += 1
                
    
    print(location_entity_dict)
        
        
    
    defaultdict(<class 'collections.Counter'>, {})
    
    location_entity_dict
    # defaultdict(collections.Counter, {})
    
     
    location_entity_dict = defaultdict(Counter)
    
    # Second pass, same counting logic as above.
    # Bug fix: lemmas are lower-cased before the membership tests — the
    # common_* reference lists are lower-case while spaCy lemmas keep their
    # capitalisation, so without this the dict always stayed empty.
    for article in nlp2:
        
        # people or organisations
        article_terrorist_groups = [ent.lemma_.lower() for ent in article.ents
                                    if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
        # geo-political entities
        article_locations = [ent.lemma_.lower() for ent in article.ents if ent.label_ == 'GPE']
        terrorist_common = [ent for ent in article_terrorist_groups if ent in common_terrorist_groups]
        locations_common = [ent for ent in article_locations if ent in common_locations]
        
        for found_entity in terrorist_common:
            for found_location in locations_common:
                location_entity_dict[found_entity][found_location] += 1
                
     
    location_entity_dict
    # now populated, e.g. defaultdict(Counter, {'taliban': Counter({'afghanistan': ...}), ...})
    


    相关资料



  • 相关阅读:
    从hadoop框架与MapReduce模式中谈海量数据处理
    Hadoop
    Clone Graph
    Gas Station
    ZigZag Conversion
    String to Integer (atoi)
    Palindrome Number
    Container With Most Water
    Longest Common Prefix
    求和问题总结(leetcode 2Sum, 3Sum, 4Sum, K Sum)
  • 原文地址:https://www.cnblogs.com/fldev/p/14371134.html
Copyright © 2020-2023  润新知