• ik中文分词器及拼音分词器试用


    安装

    ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip
    ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v5.6.4/elasticsearch-analysis-pinyin-5.6.4.zip

    安装后需要重启elasticsearch服务

    查看当前已安装插件

    GET _cat/plugins
    
    结果
    node01 analysis-ik     5.6.4
    node01 analysis-pinyin 5.6.4

    测试中文分词器,支持ik_max_word和ik_smart两种方式

    GET _analyze
    {
      "analyzer":"ik_max_word",
      "text":"中华人民共和国国歌"
    }
    结果
    {
      "tokens": [
        {
          "token": "中华人民共和国",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "中华人民",
          "start_offset": 0,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 1
        },
        {
          "token": "中华",
          "start_offset": 0,
          "end_offset": 2,
          "type": "CN_WORD",
          "position": 2
        },
        {
          "token": "华人",
          "start_offset": 1,
          "end_offset": 3,
          "type": "CN_WORD",
          "position": 3
        },
        {
          "token": "人民共和国",
          "start_offset": 2,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 4
        },
        {
          "token": "人民",
          "start_offset": 2,
          "end_offset": 4,
          "type": "CN_WORD",
          "position": 5
        },
        {
          "token": "共和国",
          "start_offset": 4,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 6
        },
        {
          "token": "共和",
          "start_offset": 4,
          "end_offset": 6,
          "type": "CN_WORD",
          "position": 7
        },
        {
          "token": "国",
          "start_offset": 6,
          "end_offset": 7,
          "type": "CN_CHAR",
          "position": 8
        },
        {
          "token": "国歌",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 9
        }
      ]
    }
    使用ik_smart,则会尽可能少地返回词语:
    {
      "tokens": [
        {
          "token": "中华人民共和国",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "国歌",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 1
        }
      ]
    }

    ik分词器支持自定义词库

    vi config/IKAnalyzer.cfg.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
        <comment>IK Analyzer 扩展配置</comment>
        <!--用户可以在这里配置自己的扩展字典 -->
        <entry key="ext_dict">zhouls.dic</entry>
        <!--用户可以在这里配置自己的扩展停止词字典-->
        <entry key="ext_stopwords"></entry>
        <!--用户可以在这里配置远程扩展字典 -->
        <!-- <entry key="remote_ext_dict">words_location</entry> -->
        <!--用户可以在这里配置远程扩展停止词字典-->
        <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
    </properties>

    #配置完成需要重启服务

    简单测试拼音分词

    PUT test08
    {
      "index": {
        "analysis": {
          "analyzer": {
            "pinyin_analyzer": {
              "tokenizer": "my_pinyin",
              "filter": "word_delimiter"
            }
          },
          "tokenizer": {
            "my_pinyin": {
              "type": "pinyin",
              "first_letter": "none",
              "padding_char": " "
            }
          }
        }
      }
    }
    
    GET test08/_analyze
    {
      "text":"刘德华",
      "analyzer":"pinyin_analyzer"
    }
    结果
    {
      "tokens": [
        {
          "token": "liu",
          "start_offset": 0,
          "end_offset": 1,
          "type": "word",
          "position": 0
        },
        {
          "token": "ldh",
          "start_offset": 0,
          "end_offset": 3,
          "type": "word",
          "position": 0
        },
        {
          "token": "de",
          "start_offset": 1,
          "end_offset": 2,
          "type": "word",
          "position": 1
        },
        {
          "token": "hua",
          "start_offset": 2,
          "end_offset": 3,
          "type": "word",
          "position": 2
        }
      ]
    }

    同时支持中文和拼音的分词器

    PUT test06
    {
      "settings":{
        "number_of_shards":"1",
        "index.refresh_interval":"15s",
        "index":{
          "analysis":{
            "analyzer":{
               "ik_pinyin_analyzer":{
                "type":"custom",
                "tokenizer":"ik_smart",
                "filter":"pinyin_filter"
              }
            },
            "filter":{
              "pinyin_filter":{
                "type":"pinyin",
                "keep_first_letter": false
              }
            }
          }
        }
      },
      "mappings": {
        "doc":{
          "properties": {
            "name":{
              "type": "text",
              "analyzer": "ik_pinyin_analyzer"
            }
          }
        }
      }
    }
    
    POST test06/_analyze
    {
      "analyzer": "ik_pinyin_analyzer",
      "text":"中华人民共和国国歌"
    }
    结果
    {
      "tokens": [
        {
          "token": "zhong",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "hua",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 1
        },
        {
          "token": "ren",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 2
        },
        {
          "token": "min",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 3
        },
        {
          "token": "gong",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 4
        },
        {
          "token": "he",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 5
        },
        {
          "token": "guo",
          "start_offset": 0,
          "end_offset": 7,
          "type": "CN_WORD",
          "position": 6
        },
        {
          "token": "guo",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 7
        },
        {
          "token": "ge",
          "start_offset": 7,
          "end_offset": 9,
          "type": "CN_WORD",
          "position": 8
        }
      ]
    }

    参考文档:

    https://blog.csdn.net/u013905744/article/details/80935846

    https://www.cnblogs.com/xing901022/p/5910139.html

    https://blog.csdn.net/qq_28018283/article/details/80396937

  • 相关阅读:
    大项目之网上书城(五)——主页(End)
    大项目之网上书城(四)——主页(下中)
    大项目之网上书城(三)——主页(中)
    大项目之网上书城(二)——主页(上)
    大项目之网上书城(一)——注册页面
    Mycat
    centos7 bash: netstat: 未找到命令
    docker 推送镜像到Harbor错误修改
    Jenkins插件下载镜像加速
    docker镜像加速器
  • 原文地址:https://www.cnblogs.com/libin2015/p/10497647.html
Copyright © 2020-2023  润新知