• elasticsearch2.x ik插件


    先来一个标准分词(standard),配置如下:

    curl -XPUT localhost:9200/local -d '{
    
        "settings" : {
    
            "analysis" : {
    
                "analyzer" : {
    
                    "stem" : {
    
                        "tokenizer" : "standard",
    
                        "filter" : ["standard", "lowercase", "stop", "porter_stem"]
    
                    }
    
                }
    
            }
    
        },
    
        "mappings" : {
    
            "article" : {
    
                "dynamic" : true,
    
                "properties" : {
    
                    "title" : {
    
                        "type" : "string",
    
                        "analyzer" : "stem"
    
                    }
    
                }
    
            }
    
        }
    
    }'

    index:local

    type:article

    analyzer:stem (作用于title字段;filter:standard、小写、停用词、porter_stem词干提取)

    field:title  

    测试:

    # Index Data
    
    curl -XPUT localhost:9200/local/article/1 -d'{"title": "Fight for your life"}'
    
    curl -XPUT localhost:9200/local/article/2 -d'{"title": "Fighting for your life"}'
    
    curl -XPUT localhost:9200/local/article/3 -d'{"title": "My dad fought a dog"}'
    
    curl -XPUT localhost:9200/local/article/4 -d'{"title": "Bruno fights Tyson tomorrow"}'
    
      
    
    # search on the title field, which is stemmed on index and search
    
    curl -XGET localhost:9200/local/_search?q=title:fight
    
      
    
    # searching on _all will not do any stemming, unless also configured on the mapping to be stemmed...
    
    curl -XGET localhost:9200/local/_search?q=fight

    例如:

    Fight for your life

    分词如下:

    {"tokens":[
    
    {"token":"fight","start_offset":1,"end_offset":6,"type":"<ALPHANUM>","position":1},
    {"token":"your","start_offset":11,"end_offset":15,"type":"<ALPHANUM>","position":3},
    {"token":"life","start_offset":16,"end_offset":20,"type":"<ALPHANUM>","position":4}
    
    ]}

    部署ik分词器

    在elasticsearch.yml中配置  index.analysis.analyzer.ik.type : "ik"

    delete之前创建的index,重新配置如下:

    curl -XPUT localhost:9200/local -d '{
    
        "settings" : {
    
            "analysis" : {
    
                "analyzer" : {
    
                    "ik" : {
    
                        "tokenizer" : "ik"
    
                    }
    
                }
    
            }
    
        },
    
        "mappings" : {
    
            "article" : {
    
                "dynamic" : true,
    
                "properties" : {
    
                    "title" : {
    
                        "type" : "string",
    
                        "analyzer" : "ik"
    
                    }
    
                }
    
            }
    
        }
    
    }'

    测试:

    curl 'http://localhost:9200/local/_analyze?analyzer=ik&pretty=true' -d'  
    
    {  
    
        "text":"中华人民共和国国歌" 
    
    }  
    
    '  
    
    {
    
      "tokens" : [ {
    
        "token" : "text",
    
        "start_offset" : 12,
    
        "end_offset" : 16,
    
        "type" : "ENGLISH",
    
        "position" : 1
    
      }, {
    
        "token" : "中华人民共和国",
    
        "start_offset" : 19,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 2
    
      }, {
    
        "token" : "国歌",
    
        "start_offset" : 26,
    
        "end_offset" : 28,
    
        "type" : "CN_WORD",
    
        "position" : 3
    
      } ]
    
    }

    如果我们想返回最细粒度的分词结果,需要在elasticsearch.yml中配置如下:

    index:
    
      analysis:
    
        analyzer:
    
          ik:
    
              alias: [ik_analyzer]
    
              type: org.elasticsearch.index.analysis.IkAnalyzerProvider
    
          ik_smart:
    
              type: ik
    
              use_smart: true
    
          ik_max_word:
    
              type: ik
    
              use_smart: false

    测试:

    curl 'http://localhost:9200/index/_analyze?analyzer=ik_max_word&pretty=true' -d'  
    
    {  
    
        "text":"中华人民共和国国歌" 
    
    }  
    
    '  
    
    {
    
      "tokens" : [ {
    
        "token" : "text",
    
        "start_offset" : 12,
    
        "end_offset" : 16,
    
        "type" : "ENGLISH",
    
        "position" : 1
    
      }, {
    
        "token" : "中华人民共和国",
    
        "start_offset" : 19,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 2
    
      }, {
    
        "token" : "中华人民",
    
        "start_offset" : 19,
    
        "end_offset" : 23,
    
        "type" : "CN_WORD",
    
        "position" : 3
    
      }, {
    
        "token" : "中华",
    
        "start_offset" : 19,
    
        "end_offset" : 21,
    
        "type" : "CN_WORD",
    
        "position" : 4
    
      }, {
    
        "token" : "华人",
    
        "start_offset" : 20,
    
        "end_offset" : 22,
    
        "type" : "CN_WORD",
    
        "position" : 5
    
      }, {
    
        "token" : "人民共和国",
    
        "start_offset" : 21,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 6
    
      }, {
    
        "token" : "人民",
    
        "start_offset" : 21,
    
        "end_offset" : 23,
    
        "type" : "CN_WORD",
    
        "position" : 7
    
      }, {
    
        "token" : "共和国",
    
        "start_offset" : 23,
    
        "end_offset" : 26,
    
        "type" : "CN_WORD",
    
        "position" : 8
    
      }, {
    
        "token" : "共和",
    
        "start_offset" : 23,
    
        "end_offset" : 25,
    
        "type" : "CN_WORD",
    
        "position" : 9
    
      }, {
    
        "token" : "国",
    
        "start_offset" : 25,
    
        "end_offset" : 26,
    
        "type" : "CN_CHAR",
    
        "position" : 10
    
      }, {
    
        "token" : "国歌",
    
        "start_offset" : 26,
    
        "end_offset" : 28,
    
        "type" : "CN_WORD",
    
        "position" : 11
    
      } ]
    
    }
  • 相关阅读:
    乱码解决方案SecureCRT中文乱码解决方案
    普通用户注销windows server 2003 普通用户(users)远程登录立即自动注销的解决方法
    jquery同步基于jquery的$.ajax async使用
    服务解释WinSer 8 无法访问共享官方解释
    备用nulljs 输出内容到新窗口
    返回解释Java乔晓松Android SD卡路径问题以及如何获取SDCard内存大小
    复制最佳实践MySQL 磁盘复制技术DRBD:优缺点比较、注意事项以及最佳实践
    schema类SpringMVC+Hibernate+Spring整合(二)
    类class2013第十四周上机任务【项目2 抽象Shape类】
    数据库javaJAVA连接oracle数据库
  • 原文地址:https://www.cnblogs.com/jiu0821/p/5625578.html
Copyright © 2020-2023  润新知