本文引自 https://blog.csdn.net/caideb/article/details/81632154

cnblog的排版好看很多,所以在这里建一篇分享博客。

-----------------------------------------------------------------------------------------------

 

扩展字典中的词会被当作完整的词切分出来,扩展停止词字典中的词则会从分词结果中被过滤掉。

1.未加入扩展字典和停止词字典时的分词结果

1) ik分词器

[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{
  "tokens" : [ {
    "token" : "自古",
    "start_offset" : 0,
    "end_offset" : 2,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "过",
    "start_offset" : 4,
    "end_offset" : 5,
    "type" : "CN_CHAR",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

2) ik_smart分词器

[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{
  "tokens" : [ {
    "token" : "自古",
    "start_offset" : 0,
    "end_offset" : 2,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "过",
    "start_offset" : 4,
    "end_offset" : 5,
    "type" : "CN_CHAR",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

3) ik_max_word分词器

[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{
  "tokens" : [ {
    "token" : "自古",
    "start_offset" : 0,
    "end_offset" : 2,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "过",
    "start_offset" : 4,
    "end_offset" : 5,
    "type" : "CN_CHAR",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

2.加入自定义字典

扩展字典:用于创建分词的字典

停止字典:用于过滤的字典,也就是说,出现在该字典中的单词或字符串都会从分词结果中被过滤掉

test.dic

刀扇
背刺

 

teststop.dic

自古
过

 

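在 /analysis-ik/config/IKAnalyzer.cfg.xml 中注册上面两个字典(以下字典路径仅为示例,请按字典文件的实际存放位置填写):

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <!-- 扩展字典 -->
    <entry key="ext_dict">custom/test.dic</entry>
    <!-- 扩展停止词字典 -->
    <entry key="ext_stopwords">custom/teststop.dic</entry>
</properties>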

 

1) ik分词器

[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728
{
  "tokens" : [ {
    "token" : "刀扇",
    "start_offset" : 2,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "背刺",
    "start_offset" : 5,
    "end_offset" : 7,
    "type" : "CN_WORD",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

2) ik_smart分词器

[root@localhost config]#  curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 260
{
  "tokens" : [ {
    "token" : "刀扇",
    "start_offset" : 2,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "背刺",
    "start_offset" : 5,
    "end_offset" : 7,
    "type" : "CN_WORD",
    "position" : 1
  } ]
}

 

3) ik_max_word分词器

[root@localhost config]#  curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇过背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728
{
  "tokens" : [ {
    "token" : "刀扇",
    "start_offset" : 2,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "背刺",
    "start_offset" : 5,
    "end_offset" : 7,
    "type" : "CN_WORD",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

相关文章:

  • 2022-12-23
  • 2022-02-13
  • 2021-05-05
  • 2022-01-26
  • 2022-02-10
  • 2021-07-03
  • 2021-09-27
  • 2022-12-23
猜你喜欢
  • 2021-11-24
  • 2021-08-27
  • 2021-08-12
  • 2021-05-21
  • 2021-07-16
  • 2022-12-23
  • 2021-06-10
相关资源
相似解决方案