相关度评分 TF&IDF算法
Elasticsearch的相关度评分(relevance score)算法采用的是term frequency/inverse document frequency算法,简称为TF/IDF算法。(注意:从Elasticsearch 5.0开始,默认的相似度算法改为BM25,它是在TF/IDF基础上的改进;下文explain输出中的idf、tfNorm公式即为BM25的计算方式。)
算法介绍:
- relevance score算法:简单来说,就是计算出一个索引中的文本与搜索文本之间的关联匹配程度。
- TF/IDF算法:分为两个部分,TF 和 IDF,此外还会结合field-length norm
- Term Frequency(TF): 搜索文本中的各个词条在field文本中出现了多少次,出现的次数越多,就越相关
例如:
搜索请求:hello world
doc1: hello you, and world is very good
doc2: hello, how are you
那么此时根据TF算法,doc1的相关度要比doc2高,因为doc1中同时出现了hello和world,而doc2中只出现了hello。
- Inverse Document Frequency(IDF):搜索文本中的各个词条在整个索引的所有文档中出现的次数,出现的次数越多,就越不相关。
搜索请求: hello world
doc1: hello, today is very good.
doc2: hi world, how are you.
比如在index中有1万条document,hello这个单词在所有的document中一共出现了1000次,world这个单词在所有的document中一共出现了100次。world比hello更少见,因此匹配到world的权重更高,那么根据IDF算法,此时doc2的相关度要比doc1高。
- field-length norm:field的长度越长,相关度就越弱
搜索请求:hello world
doc1: {"title": "hello article", "content": "1万个单词"}
doc2: {"title": "my article", "content": "1万个单词, hi world"}
此时hello和world在整个index中出现的次数是一样多的。但是根据field-length norm,此时doc1比doc2的相关度要高,因为doc1是在较短的title字段中匹配到hello的,而doc2是在很长的content字段(1万多个单词)中匹配到world的。
_score是如何被计算出来的
GET /test_index/test_type/_search?explain
{ "query": { "match": { "test_field": "test hello" } } }
{ "took": 1, "timed_out": false, "_shards": { "total": 5, "successful": 5, "failed": 0 }, "hits": { "total": 3, "max_score": 0.843298, "hits": [ { "_shard": "[test_index][2]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "6", "_score": 0.843298, "_source": { "test_field": "test test" }, "_explanation": { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "sum of:", "details": [ { "value": 0.843298, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.843298, "description": "score(doc=0,freq=2.0 = termFreq=2.0\n), product of:", "details": [ { "value": 0.6931472, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 4, "description": "docCount", "details": [] } ] }, { "value": 1.2166219, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 2, "description": "termFreq=2.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 1.75, "description": "avgFieldLength", "details": [] }, { "value": 2.56, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][1]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "8", "_score": 0.43445712, "_source": { "test_field": "test client 2" }, "_explanation": { "value": 
0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "sum of:", "details": [ { "value": 0.43445715, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.43445715, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.47000363, "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 2, "description": "docFreq", "details": [] }, { "value": 3, "description": "docCount", "details": [] } ] }, { "value": 0.92436975, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3.3333333, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "_type:test_type, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } }, { "_shard": "[test_index][3]", "_node": "1LdqLFqxQQq4xg2MphI_gw", "_index": "test_index", "_type": "test_type", "_id": "7", "_score": 0.25316024, "_source": { "test_field": "test client 1" }, "_explanation": { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "sum of:", "details": [ { "value": 0.25316024, "description": "weight(test_field:test in 0) [PerFieldSimilarity], result of:", "details": [ { "value": 0.25316024, "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", "details": [ { "value": 0.2876821, 
"description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:", "details": [ { "value": 1, "description": "docFreq", "details": [] }, { "value": 1, "description": "docCount", "details": [] } ] }, { "value": 0.88, "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:", "details": [ { "value": 1, "description": "termFreq=1.0", "details": [] }, { "value": 1.2, "description": "parameter k1", "details": [] }, { "value": 0.75, "description": "parameter b", "details": [] }, { "value": 3, "description": "avgFieldLength", "details": [] }, { "value": 4, "description": "fieldLength", "details": [] } ] } ] } ] } ] }, { "value": 0, "description": "match on required clause, product of:", "details": [ { "value": 0, "description": "# clause", "details": [] }, { "value": 1, "description": "*:*, product of:", "details": [ { "value": 1, "description": "boost", "details": [] }, { "value": 1, "description": "queryNorm", "details": [] } ] } ] } ] } } ] } }