【发布时间】:2016-01-29 07:36:11
【问题描述】:
我正在使用 Elasticsearch 按参演演员搜索电影。例如,当我搜索“莱昂纳多·迪卡普里奥”时,会返回大约 10 部电影,但它们的分数各不相同。既然它们都包含同一位演员,我原本期望它们的分数相同。有没有人能解释为什么会出现这种情况,以及该如何避免?
Elasticsearch 1.7.2 版
映射:
{
"programs": {
"mappings": {
"program_doc_type": {
"properties": {
"cast": {
"type": "string",
"analyzer": "keyword_analyzer",
"fields": {
"name": {
"type": "string",
"analyzer": "name_analyzer"
}
}
},
"django_id": {
"type": "integer"
},
"has_poster": {
"type": "boolean"
},
"imdb_id": {
"type": "string",
"index": "not_analyzed"
},
"kind": {
"type": "string",
"index": "not_analyzed"
},
"record_url_count": {
"type": "integer"
},
"release_date": {
"type": "date",
"format": "dateOptionalTime"
},
"release_year": {
"type": "integer"
},
"title": {
"type": "string",
"analyzer": "pattern"
},
"tms_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
分析器:
"analysis": {
"analyzer": {
"keyword_analyzer": {
"type": "custom",
"filter": [
"lowercase"
],
"tokenizer": "keyword"
},
"name_analyzer": {
"type": "custom",
"filter": [
"lowercase"
],
"tokenizer": "whitespace"
}
}
}
查询:
{
"query": {
"match": {"cast.name": "leonardo dicaprio"}
}
}
首页结果:
{
"took": 12,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 62,
"max_score": 12.046804,
"hits": [
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "1077511",
"_score": 12.046804,
"_source": {
"imdb_id": "tt4007278",
"tms_id": "",
"record_url_count": 0,
"release_date": "2014-08-20",
"title": "Carbon",
"has_poster": false,
"release_year": 2014,
"django_id": 1077511,
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "812919",
"_score": 11.906615,
"_source": {
"imdb_id": "tt2076929",
"tms_id": "",
"record_url_count": 0,
"title": "Satori",
"has_poster": false,
"release_year": 2014,
"django_id": 812919,
"kind": "N/A",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "376792",
"_score": 11.886408,
"_source": {
"imdb_id": "tt0402538",
"tms_id": "",
"record_url_count": 0,
"title": "Titanic: The Premiere",
"has_poster": true,
"release_year": 2000,
"django_id": 376792,
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "306106",
"_score": 11.69776,
"_source": {
"imdb_id": "tt0325727",
"tms_id": "",
"record_url_count": 0,
"release_date": "1998-08-16",
"title": "Leo Mania",
"has_poster": true,
"release_year": 1998,
"django_id": 306106,
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "269743",
"_score": 9.637444,
"_source": {
"imdb_id": "tt0286234",
"tms_id": "",
"record_url_count": 0,
"title": "Total Eclipse",
"has_poster": false,
"release_year": 1995,
"django_id": 269743,
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Agnieszka Holland"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "840945",
"_score": 9.358208,
"_source": {
"imdb_id": "tt2195237",
"tms_id": "",
"record_url_count": 0,
"release_date": "2004-12-01",
"title": "MovieReal: The Aviator",
"has_poster": false,
"release_year": 2004,
"django_id": 840945,
"kind": "series",
"cast": [
"Leonardo DiCaprio",
"Martin Scorsese"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "382168",
"_score": 9.358208,
"_source": {
"imdb_id": "tt0408269",
"tms_id": "",
"record_url_count": 0,
"release_date": "1998-09-29",
"title": "To Leo with Love",
"has_poster": true,
"release_year": 1998,
"django_id": 382168,
"kind": "movie",
"cast": [
"Jo Wyatt",
"Leonardo DiCaprio"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "846212",
"_score": 7.2280827,
"_source": {
"imdb_id": "tt2218442",
"tms_id": "",
"record_url_count": 0,
"title": "Legacy of Secrecy",
"has_poster": false,
"release_year": 1947,
"django_id": 846212,
"kind": "N/A",
"cast": [
"Leonardo DiCaprio",
"Robert De Niro",
"D'Anthony Palms"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "595027",
"_score": 7.1439695,
"_source": {
"imdb_id": "tt1294988",
"tms_id": "",
"record_url_count": 0,
"release_date": "2006-09-27",
"title": "Emporio Armani 'Red' One Night Only",
"has_poster": false,
"release_year": 2006,
"django_id": 595027,
"kind": "movie",
"cast": [
"Kim Cattrall",
"Leonardo DiCaprio",
"Beyoncé Knowles"
]
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "752646",
"_score": 7.1439695,
"_source": {
"imdb_id": "tt1826731",
"tms_id": "",
"record_url_count": 0,
"release_date": "2009-06-02",
"title": "Lives of Quiet Desperation: The Making of Revolutionary Road",
"has_poster": false,
"release_year": 2009,
"django_id": 752646,
"kind": "movie",
"cast": [
"Kathy Bates",
"Leonardo DiCaprio",
"Kate Winslet"
]
}
}
]
}
}
更新:
我禁用了字段长度归一化(field-length norm),结果似乎有了很大改善,但分数仍然不完全相同。我还是很困惑。根据我读到的资料,决定相关性的因素有三个:
- 词频
- 逆文档频率
- 字段长度归一化(已禁用)
由于每个节目的演员表中莱昂纳多·迪卡普里奥只出现一次,在我看来它们的分数应该相同,但事实并非如此。也许是我理解有误。以下是禁用字段长度归一化后的更新设置:
映射:
{
"programs": {
"mappings": {
"program_doc_type": {
"properties": {
"cast": {
"type": "string",
"norms": {
"enabled": false
},
"analyzer": "keyword_analyzer",
"fields": {
"name": {
"type": "string",
"norms": {
"enabled": false
},
"analyzer": "name_analyzer"
}
}
},
"django_id": {
"type": "integer"
},
"has_poster": {
"type": "boolean"
},
"imdb_id": {
"type": "string",
"index": "not_analyzed"
},
"kind": {
"type": "string",
"index": "not_analyzed"
},
"record_url_count": {
"type": "integer"
},
"release_date": {
"type": "date",
"format": "dateOptionalTime"
},
"release_year": {
"type": "integer"
},
"title": {
"type": "string",
"analyzer": "pattern"
},
"tms_id": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
}
首页结果:
{
"took": 20,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 836,
"max_score": 13.778852,
"hits": [
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "421026",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 421026,
"imdb_id": "tt0449557",
"has_poster": false,
"release_date": "2005-05-24",
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Jeffrey M. Schwartz",
"Donald L. Barlett",
"James B. Steele"
],
"release_year": 2005,
"record_url_count": 0,
"title": "The Affliction of Howard Hughes: Obsessive-Compulsive Disorder"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "555015",
"_score": 13.778852,
"_source": {
"tms_id": "MV002510340000",
"django_id": 555015,
"imdb_id": "tt1130884",
"has_poster": true,
"release_date": "2010-02-19",
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Mark Ruffalo",
"Ben Kingsley",
"Max von Sydow"
],
"release_year": 2010,
"record_url_count": 2,
"title": "Shutter Island"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "104669",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 104669,
"imdb_id": "tt0108330",
"has_poster": true,
"release_date": "1993-04-23",
"kind": "movie",
"cast": [
"Robert De Niro",
"Ellen Barkin",
"Leonardo DiCaprio",
"Jonah Blechman"
],
"release_year": 1993,
"record_url_count": 1,
"title": "This Boy's Life"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "846212",
"_score": 13.778852,
"_source": {
"django_id": 846212,
"title": "Legacy of Secrecy",
"imdb_id": "tt2218442",
"has_poster": false,
"kind": "N/A",
"cast": [
"Leonardo DiCaprio",
"Robert De Niro",
"D'Anthony Palms"
],
"release_year": 1947,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "256632",
"_score": 13.778852,
"_source": {
"django_id": 256632,
"title": "The Movie Show",
"imdb_id": "tt0271918",
"has_poster": false,
"kind": "series",
"cast": [
"Ray Brady",
"Russell Crowe",
"Larry Day",
"Leonardo DiCaprio"
],
"release_year": 1986,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "269743",
"_score": 13.778852,
"_source": {
"django_id": 269743,
"title": "Total Eclipse",
"imdb_id": "tt0286234",
"has_poster": false,
"kind": "movie",
"cast": [
"Leonardo DiCaprio",
"Agnieszka Holland"
],
"release_year": 1995,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "1007190",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 1007190,
"imdb_id": "tt3391950",
"has_poster": false,
"release_date": "2013-12-29",
"kind": "series",
"cast": [
"Leonardo DiCaprio",
"Jonah Hill",
"Martin Scorsese",
"Terence Winter"
],
"release_year": 2013,
"record_url_count": 0,
"title": "The Hollywood Reporter in Focus"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "1077511",
"_score": 13.778852,
"_source": {
"tms_id": "",
"django_id": 1077511,
"imdb_id": "tt4007278",
"has_poster": false,
"release_date": "2014-08-20",
"kind": "movie",
"cast": [
"Leonardo DiCaprio"
],
"release_year": 2014,
"record_url_count": 0,
"title": "Carbon"
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "302615",
"_score": 13.57246,
"_source": {
"django_id": 302615,
"title": "Directors: James Cameron",
"imdb_id": "tt0322031",
"has_poster": true,
"kind": "movie",
"cast": [
"Michael Biehn",
"James Cameron",
"Jamie Lee Curtis",
"Leonardo DiCaprio"
],
"release_year": 1997,
"record_url_count": 0,
"tms_id": ""
}
},
{
"_index": "programs",
"_type": "program_doc_type",
"_id": "509785",
"_score": 13.57246,
"_source": {
"tms_id": "",
"django_id": 509785,
"imdb_id": "tt0923573",
"has_poster": false,
"release_date": "2003-05-06",
"kind": "movie",
"cast": [
"Frank Abagnale Jr.",
"Amy Adams",
"Nathalie Baye",
"Leonardo DiCaprio"
],
"release_year": 2003,
"record_url_count": 0,
"title": "'Catch Me If You Can': The Casting of the Film"
}
}
]
}
}
结果有了很大改善,但最后 2 个的分数仍然与其他结果不同。
【问题讨论】:
-
这可能不相关,但您说有 10 个结果,而您的输出却显示有 70182 个结果?另外,您贴出的就是完整的查询,除了 match 查询之外没有其他内容,对吧?
-
我刚刚重新检索了所有新鲜数据并更新了帖子
-
您是否删除了索引并重新索引了所有内容?在禁用 norms 之后,我这边得到的分数是相同的。如果您没有完全删除索引,那么根据文档:
Norms will not be removed instantly, but will be removed as old segments are merged into new segments as you continue indexing new documents. Any score computation on a field that has had norms removed might return inconsistent results since some documents won’t have norms anymore while other documents might still have norms.
-
我删除了整个索引并重新建立了索引,但还是一样。
-
我明白了,看来我的数据集太小了。我可能说得不对,但这也许是因为文档位于不同的分片上、由各分片分别评分造成的。您可以使用 explain API 查看分数的计算过程,对比那些分数不同的文档,就能知道原因了。希望这对您有所帮助。
标签: elasticsearch