2017-12-10 11 views
2

Elasticsearchで一貫性のないスコアリング

インデックス(「デルタ」という名前)内のドキュメントに割り当てることができる、自分で作成したタグ(キーワードフレーズ)に対して全文検索を有効にしようとしています。

得られる結果は、(1)私が期待していたものではなく、(2)同じコードを繰り返し実行すると一貫していません。

以下はいくつかのコードです。コードとドキュメントやマッピングの他の部分に問題がないことを確認するために、マッピングとドキュメントを簡略化しました。私はKibana Dev Tools Consoleを使用してこのすべてを実行しています。

PUT /mdelta 
{ 
    "mappings":{ 
    "tags":{ 
     "properties":{ 
     "synonyms":{ 
      "type":"text" 
     } 
     } 
    } 
    } 
} 

POST _bulk 
{ "index" : { "_index" : "mdelta", "_type" : "tags" }} 
{"synonyms":"Iron"} 
{ "index" : { "_index" : "mdelta", "_type" : "tags" }} 
{"synonyms":"Fe"} 
{ "index" : { "_index" : "mdelta", "_type" : "tags" }} 
{"synonyms":"Iron Deficiency"} 
{ "index" : { "_index" : "mdelta", "_type" : "tags" }} 
{"synonyms":"Serum Iron"} 
{ "index" : { "_index" : "mdelta", "_type" : "tags" }} 
{"synonyms":"Iron Sulfate"} 
{ "index" : { "_index" : "mdelta", "_type" : "tags" }} 
{"synonyms":"Iron Deficiency Anemia"} 

GET mdelta/tags/_search 
{ 
    "explain":false, 
    "query": { 
     "match" : { 
      "synonyms" : "iron" 
     } 
    } 
} 

スコアリングアルゴリズムについての私の理解では、ドキュメント{"synonyms":"Iron"}が最初に(最高スコアで)返されるはずです。しかし、実際にはそうなりません。結果は次のとおりです。

{ 
    "took": 0, 
    "timed_out": false, 
    "_shards": { 
    "total": 5, 
    "successful": 5, 
    "skipped": 0, 
    "failed": 0 
    }, 
    "hits": { 
    "total": 5, 
    "max_score": 0.5377023, 
    "hits": [ 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj9", 
     "_score": 0.5377023, 
     "_source": { 
      "synonyms": "Iron Sulfate" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj5", 
     "_score": 0.2876821, 
     "_source": { 
      "synonyms": "Iron" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj8", 
     "_score": 0.25811607, 
     "_source": { 
      "synonyms": "Serum Iron" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj7", 
     "_score": 0.1805489, 
     "_source": { 
      "synonyms": "Iron Deficiency" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj-", 
     "_score": 0.14638957, 
     "_source": { 
      "synonyms": "Iron Deficiency Anemia" 
     } 
     } 
    ] 
    } 
} 

"explain"をtrueに設定して、同じクエリを再実行しました。

{ 
    "took": 38, 
    "timed_out": false, 
    "_shards": { 
    "total": 5, 
    "successful": 5, 
    "skipped": 0, 
    "failed": 0 
    }, 
    "hits": { 
    "total": 5, 
    "max_score": 0.5377023, 
    "hits": [ 
     { 
     "_shard": "[mdelta][4]", 
     "_node": "McQ619KqR0akS1mHvTXjDw", 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj9", 
     "_score": 0.5377023, 
     "_source": { 
      "synonyms": "Iron Sulfate" 
     }, 
     "_explanation": { 
      "value": 0.5377023, 
      "description": "weight(synonyms:iron in 1) [PerFieldSimilarity], result of:", 
      "details": [ 
      { 
       "value": 0.5377023, 
       "description": "score(doc=1,freq=1.0 = termFreq=1.0\n), product of:", 
       "details": [ 
       { 
        "value": 0.6931472, 
        "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "docFreq", 
         "details": [] 
        }, 
        { 
         "value": 2, 
         "description": "docCount", 
         "details": [] 
        } 
        ] 
       }, 
       { 
        "value": 0.7757405, 
        "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "termFreq=1.0", 
         "details": [] 
        }, 
        { 
         "value": 1.2, 
         "description": "parameter k1", 
         "details": [] 
        }, 
        { 
         "value": 0.75, 
         "description": "parameter b", 
         "details": [] 
        }, 
        { 
         "value": 1.5, 
         "description": "avgFieldLength", 
         "details": [] 
        }, 
        { 
         "value": 2.56, 
         "description": "fieldLength", 
         "details": [] 
        } 
        ] 
       } 
       ] 
      } 
      ] 
     } 
     }, 
     { 
     "_shard": "[mdelta][2]", 
     "_node": "McQ619KqR0akS1mHvTXjDw", 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj5", 
     "_score": 0.2876821, 
     "_source": { 
      "synonyms": "Iron" 
     }, 
     "_explanation": { 
      "value": 0.2876821, 
      "description": "weight(synonyms:iron in 0) [PerFieldSimilarity], result of:", 
      "details": [ 
      { 
       "value": 0.2876821, 
       "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", 
       "details": [ 
       { 
        "value": 0.2876821, 
        "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "docFreq", 
         "details": [] 
        }, 
        { 
         "value": 1, 
         "description": "docCount", 
         "details": [] 
        } 
        ] 
       }, 
       { 
        "value": 1, 
        "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "termFreq=1.0", 
         "details": [] 
        }, 
        { 
         "value": 1.2, 
         "description": "parameter k1", 
         "details": [] 
        }, 
        { 
         "value": 0.75, 
         "description": "parameter b", 
         "details": [] 
        }, 
        { 
         "value": 1, 
         "description": "avgFieldLength", 
         "details": [] 
        }, 
        { 
         "value": 1, 
         "description": "fieldLength", 
         "details": [] 
        } 
        ] 
       } 
       ] 
      } 
      ] 
     } 
     }, 
     { 
     "_shard": "[mdelta][3]", 
     "_node": "McQ619KqR0akS1mHvTXjDw", 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj8", 
     "_score": 0.25811607, 
     "_source": { 
      "synonyms": "Serum Iron" 
     }, 
     "_explanation": { 
      "value": 0.25811607, 
      "description": "weight(synonyms:iron in 0) [PerFieldSimilarity], result of:", 
      "details": [ 
      { 
       "value": 0.25811607, 
       "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", 
       "details": [ 
       { 
        "value": 0.2876821, 
        "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "docFreq", 
         "details": [] 
        }, 
        { 
         "value": 1, 
         "description": "docCount", 
         "details": [] 
        } 
        ] 
       }, 
       { 
        "value": 0.89722675, 
        "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "termFreq=1.0", 
         "details": [] 
        }, 
        { 
         "value": 1.2, 
         "description": "parameter k1", 
         "details": [] 
        }, 
        { 
         "value": 0.75, 
         "description": "parameter b", 
         "details": [] 
        }, 
        { 
         "value": 2, 
         "description": "avgFieldLength", 
         "details": [] 
        }, 
        { 
         "value": 2.56, 
         "description": "fieldLength", 
         "details": [] 
        } 
        ] 
       } 
       ] 
      } 
      ] 
     } 
     }, 
     { 
     "_shard": "[mdelta][1]", 
     "_node": "McQ619KqR0akS1mHvTXjDw", 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj7", 
     "_score": 0.1805489, 
     "_source": { 
      "synonyms": "Iron Deficiency" 
     }, 
     "_explanation": { 
      "value": 0.1805489, 
      "description": "weight(synonyms:iron in 0) [PerFieldSimilarity], result of:", 
      "details": [ 
      { 
       "value": 0.1805489, 
       "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:", 
       "details": [ 
       { 
        "value": 0.18232156, 
        "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
        "details": [ 
        { 
         "value": 2, 
         "description": "docFreq", 
         "details": [] 
        }, 
        { 
         "value": 2, 
         "description": "docCount", 
         "details": [] 
        } 
        ] 
       }, 
       { 
        "value": 0.9902773, 
        "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "termFreq=1.0", 
         "details": [] 
        }, 
        { 
         "value": 1.2, 
         "description": "parameter k1", 
         "details": [] 
        }, 
        { 
         "value": 0.75, 
         "description": "parameter b", 
         "details": [] 
        }, 
        { 
         "value": 2.5, 
         "description": "avgFieldLength", 
         "details": [] 
        }, 
        { 
         "value": 2.56, 
         "description": "fieldLength", 
         "details": [] 
        } 
        ] 
       } 
       ] 
      } 
      ] 
     } 
     }, 
     { 
     "_shard": "[mdelta][1]", 
     "_node": "McQ619KqR0akS1mHvTXjDw", 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "AWA8jRR9YXA6OBvYOfj-", 
     "_score": 0.14638957, 
     "_source": { 
      "synonyms": "Iron Deficiency Anemia" 
     }, 
     "_explanation": { 
      "value": 0.14638956, 
      "description": "weight(synonyms:iron in 1) [PerFieldSimilarity], result of:", 
      "details": [ 
      { 
       "value": 0.14638956, 
       "description": "score(doc=1,freq=1.0 = termFreq=1.0\n), product of:", 
       "details": [ 
       { 
        "value": 0.18232156, 
        "description": "idf, computed as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)) from:", 
        "details": [ 
        { 
         "value": 2, 
         "description": "docFreq", 
         "details": [] 
        }, 
        { 
         "value": 2, 
         "description": "docCount", 
         "details": [] 
        } 
        ] 
       }, 
       { 
        "value": 0.8029196, 
        "description": "tfNorm, computed as (freq * (k1 + 1))/(freq + k1 * (1 - b + b * fieldLength/avgFieldLength)) from:", 
        "details": [ 
        { 
         "value": 1, 
         "description": "termFreq=1.0", 
         "details": [] 
        }, 
        { 
         "value": 1.2, 
         "description": "parameter k1", 
         "details": [] 
        }, 
        { 
         "value": 0.75, 
         "description": "parameter b", 
         "details": [] 
        }, 
        { 
         "value": 2.5, 
         "description": "avgFieldLength", 
         "details": [] 
        }, 
        { 
         "value": 4, 
         "description": "fieldLength", 
         "details": [] 
        } 
        ] 
       } 
       ] 
      } 
      ] 
     } 
     } 
    ] 
    } 
} 

最初のヒット(「Iron Sulfate」)を見ると、docFreqが1、docCountが2と表示されていますが、これは間違っています。

また、DELETE /mdelta を実行してからコードを再実行すると、たとえば以下のように、結果の順序が変わることがあります。私は何を間違えているのでしょうか。

{ 
    "took": 4, 
    "timed_out": false, 
    "_shards": { 
    "total": 5, 
    "successful": 5, 
    "skipped": 0, 
    "failed": 0 
    }, 
    "hits": { 
    "total": 5, 
    "max_score": 0.2876821, 
    "hits": [ 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "Qd0JQWABt4cFDxBHv7Fe", 
     "_score": 0.2876821, 
     "_source": { 
      "synonyms": "Serum Iron" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "Pt0JQWABt4cFDxBHv7Fe", 
     "_score": 0.2876821, 
     "_source": { 
      "synonyms": "Iron" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "QN0JQWABt4cFDxBHv7Fe", 
     "_score": 0.2876821, 
     "_source": { 
      "synonyms": "Iron Deficiency" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "Qt0JQWABt4cFDxBHv7Fe", 
     "_score": 0.19856805, 
     "_source": { 
      "synonyms": "Iron Sulfate" 
     } 
     }, 
     { 
     "_index": "mdelta", 
     "_type": "tags", 
     "_id": "Q90JQWABt4cFDxBHv7Fe", 
     "_score": 0.16853254, 
     "_source": { 
      "synonyms": "Iron Deficiency Anemia" 
     } 
     } 
    ] 
    } 
} 

何かアイデアがあれば、大変ありがたいです。

+0

synonymsフィールドではどのアナライザを使用していますか? –

+0

何も指定していないので、standardアナライザです。Elastic 6.0のドキュメントより:「インデックス時にアナライザが指定されていない場合、インデックス設定内でdefaultという名前のアナライザが検索されます。それも見つからない場合は、standardアナライザが使用されます。」 – Blech

答えて

1

データを再索引付けする際に一貫した結果が得られないのは、用語頻度がシャードごとに計算されるためです。再インデックス時には、ルーティングを指定しないため、シャード割り当ては前のインデックスとは異なります。

Elasticsearchから期待どおりの結果が得られないという問題は、インデックス内のドキュメント数が少ないことが原因かもしれません。次のようにsearch_typeパラメータを指定してクエリを実行してみてください:

GET mdelta/tags/_search?search_type=dfs_query_then_fetch

これにより、最初にインデックスレベル(全シャード合算)の頻度が計算されます。これは開発中には使用できますが、本番環境ではお勧めできません。十分なデータがあれば、頻度はどのシャードでもほぼ同じになるはずです。

参照:https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-search-type.html

+0

素晴らしい。とても明快な説明です。ありがとうございます。自分でもきちんと理解できているか確認するために、シャードを強制的に指定してテストしてみたところ、こちらでも問題が解消しました。長期的な解決策ではありませんが、これでより明確に理解できました。 – Blech

+0

お役に立てて嬉しいです :) –

関連する問題