Details
-
Bug
-
Resolution: Fixed
-
Critical
-
7.6.0, 7.6.2, 7.6.1
-
Untriaged
-
0
-
Unknown
Description
With the 2024-04-29 source build we get very different “scores”, even though I use the exact same data in the xattrs and in the doc body, as base64 strings (and also as JSON numeric arrays).
Test #1
Below we have two indexes and two data sets with the same base64 embedding at 32 partitions.
Note the different scores: about 0.05 for the body and about 33.0 for the xattrs.
Here we query from the body of the document:
{
|
"knn": [
|
{
|
"k": 5,
|
"field": "embedding_base64",
|
"vector_base64": "B/w2vLucQT ….. vector from doc t1 …. YYS8xn9Ouw=="
|
}
|
],
|
"fields": [
|
"*"
|
],
|
"query": {
|
"match_none": {}
|
},
|
"explain": true,
|
"size": 10,
|
"from": 0
|
}
|
This is the same query against the base64 string in the user XATTRs (as above)
{
|
"knn": [
|
{
|
"k": 3,
|
"field": "embedding_base64",
|
"vector_base64": "B/w2vLucQT ….. vector from doc t1 …. YYS8xn9Ouw=="
|
}
|
],
|
"fields": [
|
"*"
|
],
|
"query": {
|
"match_none": {}
|
},
|
"explain": true,
|
"size": 10,
|
"from": 0
|
}
|
Test #2
We have the same scoring issue if we use JSON numeric arrays in the xattrs or the doc body. I copied the doc.embedding into the user xattrs as "embedding".
{
|
"knn": [{
|
"k": 10,
|
"field": "embedding",
|
"vector": [-0.011168486, 0.011817153, -0.0006976959, ....... }],
|
"fields": ["*"],
|
"query": {
|
"match_none": {}
|
}
|
}
|
The ground truth is as follows; the query above does very well in both accuracy and recall@5:
"ground_truth": [
|
"k6841",
|
"k9630",
|
"k1507",
|
"k9749",
|
"k9467",
|
"k3017",
|
"k6301",
|
"k6502",
|
"k1130",
|
"k5569",
|
"k7678",
|
"k799",
|
"k5973",
|
"k1408",
|
"k407",
|
"k7186"
|
]
|
BUT the same data in the XATTRs doesn't return any ground truth hits at all and has wacky scores
{
|
"knn": [{
|
"k": 10,
|
"field": "_$xattrs.embedding",
|
"vector": [-0.011168486, 0.011817153, -0.0006976959, .......
|
}],
|
"fields": ["*"],
|
"query": {
|
"match_none": {}
|
}
|
}
|
}
|
NO ground truths are found above and the accuracy + recall@5 is terrible
Other
Looking at the exact same query for a single doc, say k1993 (I used eventing to do an exact copy from the doc body to the xattr).
The index was:
{
|
"type": "fulltext-index",
|
"name": "std._default.a_std_part8",
|
"uuid": "33ae9ed1fd34f099",
|
"sourceType": "gocbcore",
|
"sourceName": "std",
|
"sourceUUID": "52b2a8cd47d84f4ad9865dc2087e83da",
|
"planParams": {
|
"maxPartitionsPerPIndex": 32,
|
"indexPartitions": 32
|
},
|
"params": {
|
"doc_config": {
|
"docid_prefix_delim": "",
|
"docid_regexp": "",
|
"mode": "scope.collection.type_field",
|
"type_field": "type"
|
},
|
"mapping": {
|
"analysis": {},
|
"default_analyzer": "standard",
|
"default_datetime_parser": "dateTimeOptional",
|
"default_field": "_all",
|
"default_mapping": {
|
"dynamic": false,
|
"enabled": false
|
},
|
"default_type": "_default",
|
"docvalues_dynamic": false,
|
"index_dynamic": false,
|
"store_dynamic": false,
|
"type_field": "_type",
|
"types": {
|
"_default._default": {
|
"dynamic": false,
|
"enabled": true,
|
"properties": {
|
"_$xattrs": {
|
"dynamic": false,
|
"enabled": true,
|
"properties": {
|
"embedding": {
|
"dynamic": false,
|
"enabled": true,
|
"fields": [
|
{
|
"dims": 4096,
|
"index": true,
|
"name": "embedding",
|
"similarity": "dot_product",
|
"type": "vector",
|
"vector_index_optimized_for": "recall"
|
}
|
]
|
}
|
}
|
},
|
"embedding": {
|
"dynamic": false,
|
"enabled": true,
|
"fields": [
|
{
|
"dims": 4096,
|
"index": true,
|
"name": "embedding",
|
"similarity": "dot_product",
|
"type": "vector",
|
"vector_index_optimized_for": "recall"
|
}
|
]
|
}
|
}
|
}
|
}
|
},
|
"store": {
|
"indexType": "scorch",
|
"segmentVersion": 16
|
}
|
},
|
"sourceParams": {
|
"includeXAttrs": true
|
}
|
}
|
The vectors to test with for the ground truth doc "t1" are attached.