Description
Update: This is possibly a documentation issue, but the original problem was caused by the fact that "array_distinct" returns results in apparently random order, and using the same expression in different parts of a query can have different results.
E.g., the following query:
select uuid, os1, os2
|
from nutshell
|
LET os1 = array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`),
|
os2 = array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)
|
where array_length(array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)) > 1 |
sometimes returns the same values for os1 and os2, but sometimes they are in a different order. If this is the expected behavior, it should be documented as such.
Working with the Nutshell data set (ask me if you want access to the data), the documents have a sub-object called 'node' which is an array of entries about each node in a cluster. It looks as follows. I'm interested in extracting the subfield `OS Name`:
"node": [ |
{
|
"node": "ns_1@127.0.0.1", |
"results": { |
"analysers": { |
"Open File Information": { |
"Command": [ |
"beam.smp", |
"memcached", |
"moxi", |
"==Total==" |
],
|
"File Descriptors": [ |
"74", |
"202", |
"46", |
"328" |
],
|
"Memory Maps": [ |
"40", |
"22", |
"18", |
"84" |
]
|
},
|
"Per Node Bucket Stats (CB buckets only)": { |
"Active": "-", |
"Bucket": "Total (0 buckets)", |
"Current Items": "0", |
"In-Memory KV (MB)": "0", |
"Metadata %": "-", |
"Replica": "0", |
"Resident Ratio": "-" |
}
|
},
|
"checkers": { |
"CB Packages": "ii couchbase-server 3.1.6 Couchbase Server", |
"CB Uptime": "??? (Attribute not found). Cannot check.", |
"CB Version": "3.1.6-1904-rel-enterprise - EOL Version", |
"Collection Time": "2017-07-04 02:21:08", |
"Data Directory": "/opt/couchbase/var/lib/couchbase/data - ext4 - (/dev/sdb1 1.1TB / 1% used)", |
"Data Quota": 77221, |
"Dedupe NMVB Maps": "Disabled", |
"Erlang VM vsize": 2078, |
"Installed CPUs": 40, |
"Installed RAM": 128702, |
"Interface 'bond0' MTU": "uses Jumbo Frames MTU value 9000", |
"Interface 'bond0' failures": "RX:363116 / TX:0 - Details:\n - RX packets:739,726 errors:0 dropped:363,116 overruns:0 frame:0 ", |
"Interface 'eth0' MTU": "uses Jumbo Frames MTU value 9000", |
"Interface 'eth0' failures": "RX:363103 / TX:0 - Details:\n - RX packets:362,638 errors:0 dropped:363,103 overruns:0 frame:0 ", |
"Interface 'eth1' MTU": "uses Jumbo Frames MTU value 9000", |
"Interface 'eth1' failures": "RX:0 / TX:0", |
"Interface 'eth2' Status": "down", |
"Interface 'eth3' Status": "down", |
"Interface 'vlan330' (172.20.19.66) failures": "RX:117 / TX:0 - Details:\n - RX packets:298,533 errors:0 dropped:117 overruns:0 frame:0 ", |
"Interface 'vlan330' MTU": "uses Jumbo Frames MTU value 9000", |
"Memcached Fragmentation": "??? (Attribute not found). Cannot check.", |
"Memcached vsize": 395, |
"NUMA Configuration": "2 nodes - Couchbase processes have default NUMA policy", |
"OS Name": "Linux 3.13.0-32-generic", |
"OS Version": "Ubuntu 12.04.5 LTS", |
"Swap Used": { |
"total": 28762108, |
"used": 0 |
},
|
"System Uptime": "23 min", |
"Total Server Quota": 77221, |
"Transparent Huge Pages": "always - Recommended is 'never' or 'madvise'", |
"Used RAM": { |
"total": 128701, |
"used": 1610 |
},
|
"analyse_connections": "Failed to run analyser", |
"check_non_default_max_connections": "Failed to run checker", |
"vm.swappiness": "0 (Recommended value)" |
}
|
}
|
}
|
]
|
I wanted to group different `OS Type` values by substrings they might contain, so I set up a case statement. Here is the very simplest form, which extracts the first `OS Name` and also indicates whether the substring `el6` was seen in the string:
select os, linux_type
|
from nutshell
|
LET os = array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)[0], |
linux_type = case when contains(array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)[0],"el6") |
then "el6" |
else "unknown" |
end
|
where array_length(array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)) > 0 |
The "let" clause is setting the value of "os", and is setting "linux_type" to "el6" if that substring is seen, and "unknown" otherwise.
Yet some of the results are are coming out with "os" containing "el6", and "linux_type" as "unknown", while others are correctly detecting "el6" and setting "linux_type" to "el6".
As an example, the following query looks for linux_type = "unknown" and the os name containing "el6". It should not return any values, but it does:
select os, linux_type
|
from nutshell
|
LET os = array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)[0], |
linux_type = case when contains(array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)[0],"el6") then "el6" else "unknown" end |
where array_length(array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)) > 0 |
and linux_type = "unknown" and contains(os,"el6") |
It seems not to be evaluating the nested array_star expressions, because if I change this to a nested query, it works properly:
select os, linux_type
|
from
|
|
(select os
|
from nutshell
|
LET os = array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)[0] |
where array_length(array_distinct(array_star(array_star(array_star(node).results).checkers).`OS Name`)) > 0) inner_query |
|
let
|
linux_type = case when contains(os,"el6") |
then "el6" |
else "unknown" |
end
|
where linux_type = "unknown" and contains(os,"el6") |