Couchbase N1QL query aggregation - json

I'm trying to write a query that will aggregate the results of a query to provide total values for the results matched.
The documents in the bucket look like this:
{
"clientId": "test-client",
"event": {
"history": [
{
"code": "FAILED",
"serviceId": "s1"
},
{
"code": "SUCCESS",
"serviceId": "s2"
}
],
"size": 200
}
},
{
"clientId": "test-client",
"event": {
"history": [
{
"code": "FAILED",
"serviceId": "s1"
},
{
"code": "SUCCESS",
"serviceId": "s2"
}
],
"size": 200
}
},
{
"clientId": "test-client",
"event": {
"history": [
{
"code": "SUCCESS",
"serviceId": "s1"
}
],
"size": 200
}
}
The output document I'm looking to produce looks like this:
{
"clientId": "test-client",
"totalSize": 600,
"totalVolume": 3,
"serviceSummary": [
{
"serviceId": "s1",
"serviceTotalSize": 200,
"serviceTotalVolume": 1
},
{
"serviceId": "s2",
"serviceTotalSize": 400,
"serviceTotalVolume": 2
}
]
}
So the query needs to
aggregate all results for the clientId calculating totalSize and totalVolume
look at the content of the history array, and find the serviceId with a code of "SUCCESS"
provide a total size and volume for the serivceId where the event was successful
So far I have a query like this:
select
d.clientId,
count(*) totalVolume,
sum(d.event.size) totalSize ,
ARRAY_AGG(DISTINCT h.serviceId) serviceSummary
from demo d
unnest d.event.history h
where h.code = 'SUCCESS'
group by d.clientId;
which produces part of the result I want, but not the full serviceSummary
thanks for any help.

SQL standards doesn't allow nested aggregates you need intervening subquery with multi level aggregates.
SELECT d1.clientId,
SUM(d1.serviceTotalVolume) AS totalVolume,
SUM(d1.serviceTotalSize) AS totalSize,
ARRAY_AGG({d1.serviceId, d1.serviceTotalVolume, d1.serviceTotalSize}) AS serviceSummary
FROM ( SELECT
d.clientId,
h.serviceId,
COUNT(1) AS serviceTotalVolume,
SUM(d.event.size) AS serviceTotalSize
FROM demo AS d
UNNEST d.event.history AS h
WHERE h.code = 'SUCCESS'
GROUP BY d.clientId, h.serviceId) AS d1
GROUP BY d1.clientId;

Related

Elasticsearch - How to get the latest record in each group with filter?

I have a few records in elastic search I want to group the record by user_id and fetch the latest record which is event_type is 1
If the latest record event_type value is not 1 then we should not fetch that record. I did it in MySQL query. Please let me know how can I do that same in elastic search.
After executing the MySQL query
SELECT * FROM user_events
WHERE id IN( SELECT max(id) FROM `user_events` group by user_id ) AND event_type=1;
I need the same output in elasticsearch aggregations.
Elasticsearch Query:
GET test_analytic_report/_search
{
"from": 0,
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"event_date": {
"gte": "2022-10-01",
"lte": "2023-02-06"
}
}
}
]
}
},
"sort": {
"event_date": {
"order": "desc"
}
},
"aggs": {
"group": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"_source": ["user_id", "event_date", "event_type"],
"sort": {
"user_id": "desc"
}
}
}
}
}
}
}
I have the above query I have two users whose user_id is 55 and 56. So, in my aggregations, it should not come. But It fetched the other event_type data but I want only event_types=1 with the latest one. if the user's last record does not have event_type=1, it should not come.
In the above table, user_id 56 latest record event_type contains 2 so it should not come in our aggregations.
I tried but it's not returning the exact result that I want.
Note: event_date is the current date and time. As per the above image, I have inserted it manually that's why the date differs
GET user_events/_search
{
"size": 1,
"query": {
"term": {
"event_type": 1
}
},
"sort": [
{
"id": {
"order": "desc"
}
}
]
}
Explanation: This is an Elasticsearch API request in JSON format. It retrieves the latest event of type 1 (specified by "event_type": 1 in the query) from the "user_events" index, with a size of 1 (specified by "size": 1) and sorts the results in descending order by the "id" field (specified by "order": "desc" in the sort).
If your ES version supports, you can do it with field collapse feature. Here is an example query:
{
"_source": false,
"query": {
"bool": {
"filter": {
"term": {
"event_type": 1
}
}
}
},
"collapse": {
"field": "user_id",
"inner_hits": {
"name": "the_record",
"size": 1,
"sort": [
{
"id": "desc"
}
]
}
},
"sort": [
{
"id": {
"order": "desc"
}
}
]
}
In the response, you will see that the document you want is in inner_hits under the name you give. In my example it is the_record. You can change the size of the inner hits if you want more records in each group and sort them.
Tldr;
They are many ways to go about it:
Sorting
Collapsing
Latest Transform
All those solution are approximate of what you could get with sql.
But my personal favourite is transform
Solution - transform jobs
Set up
We create 2 users, with 2 events.
PUT 75324839/_bulk
{"create":{}}
{"user_id": 1, "type": 2, "date": "2015-01-01T00:00:00.000Z"}
{"create":{}}
{"user_id": 1, "type": 1, "date": "2016-01-01T00:00:00.000Z"}
{"create":{}}
{"user_id": 2, "type": 1, "date": "2015-01-01T00:00:00.000Z"}
{"create":{}}
{"user_id": 2, "type": 2, "date": "2016-01-01T00:00:00.000Z"}
Transform job
This transform job is going to run against the index 75324839.
It will find the latest document, with regard to the user_id, based of the value in date field.
And the results are going to be stored in latest_75324839.
PUT _transform/75324839
{
"source": {
"index": [
"75324839"
]
},
"latest": {
"unique_key": [
"user_id"
],
"sort": "date"
},
"dest": {
"index": "latest_75324839"
}
}
If you were to query latest_75324839
You would find:
{
"hits": [
{
"_index": "latest_75324839",
"_id": "AGvuZWuqqz7c5ytICzX5Z74AAAAAAAAA",
"_score": 1,
"_source": {
"date": "2017-01-01T00:00:00.000Z",
"user_id": 1,
"type": 1
}
},
{
"_index": "latest_75324839",
"_id": "AA3tqz9zEwuio1D73_EArycAAAAAAAAA",
"_score": 1,
"_source": {
"date": "2016-01-01T00:00:00.000Z",
"user_id": 2,
"type": 2
}
}
]
}
}
Get the final results
To get the amount of user with type=1.
A simple search query such as:
GET latest_75324839/_search
{
"query": {
"term": {
"type": {
"value": 1
}
}
},
"aggs": {
"number_of_user": {
"cardinality": {
"field": "user_id"
}
}
}
}
Side notes
This transform job has been running in batch, this means it will only run once.
It is possible to run it in a continuous fashion, to get all the time the latest event for a user_id.
Here are some examples.
Your are looking for an SQL HAVING clause, which would allow you to filter results after grouping. But sadly there is nothing equivalent on Elastic.
So it is not possible to
sort, collapse and filter afterwards (even post_filter does not
help here)
use a top_hits aggregation with custom sorting and then filter
use any map/reduce scripted aggregations, as they do not support
sorting.
work with subqueries.
So basically seen, Elastic is not a database. Any sorting or relation to other documents should be based on scoring. And the score should be calculated independently for each document, distributed on shards.
But there is a tiny loophole, which might be the solution for your use case. It is based on a top_metrics aggregation followed by bucket selector to eliminate the unwanted event types:
GET test_analytic_report/_search
{
"size": 0,
"aggs": {
"by_id": {
"terms": {
"field": "user_id",
"size": 100
},
"aggs": {
"tm": {
"top_metrics": {
"metrics": {
"field": "event_type"
},
"sort": [
{
"id": {
"order": "desc"
}
}
]
}
},
"event_type_filter": {
"bucket_selector": {
"buckets_path": {
"event_type": "tm.event_type"
},
"script": "params.event_type == 1"
}
}
}
}
}
}
If you require more fields from the source document you can add them to the top_metrics.
It is sorted by id now, but you can also use event_date.

N1QL search certain objects within the object

I have multiple documents in my couchbase bucket with the following structure:
{
"objectType": "someObjectType",
"id": "1",
"latest": {
"id": "1",
"version": "biyr4jqi4nny"
},
"history": {
"bi11b295bjng": {
"id": "1",
"version": "bi11b295bjng"
}
"bi1189wx1h6v": {
"id": "1",
"version": "bi1189wx1h6v"
}
}
}
As seen in the snippet above, history is an object of objects. What I'm trying to do is selecting objectType, id, latest and history, but history should include only the object specified in query instead of all the history (which may be vast).
My query looks like this:
SELECT
bucket.id,
bucket.objectType,
bucket.latest,
bucket.history.bi11b295bjng
FROM bucket WHERE objectType = 'someObjectType'
Which produces me the following response:
[
{
"objectType": "someObjectType",
"id": 1,
"latest": {
"id": "9mobile_NG_001-ROW",
"version": "biyr4jqi4nny"
},
"biyr4jqi4nny": {
"id": "1",
"version": "biyr4jqi4nny"
}
}
]
Queried object got unwrapped from the parent one. Desired output should look like this:
[
{
"objectType": "someObjectType",
"id": 1,
"latest": {
"id": "9mobile_NG_001-ROW",
"version": "biyr4jqi4nny"
},
"history": {
"biyr4jqi4nny": {
"id": "1",
"version": "biyr4jqi4nny"
}
}
}
]
How could I get history.{version} without losing the parent object?
Construct object and Alias as history
SELECT
b.id,
b.objectType,
b.latest,
{ b.history.bi11b295bjng } AS history
FROM bucket AS b
WHERE b.objectType = "someObjectType";
If need multiple objects of same field
SELECT
b.id,
b.objectType,
b.latest,
{ b.history.bi11b295bjng, b.history.bi1189wx1h6v } AS history
FROM bucket AS b
WHERE b.objectType = "someObjectType";
If you have many want to remove one of them
SELECT
b.id,
b.objectType,
b.latest,
OBJECT_REMOVE(b.history,"bi11b295bjng") AS history
FROM bucket AS b
WHERE b.objectType = "someObjectType";

Snowflake - Querying Nested JSON

I need some help querying this JSON file I've ingested into a temp table in Snowflake. So, I've created a JSON_DATA variant column and plan to query and do a COPY INTO another table, but my query isn't working yet... I feel I'm close (possibly?)
JSON layout:
{
"nextPage": "01",
"page": "0",
"status": "ok",
"transactions": [
{
"id": "65985",
"recordTp": "vendorbill",
"values": {
"account": [
{
"text": "14500 Deferred Expenses",
"value": "249"
}
],
"account.number": "1450",
"account.type": [
{
"text": "Deferred Expense",
"value": "DeferExpense"
}
],
"amount": "51733",
"classnohierarchy": [
{
"text": "901 Corporate",
"value": "139"
}
],
"currency": [
{
"text": "Canadian Dollar",
"value": "3"
}
],
"customer.altname": "V Sties expenses (Tor)",
"customer.custate": "12/31/2019",
"customer.custentient": "ada Inc.",
"customer.custendate": "1/1/2019",
"customer.entyid": "PR781",
"departmentnohierarchy": [
{
"text": "8rity",
"value": "37"
}
],
"fxamount": "689",
"location": [
{
"text": "Othad Projects",
"value": "48"
}
],
"postingperiod": [
{
"text": "Jan 2020",
"value": "1"
}
],
"subsidiary.custrecord_region": [
{
"text": "CANADA",
"value": "3"
}
],
"subsidiarynohierarchy": [
{
"text": "ada Inc.",
"value": "25"
}
]
}
},
I've been able to query the values that are not (deeply) nested but I need help getting, for example, the values from 'classnohierarchy', to get both the 'text' and 'value' I tried:
transactions.value:"values".classnohierarchy.text::string as class_txt,
transactions.value:"values".classnohierarchy.value::string as class_val,
but it's returning NULL values.
Below is my entire query:
SELECT
JSON_DATA:status::string as connection_status,
transactions.value:id::string as id,
transactions.value:recordType::string as record_type,
transactions.value:"values"::variant as trans_val,
transactions.value:"values".account as acc,
transactions.value:"values".account.text as text,
transactions.value:"values".account.value as val,
transactions.value:"values"."account.number"::string as acc_num,
transactions.value:"values"."account.type".text::string as acc_type_txt,
transactions.value:"values"."account.type".value::string as acc_type_val,
transactions.value:"values".amount::string as amount,
**transactions.value:"values".classnohierarchy.text::string as class_txt,
transactions.value:"values".classnohierarchy.value::string as class_val,**
transactions.value:"values".currency.text::string as currency_text,
transactions.value:"values".currency.value::string as currency_val,
transactions.value:"values"."customer.altname"::string as customer_project_name,
transactions.value:"values"."customer.custate"::string as customer_end_date,
transactions.value:"values"."customer.custentient"::string as customer_end_client,
transactions.value:"values"."customer.custendate"::string as customer_start_date,
transactions.value:"values"."customer.entyid"::string as customer_project_id,
transactions.value:"values".departmentnohierarchy.text::string as department_name,
transactions.value:"values".departmentnohierarchy.value::string as department_value,
transactions.value:"values".fxamount::string as fx_amount,
transactions.value:"values".location.text::string as product_name,
transactions.value:"values".postingperiod.text::string as postingperiod,
transactions.value:"values".postingperiod.value::string as postingperiod,
transactions.value:"values"."subsidiary.custrecord_region".text::string as region_name,
transactions.value:"values"."subsidiary.custrecord_region".value::string as region_value,
transactions.value:"values".subsidiarynohierarchy.text::string as entity_name,
transactions.value:"values".subsidiarynohierarchy.value::string as entity_value,
FROM MY_TABLE,
LATERAL FLATTEN (JSON_DATA:transactions) as transactions
and here's a picture of whats showing in Snowflake:
SNOWFLAKE_SCREENSHOT
departmentnohierarchy is an array. you need to mention the index as below.
select *,transactions.VALUE:"values".departmentnohierarchy[0].value::text as department_name
FROM jsont1,
LATERAL FLATTEN (JSON_DATA:transactions) as transactions

sub document under main document how to get a listing with pagination using couchbase(N1QL) query

any one can help me how to get the sub document List with pagination
i just give a sample example :
{
"accessories": [`
{
"data": {
"name": "TEST",
"updated_at": "2020-03-27T16:16:20.818Z"
},
"id": "56e83ea1-042e-47e0-85f8-186189c37426"
}
],
"calibration_reports": [`
{
"data": {
"deleted_at": "",
"frm27_equipment": [
"test_cat1"
],
"frm27_link": [
"yes"
],
"frm27_submit": null,
"updated_at": "2020-03-30T10:24:52.703Z"
},
"id": "e4c8b1b4-7f37-46db-a49d-bca74482b968"
},
{
"data": {
"deleted_at": "",
"frm27_equipment": [
"test_cat1"
],
"frm27_link": [
"no"
],
"frm27_submit": null,
"updated_at": "2020-03-30T10:34:37.615Z"
},
"id": "445854d6-66bf-4e33-b620-05a5053119a8"
}
],
}
]
}
Here i want to get a calibration_reports list with pagination is it possible ? using couchbase (N1ql Query)
please if any one know, what is the process how to get the list of result with pagination using couchbase(N1QL) query. please help me.
One possible way to go about this is to use UNNEST.
For instance:
SELECT calreports.id
FROM utpal u
UNNEST u.calibration_reports calreports
This would return something like:
[
{ "id": "aaa" },
{ "id": "bbb" },
{ "id": "ccc" },
... etc ...
]
And then you can use normal LIMIT/OFFSET for pagination, like so:
SELECT calreports.id
FROM utpal u
UNNEST u.calibration_reports calreports
LIMIT 50
OFFSET 150;

AWS Athena - Querying JSON - Searching for Values

I have nested JSON files on S3 and am trying to query them with Athena.
However, I am having problems to query the nested JSON values.
My JSON file looks like this:
{
"id": "17842007980192959",
"acount_id": "17841401243773780",
"stats": [
{
"name": "engagement",
"period": "lifetime",
"values": [
{
"value": 374
}
],
"title": "Engagement",
"description": "Total number of likes and comments on the media object",
"id": "17842007980192959/insights/engagement/lifetime"
},
{
"name": "impressions",
"period": "lifetime",
"values": [
{
"value": 11125
}
],
"title": "Impressions",
"description": "Total number of times the media object has been seen",
"id": "17842007980192959/insights/impressions/lifetime"
},
{
"name": "reach",
"period": "lifetime",
"values": [
{
"value": 8223
}
],
"title": "Reach",
"description": "Total number of unique accounts that have seen the media object",
"id": "17842007980192959/insights/reach/lifetime"
},
{
"name": "saved",
"period": "lifetime",
"values": [
{
"value": 0
}
],
"title": "Saved",
"description": "Total number of unique accounts that have saved the media object",
"id": "17842007980192959/insights/saved/lifetime"
}
],
"import_date": "2017-12-04"
}
What I'm trying to do is to query the "stats" field value where name=impressions.
So ideally something like:
SELECT id, account_id, stats.values.value WHERE stats.name='engagement'
AWS example: https://docs.aws.amazon.com/athena/latest/ug/searching-for-values.html
Any help would be appreciated.
You can query the JSON with the following table definition:
CREATE EXTERNAL TABLE test(
id string,
acount_id string,
stats array<
struct<
name:string,
period:string,
values:array<
struct<value:string>>,
title:string
>
>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION 's3://bucket/';
Now, the value column is available through the following unnesting:
select id, acount_id, stat.name,x.value
from test
cross join UNNEST(test.stats) as st(stat)
cross join UNNEST(stat."values") as valx(x)
WHERE stat.name='engagement';