Cloudant Selector Query for fetching particular elements inside Array - json

I have the below requirement.
From the array elements below, I have to select and compare the value of loanamount. The previous posts mention the index and selector shown below.
{
"_id": "65c5e4c917781f7365f4d814f6e1665f",
"_rev": "2-73615006996721fef9507c2d1dacd184",
"userprofile": {
"name": "tom",
"age": 30,
"employer": "Microsoft"
},
"loansBorrowed": [{"loanamount": 5000,
"loandate": "01/01/2001",
"repaymentdate": "01/01/2001",
"rateofinterest": 5.6,
"activeStatus": true,
"penalty": {
"penalty-amount": 500,
"reasonforPenalty": "Exceeded the date by 10 days"
}
},
{
"loanamount": 3000,
"loandate": "01/01/2001",
"repaymentdate": "01/01/2001",
"rateofinterest": 5.6,
"activeStatus": true,
"penalty": {
"penalty-amount": 400,
"reasonforPenalty": "Exceeded the date by 10 days"
}
},
{
"loanamount": 2000,
"loandate": "01/01/2001",
"repaymentdate": "01/01/2001",
"rateofinterest": 5.6,
"activeStatus": true,
"penalty": {
"penalty-amount": 500,
"reasonforPenalty": "Exceeded the date by 10 days"
}
}
]
}
Index:
{
"index": {
"fields": [{
"name": "loansBorrowed.[].loanamount",
"type": "number"
}],
"type": "json"
}
}
Selector query:
{"selector": {
"loansBorrowed": {
"$elemMatch": {
"loanamount": 3000
}
}
}
}
But that index and selector query return the whole document for each match instead of only the record with loanamount 3000.
Please suggest how to fetch only a particular element inside an array block.

I don't think it's possible to only return specific items in an array. You could accomplish something similar using views. Here is an example design document:
{
"_id": "_design/loans",
"_rev": "1-a115abe01632dd43ee1d0d10546b737d",
"views": {
"by_amount": {
"map": "function (doc) {\n if (doc.loansBorrowed) {\n for (var i=0; i<doc.loansBorrowed.length; i++) {\n emit(doc.loansBorrowed[i].loanamount, {userprofile: doc.userprofile, loan:doc.loansBorrowed[i]});\n }\n }\n}"
}
},
"language": "javascript"
}
This creates a view called by_amount. Here is the map function:
function (doc) {
  if (doc.loansBorrowed) {
    for (var i = 0; i < doc.loansBorrowed.length; i++) {
      emit(doc.loansBorrowed[i].loanamount, {userprofile: doc.userprofile, loan: doc.loansBorrowed[i]});
    }
  }
}
Here I am using the loan amount as the key. This lets you query by the loan amount. The value can be whatever you want to return. In this case I am returning a document with the user's profile and the loan.
You can then query this view like so:
https://xxx.cloudant.com/YOUR_DB/_design/loans/_view/by_amount?key=3000
Which results in something like the following (note: I added a second loan with a value of 3000 to show how it would look with multiple matching loans):
{
"total_rows":6,
"offset":2,
"rows":[
{
"id":"796a8954600cee9dbb9e0a4040593942",
"key":3000,
"value":{
"userprofile":{
"name":"tom",
"age":30,
"employer":"Microsoft"
},
"loan":{
"loanamount":3000,
"loandate":"01/01/2001",
"repaymentdate":"01/01/2001",
"rateofinterest":5.6,
"activeStatus":true,
"penalty":{
"penalty-amount":400,
"reasonforPenalty":"Exceeded the date by 10 days"
}
}
}
},
{
"id":"c93f52da36a51f0ddd75f5be381c916e",
"key":3000,
"value":{
"userprofile":{
"name":"joe",
"age":50,
"employer":"Google"
},
"loan":{
"loanamount":3000,
"loandate":"01/01/2001",
"repaymentdate":"01/01/2001",
"rateofinterest":5.6,
"activeStatus":true,
"penalty":{
"penalty-amount":400,
"reasonforPenalty":"Exceeded the date by 10 days"
}
}
}
}
]
}
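Since the loan amount is the view key, the same view also supports range queries and returning the full source documents. The following URL is only a sketch using the standard startkey, endkey and include_docs view parameters (host and database names are placeholders, as above):
https://xxx.cloudant.com/YOUR_DB/_design/loans/_view/by_amount?startkey=2000&endkey=4000&include_docs=true
With include_docs=true each row gains a doc property containing the whole source document, so you could also keep the emitted value small and rely on doc instead.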

Related

Elasticsearch - How to get the latest record in each group with filter?

I have a few records in Elasticsearch. I want to group the records by user_id and fetch the latest record, but only if its event_type is 1.
If the latest record's event_type is not 1, then we should not fetch that record. I did it with a MySQL query; please let me know how I can do the same in Elasticsearch.
After executing the MySQL query
SELECT * FROM user_events
WHERE id IN( SELECT max(id) FROM `user_events` group by user_id ) AND event_type=1;
I need the same output with Elasticsearch aggregations.
Elasticsearch Query:
GET test_analytic_report/_search
{
"from": 0,
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"event_date": {
"gte": "2022-10-01",
"lte": "2023-02-06"
}
}
}
]
}
},
"sort": {
"event_date": {
"order": "desc"
}
},
"aggs": {
"group": {
"terms": {
"field": "user_id"
},
"aggs": {
"group_docs": {
"top_hits": {
"size": 1,
"_source": ["user_id", "event_date", "event_type"],
"sort": {
"user_id": "desc"
}
}
}
}
}
}
}
I have the above query, and two users with user_id 55 and 56. The query fetches data for other event_types as well, but I want only event_type=1, and only the latest record per user; if a user's last record does not have event_type=1, that user should not appear in my aggregations.
For example, user_id 56's latest record has event_type 2, so it should not appear in the aggregation.
I tried, but it's not returning the exact result that I want.
Note: event_date is normally the current date and time; in my test data I inserted it manually, which is why the dates differ.
GET user_events/_search
{
"size": 1,
"query": {
"term": {
"event_type": 1
}
},
"sort": [
{
"id": {
"order": "desc"
}
}
]
}
Explanation: This is an Elasticsearch API request in JSON format. It retrieves the latest event of type 1 (specified by "event_type": 1 in the query) from the "user_events" index, with a size of 1 (specified by "size": 1) and sorts the results in descending order by the "id" field (specified by "order": "desc" in the sort).
If your ES version supports it, you can do this with the field collapse feature. Here is an example query:
{
"_source": false,
"query": {
"bool": {
"filter": {
"term": {
"event_type": 1
}
}
}
},
"collapse": {
"field": "user_id",
"inner_hits": {
"name": "the_record",
"size": 1,
"sort": [
{
"id": "desc"
}
]
}
},
"sort": [
{
"id": {
"order": "desc"
}
}
]
}
In the response, you will see that the document you want is in inner_hits under the name you give. In my example it is the_record. You can change the size of the inner hits if you want more records in each group and sort them.
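For reference, here is a rough sketch of the shape of the collapse response (the values are illustrative, not real output from this index); each top-level hit carries the collapse field under fields, and the per-user documents sit in inner_hits under the_record:
{
  "hits": {
    "hits": [
      {
        "_index": "test_analytic_report",
        "fields": { "user_id": [55] },
        "inner_hits": {
          "the_record": {
            "hits": {
              "hits": [
                { "_source": { "user_id": 55, "event_type": 1, "event_date": "2023-01-15" } }
              ]
            }
          }
        }
      }
    ]
  }
}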
Tldr;
There are many ways to go about it:
Sorting
Collapsing
Latest Transform
All those solutions are approximations of what you could get with SQL.
But my personal favourite is the transform.
Solution - transform jobs
Set up
We create 2 users, with 2 events each.
PUT 75324839/_bulk
{"create":{}}
{"user_id": 1, "type": 2, "date": "2015-01-01T00:00:00.000Z"}
{"create":{}}
{"user_id": 1, "type": 1, "date": "2016-01-01T00:00:00.000Z"}
{"create":{}}
{"user_id": 2, "type": 1, "date": "2015-01-01T00:00:00.000Z"}
{"create":{}}
{"user_id": 2, "type": 2, "date": "2016-01-01T00:00:00.000Z"}
Transform job
This transform job is going to run against the index 75324839.
It will find the latest document for each user_id, based on the value in the date field.
And the results are going to be stored in latest_75324839.
PUT _transform/75324839
{
"source": {
"index": [
"75324839"
]
},
"latest": {
"unique_key": [
"user_id"
],
"sort": "date"
},
"dest": {
"index": "latest_75324839"
}
}
If you were to query latest_75324839, you would find:
{
"hits": [
{
"_index": "latest_75324839",
"_id": "AGvuZWuqqz7c5ytICzX5Z74AAAAAAAAA",
"_score": 1,
"_source": {
"date": "2017-01-01T00:00:00.000Z",
"user_id": 1,
"type": 1
}
},
{
"_index": "latest_75324839",
"_id": "AA3tqz9zEwuio1D73_EArycAAAAAAAAA",
"_score": 1,
"_source": {
"date": "2016-01-01T00:00:00.000Z",
"user_id": 2,
"type": 2
}
}
]
}
Get the final results
To get the number of users with type=1, use a simple search query such as:
GET latest_75324839/_search
{
"query": {
"term": {
"type": {
"value": 1
}
}
},
"aggs": {
"number_of_user": {
"cardinality": {
"field": "user_id"
}
}
}
}
Side notes
This transform job ran in batch mode, which means it will only run once.
It is possible to run it in a continuous fashion, so that the destination index always holds the latest event for each user_id; a sketch is shown below.
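The frequency and sync settings in this continuous version are illustrative assumptions, not part of the original job:
PUT _transform/75324839_continuous
{
  "source": {
    "index": [
      "75324839"
    ]
  },
  "latest": {
    "unique_key": [
      "user_id"
    ],
    "sort": "date"
  },
  "dest": {
    "index": "latest_75324839"
  },
  "frequency": "1m",
  "sync": {
    "time": {
      "field": "date",
      "delay": "60s"
    }
  }
}
A continuous transform needs the sync.time.field to reflect ingest time closely enough that new events are picked up after the configured delay.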
You are looking for an SQL HAVING clause, which would allow you to filter results after grouping. But sadly there is nothing equivalent in Elasticsearch.
So it is not possible to:
sort, collapse and filter afterwards (even post_filter does not help here)
use a top_hits aggregation with custom sorting and then filter
use any map/reduce scripted aggregations, as they do not support sorting
work with subqueries
So basically, Elasticsearch is not a relational database. Any sorting or relation to other documents should be based on scoring, and the score should be calculated independently for each document, distributed across shards.
But there is a tiny loophole which might be the solution for your use case. It is based on a top_metrics aggregation followed by a bucket_selector to eliminate the unwanted event types:
GET test_analytic_report/_search
{
"size": 0,
"aggs": {
"by_id": {
"terms": {
"field": "user_id",
"size": 100
},
"aggs": {
"tm": {
"top_metrics": {
"metrics": {
"field": "event_type"
},
"sort": [
{
"id": {
"order": "desc"
}
}
]
}
},
"event_type_filter": {
"bucket_selector": {
"buckets_path": {
"event_type": "tm.event_type"
},
"script": "params.event_type == 1"
}
}
}
}
}
}
If you require more fields from the source document, you can add them to the top_metrics. It is sorted by id now, but you could also sort by event_date, as in the sketch below.
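For example, the tm aggregation above could be extended roughly like this (adding user_id as an extra metric and sorting by event_date are assumptions for illustration):
"tm": {
  "top_metrics": {
    "metrics": [
      { "field": "event_type" },
      { "field": "user_id" }
    ],
    "sort": [
      { "event_date": { "order": "desc" } }
    ]
  }
}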

Find the average value in MongoDB from JSON

In my MongoDB (export from JSON file) I have database "dab" with structure like this:
id:"1"
datetime:"2020-05-08 5:09:56"
name:"namea"
lat:55.826738
lon:45.0423412
analysis:"[{"0":0.36965591924860347},{"5":0.10391287134268598},{"10":0.086884394..."
I'm using that db for spark analysis via MongoDB-Spark Connector.
My problem is the "analysis" field - I need the average of all the values from every interval ("0", "5", "10", ..., "1000"), so I have to sum 0.36965591924860347 + 0.10391287134268598 + 0.086884394 + ..., divide by the number of intervals (I have 200 intervals in every column), and finally multiply the result by 100.
My solution would be this one:
db.collection.aggregate([
{
$set: {
analysis: {
$map: {
input: "$analysis",
in: { $objectToArray: "$$this" }
}
}
}
},
{
$set: {
analysis: {
$map: {
input: "$analysis",
in: { $first: "$$this.v" }
}
}
}
},
{ $set: { average: { $multiply: [ { $avg: "$analysis" }, 100 ] } } }
])
Mongo playground
You can use $reduce on that array, sum the values, then divide by the number of elements and multiply by 100:
db.collection.aggregate([
{
"$addFields": {
"average": {
"$multiply": [
{
"$divide": [
{
"$reduce": {
"input": "$analysis",
"initialValue": 0,
"in": {
"$let": {
"vars": {
"sum": "$$value",
"data": "$$this"
},
"in": {
"$add": [
"$$sum",
{
"$arrayElemAt": [
{
"$arrayElemAt": [
{
"$map": {
"input": {
"$objectToArray": "$$data"
},
"as": "m",
"in": [
"$$m.k",
"$$m.v"
]
}
},
0
]
},
1
]
}
]
}
}
}
}
},
{
"$size": "$analysis"
}
]
},
100
]
}
}
}
])
You can test the code here
But this code has one problem: you store the data as documents, and MongoDB doesn't have a function like get(document, $$k). The new MongoDB v5.0 has $getField, but it still accepts only constant field names, not variables.
I mean, in your case we can't do something like getField(doc, $$k) with a variable key.
So we pay the cost of converting each document to an array.
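For illustration only (MongoDB 5.0+): $getField works when the field name is a constant, which is exactly the limitation here, because the interval keys ("0", "5", "10", ...) vary per element:
db.collection.aggregate([
  {
    "$set": {
      "secondInterval": {
        "$getField": {
          "field": "5",   // must be a constant string; a variable like "$$k" is not accepted in v5.0
          "input": { "$arrayElemAt": ["$analysis", 1] }
        }
      }
    }
  }
])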

Query Nested Mongodb info with Variable Nested Field names

I have a MongoDB that is structured as below:
[
{
"subject_id": "1",
"name": "Maria",
"dob": "1/1/00",
"gender": "F",
"visits": {
"1/1/18": {
"date_entered": "1/2/18",
"entered_by": "Sally"
},
"1/2/18": {
"date_entered": "1/2/18",
"entered_by": "Tim",
}
},
"samples": {
"XXX123": {
"collected_by": "Sally",
"collection_date": "1/3/18"
}
}
},
{
"subject_id": "2",
"name": "Bob",
"dob": "1/2/00",
"gender": "M",
"visits": {
"1/3/18": {
"date_entered": "1/4/18",
"entered_by": "Tim"
}
},
"samples": {
"YYY456": {
"collected_by": "Sally",
"collection_date": "1/5/18"
},
"ZZZ789": {
"collected_by": "Tim",
"collection_date": "1/6/18"
},
"AAA123": {
"collected_by": "Sally",
"collection_date": "1/7/18"
}
}
}
]
If I wanted to query the database to find all samples collected by Sally or all visits entered by Tim, what would be the best way of doing that?
I'm new to MongoDB and my attempts with various regexes haven't produced results. Any advice would be greatly appreciated.
I first used $project on the required fields with $objectToArray, followed by $unwind to create separate records from the arrays created in the $project stage.
The results are then filtered using $match.
This works for the data provided in the question:
db.so.aggregate([
{$project: {visits: {$objectToArray: "$visits"}, samples: {$objectToArray: "$samples"}}},
{$unwind: "$visits"},
{$unwind: "$samples"},
{ $match: {
$or : [
{ "visits.v.entered_by" : "Tim" },
{ "samples.v.collected_by" : "Sally" }
]
}
}
])
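If you prefer to keep only the matching entries instead of one output document per visit/sample combination, a variation on the same $objectToArray idea is to use $filter; this is only a sketch (not from the answer above), using the field names from the question:
db.so.aggregate([
  {
    $project: {
      subject_id: 1,
      samplesBySally: {
        $filter: {
          input: { $objectToArray: "$samples" },
          as: "s",
          cond: { $eq: ["$$s.v.collected_by", "Sally"] }
        }
      },
      visitsByTim: {
        $filter: {
          input: { $objectToArray: "$visits" },
          as: "v",
          cond: { $eq: ["$$v.v.entered_by", "Tim"] }
        }
      }
    }
  }
])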

sql subqueries to mongodb

I am new to MongoDB and I am trying to turn SQL queries into MongoDB queries, but I can't seem to find any way to convert a SQL query with a subquery to MongoDB.
For example:
SELECT article, dealer, price
FROM shop
WHERE price=(SELECT MAX(price) FROM shop);
I tried the following, but it doesn't seem to work.
db.shop.group({
"initial": {},
"reduce": function(obj, prev) {
prev.maximumvalueprice = isNaN(prev.maximumvalueprice) ? obj.price :
Math.max(prev.maximumvalueprice, obj.price);
}}).forEach(
function(data){
db.shop.find({
"price": data
},
{
"article": 1,
"dealer": 1,
"price": 1
})
})
How do I convert this SQL query into a MongoDB query?
If you are using MongoDB v. 3.2 or newer you can try to use $lookup.
Try to use aggregation:
$sort your collection by price in descending order;
set $limit to 1 (it will take the first document, which will have the biggest price);
then use $lookup to select the documents from the same collection with that max price and put them in a tmpCollection field;
$unwind tmpCollection;
$replaceRoot - change the document root to $tmpCollection.
Example:
db.getCollection("shop").aggregate([
{$sort: {"price":-1}},
{$limit: 1},
{$lookup: {
from: "shop",
localField: "price",
foreignField: "price",
as: "tmpCollection"
}},
{$unwind: "$tmpCollection"},
{$replaceRoot: {newRoot:"$tmpCollection"}}
]);
Looks like you need the aggregation framework for this task using $first within a $group pipeline stage on ordered documents. The initial pipeline step for ordering the documents in the collection is $sort:
db.shop.aggregate([
{ "$sort": { "price": -1 } }, // <-- sort the documents first in descending order
{
"$group": {
"_id": null,
"article": { "$first": "$article" },
"dealer": { "$first": "$dealer" },
"price": { "$first": "$price" }
}
}
])
or using $last
db.shop.aggregate([
{ "$sort": { "price": 1 } }, // <-- note the sort direction
{
"$group": {
"_id": null,
"article": { "$last": "$article" },
"dealer": { "$last": "$dealer" },
"price": { "$last": "$price" }
}
}
])

Elasticsearch bucket aggregation using concatenated parameter

I'm using the Elasticsearch API, and the schema of the document is as follows:
{
name: "",
born_year: "",
born_month: "",
born_day: "",
book_type: "",
price: <some number>,
country: ""
}
Now what I need is to get the document count for each name where the person was born before 1995 (born_year + born_month + born_day < "20051220"). How can I achieve this?
I tried this:
{
"query": {
"query_string": {
"query": "country:\"SL\""
}
},
"size": 0,
"aggs": {
"total": {
"terms": {
"field": "name"
}
}
}
}
But I have no idea how I can add a filter for the birthday.
As mentioned by @val, you need to add a real date field, which you can easily build by concatenating these three fields at creation time (one way to do this is sketched below).
As for filtering on a date range, there are two ways, and they return different result sets.
The level of filtering is your choice.
You mentioned querying on the country field, but you have not mentioned at what level you want to filter on the date range, so I will give you queries for both cases.
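One way to build that field at index time is an ingest pipeline with a script processor; the pipeline name and the target field name (date) below are assumptions for illustration:
PUT _ingest/pipeline/add_birth_date
{
  "description": "Concatenate born_year, born_month and born_day into a single date field",
  "processors": [
    {
      "script": {
        "source": "ctx.date = ctx.born_year + '-' + ctx.born_month + '-' + ctx.born_day"
      }
    }
  ]
}
Documents indexed with ?pipeline=add_birth_date then carry a value such as "1991-07-01" in date (assuming zero-padded months and days), which the range filters below can use.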
Mappings - assuming you create a date field:
{
name:"",
born_year:"",
born_month:"",
born_day:"",
book_type:"",
price:<some number>,
country:"",
date : ""
}
Case 1) Filtering the date range for the name aggregation only; here the document count will not be affected by the date range filter.
{
"query": {
"query_string": {
"query": "country:\"SL\""
}
},
"aggs": {
"total": {
"filter": {
"range": {
"date": {
"gte": "your_date_mx",
"lte": "your_date_min"
}
}
},
"aggs": {
"NAME": {
"terms": {
"field": "name",
"size": 10
}
}
}
}
}
}
Case 2) Here both the document count and the aggregation will be filtered by the date range, as we add the date range filter at the query level.
{
"query": {
"bool": {
"must": [
{
"query_string": {
"query": "country:\"SL\""
}
},
{
"range": {
"date": {
"gte": "your_date_min",
"lte": "your_date_max"
}
}
}
]
}
},
"aggs": {
"total": {
"terms": {
"field": "name",
"size": 10
}
}
}
}
So adding a filter to the aggregation affects only the aggregation counts.
Edit -
Approach 1) With a Groovy script, concatenate the strings, parse the result to an integer, and then compare it with your input date.
{
"query": {
"bool": {
"must": [
{}
],
"filter": {
"script": {
"script": {
"inline": "(doc['year'].value + doc['month'].value + doc['date'].value).toInteger() > 19910701",
"params": {
"param1": 19911122
}
}
}
}
}
}
}
Make sure that, when indexing, a single-digit day (or month) such as 6 is indexed as 06.
Approach 2) Parse the strings into an exact date (preferred).
{
"query": {
"bool": {
"must": [
{}
],
"filter": {
"script": {
"script": {
"inline": "Date.parse('dd-MM-yyyy',doc['date'].value +'-'+ doc['month'].value +'-'+ doc['year'].value).format('dd-MM-yyyy') > param1",
"params": {
"param1": "04-05-1991"
}
}
}
}
}
}
}
The second approach is much better, as you don't have to worry about maintaining the string for each field (day, month, year) and later parsing it to a proper int for comparison.