Find the average value in MongoDB from JSON - json

In my MongoDB (export from JSON file) I have database "dab" with structure like this:
id:"1"
datetime:"2020-05-08 5:09:56"
name:"namea"
lat:55.826738
lon:45.0423412
analysis:"[{"0":0.36965591924860347},{"5":0.10391287134268598},{"10":0.086884394..."
I'm using that db for spark analysis via MongoDB-Spark Connector.
My problem is field "analysis" - I need average result for all values from every interval ("0", "5", "10", ..., "1000"), so I have to sum 0.36965591924860347 + 0.10391287134268598 + 0.086884394 + ... and divide by number of intervals (I have 200 intervals in every column), and finally multiply the result by 100.

My solution would be this one:
db.collection.aggregate([
{
$set: {
analysis: {
$map: {
input: "$analysis",
in: { $objectToArray: "$$this" }
}
}
}
},
{
$set: {
analysis: {
$map: {
input: "$analysis",
in: { $first: "$$this.v" }
}
}
}
},
{ $set: { average: { $multiply: [ { $avg: "$analysis" }, 100 ] } } }
])
Mongo playground

You can use $reduce on that array,sum the values,and then divide with the number of elements and then multiply with 100
db.collection.aggregate([
{
"$addFields": {
"average": {
"$multiply": [
{
"$divide": [
{
"$reduce": {
"input": "$analysis",
"initialValue": 0,
"in": {
"$let": {
"vars": {
"sum": "$$value",
"data": "$$this"
},
"in": {
"$add": [
"$$sum",
{
"$arrayElemAt": [
{
"$arrayElemAt": [
{
"$map": {
"input": {
"$objectToArray": "$$data"
},
"as": "m",
"in": [
"$$m.k",
"$$m.v"
]
}
},
0
]
},
1
]
}
]
}
}
}
}
},
{
"$size": "$analysis"
}
]
},
100
]
}
}
}
])
You can test the code here
But this code has 1 problem, you save data in documents, and MongoDB
doesn't have a function like get(document,$$k), the new MongoDB v5.0 has a $getField but still accepts only constants no variables.
I mean we cant do in your case getField(doc,"5").
So we have the cost of converting each document to an array.

Related

JMESPATH query expression for matching Value in an Array

I have the following JSON
{
“Record”: [
{
“FirstName": “John”,
“LastName”: “Smith”,
“City”: “Chicago”,
“Possessions”: [
{
“Item”: “TV”
},
{
“Item”: “XBOX-S”
},
{
“Item”: “DVR”
},
{
“Item”: “Setup Box”
}
]
},
{
“FirstName": “Jane”,
“LastName”: “Doe”,
“City”: “Seattle”,
“Possessions”: [
{
“Item”: “DVR”
},
{
“Item”: “PS5”
},
{
“Item”: “FireStick”
}
]
},
{
“FirstName": “Jane”,
“LastName”: “Lee”,
“City”: “Dallas”,
“Possessions”: [
{
“Item”: “TV”
},
{
“Item”: “PS5”
},
{
“Item”: “FireStick”
}
]
}
]
}
How do I get a table of First Name and Last Name by matching a particular possession ("TV") using JMESPATH query?
I need to match the Value in the Array Possessions for each entry in the Record
I have tried using the following query, but it does not work,
Record[?contains(Possessions[].Item, `TV`) == `true`]
anyone knows how this request will work or point me in the right direction, it would be greatly appreciated.

How to do custom window function on JSON object with pandas?

I have a rather nested JSON object below, and I am trying to calculate the user (ie 'profileId') with the most events (ie length of 'parameters' key.
I have the code below to get the length of the parameter, but I am trying to now have that calculation be correct for each record, as they way I have it set now would set it the same value for each record - I looked into pandas window functions https://pandas.pydata.org/docs/user_guide/window.html but am having trouble getting to the correct outcome.
response = response.json()
df = pd.json_normalize(response['items'])
df['calcfield'] = len(df["events"].iloc[0][0].get('parameters'))
the output of df['arrayfield'] is below:
[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"admin"
},
{
"name":"method_name",
"value":"directory.users.list"
},
{
"name":"client_id",
"value":"722230783769-dsta4bi9fkom72qcu0t34aj3qpcoqloq.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"7158"
},
{
"name":"product_bucket",
"value":"GSUITE_ADMIN"
},
{
"name":"app_name",
"value":"Untitled project"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
] }, {
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:58:48.914Z",
"uniqueQualifier":"-4002873813067783265",
"applicationName":"token",
"customerId":"C02f6wppb"
},
"etag":"\"5T53xK7dpLei95RNoKZd9uz5Xb8LJpBJb72fi2HaNYM/9DTdB8t7uixvUbjo4LUEg53_gf0\"",
"actor":{
"email":"nancy.admin#hyenacapital.net",
"profileId":"100230688039070881323"
},
"ipAddress":"54.80.168.30",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"gmail"
},
{
"name":"method_name",
"value":"gmail.users.messages.list"
},
{
"name":"client_id",
"value":"927538837578.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"2"
},
{
"name":"product_bucket",
"value":"GMAIL"
},
{
"name":"app_name",
"value":"Zapier"
},
{
"name":"client_type",
"value":"WEB"
}
]
ORIGINAL JSON BLOB I READ IN
{
"kind":"admin#reports#activities",
"etag":"\"5g8\"",
"nextPageToken":"A:1651795128914034:-4002873813067783265:151219070090:C02f6wppb",
"items":[
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:59:39.421Z",
"uniqueQualifier":"5526793068617678141",
"applicationName":"token",
"customerId":"cds"
},
"etag":"\"jkYcURYoi8\"",
"actor":{
"email":"blah#blah.net",
"profileId":"1323"
},
"ipAddress":"107.178.193.87",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"admin"
},
{
"name":"method_name",
"value":"directory.users.list"
},
{
"name":"client_id",
"value":"722230783769-dsta4bi9fkom72qcu0t34aj3qpcoqloq.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"7158"
},
{
"name":"product_bucket",
"value":"GSUITE_ADMIN"
},
{
"name":"app_name",
"value":"Untitled project"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
},
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:58:48.914Z",
"uniqueQualifier":"-4002873813067783265",
"applicationName":"token",
"customerId":"df"
},
"etag":"\"5T53xK7dpLei95RNoKZd9uz5Xb8LJpBJb72fi2HaNYM/9DTdB8t7uixvUbjo4LUEg53_gf0\"",
"actor":{
"email":"blah.blah#bebe.net",
"profileId":"1324"
},
"ipAddress":"54.80.168.30",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"gmail"
},
{
"name":"method_name",
"value":"gmail.users.messages.list"
},
{
"name":"client_id",
"value":"927538837578.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"2"
},
{
"name":"product_bucket",
"value":"GMAIL"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
}
]
}
Use:
df.groupby('actor.profileId')['events'].apply(lambda x: [len(x.iloc[i][0]['parameters']) for i in range(len(x))])
which returns the list of each profileid count of parameters. Output and the sample data:
actor.profileId
1323 [7]
1324 [7]
Name: events, dtype: object
It's not entirely clear what you asking and df['arrayfield'] isn't in your example provided. However, if you look at the events column after json_normalize, you can use the following line to pull out the length of each parameters key. The blob you gave as an example was set to response...
df = pd.json_normalize(response['items'])
df['calcfield'] = df['events'].str[0].str.get('parameters').str.len()
Becauase each parameters key has 7 elements, it's tough to say this is what you really want.

Gatsby graphql filter statement is not working as expected

I have a query (based on JSON input) defined in Gatsbys GraphiQL IDE which looks this trying to return only those items which have a value of less than 10 in playtime_forever:
allContentJson(
filter: {response: {games: {elemMatch: {playtime_forever: {lt: 10}}}}}
) {
edges {
node {
response {
games {
playtime_forever
}
}
}
}
}
}
However running this query I also receive those which are greater than 10:
{
"data": {
"allContentJson": {
"edges": [
{
"node": {
"response": {
"games": [
{
"playtime_forever": 0
},
{
"playtime_forever": 93
},
{
"playtime_forever": 5302
}
]
},
}
}
]
}
},
"extensions": {}
}
I have this feeling this is how elemMatch is supposed to work or is there any other approach (possibly by defining types?)

Removing a specific attribute in an array of nested documents

Excuse my English, I'm from Russia.
I asked this question in the Russian version SO, but they still haven't answered it.
There is a record collection that stores archival files. Here is its simplified structure (I omitted most of the attributes):
{
"_id": 1,
"tomes": [
{
"number":1,
"archive_number":1
},
{
"number":2,
"archive_number":1
}
]
}
{
"_id": 2,
"tomes": [
{
"number":1,
"archive_number":1
},
{
"number":2,
"archive_number":1
},
{
"number":3,
"archive_number":1
}
]
}
I need to remove the archive_number attribute from each of the nested documents of the tomes array for all documents in the record collection.
After deletion, the structure should look like this:
{
"_id": 1,
"tomes": [
{
"number":1,
},
{
"number":2,
}
]
}
{
"_id": 2,
"tomes": [
{
"number":1,
},
{
"number":2,
},
{
"number":3,
}
]
}
I was able to write a query like this:
db.record.update(
{
"tomes": {
$elemMatch:{
"archive_number":{$exists:true}
}
}
},
{
$unset: {
"tomes.$.archive_number":1
}
},
false, true
)
But this query only removes the archive_number attribute on one volume per archive case. I.e., after launch, we will see the following picture:
{
"_id": 1,
"tomes": [
{
"number":1,
},
{
"number":2,
"archive_number":1
}
]
}
{
"_id": 2,
"tomes": [
{
"number":1,
},
{
"number":2,
"archive_number":1
},
{
"number":3,
"archive_number":1
}
]
}
Can you please tell me how to delete all volumes? I don’t know how to correct the request, but my head doesn’t understand anymore.
Solution 1
With $[<indentifier>] (filtered positional operator) and arrayFilters to update the document(s) in the array.
db.collection.update({
"tomes": {
$elemMatch: {
"archive_number": {
$exists: true
}
}
}
},
{
$unset: {
"tomes.$[tome].archive_number": 1
}
},
{
arrayFilters: [
{
"tome.archive_number": {
$exists: true
}
}
],
multi: true
})
Sample Mongo Playground (Solution 1)
Solution 2
With $[] (all positional operator).
The all positional operator $[] indicates that the update operator should modify all elements in the specified array field.
db.collection.update({
"tomes": {
$elemMatch: {
"archive_number": {
$exists: true
}
}
}
},
{
$unset: {
"tomes.$[].archive_number": 1
}
},
{
multi: true
})
Sample Mongo Playground (Solution 2)
References
How the arrayFilters Parameter Works in MongoDB

Presto Json Parsing

I have a json field(attached sample) and i need to extract the values in ProvisioningSystem path but it works only if i hardcode the array location.How can i extract the value without hardcoding ?Thanks in Advance!
Code:
TRANSFORM(CAST(JSON_EXTRACT(order_json, '$.Order.Accounts.Account') AS ARRAY), x -> JSON_EXTRACT_SCALAR(x,'$.ProvisioningSystems.ProvisioningSystem[1].SystemName'))
Json:
{
"Order":
{
"Accounts": {
"Account": [
{
"ProvisioningSystems": {},
},
{
"ProvisioningSystems": {
"ProvisioningSystem": [
{
"SystemOrderRef": "12345",
"SystemName": "Testsystem",
"SystemOrderRefType": "Provision"
}
]
},
}
]
},
}
}
}