N1QL Query to join array fields with an array in another document - couchbase

I have 3 documents types :
Data
{
"formId": "7508e7b2-bcf7-437b-a206-9fee87256d01",
"dataValues": [
{
"questionId": "Someguid123",
"questionValue": "Question1"
},
{
"questionId": "Someguid",
"questionValue": "Question2"
},
{
"questionId": "AnotherGuid",
"questionValue": "Question3"
}
],
"lastUpdateDateTime": "2023-01-04T10:56:49Z",
"type": "Data",
"templateId": "41e4cc2c-e9fb-4bdc-9dc2-af19e5988984",
"creationDateTime": "2022-12-28T11:20:46Z"
}
AttachedDocuments
{
"id": "AttachedDocuments::77961b70-2071-4410-837a-436c908a4fa5",
"lastUpdateDateTime": "2023-01-05T11:47:17Z",
"documents": [
{
"isUploaded": false,
"id": "DocumentMetadata::001",
"isDeleted": false,
"type": "photo",
"parentId": "Someguid123"
},
{
"isUploaded": false,
"id": "DocumentMetadata::002",
"isDeleted": false,
"type": "photo",
"parentId": "Someguid123"
}
],
"type": "AttachedDocuments",
"parentDocId": "MyFormData::7508e7b2-bcf7-437b-a206-9fee87256d01",
"creationDateTime": "2022-12-28T11:20:46Z"
}
DocumentMetaData
{
"id": "DocumentMetadata::001",
"type": "DocumentMetadata",
"name": "MyForm_001.png",
"documentId": "549c4da2-ad3a-4f92-bfa2-019750a11007",
"contentType": "FILE",
"parentDocumentId": "AttachedDocuments::77961b70-2071-4410-837a-436c908a4fa5",
"creationDateTime": "2023-01-04T10:56:49Z"
},
{
"id": "DocumentMetadata::002",
"type": "DocumentMetadata",
"name": "MyForm_002.png",
"documentId": "549c4da2-ad3a-4f92-bfa2-019750a11007",
"contentType": "FILE",
"parentDocumentId": "AttachedDocuments::77961b70-2071-4410-837a-436c908a4fa5",
"creationDateTime": "2023-01-04T10:56:49Z"
}
Every Data document has exactly one AttachedDocuments document, whose parentDocId field is set to "MyFormData::" followed by the formId field of the Data document.
If an item in Data.dataValues has documents attached to it, the AttachedDocuments.documents array contains items whose parentId field is set to Data.dataValues[i].questionId.
Also, every AttachedDocuments.documents[i] item has a DocumentMetadata document whose id equals the AttachedDocuments.documents[i].id field.
I want a query that returns every Data.dataValues item as an array element, each with an additional links field that contains the matching DocumentMetadata.name values, like below:
[
{
"questionId": "Someguid123",
"questionValue": "Question1",
"links": ["MyForm_001.png", "MyForm_002.png"]
},
{
"questionId": "Someguid",
"questionValue": "Question2"
},
{
"questionId": "AnotherGuid",
"questionValue": "Question3"
}
]
I tried the UNNEST clause but couldn't output the dataValues items that have no documents. How should I write the query so that those are included as well?
Thank you

Assuming you have a 1:1 relationship between Data & AttachedDocuments, you can try:
/* One-time setup: a scope holding one collection per document type. */
CREATE SCOPE default.f;
CREATE COLLECTION default.f.Data;
CREATE COLLECTION default.f.AttachedDocuments;
CREATE COLLECTION default.f.DocumentMetaData;

/* Secondary index on the metadata "id" field, which the links subquery filters on. */
CREATE INDEX ix1 ON default.f.DocumentMetaData(id);

/*
 * One result row per Data.dataValues item.
 * Each Data document is paired with the AttachedDocuments document whose
 * parentDocId is "MyFormData::" followed by the form id.
 * links holds the names of the DocumentMetaData documents that belong to that
 * AttachedDocuments document and whose id is listed in its documents array
 * with parentId equal to the current questionId.
 */
SELECT dv.questionId,
       dv.questionValue,
       links
FROM default.f.Data AS d
JOIN default.f.AttachedDocuments AS ad
  ON "MyFormData::" || d.formId = ad.parentDocId
UNNEST d.dataValues AS dv
LET links = (
    SELECT RAW md.name
    FROM default.f.DocumentMetaData AS md
    WHERE md.parentDocumentId = ad.id
      AND md.id IN ARRAY a.id FOR a IN ad.documents WHEN a.parentId = dv.questionId END
);
If you have a 1:n relationship between Data & AttachedDocuments but the attachments for a single question are wholly in a single attached document:
/* Secondary indexes on the AttachedDocuments fields used for the join and the lookup. */
CREATE INDEX ix2 ON default.f.AttachedDocuments(parentDocId);
CREATE INDEX ix3 ON default.f.AttachedDocuments(id);

/*
 * One result row per (question, AttachedDocuments document) pair in which that
 * document references the question.
 * links is gathered across every AttachedDocuments document of the form, not
 * only the joined one.
 * The outer WHERE keeps a row only when the joined AttachedDocuments document
 * has at least one entry whose parentId equals the questionId.
 */
SELECT dv.questionId,
       dv.questionValue,
       links
FROM default.f.Data AS frm
JOIN default.f.AttachedDocuments AS att
  ON "MyFormData::" || frm.formId = att.parentDocId
UNNEST frm.dataValues AS dv
LET links = (
    SELECT RAW md.name
    FROM default.f.AttachedDocuments AS ad
    JOIN default.f.DocumentMetaData AS md
      ON ad.id = md.parentDocumentId
    UNNEST ad.documents AS d
    WHERE ad.parentDocId = "MyFormData::" || frm.formId
      AND d.id = md.id
      AND d.parentId = dv.questionId
)
WHERE ANY doc IN att.documents SATISFIES doc.parentId = dv.questionId END
;
If attachments for a single question can be spread over multiple attached documents, add a DISTINCT to the above statement.
HTH.
(You can use the same logic without collections adding appropriate aliasing and type field filtering.)

Related

Extract value of Tags from cloudTrail logs using Athena

I am trying to query cloudtrail logs using Athena. My goal is to find specific instances and extract them with their Tags.
The query I am using is:
-- RunInstances / StartInstances events whose request parameters mention
-- 'mytest1', with the first instance id and its complete tag list.
SELECT
    eventTime,
    awsRegion,
    json_extract(responseelements, '$.instancesSet.items[0].instanceId') AS instanceId,
    json_extract(responseelements, '$.instancesSet.items[0].tagSet.items') AS TAGS
FROM cloudtrail_logs_PP
WHERE eventName IN ('RunInstances', 'StartInstances')
  AND requestparameters LIKE '%mytest1%'
  AND "timestamp" BETWEEN '2021/09/01' AND '2021/10/01'
ORDER BY eventTime;
Using this query - I am able to get all Tags under one column.
Output of query
I want to extract only specific Tags and need help with that. How can I extract only one specific Tag?
I tried enhancing my query with json_extract(responseelements, '$.instancesSet.items[0].tagSet.items[0]'), but the order of the Tags differs between logs, so I can't rely on the index position.
My json file in S3 is something like below:
{
"eventVersion": "1",
"eventTime": "2022-05-27T18:44:29Z",
"eventName": "RunInstances",
"awsRegion": "us-east-1",
"requestParameters": {
"instancesSet": {
"items": [{
"imageId": "ami-1234545",
"keyName": "DDKJKD"
}]
},
"instanceType": "m5.2xlarge",
"monitoring": {
"enabled": false
},
"hibernationOptions": {
"configured": false
}
},
"responseElements": {
"instancesSet": {
"items": [{
"tagSet": {
"items": [ {
"key": "11",
"value": "DS"
}, {
"key": "1",
"value": "A"
}]
}]
}
}
}

Null objects while using Coalesce and duplicate values while joining

[
{
"permissions": [
{
"name": "CREATE",
"id": 1
},
{
"name": "DELETE",
"id": 4
}
],
"roles": [
{
"name": "ADMIN",
"permission": [
{
"name": "CREATE",
"id": 1
},
{
"name": "UPDATE",
"id": 2
},
{
"name": "GET",
"id": 3
},
{
"name": "DELETE",
"id": 4
}
],
"id": 1
},
{
"name": "ADMIN",
"permission": [
{
"name": "CREATE",
"id": 1
},
{
"name": "UPDATE",
"id": 2
},
{
"name": "GET",
"id": 3
},
{
"name": "DELETE",
"id": 4
}
],
"id": 1
}
],
"id": 1,
"username": "raj#100"
},
{
"permissions": [
{
"name": null,
"id": null
}
],
"roles": [
{
"name": "USER",
"permission": [
{
"name": "GET",
"id": 3
}
],
"id": 3
}
],
"id": 2,
"username": "ram145"
}
]
As you can see from the above output, the ADMIN role is repeated twice in roles for the first user, and the second user has no permissions, so he should have an empty permissions array, but instead the output contains a permission object with all its values set to null.
This is the jooq statement which is executed :
/**
 * Fetches users as JSON objects, each carrying its aggregated roles (with the
 * permissions of every role) and its directly assigned permissions.
 *
 * NOTE: USER is left-joined to both USER_ROLE/ROLE and USER_PERMISSION/PERMISSION
 * before grouping by USER.ID, so every role row is combined with every
 * permission row of the same user before aggregation.
 *
 * @param role       optional role name; when non-null only rows whose ROLE.NAME equals it are kept
 * @param permission optional permission name; when non-null only rows whose PERMISSION.NAME equals it are kept
 * @return the fetched records converted with {@code into(JSONObject.class)}
 */
public Object findAllUsers(String role, String permission) {
    SelectOnConditionStep<Record1<JSON>> query = dslContext.select(
            jsonObject(
                key("id").value(USER.ID),
                key("fullName").value(USER.FULL_NAME),
                key("username").value(USER.USERNAME),
                key("email").value(USER.EMAIL),
                key("mobile").value(USER.MOBILE),
                key("isActive").value(USER.IS_ACTIVE),
                key("lastLoggedIn").value(USER.LAST_LOGGED_IN),
                key("profileImage").value(USER.PROFILE_IMAGE),
                // Roles of the user; each role embeds its own permissions via a correlated subquery.
                key("roles").value(
                    coalesce(
                        jsonArrayAgg(
                            jsonObject(
                                key("id").value(ROLE.ID),
                                key("name").value(ROLE.NAME),
                                key("permission").value(
                                    coalesce(
                                        select(
                                            jsonArrayAgg(
                                                jsonObject(
                                                    key("id").value(PERMISSION.ID),
                                                    key("name").value(PERMISSION.NAME)
                                                )
                                            )
                                        ).from(ROLE_PERMISSION)
                                            .leftJoin(PERMISSION)
                                            .on(PERMISSION.ID.eq(ROLE_PERMISSION.PERMISSION_ID))
                                            .where(ROLE_PERMISSION.ROLE_ID.eq(ROLE.ID))
                                            .orderBy(PERMISSION.NAME.asc()),
                                        jsonArray()
                                    )
                                )
                            )
                        ),
                        jsonArray()
                    )
                ),
                // Permissions assigned directly to the user.
                key("permissions").value(
                    coalesce(
                        jsonArrayAgg(
                            jsonObject(
                                key("id").value(PERMISSION.ID),
                                key("name").value(PERMISSION.NAME)
                            )
                        ),
                        jsonArray()
                    )
                )
            )
        ).from(USER)
        .leftJoin(USER_ROLE).on(USER.ID.eq(USER_ROLE.USER_ID))
        .leftJoin(ROLE).on(USER_ROLE.ROLE_ID.eq(ROLE.ID))
        .leftJoin(USER_PERMISSION).on(USER.ID.eq(USER_PERMISSION.USER_ID))
        .leftJoin(PERMISSION).on(USER_PERMISSION.PERMISSION_ID.eq(PERMISSION.ID));
    if (role != null) {
        query.where(ROLE.NAME.eq(role));
    }
    if (permission != null) {
        // Compare against the permission argument (the original compared against role).
        query.where(PERMISSION.NAME.eq(permission));
    }
    return query.groupBy(USER.ID)
        .orderBy(USER.ID.asc())
        .fetch().into(JSONObject.class);
}
Is there any way to fix this problem?
Why the duplicates?
Your join graph creates a cartesian product between the two "nested collections" ROLE and PERMISSION. You can't remove that cartesian product with GROUP BY alone, that works only if you join a single to-many relationship.
Instead, you can write subqueries like this (you already did this correctly for the ROLE_PERMISSION relationship):
// One JSON object per user. Every nested collection (roles, permissions of a
// role, direct permissions) is produced by its own correlated subquery, so the
// to-many relationships are never joined against each other.
dslContext.select(jsonObject(
        key("id").value(USER.ID),
        key("username").value(USER.USERNAME),
        // Roles of the current user, each with its permissions nested inside.
        key("roles").value(coalesce(field(
            select(jsonArrayAgg(jsonObject(
                key("id").value(ROLE.ID),
                key("name").value(ROLE.NAME),
                key("permission").value(coalesce(field(
                    select(coalesce(jsonArrayAgg(jsonObject(
                        key("id").value(PERMISSION.ID),
                        key("name").value(PERMISSION.NAME)
                    )), jsonArray()))
                    .from(ROLE_PERMISSION)
                    .join(PERMISSION)
                    .on(PERMISSION.ID.eq(ROLE_PERMISSION.PERMISSION_ID))
                    .where(ROLE_PERMISSION.ROLE_ID.eq(ROLE.ID))
                    // NOTE(review): this orders the subquery, not the aggregate's
                    // input - confirm the dialect accepts it and that it yields
                    // the intended element order.
                    .orderBy(PERMISSION.NAME.asc())
                ), jsonArray()))
            )))
            .from(USER_ROLE)
            .join(ROLE)
            .on(USER_ROLE.ROLE_ID.eq(ROLE.ID))
            .where(USER_ROLE.USER_ID.eq(USER.ID))
        ), jsonArray())),
        // Permissions assigned directly to the current user. The inner coalesce
        // now has its jsonArray() fallback and closing parentheses, matching the
        // role-permission subquery above (the original left select(...) unclosed).
        key("permissions").value(coalesce(field(
            select(coalesce(jsonArrayAgg(jsonObject(
                key("id").value(PERMISSION.ID),
                key("name").value(PERMISSION.NAME)
            )), jsonArray()))
            .from(USER_PERMISSION)
            .join(PERMISSION)
            .on(USER_PERMISSION.PERMISSION_ID.eq(PERMISSION.ID))
            .where(USER_PERMISSION.USER_ID.eq(USER.ID))
        ), jsonArray()))
    ))
    .from(USER)
    .orderBy(USER.ID.asc())
    .fetch().into(JSONObject.class);
Join vs semi join
After you edited your question to become a slightly different question, the point you were trying to make is that you want to filter the USER table by some ROLE or PERMISSION that they must have. You can't achieve this with JOIN alone (unless you're happy with the duplicates). The answer I gave doesn't change. If you're joining multiple to-many relationships, you'll get cartesian products.
So, instead, why not semi join them? Either with jOOQ's synthetic SEMI JOIN syntax, or manually using EXISTS or IN, e.g.
// Semi join: keep a user only if a matching USER_ROLE / USER_PERMISSION row
// exists for that same user. Each EXISTS subquery is correlated to the outer
// USER row through USER_ID; without that predicate it would be true for every
// user as soon as any user had the role or permission.
.where(role != null
    ? exists(selectOne()
        .from(USER_ROLE)
        .where(USER_ROLE.USER_ID.eq(USER.ID))
        .and(USER_ROLE.role().NAME.eq(role))
    )
    : noCondition()
)
.and(permission != null
    ? exists(selectOne()
        .from(USER_PERMISSION)
        .where(USER_PERMISSION.USER_ID.eq(USER.ID))
        .and(USER_PERMISSION.permission().NAME.eq(permission))
    )
    : noCondition()
)
This is using the implicit join syntax, which is optional, but I think it does simplify your query.

Couchbase N1QL - Nest array using keys

i'm new to Couchbase and N1QL syntax and i'm facing an issue.
Let's say we have 3 type of documents:
Doc1 of TypeA with key = typeA:Doc1
{
"type": "typeA"
"id": "Doc1",
"sequences": [
"typeB:Doc2"
]
}
Doc2 of TypeB with key = typeB:Doc2
{
"id": "Doc2",
"processors": [
{
"order": 1,
"id": "typeC:Doc3"
}
]
}
Doc3 of TypeC with key = typeC:Doc3
{
"id": "Doc3",
"prop": "value"
}
What I want to achieve is to nest these 3 objects by their document keys in order to get a single document with this structure:
{
"id": "Doc1",
"sequences": [
{
"id": "Doc2",
"processors": [
{
"order": 1,
"id": "Doc3",
"prop": "value"
}
]
}
]
What I've done so far is nest the first two documents to obtain a partial result, but I'm also trying to integrate the third document.
Here's my attempt:
/*
 * Every document of type 'TypeA', with the documents referenced by the keys in
 * its sequences array nested in place of those keys.
 * NOTE(review): the sample document shows "type": "typeA"; string comparison is
 * case-sensitive - confirm the stored value.
 */
SELECT dev.*,
       ARRAY seq FOR seq IN prseq END AS sequences
FROM data AS dev
NEST data AS prseq
  ON KEYS dev.sequences
WHERE dev.type = 'TypeA'
Can anyone help me with the third level of nesting?
Thank you.
Use subqueries
/*
 * Three-level nesting by document key:
 *   - the outer query selects the documents of type 'TypeA';
 *   - the sequences subquery fetches the documents listed in top.sequences;
 *   - for each entry of seq.processors, the referenced document is fetched by
 *     key and the entry's "order" value is added to it.
 * NOTE(review): the sample document shows "type": "typeA"; string comparison is
 * case-sensitive - confirm the stored value.
 */
SELECT top.*,
       (
           SELECT seq.*,
                  (
                      ARRAY OBJECT_ADD(
                                (SELECT RAW proc FROM data AS proc USE KEYS p.id)[0],
                                "order",
                                p.`order`
                            )
                      FOR p IN seq.processors
                      END
                  ) AS processors
           FROM data AS seq USE KEYS top.sequences
       ) AS sequences
FROM data AS top
WHERE top.type = 'TypeA';

get tag with json ot jsonb query

I am using PostgreSQL 11.
I'm trying to get the "wmnote" tag from this JSON (this is only a fragment; the closing brackets are omitted):
{
"order": [
{
"notes": {
"note": []
},
"onHold": "false",
"wmnotes": {
"wmnote": []
},
"invoices": {
"invoiceDetail": []
},
"confirmed": "true",
"enteredBy": "",
"entryType": "",
"orderType": "DTC",
"orderEvent": "Update",
"orderLines": {
"orderLine": [
{
"notes": {
"note": []
},
"isGift": "false",
"itemID": "4027956",
"onHold": "false",
"wmnotes": {
"wmnote": [
{
"noteSeq": "1",
"noteCode": "",
"noteType": "DDate",
"visibility": "0",
"commentText": "02/07/2019"
}
This is my query:
select o.info->>'order'-> 'orderLines'->'wmnotes'->'wmnote'
from customer_orders o
where o.order_id = 1;
But result is null.
The column name info is a data type jsonb.
Could you help me with the construction of the query?
Three points:
->> returns text, not JSON, so you cannot keep navigating the result as a JSON object. Use -> instead, as you did in all further steps.
order contains an array, so you have to specify which array element you are looking for. If you want the first element, you need 'order' -> 0 -> 'orderLines' (note that JSON arrays are zero-based and key names are case-sensitive!)
orderLine contains an array as well. See point 2.
So your query should look like:
-- Walk the jsonb document with -> only (so every step stays JSON), picking
-- element 0 of the "order" array and of the "orderLine" array.
SELECT
    o.info -> 'order' -> 0
           -> 'orderLines' -> 'orderLine' -> 0
           -> 'wmnotes' -> 'wmnote'
FROM customer_orders AS o
demo: db<>fiddle

AWS Athena - Querying JSON - Searching for Values

I have nested JSON files on S3 and am trying to query them with Athena.
However, I am having problems to query the nested JSON values.
My JSON file looks like this:
{
"id": "17842007980192959",
"acount_id": "17841401243773780",
"stats": [
{
"name": "engagement",
"period": "lifetime",
"values": [
{
"value": 374
}
],
"title": "Engagement",
"description": "Total number of likes and comments on the media object",
"id": "17842007980192959/insights/engagement/lifetime"
},
{
"name": "impressions",
"period": "lifetime",
"values": [
{
"value": 11125
}
],
"title": "Impressions",
"description": "Total number of times the media object has been seen",
"id": "17842007980192959/insights/impressions/lifetime"
},
{
"name": "reach",
"period": "lifetime",
"values": [
{
"value": 8223
}
],
"title": "Reach",
"description": "Total number of unique accounts that have seen the media object",
"id": "17842007980192959/insights/reach/lifetime"
},
{
"name": "saved",
"period": "lifetime",
"values": [
{
"value": 0
}
],
"title": "Saved",
"description": "Total number of unique accounts that have saved the media object",
"id": "17842007980192959/insights/saved/lifetime"
}
],
"import_date": "2017-12-04"
}
What I'm trying to do is to query the "stats" field value where name=impressions.
So ideally something like:
SELECT id, account_id, stats.values.value WHERE stats.name='engagement'
AWS example: https://docs.aws.amazon.com/athena/latest/ug/searching-for-values.html
Any help would be appreciated.
You can query the JSON with the following table definition:
-- External table over the JSON files: stats is an array of structs, and each
-- struct carries its own "values" array of single-field structs.
CREATE EXTERNAL TABLE test (
  id string,
  acount_id string,
  stats array<struct<
    name:string,
    period:string,
    values:array<struct<value:string>>,
    title:string
  >>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION 's3://bucket/';
Now, the value column is available through the following unnesting:
-- Flatten stats into one row per stat, then each stat's "values" array into one
-- row per value, and keep only the 'engagement' stat.
SELECT
    id,
    acount_id,
    stat.name,
    x.value
FROM test
CROSS JOIN UNNEST(test.stats) AS st (stat)
CROSS JOIN UNNEST(stat."values") AS valx (x)
WHERE stat.name = 'engagement';