Json Iteration in spark - json

Input Json file
{
"CarBrands": [{
"model": "audi",
"make": " (YEAR == \"2009\" AND CONDITION in (\"Y\") AND RESALE in (\"2015\")) ",
"service": {
"first": null,
"second": [],
"third": []
},
"dealerspot": [{
"dealername": [
"\"first\"",
"\"abc\""
]
},
{
"dealerlat": [
"\"45.00\"",
"\"38.00\""
]
}
],
"type": "ok",
"plate": true
},
{
"model": "bmw",
"make": " (YEAR == \"2010\" AND CONDITION OR (\"N\") AND RESALE in (\"2016\")) ",
"service": {
"first": null,
"second": [],
"third": []
},
"dealerspot": [{
"dealerlat": [
"\"99.00\"",
"\"38.00\""
]
},
{
"dealername": [
"\"sports\"",
"\"abc\""
]
}
],
"type": "ok",
"plate": true
},
{
"model": "toy",
"make": " (YEAR == \"2013\" AND CONDITION in (\"Y\") AND RESALE in (\"2018\")) ",
"service": {
"first": null,
"second": [],
"third": []
},
"dealerspot": [{
"dealerlat": [
"\"35.00\"",
"\"38.00\""
]
},
{
"dealername": [
"\"nelson\"",
"\"abc\""
]
}
],
"type": "ok",
"plate": true
}
]
}
Expected output:
+-------+-------------+-----------+
model | dealername | dealerlat |
--------+-------------+-----------+
audi | first | 45 |
bmw | sports | 99 |
toy | nelson | 35 |
--------+-------------+-----------+
import sparkSession.implicits._
val tagsDF = sparkSession.read.option("multiLine", true).option("inferSchema", true).json("src/main/resources/carbrands.json");
val df = tagsDF.select(explode($"CarBrands") as "car_brands")
val dfd = df.withColumn("_tmp", split($"car_brands.make", "\"")).select($"car_brands.model".as("model"),$"car_brands.dealerspot.dealername"(0)(0).as("dealername"),$"car_brands.dealerspot.dealerlat"(0)(0).as("dealerlat"))
Note: since the dealername and dealerlat positions are not fixed, the index (0)(0) doesn't produce the desired output. Please help.

You can convert dealerspot into a JSON string and then use JSONPath with get_json_object():
import org.apache.spark.sql.functions.{get_json_object,to_json,trim,explode}
val df1 = (tagsDF.withColumn("car_brands", explode($"CarBrands"))
.select("car_brands.*")
.withColumn("dealerspot", to_json($"dealerspot")))
//+--------------------+--------------------+-----+-----+----------+----+
//| dealerspot| make|model|plate| service|type|
//+--------------------+--------------------+-----+-----+----------+----+
//|[{"dealername":["...| (YEAR == "2009" ...| audi| true|[, [], []]| ok|
//|[{"dealerlat":["\...| (YEAR == "2010" ...| bmw| true|[, [], []]| ok|
//|[{"dealerlat":["\...| (YEAR == "2013" ...| toy| true|[, [], []]| ok|
//+--------------------+--------------------+-----+-----+----------+----+
df1.select(
$"model"
, trim(get_json_object($"dealerspot", "$[*].dealername[0]"), "\"\\") as "dealername"
, trim(get_json_object($"dealerspot", "$[*].dealerlat[0]"), "\"\\") as "dealerlat"
).show
//+-----+----------+---------+
//|model|dealername|dealerlat|
//+-----+----------+---------+
//| audi| first| 45.00|
//| bmw| sports| 99.00|
//| toy| nelson| 35.00|
//+-----+----------+---------+

Related

how to extract and modify inner array objects with parent object data in jq

We are trying to format a JSON similar to this:
[
{"id": 1,
"type": "A",
"changes": [
{"id": 12},
{"id": 13}
],
"wanted_key": "good",
"unwanted_key": "aaa"
},
{"id": 2,
"type": "A",
"unwanted_key": "aaa"
},
{"id": 3,
"type": "B",
"changes": [
{"id": 31},
{"id": 32}
],
"unwanted_key": "aaa",
"unwanted_key2": "aaa"
},
{"id": 4,
"type": "B",
"unwanted_key3": "aaa"
},
null,
null,
{"id": 7}
]
into something like this:
[
{
"id": 1,
"type": "A",
"wanted_key": true # every record must have this key/value
},
{
"id": 12, # note: this was in the "changes" property of record id 1
"type": "A", # type should be the same type than record id 1
"wanted_key": true
},
{
"id": 13, # note: this was in the "changes" property of record id 1
"type": "A", # type should be the same type than record id 1
"wanted_key": true
},
{
"id": 2,
"type": "A",
"wanted_key": true
},
{
"id": 3,
"type": "B",
"wanted_key": true
},
{
"id": 31, # note: this was in the "changes" property of record id 3
"type": "B", # type should be the same type than record id 3
"wanted_key": true
},
{
"id": 32, # note: this was in the "changes" property of record id 3
"type": "B", # type should be the same type than record id 3
"wanted_key": true
},
{
"id": 4,
"type": "B",
"wanted_key": true
},
{
"id": 7,
"type": "UNKN", # records without a type should have this type
"wanted_key": true
}
]
So far, I've been able to:
remove null records
obtain the keys we need with their default
give records without a type a default type
What we are missing:
from records having a changes key, create new records with the type of their parent record
join all records in a single array
Unfortunately we are not entirely sure how to proceed... Any help would be appreciated.
So far our jq goes like this:
del(..|nulls) | map({id, type: (.type // "UNKN"), wanted_key: (true)}) | del(..|nulls)
Here's our test code:
https://jqplay.org/s/eLAWwP1ha8P
The following should work:
map(select(values))
| map(., .type as $type | (.changes[]? + {$type}))
| map({id, type: (.type // "UNKN"), wanted_key: true})
Only select non-null values
Return the original items followed by their inner changes array (+ outer type)
Extract 3 properties for output
Multiple map calls can usually be combined, so this becomes:
map(
select(values)
| ., (.type as $type | (.changes[]? + {$type}))
| {id, type: (.type // "UNKN"), wanted_key: true}
)
Another option without variables:
map(
select(values)
| ., .changes[]? + {type}
| {id, type: (.type // "UNKN"), wanted_key: true}
)
# or:
map(select(values))
| map(., .changes[]? + {type})
| map({id, type: (.type // "UNKN"), wanted_key: true})
or even with a separate normalization step for the unknown type:
map(select(values))
| map(.type //= "UNKN")
| map(., .changes[]? + {type})
| map({id, type, wanted_key: true})
# condensed to a single line:
map(select(values) | .type //= "UNKN" | ., .changes[]? + {type} | {id, type, wanted_key: true})
Explanation:
Select only non-null values from the array
If type is not set, create the property with value "UNKN"
Produce the original array items, followed by their nested changes elements extended with the parent type
Reshape objects to only contain properties id, type, and wanted_key.
Here's one way:
map(
select(values)
| (.type // "UNKN") as $type
| ., .changes[]?
| {id, $type, wanted_key: true}
)
[
{
"id": 1,
"type": "A",
"wanted_key": true
},
{
"id": 12,
"type": "A",
"wanted_key": true
},
{
"id": 13,
"type": "A",
"wanted_key": true
},
{
"id": 2,
"type": "A",
"wanted_key": true
},
{
"id": 3,
"type": "B",
"wanted_key": true
},
{
"id": 31,
"type": "B",
"wanted_key": true
},
{
"id": 32,
"type": "B",
"wanted_key": true
},
{
"id": 4,
"type": "B",
"wanted_key": true
},
{
"id": 7,
"type": "UNKN",
"wanted_key": true
}
]
Demo
Something like below should work
map(
select(type == "object") |
( {id}, {id : ( .changes[]? .id )} ) +
{ type: (.type // "UNKN"), wanted_key: true }
)
jq play - demo

jq - select objects and print null for missing

I'm trying to generate a CSV of sort from json file, the files are as below
cat role1.json
{
"Tags": [
{
"Key": "Name",
"Value": "Role1Name"
},
{
"Key": "ID",
"Value": "Role1ID"
},
{
"Key": "Manager",
"Value": "Role1Manager"
},
{
"Key": "User",
"Value": "Role1User"
},
{
"Key": "Country",
"Value": "USA"
}
]
}
cat role2.json
{
"Tags": [
{
"Key": "Name",
"Value": "Role2Name"
},
{
"Key": "ID",
"Value": "Role2ID"
},
{
"Key": "City",
"Value": "NewYork"
},
{
"Key": "Creator",
"Value": "Role2Creator"
},
{
"Key": "User",
"Value": "Role2User"
}
]
}
cat role3.json
{
"Tags": [
{
"Key": "Name",
"Value": "Role3Name"
},
{
"Key": "ID",
"Value": "Role3ID"
},
{
"Key": "Creator",
"Value": "Role3Creator"
},
{
"Key": "ZIP",
"Value": 82378
},
{
"Key": "Manager",
"Value": "Role3Manager"
},
{
"Key": "User",
"Value": "Role3User"
}
]
}
I want to generate lines from each of these to be later used as CSV, something like:
Role1Name, Role1ID, null, Role1Manager, Role1User
Role2Name, Role2ID, Role2Creator, null, Role2User
Role3Name, Role3ID, Role3Creator, Role3Manager, Role3User
For the header line
Name, ID, Creator, Manager, User
I'm able to get all the "Value" but not able to print null for missing "Key"
$cat role1.json | jq -rc '[.Tags[] | select(.Key == ("Name","ID","Creator","Manager","User")) | .Value]'
["Role1Name","Role1ID","Role1Manager","Role1User"]
$cat role2.json | jq -rc '[.Tags[] | select(.Key == ("Name","ID","Creator","Manager","User")) | .Value]'
["Role2Name","Role2ID","Role2Creator","Role2User"]
$cat role3.json | jq -rc '[.Tags[] | select(.Key == ("Name","ID","Creator","Manager","User")) | .Value]'
["Role3Name","Role3ID","Role3Creator","Role3Manager","Role3User"]
Can someone share with me how this can be done using jq.
Also, how can we enforce the order.
Thanks!
The key (ha!) is
[ .[ $keys[] ] ]
Had you looked at other answers to questions relating to CSV, you might have noticed the first step taken is to get the list of keys. This is often done by collecting the keys of the input objects. (Example) In your case, you have a hard-coded list, so it's even simpler.
If you wanted actual CSV, you could use
jq -sr '
[ "Name", "ID", "Creator", "Manager", "User" ] as $keys |
(
$keys,
( .[].Tags | from_entries | [ .[ $keys[] ] ] )
) |
#csv
' role*.json
This produces
"Name","ID","Creator","Manager","User"
"Role1Name","Role1ID",,"Role1Manager","Role1User"
"Role2Name","Role2ID","Role2Creator",,"Role2User"
"Role3Name","Role3ID","Role3Creator","Role3Manager","Role3User"
jqplay
Without a header:
jq -r '.Tags | from_entries | [ .["Name","ID","Creator","Manager","User"] ] | #csv' role*.json
jqplay
To get the specific output you posted (which isn't CSV), you could use
jq -sr '
[ "Name", "ID", "Creator", "Manager", "User" ] as $keys |
(
$keys,
( .[].Tags | from_entries | [ .[ $keys[] ] | . // "null" ] )
) |
join(", ")
' role*.json
This produces
Name, ID, Creator, Manager, User
Role1Name, Role1ID, null, Role1Manager, Role1User
Role2Name, Role2ID, Role2Creator, null, Role2User
Role3Name, Role3ID, Role3Creator, Role3Manager, Role3User
jqplay
Without a header:
jq -r '.Tags | from_entries | [ .["Name","ID","Creator","Manager","User"] | . // "null" ] | join(", ")' role*.json
jqplay
Got an answer from another forum, might be useful for others
$jq -rc '.Tags | from_entries | [.Name, .ID, .Creator, .Manager, .User]' role*.json
["Role1Name","Role1ID",null,"Role1Manager","Role1User"]
["Role2Name","Role2ID","Role2Creator",null,"Role2User"]
["Role3Name","Role3ID","Role3Creator","Role3Manager","Role3User"]

jq select filter chain

I am having the following array of objects that I would like to filter down. Like this:
1. LOGGEDIN == 0
2. Timestamp older than 5 minutes
3. IDLETIME > 60 && CPULOAD < 200
So for the second filter I’d like not to consider the objects filtered out on the first filter. And for the third filter I’d like not to consider the objects filtered out on the second filter. I tried to get the selection with jq:
1. jq '.[] | select(.LOGGEDIN=="0")'
2. jq '.[] | select(.TIMESTAMP | fromdateiso8601 < '$FIVEMINAGO')'
3. jq '.[] | select(.IDLETIME |tonumber > 60) | select(.CPULOAD |tonumber < 200)'
I’d like to wrap these up so that I end up with one array of objects that match the filters and another array of objects that do not. I’m on a Mac, zsh.
[
{
"SERIAL": "XXXSERIAL1XXX",
"TIMESTAMP": "2020-12-17 18:45:14",
"EMAIL": "email1#mydomain.com",
"LOGGEDIN": "0",
"IDLETIME": "122",
"CPULOAD": "2",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL2XXX",
"TIMESTAMP": "2020-12-17 18:43:29",
"EMAIL": "email2#mydomain.com",
"LOGGEDIN": "1",
"IDLETIME": "0",
"CPULOAD": "0",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL3XXX",
"TIMESTAMP": "2020-12-17 18:46:37",
"EMAIL": "email1#mydomain.com",
"LOGGEDIN": "1",
"IDLETIME": "0",
"CPULOAD": "0",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL4XXX",
"TIMESTAMP": "2020-12-17 18:45:23",
"EMAIL": "email3#mydomain.com",
"LOGGEDIN": "0",
"IDLETIME": "0",
"CPULOAD": "13",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL5XXX",
"TIMESTAMP": "2020-12-17 18:47:02",
"EMAIL": "email2#mydomain.com",
"LOGGEDIN": "1",
"IDLETIME": "0",
"CPULOAD": "0",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL6XXX",
"TIMESTAMP": "2020-12-17 18:43:42",
"EMAIL": "email3#mydomain.com",
"LOGGEDIN": "1",
"IDLETIME": "10",
"CPULOAD": "20",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL7XXX",
"TIMESTAMP": "2020-12-17 18:43:29",
"EMAIL": "email4#mydomain.com",
"LOGGEDIN": "1",
"IDLETIME": "0",
"CPULOAD": "0",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL8XXX",
"TIMESTAMP": "2020-12-17 18:46:02",
"EMAIL": "email4#mydomain.com",
"LOGGEDIN": "0",
"IDLETIME": "0",
"CPULOAD": "0",
"BLOCKED": "0"
},
{
"SERIAL": "XXXSERIAL9XXX",
"TIMESTAMP": "2020-12-17 18:45:23",
"EMAIL": "email1#mydomain.com",
"LOGGEDIN": "0",
"IDLETIME": "443",
"CPULOAD": "666",
"BLOCKED": "0"
}
]
Problems with the snippets you posted:
Don't try to generate code in the shell! Use --arg (or some other mechanism) to pass values to your program instead.
Your timestamps are not valid ISO8601 timestamps, much less what fromdateiso8601 expects.
| has the lowest precedence other than ;, so
.IDLETIME | tonumber > 60 means
.IDLETIME | ( tonumber > 60 ) but you want
( .IDLETIME | tonumber ) > 60.
We can start with this:
jq --arg TSCUT "$( date --date='5 minutes ago' +%s )" '
group_by(
.LOGGEDIN == "0" and
( .TIMESTAMP | sub(" "; "T") + "Z" | fromdateiso8601 ) < $TSCUT and
( .IDLETIME | tonumber ) > 60 and
( .CPULOAD | tonumber ) < 200
)
'
jqplay
The above segregates the records that match from those that don't, but we could end up with any of the following:
[ ]
[ [...matches...] ]
[ [...non-matches...] ]
[ [...non-matches...], [...matches...] ]
This isn't very useful. As such, I propose the following:
jq --arg TSCUT "$( date --date='5 minutes ago' +%s )" '
map(
._f = (
.LOGGEDIN == "0" and
( .TIMESTAMP | sub(" "; "T") + "Z" | fromdateiso8601 ) < $TSCUT and
( .IDLETIME | tonumber ) > 60 and
( .CPULOAD | tonumber ) < 200
)
) |
. as $a |
{
"matches": [ $a[] | select( ._f ) | del(._f) ],
"non-matches": [ $a[] | select( ._f | not ) | del(._f) ]
}
'
jqplay
I assumed that "$( ... )" means the same thing in zsh as it does in the POSIX shell. Adjust as needed.
Thanks to @oguz ismail for pointing out group_by, even though I retain my original solution.

Flatten nested json in kusto column

I have a kusto table with one of the columns as dynamic type with nested json,
How do I flatten in kusto?
mv-expand is only doing one level.
column1 : timetsamp
column2 : id
column3 : json object
timestamp id value
2020-10-13 22:42:05.0000000 d0 "{
""value"": ""0"",
""max"": ""0"",
""min"": ""0"",
""avg"": ""0""
}"
2020-10-13 22:42:05.0000000 d0 "{
""sid"": ""a0"",
""data"": {
""x"": {
""a"": {
""t1"": ""2020-10-13T22:46:50.1310000Z"",
""m1"": 446164,
""m4"": {
""m41"": ""abcd"",
""m42"": 1234
}
}
}
}
}"
Update 2: I was able to flatten the keys, but not the values
let testJson = datatable(timestamp : datetime, id : string, value : dynamic )
[datetime(2020-10-13T22:42:05Z), 'd0', dynamic({"value":"0","max":"0","min":"0","avg":"0"}),
datetime(2020-10-13T22:42:05Z), 'd1', dynamic({"sid":"a0","data":{"x":{"a":{"t1":"2020-10-13T22:46:50.131Z","m1":446164,"m4":{"m41":"abcd","m42":1234}}}}})];
testJson
| extend key=treepath(value)
| mv-expand key
| extend value1 = value[tostring(key)]
You can invoke mv-expand several times:
let _data = datatable (column1:datetime , column2:string , column3:dynamic )
[
datetime(2020-10-13 22:42:05.0000000), 'd0', dynamic({
"value": "0",
"max": "0",
"min": "0",
"avg": "0"
}),
datetime(2020-10-13 22:42:05.0000000), 'd0', dynamic({
"sid": "a0",
"data": {
"x": {
"a": {
"t1": "2020-10-13T22:46:50.1310000Z",
"m1": 446164,
"m4": {
"m41": "abcd",
"m42": 1234
}
}
}
}
})
];
_data
| mv-expand column3
| mv-expand more_data=column3.data.x.a
| mv-expand more_data_m4=more_data.m4
You can also promote dynamic fields into columns using evaluate bag_unpack():
https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/bag-unpackplugin
let _data = datatable (column1:datetime , column2:string , column3:dynamic )
[
datetime(2020-10-13 22:42:05.0000000), 'd0', dynamic({
"value": "0",
"max": "0",
"min": "0",
"avg": "0"
}),
datetime(2020-10-13 22:42:05.0000000), 'd0', dynamic({
"sid": "a0",
"data": {
"x": {
"a": {
"t1": "2020-10-13T22:46:50.1310000Z",
"m1": 446164,
"m4": {
"m41": "abcd",
"m42": 1234
}
}
}
}
})
];
_data
| mv-expand column3
| extend expanded_data = column3.data.x.a
| evaluate bag_unpack(expanded_data)

How to index nested array

How to index (N1QL query in Couchbase) the above document to speed up searching by the SerialNumber field in a nested array (doc => groups => items => item.SerialNumber)?
Sample:
{
"Id": "0012ed6e-41af-4e45-b53f-bac3b2eb0b82",
"Machine": "Machine2",
"Groups": [
{
"Id": "0fed9b14-fa38-e511-893a-001125665867",
"Name": "Name",
"Items": [
{
"Id": "64e69b14-fa38-e511-893a-001125665867",
"SerialNumber": "1504H365",
"Position": 73
},
{
"Id": "7be69b14-fa38-e511-893a-001125665867",
"SerialNumber": "1504H364",
"Position": 72
}
]
},
{
"Id": "0fed9b14-fa38-e511-893a-001125665867",
"Name": "Name",
"Items": [
{
"Id": "64e69b14-fa38-e511-893a-001125665867",
"SerialNumber": "1504H365",
"Position": 73
},
{
"Id": "7be69b14-fa38-e511-893a-001125665867",
"SerialNumber": "1504H364",
"Position": 72
}
]
}
]
}
my query:
CREATE INDEX idx_serial ON `aplikomp-bucket`
(ALL ARRAY(ALL ARRAY i.SerialNumber FOR i IN g.Items END ) FOR g In Groups END);
CREATE INDEX idx_serial ON `aplikomp-bucket` (DISTINCT ARRAY(DISTINCT ARRAY i.SerialNumber FOR i IN g.Items END ) FOR g In Groups END);
SELECT META().id FROM `aplikomp-bucket` AS a
WHERE ANY g IN a.Groups SATISFIES (ANY i IN g.Items SATISFIES i.SerialNumber > 123 END) END;