Multilevel Complex Nested Json Using Spark SQL

Multilevel Complex Nested Json Using Spark SQL - json

I have a specific requirement to convert data from several related tables into nested JSON, like the example below, using Spark SQL. I have achieved this with the Scala API, but I have not been able to do it in Spark SQL.
{
"REPORTING_CARRIER":"9E",
"DISTANCE":"3132",
"ORIGIN_STATE_NM":"Pennsylvania",
"QUARTER":"2",
"YEAR":"2017",
"ITIN_GEO_TYPE":"2",
"BULK_FARE":"0",
"ORIGIN":"ABE",
"ORIGIN_AIRPORT_ID":"10135",
"ITIN_FARE":"787",
"ORIGIN_CITY_MARKET_ID":"30135",
"ROUNDTRIP":"1",
"Market":[
{
"MKT_DISTANCE":"1566",
"MKT_BULK_FARE":"0",
"MKT_NO_OF_CPNS":"2",
"MKT_DEST_STATE_NM":"Texas",
"MKT_OP_CARR_GRP":"9E:DL",
"MKT_TK_CARR_GRP":"DL:DL",
"MKT_MILES_FLOWN":"1566",
"MKT_AIRPORT_GROUP":"ABE:ATL:SAT",
"MKT_FARE_AMT":"393.5",
"MKT_ORIG_STATE_NM":"Pennsylvania",
"MKT_DEST_ARPT_CITY_NM":"33214",
"MKT_RPTG_CARR_NM":"9E",
"MKT_DEST":"SAT",
"MKT_DEST_CNTRY":"US",
"MKT_ORIG_CNTRY":"US",
"Coupon":[
{
"CPN_STATE_NM":"Georgia",
"CPN_DEST":"ATL",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"",
"CPN_MKT_ORIG_ARPT_NM":"10135",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"30397",
"CPN_DISTANCE":"692",
"SEQ_NUM":"1",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Pennsylvania",
"CPN_OPERG_CARR_NM":"9E",
"CPN_ORIG":"ABE",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"10397",
"CPN_MKT_ORIG_CITY_NM":"30135",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724501",
"CPN_ORIG_CNTRY":"US"
},
{
"CPN_STATE_NM":"Texas",
"CPN_DEST":"SAT",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"X",
"CPN_MKT_ORIG_ARPT_NM":"10397",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"33214",
"CPN_DISTANCE":"874",
"SEQ_NUM":"2",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Georgia",
"CPN_OPERG_CARR_NM":"DL",
"CPN_ORIG":"ATL",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"14683",
"CPN_MKT_ORIG_CITY_NM":"30397",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724501",
"CPN_ORIG_CNTRY":"US"
}
],
"MKT_ITIN_ID":"2017245",
"MKT_OPERG_CARR_NM":"99",
"MKT_DEST_ARPT_NM":"14683",
"MKT_ORIG_ARPT_NM":"ABE",
"MKT_ITIN_GEO_TYPE":"2",
"MKT_PASSENGERS":"1",
"MKT_ID":"201724501",
"MKT_TKT_CARR_NM":"DL"
},
{
"MKT_DISTANCE":"1566",
"MKT_BULK_FARE":"0",
"MKT_NO_OF_CPNS":"2",
"MKT_DEST_STATE_NM":"Pennsylvania",
"MKT_OP_CARR_GRP":"DL:DL",
"MKT_TK_CARR_GRP":"DL:DL",
"MKT_MILES_FLOWN":"1566",
"MKT_AIRPORT_GROUP":"SAT:ATL:ABE",
"MKT_FARE_AMT":"393.5",
"MKT_ORIG_STATE_NM":"Texas",
"MKT_DEST_ARPT_CITY_NM":"30135",
"MKT_RPTG_CARR_NM":"9E",
"MKT_DEST":"ABE",
"MKT_DEST_CNTRY":"US",
"MKT_ORIG_CNTRY":"US",
"Coupon":[
{
"CPN_STATE_NM":"Georgia",
"CPN_DEST":"ATL",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"",
"CPN_MKT_ORIG_ARPT_NM":"14683",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"30397",
"CPN_DISTANCE":"874",
"SEQ_NUM":"3",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Texas",
"CPN_OPERG_CARR_NM":"DL",
"CPN_ORIG":"SAT",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"10397",
"CPN_MKT_ORIG_CITY_NM":"33214",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724503",
"CPN_ORIG_CNTRY":"US"
},
{
"CPN_STATE_NM":"Pennsylvania",
"CPN_DEST":"ABE",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"X",
"CPN_MKT_ORIG_ARPT_NM":"10397",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"30135",
"CPN_DISTANCE":"692",
"SEQ_NUM":"4",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Georgia",
"CPN_OPERG_CARR_NM":"DL",
"CPN_ORIG":"ATL",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"10135",
"CPN_MKT_ORIG_CITY_NM":"30397",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724503",
"CPN_ORIG_CNTRY":"US"
}
],
"MKT_ITIN_ID":"2017245",
"MKT_OPERG_CARR_NM":"DL",
"MKT_DEST_ARPT_NM":"10135",
"MKT_ORIG_ARPT_NM":"SAT",
"MKT_ITIN_GEO_TYPE":"2",
"MKT_PASSENGERS":"1",
"MKT_ID":"201724503",
"MKT_TKT_CARR_NM":"DL"
}
],
"NO_OF_CPNS":"4",
"ORIGIN_COUNTRY":"US",
"ITIN_ID":"2017245",
"PASSENGERS":"1",
"MILES_FLOWN":"3132"
}

You can use the from_json() helper function within a select() call of the Dataset API to decode the attributes and values of a JSON string into DataFrame columns, as dictated by a schema.
For example, given the following JSON, { "reporting_carrier": "A", "market": { "value": 10 } }, stored in rawJsonDf:
// Typed view of one input row: the carrier code plus the still-encoded market JSON string.
case class MarketData(reporting_carrier: String, market_json: String)

// Schema used by from_json to decode the market JSON string into a struct column.
val jsonSchema = new StructType()
  .add("value", LongType)

// Column names must match the case class fields, otherwise .as[MarketData] cannot resolve them.
val marketDs = rawJsonDf
  .toDF("reporting_carrier", "market_json")
  .as[MarketData]

// Decode the JSON string into a struct named "market", then filter on one of its fields.
marketDs
  .select(from_json($"market_json", jsonSchema) as "market")
  .filter($"market.value" > 5)
see this great tutorial by databricks for more info.

Related

Get array from json in Bigquery

I'm trying to get the data from a JSON in BigQuery. This JSON is Stored in a one-column table.
So far, I've been able to get only the "variables" array, with the following:
Select JSON_QUERY_ARRAY(Column1, '$.sessions[0].variables') FROM Table
How can I get the other values/arrays (sessionMessage and events)? I can't make it work..
I've tried with:
JSON_VALUE(Column1, '$.sessions[0].conversation')
JSON_QUERY_ARRAY(Column1, '$.sessions[0].sessionMessages')
But I only get empty values (the original JSON has values inside these arrays).
{
"fromDate":"2020-04-10T23:47:17.161Z",
"pageRows":151,
"sessions":[
{
"variables":[],
"sessionDate":"2020-04-10T23:47:17.161Z",
"botMessages":2,
"userHasTalked":"true",
"topics":[
"TOPIC1"
],
"sessionId":"WXXXSXSXSXXXQ_2020-01-00T23:47:17.161Z",
"platformContactId":"XXXXXXX-XXXXXXX-XXXXXXXXXXXXXX",
"sessionMessages":[.....],
"queues":[
"QUEUE1",
"QUEUE2"
],
"customerId":"SSDSDS",
"userMessages":2,
"operatorMessages":1,
"sessionMessagesQty":2,
"sessionStartingCause":"Organic",
"channelId":"IDCHANEL",
"conversation":"https://url.com",
"events":[.....]
}
],
"toDate":"2020-04-10T23:47:17.161Z",
"hasMore":true,
"pageToken":"XXXXXXXXXXXXXX"
}

There is nothing wrong with the functions and JSONPaths that you used, but your sample JSON contains some unexpected content, such as [.....]. After removing or replacing those placeholders, the query below works fine:
-- Inline the sample document as a string so the JSON functions can be tried without a table.
-- The literal must be valid JSON: no placeholder arrays and no trailing commas.
WITH a as (select
"""
{
"fromDate":"2020-04-10T23:47:17.161Z",
"pageRows":151,
"sessions":[
{
"variables":[],
"sessionDate":"2020-04-10T23:47:17.161Z",
"botMessages":2,
"userHasTalked":"true",
"topics":[
"TOPIC1"
],
"sessionId":"WXXXSXSXSXXXQ_2020-01-00T23:47:17.161Z",
"platformContactId":"XXXXXXX-XXXXXXX-XXXXXXXXXXXXXX",
"sessionMessages":[1,2,3],
"queues":[
"QUEUE1",
"QUEUE2"
],
"customerId":"SSDSDS",
"userMessages":2,
"operatorMessages":1,
"sessionMessagesQty":2,
"sessionStartingCause":"Organic",
"channelId":"IDCHANEL",
"conversation":"https://url.com",
"events":[]
}
],
"toDate":"2020-04-10T23:47:17.161Z",
"hasMore":true,
"pageToken":"XXXXXXXXXXXXXX"
}
""" data)
-- Scalar value and array from the first session of the document.
SELECT JSON_VALUE(data, '$.sessions[0].conversation'),
JSON_QUERY_ARRAY(data, '$.sessions[0].sessionMessages')
FROM a;

Passion in CouchBase Programming

I would like to fetch only selected fields from each object in an array of objects within a Couchbase document, but so far I am only able to fetch the whole array of objects.
I tried to fetch the array using the following query: 'select countryDetails from test';
{
"type":"countries",
"docName":"CountryData",
"countryDetails":[
{
"name":"US",
"code":"+1",
"stateInfo":[
{
"name":"Florida",
"id":"1212"
},
{
"name":"NewYork",
"id":"1214"
}
]
},
{
"name":"France",
"code":"+33",
"stateInfo":[
{
"name":"Grand Est",
"id":"5212"
},
{
"name":"Brittany",
"id":"5214"
}
]
}
]
}
I tried fetching the array using: select countryDetails from test;
I would like to fetch the result as [ {"name" : "US", "code" : "+1" }, {"name" : "France", "code" : "+33"}]

If you project countryDetails, the whole sub-object is projected.
If you need only part of the sub-object, you need to project that part explicitly.
The following ARRAY construction will provide the data representation you are expecting.
/* Build a new array holding only name and code from each element of countryDetails. */
SELECT ARRAY {v.name, v.code} FOR v IN t.countryDetails END AS countryDetails
FROM test AS t
WHERE t.type = "countries";

What you are trying to do does not seem to be possible. You can get closer to what you want with a query like this:
select raw countryDetails from test
But the results of this query still have the result wrapped in an extra level of array.

PostgreSQL JSON Querying

I have a JSON type column called "person" and the data stored in it is in the format
{
"clients":{
"nbr":"2",
"info":[
{
"nom":"Baptiste",
"genre":"male",
"age":"48"
},
{
"nom":"Lisa",
"genre":"female",
"age":"29"
}
]
}
}
I want to retrieve the names of clients.

You may use json_array_elements
select json_array_elements(person->'clients'->'info')->>'nom' as name
from t;

Json not getting data from list

I am having a problem with reading json data. I have tried a few methods but came up short. Any help is welcome. Here is the code:
It's corrected with the whole file now.
"gallery": {
"106x100": [
"106x100-0.jpeg",
"106x100-1.jpeg",
"106x100-2.jpeg",
"106x100-3.jpeg",
"106x100-4.jpeg",
"106x100-5.jpeg",
"106x100-6.jpeg",
"106x100-7.jpeg",
"106x100-8.jpeg",
"106x100-9.jpeg",
"106x100-10.jpeg",
"106x100-11.jpeg",
"106x100-12.jpeg",
"106x100-13.jpeg",
"106x100-14.jpeg",
"106x100-15.jpeg",
"106x100-16.jpeg"
],
"190x100": [
"190x100-0.jpeg",
"190x100-1.jpeg",
"190x100-2.jpeg",
"190x100-3.jpeg",
"190x100-4.jpeg",
"190x100-5.jpeg",
"190x100-6.jpeg",
"190x100-7.jpeg",
"190x100-8.jpeg",
"190x100-9.jpeg",
"190x100-10.jpeg",
"190x100-11.jpeg",
"190x100-12.jpeg",
"190x100-13.jpeg",
"190x100-14.jpeg",
"190x100-15.jpeg",
"190x100-16.jpeg"
]
},
},

Your json's format is wrong.
Here is the correct format:
{"190x100": [
"190x100-0.jpeg",
"190x100-1.jpeg",
"190x100-2.jpeg",
"190x100-3.jpeg",
"190x100-4.jpeg",
"190x100-5.jpeg",
"190x100-6.jpeg",
"190x100-7.jpeg",
"190x100-8.jpeg",
"190x100-9.jpeg",
"190x100-10.jpeg",
"190x100-11.jpeg",
"190x100-12.jpeg",
"190x100-13.jpeg",
"190x100-14.jpeg",
"190x100-15.jpeg",
"190x100-16.jpeg"
]}
PS: I read that you are using Java. In that case, you can try this:
String jsonString = "{"+json+"}"; //pass json here if you are getting it in that format.
JSONObject jsonObject = new JSONObject(jsonString);
JSONObject newJSON = jsonObject.getJSONObject("gallery");
System.out.println(newJSON.toString());
Again the format is wrong. Here is the format:
{"gallery": {
"106x100": [
"106x100-0.jpeg",
"106x100-1.jpeg",
"106x100-2.jpeg",
"106x100-3.jpeg",
"106x100-4.jpeg",
"106x100-5.jpeg",
"106x100-6.jpeg",
"106x100-7.jpeg",
"106x100-8.jpeg",
"106x100-9.jpeg",
"106x100-10.jpeg",
"106x100-11.jpeg",
"106x100-12.jpeg",
"106x100-13.jpeg",
"106x100-14.jpeg",
"106x100-15.jpeg",
"106x100-16.jpeg"
],
"190x100": [
"190x100-0.jpeg",
"190x100-1.jpeg",
"190x100-2.jpeg",
"190x100-3.jpeg",
"190x100-4.jpeg",
"190x100-5.jpeg",
"190x100-6.jpeg",
"190x100-7.jpeg",
"190x100-8.jpeg",
"190x100-9.jpeg",
"190x100-10.jpeg",
"190x100-11.jpeg",
"190x100-12.jpeg",
"190x100-13.jpeg",
"190x100-14.jpeg",
"190x100-15.jpeg",
"190x100-16.jpeg"
]
}
}

How to get data in specific format using scala?

I have a raw json in following format-
"luns": [
{
"numReadBlocks": 15444876,
"numWriteBlocks": 13530714,
"blockSizeInBytes": 512,
"writeIops": 495344,
"readIops": 312702,
"serialNumber": "aaaaaaa",
"uuid": "id",
"shareState": "none",
"usedBytes": 6721716224,
"totalSizeBytes": 16106127360,
"path": "/vol/lun_23052014_025830_vol/lun_23052014_025830"
},
{
"numReadBlocks": 15444876,
"numWriteBlocks": 13530714,
"blockSizeInBytes": 512,
"writeIops": 495344,
"readIops": 312702,
"serialNumber": "aaaaaaa",
"uuid": "id",
"shareState": "none",
"usedBytes": 6721716224,
"totalSizeBytes": 16106127360,
"path": "/vol/lun_23052014_025830_vol/lun_23052014_025830"
}]
The luns array may contain a list of such entries.
I want to process above json and form output as following-
"topStorageLuns": [
{
"name": "Free (in GB)",
"data": [7.79,7.79]
},
{
"name": "Used (in GB)",
"data": [7.21,7.21]
}]
I tried following in order to get output-
val storageLuns = myRawJson
val topStorageLuns = storageLuns.map { storageLun =>
val totalLunsSizeOnStorageDevice = storageLun.luns.foldLeft(0.0) {
case (totalBytesOnDevice, lun) =>
totalBytesOnDevice + lun.usedBytes.getOrElse(0.0).toString.toLong
}
val totalAvailableLunsOnStorageDevice = storageLun.luns.foldLeft(0.0) {
case (totalBytesOnDevice, lun) =>
totalBytesOnDevice + lun.usedBytes.getOrElse(0.0).toString.toLong
}
Json.obj("name" -> storageLun.hostId, "data" -> "%.2f".format(totalLunsSizeOnStorageDevice / (1024 * 1024 * 1024)).toDouble)
}
Can anybody help me to get desired output please???

The key lesson I want to impart is that your algorithm should reflect the shape of the output you want. Work backward from the result you want to build the algorithm.
It looks to me like you want to create an array of length 2, where each entry has a corresponding algorithm (space used, space free). Within each of these elements, you want a nested array with an element for each item in your input array, calculated using the algorithm from the outer array. Here's how I would approach the problem:
1) Define your algorithms
val dfAlgorithm: (Seq[(String, JsValue)] => Double) = _.foldLeft(0.0) { (acc, item) =>
/* whatever logic you need to do */
}
val duAlgorithm: (Seq[(String, JsValue)] => Double) = _.foldLeft(0.0) { (acc, item) =>
/* whatever logic you need to do */
}
2) Create a data structure to map over to build your final output
val stats = Seq("Free (in GB)" -> dfAlgorithm, "Used (in GB)" -> duAlgorithm)
3) Map over your input data within your mapping over your algorithms (the logic here reflects the shape of the result you want)
// Outer loop over the statistics, inner loop over the inputs: this mirrors the output shape,
// one JSON object per statistic whose "data" array has one value per input entry.
stats.map { case (name, algorithm) =>
  Json.obj("name" -> name, "data" -> storageLuns.map { storageLun => algorithm(storageLun) })
}
This isn't going to be a turnkey solution, since I don't know how your free/used algorithms are supposed to work, but this overall scheme should get you there.

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Multilevel Complex Nested Json Using Spark SQL - json

Related

Get array from json in Bigquery

Passion in CouchBase Programming

PostgreSQL JSON Querying

Json not getting data from list

How to get data in specific format using scala?

Categories

Resources