I have a specific requirement to convert data from several related tables into nested JSON like the example below, using Spark SQL. I have achieved this with Scala, but I have not been able to do it in Spark SQL.
{
"REPORTING_CARRIER":"9E",
"DISTANCE":"3132",
"ORIGIN_STATE_NM":"Pennsylvania",
"QUARTER":"2",
"YEAR":"2017",
"ITIN_GEO_TYPE":"2",
"BULK_FARE":"0",
"ORIGIN":"ABE",
"ORIGIN_AIRPORT_ID":"10135",
"ITIN_FARE":"787",
"ORIGIN_CITY_MARKET_ID":"30135",
"ROUNDTRIP":"1",
"Market":[
{
"MKT_DISTANCE":"1566",
"MKT_BULK_FARE":"0",
"MKT_NO_OF_CPNS":"2",
"MKT_DEST_STATE_NM":"Texas",
"MKT_OP_CARR_GRP":"9E:DL",
"MKT_TK_CARR_GRP":"DL:DL",
"MKT_MILES_FLOWN":"1566",
"MKT_AIRPORT_GROUP":"ABE:ATL:SAT",
"MKT_FARE_AMT":"393.5",
"MKT_ORIG_STATE_NM":"Pennsylvania",
"MKT_DEST_ARPT_CITY_NM":"33214",
"MKT_RPTG_CARR_NM":"9E",
"MKT_DEST":"SAT",
"MKT_DEST_CNTRY":"US",
"MKT_ORIG_CNTRY":"US",
"Coupon":[
{
"CPN_STATE_NM":"Georgia",
"CPN_DEST":"ATL",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"",
"CPN_MKT_ORIG_ARPT_NM":"10135",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"30397",
"CPN_DISTANCE":"692",
"SEQ_NUM":"1",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Pennsylvania",
"CPN_OPERG_CARR_NM":"9E",
"CPN_ORIG":"ABE",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"10397",
"CPN_MKT_ORIG_CITY_NM":"30135",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724501",
"CPN_ORIG_CNTRY":"US"
},
{
"CPN_STATE_NM":"Texas",
"CPN_DEST":"SAT",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"X",
"CPN_MKT_ORIG_ARPT_NM":"10397",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"33214",
"CPN_DISTANCE":"874",
"SEQ_NUM":"2",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Georgia",
"CPN_OPERG_CARR_NM":"DL",
"CPN_ORIG":"ATL",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"14683",
"CPN_MKT_ORIG_CITY_NM":"30397",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724501",
"CPN_ORIG_CNTRY":"US"
}
],
"MKT_ITIN_ID":"2017245",
"MKT_OPERG_CARR_NM":"99",
"MKT_DEST_ARPT_NM":"14683",
"MKT_ORIG_ARPT_NM":"ABE",
"MKT_ITIN_GEO_TYPE":"2",
"MKT_PASSENGERS":"1",
"MKT_ID":"201724501",
"MKT_TKT_CARR_NM":"DL"
},
{
"MKT_DISTANCE":"1566",
"MKT_BULK_FARE":"0",
"MKT_NO_OF_CPNS":"2",
"MKT_DEST_STATE_NM":"Pennsylvania",
"MKT_OP_CARR_GRP":"DL:DL",
"MKT_TK_CARR_GRP":"DL:DL",
"MKT_MILES_FLOWN":"1566",
"MKT_AIRPORT_GROUP":"SAT:ATL:ABE",
"MKT_FARE_AMT":"393.5",
"MKT_ORIG_STATE_NM":"Texas",
"MKT_DEST_ARPT_CITY_NM":"30135",
"MKT_RPTG_CARR_NM":"9E",
"MKT_DEST":"ABE",
"MKT_DEST_CNTRY":"US",
"MKT_ORIG_CNTRY":"US",
"Coupon":[
{
"CPN_STATE_NM":"Georgia",
"CPN_DEST":"ATL",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"",
"CPN_MKT_ORIG_ARPT_NM":"14683",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"30397",
"CPN_DISTANCE":"874",
"SEQ_NUM":"3",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Texas",
"CPN_OPERG_CARR_NM":"DL",
"CPN_ORIG":"SAT",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"10397",
"CPN_MKT_ORIG_CITY_NM":"33214",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724503",
"CPN_ORIG_CNTRY":"US"
},
{
"CPN_STATE_NM":"Pennsylvania",
"CPN_DEST":"ABE",
"CPN_TKT_CARR_NM":"DL",
"TRIP_BREAK":"X",
"CPN_MKT_ORIG_ARPT_NM":"10397",
"CLASS_OF_SVC":"X",
"CPN_TKT_NBR":"2017245",
"CPN_DEST_CITY_MKT_NM":"30135",
"CPN_DISTANCE":"692",
"SEQ_NUM":"4",
"ITIN_GEO_TYPE":"2",
"CPN_RPTG_CARR_NM":"9E",
"COUPON_GEO_TYPE":"2",
"CPN_ORIG_STATE_NM":"Georgia",
"CPN_OPERG_CARR_NM":"DL",
"CPN_ORIG":"ATL",
"CPN_PASSENGERS":"1",
"COUPON_TYPE":"A",
"CPN_DEST_ARPT_NM":"10135",
"CPN_MKT_ORIG_CITY_NM":"30397",
"CPN_DEST_CNTRY":"US",
"CPN_MKT_ID":"201724503",
"CPN_ORIG_CNTRY":"US"
}
],
"MKT_ITIN_ID":"2017245",
"MKT_OPERG_CARR_NM":"DL",
"MKT_DEST_ARPT_NM":"10135",
"MKT_ORIG_ARPT_NM":"SAT",
"MKT_ITIN_GEO_TYPE":"2",
"MKT_PASSENGERS":"1",
"MKT_ID":"201724503",
"MKT_TKT_CARR_NM":"DL"
}
],
"NO_OF_CPNS":"4",
"ORIGIN_COUNTRY":"US",
"ITIN_ID":"2017245",
"PASSENGERS":"1",
"MILES_FLOWN":"3132"
}
You can use the from_json() helper function within the select() Dataset API call to decode a JSON string's attributes and values into DataFrame columns, as dictated by a schema.
For example, given the following JSON { "reporting_carrier": "A", "market": { "value": 10 } }, stored in rawJsonDf:
// Typed view of one input row: the carrier code plus the market payload kept as a raw JSON string.
case class MarketData(reporting_carrier: String, market_json: String)

// Schema of the JSON held in market_json; from_json uses it to decode the string into a struct.
val jsonSchema = new StructType()
  .add("value", LongType)

// Rename the columns so they match the MarketData fields, and keep the resulting typed Dataset:
// the query below must run against the renamed columns, not against the original rawJsonDf.
val marketDs = rawJsonDf
  .toDF("reporting_carrier", "market_json")
  .as[MarketData]

// Decode market_json into a struct column named "market", then filter on its nested "value" field.
marketDs
  .select(from_json($"market_json", jsonSchema) as "market")
  .filter($"market.value" > 5)
see this great tutorial by databricks for more info.
Related
I'm trying to get the data from a JSON in BigQuery. This JSON is Stored in a one-column table.
So far, I've been able to get only the "variables" array, with the following:
Select JSON_QUERY_ARRAY(Column1, '$.sessions[0].variables') FROM Table
How can I get the other values/arrays (sessionMessage and events)? I can't make it work..
I've tried with:
JSON_VALUE(Column1, '$.sessions[0].conversation')
JSON_QUERY_ARRAY(Column1, '$.sessions[0].sessionMessages')
But I get only empty values (the original JSON has values inside these arrays).
{
"fromDate":"2020-04-10T23:47:17.161Z",
"pageRows":151,
"sessions":[
{
"variables":[],
"sessionDate":"2020-04-10T23:47:17.161Z",
"botMessages":2,
"userHasTalked":"true",
"topics":[
"TOPIC1"
],
"sessionId":"WXXXSXSXSXXXQ_2020-01-00T23:47:17.161Z",
"platformContactId":"XXXXXXX-XXXXXXX-XXXXXXXXXXXXXX",
"sessionMessages":[.....],
"queues":[
"QUEUE1",
"QUEUE2"
],
"customerId":"SSDSDS",
"userMessages":2,
"operatorMessages":1,
"sessionMessagesQty":2,
"sessionStartingCause":"Organic",
"channelId":"IDCHANEL",
"conversation":"https://url.com",
"events":[.....]
}
],
"toDate":"2020-04-10T23:47:17.161Z",
"hasMore":true,
"pageToken":"XXXXXXXXXXXXXX"
}
There is nothing wrong with the functions and JSONPath expressions that you used, but your sample JSON contains some unexpected content, such as [.....]. After removing or replacing those placeholders, the query below works fine:
-- Inline sample document (placeholders replaced with valid JSON values), followed by
-- a scalar extraction and an array extraction from the first session.
WITH a as (select
"""
{
"fromDate":"2020-04-10T23:47:17.161Z",
"pageRows":151,
"sessions":[
{
"variables":[],
"sessionDate":"2020-04-10T23:47:17.161Z",
"botMessages":2,
"userHasTalked":"true",
"topics":[
"TOPIC1"
],
"sessionId":"WXXXSXSXSXXXQ_2020-01-00T23:47:17.161Z",
"platformContactId":"XXXXXXX-XXXXXXX-XXXXXXXXXXXXXX",
"sessionMessages":[1,2,3],
"queues":[
"QUEUE1",
"QUEUE2"
],
"customerId":"SSDSDS",
"userMessages":2,
"operatorMessages":1,
"sessionMessagesQty":2,
"sessionStartingCause":"Organic",
"channelId":"IDCHANEL",
"conversation":"https://url.com",
"events":[]
}
],
"toDate":"2020-04-10T23:47:17.161Z",
"hasMore":true,
"pageToken":"XXXXXXXXXXXXXX"
}
""" data)
SELECT JSON_VALUE(data, '$.sessions[0].conversation'),
JSON_QUERY_ARRAY(data, '$.sessions[0].sessionMessages')
FROM a;
I would like to fetch only some of the fields from an array of objects in a Couchbase document, but so far I am only able to fetch the whole array of objects.
i tried to fetch the array using the following query, 'select countryDetails from test';
{
"type":"countries",
"docName":"CountryData",
"countryDetails":[
{
"name":"US",
"code":"+1",
"stateInfo":[
{
"name":"Florida",
"id":"1212"
},
{
"name":"NewYork",
"id":"1214"
}
]
},
{
"name":"France",
"code":"+33",
"stateInfo":[
{
"name":"Grand Est",
"id":"5212"
},
{
"name":"Brittany",
"id":"5214"
}
]
}
]
}
i tried fetching array using, select countryDetails from test;
I would like to fetch the result as [ {"name" : "US", "code" : "+1" }, {"name" : "France", "code" : "+33"}]
If you project countryDetails it projects whole sub object.
If you need only part of the sub-object, you need to project that part explicitly.
The following ARRAY construction will provide the data representation you are expecting.
/* Build a new array holding only the name and code of each countryDetails entry. */
SELECT ARRAY {v.name, v.code} FOR v IN t.countryDetails END AS countryDetails
FROM test AS t
WHERE t.type = "countries";
What you are trying to do does not seem to be possible. You can get closer to what you want with a query like this:
select raw countryDetails from test
But the results of this query still have the result wrapped in an extra level of array.
I have a JSON type column called "person" and the data stored in it is in the format
{
"clients":{
"nbr":"2",
"info":[
{
"nom":"Baptiste",
"genre":"male",
"age":"48"
},
{
"nom":"Lisa",
"genre":"female",
"age":"29"
}
]
}
}
I want to retrieve the names of clients.
You may use json_array_elements
-- One output row per element of clients.info, exposing that element's "nom" field as text.
SELECT elem ->> 'nom' AS name
FROM t
CROSS JOIN LATERAL json_array_elements(t.person -> 'clients' -> 'info') AS info(elem);
I am having a problem with reading json data. I have tried a few methods but came up short. Any help is welcome. Here is the code:
Its corrected with the whole file now.
"gallery": {
"106x100": [
"106x100-0.jpeg",
"106x100-1.jpeg",
"106x100-2.jpeg",
"106x100-3.jpeg",
"106x100-4.jpeg",
"106x100-5.jpeg",
"106x100-6.jpeg",
"106x100-7.jpeg",
"106x100-8.jpeg",
"106x100-9.jpeg",
"106x100-10.jpeg",
"106x100-11.jpeg",
"106x100-12.jpeg",
"106x100-13.jpeg",
"106x100-14.jpeg",
"106x100-15.jpeg",
"106x100-16.jpeg"
],
"190x100": [
"190x100-0.jpeg",
"190x100-1.jpeg",
"190x100-2.jpeg",
"190x100-3.jpeg",
"190x100-4.jpeg",
"190x100-5.jpeg",
"190x100-6.jpeg",
"190x100-7.jpeg",
"190x100-8.jpeg",
"190x100-9.jpeg",
"190x100-10.jpeg",
"190x100-11.jpeg",
"190x100-12.jpeg",
"190x100-13.jpeg",
"190x100-14.jpeg",
"190x100-15.jpeg",
"190x100-16.jpeg"
]
},
},
Your json's format is wrong.
Here is the correct format:
{"190x100": [
"190x100-0.jpeg",
"190x100-1.jpeg",
"190x100-2.jpeg",
"190x100-3.jpeg",
"190x100-4.jpeg",
"190x100-5.jpeg",
"190x100-6.jpeg",
"190x100-7.jpeg",
"190x100-8.jpeg",
"190x100-9.jpeg",
"190x100-10.jpeg",
"190x100-11.jpeg",
"190x100-12.jpeg",
"190x100-13.jpeg",
"190x100-14.jpeg",
"190x100-15.jpeg",
"190x100-16.jpeg"
]}
PS: I read that you are using Java. You can then try this:
// Wrap the incoming fragment in braces so it parses as one JSON object
// (pass json here if you are getting it in that format).
JSONObject root = new JSONObject("{" + json + "}");
// Pull out the nested "gallery" object and print its JSON text.
JSONObject gallery = root.getJSONObject("gallery");
System.out.println(gallery.toString());
Again the format is wrong. Here is the format:
{"gallery": {
"106x100": [
"106x100-0.jpeg",
"106x100-1.jpeg",
"106x100-2.jpeg",
"106x100-3.jpeg",
"106x100-4.jpeg",
"106x100-5.jpeg",
"106x100-6.jpeg",
"106x100-7.jpeg",
"106x100-8.jpeg",
"106x100-9.jpeg",
"106x100-10.jpeg",
"106x100-11.jpeg",
"106x100-12.jpeg",
"106x100-13.jpeg",
"106x100-14.jpeg",
"106x100-15.jpeg",
"106x100-16.jpeg"
],
"190x100": [
"190x100-0.jpeg",
"190x100-1.jpeg",
"190x100-2.jpeg",
"190x100-3.jpeg",
"190x100-4.jpeg",
"190x100-5.jpeg",
"190x100-6.jpeg",
"190x100-7.jpeg",
"190x100-8.jpeg",
"190x100-9.jpeg",
"190x100-10.jpeg",
"190x100-11.jpeg",
"190x100-12.jpeg",
"190x100-13.jpeg",
"190x100-14.jpeg",
"190x100-15.jpeg",
"190x100-16.jpeg"
]
}
}
I have a raw json in following format-
"luns": [
{
"numReadBlocks": 15444876,
"numWriteBlocks": 13530714,
"blockSizeInBytes": 512,
"writeIops": 495344,
"readIops": 312702,
"serialNumber": "aaaaaaa",
"uuid": "id",
"shareState": "none",
"usedBytes": 6721716224,
"totalSizeBytes": 16106127360,
"path": "/vol/lun_23052014_025830_vol/lun_23052014_025830"
},
{
"numReadBlocks": 15444876,
"numWriteBlocks": 13530714,
"blockSizeInBytes": 512,
"writeIops": 495344,
"readIops": 312702,
"serialNumber": "aaaaaaa",
"uuid": "id",
"shareState": "none",
"usedBytes": 6721716224,
"totalSizeBytes": 16106127360,
"path": "/vol/lun_23052014_025830_vol/lun_23052014_025830"
}]
The luns array may contain multiple entries.
I want to process above json and form output as following-
"topStorageLuns": [
{
"name": "Free (in GB)",
"data": [7.79,7.79]
},
{
"name": "Used (in GB)",
"data": [7.21,7.21]
}]
I tried following in order to get output-
// Asker's attempt: for each element of storageLuns, sum the LUN byte counts and emit one JSON object.
val storageLuns = myRawJson
val topStorageLuns = storageLuns.map { storageLun =>
// Sums usedBytes over this element's luns, starting from 0.0.
// NOTE(review): if usedBytes is empty, the 0.0 fallback stringifies to "0.0", which toLong
// cannot parse — confirm the field's type and the intended fallback.
val totalLunsSizeOnStorageDevice = storageLun.luns.foldLeft(0.0) {
case (totalBytesOnDevice, lun) =>
totalBytesOnDevice + lun.usedBytes.getOrElse(0.0).toString.toLong
}
// NOTE(review): despite its name, this fold also reads usedBytes, exactly like the one above,
// and its result is not used in the Json.obj call below.
val totalAvailableLunsOnStorageDevice = storageLun.luns.foldLeft(0.0) {
case (totalBytesOnDevice, lun) =>
totalBytesOnDevice + lun.usedBytes.getOrElse(0.0).toString.toLong
}
// Converts the used-bytes total to GB (divides by 1024^3), rounded to two decimals.
// "data" is a single number here, and "name" is the element's hostId.
Json.obj("name" -> storageLun.hostId, "data" -> "%.2f".format(totalLunsSizeOnStorageDevice / (1024 * 1024 * 1024)).toDouble)
}
Can anybody help me to get desired output please???
The key lesson I want to impart is that your algorithm should reflect the shape of the output you want. Work backward from the result you want to build the algorithm.
It looks to me like you want to create an array of length 2, where each entry has a corresponding algorithm (space used, space free). Within each of these elements, you want a nested array with an element for each item in your input array, calculated using the algorithm from the outer array. Here's how I would approach the problem:
1) Define your algorithms
// Algorithm paired with the "Free (in GB)" label: folds the given sequence into one Double, starting from 0.0.
val dfAlgorithm: (Seq[(String, JsValue)] => Double) = _.foldLeft(0.0) { (acc, item) =>
/* whatever logic you need to do */
// NOTE(review): placeholder body — the lambda still has to return the updated accumulator (a Double).
}
// Algorithm paired with the "Used (in GB)" label: same shape as above, starting from 0.0.
val duAlgorithm: (Seq[(String, JsValue)] => Double) = _.foldLeft(0.0) { (acc, item) =>
/* whatever logic you need to do */
// NOTE(review): placeholder body — the lambda still has to return the updated accumulator (a Double).
}
2) Create a data structure to map over to build your final output
val stats = Seq("Free (in GB)" -> dfAlgorithm, "Used (in GB)" -> duAlgorithm)
3) Map over your input data within your mapping over your algorithms (the logic here reflects the shape of the result you want)
// The outer map walks the (name, algorithm) pairs, producing one JSON object per statistic;
// the inner map applies that statistic's algorithm to every element of storageLuns to build "data".
stats.map { case (name, algorithm) =>
  Json.obj("name" -> name, "data" -> storageLuns.map { storageLun => algorithm(storageLun) })
}
This isn't going to be a turnkey solution, since I don't know how your free/used algorithms are supposed to work, but this overall scheme should get you there.