How to get specific values from 2 CSVs in Groovy

Please help with parsing CSV to JSON from 2 files in Groovy.
I have a 1st CSV like this (line numbers may differ each time):
testKey,status
Name001,PASS
Name002,PASS
Name003,FAIL
CSV2 (a list of all test keys, but with different key names):
Kt,Pd
PT-01,Name007
PT-02,Name001
PT-03,Name003
PT-05,Name002
PT-06,Name004
PT-07,Name006
In the result I need to match exactly on the test key (CSV1's testKey equals CSV2's Pd) and output the corresponding Kt as the testKey.
Something like this:
{
"testExecutionKey": "DEMO-303",
"info": {
"user": "admin"
},
"tests": [
{
"testKey": "PT-02",
"status": "PASS"
},
{
"testKey": "PT-05",
"status": "PASS"
},
{
"testKey": "PT-03",
"status": "FAIL"
}
]
}
This code only pairs lines positionally, without actually matching on testKey:
import groovy.json.JsonBuilder

File csv1 = new File( 'one.csv')
File csv2 = new File( 'two.csv')
def lines1 = csv1.readLines()
def lines2 = csv2.readLines()
assert lines1.size() <= lines2.size()
fieldSep = /,[ ]*/
def fieldNames1 = lines1[0].split( fieldSep )
def fieldNames2 = lines1[0].split( fieldSep )
def testList = []
lines1[1..-1].eachWithIndex { csv1Line, lineNo ->
    def mappedLine = [:]
    def fieldsCsv1 = csv1Line.split( fieldSep )
    fieldsCsv1[1..-1].eachWithIndex { value, fldNo ->
        String name = fieldNames1[ fldNo + 1 ]
        mappedLine[ name ] = value
    }
    def fieldsCsv2 = lines2[lineNo + 1].split( fieldSep )
    fieldsCsv2[0..-2].eachWithIndex { value, fldNo ->
        String name = fieldNames2[ fldNo ]
        mappedLine[ name ] = value
    }
    testList << mappedLine
}
def builder = new JsonBuilder()
def root = builder {
    testExecutionKey 'DEMO-303'
    info user: 'admin'
    tests testList
}
println builder.toPrettyString()

You need to bind CSV2 to a Map, and then use it to replace values from CSV1, like so:
import groovy.json.*
def csv1 = '''
testKey,status
Name001,PASS
Name002,PASS
Name003,FAIL
Name999,FAIL
'''.trim()
def csv2 = '''
Kt,Pd
PT-01,Name007
PT-02,Name001
PT-03,Name003
PT-05,Name002
PT-06,Name004
PT-07,Name006
'''.trim()
boolean skip1st = false
def testMap2 = [:]
// parse and bind the 2nd CSV to a Map (Pd -> Kt)
csv2.splitEachLine( /\s*,\s*/ ){
    skip1st ? ( testMap2[ it[ 1 ] ] = it[ 0 ] ) : ( skip1st = true )
}
def keys
def testList = []
csv1.splitEachLine( /\s*,\s*/ ){ parts ->
    if( !keys )
        keys = parts*.trim()
    else{
        def test = [:]
        parts.eachWithIndex{ val, ix -> test[ keys[ ix ] ] = val }
        // check if testKey is present in csv2
        if( testMap2[ test.testKey ] ){
            test.testKey = testMap2[ test.testKey ] // replace the value from CSV2
            testList << test
        }
    }
}
def builder = new JsonBuilder()
def root = builder {
    testExecutionKey 'DEMO-303'
    info user: 'admin'
    tests testList
}
println builder.toPrettyString()
gives:
{
"testExecutionKey": "DEMO-303",
"info": {
"user": "admin"
},
"tests": [
{
"testKey": "PT-02",
"status": "PASS"
},
{
"testKey": "PT-05",
"status": "PASS"
},
{
"testKey": "PT-03",
"status": "FAIL"
}
]
}


Convert string to dict in python, but string dict has no quotes and = instead of :

I have dictionaries as strings. In the string, the keys and values have no quotes, and instead of having key-value pairs in the usual format (key:value), I have them as key=value. An example of such a string is below:
{created={type=FREEMIUM, title={value=Drool is love, drool is live..., _iscolumngrouppresent=true}, content={value=null, _iscolumngrouppresent=false}, status=PROCESSING, tags=[], attachments=[{payload_0={video_id=null, image_id=2efbff31-a0a6-4f4c-a163-667c4aabd111}}], visible_at={value=null, _iscolumngrouppresent=false}, author_user_id=8cfdf75d-5816-42f8-906f-8b203bb2c99f, _iscolumngrouppresent=true}, _iscolumngrouppresent=true}
Which I would like to convert to:
{
    "created": {
        "type": "FREEMIUM",
        "title": {
            "value": "Drool is love, drool is live...",
            "_iscolumngrouppresent": True
        },
        "content": {
            "value": None,
            "_iscolumngrouppresent": False
        },
        "status": "PROCESSING",
        "tags": [],
        "attachments": [
            {
                "payload_0": {
                    "video_id": None,
                    "image_id": "2efbff31-a0a6-4f4c-a163-667c4aabd111"
                }
            }
        ],
        "visible_at": {
            "value": None,
            "_iscolumngrouppresent": False
        },
        "author_user_id": "8cfdf75d-5816-42f8-906f-8b203bb2c99f",
        "_iscolumngrouppresent": True
    },
    "_iscolumngrouppresent": True
}
I tried to parse the string myself, but there are cases where it fails, mainly when the value of a key=value pair is a string with commas in it (",").
Is there tooling already out there that I can use? Any ideas are more than welcome.
Thanks in advance.
This parser works for your example:
class ParseError(Exception):
    pass


class Parser:
    def __init__(self, data):
        self.data = data
        self.position = 0

    def parse(self):
        return self.parse_dict()

    def parse_dict(self):
        self.consume('{')
        if self.current() == '}':
            self.next()
            return dict()
        key_values = self.parse_key_value_list()
        self.consume('}')
        return dict(key_values)

    def parse_key_value_list(self):
        res = []
        while True:
            res.append(self.parse_key_value())
            if not self.consume_comma_separator():
                break
        return res

    def consume_comma_separator(self):
        if self.current() != ',':
            return False
        self.consume(',')
        self.consume_whitespace()
        return True

    def parse_key_value(self):
        name = self.parse_name()
        self.consume('=')
        value = self.parse_value()
        return (name, value)

    def parse_name(self):
        if not self.current().isalpha() and self.current() != '_':
            self.error(f"Expected [a-zA-Z] or _ but found {self.current()}")
        res = self.current()
        self.next()
        res += self.accept_while(lambda c: c.isalpha() or c.isdigit() or c == '_')
        return res

    def parse_value(self):
        match self.current():
            case '[':
                return self.parse_array()
            case '{':
                return self.parse_dict()
            case 'n':
                return self.parse_null()
            case 't' | 'f':
                return self.parse_boolean()
            # case c if c.isdigit():
            #     return self.parse_integer()
            case _:
                return self.parse_string()

    def parse_array(self):
        self.consume('[')
        if self.current() == ']':
            self.next()
            return []
        res = []
        while True:
            res.append(self.parse_value())
            if not self.consume_comma_separator():
                break
        self.consume(']')
        return res

    def parse_null(self):
        self.consume("null")
        return None

    def parse_boolean(self):
        match self.current():
            case 't':
                self.consume('true')
                return True
            case 'f':
                self.consume('false')
                return False
            case _:
                self.error(f"Unexpected character {self.current()}")

    def parse_string(self):
        # Scan ahead to the next '=', ']' or '}'. If we hit an '=', we have
        # run into the next key, so back up to the preceding comma.
        pos = self.position
        while self.data[pos] not in '=]}':
            pos += 1
        if self.data[pos] == '=':
            while self.data[pos] != ',':
                pos -= 1
        res = self.data[self.position:pos]
        self.position = pos
        return res
        # return self.accept_while(lambda c: c not in '=]}')

    def consume(self, string):
        for c in string:
            if self.current() != c:
                self.error(f"Expected '{c}'; found '{self.current()}'")
            self.next()

    def consume_whitespace(self):
        while self.current().isspace():
            self.next()

    def next(self) -> None:
        self.position += 1

    def current(self) -> str:
        return self.data[self.position]

    def accept_while(self, predicate) -> str:
        res = ""
        while self.position < len(self.data):
            if not predicate(self.current()):
                break
            res += self.current()
            self.next()
        return res

    def error(self, msg):
        raise ParseError(f"{msg} [position: {self.position}]")


def parse(data):
    return Parser(data).parse_dict()


test_input = r"{created={type=FREEMIUM, title={value=Drool is love, drool is live..., _iscolumngrouppresent=true}, content={value=null, _iscolumngrouppresent=false}, status=PROCESSING, tags=[], attachments=[{payload_0={video_id=null, image_id=2efbff31-a0a6-4f4c-a163-667c4aabd111}}], visible_at={value=null, _iscolumngrouppresent=false}, author_user_id=8cfdf75d-5816-42f8-906f-8b203bb2c99f, _iscolumngrouppresent=true}, _iscolumngrouppresent=true}"

test_output = {
    "created": {
        "type": "FREEMIUM",
        "title": {
            "value": "Drool is love, drool is live...",
            "_iscolumngrouppresent": True
        },
        "content": {
            "value": None,
            "_iscolumngrouppresent": False
        },
        "status": "PROCESSING",
        "tags": [],
        "attachments": [
            {
                "payload_0": {
                    "video_id": None,
                    "image_id": "2efbff31-a0a6-4f4c-a163-667c4aabd111"
                }
            }
        ],
        "visible_at": {
            "value": None,
            "_iscolumngrouppresent": False
        },
        "author_user_id": "8cfdf75d-5816-42f8-906f-8b203bb2c99f",
        "_iscolumngrouppresent": True
    },
    "_iscolumngrouppresent": True
}


def test():
    assert parse(test_input) == test_output


if __name__ == '__main__':
    test()

Explode Deeply Nested JSON returning duplicates in Spark Scala

I have a utility which works fine for parsing simple JSONs, but it cross-joins when multiple array[struct] columns are present in the JSON.
I have also tried distinct() and dropDuplicates() to remove the duplicates caused by the cross join in the code, but that returns an empty DF.
def flattenDataFrame(df: DataFrame): DataFrame = {
  var flattenedDf: DataFrame = df
  if (isNested(df)) {
    val flattenedSchema: Array[(Column, Boolean)] = flattenSchema(df.schema)
    var simpleColumns: List[Column] = List.empty[Column]
    var complexColumns: List[Column] = List.empty[Column]
    flattenedSchema.foreach {
      case (col, isComplex) =>
        if (isComplex) {
          complexColumns = complexColumns :+ col
        } else {
          simpleColumns = simpleColumns :+ col
        }
    }
    var crossJoinedDataFrame = df.select(simpleColumns: _*)
    complexColumns.foreach(col => {
      crossJoinedDataFrame = crossJoinedDataFrame.crossJoin(df.select(col))
      crossJoinedDataFrame = flattenDataFrame(crossJoinedDataFrame)
    })
    crossJoinedDataFrame
  } else {
    flattenedDf
  }
}

private def flattenSchema(schema: StructType, prefix: String = null): Array[(Column, Boolean)] = {
  schema.fields.flatMap(field => {
    val columnName = if (prefix == null) field.name else prefix + "." + field.name
    field.dataType match {
      case arrayType: ArrayType =>
        Array[(Column, Boolean)]((explode_outer(col(columnName)).as(columnName.replace(".", "_")), true))
      case structType: StructType =>
        flattenSchema(structType, columnName)
      case _ =>
        val columnNameWithUnderscores = columnName.replace(".", "_")
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(columnNameWithUnderscores, metadata), false))
    }
  }).filter(field => field != None)
}

def isNested(df: DataFrame): Boolean = {
  df.schema.fields.flatMap(field => {
    field.dataType match {
      case arrayType: ArrayType => Array(true)
      case mapType: MapType => Array(true)
      case structType: StructType => Array(true)
      case _ => Array(false)
    }
  }).exists(b => b)
}
A sample JSON in which I am facing the issue:
[
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
},
{
"id": "0002",
"type": "donut",
"name": "Raised",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
]
A solution without joins and, more to the point, without the cross-join that is your problem:
Sorry for the formatting, can't really get it to format well for Stack Overflow.
import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.functions.{col, explode_outer}
import org.apache.spark.sql.types.{ArrayType, MapType, MetadataBuilder, StructType}

def flattenDataFrame(df: DataFrame): DataFrame = {
  val flattenedDf: DataFrame = df
  if (isNested(df)) {
    val flattenedSchema: Array[(Column, Boolean)] = flattenSchema(flattenedDf.schema)
    var simpleColumns: List[Column] = List.empty[Column]
    var complexColumns: List[Column] = List.empty[Column]
    flattenedSchema.foreach {
      case (col, isComplex) =>
        if (isComplex) {
          complexColumns = complexColumns :+ col
        } else {
          simpleColumns = simpleColumns :+ col
        }
    }
    val complexUnderlyingCols = complexColumns.map { column =>
      val name = column.expr.asInstanceOf[UnresolvedAttribute].name
      val unquotedColName = s"${name.replaceAll("`", "")}"
      val explodeSelectColName = s"`${name.replaceAll("`", "")}`"
      (unquotedColName, col(name).as(unquotedColName), explode_outer(col(explodeSelectColName)).as(unquotedColName))
    }
    var joinDataFrame = flattenedDf.select(simpleColumns ++ complexUnderlyingCols.map(_._2): _*)
    complexUnderlyingCols.foreach { case (name, tempCol, column) =>
      val nonTransformedColumns = joinDataFrame.schema.fieldNames.diff(List(name)).map(fieldName => s"`${fieldName.replaceAll("`", "")}`").map(col)
      joinDataFrame = joinDataFrame.select(nonTransformedColumns :+ column: _*)
    }
    flattenDataFrame(joinDataFrame)
  } else {
    flattenedDf
  }
}

private def flattenSchema(schema: StructType, prefix: String = null, level: Int = 0): Array[(Column, Boolean)] = {
  val unquotedPrefix = if (prefix != null) prefix.replace("`", "") else null
  println(level)
  schema.fields.flatMap(field => {
    val fieldName = field.name
    val columnName = if (level == 0) {
      s"$fieldName"
    } else {
      val fullName = s"$unquotedPrefix.$fieldName"
      val x = fullName.split('.').reverse.zipWithIndex.reverse.foldLeft(new StringBuilder("")) { case (builder, (fieldPart, index)) =>
        if (index > level) {
          builder.append(s".$fieldPart")
        } else if (index == level) {
          builder.append(s".$fieldPart")
        } else {
          builder.append(s".$fieldPart")
        }
      }
      x.replace(1, 2, "").toString()
    }
    val unquotedColumnName = columnName.replace("`", "")
    field.dataType match {
      case _: ArrayType =>
        // We pass only the column, as the explode function is generated while expanding the DF
        Array[(Column, Boolean)]((col(columnName), true))
      case structType: StructType =>
        flattenSchema(structType, columnName, level + 1)
      case _ =>
        val metadata = new MetadataBuilder().putString("encoding", "ZSTD").build()
        Array((col(columnName).as(unquotedColumnName, metadata), false))
    }
  })
}

def isNested(df: DataFrame): Boolean = {
  df.schema.fields.flatMap(field => {
    field.dataType match {
      case _: ArrayType => Array(true)
      case _: MapType => Array(true)
      case _: StructType => Array(true)
      case _ => Array(false)
    }
  }).exists(b => b)
}
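For completeness, a minimal driver for trying this out; the file name donuts.json is an assumption (save the sample JSON above under that name), and the multiLine option is needed because each record in the sample spans several lines:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("flatten-demo").master("local[*]").getOrCreate()

// Read the multi-line JSON array from the question into a DataFrame
val df = spark.read.option("multiLine", value = true).json("donuts.json")

// flattenDataFrame is the function defined above
flattenDataFrame(df).show(truncate = false)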

How to find the difference/mismatch between two JSON files?

I have two JSON files; one is the expected JSON and the other one is the result of a GET API call. I need to compare them and find out the mismatches in the file.
Expected Json:
{
"array": [
1,
2,
3
],
"boolean": true,
"null": null,
"number": 123,
"object": {
"a": "b",
"c": "d",
"e": "f"
},
"string": "Hello World"
}
Actual Json response:
{
"array": [
1,
2,
3
],
"boolean": true,
"null": null,
"number": 456,
"object": {
"a": "b",
"c": "d",
"e": "f"
},
"string": "India"
}
Actually there are two mismatches: the number received is 456 and the string is India.
Is there a way to compare and get these two mismatches as results?
This needs to be implemented in Gatling/Scala.
You can use, for example, the play-json library and recursively traverse both JSONs. For the following input (a bit more sophisticated than your input):
LEFT:
{
"array" : [ 1, 2, 4 ],
"boolean" : true,
"null" : null,
"number" : 123,
"object" : {
"a" : "b",
"c" : "d",
"e" : "f"
},
"string" : "Hello World",
"absent-in-right" : true,
"different-types" : 123
}
RIGHT:
{
"array" : [ 1, 2, 3 ],
"boolean" : true,
"null" : null,
"number" : 456,
"object" : {
"a" : "b",
"c" : "d",
"e" : "ff"
},
"string" : "India",
"absent-in-left" : true,
"different-types" : "YES"
}
It produces this output:
Next fields are absent in LEFT:
*\absent-in-left
Next fields are absent in RIGHT:
*\absent-in-right
'*\array\(2)' => 4 != 3
'*\number' => 123 != 456
'*\object\e' => f != ff
'*\string' => Hello World != India
Cannot compare JsNumber and JsString in '*\different-types'
Code:
import play.api.libs.json._

val left = Json.parse("""{"array":[1,2,4],"boolean":true,"null":null,"number":123,"object":{"a":"b","c":"d","e":"f"},"string":"Hello World","absent-in-right":true,"different-types":123}""").asInstanceOf[JsObject]
val right = Json.parse("""{"array":[1,2,3],"boolean":true,"null":null,"number":456,"object":{"a":"b","c":"d","e":"ff"},"string":"India","absent-in-left":true,"different-types":"YES"}""").asInstanceOf[JsObject]

// '*' - for the root node
showJsDiff(left, right, "*", Seq.empty[String])

def showJsDiff(left: JsValue, right: JsValue, parent: String, path: Seq[String]): Unit = {
  val newPath = path :+ parent
  if (left.getClass != right.getClass) {
    println(s"Cannot compare ${left.getClass.getSimpleName} and ${right.getClass.getSimpleName} " +
      s"in '${getPath(newPath)}'")
  }
  else {
    left match {
      // Primitive types are pretty easy to handle
      case JsNull => logIfNotEqual(JsNull, right.asInstanceOf[JsNull.type], newPath)
      case JsBoolean(value) => logIfNotEqual(value, right.asInstanceOf[JsBoolean].value, newPath)
      case JsNumber(value) => logIfNotEqual(value, right.asInstanceOf[JsNumber].value, newPath)
      case JsString(value) => logIfNotEqual(value, right.asInstanceOf[JsString].value, newPath)
      case JsArray(value) =>
        // For arrays we have to call showJsDiff on each element
        val arr1 = value
        val arr2 = right.asInstanceOf[JsArray].value
        if (arr1.length != arr2.length) {
          println(s"Arrays in '${getPath(newPath)}' have different length. ${arr1.length} != ${arr2.length}")
        }
        else {
          arr1.indices.foreach { idx =>
            showJsDiff(arr1(idx), arr2(idx), s"($idx)", newPath)
          }
        }
      case JsObject(value) =>
        val leftFields = value.keys.toSeq
        val rightJsObject = right.asInstanceOf[JsObject]
        val rightFields = rightJsObject.fields.map { case (name, value) => name }
        val absentInLeft = rightFields.diff(leftFields)
        if (absentInLeft.nonEmpty) {
          println("Next fields are absent in LEFT: ")
          absentInLeft.foreach { fieldName =>
            println(s"\t ${getPath(newPath :+ fieldName)}")
          }
        }
        val absentInRight = leftFields.diff(rightFields)
        if (absentInRight.nonEmpty) {
          println("Next fields are absent in RIGHT: ")
          absentInRight.foreach { fieldName =>
            println(s"\t ${getPath(newPath :+ fieldName)}")
          }
        }
        // For common fields we have to call showJsDiff on them
        val commonFields = leftFields.intersect(rightFields)
        commonFields.foreach { field =>
          showJsDiff(value(field), rightJsObject(field), field, newPath)
        }
    }
  }
}

def logIfNotEqual[T](left: T, right: T, path: Seq[String]): Unit = {
  if (left != right) {
    println(s"'${getPath(path)}' => $left != $right")
  }
}

def getPath(path: Seq[String]): String = path.mkString("\\")
Use diffson - a Scala implementation of RFC-6901 and RFC-6902: https://github.com/gnieh/diffson
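A minimal sketch of the diffson route; this assumes the older gnieh.diffson play-json binding, and the exact API differs between diffson versions, so treat the names here as assumptions and check the project README:

import gnieh.diffson.playJson._
import play.api.libs.json.Json

val expected = Json.parse("""{"number":123,"string":"Hello World"}""")
val actual = Json.parse("""{"number":456,"string":"India"}""")

// Produces an RFC-6902 JSON Patch listing the operations needed to turn expected into actual
val patch = JsonDiff.diff(expected, actual, remember = false)
println(patch)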
json4s has a handy diff function described here: https://github.com/json4s/json4s (search for Merging & Diffing) and API doc here: https://static.javadoc.io/org.json4s/json4s-core_2.9.1/3.0.0/org/json4s/Diff.html
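A short sketch of the json4s route applied to the question's two documents; the Diff extractor is the one described in the json4s README:

import org.json4s._
import org.json4s.jackson.JsonMethods._

val expected = parse("""{"number":123,"string":"Hello World"}""")
val actual = parse("""{"number":456,"string":"India"}""")

// diff returns the changed, added and deleted parts as JValues;
// here `changed` should hold number -> 456 and string -> "India"
val Diff(changed, added, deleted) = expected diff actual
println(changed)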
This is a slightly modified version of Artavazd's answer (which is amazing btw, thank you so much!). This version collects the differences into a convenient object instead of only logging them.
import play.api.Logger
import play.api.libs.json.{JsArray, JsBoolean, JsNull, JsNumber, JsObject, JsString, JsValue, Json, OFormat}

case class JsDifferences(
  differences: List[JsDifference] = List()
)

object JsDifferences {
  implicit val format: OFormat[JsDifferences] = Json.format[JsDifferences]
}

case class JsDifference(
  key: String,
  path: Seq[String],
  oldValue: Option[String],
  newValue: Option[String]
)

object JsDifference {
  implicit val format: OFormat[JsDifference] = Json.format[JsDifference]
}

object JsonUtils {
  val logger: Logger = Logger(this.getClass)

  def findDiff(left: JsValue, right: JsValue, parent: String = "*", path: List[String] = List()): JsDifferences = {
    val newPath = path :+ parent
    if (left.getClass != right.getClass) {
      logger.debug(s"Cannot compare ${left.getClass.getSimpleName} and ${right.getClass.getSimpleName} in '${getPath(newPath)}'")
      JsDifferences()
    } else left match {
      case JsNull => logIfNotEqual(JsNull, right.asInstanceOf[JsNull.type], newPath)
      case JsBoolean(value) => logIfNotEqual(value, right.asInstanceOf[JsBoolean].value, newPath)
      case JsNumber(value) => logIfNotEqual(value, right.asInstanceOf[JsNumber].value, newPath)
      case JsString(value) => logIfNotEqual(value, right.asInstanceOf[JsString].value, newPath)
      case JsArray(value) =>
        val arr1 = value
        val arr2 = right.asInstanceOf[JsArray].value
        if (arr1.length != arr2.length) {
          logger.debug(s"Arrays in '${getPath(newPath)}' have different length. ${arr1.length} != ${arr2.length}")
          JsDifferences()
        } else JsDifferences(arr1.indices.flatMap(idx => findDiff(arr1(idx), arr2(idx), s"($idx)", newPath).differences).toList)
      case leftJsObject: JsObject =>
        val leftFields = leftJsObject.keys.toSeq
        val rightJsObject = right.asInstanceOf[JsObject]
        val rightFields = rightJsObject.fields.map { case (name, value) => name }
        val keysAbsentInLeft = rightFields.diff(leftFields)
        val leftDifferences = keysAbsentInLeft.map(fieldName => JsDifference(
          key = fieldName, path = newPath :+ fieldName, oldValue = None, newValue = Some(rightJsObject(fieldName).toString)
        ))
        val keysAbsentInRight = leftFields.diff(rightFields)
        val rightDifferences = keysAbsentInRight.map(fieldName => JsDifference(
          key = fieldName, path = newPath :+ fieldName, oldValue = Some(leftJsObject(fieldName).toString), newValue = None
        ))
        val commonKeys = leftFields.intersect(rightFields)
        val commonDifferences = commonKeys.flatMap(field => findDiff(leftJsObject(field), rightJsObject(field), field, newPath).differences).toList
        JsDifferences((leftDifferences ++ rightDifferences ++ commonDifferences).toList)
    }
  }

  def logIfNotEqual[T](left: T, right: T, path: Seq[String]): JsDifferences = {
    if (left != right) {
      JsDifferences(List(JsDifference(
        key = path.last, path = path, oldValue = Some(left.toString), newValue = Some(right.toString)
      )))
    } else JsDifferences()
  }

  def getPath(path: Seq[String]): String = path.mkString("\\")
}
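Usage is then a one-liner, and because JsDifferences carries an OFormat the result can be rendered straight back to JSON (left and right being the two parsed documents from the earlier answer):

val differences = JsonUtils.findDiff(left, right)
println(Json.prettyPrint(Json.toJson(differences)))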

Spark: How to create an aggregate from RDD[Row] in Scala

How do I create a List/Map inside an RDD/DF so that I can get the aggregate?
I have a file where each row is a JSON object:
{
itemId :1122334,
language: [
{
name: [
"US", "FR"
],
value: [
"english", "french"
]
},
{
name: [
"IND"
],
value: [
"hindi"
]
}
],
country: [
{
US: [
{
startTime: 2016-06-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
],
CANADA: [
{
startTime: 2016-06-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
],
DENMARK: [
{
startTime: 2016-06-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
],
FRANCE: [
{
startTime: 2016-08-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
]
}
]
},
{
itemId :1122334,
language: [
{
name: [
"US", "FR"
],
value: [
"english", "french"
]
},
{
name: [
"IND"
],
value: [
"hindi"
]
}
],
country: [
{
US: [
{
startTime: 2016-06-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
],
CANADA: [
{
startTime: 2016-07-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
],
DENMARK: [
{
startTime: 2016-06-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
],
FRANCE: [
{
startTime: 2016-08-06T17:39:35.000Z,
endTime: 2016-07-28T07:00:00.000Z
}
]
}
]
}
I have a matching POJO which gets me the values from the JSON.
import com.mapping.data.model.MappingUtils
import com.mapping.data.model.CountryInfo

val mappingPath = "s3://.../"
val timeStamp = "2016-06-06T17:39:35.000Z"
val endTimeStamp = "2016-06-07T17:39:35.000Z"
val COUNTRY_US = "US"
val COUNTRY_CANADA = "CANADA"
val COUNTRY_DENMARK = "DENMARK"
val COUNTRY_FRANCE = "FRANCE"
val input = sc.textFile(mappingPath)
The input is a list of JSONs, one per line, which I map to the POJO class CountryInfo using MappingUtils, which takes care of JSON parsing and conversion:
val MappingsList = input.map(x => {
  val countryInfo = MappingUtils.getCountryInfoString(x);
  (countryInfo.getItemId(), countryInfo)
}).collectAsMap

MappingsList: scala.collection.Map[String,com.mapping.data.model.CountryInfo]

def showCountryInfo(x: Option[CountryInfo]) = x match {
  case Some(s) => s
}
But I need to create a DF/RDD so that I can get the aggregates of country and language based on itemId.
In the given example, if the country's start time is not earlier than "2016-06-07T17:39:35.000Z" then the value will be zero.
Which format would be better for creating the final aggregate JSON:
1. A List?
|-----itemId-------|----country-------------------|-----language---------------------|
| 1122334 | [US, CANADA,DENMARK] | [english,hindi,french] |
| 1122334 | [US,DENMARK] | [english] |
|------------------|------------------------------|----------------------------------|
2. A Map?
|-----itemId-------|----country---------------------------------|-----language---------------------|
| 1122334 | (US,2) (CANADA,1) (DENMARK,2) (FRANCE, 0) |(english,2) (hindi,1) (french,1) |
|.... |
|.... |
|.... |
|------------------|--------------------------------------------|----------------------------------|
I would like to create a final JSON which has the aggregate values, like:
{
itemId: "1122334",
country: {
"US" : 2,
"CANADA" : 1,
"DENMARK" : 2,
"FRANCE" : 0
},
language: {
"english" : 2,
"french" : 1,
"hindi" : 1
}
}
I tried a List:
val events = sqlContext.sql("select itemId from EventList")

val itemList = events.map(row => {
  val itemId = row.getAs[String](1);
  val countryInfo = showTitleInfo(MappingsList.get(itemId));
  val country = new ListBuffer[String]()
  country += if (countryInfo.getCountry().getUS().get(0).getStartTime() < endTimeStamp) COUNTRY_US;
  country += if (countryInfo.getCountry().getCANADA().get(0).getStartTime() < endTimeStamp) COUNTRY_CANADA;
  country += if (countryInfo.getCountry().getDENMARK().get(0).getStartTime() < endTimeStamp) COUNTRY_DENMARK;
  country += if (countryInfo.getCountry().getFRANCE().get(0).getStartTime() < endTimeStamp) COUNTRY_FRANCE;
  val languageList = new ListBuffer[String]()
  val language = countryInfo.getLanguages().collect.foreach(x => languageList += x.getValue());
  Row(itemId, country.toList, languageList.toList)
})

and a Map:
val itemList = events.map(row => {
  val itemId = row.getAs[String](1);
  val countryInfo = showTitleInfo(MappingsList.get(itemId));
  val country: Map[String, Int] = Map()
  country += if (countryInfo.getCountry().getUS().get(0).getStartTime() < endTimeStamp) ('COUNTRY_US' -> 1) else ('COUNTRY_US' -> 0)
  country += if (countryInfo.getCountry().getUS().get(0).getStartTime() < endTimeStamp) ('COUNTRY_CANADA' -> 1) else ('COUNTRY_CANADA' -> 0)
  country += if (countryInfo.getCountry().getUS().get(0).getStartTime() < endTimeStamp) ('COUNTRY_DENMARK' -> 1) else ('COUNTRY_DENMARK' -> 0)
  country += if (countryInfo.getCountry().getUS().get(0).getStartTime() < endTimeStamp) ('COUNTRY_FRANCE' -> 1) else ('COUNTRY_FRANCE' -> 0)
  val language: Map[String, Int] = Map()
  countryInfo.getLanguages().collect.foreach(x => language += (x.getValue -> 1));
  Row(itemId, country, language)
})
But both freeze in Zeppelin. Is there any better way to get the aggregates as JSON? Which is better for constructing the final aggregate, a List or a Map?
It would be helpful if you restated your question in terms of Spark DataFrame/Dataset and Row; I understand that you ultimately want to use JSON, but the details of the JSON input/output are a separate concern.
The function you are looking for is a Spark SQL aggregate function (see the group of them in the Spark SQL functions documentation). The functions collect_list and collect_set are related, but the function you need is not already implemented.
You can implement what I'll call count_by_value by deriving from org.apache.spark.sql.expressions.UserDefinedAggregateFunction. This will require some in-depth knowledge of how Spark SQL works.
Once count_by_value is implemented, you can use it like this:
df.groupBy("itemId").agg(count_by_value(df("country")), count_by_value(df("language")))
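For illustration, here is a minimal sketch of such a count_by_value against the (pre-Spark-3) UserDefinedAggregateFunction API. The class name and the map-typed buffer layout are my own choices, not an existing Spark function, and the sketch assumes a scalar string input column; array columns like the ones above would need an explode first:

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// Counts how often each distinct value occurs within a group,
// returning a Map[String, Long] per group.
class CountByValue extends UserDefinedAggregateFunction {
  def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil)
  def bufferSchema: StructType = StructType(StructField("counts", MapType(StringType, LongType)) :: Nil)
  def dataType: DataType = MapType(StringType, LongType)
  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit =
    buffer(0) = Map.empty[String, Long]

  def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    if (!input.isNullAt(0)) {
      val counts = buffer.getAs[Map[String, Long]](0)
      val key = input.getString(0)
      buffer(0) = counts + (key -> (counts.getOrElse(key, 0L) + 1L))
    }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val m1 = buffer1.getAs[Map[String, Long]](0)
    val m2 = buffer2.getAs[Map[String, Long]](0)
    buffer1(0) = m2.foldLeft(m1) { case (acc, (k, v)) =>
      acc + (k -> (acc.getOrElse(k, 0L) + v))
    }
  }

  def evaluate(buffer: Row): Any = buffer.getAs[Map[String, Long]](0)
}

val count_by_value = new CountByValue
// then used exactly as in the groupBy/agg line above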

Scala: Parsing JSON using the com.fasterxml.jackson library

I have written the following program to parse a JSON structure in a streaming fashion.
However, this looks very imperative. This is my latest attempt to write more idiomatic Scala code, but I am not there yet.
I am parsing the following JSON, using the Scala code that follows the JSON snippet. My goal is to shorten the code through the use of more idiomatic Scala structures.
Thanks in advance.
{
"type": "ImportantIncidentInfo",
"incidentTimestamp": "2014-05-15T10:09:27.989-05:00",
"numOfMatches": 4,
"myReport": {
"docReports": {
"part1/.": {
"path": [
"unknown"
],
"myAnalysis": {
"matches": [
{
"id": {
"major": 1,
"minor": 0
},
"name": "US SSN",
"position": 13,
"string": " 636-12-4567 "
},
{
"id": {
"major": 3,
"minor": 0
},
"name": "MasterCard Credit Card Number",
"position": 35,
"string": " 5424-1813-6924-3685 "
}
]
},
"cleanedUpData": [
{
"startPosition": 0,
"endPosition": 65,
"frameContent": ""
}
],
"minedMetadata": {
"Content-Encoding": "ISO-8859-1",
"Content-Type": "text/html; charset=iso-8859-1"
},
"deducedMetadata": {
"Content-Type": "text/html; iso-8859-1"
}
},
"part2/.": {
"path": [
"unknown"
],
"myAnalysis": {
"matches": [
{
"id": {
"major": 1,
"minor": 0
},
"name": "SSN",
"position": 3,
"string": " 636-12-4567\r"
},
{
"id": {
"major": 3,
"minor": 0
},
"name": "MasterCard Credit Card Number",
"position": 18,
"string": "\n5424-1813-6924-3685\r"
}
]
},
"cleanedUpData": [
{
"startPosition": 0,
"endPosition": 44,
"frameContent": ""
}
],
"minedMetadata": {
"Content-Encoding": "windows-1252",
"Content-Type": "text/plain; charset=windows-1252"
},
"deducedMetadata": {
"Content-Type": "text/plain; iso-8859-1"
}
}
}
},
"whatSetItOffEntry": {
"action": "Log",
"component": {
"type": "aComponent",
"components": [
{
"type": "PatternComponent",
"patterns": [
1
],
"not": false
}
],
"not": false
},
"ticketInfo": {
"createIncident": true,
"tags": [],
"seeRestrictedIds": [
{
"type": "userGroup",
"name": "SiteMasters",
"description": "Group for SiteMasters",
"masters": [
"04fb02a2bc0fba"
],
"members": [],
"id": "04fade"
}
]
},
"letmeknowInfo": {
"createNotification": true,
"contactNames": [
"someguy#gmail.com"
]
}
},
"seeRestrictedIds": [
"04fade66c0"
],
"status": "New",
"timeStamps": [
"2015-05-15T10:09:27.989-05:00"
],
"count": 1
}
package mypackage

import java.io.BufferedReader
import java.io.FileReader
import java.io.IOException
import java.util._

import com.fasterxml.jackson.core._
import com.fasterxml.jackson.databind._

object JacksonStreaming {
  def main(args: Array[String]) {
    println("Entered Main")
    try {
      new JacksonStreaming().getNames
    } catch {
      case e: Exception => e.printStackTrace()
    }
  }
}

class JacksonStreaming {

  var jsonMapper: ObjectMapper = new ObjectMapper()
  var jsonFactory: JsonFactory = new JsonFactory()
  var prop: Properties = new Properties()
  var filePath: String = ""

  val path = Array("myReport", "docReports", "part1/.", "myAnalysis", "matches", "name")

  def getNames() {
    println("Entered getNames")
    var rootNode: JsonNode = null
    try {
      val fileReader = new BufferedReader(new FileReader("C:/jsonFormattedModified.json"))
      println("fileReader is: " + fileReader)
      rootNode = jsonMapper.readTree(fileReader)
      println("Return value of jsonMapper.readTree is: " + rootNode)
      findByPath(rootNode)
      val jsonParser = jsonFactory.createParser(new FileReader("C:/jsonFormattedModified.json"))
      println("JsonParser is: " + jsonParser)
      var pathIndex = 0
      val names = new ArrayList[String]()
      var breakOnClose = false
      while (jsonParser.nextToken() != null) {
        val fieldName = jsonParser.getCurrentName
        if (fieldName == null) {
          //continue
        }
        if (breakOnClose && fieldName == path(path.length - 2)) {
          println("Stopping search at end of node " + fieldName)
          //break
        }
        if (jsonParser.getCurrentToken != JsonToken.FIELD_NAME) {
          //continue
        }
        if (pathIndex >= path.length - 1) {
          if (fieldName == path(path.length - 1)) {
            try {
              jsonParser.nextToken()
            } catch {
              case e: IOException => e.printStackTrace()
            }
            var name: String = null
            name = jsonParser.getValueAsString
            if (name == null) {
              throw new RuntimeException("No value exists for field " + fieldName)
            }
            names.add(name)
            println("Found " + fieldName + " value: " + name)
          }
        } else if (fieldName == path(pathIndex)) {
          println("Found node " + path(pathIndex))
          pathIndex += 1
          if (pathIndex >= path.length - 1) {
            println("Looking for names ...")
            breakOnClose = true
            try {
              jsonParser.nextFieldName()
            } catch {
              case e: IOException => e.printStackTrace()
            }
          }
        }
      }
    } catch {
      case e: IOException => e.printStackTrace()
    }
  }

  def findByPath(jn: JsonNode) {
    println("Entered findByPath")
    var matchesNamesNode = jn
    for (i <- 0 until path.length - 1) {
      matchesNamesNode = matchesNamesNode.path(path(i))
    }
    if (matchesNamesNode.isMissingNode) {
      throw new RuntimeException("No node with names found.")
    }
    println("Tree names: " + matchesNamesNode.findValuesAsText("name"))
  }
}
I think that Scala is an expression-oriented, object-oriented and functional programming language. Of course you can write it imperatively, but for working with JSON I recommend going the object-oriented route. You can find examples in the module's GitHub repository:
https://github.com/FasterXML/jackson-module-scala/
For example, I recommend writing Scala classes for the whole JSON and then for the sub-objects like myReport or whatSetItOffEntry. The GitHub repo contains an example of this type of solution (a sketch adapted to your document follows it):
package com.fasterxml.jackson.module.scala

import com.fasterxml.jackson.annotation.{JsonUnwrapped, JsonProperty, JsonIgnore}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.matchers.ShouldMatchers
import org.scalatest.FlatSpec
import com.fasterxml.jackson.databind.ObjectMapper

case class Address(address1: Option[String], city: Option[String], state: Option[String])

class NonCreatorPerson {
  var name: String = _
  @JsonUnwrapped var location: Address = _
  var alias: Option[String] = _
}

case class Person(name: String, @JsonIgnore location: Address, alias: Option[String]) {
  private def this() = this("", Address(None, None, None), None)

  def address1 = location.address1
  private def address1_=(value: Option[String]) {
    setAddressField("address1", value)
  }

  def city = location.city
  private def city_=(value: Option[String]) {
    setAddressField("city", value)
  }

  def state = location.state
  private def state_=(value: Option[String]) {
    setAddressField("state", value)
  }

  private def setAddressField(name: String, value: Option[String]) {
    val f = location.getClass.getDeclaredField(name)
    f.setAccessible(true)
    f.set(location, value)
  }
}

@RunWith(classOf[JUnitRunner])
class UnwrappedTest extends BaseSpec {

  "mapper" should "handle ignored fields correctly" in {
    val mapper = new ObjectMapper()
    mapper.registerModule(DefaultScalaModule)
    val p = Person("Snoopy", Address(Some("123 Main St"), Some("Anytown"), Some("WA")), Some("Joe Cool"))
    val json = mapper.writeValueAsString(p)
    // There's some instability in the ordering of keys. Not sure what that's about, but rather than
    // have buggy tests, I'm accepting it for now.
    // json should (
    //   be === """{"name":"Snoopy","alias":"Joe Cool","city":"Anytown","address1":"123 Main St","state":"WA"}""" or
    //   be === """{"name":"Snoopy","alias":"Joe Cool","state":"WA","address1":"123 Main St","city":"Anytown"}"""
    // )
    val p2 = mapper.readValue(json, classOf[Person])
    p2 shouldEqual p
  }

  it should "handle JsonUnwrapped for non-creators" in {
    val mapper = new ObjectMapper()
    mapper.registerModule(DefaultScalaModule)
    val p = new NonCreatorPerson
    p.name = "Snoopy"
    p.location = Address(Some("123 Main St"), Some("Anytown"), Some("WA"))
    p.alias = Some("Joe Cool")
    val json = mapper.writeValueAsString(p)
    val p2 = mapper.readValue(json, classOf[NonCreatorPerson])
    p2.name shouldBe p.name
    p2.location shouldBe p.location
    p2.alias shouldBe p.alias
  }
}
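Applied to your document, a minimal sketch could look like the following. The case-class names are invented for illustration, only the fields needed for the match names are modelled, and FAIL_ON_UNKNOWN_PROPERTIES is disabled so everything else in the JSON is ignored:

import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.DefaultScalaModule

// Only the slice of the document we care about
case class MatchEntry(name: String)
case class Analysis(matches: List[MatchEntry])
case class DocReport(myAnalysis: Analysis)
case class Report(docReports: Map[String, DocReport])
case class Incident(myReport: Report)

object MatchNames extends App {
  val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)

  val json = scala.io.Source.fromFile("C:/jsonFormattedModified.json").mkString
  val incident = mapper.readValue(json, classOf[Incident])

  // All match names across every part, with no manual token walking
  val names = incident.myReport.docReports.values.flatMap(_.myAnalysis.matches.map(_.name))
  println(names.mkString(", "))
}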