How to use MapReduce to parse a Json file? - json
I'm new to the JSON format, and I'm trying to learn how to parse a JSON file and extract the data in it using the MapReduce programming model. Is there a JSON parser that can read records that span multiple lines?
Here is the complete (maximum) set of elements that can be present in my JSON file:
{
"type": "",
"format": "",
"version": "",
"id": "",
"start": "",
"cp": "",
message:{ "proto": "","protoVer": "","cliIP": "","reqPort": "","reqHost": "","reqMethod": "","reqPath": "","reqQuery": "","reqCT": "","reqLen": "","sslVer": "","status": "","redirURL": "","respCT": "","respLen": "","bytes": "","UA": "","fwdHost":},
reqHdr:{"accEnc": "","accLang": "","auth": "","cacheCtl": "","conn": "","contMD5": "","cookie": "","DNT": "","expect": "","ifMatch": "","ifMod": "","ifNone": "","ifRange": "","ifUnmod": "","range": "","referer": "","te": "","upgrade": "","via": "","xFrwdFor": "","xReqWith": ""},
"respHdr": {"accRange": "","allowOrigin": "","age": "","allow": "","cacheCtl": "","conn": "","contEnc": "","contLang": "","contMD5": "","contDisp": "","contRange": "","date": "","eTag": "","expires": "","lastMod": "","link": "","p3p": "","retry": "","server": "","trailer": "","transEnc": "","vary": "","via": "","warning": "","wwwAuth": "","xPwrdBy": "","setCookie": ""},
"netPerf": {"downloadTime": "","originName": "","originIP": "","originInitIP": "","originRetry": "","lastMileRTT": "","midMileLatency": "","netOriginLatency": "","lastMileBW": "","cacheStatus": "","firstByte": "","lastByte": "","asnum": "","network": "","netType": "","edgeIP": ""},
"geo": {"country": "","region": "","city": ""},
"waf" : {"logVer" : "1.0","ipRules" : "","appRules" : "","warn" : "","deny" : ""},
"content": {"custom_name": "custom_value"},
}
These are my sample values in the json file.
{"type":"cloud_monitor","format":"default","version":"1.0","id":"71101cb85441995d11a43bb","start":"1413585245.921","cp":"254623","message":{"proto":"http","protoVer":"1.1","status":"403","cliIP":"23.79.231.14","reqPort":"80","reqHost":"ksd.metareactor.com","reqMethod":"GET","reqPath":"%2findex.php","reqQuery":"path%3d57%26product_id%3d49%26route%3d%255Cwinnt%255Cwin.ini%2500.","respCT":"text/html","respLen":"286","bytes":"286","UA":"mozilla-saturn","fwdHost":"origin-demo2-akamaized.scoe-sil.net"},
"reqHdr":{"accEnc":"gzip,%20deflate","cookie":"PHPSESSID%3dkkqoodvfe0rt9l7lbvqghk6e15%3bcurrency%3dUSD%3blanguage%3den"},"netPerf":{"downloadTime":"8","lastMileRTT":"20","cacheStatus":"0","firstByte":"1","lastByte":"1","asnum":"12222","edgeIP":"184.28.16.109"},"geo":{"country":"US","region":"CA","city":"SANFRANCISCO","lat":"37.7795","long":"-122.4195"},"network":{"edgeIP":"184.28.16.109","asnum":"12222","network":"","networkType":""},"waf":{"ver":"2.0","policy":"qik1_12418","ruleSet":"KRS%201.0","mode":"scr","rsr":"1","dor":"0","oft":"0","riskGroups":":INBOUND-ANOMALY","riskTuples":":-3000002","riskScores":":-1000","pAction":"","pRate":"","warnRules":"3000002","warnSlrs":"ARGS%3aroute","warnData":"d2lubnQvd2luLmluaQ%3d%3d","warnTags":"AKAMAI%2fWEB_ATTACK%2fFILE_INJECTION","denyRules":"INBOUND-ANOMALY","denyData":"U2NvcmU6IDEwMDAsIERFTlkgdGhyZXNob2xkOiAyNSwgQWxlcnQgUnVsZXM6IDMwMDAwMDIsIERlbnkgUnVsZTogLCBMYXN0IE1hdGNoZWQgTWVzc2FnZTogTG9jYWwgU3lzdGVtIEZpbGUgQWNjZXNzIEF0dGVtcHQ%3d"}}
I have a Java JSON parser, but I can only use it to read a record that sits on a single line. How can I identify a record that spans multiple lines in a JSON file and use it in the MapReduce code to extract the data?
My Json parser class:
// Split the text of `value` on newlines; every resulting line is parsed below as
// its own, complete JSON document.
// NOTE(review): `value` is presumably the mapper's input value — confirm against the enclosing method.
String[] tuple = value.toString().split("\n");
try {
for(int i=0; i<tuple.length; i++) {
// Parse the current line as a standalone JSON object. A record that spans
// several lines cannot be parsed this way, because each line alone is only a fragment.
JSONObject jsonobj = new JSONObject(tuple[i]);
// Copy the top-level fields into variables declared outside this fragment;
// each loop iteration overwrites the values from the previous line.
type = (String) jsonobj.get("type");
format = (String) jsonobj.get("format");
version = (String) jsonobj.get("version");
id = (String) jsonobj.get("id");
start = (String) jsonobj.get("start");
cp = (String) jsonobj.get("cp");
// NOTE(review): in the sample record shown in the question, "message" is a nested
// JSON object rather than a string, so this (String) cast looks like it would fail
// at runtime — confirm and consider reading it as a nested object instead.
message = (String) jsonobj.get("message");
}
} catch (JSONException e) {
// A parse/lookup failure is only printed; processing of the remaining lines stops
// because the try block encloses the whole loop.
e.printStackTrace();
}
Can anyone help me write code to read JSON records that span multiple lines in MapReduce?
Related
Pulling specific Parent/Child JSON data with Python
I'm having a difficult time figuring out how to pull specific information from a json file. So far I have this: # Import json library import json # Open json database file with open('jsondatabase.json', 'r') as f: data = json.load(f) # assign variables from json data and convert to usable information identifier = data['ID'] identifier = str(identifier) name = data['name'] name = str(name) # Collect data from user to compare with data in json file print("Please enter your numerical identifier and name: ") user_id = input("Numerical identifier: ") user_name = input("Name: ") if user_id == identifier and user_name == name: print("Your inputs matched. Congrats.") else: print("Your inputs did not match our data. Please try again.") And that works great for a simple JSON file like this: { "ID": "123", "name": "Bobby" } But ideally I need to create a more complex JSON file and can't find deeper information on how to pull specific information from something like this: { "Parent": [ { "Parent_1": [ { "Name": "Bobby", "ID": "123" } ], "Parent_2": [ { "Name": "Linda", "ID": "321" } ] } ] }
Here is an example that you might be able to pick apart. You could either: Make a custom de-jsonify object_hook as shown below and do something with it. There is a good tutorial here. Just gobble up the whole dictionary that you get without a custom de-jsonify and drill down into it and make a list or set of the results. (not shown) Example: import json from collections import namedtuple data = ''' { "Parents": [ { "Name": "Bobby", "ID": "123" }, { "Name": "Linda", "ID": "321" } ] } ''' Parent = namedtuple('Parent', ['name', 'id']) def dejsonify(json_str: dict): if json_str.get("Name"): parent = Parent(json_str.get('Name'), int(json_str.get('ID'))) return parent return json_str res = json.loads(data, object_hook=dejsonify) print(res) # then we can do whatever... if you need lookups by name/id, # we could put the result into a dictionary all_parents = {(p.name, p.id) : p for p in res['Parents']} lookup_from_input = ('Bobby', 123) print(f'found match: {all_parents.get(lookup_from_input)}') Result: {'Parents': [Parent(name='Bobby', id=123), Parent(name='Linda', id=321)]} found match: Parent(name='Bobby', id=123)
Cleaning of JSON Objects using Spark
I have been trying to clean my json file.I used RDD to read the Json file and then tried to clean it using replace function but still I am not getting the correct json file because of the escape sequences present in the JSON value. Here is my code with which I am trying to clean the JSON file of various disturbances. The cleaned JSON shows errors.Please review and tell the issue** val readjson = sparkSession .sparkContext.textFile("dev.json") val json=readjson.map(element=>element .replace("\"\":\"\"","\":\"") .replace("\"\",\"\"","\",\"") .replace("\"\":","\":") .replace(",\"\"",",\"") .replace("\"{\"\"","{\"") .replace("\"\"}\"","\"}")) .saveAsTextFile("JSON") HERE IS MY JSON FILE "{""SEQ_NO"":596514,""PROV_DEMOG_SK"":596514,""PROV_ID"":""QMP000003370581"",""FRST_NM"":"""",""LAST_NM"":""RICHARD WHITTINGTON BUTCHER"",""FUL_NM"":"""",""GENDR_CD"":"""",""PROV_NPI"":"""",""PROV_STAT"":""Incomplete"",""PROV_TY"":""03"",""DT_OF_BRTH"":"""",""PROFPROFL_DESGTN"":"""",""ETL_LAST_UPDT_DT_TM"":""2020-04-28 11:43:31.000000"",""PROV_CLSFTN_CD"":""A"",""SRC_DATA_KEY"":50,""OPRN_CD"":""I"",""REC_SET"":""F""}" I tried cleaning the above json and got the following result:- { "SEQ_NO": 596514, "PROV_DEMOG_SK": 596514, "PROV_ID": "QMP000003370581", "FRST_NM": "", "LAST_NM": "RICHARD WHITTINGTON BUTCHER", "FUL_NM": "", "GENDR_CD": "", "PROV_NPI": "", "PROV_STAT": "Incomplete", "PROV_TY": "03", "DT_OF_BRTH": "", "PROFPROFL_DESGTN": "", "ETL_LAST_UPDT_DT_TM": "2020-04-28 11:43:31.000000", "PROV_CLSFTN_CD": "A", "SRC_DATA_KEY": 50, "OPRN_CD": "I", "REC_SET": "F" } The JSON validators present online show that it is incorrect
Looks like your JSON has one or few control character \u0009 try replacing them with .replaceAll("\\u0009"," ") You can do it in below sequence val replacedVal = """{""SEQ_NO"":596514,""PROV_DEMOG_SK"":596514,""PROV_ID"":""QMP000003370581"",""FRST_NM"":\"\"\"",""LAST_NM"":""RICHARD WHITTINGTON BUTCHER"",""FUL_NM"":\"\"\"",""GENDR_CD"":\"\"\"",""PROV_NPI"":\"\"\"",""PROV_STAT"":""Incomplete"",""PROV_TY"":""03"",""DT_OF_BRTH"":\"\"\"",""PROFPROFL_DESGTN"":\"\"\"",""ETL_LAST_UPDT_DT_TM"":""2020-04-28 11:43:31.000000"",""PROV_CLSFTN_CD"":""A"",""SRC_DATA_KEY"":50,""OPRN_CD"":""I"",""REC_SET"":""F""}""" .replace("""\"""",""""""") .replace("""""""",""""""") .replaceAll("\\u0009"," ")
Python: JSON to Dictionary
Two examples for a JSON request. Both examples should have the correct JSON syntax, yet only the second version seems to be translatable to a dictionary. #doesn't work string_js3 = """{"employees": [ { "FNAME":"FTestA", "LNAME":"LTestA", "SSN":6668844441 }, { "FNAME":"FTestB", "LNAME":"LTestB", "SSN":6668844442 } ]} """ #works string_js4 = """[ { "FNAME":"FTestA", "LNAME":"LTestA", "SSN":6668844441 }, { "FNAME":"FTestB", "LNAME":"LTestB", "SSN":6668844442 }] """ This gives an error, while the same with string_js4 works L1 = json.loads(string_js3) print(L1[0]['FNAME']) So I have 2 questions: 1) Why doesn't the first version work 2) Is there a simple way to make the first version also work?
Both of these strings are valid JSON. Where you are getting stuck is in how you are accessing the resulting data structures. L1 (from string_js3) is a (nested) dict; L2 (from string_js4) is a list of dicts. Walkthrough: import json string_js3 = """{ "employees": [{ "FNAME": "FTestA", "LNAME": "LTestA", "SSN": 6668844441 }, { "FNAME": "FTestB", "LNAME": "LTestB", "SSN": 6668844442 } ] }""" string_js4 = """[{ "FNAME": "FTestA", "LNAME": "LTestA", "SSN": 6668844441 }, { "FNAME": "FTestB", "LNAME": "LTestB", "SSN": 6668844442 } ]""" L1 = json.loads(string_js3) L2 = json.loads(string_js4) The resulting objects: L1 {'employees': [{'FNAME': 'FTestA', 'LNAME': 'LTestA', 'SSN': 6668844441}, {'FNAME': 'FTestB', 'LNAME': 'LTestB', 'SSN': 6668844442}]} L2 [{'FNAME': 'FTestA', 'LNAME': 'LTestA', 'SSN': 6668844441}, {'FNAME': 'FTestB', 'LNAME': 'LTestB', 'SSN': 6668844442}] type(L1), type(L2) (dict, list) 1) Why doesn't the first version work? Because calling L1[0] is trying to return the value from the key 0, and that key doesn't exist. From the docs, "It is an error to extract a value using a non-existent key." L1 is a dictionary with just one key: L1.keys() dict_keys(['employees']) 2) Is there a simple way to make the first version also work? There are several ways, but it ultimately depends on what your larger problem looks like. I'm going to assume you want to modify the Python code rather than the JSON files/strings themselves. You could do: L3 = L1['employees'].copy() You now have a list of dictionaries that resembles L2: L3 [{'FNAME': 'FTestA', 'LNAME': 'LTestA', 'SSN': 6668844441}, {'FNAME': 'FTestB', 'LNAME': 'LTestB', 'SSN': 6668844442}]
Can JSON String format be converted to Actual format using groovy?
I have the following JSON String format getting from external source:- What kind of format is this actually? { id=102, brand=Disha, book=[{ slr=EFTR, description=Grammer, data=TYR, rate=true, numberOfPages=345, maxAllowed=12, currentPage=345 }, { slr=EFRE, description=English, data=TYR, rate=true, numberOfPages=345, maxAllowed=12, currentPage=345 }] } I want to convert this into actual JSON format like this: - { "id": "102", "brand": "Disha", "book": [{ "slr": "EFTR", "description": "Grammer", "data": "TYR", "rate": true, "numberOfPages": 345, "maxAllowed": "12", "currentPage": 345 }, { "slr": "EFRE", "description": "English", "data": "TYR", "rate": true, "numberOfPages": 345, "maxAllowed": "12", "currentPage": 345 }] } Is this achievable using groovy command or code?
Couple of things: You do not need Groovy Script test step which is currently there as step3 For step2, Add a 'Script Assertion` with given below script Provide step name for nextStepName in the script below for which you want to add the request. //Provide the test step name where you want to add the request def nextStepName = 'step4' def setRequestToStep = { stepName, requestContent -> context.testCase.testSteps[stepName]?.httpRequest.requestContent = requestContent } //Check the response assert context.response, 'Response is empty or null' setRequestToStep(nextStepName, context.response) EDIT: Based on the discussion with OP on the chat, OP want to update existing request of step4 for a key and its value as step2's response. Using samples to demonstrate the change input and desired outputs. Let us say, step2's response is: { "world": "test1" } And step4's existing request is : { "key" : "value", "key2" : "value2" } Now, OP wants to update value of key with first response in ste4's request, and desired is : { "key": { "world": "test1" }, "key2": "value2" } Here is the updated script, use it in Script Assertion for step 2: //Change the key name if required; the step2 response is updated for this key of step4 def keyName = 'key' //Change the name of test step to expected to be updated with new request def nextStepName = 'step4' //Check response assert context.response, 'Response is null or empty' def getJson = { str -> new groovy.json.JsonSlurper().parseText(str) } def getStringRequest = { json -> new groovy.json.JsonBuilder(json).toPrettyString() } def setRequestToStep = { stepName, requestContent, key -> def currentRequest = context.testCase.testSteps[stepName]?.httpRequest.requestContent log.info "Existing request of step ${stepName} is ${currentRequest}" def currentReqJson = getJson(currentRequest) currentReqJson."$key" = getJson(requestContent) context.testCase.testSteps[stepName]?.httpRequest.requestContent = getStringRequest(currentReqJson) log.info "Updated 
request of step ${stepName} is ${getStringRequest(currentReqJson)}" } setRequestToStep(nextStepName, context.request, keyName)
We can convert the invalid JSON format to valid JSON format using this line of code:- def validJSONString = JsonOutput.toJson(invalidJSONString).toString()
I get an empty value when printing a JSON key in Lua code
this is the json response plus "\x00" in the end from server : { "STATUS": [{ "STATUS":"S", "When":1470180059, "Code":11, "Msg":"Summary", "Description":"nsgminer 0.9.2" }],"SUMMARY": [{ "Elapsed":2061, "MHS av":0.00, "Found Blocks":0, "Getworks":76, "Accepted":0, "Rejected":0, "Hardware Errors":0, "Utility":0.00, "Discarded":209, "Stale":0, "Get Failures":3, "Local Work":293, "Remote Failures":0, "Network Blocks":14, "Total MH":0.0000, "Work Utility":0.00, "Difficulty Accepted":0.00000000, "Difficulty Rejected":0.00000000, "Difficulty Stale":0.00000000, "Best Share":0 }], "id":1 }\x00 i want to use the json in lua code : local output = stdnse.output_table() local json_string = tostring(result:sub(1, -2)) local pos, value = json.parse(json_string) output["Description"] = value["STATUS"][0]["Description"] return output when i print it out, i got null value
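I solved that by converting the JSON to a string and then parsing that string into a JSON table. Note that Lua arrays are 1-indexed, so the first element of "STATUS" is at index 1, not 0 (index 0 is what returned the empty value): local pos, value = json.parse(tostring(json_string)) output["Description"] = value["STATUS"][1]["Description"]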