How to convert simple JSON to DynamoDB JSON?

I have a simple JSON and want to convert it to DynamoDB JSON. Is there any easy way to do that?

If you mean converting a JSON string to a DynamoDB map attribute, you can use boto3. Here is an example.
import boto3
import json
json_string = '{"key1": 1, "key2": "value"}'
json_obj = json.loads(json_string)
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('test-table')
table.put_item(Item={'pk': 'pk-value', 'map': json_obj})
If you just want to update the whole map attribute, you can pass parsed JSON the same way as with put_item.
json_string2 = '{"key1": 2, "key2": "value2"}'
json_obj2 = json.loads(json_string2)
rsp = table.update_item(
    Key={'pk': 'pk-value'},
    AttributeUpdates={'map': {'Value': json_obj2, 'Action': 'PUT'}}
)
However, if you want to update only a specific nested attribute, you need to use UpdateExpression. For example, the code below replaces only the key1 attribute inside the map with a nested object.
nested_json_string = '{"nested": "key3"}'
nested_json_obj = json.loads(nested_json_string)
rsp = table.update_item(
    Key={'pk': 'pk-value'},
    UpdateExpression='SET #map.#key1 = :val3',
    ExpressionAttributeNames={'#map': 'map', '#key1': 'key1'},
    ExpressionAttributeValues={':val3': nested_json_obj}
)

I know this is an old question, but I came across it and the accepted answer didn't help me (it seems to suggest that you can feed boto3 plain JSON, but that didn't work for me), and the library mentioned in the comments didn't help either.
What did work for me was using the serializer/deserializer from boto3.dynamodb.types, basically as suggested by this answer on a very similar topic.
from boto3.dynamodb.types import TypeSerializer, TypeDeserializer
import json

serializer = TypeSerializer()
deserializer = TypeDeserializer()

# for building a DynamoDB JSON from a Python object
def serialize(item):
    serialized_item = serializer.serialize(vars(item) if hasattr(item, '__dict__') else item)
    return item if 'M' not in serialized_item else serialized_item['M']

# for building a plain JSON from a DynamoDB JSON
def deserialize(dynamodb_json_string):
    return deserializer.deserialize({'M': dynamodb_json_string})

class MyItem:
    def __init__(self, some_string_value=None, some_numeric_value=None):
        self.my_key = some_string_value
        self.my_other_key = some_numeric_value

    def __str__(self):
        return json.dumps(self, default=lambda x: x.__dict__)

if __name__ == '__main__':
    my_classy_item = MyItem("my_string_value", 5)
    my_string_item = json.loads('{"my_key": "my_string_value", "my_other_key" : 5}')
    print(serialize(my_classy_item))
    print(serialize(my_string_item))
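For completeness, a minimal round-trip sketch using the helpers above (note that numeric values come back as Decimal, which is how boto3 represents DynamoDB numbers):
dynamodb_json = serialize(my_string_item)
# e.g. {'my_key': {'S': 'my_string_value'}, 'my_other_key': {'N': '5'}}
plain_json = deserialize(dynamodb_json)
print(plain_json)  # {'my_key': 'my_string_value', 'my_other_key': Decimal('5')}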

Related

json to csv using python, json.loads and json_normalize function

I am trying to convert a JSON file to CSV format using Python. I am using the json.loads() function and then json_normalize() to flatten the objects. I was wondering if there is a better way of doing this.
This is the input file (one row from it):
{"ID": "02","Date": "2019-08-01","Total": 400,"QTY": 12,"Item": [{"NM": "0000000001","CD": "item_CD1","SRL": "25","Disc": [{"CD": "discount_CD1","Amount": 2}],"TxLns": {"TX": [{"TXNM": "000001-001","TXCD": "TX_CD1"}]}},{"NM": "0000000002","CD": "item_CD2","SRL": "26","Disc": [{"CD": "discount_CD2","Amount": 4}],"TxLns": {"TX": [{"TXNM": "000002-001","TXCD": "TX_CD2"}]}},{"NM": "0000000003","CD": "item_CD3","SRL": "27"}],"Cust": {"CustID": 10,"Email": "01#abc.com"},"Address": [{"FirstName": "firstname","LastName": "lastname","Address": "address"}]}
Code
import json
import pandas as pd
from pandas.io.json import json_normalize

data_final = pd.DataFrame()
with open("sample.json") as f:
    for line in f:
        json_obj = json.loads(line)
        ID = json_obj['ID']
        Item = json_obj['Item']

        dataMain = json_normalize(json_obj)
        dataMain = dataMain.drop(['Item', 'Address'], axis=1)
        # dataMain.to_csv("main.csv", index=False)

        dataItem = json_normalize(json_obj, 'Item', ['ID'])
        dataItem = dataItem.drop(['Disc', 'TxLns.TX'], axis=1)
        # dataItem.to_csv("Item.csv", index=False)

        dataDisc = pd.DataFrame()
        dataTx = pd.DataFrame()
        for rt in Item:
            NM = rt['NM']
            rt['ID'] = ID
            if 'Disc' in rt:
                data = json_normalize(rt, 'Disc', ['NM', 'ID'])
                dataDisc = dataDisc.append(data, sort=False)
            if 'TxLns' in rt:
                tx = rt['TxLns']
                tx['NM'] = NM
                tx['ID'] = ID
                if 'TX' in tx:
                    data = json_normalize(tx, 'TX', ['NM', 'ID'])
                    dataTx = dataTx.append(data, sort=False)

        dataDIS = pd.merge(dataItem, dataDisc, on=['NM', 'ID'], how='left')
        dataTX = pd.merge(dataDIS, dataTx, on=['NM', 'ID'], how='left')
        dataAddress = json_normalize(json_obj, 'Address', ['ID'])
        data_IT = pd.merge(dataMain, dataTX, on=['ID'])
        data_merge = pd.merge(data_IT, dataAddress, on=['ID'])
        data_final = data_final.append(data_merge, sort=False)

data_final = data_final.drop_duplicates(keep='first')
data_final.to_csv("data_merged.csv", index=False)
this is the output:
ID,Date,Total,QTY,Cust.CustID,Cust.Email,NM,CD_x,SRL,CD_y,Amount,TXNM,TXCD,FirstName,LastName,Address
02,2019-08-01,400,12,10,01#abc.com,0000000001,item_CD1,25,discount_CD1,2.0,000001-001,TX_CD1,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000002,item_CD2,26,discount_CD2,4.0,000002-001,TX_CD2,firstname,lastname,address
02,2019-08-01,400,12,10,01#abc.com,0000000003,item_CD3,27,,,,,firstname,lastname,address
The code is working fine for now. By better I mean:
Is it efficient in terms of time and space complexity? If this code has to process around 10K records in a file, is this an optimized solution?
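For comparison, here is a rough sketch (assuming the same input structure; it is not a drop-in replacement for the merge logic above, and the nested Disc/TxLns lists would still need their own passes) that leans on json_normalize's record_path/meta arguments to flatten the Item array in a single call:
import json
import pandas as pd
from pandas import json_normalize  # pandas >= 1.0

rows = []
with open("sample.json") as f:
    for line in f:
        rows.append(json.loads(line))

# flatten the Item array once, carrying the parent fields along as meta columns
items = json_normalize(
    rows,
    record_path='Item',
    meta=['ID', 'Date', 'Total', 'QTY', ['Cust', 'CustID'], ['Cust', 'Email']],
    errors='ignore',
)
items.to_csv("items_flat.csv", index=False)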

JSON serialization using Marshmallow - skip None attributes

I am using Marshmallow to send instance of my Decision class to JSON. However, this will also dump the attributes which are None, e.g. my attribute score will translate to null in JSON. After that I am unable to read the JSON again using the same approach.
https://repl.it/repls/VoluminousMulticoloredFacts
The last line is where it currently fails. I need to either NOT dump None to JSON or skip null during loading:
import json
from marshmallow import Schema, fields, post_load

json_data = """{
    "appid": "2309wfjwef",
    "strategy": "First Strategy"
}"""

# Output class definition
class Decision(object):
    def __init__(self, appid=None, strategy=None, score=None):
        self.appid = appid
        self.strategy = strategy
        self.score = score

class DecisionSchema(Schema):
    appid = fields.Str()
    strategy = fields.Str()
    score = fields.Int()

    @post_load
    def make_decision(self, data):
        return Decision(**data)

# Deserialization into object
dec_json = json.loads(json_data)
schema = DecisionSchema()
dec = schema.load(dec_json).data
print(dec.strategy)

# Dump results back to JSON
schema = DecisionSchema()
out = schema.dumps(dec)
print(out.data)

# Load back from dump
schema = DecisionSchema()
dec = schema.load(out).data
# print(dec.strategy)  # returns error currently
An "official" answer from marshmallow development team can be found in this comment in the bugtracker:
Use a post_dump method.
from marshmallow import Schema, fields, post_dump

class BaseSchema(Schema):
    SKIP_VALUES = set([None])

    @post_dump
    def remove_skip_values(self, data, **kwargs):
        return {
            key: value for key, value in data.items()
            if value not in self.SKIP_VALUES
        }

class MySchema(BaseSchema):
    foo = fields.Field()
    bar = fields.Field()

sch = MySchema()
sch.dump({'foo': 42, 'bar': None}).data  # {'foo': 42}
As I point out in a further comment, there's a shortcoming: it will also remove None when the field's allow_none is True.
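A possible refinement, just as a sketch (assuming the dumped keys match the schema's field names), is to consult each field's allow_none flag before dropping a value:
from marshmallow import Schema, fields, post_dump

class BaseSchema(Schema):
    @post_dump
    def remove_skip_values(self, data, **kwargs):
        # drop None values, but keep them for fields that explicitly allow None
        return {
            key: value for key, value in data.items()
            if value is not None
            or (key in self.fields and self.fields[key].allow_none)
        }

class MySchema(BaseSchema):
    foo = fields.Field()
    bar = fields.Field(allow_none=True)

MySchema().dump({'foo': None, 'bar': None})  # 'bar' is kept as None, 'foo' is dropped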
As I pointed out in my comment above, this messes with the ordering if you use:
class Meta:
    fields = (
        'field1', 'field2'
    )
    ordered = True
To fix this I used:
# Remove None fields
@pre_dump
def remove_skip_values(self, data):
    return {
        key: value for key, value in data.items()
        if value is not None
    }
This works for my dictionary of objects

Reading the data written to s3 by Amazon Kinesis Firehose stream

I am writing records to a Kinesis Firehose stream that are eventually written to an S3 file by Amazon Kinesis Firehose.
My record object looks like
ItemPurchase {
String personId,
String itemId
}
The data written to S3 looks like:
{"personId":"p-111","itemId":"i-111"}{"personId":"p-222","itemId":"i-222"}{"personId":"p-333","itemId":"i-333"}
NO COMMA SEPARATION.
NO STARTING BRACKET as in a JSON array:
[
NO ENDING BRACKET as in a JSON array:
]
I want to read this data and get a list of ItemPurchase objects.
List<ItemPurchase> purchases = getPurchasesFromS3(IOUtils.toString(s3ObjectContent))
What is the correct way to read this data?
It boggles my mind that Amazon Firehose dumps JSON messages to S3 in this manner, and doesn't allow you to set a delimiter or anything.
Ultimately, the trick I found to deal with the problem was to process the text file using the JSON raw_decode method
This will allow you to read a bunch of concatenated JSON records without any delimiters between them.
Python code:
import json

decoder = json.JSONDecoder()
with open('giant_kinesis_s3_text_file_with_concatenated_json_blobs.txt', 'r') as content_file:
    content = content_file.read()
    content_length = len(content)
    decode_index = 0

    while decode_index < content_length:
        try:
            obj, decode_index = decoder.raw_decode(content, decode_index)
            print("File index:", decode_index)
            print(obj)
        except json.JSONDecodeError as e:
            print("JSONDecodeError:", e)
            # Scan forward and keep trying to decode
            decode_index += 1
I also had the same problem; here is how I solved it:
replace "}{" with "}\n{"
split the lines by "\n"
input_json_rdd.map(lambda x : re.sub("}{", "}\n{", x, flags=re.UNICODE))
.flatMap(lambda line: line.split("\n"))
A nested JSON object has several "}"s, so splitting the line by "}" doesn't solve the problem.
I've had the same issue.
It would have been better if AWS allowed us to set a delimiter but we can do it on our own.
In my use case, I've been listening on a stream of tweets, and once receiving a new tweet I immediately put it to Firehose.
This, of course, resulted in a 1-line file which could not be parsed.
So, to solve this, I have concatenated the tweet's JSON with a \n.
This, in turn, let me use some packages that can output lines when reading stream contents, and parse the file easily.
Hope this helps you.
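As a rough sketch of that approach (the stream name is a placeholder, and this assumes one tweet dict per call):
import json
import boto3

firehose = boto3.client('firehose')

def send_tweet(tweet_dict):
    # append a newline so each record lands on its own line in the S3 object
    payload = json.dumps(tweet_dict) + "\n"
    firehose.put_record(
        DeliveryStreamName='my-tweet-stream',  # placeholder stream name
        Record={'Data': payload.encode('utf-8')}
    )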
I think the best way to tackle this is to first create a properly formatted JSON file containing well-separated JSON objects. In my case I added ',' to the events that were pushed into the Firehose. Then after a file is saved in S3, all the files will contain JSON objects separated by a delimiter (a comma in our case). Another thing that must be added is '[' and ']' at the beginning and end of the file. Then you have a proper JSON file containing multiple JSON objects, and parsing them is possible.
If the input source for the firehose is an Analytics application, this concatenated JSON without a delimiter is a known issue as cited here. You should have a lambda function as here that outputs JSON objects in multiple lines.
I used a transformation Lambda to add a line break at the end of every record
import base64
import copy

def lambda_handler(event, context):
    output = []
    for record in event['records']:
        # Decode from base64 (Firehose records are base64 encoded)
        payload = base64.b64decode(record['data'])

        # Read json as utf-8
        json_string = payload.decode("utf-8")

        # Add a line break
        output_json_with_line_break = json_string + "\n"

        # Encode the data
        encoded_bytes = base64.b64encode(bytearray(output_json_with_line_break, 'utf-8'))
        encoded_string = str(encoded_bytes, 'utf-8')

        # Create a deep copy of the record and append to output with transformed data
        output_record = copy.deepcopy(record)
        output_record['data'] = encoded_string
        output_record['result'] = 'Ok'
        output.append(output_record)

    print('Successfully processed {} records.'.format(len(event['records'])))
    return {'records': output}
Use this simple Python code.
import json

input_str = '''{"personId":"p-111","itemId":"i-111"}{"personId":"p-222","itemId":"i-222"}{"personId":"p-333","itemId":"i-333"}'''
data_str = "[{}]".format(input_str.replace("}{", "},{"))
data_json = json.loads(data_str)
And then (if you want) convert to Pandas.
import pandas as pd
df = pd.DataFrame().from_records(data_json)
print(df)
And this is the result:
itemId personId
0 i-111 p-111
1 i-222 p-222
2 i-333 p-333
If there's a way to change how the data is written, separate all the records with a newline. That way you can read the data simply, line by line. If not, then build a scanner object that takes "}" as a delimiter and use the scanner to read. That would do the job.
You can find each valid JSON object by counting the braces. Assuming the file starts with a {, this Python snippet should work:
import json

def read_block(stream):
    open_brackets = 0
    block = ''
    while True:
        c = stream.read(1)
        if not c:
            break
        if c == '{':
            open_brackets += 1
        elif c == '}':
            open_brackets -= 1
        block += c
        if open_brackets == 0:
            yield block
            block = ''

if __name__ == "__main__":
    c = 0
    with open('firehose_json_blob', 'r') as f:
        for block in read_block(f):
            record = json.loads(block)
            print(record)
This problem can be solved with a JSON parser that consumes objects one at a time from a stream. The raw_decode method of the JSONDecoder exposes just such a parser, but I've written a library that makes it straightforward to do this with a one-liner.
from firehose_sipper import sip

for entry in sip(bucket=..., key=...):
    do_something_with(entry)
I've added some more details in this blog post
In Spark, we had the same problem. We're using the following:
import re

from pyspark.sql.functions import *

@udf
def concatenated_json_to_array(text):
    final = "["
    separator = ""
    for part in text.split("}{"):
        final += separator + part
        separator = "}{" if re.search(r':\s*"([^"]|(\\"))*$', final) else "},{"
    return final + "]"

def read_concatenated_json(path, schema):
    return (spark.read
            .option("lineSep", None)
            .text(path)
            .withColumn("value", concatenated_json_to_array("value"))
            .withColumn("value", from_json("value", schema))
            .withColumn("value", explode("value"))
            .select("value.*"))
It works as follows:
Read the data as one string per file (no delimiters!)
Use a UDF to introduce the JSON array and split the JSON objects by introducing a comma. Note: be careful not to break any strings with }{ in them!
Parse the JSON with a schema into DataFrame fields.
Explode the array into separate rows
Expand the value object into columns.
Use it like this:
from pyspark.sql.types import *

schema = ArrayType(
    StructType([
        StructField("type", StringType(), True),
        StructField("value", StructType([
            StructField("id", IntegerType(), True),
            StructField("joke", StringType(), True),
            StructField("categories", ArrayType(StringType()), True)
        ]), True)
    ])
)

path = '/mnt/my_bucket_name/messages/*/*/*/*/'
df = read_concatenated_json(path, schema)
I've written more details and considerations here: Parsing JSON data from S3 (Kinesis) with Spark. Do not just split by }{, as it can mess up your string data! For example: { "line": "a\"r}{t" }.
You can use the script below.
If the streamed data size does not exceed the buffer size you set, each S3 file ends up with one pair of brackets ([]) and commas between the records.
import base64

print('Loading function')

def lambda_handler(event, context):
    output = []
    for record in event['records']:
        print(record['recordId'])
        payload = base64.b64decode(record['data']).decode('utf-8') + ',\n'

        # Do custom processing on the payload here
        output_record = {
            'recordId': record['recordId'],
            'result': 'Ok',
            'data': base64.b64encode(payload.encode('utf-8'))
        }
        output.append(output_record)

    last = len(event['records']) - 1
    print('Successfully processed {} records.'.format(len(event['records'])))

    start = '[' + base64.b64decode(output[0]['data']).decode('utf-8')
    end = base64.b64decode(output[last]['data']).decode('utf-8') + ']'
    output[0]['data'] = base64.b64encode(start.encode('utf-8'))
    output[last]['data'] = base64.b64encode(end.encode('utf-8'))

    return {'records': output}
Using JavaScript Regex.
JSON.parse(`[${item.replace(/}\s*{/g, '},{')}]`);

Working with JSON and Django

I am new to Python and Django. I am an IT professional who deploys software that monitors computers. The API outputs JSON. I want to create a Django app that reads the API and outputs the data to an HTML page. Where do I get started? I think the idea is to write the JSON feed to a Django model. Any help/advice is greatly appreciated.
Here's a simple single file (Python 2) to extract the JSON data:
import urllib2
import json

def printResults(data):
    theJSON = json.loads(data)
    for i in theJSON[""]:  # key name left blank in the original
        print i           # placeholder: handle each item

def main():
    urlData = ""  # your API URL
    webUrl = urllib2.urlopen(urlData)
    if (webUrl.getcode() == 200):
        data = webUrl.read()
        printResults(data)
    else:
        print "Received error"

if __name__ == '__main__':
    main()
If you have a URL returning JSON as a response, you could try this:
import requests
import json
url = 'http://....' # Your api url
response = requests.get(url)
json_response = response.json()
Now json_response is a list containing dicts. Let's suppose you have this structure:
[
    {
        'code': 'ABC',
        'avg': 14.5,
        'max': 30
    },
    {
        'code': 'XYZ',
        'avg': 11.6,
        'max': 21
    },
    ...
]
You can iterate over the list and take every dict into a model.
from yourmodels import CurrentModel
...
for obj in json_response:
    cm = CurrentModel()
    cm.avg = obj['avg']
    cm.max = obj['max']
    cm.code = obj['code']
    cm.save()
Or you could use a bulk method, but keep in mind that bulk_create() does not call the model's save() method.
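A rough sketch of the bulk variant (reusing the hypothetical CurrentModel and json_response from above):
from yourmodels import CurrentModel

# build unsaved instances in memory, then insert them with a single query;
# note that save() and the pre_save/post_save signals are not triggered
objs = [
    CurrentModel(code=obj['code'], avg=obj['avg'], max=obj['max'])
    for obj in json_response
]
CurrentModel.objects.bulk_create(objs)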

Processing JSON Response using scrapy

I have the following code in my scrapy spider:
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    htmldata = jsonresponse["html"]
    for sel in htmldata.xpath('//li/li'):
        # -- more xpath code --
        yield item
But I am having this error:
raise ValueError("No JSON object could be decoded")
exceptions.ValueError: No JSON object could be decoded
After checking the JSON reply, I found out about <!--WPJM--> and <!--WPJM_END-->, which are causing this error.
<!--WPJM-->{"found_jobs":true,"html":"<html code>","max_num_pages":3}<!--WPJM_END-->
How do I parse this in Scrapy without looking at the <!--WPJM--> and <!--WPJM_END--> markers?
EDIT: This is the error that I have:
File "/home/muhammad/Projects/project/project/spiders/crawler.py", line 150, in parse
for sel in htmldata.xpath('//li'):
exceptions.AttributeError: 'unicode' object has no attribute 'xpath'
def parse(self, response):
    rawdata = response.body_as_unicode()
    jsondata = rawdata.replace('<!--WPJM-->', '').replace('<!--WPJM_END-->', '')
    # print jsondata  # For debugging
    # pass
    data = json.loads(jsondata)
    htmldata = data["html"]
    # print htmldata  # For debugging
    # pass
    for sel in htmldata.xpath('//li'):
        item = ProjectjomkerjaItem()
        item['title'] = sel.xpath('a/div[@class="position"]/div[@id="job-title-job-listing"]/strong/text()').extract()
        item['company'] = sel.xpath('a/div[@class="position"]/div[@class="company"]/strong/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
The easiest approach would be to get rid of the comments tags manually using replace():
data = response.body_as_unicode()
data = data.replace('<!--WPJM-->', '').replace('<!--WPJM_END-->', '')
jsonresponse = json.loads(data)
Though it is not quite pythonic or reliable.
Or, a better option would be to get the text() with XPath:
$ scrapy shell index.html
>>> response.xpath('//text()').extract()[0]
u'{"found_jobs":true,"html":"<html code"}'