Separate of nouns and groups of noun tag using nltk from json file - nltk

I want to find or separate noun and groups of nouns using NLTK from JSON file, this is the JSON file content:
[
{
"id": 18009,
"ingredients": [
"baking powder",
"eggs",
"all-purpose flour",
"raisins",
"milk",
"white sugar"
]
},
{
"id": 28583,
"ingredients": [
"sugar",
"egg yolks",
"corn starch",
"cream of tartar",
"bananas",
"vanilla wafers",
"milk",
"vanilla extract",
"toasted pecans",
"egg whites",
"light rum"
]
},
I want to find the NN, NNS, NNP, NNPS.

import nltk
from nltk import word_tokenize

# Tokenize each ingredient, POS-tag it, and keep only the noun tokens.
for recipe in data:
    for ingredient in recipe["ingredients"]:
        tokens = word_tokenize(ingredient)
        tagged = nltk.pos_tag(tokens)
        nouns = [pair for pair in tagged if pair[1] in ["NN", "NNS", "NNP", "NNPS"]]
        print(nouns)
#output:
#[('powder', 'NN')]
#[('eggs', 'NNS')]
#[('flour', 'NN')]
#[('raisins', 'NNS')]
#[('milk', 'NN')]
#[('sugar', 'NN')]
#[('sugar', 'NN')]
#[('egg', 'NN'), ('yolks', 'NNS')]
#[('corn', 'NN'), ('starch', 'NN')]
# ...

Related

Parsing nested JSON and collecting data in a list

I am trying to parse a nested JSON and trying to collect data into a list under some condition.
Input JSON as below:
[
{
"name": "Thomas",
"place": "USA",
"items": [
{"item_name":"This is a book shelf", "level":1},
{"item_name":"Introduction", "level":1},
{"item_name":"parts", "level":2},
{"item_name":"market place", "level":3},
{"item_name":"books", "level":1},
{"item_name":"pens", "level":1},
{"item_name":"pencils", "level":1}
],
"descriptions": [
{"item_name": "Books"}
]
},
{
"name": "Samy",
"place": "UK",
"items": [
{"item_name":"This is a cat house", "level":1},
{"item_name":"Introduction", "level":1},
{"item_name":"dog house", "level":3},
{"item_name":"cat house", "level":1},
{"item_name":"cat food", "level":2},
{"item_name":"cat name", "level":1},
{"item_name":"Samy", "level":2}
],
"descriptions": [
{"item_name": "cat"}
]
}
]
I am reading json as below:
with open('test.json', 'r', encoding='utf8') as fp:
data = json.load(fp)
for i in data:
if i['name'] == "Thomas":
#collect "item_name", values in a list (my_list) if "level":1
#my_list = []
Expected output:
my_list = ["This is a book shelf", "Introduction", "books", "pens", "pencils"]
Since it's a nested, complex JSON, I am not able to collect the data into a list as I mentioned above. Please let me know how to collect the data from the nested JSON.
Try:
import json

with open("test.json", "r", encoding="utf8") as fp:
    data = json.load(fp)

# Explicit nested loops instead of the one-shot comprehension; the
# condition is evaluated per item exactly as in the comprehension form.
my_list = []
for person in data:
    for item in person["items"]:
        if person["name"] == "Thomas" and item["level"] == 1:
            my_list.append(item["item_name"])
print(my_list)
This prints:
['This is a book shelf', 'Introduction', 'books', 'pens', 'pencils']
Or without list comprehension:
# Comprehension equivalent of the explicit loop/continue version:
# keep item_name for every level-1 item under the "Thomas" record.
my_list = [
    item["item_name"]
    for record in data
    if record["name"] == "Thomas"
    for item in record["items"]
    if item["level"] == 1
]
print(my_list)
Once we have the data we iterate over the outermost list of objects.
We check if the object has the name equals to "Thomas" if true then we apply filter method with a lambda function on items list with a condition of level == 1
This gives us a list of item objects who have level = 1
In order to extract the item_name we use a comprehension so the final value in the final_list will be as you have expected.
["This is a book shelf", "Introduction", "books", "pens", "pencils"]
import json

def get_final_list():
    """Read test.json and return the item_name values of every level-1
    item belonging to the entry whose name is 'Thomas'."""
    with open('test.json', 'r', encoding='utf8') as fp:
        data = json.load(fp)
    level_one = []
    for record in data:
        if record.get("name") == "Thomas":
            # accumulate, so multiple 'Thomas' records would all contribute
            level_one += [item for item in record.get("items") if item['level'] == 1]
    return [item.get("item_name") for item in level_one]

Writing Nested JSON Dictionary List To CSV

Issue
I'm trying to write the following nested list of dictionary which has another list of dictionary to csv. I tried multiple ways but I can not get it to properly write it:
Json Data
[
{
"Basic_Information_Source": [
{
"Image": "image1.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 574,
"Image_Height": 262,
"Image_Size": 277274
}
],
"Basic_Information_Destination": [
{
"Image": "image1_dst.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 574,
"Image_Height": 262,
"Image_Size": 277539
}
],
"Values": [
{
"Value1": 75.05045463635267,
"Value2": 0.006097560975609756,
"Value3": 0.045083481733371615,
"Value4": 0.008639858263904898
}
]
},
{
"Basic_Information_Source": [
{
"Image": "image2.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 1600,
"Image_Height": 1066,
"Image_Size": 1786254
}
],
"Basic_Information_Destination": [
{
"Image": "image2_dst.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 1600,
"Image_Height": 1066,
"Image_Size": 1782197
}
],
"Values": [
{
"Value1": 85.52662890580055,
"Value2": 0.0005464352720450282,
"Value3": 0.013496113910369758,
"Value4": 0.003800236380811839
}
]
}
]
Working Code
I tried to use the following code and it works, but it only saved the headers and then dumps all the underlying list as text in the csv file:
import json
import csv

def Convert_CSV():
    """Load analysis_results_enc.json and write its records to test.csv.

    Each top-level key of the first record becomes a CSV column; nested
    list/dict values are written with their repr (the behavior shown in
    the question).
    """
    # FIX: the original used open()/close(), which leaks the handle if
    # json.load raises; 'with' guarantees the file is closed either way.
    with open('analysis_results_enc.json', 'r') as ar_enc_file:
        json_data = json.load(ar_enc_file)
    keys = json_data[0].keys()
    with open('test.csv', 'w', encoding='utf8', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(json_data)
Convert_CSV()
Working Output / Issue with it
The output writes the following header:
Basic_Information_Source
Basic_Information_Destination
Values
And then it dumps all other data inside each header as a list like this:
[{'Image': 'image1.png', 'Image_Format': 'PNG', 'Image_Mode': 'RGB', 'Image_Width': 574, 'Image_Height': 262, 'Image_Size': 277274}]
Expected Output / Sample
Trying to generate the above type of output for each dictionary in the array of dictionaries.
How do I properly write it?
I'm sure someone will come by with a much more elegant solution. That being said:
You have a few problems.
You have inconsistent entries with the fields you want to align.
Even if you pad your data you have intermediate lists that need flattened out.
Then you still have separated data that needs to be merged together.
DictWriter AFAIK expects its data in the format of [{'column': 'entry'}, {'column': 'entry'}], so even if you do all the previous steps you're still not in the right format.
So let's get started.
For the first two parts we can combine.
def pad_list(lst, size, padding=None):
    """Return a copy of *lst* grown to *size* elements with *padding*.

    The input list is never mutated; if it is already `size` or longer,
    an unchanged copy is returned.
    """
    shortfall = max(size - len(lst), 0)
    return lst[:] + [padding] * shortfall
# this expects already parsed json data
def flatten(json_data):
    """Unwrap each record's single-element lists of dicts into plain value
    lists, padding every list with None up to the record's widest entry."""
    flattened_records = []
    for record in json_data:
        widest = 0
        unwrapped = dict()
        for key, wrapped in record.items():
            # take the first dict in the wrapping list (or an empty one)
            # and keep just its values
            values = list(next(iter(wrapped), dict()).values())
            unwrapped[key] = values
            widest = max(widest, len(values))
        flattened_records.append(
            {key: pad_list(values, widest) for key, values in unwrapped.items()}
        )
    return flattened_records
So now we have a flattened list of dicts whose values are lists of consistent length. Essentially:
[
{
"Basic_Information_Source": [
"image1.png",
"PNG",
"RGB",
574,
262,
277274
],
"Basic_Information_Destination": [
"image1_dst.png",
"PNG",
"RGB",
574,
262,
277539
],
"Values": [
75.05045463635267,
0.006097560975609756,
0.045083481733371615,
0.008639858263904898,
None,
None
]
}
]
But this list has multiple dicts that need to be merged, not just one.
So we need to merge.
# this should be self explanatory
def merge(flattened):
    """Concatenate the per-key value lists of every dict in *flattened*
    into a single dict, preserving encounter order."""
    combined = dict()
    for record in flattened:
        for key, values in record.items():
            combined.setdefault(key, []).extend(values)
    return combined
This gives us something close to this:
{
"Basic_Information_Source": [
"image1.png",
"PNG",
"RGB",
574,
262,
277274,
"image2.png",
"PNG",
"RGB",
1600,
1066,
1786254
],
"Basic_Information_Destination": [
"image1_dst.png",
"PNG",
"RGB",
574,
262,
277539,
"image2_dst.png",
"PNG",
"RGB",
1600,
1066,
1782197
],
"Values": [
75.05045463635267,
0.006097560975609756,
0.045083481733371615,
0.008639858263904898,
None,
None,
85.52662890580055,
0.0005464352720450282,
0.013496113910369758,
0.003800236380811839,
None,
None
]
}
But wait, we still need to format it for the writer.
Our data needs to be in the format of [{'column_1': 'entry', 'column_2': 'entry'}, {'column_1': 'entry', 'column_2': 'entry'}]
So we format:
def format_for_writer(merged):
    """Pivot {column: [v0, v1, ...]} into [{column: v0, ...}, {column: v1, ...}].

    Columns with fewer values simply leave their key out of the later rows.
    """
    rows = []
    for column, values in merged.items():
        for index, value in enumerate(values):
            # grow the row list lazily; later columns reuse existing rows
            while index >= len(rows):
                rows.append(dict())
            rows[index][column] = value
    return rows
So finally, we have a nice clean formatted data structure we can just hand to our writer function.
def convert_csv(formatted):
    """Write the row dicts in *formatted* to test.csv, using the first
    row's keys as the header."""
    fieldnames = formatted[0].keys()
    with open('test.csv', 'w', encoding='utf8', newline='') as output_file:
        writer = csv.DictWriter(output_file, fieldnames)
        writer.writeheader()
        writer.writerows(formatted)
Full code with json string:
import json
import csv
json_raw = """\
[
{
"Basic_Information_Source": [
{
"Image": "image1.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 574,
"Image_Height": 262,
"Image_Size": 277274
}
],
"Basic_Information_Destination": [
{
"Image": "image1_dst.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 574,
"Image_Height": 262,
"Image_Size": 277539
}
],
"Values": [
{
"Value1": 75.05045463635267,
"Value2": 0.006097560975609756,
"Value3": 0.045083481733371615,
"Value4": 0.008639858263904898
}
]
},
{
"Basic_Information_Source": [
{
"Image": "image2.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 1600,
"Image_Height": 1066,
"Image_Size": 1786254
}
],
"Basic_Information_Destination": [
{
"Image": "image2_dst.png",
"Image_Format": "PNG",
"Image_Mode": "RGB",
"Image_Width": 1600,
"Image_Height": 1066,
"Image_Size": 1782197
}
],
"Values": [
{
"Value1": 85.52662890580055,
"Value2": 0.0005464352720450282,
"Value3": 0.013496113910369758,
"Value4": 0.003800236380811839
}
]
}
]
"""
def pad_list(lst, size, padding=None):
    """Return a copy of *lst* padded with *padding* up to *size* items."""
    padded = list(lst)
    # append until the copy reaches the requested length
    while len(padded) < size:
        padded.append(padding)
    return padded
def flatten(json_data):
    """Reduce each record's one-element dict lists to plain value lists,
    None-padded so every list in a record has the same length."""
    out = []
    for record in json_data:
        values_by_key = dict()
        longest = 0
        for key, wrapped in record.items():
            # first (and only expected) dict in the wrapping list
            first = next(iter(wrapped), dict())
            values_by_key[key] = list(first.values())
            if len(values_by_key[key]) > longest:
                longest = len(values_by_key[key])
        out.append({k: pad_list(v, longest) for k, v in values_by_key.items()})
    return out
def merge(flattened):
    """Fold a list of {key: list} dicts into one dict of concatenated lists."""
    result = dict()
    for record in flattened:
        for key, values in record.items():
            bucket = result.setdefault(key, [])
            bucket.extend(values)
    return result
def format_for_writer(merged):
    """Turn column-oriented {key: values} into a row-oriented list of dicts."""
    rows = []
    for key, values in merged.items():
        for position, value in enumerate(values):
            if position == len(rows):
                # the longest column so far creates each row dict exactly once
                rows.append(dict())
            rows[position][key] = value
    return rows
def convert_csv(formatted):
    """Dump the row dicts to test.csv; the header comes from the first row."""
    header = formatted[0].keys()
    with open('test.csv', 'w', encoding='utf8', newline='') as sink:
        writer = csv.DictWriter(sink, header)
        writer.writeheader()
        writer.writerows(formatted)
def main():
    """Parse the embedded sample JSON and write it out as test.csv."""
    records = json.loads(json_raw)
    # flatten -> merge -> pivot, then hand the rows to the CSV writer
    convert_csv(format_for_writer(merge(flatten(records))))

if __name__ == '__main__':
    main()

How to check JSON data in Python

Data sample:
import pandas as pd
patients_df = pd.read_json('C:/MyWorks/Python/Anal/data_sample.json', orient="records", lines=True)
patients_df.head()
//in python
//my json data sample
"data1": {
"id": "myid",
"seatbid": [
{
"bid": [
{
"id": "myid",
"impid": "1",
"price": 0.46328014,
"adm": "adminfo",
"adomain": [
"domain.com"
],
"iurl": "url.com",
"cid": "111",
"crid": "1111",
"cat": [
"CAT-0101"
],
"w": 00,
"h": 00
}
],
"seat": "27"
}
],
"cur": "USD"
},
What I want to do is to check if there is a "cat" value in my very large JSON data.
The "cat" value may/may not exist, but I'm trying to use Python Pandas to check it.
for seatbid in patients_df["win_res"]:
for bid in seatbid["seatbid"]:
I tried to access JSON data while writing a loop like that, but it's not being accessed properly.
I simply want to check if "cat" exist or not.
You can use python's json library as follows:
import json

# Fixed from the posted answer: `student` was an undefined name (the
# parsed document is `patient_data`) and the `else` was missing its colon.
patient_data = json.loads(patientJson)
# NOTE(review): `in` only inspects top-level keys; a recursive walk is
# needed if "cat" may appear at deeper nesting levels.
if "cat" in patient_data:
    print("Key exist in JSON data")
else:
    print("Key doesn't exist in JSON data")

JSON to CSV - Go 4 level deep

I would like to extract only a small fraction of my JSON response in a .csv file. However, I need to go to 4 levels deep and I am currently only able to go to 3 level deep. My goal is to have a .csv with 3 columns campaign_id, campaign_name, cost_per_click and 3 lines for each of my campaigns.
Original JSON
{
"318429215527453": {
"conversion_events": {
"data": [
{
"id": "djdfhdf",
"name": "Total",
"cost": 328.14,
"metrics_breakdown": {
"data": [
{
"campaign_id": 2364,
"campaign_name": "uk",
"cost_per_click": 1345
},
{
"campaign_id": 7483,
"campaign_name": "fr",
"cost_per_click": 756
},
{
"campaign_id": 8374,
"campaign_name": "spain",
"cost_per_click": 545
},
{
"campaign_id": 2431,
"campaign_name": "ge",
"cost_per_click": 321
}
],
"paging": {
"cursors": {
"after": "MjUZD"
},
"next": "https://graph.facebook.com/v9.0/xxxx"
}
}
}
],
"summary": {
"count": 1,
"metric_date_range": {
"date_range": {
"begin_date": "2021-01-09T00:00:00+0100",
"end_date": "2021-02-08T00:00:00+0100",
"time_zone": "Europe/Paris"
},
"prior_period_date_range": {
"begin_date": "2020-12-10T00:00:00+0100",
"end_date": "2021-01-09T00:00:00+0100"
}
}
}
},
"id": "xxx"
}
}
reformated.py
import json

# Load the full export, keep only the conversion_events payload
# (a list of event dicts), and write it to its own file.
with open('campaigns.json') as json_file:
    data = json.load(json_file)
reformated_json = data['318429215527453']['conversion_events']['data']
with open('data.json', 'w') as outfile:
    json.dump(reformated_json, outfile)
I tried to add ['metrics_breakdown'] or another ['data'] at the end of reformated_json but I am getting TypeError: list indices must be integers or slices, not str.
{
"id": "djdfhdf",
"name": "Total",
"cost": 328.14,
"metrics_breakdown": {
"data": [
{
"campaign_id": 2364,
"campaign_name": "uk",
"cost_per_click": 1345,
},
{
"campaign_id": 7483,
"campaign_name": "fr",
"cost_per_click": 756,
},
{
"campaign_id": 8374,
"campaign_name": "spain",
"cost_per_click": 545,
},
{
"campaign_id": 2431,
"campaign_name": "ge",
"cost_per_click": 321,
},
],
"paging": {
"cursors": {
"after": "MjUZD"
},
"next": "https://graph.facebook.com/v9.0/xxxx"
}
}
}
]
import csv
import json
from typing import Dict, List, Union # typing for easy development
# read json function
def read_json(json_path: str) -> Union[Dict, List]:
    """Deserialize and return the JSON document stored at *json_path*."""
    with open(json_path, 'r') as handle:
        return json.load(handle)
# write csv function
def write_csv(data: List[Dict], csv_path: str) -> None:
    """Write row dicts to *csv_path*; the header is the union of all keys.

    NOTE: the header order is whatever set iteration yields, exactly as
    in the original implementation.
    """
    with open(csv_path, 'w') as sink:
        columns = set().union(*data)
        writer = csv.DictWriter(sink, fieldnames=columns,
                                lineterminator='\n')
        writer.writeheader()
        writer.writerows(data)
# parse campaigns using explicit loops
def parse_campaigns(data: Dict) -> List[Dict]:
    """Collect every metrics_breakdown row from every account in *data*."""
    rows = []
    for account in data.values():  # first level (conversion events)
        for event in account['conversion_events']['data']:  # conversion events/data
            rows.extend(event['metrics_breakdown']['data'])  # data/metrics_breakdown/data
    return rows
# Driver: read the raw export, flatten it to campaign rows, emit the CSV.
json_data = read_json('./campaigns.json')
campaign_data = parse_campaigns(json_data)
write_csv(campaign_data, 'campaigns.csv')
campaigns.csv (I copied the data to multiple root dictionary objects):
cost_per_click,campaign_id,campaign_name
1345,2364,uk
756,7483,fr
545,8374,spain
321,2431,ge
1345,2364,uk
756,7483,fr
545,8374,spain
321,2431,ge
The first data subkey contains a single-element list. Dereference with [0] to get the element, then fetch the next layers of keys. Then a DictWriter can be used to write the CSV lines:
import json
import csv

# Load the export, drill down to the single metrics_breakdown list,
# and write one CSV row per campaign.
with open('campaigns.json') as json_file:
    data = json.load(json_file)

# account -> conversion_events -> first (only) event -> breakdown rows
items = data['318429215527453']['conversion_events']['data'][0]['metrics_breakdown']['data']

with open('data.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=items[0].keys())
    writer.writeheader()
    writer.writerows(items)
Output:
campaign_id,campaign_name,cost_per_click
2364,uk,1345
7483,fr,756
8374,spain,545
2431,ge,321

Combine multiple JSON files, and parse into CSV

I have about 100 JSON files, all titled with different dates and I need to merge them into one CSV file that has headers "date", "real_name", "text".
There are no dates listed in the JSON itself, and the real_name is nested. I haven't worked with JSON in a while and am a little lost.
The basic structure of the JSON looks more or less like this:
Filename: 2021-01-18.json
[
{
"client_msg_id": "xxxx",
"type": "message",
"text": "THIS IS THE TEXT I WANT TO PULL",
"user": "XXX",
"user_profile": {
"first_name": "XXX",
"real_name": "THIS IS THE NAME I WANT TO PULL",
"display_name": "XXX",
"is_restricted": false,
"is_ultra_restricted": false
},
"blocks": [
{
"type": "rich_text",
"block_id": "yf=A9",
}
]
}
]
So far I have
import glob
import json

read_files = glob.glob("*.json")
output_list = []
all_items = []

# Load every JSON file found in the working directory.
for f in read_files:
    with open(f, "rb") as infile:
        output_list.append(json.load(infile))

# Fixes vs. the original attempt:
#  * `for obj in output_list[]` was a syntax error
#  * one shared `data` dict was appended repeatedly, so every list entry
#    aliased the same object -- build a fresh dict per message instead
#  * 'text' / 'real_name' were literal strings, not lookups into obj
#  * `import json` was missing
for obj in output_list:
    all_items.append({
        'date': f,  # NOTE(review): `f` is the *last* filename from the loop
                    # above; pair filenames with objects if each entry must
                    # carry its own file's date
        'text': obj['text'],
        'real_name': obj['user_profile']['real_name'],
    })
Once you've read the JSON object, just index into the dictionaries for the data. You might need obj[0]['text'], etc., if your JSON data is really in a list in each file, but that seems odd and I'm assuming your data was pasted from output_list after you'd collected the data. So assuming your file content is exactly like below:
{
"client_msg_id": "xxxx",
"type": "message",
"text": "THIS IS THE TEXT I WANT TO PULL",
"user": "XXX",
"user_profile": {
"first_name": "XXX",
"real_name": "THIS IS THE NAME I WANT TO PULL",
"display_name": "XXX",
"is_restricted": false,
"is_ultra_restricted": false
},
"blocks": [
{
"type": "rich_text",
"block_id": "yf=A9",
}
]
}
test.py:
import json
import glob
from pathlib import Path

read_files = glob.glob("*.json")
output_list = []
all_items = []

for f in read_files:
    with open(f, "rb") as infile:
        obj = json.load(infile)
    output_list.append(obj)
    # FIX: build a fresh dict per file. The original created one `data`
    # dict outside the loop, so every appended entry aliased the same
    # object, and `Path(f).stem` used only the last filename -- with more
    # than one file, all rows came out identical and wrongly dated.
    all_items.append({
        'date': Path(f).stem,           # filename (without .json) is the date
        'text': obj['text'],
        'real_name': obj['user_profile']['real_name'],
    })

print(all_items)
Output:
[{'date': '2021-01-18', 'text': 'THIS IS THE TEXT I WANT TO PULL', 'real_name': 'THIS IS THE NAME I WANT TO PULL'}]