How to convert json to csv python with proper headers - json

I am trying to convert JSON data to CSV. I am getting the values, but each block ends up as a single line in the result. I am new to Python, so any help is appreciated. I have tried the code below.
import pandas as pd
# Asker's original attempt: let pandas read the whole JSON document and dump
# the resulting frame straight to CSV.
with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    # The sample document's only top-level key is "issues", so this yields a
    # single "issues" column whose cells each hold one complete record.
    df = pd.read_json(inputfile)
df.to_csv(r'C:\Users\anath\csvfile.csv', encoding='utf-8', index=True)
Sample Json in the source file, short snippet
{
"issues": [
{
"issueId": 110052,
"revision": 84,
"definitionId": "DNS1012",
"subject": "urn:h:domain:fitestdea.com",
"subjectDomain": "fitestdea.com",
"title": "Nameserver name doesn\u0027t resolve to an IPv6 address",
"category": "DNS",
"severity": "low",
"cause": "urn:h:domain:ns1.gname.net",
"causeDomain": "ns1.gname.net",
"open": true,
"status": "active",
"auto": true,
"autoOpen": true,
"createdOn": "2022-09-01T02:29:09.681451Z",
"lastUpdated": "2022-11-23T02:26:28.785601Z",
"lastChecked": "2022-11-23T02:26:28.785601Z",
"lastConfirmed": "2022-11-23T02:26:28.785601Z",
"details": "{}"
},
{
"issueId": 77881,
"revision": 106,
"definitionId": "DNS2001",
"subject": "urn:h:domain:origin-mx.stagetest.test.com.test.com",
"subjectDomain": "origin-mx.stagetest.test.com.test.com",
"title": "Dangling domain alias (CNAME)",
"category": "DNS",
"severity": "high",
"cause": "urn:h:domain:origin-www.stagetest.test.com.test.com",
"causeDomain": "origin-www.stagetest.test.com.test.com",
"open": true,
"status": "active",
"auto": true,
"autoOpen": true,
"createdOn": "2022-08-10T09:34:36.929071Z",
"lastUpdated": "2022-11-23T09:33:32.553663Z",
"lastChecked": "2022-11-23T09:33:32.553663Z",
"lastConfirmed": "2022-11-23T09:33:32.553663Z",
"details": "{\"#type\": \"hardenize/com.hardenize.schemas.dns.DanglingProblem\", \"rrType\": \"CNAME\", \"rrDomain\": \"origin-mx.stagetest.test.com.test.com\", \"causeDomain\": \"origin-www.stagetest.test.com.test.com\", \"danglingType\": \"nxdomain\", \"rrEffectiveDomain\": \"origin-mx.stagetest.test.com.test.com\"}"
}
}
]
}
The output I am getting is as below. I am looking for a way to get the field names in the header row and each value in its own column/cell; so far the entire record ends up in one cell. Is there also a way to export only specific fields, such as title, severity or issueId, rather than everything?

Try:
import json
import pandas as pd
# Parse the file with the json module first so the list stored under
# "issues" can be pulled out before pandas sees it.
with open("your_file.json", "r", encoding="utf-8") as f_in:
    data = json.load(f_in)

# One row per issue, one column per field of the issue objects.
df = pd.DataFrame(data["issues"])

# Show only the fields of interest.
print(df[["title", "severity", "issueId"]])
Prints:
title severity issueId
0 Nameserver name doesn't resolve to an IPv6 address low 110052
1 Dangling domain alias (CNAME) high 77881
To save as CSV you can do:
df[["title", "severity", "issueId"]].to_csv('data.csv', index=False)

try this...
import json  # needed to parse the file before normalizing

# json_normalize expects parsed data (a dict or a list of dicts), not an open
# file object, so load the JSON first and pass it the list under "issues".
df = pd.json_normalize(json.load(inputfile)["issues"])
in place of the line you have.

Finally, this worked for me. Thanks to @Andrej Kesely for the input; sharing it as it might help others.
import pandas as pd
import json
# Load the JSON document; the records of interest live under "issues".
with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    data = json.load(inputfile)

# One row per issue, one column per field.
df = pd.DataFrame(data["issues"])

# Keep only the wanted fields, show them, then export without the index.
selected = df[["title", "severity", "issueId"]]
print(selected)
selected.to_csv('data.csv', index=False)

Related

multiple object of an array creates different columns in the CSV file

Here is my JSON example. When I convert the JSON to a CSV file, it creates separate columns for each object of the reviews array. The column names look like: serial, name.0, rating.0, _id.0, name.1, rating.1, _id.1. How can I convert it to a CSV file where only serial, name, rating and _id are the column names and every object of the reviews array is put in a different row?
`
[{
"serial": "63708940a8d291c502be815f",
"reviews": [
{
"name": "shadman",
"rating": 4,
"_id":"6373d4eb50cff661989f3d83"
},
{
"name": "niloy1",
"rating": 3,
"_id": "6373d59450cff661989f3db8"
},
],
}]
`
I am trying to load the resulting CSV file into pandas. If that is not possible, is there any way to solve the problem using the pandas package in Python?
I suggest you use pandas for the CSV export only and process the json data by flattening the data structure first so that the result can then be easily loaded in a Pandas DataFrame.
Try:
# Sample input: a list of records, each with a "serial" and a nested list of
# review objects.
data_python = [{
    "serial": "63708940a8d291c502be815f",
    "reviews": [
        {
            "name": "shadman",
            "rating": 4,
            "_id": "6373d4eb50cff661989f3d83"
        },
        {
            "name": "niloy1",
            "rating": 3,
            "_id": "6373d59450cff661989f3db8"
        },
    ],
}]

from collections import defaultdict
from pprint import pprint
import pandas as pd

# Flatten to a column-oriented mapping: column name -> list of cell values,
# with one entry per review.
dct_flat = defaultdict(list)
for dct in data_python:
    for dct_reviews in dct["reviews"]:
        # Repeat the parent's serial on every review row.
        dct_flat['serial'].append(dct['serial'])
        # NOTE: assumes every review carries the same keys; otherwise the
        # column lists end up with different lengths.
        for key, value in dct_reviews.items():
            dct_flat[key].append(value)

#pprint(data_python)
#pprint(dct_flat)
df = pd.DataFrame(dct_flat)
print(df)
df.to_csv("data.csv")
which gives:
serial name rating _id
0 63708940a8d291c502be815f shadman 4 6373d4eb50cff661989f3d83
1 63708940a8d291c502be815f niloy1 3 6373d59450cff661989f3db8
and
,serial,name,rating,_id
0,63708940a8d291c502be815f,shadman,4,6373d4eb50cff661989f3d83
1,63708940a8d291c502be815f,niloy1,3,6373d59450cff661989f3db8
as CSV file content.
Note that the JSON you provided in your question cannot be loaded from a file or a string in Python, either with the json module or with pandas, because it is not valid JSON (it contains trailing commas). See below for corrected, valid JSON data:
valid_json_data='''\
[{
"serial": "63708940a8d291c502be815f",
"reviews": [
{
"name": "shadman",
"rating": 4,
"_id":"6373d4eb50cff661989f3d83"
},
{
"name": "niloy1",
"rating": 3,
"_id": "6373d59450cff661989f3db8"
}
]
}]
'''
and code for loading this data from json file:
import json
json_file = "data.json"

# Read the raw text first, then parse it into Python objects.
with open(json_file, encoding="utf-8") as f:
    data_json = f.read()
data_python = json.loads(data_json)

Extract data from a JSON file using python

Say if I have JSON entry as follows(The JSON file generated by fetching data from a Firebase DB):
[{"goal_savings": 0.0, "social_id": "", "score": 0, "country": "BR", "photo": "http://graph.facebook", "id": "", "plates": 3, "rcu": null, "name": "", "email": ".", "provider": "facebook", "phone": "", "savings": [], "privacyPolicyAccepted": true, "currentRole": "RoleType.PERSONAL", "empty_lives_date": null, "userId": "", "authentication_token": "-------", "onboard_status": "ONBOARDING_WIZARD", "fcmToken": ----------", "level": 1, "dni": "", "social_token": "", "lives": 10, "bills": [{"date": "2020-12-10", "role": "RoleType.PERSONAL", "name": "Supermercado", "category": "feeding", "periodicity": "PeriodicityType.NONE", "value": 100.0"}], "payments": [], "goals": [], "goalTransactions": [], "incomes": [], "achievements": [{"created_at":", "name": ""}]}]
How do I extract the content corresponding to 'value' which is present inside column 'bills' . Any way to do this ?
My python code is as follows. With this I was only able to get data within bills column. But I need only the entry corresponding to 'value' which is present inside bills.
import json
# Load the exported data; the context manager closes the file afterwards.
with open('firebase-dataset.json', 'r') as filedata:
    data = json.load(filedata)

listoffields = []  # one item per entry: that entry's complete 'bills' list
for dic in data:
    try:
        listoffields.append(dic['bills'])
    except KeyError:
        # Entries without a 'bills' key are skipped.
        pass
print(listoffields)
The JSON you posted contains misplaced quotes.
I think you are trying to extract the value of 'value' column within bills.
try this
print(listoffields[0][0]['value'])
which will print 100.0. If the value is stored as a string, wrap it in float() before using it in calculations.
---edit---
Say the JSON you have contains many JSON objects separated by commas, such as
[{ first-entry },{ second-entry },{ third.. }, ....and so on]
and you want to find the value of each bill in each JSON object.
The code below may work:
bill_value_list = []  # to store 'value' of each bill
for bill_list in listoffields:
    # An entry may hold no bills at all or several of them, so walk the whole
    # list instead of assuming exactly one bill at index 0.
    for bill in bill_list:
        bill_value_list.append(float(bill['value']))
print(bill_value_list)
print(sum(bill_value_list))  # do something useful
Paste it after the code you posted (no changes to your code are needed, since it already works :-) ).

Python error: Extra data: line 1 in loading a big Json file

I am trying to read a JSON file which is 370 MB
import json
# Open the file and hand its entire contents to json.loads as one document.
data = open( "data.json" ,"r")
json.loads(data.read())
and it's not possible to easily find the root cause of the following error,
json.decoder.JSONDecodeError: Extra data: line 1 column 1024109 (char 1024108)
I looked at similar questions and tried the following StackOverflow answer
import json
# Parse each line of the file as its own JSON document (one result per line).
data = [json.loads(line) for line in open('data.json', 'r')]
But it didn't resolve the issue. I am wondering if there is any solution to find where the error happens in the file. I am getting some other files from the same source and they run without any problem.
A small piece of the Json file is a list of dicts like,
{
"uri": "p",
"source": {
"uri": "dail",
"dataType": "pr",
"title": "Daily"
},
"authors": [
{
"type": "author",
"isAgency": false
}
],
"concepts": [
{
"amb": false,
"imp": true,
"date": "2019-05-23",
"textStart": 2459,
"textEnd": 2467
},
{
"amb": false,
"imp": true,
"date": "2019-05-09",
"textStart": 2684,
"textEnd": 2691
}
],
"shares": {},
"wgt": 100,
"relevance": 100
}
The problem with the json library is that it loads everything into memory, parses it in full and then handles it in memory, which is clearly problematic for such a large amount of data.
Instead I would suggest to take a look at https://github.com/henu/bigjson
import bigjson
with open('data.json', 'rb') as f:
    json_data = bigjson.load(f)
    # NOTE(review): bigjson presumably reads lazily from the open file, so
    # access json_data inside this block - confirm against the bigjson README.

I am getting an error when trying to import a JSON file to MongoDB via Compass

I am on Windows 10. I recently obtained a large JSON file (200 MB) via webscraping, and I am now trying to import the file to MongoDB using Compass Community via the import data button. However, whenever I try to import the file, I get the following error:
Unexpected token l in JSON at position 0 while parsing near 'l
Here are the first few lines of the JSON file I am trying to import:
{
"bands": [{
"activity": "Split-up",
"bandMembers": ["https://www.metal-archives.com/artists/Jon_Powlowski/760544", "https://www.metal-archives.com/artists/Ruben_Martinez/760545", "https://www.metal-archives.com/artists/Greg_Eickmier/416646", "https://www.metal-archives.com/artists/Nedwob/471955"],
"bandName": "A // Solution",
"country": "United States",
"dateAdded": "2018-08-04",
"genre": "Crust Punk/Thrash Metal",
"label": {
"labelName": "Voltic Records",
"labelUrl": "https://www.metal-archives.com/labels/Voltic_Records/47794"
},
"location": "California",
"lyricalThemes": "N/A",
"releases": [{
"numReviews": 0,
"releaseName": "Butterfly",
"reviewAverage": null,
"type": "EP",
"url": "https://www.metal-archives.com/albums/A_--_Solution/Butterfly/723154",
"year": "1989"
}, {
"numReviews": 0,
"releaseName": "Things to Come",
"reviewAverage": null,
"type": "EP",
"url": "https://www.metal-archives.com/albums/A_--_Solution/Things_to_Come/723155",
"year": "1995"
}
],
"similarArtists": null,
"url": "https://www.metal-archives.com/bands/A_--_Solution/3540442600",
"yearFormed": "N/A",
"yearsActive": "N/A"
}, {
"activity": "Active",
Does anyone have an idea on how I can fix this error?
EDIT: I ran the import again after restarting Compass and got this:
Unexpected token : in JSON at position 0 while parsing near ': null,
Is this error related at all to the other one?
The import data button needs the object to be inlined according to https://docs.mongodb.com/compass/master/import-export/#import-data-into-a-collection.
Apart from that, I had issues with the "Unexpected token : in JSON at position 0" error. Even though I could not figure out the cause, I tried creating a new .json file and copying the content into it and, surprisingly, it worked.
Also, remember to leave a line break at the end of the file.
To convert the json into a 1 line format, you could use the following python script:
import json
import sys
import codecs
import os
def read_file(name):
    """Return the full text of the file *name*, decoded as UTF-8."""
    with open(name, encoding='utf8') as f:
        return f.read()


def write_file(name, text):
    """Write *text* to the file *name* as UTF-8, creating parent directories.

    The file is written without a byte order mark and without newline
    translation.
    """
    directory = os.path.dirname(name)
    # dirname() is '' for a bare file name, and os.makedirs('') raises.
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Plain UTF-8 rather than utf-8-sig: a BOM in front of the JSON text is a
    # common cause of "Unexpected token ... at position 0" parse errors.
    with open(name, "w", encoding="utf-8", newline="\n") as temp:
        temp.write(text)


def main(argv):
    """Rewrite the JSON file argv[1] as single-line JSON in argv[2]."""
    text = read_file(argv[1])
    data = json.loads(text)
    # Serialise the parsed data, not the raw text: dumping the text would only
    # produce one big quoted JSON string.
    result = json.dumps(data, ensure_ascii=False) + "\n"
    write_file(argv[2], result)


if __name__ == "__main__":
    main(sys.argv)

Unable to loop through JSON output from webservice Python

I have a web-service call (HTTP Get) that my Python script makes in which returns a JSON response. The response looks to be a list of Dictionaries. The script's purpose is to iterate through the each dictionary, extract each piece of metadata (i.e. "ClosePrice": "57.74",) and write each dictionary to its own row in Mssql.
The issue is that I don't think Python is recognizing the JSON output from the API call as a list of dictionaries: when I try a for loop, I get the error "must be int not str". I have tried converting the output to a list, a dictionary and a tuple. I've also tried to make it work with a list comprehension, with no luck. Further, if I copy/paste the data from the API call and assign it to a variable, Python recognizes that it is a list of dictionaries without issue. Any help would be appreciated. I'm using Python 2.7.
Here is the actual http call being made: http://test.kingegi.com/Api/QuerySystem/GetvalidatedForecasts?user=kingegi&market=us&startdate=08/19/13&enddate=09/12/13
Here is an abbreviated JSON output from the API call:
[
{
"Id": "521d992cb031e30afcb45c6c",
"User": "kingegi",
"Symbol": "psx",
"Company": "phillips 66",
"MarketCap": "34.89B",
"MCapCategory": "large",
"Sector": "basic materials",
"Movement": "up",
"TimeOfDay": "close",
"PredictionDate": "2013-08-29T00:00:00Z",
"Percentage": ".2-.9%",
"Latency": 37.48089483333333,
"PickPosition": 2,
"CurrentPrice": "57.10",
"ClosePrice": "57.74",
"HighPrice": null,
"LowPrice": null,
"Correct": "FALSE",
"GainedPercentage": 0,
"TimeStamp": "2013-08-28T02:31:08 778",
"ResponseMsg": "",
"Exchange": "NYSE "
},
{
"Id": "521d992db031e30afcb45c71",
"User": "kingegi",
"Symbol": "psx",
"Company": "phillips 66",
"MarketCap": "34.89B",
"MCapCategory": "large",
"Sector": "basic materials",
"Movement": "down",
"TimeOfDay": "close",
"PredictionDate": "2013-08-29T00:00:00Z",
"Percentage": "16-30%",
"Latency": 37.4807215,
"PickPosition": 1,
"CurrentPrice": "57.10",
"ClosePrice": "57.74",
"HighPrice": null,
"LowPrice": null,
"Correct": "FALSE",
"GainedPercentage": 0,
"TimeStamp": "2013-08-28T02:31:09 402",
"ResponseMsg": "",
"Exchange": "NYSE "
}
]
Small Part of code being used:
import os,sys
import subprocess
import glob
from os import path
import urllib2
import json
import time
try:
data = urllib2.urlopen('http://api.kingegi.com/Api/QuerySystem/GetvalidatedForecasts?user=kingegi&market=us&startdate=08/10/13&enddate=09/12/13').read()
except urllib2.HTTPError, e:
print "HTTP error: %d" % e.code
except urllib2.URLError, e:
print "Network error: %s" % e.reason.args[1]
list_id=[x['Id'] for x in data] #test to see if it extracts the ID from each Dict
print(data) #Json output
print(len(data)) #should retrieve the number of dict in list
UPDATE
Answered my own question, here is the method below:
import json

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

url = 'some url that is a list of dictionaries'  # GetCall
u = urlopen(url)  # u is a file-like object
data = u.read()
newdata = json.loads(data)  # parse the response text into Python objects
print(type(newdata))  # printed data type will show as a list
print(len(newdata))  # the length of the list
if newdata:
    newdict = newdata[0]  # each element in the list is a dict
    print(type(newdict))  # this element is a dict
# Iterate over the elements directly so the first one (index 0) is included.
for var in newdata:
    print(var['Correct'], var['User'])