"trailing data" error when reading json to Pandas dataframe - json

I have a Python 3.8.5 script that gets JSON from an API, saves it to disk, and reads the JSON into a DataFrame. It works:
df = pd.io.json.read_json('json_file', orient='records')
I want to try an in-memory buffer instead, so I don't have to read/write to disk, but I am getting an error. The code is like this:
from io import StringIO
io = StringIO()
json_out = []
# some code to append API results to json_out
json.dump(json_out, io)
df = pd.io.json.read_json(io.getvalue())
On that last line I get the error
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\util\_decorators.py", line 199, in wrapper
return func(*args, **kwargs)
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\util\_decorators.py", line 296, in wrapper
return func(*args, **kwargs)
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\io\json\_json.py", line 618, in read_json
result = json_reader.read()
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\io\json\_json.py", line 755, in read
obj = self._get_object_parser(self.data)
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\io\json\_json.py", line 777, in _get_object_parser
obj = FrameParser(json, **kwargs).parse()
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\io\json\_json.py", line 886, in parse
self._parse_no_numpy()
File "C:\Users\chap\Anaconda3\lib\site-packages\pandas\io\json\_json.py", line 1119, in _parse_no_numpy
loads(json, precise_float=self.precise_float), dtype=None
ValueError: Trailing data
The JSON is in list format. This is not the actual JSON, but it looks like this when I write it to disk:
json = [
    {
        "state": "North Dakota",
        "address": "123 30th st E #206",
        "account": "123"
    },
    {
        "state": "North Dakota",
        "address": "456 30th st E #206",
        "account": "456"
    }
]
Given that it worked in the first case (write/read from disk), I don't know how to troubleshoot. How do I troubleshoot something in the buffer? The actual data is mostly text but has some number fields.

Don't know what's going wrong for you, this works for me:
import json
import pandas as pd
from io import StringIO

json_out = [
    {
        "state": "North Dakota",
        "address": "123 30th st E #206",
        "account": "123"
    },
    {
        "state": "North Dakota",
        "address": "456 30th st E #206",
        "account": "456"
    }
]
io = StringIO()
json.dump(json_out, io)
df = pd.io.json.read_json(io.getvalue())
print(df)
leads me to believe there's something wrong with the code that appends the API data...
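For example, calling json.dump more than once on the same buffer leaves two JSON documents back to back, which is exactly the kind of input that raises "Trailing data" (a hypothetical reproduction, not the asker's actual code):
from io import StringIO
import json

io = StringIO()
json.dump([{"account": "123"}], io)
json.dump([{"account": "456"}], io)  # a second dump appends a second document

io.getvalue()  # '[{"account": "123"}][{"account": "456"}]'
# pd.read_json on this string raises ValueError: Trailing data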
However, if you have a list of dictionaries, you don't need the IO step. You can just do:
pd.DataFrame(json_out)
EDIT: I think I remember this error when there was a comma at the end of my json like so:
[
    {
        "hello": "world",
    },
]
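You can reproduce that quickly; Python's own json module also rejects trailing commas (a minimal check, not from the original post):
import json

json.loads('[{"hello": "world"}]')    # fine
json.loads('[{"hello": "world",},]')  # raises json.decoder.JSONDecodeError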

Related

How to convert json to csv python with proper headers

Trying to get JSON data to CSV, I am getting the values, but one block shows up as one line in the result; new to Python, so any help is appreciated. I have tried the code below to do this.
import pandas as pd

with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile)

df.to_csv(r'C:\Users\anath\csvfile.csv', encoding='utf-8', index=True)
Sample Json in the source file, short snippet
{
"issues": [
{
"issueId": 110052,
"revision": 84,
"definitionId": "DNS1012",
"subject": "urn:h:domain:fitestdea.com",
"subjectDomain": "fitestdea.com",
"title": "Nameserver name doesn\u0027t resolve to an IPv6 address",
"category": "DNS",
"severity": "low",
"cause": "urn:h:domain:ns1.gname.net",
"causeDomain": "ns1.gname.net",
"open": true,
"status": "active",
"auto": true,
"autoOpen": true,
"createdOn": "2022-09-01T02:29:09.681451Z",
"lastUpdated": "2022-11-23T02:26:28.785601Z",
"lastChecked": "2022-11-23T02:26:28.785601Z",
"lastConfirmed": "2022-11-23T02:26:28.785601Z",
"details": "{}"
},
{
"issueId": 77881,
"revision": 106,
"definitionId": "DNS2001",
"subject": "urn:h:domain:origin-mx.stagetest.test.com.test.com",
"subjectDomain": "origin-mx.stagetest.test.com.test.com",
"title": "Dangling domain alias (CNAME)",
"category": "DNS",
"severity": "high",
"cause": "urn:h:domain:origin-www.stagetest.test.com.test.com",
"causeDomain": "origin-www.stagetest.test.com.test.com",
"open": true,
"status": "active",
"auto": true,
"autoOpen": true,
"createdOn": "2022-08-10T09:34:36.929071Z",
"lastUpdated": "2022-11-23T09:33:32.553663Z",
"lastChecked": "2022-11-23T09:33:32.553663Z",
"lastConfirmed": "2022-11-23T09:33:32.553663Z",
"details": "{\"#type\": \"hardenize/com.hardenize.schemas.dns.DanglingProblem\", \"rrType\": \"CNAME\", \"rrDomain\": \"origin-mx.stagetest.test.com.test.com\", \"causeDomain\": \"origin-www.stagetest.test.com.test.com\", \"danglingType\": \"nxdomain\", \"rrEffectiveDomain\": \"origin-mx.stagetest.test.com.test.com\"}"
}
]
}
The output I am getting is as below. I was looking for a way to get the field names in the header and the values in columns/cells; so far the entire record lands in one cell. Also, is there a way to get only specific fields, like title, severity, or issueId, rather than everything?
Try:
import json
import pandas as pd

with open("your_file.json", "r") as f_in:
    data = json.load(f_in)

df = pd.DataFrame(data["issues"])
print(df[["title", "severity", "issueId"]])
Prints:
title severity issueId
0 Nameserver name doesn't resolve to an IPv6 address low 110052
1 Dangling domain alias (CNAME) high 77881
To save as CSV you can do:
df[["title", "severity", "issueId"]].to_csv('data.csv', index=False)
try this...
df = pd.json_normalize(inputfile)
in place of the line you have.
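Note that json_normalize expects already-parsed JSON (dicts or lists), not a file handle, so the file has to be loaded first; a rough sketch of that combination, reusing the path from the question:
import json
import pandas as pd

with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    data = json.load(inputfile)

# json_normalize flattens the list of issue dicts into columns
df = pd.json_normalize(data['issues'])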
Finally this worked for me. @Andrej Kesely, thanks for the inputs. Sharing as it might help others.
import pandas as pd
import json

with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    data = json.load(inputfile)

df = pd.DataFrame(data["issues"])
print(df[["title", "severity", "issueId"]])
df[["title", "severity", "issueId"]].to_csv('data.csv', index=False)

I am getting an error when trying to import a JSON file to MongoDB via Compass

I am on Windows 10. I recently obtained a large JSON file (200 MB) via webscraping, and I am now trying to import the file to MongoDB using Compass Community via the import data button. However, whenever I try to import the file, I get the following error:
Unexpected token l in JSON at position 0 while parsing near 'l
Here are the first few lines of the JSON file I am trying to import:
{
"bands": [{
"activity": "Split-up",
"bandMembers": ["https://www.metal-archives.com/artists/Jon_Powlowski/760544", "https://www.metal-archives.com/artists/Ruben_Martinez/760545", "https://www.metal-archives.com/artists/Greg_Eickmier/416646", "https://www.metal-archives.com/artists/Nedwob/471955"],
"bandName": "A // Solution",
"country": "United States",
"dateAdded": "2018-08-04",
"genre": "Crust Punk/Thrash Metal",
"label": {
"labelName": "Voltic Records",
"labelUrl": "https://www.metal-archives.com/labels/Voltic_Records/47794"
},
"location": "California",
"lyricalThemes": "N/A",
"releases": [{
"numReviews": 0,
"releaseName": "Butterfly",
"reviewAverage": null,
"type": "EP",
"url": "https://www.metal-archives.com/albums/A_--_Solution/Butterfly/723154",
"year": "1989"
}, {
"numReviews": 0,
"releaseName": "Things to Come",
"reviewAverage": null,
"type": "EP",
"url": "https://www.metal-archives.com/albums/A_--_Solution/Things_to_Come/723155",
"year": "1995"
}
],
"similarArtists": null,
"url": "https://www.metal-archives.com/bands/A_--_Solution/3540442600",
"yearFormed": "N/A",
"yearsActive": "N/A"
}, {
"activity": "Active",
Does anyone have an idea on how I can fix this error?
EDIT: I ran the import again after restarting Compass and got this:
Unexpected token : in JSON at position 0 while parsing near ': null,
Is this error related at all to the other one?
The import data button needs each object to be inlined (one document per line), according to https://docs.mongodb.com/compass/master/import-export/#import-data-into-a-collection.
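In other words, instead of one pretty-printed document spanning many lines, the file should hold one complete object per line, something like this (field names borrowed from the sample above, values shortened for illustration):
{"activity": "Split-up", "bandName": "A // Solution", "country": "United States"}
{"activity": "Active", "bandName": "...", "country": "..."}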
Apart from that, I had issues with the "Unexpected token : in JSON at position 0" error, and even though I could not figure out the cause yet, I tried creating a new .json file and copying the content into it, and surprisingly, it worked.
Also, remember to leave a line break at the end of the file.
To convert the JSON into a one-line format, you could use the following Python script:
import json
import sys
import codecs
import os

def read_file(name):
    with open(name, encoding='utf8') as f:
        return f.read()

def write_file(name, text):
    os.makedirs(os.path.dirname(name), exist_ok=True)
    with codecs.open(name, "w", "utf-8-sig") as temp:
        temp.writelines(text)

text = read_file(sys.argv[1])
data = json.loads(text)
# dump the parsed object (not the raw text) so the output is one line of JSON
result = json.dumps(data, ensure_ascii=False) + "\n"
write_file(sys.argv[2], result)
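Assuming you save the script as inline.py (the name is arbitrary), you would run it as:
python inline.py input.json output.json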

How to save json file into mongodb

I have Twitter account timeline data, saved per tweet in .json format, but I am unable to save the data into MongoDB.
Example: fetched data of one tweet.
{
"created_at": "Fri Apr 12 05:13:35 +0000 2019",
"id": 1116570031511359489,
"id_str": "1116570031511359489",
"full_text": "#jurafsky How can i get your video lectures related to Sentiment Analysis",
"truncated": false,
"display_text_range": [0, 73],
"entities": {
"hashtags": [],
"symbols": [],
"user_mentions": [
{
"screen_name": "jurafsky",
"name": "Dan Jurafsky",
"id": 14968475,
"id_str": "14968475",
"indices": [0, 9]
}
],
"urls": []
}
It also contains URLs and lots of other information.
I have tried the following code.
from pymongo import MongoClient
import json

client = MongoClient('localhost', 27017)
db = client.test
coll = db.dataset

with open('tweets.json') as f:
    file_data = json.loads(f.read())

coll.insert(file_data)
client.close()
Try this:
from pymongo import MongoClient
import json

client = MongoClient('localhost', 27017)
db = client.test
coll = db.dataset

with open('tweets.json') as f:
    file_data = json.load(f)

coll.insert(file_data)
client.close()
My JSON dataset was not valid; I had to merge it into one array object.
Thanks to: Can't parse json file: json.decoder.JSONDecodeError: Extra data.
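Alternatively, if the file holds one tweet per line (as Twitter timeline dumps often do), you can skip the merging step entirely; a minimal sketch, assuming newline-delimited JSON and a current PyMongo where insert_many replaces the deprecated insert:
from pymongo import MongoClient
import json

client = MongoClient('localhost', 27017)
coll = client.test.dataset

# one complete JSON object per line (NDJSON); blank lines are skipped
with open('tweets.json') as f:
    docs = [json.loads(line) for line in f if line.strip()]

coll.insert_many(docs)
client.close()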

loading a json file using python [duplicate]

I am getting some data from a JSON file "new.json", and I want to filter some data and store it into a new JSON file. Here is my code:
import json

with open('new.json') as infile:
    data = json.load(infile)

for item in data:
    iden = item.get("id")
    a = item.get("a")
    b = item.get("b")
    c = item.get("c")
    if c == 'XYZ' or "XYZ" in item["text"]:
        filename = 'abc.json'
        try:
            outfile = open(filename, 'ab')
        except:
            outfile = open(filename, 'wb')
        obj_json = {}
        obj_json["ID"] = iden
        obj_json["VAL_A"] = a
        obj_json["VAL_B"] = b
And I am getting an error, the traceback is:
File "rtfav.py", line 3, in <module>
data = json.load(infile)
File "/usr/lib64/python2.7/json/__init__.py", line 278, in load
**kw)
File "/usr/lib64/python2.7/json/__init__.py", line 326, in loads
return _default_decoder.decode(s)
File "/usr/lib64/python2.7/json/decoder.py", line 369, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 88 column 2 - line 50607 column 2 (char 3077 - 1868399)
Here is a sample of the data in new.json; there are about 1500 more such dictionaries in the file:
{
"contributors": null,
"truncated": false,
"text": "#HomeShop18 #DreamJob to professional rafter",
"in_reply_to_status_id": null,
"id": 421584490452893696,
"favorite_count": 0,
"source": "Mobile Web (M2)",
"retweeted": false,
"coordinates": null,
"entities": {
"symbols": [],
"user_mentions": [
{
"id": 183093247,
"indices": [
0,
11
],
"id_str": "183093247",
"screen_name": "HomeShop18",
"name": "HomeShop18"
}
],
"hashtags": [
{
"indices": [
12,
21
],
"text": "DreamJob"
}
],
"urls": []
},
"in_reply_to_screen_name": "HomeShop18",
"id_str": "421584490452893696",
"retweet_count": 0,
"in_reply_to_user_id": 183093247,
"favorited": false,
"user": {
"follow_request_sent": null,
"profile_use_background_image": true,
"default_profile_image": false,
"id": 2254546045,
"verified": false,
"profile_image_url_https": "https://pbs.twimg.com/profile_images/413952088880594944/rcdr59OY_normal.jpeg",
"profile_sidebar_fill_color": "171106",
"profile_text_color": "8A7302",
"followers_count": 87,
"profile_sidebar_border_color": "BCB302",
"id_str": "2254546045",
"profile_background_color": "0F0A02",
"listed_count": 1,
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"utc_offset": null,
"statuses_count": 9793,
"description": "Rafter. Rafting is what I do. Me aur mera Tablet. Technocrat of Future",
"friends_count": 231,
"location": "",
"profile_link_color": "473623",
"profile_image_url": "http://pbs.twimg.com/profile_images/413952088880594944/rcdr59OY_normal.jpeg",
"following": null,
"geo_enabled": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2254546045/1388065343",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"name": "Jayy",
"lang": "en",
"profile_background_tile": false,
"favourites_count": 41,
"screen_name": "JzayyPsingh",
"notifications": null,
"url": null,
"created_at": "Fri Dec 20 05:46:00 +0000 2013",
"contributors_enabled": false,
"time_zone": null,
"protected": false,
"default_profile": false,
"is_translator": false
},
"geo": null,
"in_reply_to_user_id_str": "183093247",
"lang": "en",
"created_at": "Fri Jan 10 10:09:09 +0000 2014",
"filter_level": "medium",
"in_reply_to_status_id_str": null,
"place": null
}
As you can see in the following example, json.loads (and json.load) does not decode multiple JSON objects:
>>> json.loads('{}')
{}
>>> json.loads('{}{}') # == json.loads(json.dumps({}) + json.dumps({}))
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Python27\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "C:\Python27\lib\json\decoder.py", line 368, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 1 column 3 - line 1 column 5 (char 2 - 4)
If you want to dump multiple dictionaries, wrap them in a list and dump the list (instead of dumping each dictionary separately):
>>> dict1 = {}
>>> dict2 = {}
>>> json.dumps([dict1, dict2])
'[{}, {}]'
>>> json.loads(json.dumps([dict1, dict2]))
[{}, {}]
Iterate over the file, loading each line as JSON in the loop:
tweets = []
for line in open('tweets.json', 'r'):
    tweets.append(json.loads(line))
This avoids storing intermediate Python objects. As long as the file was written with one full tweet per line, this should work.
I came across this because I was trying to load a JSON file dumped from MongoDB. It was giving me an error
JSONDecodeError: Extra data: line 2 column 1
The MongoDB JSON dump has one object per line, so what worked for me is:
import json
data = [json.loads(line) for line in open('data.json', 'r')]
This may also happen if your JSON file is not just 1 JSON record.
A JSON record looks like this:
[{"some data": value, "next key": "another value"}]
It opens and closes with a bracket [ ], within the brackets are the braces { }. There can be many pairs of braces, but it all ends with a close bracket ].
If your JSON file contains more than one of those:
[{"some data": value, "next key": "another value"}]
[{"2nd record data": value, "2nd record key": "another value"}]
then loads() will fail.
I verified this with my own file that was failing.
import json
guestFile = open("1_guests.json",'r')
guestData = guestFile.read()
guestFile.close()
gdfJson = json.loads(guestData)
This works because 1_guests.json has one record ([...]). The original file I was using, all_guests.json, had 6 records separated by newlines. I deleted 5 records (which I had already checked were bookended by brackets) and saved the file under a new name. Then the loads statement worked.
Error was
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 2 column 1 - line 10 column 1 (char 261900 - 6964758)
PS: I use the word record, but that's not the official name. Also, if your file has newline characters like mine, you can loop through it and loads() one record at a time into a JSON variable.
I just got the same error when my JSON file was like this:
{"id":"1101010","city_id":"1101","name":"TEUPAH SELATAN"}
{"id":"1101020","city_id":"1101","name":"SIMEULUE TIMUR"}
And I found it malformed, so I changed it to:
{
    "datas": [
        {"id": "1101010", "city_id": "1101", "name": "TEUPAH SELATAN"},
        {"id": "1101020", "city_id": "1101", "name": "SIMEULUE TIMUR"}
    ]
}
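After that change, loading it is straightforward; a small sketch, using a placeholder file name:
import json

with open('regions.json') as f:  # placeholder name for the file above
    wrapper = json.load(f)

for row in wrapper['datas']:
    print(row['id'], row['name'])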
One-liner for your problem:
data = [json.loads(line) for line in open('tweets.json', 'r')]
If you want to solve it in a two-liner you can do it like this:
with open('data.json') as f:
    data = [json.loads(line) for line in f]
The error is due to the \n symbol if you use the read() method of the file descriptor... so don't bypass the problem by using readlines() & co., but just remove that character!
import json

path = 'some_path'    # placeholder; the file contains, for example, {"c": 4} spread over multiple lines
path2 = 'some_path2'  # placeholder; where to save the merged result

new_d = {'new': 5}
with open(path, 'r') as fd:
    d_old_str = fd.read().replace('\n', '')  # remove all \n
    old_d = json.loads(d_old_str)

# update new_d (python3.9+; otherwise new_d.update(old_d))
new_d |= old_d

with open(path2, 'w') as fd:
    fd.write(json.dumps(new_d))  # save the dictionary to file (in case needed)
... and if you really really want to use readlines(), here is an alternative solution:
new_d = {'new': 5}
with open('some_path', 'r') as fd:
    d_old_str = ''.join(fd.readlines())  # concatenate the lines
    d_old = json.loads(d_old_str)
# then as above
I think saving dicts in a list, as proposed by @falsetru, is not an ideal solution here. A better way is to iterate through the dicts and save them to .json, adding a newline after each one.
Our 2 dictionaries are
d1 = {'a':1}
d2 = {'b':2}
You can write them to .json like this:
import json

with open('sample.json', 'a') as sample:
    for d in [d1, d2]:
        sample.write('{}\n'.format(json.dumps(d)))
And you can read the JSON file back without any issues:
with open('sample.json', 'r') as sample:
    for line in sample:
        line = json.loads(line.strip())
Simple and efficient
My JSON file was formatted exactly like the one in the question, but none of the solutions here worked out. Finally, I found a workaround on another Stack Overflow thread. Since this post is the first link in a Google search, I put that answer here so that other people who come to this post in the future will find it more easily.
As it's been said there, a valid JSON file needs "[" at the beginning and "]" at the end of the file. Moreover, after each JSON item, instead of "}" there must be "},". All brackets without quotation marks! That piece of code just modifies the malformed JSON file into its correct format.
https://stackoverflow.com/a/51919788/2772087
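The linked answer patches the text directly; the same result can be had by parsing and re-dumping, as in this minimal sketch (placeholder file names, and it assumes the broken file holds one complete JSON object per line):
import json

# read one JSON object per line, then re-dump them as a single valid array
with open('broken.json') as f:
    records = [json.loads(line) for line in f if line.strip()]

with open('fixed.json', 'w') as f:
    json.dump(records, f, indent=2)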
If your data is from a source outside your control, use this:
import json
from json import JSONDecodeError

def load_multi_json(line: str) -> list:
    """
    Fix some files with multiple objects on one line.
    """
    try:
        return [json.loads(line)]
    except JSONDecodeError as err:
        if err.msg == 'Extra data':
            head = [json.loads(line[0:err.pos])]
            tail = load_multi_json(line[err.pos:])  # recurse on the remainder
            return head + tail
        else:
            raise
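For example (a quick check, not part of the original answer):
objects = load_multi_json('{"a": 1}{"b": 2}{"c": 3}')
print(objects)  # [{'a': 1}, {'b': 2}, {'c': 3}]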

Unable to loop through JSON output from webservice Python

I have a web-service call (HTTP GET) that my Python script makes, which returns a JSON response. The response looks to be a list of dictionaries. The script's purpose is to iterate through each dictionary, extract each piece of metadata (e.g. "ClosePrice": "57.74"), and write each dictionary to its own row in MSSQL.
The issue is, I don't think Python is recognizing the JSON output from the API call as a list of dictionaries, and when I try a for loop, I'm getting the error "must be int, not str". I have tried converting the output to a list, dictionary, and tuple. I've also tried to make it work with a list comprehension, with no luck. Further, if I copy/paste the data from the API call and assign it to a variable, it is recognized as a list of dictionaries without issue. Any help would be appreciated. I'm using Python 2.7.
Here is the actual http call being made: http://test.kingegi.com/Api/QuerySystem/GetvalidatedForecasts?user=kingegi&market=us&startdate=08/19/13&enddate=09/12/13
Here is an abbreviated JSON output from the API call:
[
{
"Id": "521d992cb031e30afcb45c6c",
"User": "kingegi",
"Symbol": "psx",
"Company": "phillips 66",
"MarketCap": "34.89B",
"MCapCategory": "large",
"Sector": "basic materials",
"Movement": "up",
"TimeOfDay": "close",
"PredictionDate": "2013-08-29T00:00:00Z",
"Percentage": ".2-.9%",
"Latency": 37.48089483333333,
"PickPosition": 2,
"CurrentPrice": "57.10",
"ClosePrice": "57.74",
"HighPrice": null,
"LowPrice": null,
"Correct": "FALSE",
"GainedPercentage": 0,
"TimeStamp": "2013-08-28T02:31:08 778",
"ResponseMsg": "",
"Exchange": "NYSE "
},
{
"Id": "521d992db031e30afcb45c71",
"User": "kingegi",
"Symbol": "psx",
"Company": "phillips 66",
"MarketCap": "34.89B",
"MCapCategory": "large",
"Sector": "basic materials",
"Movement": "down",
"TimeOfDay": "close",
"PredictionDate": "2013-08-29T00:00:00Z",
"Percentage": "16-30%",
"Latency": 37.4807215,
"PickPosition": 1,
"CurrentPrice": "57.10",
"ClosePrice": "57.74",
"HighPrice": null,
"LowPrice": null,
"Correct": "FALSE",
"GainedPercentage": 0,
"TimeStamp": "2013-08-28T02:31:09 402",
"ResponseMsg": "",
"Exchange": "NYSE "
}
]
Small part of the code being used:
import os, sys
import subprocess
import glob
from os import path
import urllib2
import json
import time

try:
    data = urllib2.urlopen('http://api.kingegi.com/Api/QuerySystem/GetvalidatedForecasts?user=kingegi&market=us&startdate=08/10/13&enddate=09/12/13').read()
except urllib2.HTTPError, e:
    print "HTTP error: %d" % e.code
except urllib2.URLError, e:
    print "Network error: %s" % e.reason.args[1]

list_id = [x['Id'] for x in data]  # test to see if it extracts the ID from each dict
print(data)       # JSON output
print(len(data))  # should retrieve the number of dicts in the list
UPDATE
Answered my own question, here is the method below:
url = 'some url that is a list of dictionaries'  # GET call
u = urllib.urlopen(url)  # u is a file-like object
data = u.read()
newdata = json.loads(data)
print(type(newdata))  # printed data type will show as a list
print(len(newdata))   # the length of the list

newdict = newdata[1]  # each element in the list is a dict
print(type(newdict))  # this element is a dict

length = len(newdata)  # how many elements in the list
for a in range(length):  # note: range(1, length) would skip the first element
    var = newdata[a]
    print(var['Correct'], var['User'])
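For reference, the same fix in one step, still with the question's Python 2.7 urllib2 setup: json.load can parse straight from the file-like response object, so the separate read()/loads() pair isn't needed (a sketch, not the poster's code):
import json
import urllib2

response = urllib2.urlopen('http://api.kingegi.com/Api/QuerySystem/GetvalidatedForecasts?user=kingegi&market=us&startdate=08/10/13&enddate=09/12/13')
forecasts = json.load(response)  # parses to a list of dictionaries

for forecast in forecasts:
    print(forecast['Correct'], forecast['User'])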