How to save json file into mongodb - json

I have Twitter account timeline data, one tweet per record, saved in .json format, but I am unable to save the data into MongoDB.
Example: fetched data of one tweet.
{
"created_at": "Fri Apr 12 05:13:35 +0000 2019",
"id": 1116570031511359489,
"id_str": "1116570031511359489",
"full_text": "#jurafsky How can i get your video lectures related to Sentiment Analysis",
"truncated": false,
"display_text_range": [0, 73],
"entities": {
"hashtags": [],
"symbols": [],
"user_mentions": [
{
"screen_name": "jurafsky",
"name": "Dan Jurafsky",
"id": 14968475,
"id_str": "14968475",
"indices": [0, 9]
}
],
"urls": []
}
It also contains URLs and lots of other information.
I have tried the following code.
from pymongo import MongoClient
import json

# NOTE(review): MongoDB's default port is 27017 -- 27107 looks like a typo,
# so this client may never reach the server. TODO confirm the mongod port.
client=MongoClient('localhost',27107)
db=client.test
coll=db.dataset

with open('tweets.json') as f:
    file_data=json.loads(f.read())
    # NOTE(review): Collection.insert() is deprecated (removed in modern
    # PyMongo); insert_one()/insert_many() are the supported APIs.
    coll.insert(file_data)

client.close()

Try this:
from pymongo import MongoClient
import json

# MongoDB's default port is 27017 -- the original 27107 is a typo that makes
# every operation fail with a connection error.
client = MongoClient('localhost', 27017)
db = client.test
coll = db.dataset

# json.load(f) parses straight from the file object.
with open('tweets.json') as f:
    file_data = json.load(f)

# Collection.insert() was removed from PyMongo; insert_many() takes a list
# of documents, insert_one() takes a single document.
if isinstance(file_data, list):
    coll.insert_many(file_data)
else:
    coll.insert_one(file_data)

client.close()

My json dataset was not valid, I have to merge it to one array object
Thanks to: Can't parse json file: json.decoder.JSONDecodeError: Extra data.

Related

multiple object of an array creates different columns in the CSV file

Here is my JSON example. When I convert the JSON to a CSV file, it creates separate columns for each object of the reviews array, with column names like serial, name.0, rating.0, _id.0, name.1, rating.1, _id.1. How can I convert it to a CSV file where only serial, name, rating, and _id are the column names, and each object of reviews is put in its own row?
`
[{
"serial": "63708940a8d291c502be815f",
"reviews": [
{
"name": "shadman",
"rating": 4,
"_id":"6373d4eb50cff661989f3d83"
},
{
"name": "niloy1",
"rating": 3,
"_id": "6373d59450cff661989f3db8"
},
],
}]
`
I am trying to load the CSV file into pandas. If that is not possible, is there any way to solve the problem using the pandas package in Python?
I suggest you use pandas for the CSV export only and process the json data by flattening the data structure first so that the result can then be easily loaded in a Pandas DataFrame.
Try:
# Example data: one outer record holding a list of review dicts.
data_python = [{
    "serial": "63708940a8d291c502be815f",
    "reviews": [
        {
            "name": "shadman",
            "rating": 4,
            "_id": "6373d4eb50cff661989f3d83"
        },
        {
            "name": "niloy1",
            "rating": 3,
            "_id": "6373d59450cff661989f3db8"
        },
    ],
}]

from collections import defaultdict
from pprint import pprint
import pandas as pd

# Flatten the nested structure: repeat the parent "serial" once per review
# and collect every review field into its own column list.
dct_flat = defaultdict(list)
for record in data_python:
    parent_serial = record['serial']
    for review in record["reviews"]:
        dct_flat['serial'].append(parent_serial)
        for field, field_value in review.items():
            dct_flat[field].append(field_value)

# One row per review, one column per field.
df = pd.DataFrame(dct_flat)
print(df)
df.to_csv("data.csv")
which gives:
serial name rating _id
0 63708940a8d291c502be815f shadman 4 6373d4eb50cff661989f3d83
1 63708940a8d291c502be815f niloy1 3 6373d59450cff661989f3db8
and
,serial,name,rating,_id
0,63708940a8d291c502be815f,shadman,4,6373d4eb50cff661989f3d83
1,63708940a8d291c502be815f,niloy1,3,6373d59450cff661989f3db8
as CSV file content.
Notice that the json you provided in your question can't be loaded from file or string in Python neither using Python json module nor using Pandas because it is not valid json code. See below for corrected valid json data:
# Same data as above, but as valid JSON: the trailing commas after the last
# array element and the last object member have been removed.
valid_json_data='''\
[{
"serial": "63708940a8d291c502be815f",
"reviews": [
{
"name": "shadman",
"rating": 4,
"_id":"6373d4eb50cff661989f3d83"
},
{
"name": "niloy1",
"rating": 3,
"_id": "6373d59450cff661989f3db8"
}
]
}]
'''
and code for loading this data from json file:
import json

# Load the corrected JSON document from disk, then parse it into Python
# objects (a list with one dict).
json_file = "data.json"
with open(json_file) as f:
    raw_text = f.read()
data_python = json.loads(raw_text)

How to convert json to csv python with proper headers

I am trying to convert JSON data to CSV. I am getting the values, but one whole block appears as a single line in the result. I am new to Python, so any help is appreciated. I have tried the code below.
import pandas as pd

# NOTE(review): read_json keeps the nested document as-is, so each element of
# the "issues" array presumably lands in a single cell rather than being
# spread over columns -- which matches the symptom described above.
with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile)

df.to_csv(r'C:\Users\anath\csvfile.csv', encoding='utf-8', index=True)
Sample Json in the source file, short snippet
{
"issues": [
{
"issueId": 110052,
"revision": 84,
"definitionId": "DNS1012",
"subject": "urn:h:domain:fitestdea.com",
"subjectDomain": "fitestdea.com",
"title": "Nameserver name doesn\u0027t resolve to an IPv6 address",
"category": "DNS",
"severity": "low",
"cause": "urn:h:domain:ns1.gname.net",
"causeDomain": "ns1.gname.net",
"open": true,
"status": "active",
"auto": true,
"autoOpen": true,
"createdOn": "2022-09-01T02:29:09.681451Z",
"lastUpdated": "2022-11-23T02:26:28.785601Z",
"lastChecked": "2022-11-23T02:26:28.785601Z",
"lastConfirmed": "2022-11-23T02:26:28.785601Z",
"details": "{}"
},
{
"issueId": 77881,
"revision": 106,
"definitionId": "DNS2001",
"subject": "urn:h:domain:origin-mx.stagetest.test.com.test.com",
"subjectDomain": "origin-mx.stagetest.test.com.test.com",
"title": "Dangling domain alias (CNAME)",
"category": "DNS",
"severity": "high",
"cause": "urn:h:domain:origin-www.stagetest.test.com.test.com",
"causeDomain": "origin-www.stagetest.test.com.test.com",
"open": true,
"status": "active",
"auto": true,
"autoOpen": true,
"createdOn": "2022-08-10T09:34:36.929071Z",
"lastUpdated": "2022-11-23T09:33:32.553663Z",
"lastChecked": "2022-11-23T09:33:32.553663Z",
"lastConfirmed": "2022-11-23T09:33:32.553663Z",
"details": "{\"#type\": \"hardenize/com.hardenize.schemas.dns.DanglingProblem\", \"rrType\": \"CNAME\", \"rrDomain\": \"origin-mx.stagetest.test.com.test.com\", \"causeDomain\": \"origin-www.stagetest.test.com.test.com\", \"danglingType\": \"nxdomain\", \"rrEffectiveDomain\": \"origin-mx.stagetest.test.com.test.com\"}"
}
]
}
The output I am getting is shown below. I was looking for a way to have the field names as headers and the values in cells; so far the entire record ends up in one cell. Is there a way to get only the specific fields I need, such as title, severity, or issueId, instead of everything?
Try:
import json
import pandas as pd

# Parse the whole document first, then build one DataFrame row per element
# of the "issues" array.
with open("your_file.json", "r") as f_in:
    data = json.load(f_in)

df = pd.DataFrame(data["issues"])

# Show only the columns of interest.
columns_of_interest = ["title", "severity", "issueId"]
print(df[columns_of_interest])
Prints:
title severity issueId
0 Nameserver name doesn't resolve to an IPv6 address low 110052
1 Dangling domain alias (CNAME) high 77881
To save as CSV you can do:
# Export just the selected columns; index=False drops the row-number column.
selected = df[["title", "severity", "issueId"]]
selected.to_csv('data.csv', index=False)
try this...
# NOTE(review): pd.json_normalize expects already-parsed data (a dict or a
# list of dicts), not an open file handle -- parse with json.load first,
# otherwise this call will not work as written. TODO confirm intent.
df = pd.json_normalize(inputfile)
in place of the line you have.
Finally, this worked for me. @Andrej Kesely, thanks for the input. Sharing in case it helps others.
import pandas as pd
import json

# Parse the raw JSON ourselves, then keep only the "issues" array.
with open(r'C:\Users\anath\hard.json', encoding='utf-8') as inputfile:
    data = json.load(inputfile)

# One DataFrame row per issue; preview and export just the wanted fields.
wanted = ["title", "severity", "issueId"]
df = pd.DataFrame(data["issues"])
print(df[wanted])
df[wanted].to_csv('data.csv', index=False)

Python error: Extra data: line 1 in loading a big Json file

I am trying to read a JSON file which is 370 MB
import json

# Parse the file inside a context manager so the handle is closed even if
# decoding fails -- the original opened the file and never closed it, and
# also discarded the parsed result.
with open("data.json", "r") as data_file:
    data = json.load(data_file)
and it's not possible to easily find the root cause of the following error,
json.decoder.JSONDecodeError: Extra data: line 1 column 1024109 (char 1024108)
I looked at similar questions and tried the following StackOverflow answer
import json

# Treat the file as JSON Lines: one complete JSON document per physical
# line. Use `with` so the file handle is closed deterministically -- the
# original opened the file inline and relied on the GC to close it.
with open('data.json', 'r') as fh:
    data = [json.loads(line) for line in fh]
But it didn't resolve the issue. I am wondering if there is any solution to find where the error happens in the file. I am getting some other files from the same source and they run without any problem.
A small piece of the Json file is a list of dicts like,
{
"uri": "p",
"source": {
"uri": "dail",
"dataType": "pr",
"title": "Daily"
},
"authors": [
{
"type": "author",
"isAgency": false
}
],
"concepts": [
{
"amb": false,
"imp": true,
"date": "2019-05-23",
"textStart": 2459,
"textEnd": 2467
},
{
"amb": false,
"imp": true,
"date": "2019-05-09",
"textStart": 2684,
"textEnd": 2691
}
],
"shares": {},
"wgt": 100,
"relevance": 100
}
The problem with the json library is that it loads everything into memory and parses it in full before it can be handled, which is clearly problematic for such a large amount of data.
Instead I would suggest to take a look at https://github.com/henu/bigjson
import bigjson

# bigjson parses lazily instead of materialising the whole document, so a
# 370 MB file does not need to fit in memory at once.
# NOTE(review): the file is opened in binary mode ('rb'), which is what the
# bigjson API expects -- confirm against the library's README.
with open('data.json', 'rb') as f:
    json_data = bigjson.load(f)

I am getting an error when trying to import a JSON file to MongoDB via Compass

I am on Windows 10. I recently obtained a large JSON file (200 MB) via webscraping, and I am now trying to import the file to MongoDB using Compass Community via the import data button. However, whenever I try to import the file, I get the following error:
Unexpected token l in JSON at position 0 while parsing near 'l
Here are the first few lines of the JSON file I am trying to import:
{
"bands": [{
"activity": "Split-up",
"bandMembers": ["https://www.metal-archives.com/artists/Jon_Powlowski/760544", "https://www.metal-archives.com/artists/Ruben_Martinez/760545", "https://www.metal-archives.com/artists/Greg_Eickmier/416646", "https://www.metal-archives.com/artists/Nedwob/471955"],
"bandName": "A // Solution",
"country": "United States",
"dateAdded": "2018-08-04",
"genre": "Crust Punk/Thrash Metal",
"label": {
"labelName": "Voltic Records",
"labelUrl": "https://www.metal-archives.com/labels/Voltic_Records/47794"
},
"location": "California",
"lyricalThemes": "N/A",
"releases": [{
"numReviews": 0,
"releaseName": "Butterfly",
"reviewAverage": null,
"type": "EP",
"url": "https://www.metal-archives.com/albums/A_--_Solution/Butterfly/723154",
"year": "1989"
}, {
"numReviews": 0,
"releaseName": "Things to Come",
"reviewAverage": null,
"type": "EP",
"url": "https://www.metal-archives.com/albums/A_--_Solution/Things_to_Come/723155",
"year": "1995"
}
],
"similarArtists": null,
"url": "https://www.metal-archives.com/bands/A_--_Solution/3540442600",
"yearFormed": "N/A",
"yearsActive": "N/A"
}, {
"activity": "Active",
Does anyone have an idea on how I can fix this error?
EDIT: I ran the import again after restarting Compass and got this:
Unexpected token : in JSON at position 0 while parsing near ': null,
Is this error related at all to the other one?
The import data button needs the object to be inlined according to https://docs.mongodb.com/compass/master/import-export/#import-data-into-a-collection.
Apart from that, I had issues with the "Unexpected token : in JSON at position 0", and even tho I could not figure out the cause yet, I tried creating a new .json and copying the content into it, and surprisingly, it worked.
Also, remember to leave a line break at the end of the file.
To convert the json into a 1 line format, you could use the following python script:
import json
import sys
import codecs
import os


def read_file(name):
    """Return the full text content of file *name* (assumed UTF-8)."""
    with open(name, encoding='utf8') as f:
        return f.read()


def write_file(name, text):
    """Write *text* to *name* as UTF-8 with BOM, creating parent dirs.

    The original called os.makedirs() unconditionally, which raises
    FileNotFoundError when *name* has no directory part ('' path).
    """
    parent = os.path.dirname(name)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with codecs.open(name, "w", "utf-8-sig") as temp:
        temp.write(text)


def convert(src, dst):
    """Re-serialize the JSON document in *src* as a single line into *dst*.

    Bug fix: the original dumped the raw *text* back out, which produces a
    JSON *string literal* (the whole document wrapped in quotes with escaped
    newlines) instead of the compact one-line document. Dumping the parsed
    *data* is the correct behavior.
    """
    text = read_file(src)
    data = json.loads(text)
    result = json.dumps(data, ensure_ascii=False) + "\n"
    write_file(dst, result)


if __name__ == "__main__":
    # Guarded so importing this module does not require CLI arguments.
    convert(sys.argv[1], sys.argv[2])

Parsing and cleaning text file in Python?

I have a text file which contains raw data. I want to parse that data and clean it so that it can be used further.The following is the rawdata.
"{\x0A \x22identifier\x22: {\x0A \x22company_code\x22: \x22TSC\x22,\x0A \x22product_type\x22: \x22airtime-ctg\x22,\x0A \x22host_type\x22: \x22android\x22\x0A },\x0A \x22id\x22: {\x0A \x22type\x22: \x22guest\x22,\x0A \x22group\x22: \x22guest\x22,\x0A \x22uuid\x22: \x221a0d4d6e-0c00-11e7-a16f-0242ac110002\x22,\x0A \x22device_id\x22: \x22423e49efa4b8b013\x22\x0A },\x0A \x22stats\x22: [\x0A {\x0A \x22timestamp\x22: \x222017-03-22T03:21:11+0000\x22,\x0A \x22software_id\x22: \x22A-ACTG\x22,\x0A \x22action_id\x22: \x22open_app\x22,\x0A \x22values\x22: {\x0A \x22device_id\x22: \x22423e49efa4b8b013\x22,\x0A \x22language\x22: \x22en\x22\x0A }\x0A }\x0A ]\x0A}"
I want to remove all the hexadecimal escape characters. I tried parsing the data, storing it in an array, and cleaning it using re.sub(), but it gives back the same data.
# NOTE(review): `f` (the input file) and `data` (the output list) are
# defined outside this snippet.
for line in f:
    # The pattern matches a non-ASCII character followed by `,"` -- but the
    # raw text contains only literal ASCII escape sequences such as \x22, so
    # the pattern never matches and each line comes back unchanged (the
    # symptom described above). Parsing with json.loads() is the robust fix.
    new_data = re.sub(r'[^\x00-\x7f],\x22',r'', line)
    data.append(new_data)
\x0A is the hex code for newline. After s = <your json string>, print(s) gives
>>> print(s)
{
"identifier": {
"company_code": "TSC",
"product_type": "airtime-ctg",
"host_type": "android"
},
"id": {
"type": "guest",
"group": "guest",
"uuid": "1a0d4d6e-0c00-11e7-a16f-0242ac110002",
"device_id": "423e49efa4b8b013"
},
"stats": [
{
"timestamp": "2017-03-22T03:21:11+0000",
"software_id": "A-ACTG",
"action_id": "open_app",
"values": {
"device_id": "423e49efa4b8b013",
"language": "en"
}
}
]
}
You should parse this with the json module load (from file) or loads (from string) functions. You will get a dict with 2 dicts and a list with a dict.