I have the below document in MongoDB.
I am using the below Python code to save it to a .json file.
from bson.json_util import dumps  # serializes Mongo types like ObjectId

file = 'employee'
json_cur = find_document(file)
count_document = emp_collection.count_documents({})
with open(file_path, 'w') as f:
    f.write('[')
    for i, document in enumerate(json_cur, 1):
        print("document : ", document)
        f.write(dumps(document))
        if i != count_document:
            f.write(',')
    f.write(']')
The output is:
{
  "_id": {
    "$oid": "611288c262c5c14df84f649b"
  },
  "Lname": "Borg",
  "Fname": "James",
  "Dname": "Headquarters",
  "Projects": "[{"HOURS": 5.0, "PNAME": "Reorganization", "PNUMBER": 20}]"
}
But I need it like this (the Projects value without quotes):
{
  "_id": {
    "$oid": "611288c262c5c14df84f649b"
  },
  "Lname": "Borg",
  "Fname": "James",
  "Dname": "Headquarters",
  "Projects": [{"HOURS": 5.0, "PNAME": "Reorganization", "PNUMBER": 20}]
}
Could anyone please help me to resolve this?
Thanks,
Jay
You should parse the JSON from the Projects field
Like this:
from json import loads
document['Projects'] = loads(document['Projects'])
So,
from json import loads
from bson.json_util import dumps  # serializes Mongo types like ObjectId

file = 'employee'
json_cur = find_document(file)
count_document = emp_collection.count_documents({})
with open(file_path, 'w') as f:
    f.write('[')
    for i, document in enumerate(json_cur, 1):
        document['Projects'] = loads(document['Projects'])
        print("document : ", document)
        f.write(dumps(document))
        if i != count_document:
            f.write(',')
    f.write(']')
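As a side note, if the collection fits in memory, you could skip the hand-written brackets and commas and dump a single list instead. A minimal sketch, assuming the same find_document helper and file_path as in the question:

from bson.json_util import dumps  # as above, handles ObjectId etc.
from json import loads

docs = []
for document in find_document('employee'):
    document['Projects'] = loads(document['Projects'])  # parse the embedded JSON string
    docs.append(document)

with open(file_path, 'w') as f:
    f.write(dumps(docs))  # writes one valid JSON array in a single call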
I'm having a difficult time figuring out how to pull specific information from a JSON file.
So far I have this:
# Import json library
import json
# Open json database file
with open('jsondatabase.json', 'r') as f:
    data = json.load(f)
# assign variables from json data and convert to usable information
identifier = data['ID']
identifier = str(identifier)
name = data['name']
name = str(name)
# Collect data from user to compare with data in json file
print("Please enter your numerical identifier and name: ")
user_id = input("Numerical identifier: ")
user_name = input("Name: ")
if user_id == identifier and user_name == name:
    print("Your inputs matched. Congrats.")
else:
    print("Your inputs did not match our data. Please try again.")
And that works great for a simple JSON file like this:
{
    "ID": "123",
    "name": "Bobby"
}
But ideally I need to create a more complex JSON file and can't find deeper information on how to pull specific information from something like this:
{
    "Parent": [
        {
            "Parent_1": [
                {
                    "Name": "Bobby",
                    "ID": "123"
                }
            ],
            "Parent_2": [
                {
                    "Name": "Linda",
                    "ID": "321"
                }
            ]
        }
    ]
}
Here is an example that you might be able to pick apart.
You could either:
1. Make a custom de-jsonify object_hook as shown below and do something with it. There is a good tutorial here.
2. Just gobble up the whole dictionary that you get without a custom de-jsonify, drill down into it, and make a list or set of the results (a minimal sketch of this is at the end of this answer).
Example:
import json
from collections import namedtuple
data = '''
{
    "Parents": [
        {
            "Name": "Bobby",
            "ID": "123"
        },
        {
            "Name": "Linda",
            "ID": "321"
        }
    ]
}
'''
Parent = namedtuple('Parent', ['name', 'id'])
def dejsonify(json_str: dict):
    if json_str.get("Name"):
        parent = Parent(json_str.get('Name'), int(json_str.get('ID')))
        return parent
    return json_str
res = json.loads(data, object_hook=dejsonify)
print(res)
# then we can do whatever... if you need lookups by name/id,
# we could put the result into a dictionary
all_parents = {(p.name, p.id) : p for p in res['Parents']}
lookup_from_input = ('Bobby', 123)
print(f'found match: {all_parents.get(lookup_from_input)}')
Result:
{'Parents': [Parent(name='Bobby', id=123), Parent(name='Linda', id=321)]}
found match: Parent(name='Bobby', id=123)
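And for the nested layout in the original question, the plain drill-down option might look like this; a minimal sketch assuming the jsondatabase.json structure shown above:

import json

with open('jsondatabase.json', 'r') as f:
    data = json.load(f)

# "Parent" maps to a one-element list whose dict holds the Parent_N groups
for group_name, records in data['Parent'][0].items():
    for record in records:
        print(group_name, record['Name'], record['ID'])

# direct access to one record
person = data['Parent'][0]['Parent_1'][0]
print(person['Name'], person['ID'])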
I'm at a loss and my searches have gotten me nowhere.
In my seeds.rb file I have the following code:
require 'json'

jsonfile = File.open 'db/search_result2.json'
jsondata = JSON.load jsonfile
#jsondata = JSON.parse(jsonfile)

jsondata[].each do |data|
  Jobpost.create!(post: data['title'],
                  link: data['link'],
                  image: data['pagemap']['cse_image']['src'])
end
A snippet of the JSON file looks like this:
{
  "kind": "customsearch#result",
  "title": "Careers Open Positions - Databricks",
  "link": "https://databricks.com/company/careers/open-positions",
  "pagemap": {
    "cse_image": [
      {
        "src": "https://databricks.com/wp-content/uploads/2020/08/careeers-new-og-image-sept20.jpg"
      }
    ]
  }
},
Fixed jsondata[].each to jsondata.each. Now I'm getting the following error:
TypeError: no implicit conversion of String into Integer
jsondata[] says to call the [] method with no arguments on the object in the jsondata variable. Normally [] would take an index like jsondata[0] to get the first element or a start and length like jsondata[0, 5] to get the first five elements.
You want to call the each method on jsondata, so jsondata.each.
So this is very specific to what you have posted:
require 'json'
file = File.open('path_to_file.json').read
json_data = JSON.parse file
p json_data['kind'] #=> "customsearch#result"
# etc for all the other keys
Now, maybe the JSON you posted is just the first element in an array:
[
  {}, // where each {} is the json you posted
  {},
  {},
  // etc
]
In which case you will indeed have to iterate:
require 'json'
file = File.open('path_to_file.json').read
json_data = JSON.parse file
json_data.each do |data|
  p data['kind'] #=> "customsearch#result"
end
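As for the TypeError: no implicit conversion of String into Integer you hit after the .each fix: in your snippet cse_image is an array, so data['pagemap']['cse_image']['src'] tries to index an Array with a String. You most likely want data['pagemap']['cse_image'][0]['src'] in the Jobpost.create! call.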
I think I've read every Groovy parsing question on here, but I can't seem to find my exact scenario, so I'm reaching out for help. Please be kind: I'm new to Groovy and I've really bitten off more than I can chew in this latest endeavor.
So I have this XML Response:
<?xml version="1.0" encoding="UTF-8"?>
<worklogs date_from="2020-04-19 00:00:00" date_to="2020-04-25 23:59:59" number_of_worklogs="60" format="xml" diffOnly="false" errorsOnly="false" validOnly="false" addDeletedWorklogs="true" addBillingInfo="false" addIssueSummary="true" addIssueDescription="false" duration_ms="145" headerOnly="false" userName="smm288" addIssueDetails="false" addParentIssue="false" addUserDetails="true" addWorklogDetails="false" billingKey="" issueKey="" projectKey="" addApprovalStatus="true" >
  <worklog>
    <worklog_id></worklog_id>
    <jira_worklog_id></jira_worklog_id>
    <issue_id></issue_id>
    <issue_key></issue_key>
    <hours></hours>
    <work_date></work_date>
    <username></username>
    <staff_id />
    <billing_key></billing_key>
    <billing_attributes></billing_attributes>
    <activity_id></activity_id>
    <activity_name></activity_name>
    <work_description></work_description>
    <parent_key></parent_key>
    <reporter></reporter>
    <external_id />
    <external_tstamp />
    <external_hours></external_hours>
    <external_result />
    <customField_11218></customField_11218>
    <customField_12703></customField_12703>
    <customField_12707></customField_12707>
    <hash_value></hash_value>
    <issue_summary></issue_summary>
    <user_details>
      <full_name></full_name>
      <email></email>
      <user-prop key="auto_approve_timesheet"></user-prop>
      <user-prop key="cris_id"></user-prop>
      <user-prop key="iqn_gl_string"></user-prop>
      <user-prop key="is_contractor"></user-prop>
      <user-prop key="is_employee"></user-prop>
      <user-prop key="it_leadership"></user-prop>
      <user-prop key="primary_role"></user-prop>
      <user-prop key="resource_manager"></user-prop>
      <user-prop key="team"></user-prop>
    </user_details>
    <approval_status></approval_status>
    <timesheet_approval>
      <status></status>
      <status_date></status_date>
      <reviewer></reviewer>
      <actor></actor>
      <comment></comment>
    </timesheet_approval>
  </worklog>
  ....
  ....
</worklogs>
And I'm retrieving this XML response from an API call, so the response is held within an object. NOTE: the sample XML above is from Postman.
What I'm trying to do is the following:
1. Only retrieve certain values from this response from all the nodes.
2. Write the values collected to a .json file.
I've created a map but now I'm kind of stuck on how to parse through it and create a .json file out of the fields I want.
This is what I have thus far:
@Grab('org.codehaus.groovy.modules.http-builder:http-builder:0.7.1')
@Grab('oauth.signpost:signpost-core:1.2.1.2')
@Grab('oauth.signpost:signpost-commonshttp4:1.2.1.2')
import groovyx.net.http.RESTClient
import groovyx.net.http.Method
import static groovyx.net.http.ContentType.*
import groovyx.net.http.HttpResponseException
import groovy.json.JsonBuilder
import groovy.json.JsonOutput
import groovy.json.*
// User Credentials
def jiraAuth = ""
// JIRA Endpoints
def jiraUrl = "" //Dev
//def jiraUrl = "" //Production
// Tempo API Tokens
//def tempoApiToken = "" //Dev
//def tempoApiToken = "" //Production
// Define Weekly Date Range
def today = new Date()
def lastPeriodStart = today - 8
def lastPeriodEnd = today - 2
def dateFrom = lastPeriodStart.format("yyyy-MM-dd")
def dateTo = lastPeriodEnd.format("yyyy-MM-dd")
def jiraClient = new RESTClient(jiraUrl)
jiraClient.ignoreSSLIssues()

def headers = [
    "Authorization"    : "Basic " + jiraAuth,
    "X-Atlassian-token": "no-check",
    "Content-Type"     : "application/json"
]

def response = jiraClient.get(
    path: "",
    query: [
        tempoApiToken: "${tempoApiToken}",
        format: "xml",
        dateFrom: "${dateFrom}",
        dateTo: "${dateTo}",
        addUserDetails: "true",
        addApprovalStatus: "true",
        addIssueSummary: "true"
    ],
    headers: headers
) { response, worklogs ->
    println "Processing..."
    // Start building the Output - Creates a Worklog Map
    worklogs.worklog.each { worklognodes ->
        def workLog = convertToMap(worklognodes)
        // Print out the Map
        println(workLog)
    }
}

// Helper Method
def convertToMap(nodes) {
    nodes.children().collectEntries {
        if (it.name() == 'user-prop') {
            [it['@key'], it.childNodes() ? convertToMap(it) : it.text()]
        } else {
            [it.name(), it.childNodes() ? convertToMap(it) : it.text()]
        }
    }
}
I'm only interested in parsing out the following fields from each node:
<worklogs>
  <worklog>
    <hours>
    <work_date>
    <billing_key>
    <customField_11218>
    <issue_summary>
    <user_details>
      <full_name>
      <user-prop key="auto_approve_timesheet">
      <user-prop key="it_leadership">
      <user-prop key="resource_manager">
      <user-prop key="team">
      <user-prop key="cris_id">
      <user-prop key="iqn_id">
    <approval_status>
  </worklog>
  ...
</worklogs>
I've tried the following:
1. Converting the workLog map to a JSON string (JsonOutput.toJson) and then pretty-printing it (JsonOutput.prettyPrint), but this just returns a collection of JSON snippets I can't do anything with. (My thought process was: this is as good as I can get, and I'll just use a .json-to-.csv converter and get rid of what I don't want.) That is not the solution I ultimately want.
2. Printing the map workLog just returns little collections which I can't do anything with either.
3. Creating a new file using File and writing workLog out as a .json file, but again, it doesn't translate well.
The result of the println for workLog is below (just so everyone can see that the response is being held and the map matches the XML response).
[worklog_id: , jira_worklog_id: , issue_id: , issue_key: , hours: , work_date: , username: , staff_id: , billing_key: , billing_attributes: , activity_id: , activity_name: , work_description: , parent_key: , reporter: , external_id:, external_tstamp:, external_hours: , external_result:, customField_11218: , hash_value: , issue_summary: , user_details:[full_name: , email: , auto_approve_timesheet: , cris_id: , iqn_gl_approver: , iqn_gl_string: , iqn_id: , is_contractor: , is_employee: , it_leadership: , primary_role: , resource_manager: , team: ], approval_status: , timesheet_approval:[status: ]]
I would so appreciate it if anyone could offer some insights on how to move forward or even documentation that has good examples of what I'm trying to achieve (Apache's documentation is sorely lacking in examples, in my opinion).
It's not all of the way there, but I was able to get a JSON file created from the XML and the map. From there I can just use the .json file to create a .csv and then get rid of the columns I don't want.
// Define Weekly Date Range
def today = new Date()
def lastPeriodStart = today - 8
def lastPeriodEnd = today - 2
def dateFrom = lastPeriodStart.format("yyyy-MM-dd")
def dateTo = lastPeriodEnd.format("yyyy-MM-dd")

def jiraClient = new RESTClient(jiraUrl)
jiraClient.ignoreSSLIssues()

// Creates and Begins the File
File file = new File("${dateFrom}_RPT05.json")
file.write("")
file.append("[\n")

// Defines the File
def arrplace = 0
def arrsize = 0

def headers = [
    "Authorization"    : "Basic " + jiraAuth,
    "X-Atlassian-token": "no-check",
    "Content-Type"     : "application/json"
]

def response = jiraClient.get(
    path: "/plugins/servlet/tempo-getWorklog/",
    query: [
        tempoApiToken: "${tempoApiToken}",
        format: "xml",
        dateFrom: "${dateFrom}",
        dateTo: "${dateTo}",
        addUserDetails: "true",
        addApprovalStatus: "true",
        addIssueSummary: "true"
    ],
    headers: headers
) { response, worklogs ->
    println "Processing..."
    // Gets Size of Array
    worklogs.worklog.each { worklognodes ->
        arrsize = arrsize + 1
    }
    // Start Building the Output - Creates a Worklog Map
    worklogs.worklog.each { worklognodes ->
        worklognodes = convertToMap(worklognodes)
        // Convert Map to a JSON String
        def json_str = JsonOutput.toJson(worklognodes)
        // Adds Row to File
        file.append(json_str)
        arrplace = arrplace + 1
        if (arrplace < arrsize) {
            file.append(",")
        }
        file.append("\n")
        print "."
    }
}
file.append("]")

// Helper Method
def convertToMap(nodes) {
    nodes.children().collectEntries {
        if (it.name() == 'user-prop') {
            [it['@key'], it.childNodes() ? convertToMap(it) : it.text()]
        } else {
            [it.name(), it.childNodes() ? convertToMap(it) : it.text()]
        }
    }
}
The output is a collection/array of worklogs.
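If you only want the fields listed in the question, one option (untested, just a sketch of the idea) is to trim each map before serializing it, for example with Groovy's Map.subMap: JsonOutput.toJson(worklognodes.subMap(['hours', 'work_date', 'billing_key', 'customField_11218', 'issue_summary', 'user_details', 'approval_status'])). That way the .json-to-.csv step no longer has to drop columns.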
I am executing a statement in Livy Server using an HTTP POST call to localhost:8998/sessions/0/statements, with the following body:
{
    "code": "spark.sql(\"select * from test_table limit 10\")"
}
I would like an answer in the following format:
(...)
"data": {
"application/json": "[
{"id": "123", "init_date": 1481649345, ...},
{"id": "133", "init_date": 1481649333, ...},
{"id": "155", "init_date": 1481642153, ...},
]"
}
(...)
but what I'm getting is
(...)
"data": {
"text/plain": "res0: org.apache.spark.sql.DataFrame = [id: string, init_date: timestamp ... 64 more fields]"
}
(...)
Which is the toString() version of the DataFrame.
Is there some way to return a dataframe as JSON using the Livy Server?
EDIT
Found a JIRA issue that addresses the problem: https://issues.cloudera.org/browse/LIVY-72
Judging by the comments, can one say that Livy does not and will not support such a feature?
I recommend using the built-in (albeit hard to find documentation for) magics %json and %table:
%json
import json
import textwrap
import requests

session_url = host + "/sessions/1"
statements_url = session_url + '/statements'

data = {
    'code': textwrap.dedent("""\
        val d = spark.sql("SELECT COUNT(DISTINCT food_item) FROM food_item_tbl")
        val e = d.collect
        %json e
        """)}

r = requests.post(statements_url, data=json.dumps(data), headers=headers)
print(r.json())
%table
session_url = host + "/sessions/21"
statements_url = session_url + '/statements'

data = {
    'code': textwrap.dedent("""\
        val x = List((1, "a", 0.12), (3, "b", 0.63))
        %table x
        """)}

r = requests.post(statements_url, data=json.dumps(data), headers=headers)
print(r.json())
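If I remember correctly, %json puts the result under the application/json key of the statement's output data, and %table under application/vnd.livy.table.v1+json, which is essentially the shape asked for above. (Here host and headers are placeholders for your Livy endpoint and the usual Content-Type: application/json header.)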
Related: Apache Livy: query Spark SQL via REST: possible?
I don't have a lot of experience with Livy, but as far as I know this endpoint is used as an interactive shell, and the output will be a string with the actual result that a shell would show. So, with that in mind, I can think of a way to emulate the result you want, though it may not be the best way to do it:
{
    "code": "println(spark.sql(\"select * from test_table limit 10\").toJSON.collect.mkString(\"[\", \",\", \"]\"))"
}
Then, you will have a JSON wrapped in a string, so your client could parse it.
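For completeness, a minimal client-side sketch of that round trip, assuming a local Livy at port 8998 and an already-idle session 0 (the statement URL comes back in the response's location header):

import json
import time
import requests

host = 'http://localhost:8998'  # assumed Livy endpoint
headers = {'Content-Type': 'application/json'}

code = ('println(spark.sql("select * from test_table limit 10")'
        '.toJSON.collect.mkString("[", ",", "]"))')
r = requests.post(host + '/sessions/0/statements',
                  data=json.dumps({'code': code}), headers=headers)
statement_url = host + r.headers['location']

# Poll until the statement finishes, then parse the printed JSON string
while True:
    result = requests.get(statement_url, headers=headers).json()
    if result['state'] == 'available':
        break
    time.sleep(1)

rows = json.loads(result['output']['data']['text/plain'])
print(rows[0])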
I think in general your best bet is to write your output to a database of some kind. If you write to a randomly named table, you could have your code read it after the script is done.
I use this code to get tweets from the feed, which I write to a file.
When I read the file back and try to parse the lines as JSON, I always get an error.
import os
import json  # used when reading the file back

def SearchTwt(api):
    os.chdir('/Users/me/Desktop')
    SearchResult = api.search(q='market', lang='en', rpp=20)
    text_file = open("TweetOut.txt", "w")
    for tw in SearchResult:
        text_file.write(str(tw))
        print(str(tw))
    text_file.close()
I read the file with:
def readfile():
    tweets_data = []
    os.chdir('/Users/me/Desktop')
    file = open("TweetOut.txt", "r")
    for line in file:
        parts = line.split("Status(")
        print(len(parts))
        for part in parts:
            tweet = 'Status(' + part
            if len(tweet) > 10:
                tweetj = json.loads(tweet)
                #tweets_data.append(tweet)
                print(tweet)
    file.close()
Maybe it is wrong to fill the file with str(tw)? I rebuild the string during reading because I thought each tweet started with 'Status('. So maybe that is another mistake.
I tried a lot of other options.
The error:
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The file starts like this (URLs edited as required by Stack Overflow):
Status(source='SocialFlow', id=757991135465857024, in_reply_to_status_id=None, is_quote_status=False, entities={'hashtags': [], 'user_mentions': [], 'symbols': [], 'urls': [{'url': '', 'expanded_url': '', 'display_url':
The file is not valid JSON. It should be something like:
{
    "source": "SocialFlow",
    "id": "757991135465857024",
    ...
    "entities": {
        "hashtags": [],
        "user_mentions": [],
        ...
    }
}
Because it is not valid JSON, you either have to parse it a different way, or be sure to write it out as JSON when you save the file.
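One way to do the latter: Tweepy's Status objects carry the raw API payload in a ._json attribute, so you can write real JSON (one object per line) and read it back with json.loads. A minimal sketch, assuming the same search call as in the question:

import json
import os

def SearchTwt(api):
    os.chdir('/Users/me/Desktop')
    results = api.search(q='market', lang='en', rpp=20)
    with open("TweetOut.txt", "w") as f:
        for tw in results:
            f.write(json.dumps(tw._json) + "\n")  # raw payload, valid JSON per line

def readfile():
    os.chdir('/Users/me/Desktop')
    with open("TweetOut.txt") as f:
        return [json.loads(line) for line in f if line.strip()]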