I am trying to return a list of SQLAlchemy rows and output it from a FastAPI endpoint. Each row in the list consists of an actor's name and that actor's total number of lines from a show. I believe the query itself is correct, but when I view the output from the endpoint, one of the columns is missing for some reason.
The query function:
def get_actors(db: Session, detailed: bool = False) -> list[str]:
    """Return a list of actors and their total lines from the show"""
    if not detailed:
        query = (
            db.query(models.Script.actor, func.count(models.Script.detail))  # (<actor name>, <total lines>)
            .filter(models.Script.actor.isnot(None))  # Skip null actors.
            .group_by(models.Script.actor)  # Group unique actors.
            .order_by(func.count(models.Script.detail).desc())  # Sort by most to least lines.
        )
        actors_list = [actor for actor in query]
        print("ACTORS LIST:", actors_list)  # Debug print, looks fine.
        return actors_list
...
The FastAPI endpoint:
@holy_api.get("/actors", response_class=PrettyJSONResponse)
def get_actors(detailed: bool = False, db: Session = Depends(get_db)):
    """Get a list of all the actors from the show with their total lines, optionally detailed view"""
    return crud.get_actors(db, detailed=detailed)
Now, when I open up /actors, the terminal prints the debug output, which looks like:
ACTORS LIST: [('Michael Palin', 2454), ('Eric Idle', 2107), ('John Cleese', 2044), ('Graham Chapman', 1848), ('Terry Jones', 1801), ('Carol Cleveland', 277), ('Terry Gilliam', 85), ('Terry\nJones', 35), ('Neil Innes', 12), ('Ian Davidson', 8), ('Connie Booth', 5), ('Katya Wyeth', 4), ('Rita Davies', 3), ('Marjorie Wilde', 3), ('Donna Reading', 2), ('Nicki Howorth', 1), ('Julia Breck', 1), ('Caron Gardener', 1)]
INFO: 127.0.0.1:54057 - "GET /actors HTTP/1.1" 200 OK
This is exactly what I need. But the actual JSON response looks like:
[
    {
        "actor": "Michael Palin"
    },
    {
        "actor": "Eric Idle"
    },
    {
        "actor": "John Cleese"
    },
    {
        "actor": "Graham Chapman"
    },
    {
        "actor": "Terry Jones"
    },
    {
        "actor": "Carol Cleveland"
    },
    ...
]
The total lines aren't shown next to each actor. Why? I'm not sure whether this is a problem with FastAPI not serializing the response properly, a SQLAlchemy thing, or me doing something plainly wrong.
Wow! Shortly after posting this question, I fixed it. All I had to do was attach a label to the aggregate count function: func.count(models.Script.detail).label("total_lines"). I've been on this problem for hours and I feel really, really dumb right now.
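For anyone who hits the same thing, here is a sketch of the fixed query; only the .label(...) call is new, everything else is unchanged from the function above:

query = (
    db.query(
        models.Script.actor,
        func.count(models.Script.detail).label("total_lines"),  # label the aggregate so the column keeps a name
    )
    .filter(models.Script.actor.isnot(None))
    .group_by(models.Script.actor)
    .order_by(func.count(models.Script.detail).desc())
)

With the label in place, each row serializes with both an "actor" and a "total_lines" key.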
I have a JSON object like this:
mytestdata = {
    "success": True,
    "message": "",
    "data": {
        "totalCount": 95,
        "goal": [
            {
                "user_id": 123455,
                "user_email": "john.smith@test.com",
                "user_first_name": "John",
                "user_last_name": "Smith",
                "people_goals": [
                    {
                        "goal_id": 545555,
                        "goal_name": "test goal name",
                        "goal_owner": "123455",
                        "goal_narrative": "",
                        "goal_type": {
                            "id": 1,
                            "name": "Team"
                        },
                        "goal_create_at": "1595874095",
                        "goal_modified_at": "1595874095",
                        "goal_created_by": "123455",
                        "goal_updated_by": "123455",
                        "goal_start_date": "1593561600",
                        "goal_target_date": "1601424000",
                        "goal_progress": "34",
                        "goal_progress_color": "#ff9933",
                        "goal_status": "1",
                        "goal_permission": "internal,team",
                        "goal_category": [],
                        "goal_owner_full_name": "John Smith",
                        "goal_team_id": "766754",
                        "goal_team_name": "",
                        "goal_workstreams": []
                    }
                ]
            }
        ]
    }
}
I am trying to display all the details in "people_goals" along with "user_last_name", "user_first_name", "user_email", and "user_id" using json_normalize.
So far I am able to display "people_goals", "user_first_name", and "user_email" with this code:
df2 = pd.json_normalize(data=mytestdata['data'], record_path=['goal', 'people_goals'],
                        meta=[['goal', 'user_first_name'], ['goal', 'user_last_name'], ['goal', 'user_email']],
                        errors='ignore')
However, I am having an issue when trying to include ['goal', 'user_id'] in the meta=[] list.
The error is:
TypeError Traceback (most recent call last)
<ipython-input-192-b7a124a075a0> in <module>
7 df2 = pd.json_normalize(data=mytestdata['data'], record_path=['goal', 'people_goals'],
8 meta=[['goal','user_first_name'], ['goal','user_last_name'], ['goal','user_email'], ['goal','user_id']],
----> 9 errors='ignore')
10
11 # df2 = pd.json_normalize(data=mytestdata['data'], record_path=['goal', 'people_goals'])
The only difference I see for 'user_id' is that it is not a string.
Am I missing something here?
Your code works on my platform. That said, I've migrated away from using the record_path and meta parameters, for two reasons: a) they are difficult to work out, and b) there are compatibility issues between versions of pandas.
Therefore I now use the approach of calling json_normalize() multiple times to progressively expand the JSON, or using pd.Series. I've included both as examples.
# Approach 1: expand the nested columns step by step with explode() and apply(pd.Series).
df = pd.json_normalize(data=mytestdata['data']).explode("goal")
df = pd.concat([df, df["goal"].apply(pd.Series)], axis=1).drop(columns="goal").explode("people_goals")
df = pd.concat([df, df["people_goals"].apply(pd.Series)], axis=1).drop(columns="people_goals")
df = pd.concat([df, df["goal_type"].apply(pd.Series)], axis=1).drop(columns="goal_type")
df.T

# Approach 2: call json_normalize() repeatedly, round-tripping through to_dict(orient="records").
df2 = pd.json_normalize(pd.json_normalize(
    pd.json_normalize(data=mytestdata['data']).explode("goal").to_dict(orient="records")
).explode("goal.people_goals").to_dict(orient="records"))
df2.T
print(df.T.to_string())
output
0
totalCount 95
user_id 123455
user_email john.smith@test.com
user_first_name John
user_last_name Smith
goal_id 545555
goal_name test goal name
goal_owner 123455
goal_narrative
goal_create_at 1595874095
goal_modified_at 1595874095
goal_created_by 123455
goal_updated_by 123455
goal_start_date 1593561600
goal_target_date 1601424000
goal_progress 34
goal_progress_color #ff9933
goal_status 1
goal_permission internal,team
goal_category []
goal_owner_full_name John Smith
goal_team_id 766754
goal_team_name
goal_workstreams []
id 1
name Team
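Worth noting: in df above, user_id ends up as an ordinary column (you can see it in the transposed output), so selecting just the fields asked about is then straightforward, for example:

# Hypothetical selection of the requested user fields plus the expanded goal columns.
user_cols = ["user_id", "user_email", "user_first_name", "user_last_name"]
goal_cols = [c for c in df.columns if c.startswith("goal_")]
print(df[user_cols + goal_cols].T.to_string())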
I have a hierarchy of tables in a MySQL 5.6 database that I need to query into a JSON format for use by a JavaScript tree structure.
Just as a test, in my Flask app I did the following for just the top level:
def get_all_customers():
    response_object = {'status': 'success'}
    cnx = mysql.connector.connect(user="", password="", database="", host="localhost", port=3306)
    cursor = cnx.cursor()
    cursor.execute('SELECT idx, name FROM listcustomers ORDER BY name')
    data = []
    for idx, name in cursor:
        data.append({'id': idx, 'label': name, 'otherProp': "Customer"})
    response_object['customers'] = data
    return jsonify(response_object)
which returns
[
    {
        id: 1,
        label: "customer 1",
        otherProp: "Customer"
    },
    ...
]
But each customer has locations, each location has areas, each area has assets, and each asset has projects, and I need to also query those into children of this JSON object. So, for example, just going one level deeper to locations, I would need something like this:
[
    {
        id: 1,
        label: "customer 1",
        otherProp: "Customer",
        children: [
            {
                id: 5,
                label: "location 5",
                otherProp: "Location"
            },
            ...
        ]
    },
    ...
]
where in my database listlocations links to listcustomers via its parentCustomerId column. How can I manage this? Eventually this tree will have about 13,000 objects, so I know that just querying all the data and then assembling it with Python would be far less efficient than querying it properly to begin with.
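To make the target shape concrete, here is a rough sketch of the naive two-level version done entirely in Python; this is the approach I'm worried won't scale well, and it assumes listlocations has idx and name columns like listcustomers does:

def get_customer_tree():
    response_object = {'status': 'success'}
    cnx = mysql.connector.connect(user="", password="", database="", host="localhost", port=3306)
    cursor = cnx.cursor()

    # Top level: one node per customer, keyed by idx so children can be attached.
    cursor.execute('SELECT idx, name FROM listcustomers ORDER BY name')
    customers = {idx: {'id': idx, 'label': name, 'otherProp': 'Customer', 'children': []}
                 for idx, name in cursor}

    # Second level: attach each location to its parent customer.
    cursor.execute('SELECT idx, name, parentCustomerId FROM listlocations')
    for idx, name, parent_id in cursor:
        if parent_id in customers:
            customers[parent_id]['children'].append(
                {'id': idx, 'label': name, 'otherProp': 'Location'})

    response_object['customers'] = list(customers.values())
    return jsonify(response_object)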
1) There is a CSV file containing the following information (the first row is the header):
first,second,third,total
1,4,9,14
7,5,2,14
3,8,7,18
2) I would like to find the sum of individual rows and generate a final file with a modified header. The final file should look like this:
[
    {
        "first": 1,
        "second": 4,
        "third": 9,
        "total": 14
    },
    {
        "first": 7,
        "second": 5,
        "third": 2,
        "total": 14
    },
    {
        "first": 3,
        "second": 8,
        "third": 7,
        "total": 18
    }
]
But it does not work, and I am not sure how to fix it. Can anyone help me understand how to approach this problem?
NiFi flow:
Although I'm not into Python, just by googling around I think this might do it:
import csv
import json

# Read the CSV into a list of dicts keyed by the header row, converting the
# values to int so the JSON output contains numbers rather than strings.
with open("YOURFILE.csv") as f:
    reader = csv.DictReader(f)
    data = [{k: int(v) for k, v in row.items()} for row in reader]

with open('result.json', 'w') as outfile:
    json.dump(data, outfile, indent=4)
You can use the QueryRecord processor and add a new property named
total
with the value
SELECT first, second, third, first + second + third AS total FROM FLOWFILE
Configure the CsvReader controller service with a matching Avro schema that uses int as the datatype for all of the fields, and configure a JsonRecordSetWriter controller service. Include the total field name so that the output from the QueryRecord processor contains all of the columns plus the sum of the columns as total.
Connect the total relationship from the QueryRecord processor for further processing.
Refer to the documentation on QueryRecord and on configuring the Record Reader/Writer controller services.
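For reference, the CsvReader's Avro schema mentioned above could look something along these lines (the record name is arbitrary; the field names come from the sample CSV):

{
  "type": "record",
  "name": "row",
  "fields": [
    { "name": "first",  "type": "int" },
    { "name": "second", "type": "int" },
    { "name": "third",  "type": "int" },
    { "name": "total",  "type": "int" }
  ]
}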
I'm trying to write a Python script that reads a .csv file and rearranges the values into a specific .json format/data structure that I can then import into MongoDB. I'm using pedestrian data as my dataset, and there are over a million entries with redundant data. I'm stuck on writing the actual script and translating the data into my desired .json format.
data.csv (raw):
Id,Date_Time,Year,Month,Mdate,Day,Time,Sensor_ID,Sensor_Name,Hourly_Counts
1, 01-JUN-2009 00:00,2009,June,1,Monday,0,4,Town Hall (West),194
2, 01-JUN-2009 00:00,2009,June,1,Monday,0,17,Collins Place (South),21
3, 01-JUN-2009 00:00,2009,June,1,Monday,0,18,Collins Place (North),9
4, 01-JUN-2009 00:00,2009,June,1,Monday,0,16,Australia on Collins,39
5, 01-JUN-2009 00:00,2009,June,1,Monday,0,2,Bourke Street Mall (South),28
6, 01-JUN-2009 00:00,2009,June,1,Monday,0,1,Bourke Street Mall (North),37
7, 01-JUN-2009 00:00,2009,June,1,Monday,0,13,Flagstaff Station,1
8, 01-JUN-2009 00:00,2009,June,1,Monday,0,3,Melbourne Central,155
9, 01-JUN-2009 00:00,2009,June,1,Monday,0,15,State Library,98
10, 01-JUN-2009 00:00,2009,June,1,Monday,0,9,Southern Cross Station,7
11, 01-JUN-2009 00:00,2009,June,1,Monday,0,10,Victoria Point,8
12, 01-JUN-2009 00:00,2009,June,1,Monday,0,12,New Quay,30
Because I'll be uploading to MongoDB, the Id field is redundant in my context, so I need my script to skip it. Sensor_ID is not unique, but I'm planning to make it the primary key and create a list of objects differentiated by their Hourly_Counts.
I'm aiming to generate a JSON object like this from the data:
**data.json**
{
    {
        "Sensor_ID": 4,
        "Sensor_Name": "Town Hall(West)",
        "countList":
        [
            {
                "Date_Time": "01-JUN-2009 00:00",
                "Year": 2009,
                "Month": "June",
                "Mdate": 1,
                "Day": "Monday",
                "Time": 0,
                "Hourly_Counts": 194
            },
            {
                "Date_Time": "01-JUN-2009 00:00",
                "Year": 2009,
                "Month": "June",
                "Mdate": 1,
                "Day": "Monday",
                "Time": 1,
                "Hourly_Counts": 82
            }
        ]
    },
    {
        "Sensor_ID": 17,
        "Sensor_Name": "Collins Place(North)",
        "countList":
        [
            {
                "Date_Time": "01-JUN-2009 00:00",
                "Year": 2009,
                "Month": "June",
                "Mdate": 1,
                "Day": "Monday",
                "Time": 0,
                "Hourly_Counts": 21
            }
        ]
    }
}
And so on. I'm trying to make it so that when the script reads a Sensor_ID, it creates a JSON object from the fields listed and adds it to that sensor's countList. I've added a second entry for Sensor_ID = 4 to its countList to illustrate.
I am using Python 2.7.x, and I have looked at every question concerning this on Stack Overflow and every other website. Very few seem to want to restructure the .csv data when converting to .json, so it's been a bit difficult.
What I have so far (I'm still relatively new to Python, so I thought this would be good to try out):
csvtojson.py
import csv
import json

def csvtojson():
    filename = 'data.csv'
    fieldnames = ('Id', 'Date_Time', 'Year', 'Month', 'Mdate', 'Day',
                  'Time', 'Sensor_ID', 'Sensor_Name', 'Hourly_Counts')
    dataTime = ('Date_Time', 'Year', 'Month', 'Mdate', 'Day',
                'Time', 'Hourly_Counts')
    all_data = {}
    with open(filename, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, fieldnames)
        # skip header
        next(reader)
        current_sensorID = None
        for row in reader:
            sensor_ID = row['Sensor_ID']
            sensorName = row['Sensor_Name']
            data = all_data[sensor_ID] = {}
            data['dataTime'] = dict((k, row[k]) for k in dataTime)
    print json.dumps(all_data, indent=4, sort_keys=True)

if __name__ == "__main__":
    csvtojson()
As far as I've got, countList ends up in its own object, but the script isn't creating a list of objects, which may mess up the import into MongoDB. It is filtering by Sensor_ID, but it overwrites entries when there are duplicates instead of appending to countList. And I can't seem to get it into the format/data structure I want - I'm not even sure that's the right structure. The ultimate goal is to import the millions of rows into MongoDB the way I listed above. I'm trying a small set now to test it out.
Please check the following:
https://github.com/gurdyals/test-repo/tree/master/MongoDB
Use the "MongoDB_py.zip" files.
I did the same thing to convert CSV data to a MongoDB dict.
Please let me know if you have any questions.
Thanks
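For context, the general shape of that approach is roughly the following. This is only a minimal sketch, not the code from the linked repo; the database and collection names are made up, and the count values are left as strings for brevity:

import csv
from collections import OrderedDict
from pymongo import MongoClient

count_fields = ('Date_Time', 'Year', 'Month', 'Mdate', 'Day', 'Time', 'Hourly_Counts')

# Group the rows by Sensor_ID, dropping Id and collecting the per-hour fields into countList.
sensors = OrderedDict()
with open('data.csv', 'rb') as csvfile:
    for row in csv.DictReader(csvfile):
        doc = sensors.setdefault(row['Sensor_ID'], {
            'Sensor_ID': int(row['Sensor_ID']),
            'Sensor_Name': row['Sensor_Name'],
            'countList': [],
        })
        doc['countList'].append(dict((k, row[k]) for k in count_fields))

# Insert one document per sensor; 'pedestrians' and 'sensor_counts' are placeholder names.
client = MongoClient()
client['pedestrians']['sensor_counts'].insert_many(list(sensors.values()))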
Here is sample code for doing something similar to the above using Python pandas. You could also do some aggregation in the DataFrame if you wish to summarise the data and get rid of the redundant data.
import pandas as pd
import json

df = pd.read_csv('data.csv')
df.set_index(['Sensor_ID', 'Sensor_Name'], inplace=True)
df.reset_index(inplace=True)

# Group by sensor and serialise each group's remaining columns into a JSON string.
grouped = df.groupby(['Sensor_ID', 'Sensor_Name']).apply(
    lambda x: x.drop(['Sensor_ID', 'Sensor_Name'], axis=1).to_json(orient='records'))
grouped.name = 'countList'

# One record per sensor, with its countList attached.
js = json.loads(pd.DataFrame(grouped).reset_index().to_json(orient='records'))
print json.dumps(js, indent=4)
The output:
[
{
"Sensor_ID": 1,
"countList": "[{\"Id\":6,\"Date_Time\":\" 01-JUN-2009 00:00\",\"Year\":2009,\"Month\":\"June\",\"Mdate\":1,\"Day\":\"Monday\",\"Time\":0,\"Hourly_Counts\":37}]",
"Sensor_Name": "Bourke Street Mall (North)"
},
{
"Sensor_ID": 2,
"countList": "[{\"Id\":5,\"Date_Time\":\" 01-JUN-2009 00:00\",\"Year\":2009,\"Month\":\"June\",\"Mdate\":1,\"Day\":\"Monday\",\"Time\":0,\"Hourly_Counts\":28}]",
"Sensor_Name": "Bourke Street Mall (South)"
},
{
"Sensor_ID": 3,
"countList": "[{\"Id\":8,\"Date_Time\":\" 01-JUN-2009 00:00\",\"Year\":2009,\"Month\":\"June\",\"Mdate\":1,\"Day\":\"Monday\",\"Time\":0,\"Hourly_Counts\":155}]",
"Sensor_Name": "Melbourne Central"
},
{
"Sensor_ID": 4,
"countList": "[{\"Id\":1,\"Date_Time\":\" 01-JUN-2009 00:00\",\"Year\":2009,\"Month\":\"June\",\"Mdate\":1,\"Day\":\"Monday\",\"Time\":0,\"Hourly_Counts\":194}]",
"Sensor_Name": "Town Hall (West)"
},