I need to read data from two collections in MongoDB using Python. Is there any way to join the data in Python?
Let's say we have two collections (tables):
buy_orders
sell_orders
Both collections share the field 'id_transaction', and we want to join them on this field:
import pymongo
my_client = pymongo.MongoClient('mongodb://localhost:27017/')
my_db = my_client['Orders']
my_collection = my_db['buy_orders']
result = my_collection.aggregate([{
    '$lookup': {
        'from': 'sell_orders',
        'localField': 'id_transaction',
        'foreignField': 'id_transaction',
        'as': 'results'
    }
}])
To print the results:
for item in result:
    print(item)
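Each document in result is a buy_orders document with the matching sell_orders documents embedded in a results array. If you would rather have one flat document per matched pair, here is a sketch that adds an $unwind stage to the same pipeline:
result = my_collection.aggregate([
    {'$lookup': {
        'from': 'sell_orders',
        'localField': 'id_transaction',
        'foreignField': 'id_transaction',
        'as': 'results'
    }},
    # Flatten the joined array: one output document per matched sell order
    {'$unwind': '$results'}
])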
For more references, see the MongoDB Docs and PyMongo Docs.
Have a look at this example:
from bson.objectid import ObjectId

# The custom id used for the reference
custom_id = ObjectId()

# Create a user with the admin role
db.users.insert_one({"name": "Boston", "role_id": custom_id})

# Create the role with the custom id
db.roles.insert_one({"_id": custom_id, "name": "Admin"})

# $lookup usage
db.users.aggregate([
{
"$lookup":
{
"from": "roles",
"localField": "role_id",
"foreignField": "_id",
"as": "roles"
}
}
])
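Each user document returned by the aggregation now carries a roles array holding the matched role documents. A minimal way to check the join, assuming the documents created above:
for user in db.users.aggregate([
    {"$lookup": {
        "from": "roles",
        "localField": "role_id",
        "foreignField": "_id",
        "as": "roles"
    }}
]):
    # e.g. prints: Boston ['Admin']
    print(user["name"], [role["name"] for role in user["roles"]])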
I have a json file which looks like this
{
"tags":[
"Real_send",
"stopped"
],
"messages":{
"7c2e9284-993d-4eb4-ad6b-6a2bfcc51060":{
"channel":"channel 1",
"name":"Version 1",
"alert":"\ud83d\ude84 alert 1"
},
"c2cbd05c-5452-476c-bdc7-ac31ed3417f9":{
"channel":"channel 1",
"name":"name 1",
"type":"type 1"
},
"b869886f-0f9c-487f-8a43-abe3d6456678":{
"channel":"channel 2",
"name":"Version 2",
"alert":"\ud83d\ude84 alert 2"
}
}
}
I want to flatten this into a table with one row per message, where each message's UUID key becomes an ID column.
When I print the schema, I get the following from Spark:
StructType(List(
StructField(messages,
StructType(List(
StructField(7c2e9284-993d-4eb4-ad6b-6a2bfcc51060,
StructType(List(
StructField(alert,StringType,true),
StructField(channel,StringType,true),
StructField(name,StringType,true))),true),
StructField(b869886f-0f9c-487f-8a43-abe3d6456678,StructType(List(
StructField(alert,StringType,true),
StructField(channel,StringType,true),
StructField(name,StringType,true))),true),
StructField(c2cbd05c-5452-476c-bdc7-ac31ed3417f9,StructType(List(
StructField(channel,StringType,true),
StructField(name,StringType,true),
StructField(type,StringType,true))),true))),true),
StructField(tags,ArrayType(StringType,true),true)))
Basically 7c2e9284-993d-4eb4-ad6b-6a2bfcc51060 should be considered as my ID column
My code looks like:
cols_list_to_select_from_flattened = ['alert', 'channel', 'type', 'name']
df = df \
    .select(
        F.json_tuple(
            F.col('messages'), *cols_list_to_select_from_flattened
        )
        .alias(*cols_list_to_select_from_flattened))
df.show(1, False)
Error message:
E pyspark.sql.utils.AnalysisException: cannot resolve 'json_tuple(`messages`, 'alert', 'channel', 'type', 'name')' due to data type mismatch: json_tuple requires that all arguments are strings;
E 'Project [json_tuple(messages#0, alert, channel, type, name) AS ArrayBuffer(alert, channel, type, name)]
E +- Relation[messages#0,tags#1] json
I also tried to list all keys like below
df.withColumn("map_json_column", F.posexplode_outer(F.col("messages"))).show()
But got this error:
E pyspark.sql.utils.AnalysisException: cannot resolve 'posexplode(`messages`)' due to data type mismatch: input to function explode should be array or map type, not struct<7c2e9284-993d-4eb4-ad6b-6a2bfcc51060:struct<alert:string,channel:string,name:string>,b869886f-0f9c-487f-8a43-abe3d6456678:struct<alert:string,channel:string,name:string>,c2cbd05c-5452-476c-bdc7-ac31ed3417f9:struct<channel:string,name:string,type:string>>;
E 'Project [messages#0, tags#1, generatorouter(posexplode(messages#0)) AS map_json_column#5]
E +- Relation[messages#0,tags#1] json
How can I get the desired output?
When reading the JSON you can specify your own schema: instead of the messages column being a struct type, make it a map type, and then you can simply explode that column.
Here is a self-contained example with your data:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
spark = SparkSession.builder.getOrCreate()
json_sample = """
{
"tags":[
"Real_send",
"stopped"
],
"messages":{
"7c2e9284-993d-4eb4-ad6b-6a2bfcc51060":{
"channel":"channel 1",
"name":"Version 1",
"alert":"lert 1"
},
"c2cbd05c-5452-476c-bdc7-ac31ed3417f9":{
"channel":"channel 1",
"name":"name 1",
"type":"type 1"
},
"b869886f-0f9c-487f-8a43-abe3d6456678":{
"channel":"channel 2",
"name":"Version 2",
"alert":" alert 2"
}
}
}
"""
data = spark.sparkContext.parallelize([json_sample])
cols_to_select = ['alert', 'channel', 'type', 'name']
# The schema of a message entry; only the columns
# that need to be selected will be parsed, and they
# must be nullable based on your data sample
message_schema = StructType([
    StructField(col_name, StringType(), True) for col_name in cols_to_select
])
# The complete document schema
json_schema = StructType([
    StructField("tags", ArrayType(StringType(), True), True),
    StructField("messages", MapType(StringType(), message_schema, False), False),
])
# Read the json and parse it with the specified schema.
# Instead of the sample data you can pass a file path here.
df = spark.read.schema(json_schema).json(data)
# Explode the map column and select the required columns
df = (
df
.select(F.explode(F.col("messages")))
.select(
F.col("key").alias("id"),
*[F.col(f"value.{col_name}").alias(col_name) for col_name in cols_to_select]
)
)
df.show(truncate=False)
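For reference, with the sample data above, df.show(truncate=False) should print something like the following (the rows come from a map, so their order may differ):
+------------------------------------+-------+---------+------+---------+
|id                                  |alert  |channel  |type  |name     |
+------------------------------------+-------+---------+------+---------+
|7c2e9284-993d-4eb4-ad6b-6a2bfcc51060|alert 1|channel 1|null  |Version 1|
|c2cbd05c-5452-476c-bdc7-ac31ed3417f9|null   |channel 1|type 1|name 1   |
|b869886f-0f9c-487f-8a43-abe3d6456678|alert 2|channel 2|null  |Version 2|
+------------------------------------+-------+---------+------+---------+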
I'm having a difficult time figuring out how to pull specific information from a json file.
So far I have this:
# Import json library
import json
# Open json database file
with open('jsondatabase.json', 'r') as f:
    data = json.load(f)
# assign variables from json data and convert to usable information
identifier = data['ID']
identifier = str(identifier)
name = data['name']
name = str(name)
# Collect data from user to compare with data in json file
print("Please enter your numerical identifier and name: ")
user_id = input("Numerical identifier: ")
user_name = input("Name: ")
if user_id == identifier and user_name == name:
    print("Your inputs matched. Congrats.")
else:
    print("Your inputs did not match our data. Please try again.")
And that works great for a simple JSON file like this:
{
"ID": "123",
"name": "Bobby"
}
But ideally I need to create a more complex JSON file, and I can't find deeper information on how to pull specific values from something like this:
{
"Parent": [
{
"Parent_1": [
{
"Name": "Bobby",
"ID": "123"
}
],
"Parent_2": [
{
"Name": "Linda",
"ID": "321"
}
]
}
]
}
Here is an example that you might be able to pick apart.
You could either:
Make a custom de-jsonify object_hook as shown below and do something with it. There is a good tutorial here.
Just gobble up the whole dictionary that you get without a custom de-jsonify and drill down into it to make a list or set of the results (a minimal sketch of this appears after the example below).
Example of the object_hook approach:
import json
from collections import namedtuple
data = '''
{
"Parents":
[
{
"Name": "Bobby",
"ID": "123"
},
{
"Name": "Linda",
"ID": "321"
}
]
}
'''
Parent = namedtuple('Parent', ['name', 'id'])
def dejsonify(json_str: dict):
    if json_str.get("Name"):
        parent = Parent(json_str.get('Name'), int(json_str.get('ID')))
        return parent
    return json_str
res = json.loads(data, object_hook=dejsonify)
print(res)
# then we can do whatever... if you need lookups by name/id,
# we could put the result into a dictionary
all_parents = {(p.name, p.id) : p for p in res['Parents']}
lookup_from_input = ('Bobby', 123)
print(f'found match: {all_parents.get(lookup_from_input)}')
Result:
{'Parents': [Parent(name='Bobby', id=123), Parent(name='Linda', id=321)]}
found match: Parent(name='Bobby', id=123)
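And for the second option, a minimal drill-down sketch, assuming the nested "Parent" layout from the question is saved in jsondatabase.json:
import json

with open('jsondatabase.json', 'r') as f:
    data = json.load(f)

# "Parent" is a list holding one dict of Parent_N keys;
# each of those maps to a list of person records
for parent_key, records in data['Parent'][0].items():
    for record in records:
        print(parent_key, record['Name'], record['ID'])
# Parent_1 Bobby 123
# Parent_2 Linda 321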
I am running a SQL query and reading the output into a pandas DataFrame. Now I need to convert the data to JSON and normalize it. I tried to_json, but it only gives a partial solution.
Dataframe output:
| SalesPerson | ContactID |
|-------------|-----------|
| 12345       | Tom       |
| 12345       | Robin     |
| 12345       | Julie     |
Expected JSON:
{"SalesPerson": "12345", "ContactID":"Tom","Robin","Julie"}
Please see the code I tried below.
q = "SELECT COL1, SalesPerson, ContactID FROM table;"
df = pd.read_sql(q, sqlconn)
df1=df.iloc[:, 1:2]
df2 = df1.to_json(orient='records')
Also, to_json wraps the result in brackets, which I don't need.
Try this:
df.groupby('SalesPerson').apply(lambda x: pd.Series({
    'ContactID': x['ContactID'].values
})).reset_index().to_json(orient='records')
Output (pretty printed):
[
{
"SalesPerson": 1,
"ContactID": ["Tom", "Robin", "Julie"]
},
{
"SalesPerson": 2,
"ContactID": ["Jack", "Mike", "Mary"]
}
]
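If you don't need the intermediate Series, an equivalent and arguably simpler variant (a sketch, assuming the same column names) aggregates each group straight into a list:
out = (
    df.groupby('SalesPerson')['ContactID']
      .agg(list)
      .reset_index()
      .to_json(orient='records')
)
print(out)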
I have a hierarchy of tables in a MySQL 5.6 database that I need to query into JSON for use by a JavaScript tree structure.
Just as a test, I did the following in my Flask app for just the top level:
def get_all_customers():
    response_object = {'status': 'success'}
    cnx = mysql.connector.connect(user="", password="", database="", host="localhost", port=3306)
    cursor = cnx.cursor()
    cursor.execute('SELECT idx, name FROM listcustomers ORDER BY name')
    data = []
    for idx, name in cursor:
        data.append({'id': idx, 'label': name, 'otherProp': "Customer"})
    response_object['customers'] = data
    return jsonify(response_object)
which returns
[
{ id: 1,
label: "customer 1",
otherProp: "Customer"
},
...
]
But each customer has locations, each location has areas, each area has assets, and each asset has projects, and I also need to query them as children of this JSON object. So, for example, going just one level deeper to locations, I would need something like this:
[
{ id: 1,
label: "customer 1",
otherProp: "Customer",
children: [
{
id: 5,
label: "location 5",
otherProp: "Location"
},
...
]
},
...
]
where in my database the listlocations table links to listcustomers via its parentCustomerId column. How can I manage this? Eventually this tree will have about 13,000 objects, so I know that just querying all the data and parsing it in Python would be far less efficient than writing the right query to begin with.
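One way to build the first level of nesting without issuing a query per customer is a single LEFT JOIN grouped in Python. This is only a sketch: it assumes the listlocations table has idx, name, and parentCustomerId columns, as described above.
def get_customers_with_locations():
    response_object = {'status': 'success'}
    cnx = mysql.connector.connect(user="", password="", database="", host="localhost", port=3306)
    cursor = cnx.cursor()
    cursor.execute(
        'SELECT c.idx, c.name, l.idx, l.name '
        'FROM listcustomers c '
        'LEFT JOIN listlocations l ON l.parentCustomerId = c.idx '
        'ORDER BY c.name, l.name'
    )
    customers = {}
    for c_idx, c_name, l_idx, l_name in cursor:
        # One node per customer; matched locations accumulate as children
        node = customers.setdefault(c_idx, {
            'id': c_idx, 'label': c_name,
            'otherProp': 'Customer', 'children': []
        })
        if l_idx is not None:
            node['children'].append({
                'id': l_idx, 'label': l_name, 'otherProp': 'Location'
            })
    response_object['customers'] = list(customers.values())
    return jsonify(response_object)
The deeper levels (areas, assets, projects) would repeat the same pattern with additional joins, stitching each level into its parent's children list by parent id.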
Let's say I have the following document in a MongoDB database:
{
"assist_leaders" : {
"Steve Nash" : {
"team" : "Phoenix Suns",
"position" : "PG",
"draft_data" : {
"class" : 1996,
"pick" : 15,
"selected_by" : "Phoenix Suns",
"college" : "Santa Clara"
}
},
"LeBron James" : {
"team" : "Cleveland Cavaliers",
"position" : "SF",
"draft_data" : {
"class" : 2003,
"pick" : 1,
"selected_by" : "Cleveland Cavaliers",
"college" : "None"
}
}
}
}
I'm trying to collect a few values under "draft_data" for each player in an ORDERED list. The list needs to look like the following for this particular document:
[ [1996, 15, "Phoenix Suns"], [2003, 1, "Cleveland Cavaliers"] ]
That is, each nested list must contain the values corresponding to the "class", "pick", and "selected_by" keys, in that order. I also need the "Steve Nash" data to come before the "LeBron James" data.
How can I achieve this using pymongo? Note that the structure of the data is not set in stone so I can change this if that makes the code simpler.
I'd extract the data and turn it into a list in Python, once you've retrieved the document from MongoDB:
for doc in db.collection.find():
    for name, info in doc['assist_leaders'].items():
        draft_data = info['draft_data']
        lst = [draft_data['class'], draft_data['pick'], draft_data['selected_by']]
        print(name, lst)
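One caveat: plain dict iteration only preserves insertion order from Python 3.7 on, so if "Steve Nash" must reliably come first, here is a sketch that pins the order explicitly (doc as fetched above):
names = ["Steve Nash", "LeBron James"]  # explicit order
result = []
for name in names:
    d = doc['assist_leaders'][name]['draft_data']
    result.append([d['class'], d['pick'], d['selected_by']])
# [[1996, 15, 'Phoenix Suns'], [2003, 1, 'Cleveland Cavaliers']]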
List comprehension is the way to go here (note: don't forget .iteritems() in Python 2 or .items() in Python 3, or you'll get ValueError: too many values to unpack).
import pymongo
import numpy as np

client = pymongo.MongoClient()
db = client[database_name]

dataList = [v for i in ["Steve Nash", "LeBron James"]
            for key in ["class", "pick", "selected_by"]
            for document in db.collection_name.find({"assist_leaders": {"$exists": 1}})
            for k, v in document["assist_leaders"][i]["draft_data"].items()
            if k == key]
print(dataList)
# [1996, 15, 'Phoenix Suns', 2003, 1, 'Cleveland Cavaliers']

matrix = np.reshape(dataList, [2, 3])  # note: numpy coerces the mixed values to strings
print(matrix)
# [['1996', '15', 'Phoenix Suns'],
#  ['2003', '1', 'Cleveland Cavaliers']]