Converting JSON into a DataFrame within FastAPI app - json

I am trying to create an API for customer churn at a bank. I have completed the model and now want to create the API using FastAPI. My problem is converting the JSON passed data to a dataframe to be able to run it through the model. Here is the code.
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware
from pycaret.classification import *
import pandas as pd
import uvicorn # ASGI
import pickle
import pydantic
from pydantic import BaseModel
class customer_input(BaseModel):
CLIENTNUM:int
Customer_Age:int
Gender:str
Dependent_count:int
Education_Level:str
Marital_Status:str
Income_Category:str
Card_Category:str
Months_on_book:int
Total_Relationship_Count:int
Months_Inactive_12_mon:int
Contacts_Count_12_mon:int
Credit_Limit:float
Total_Revolving_Bal:int
Avg_Open_To_Buy:float
Total_Amt_Chng_Q4_Q1:float
Total_Trans_Amt:int
Total_Trans_Ct:int
Total_Ct_Chng_Q4_Q1:float
Avg_Utilization_Ratio:float
app = FastAPI()
#Loading the saved model from pycaret
model = load_model('BankChurnersCatboostModel25thDec2020')
origins = [
'*'
]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=['GET','POST'],
allow_headers=['Content-Type','application/xml','application/json'],
)
#app.get("/")
def index():
return {"Nothing to see here"}
#app.post("/predict")
def predict(data: customer_input):
# Convert input data into a dictionary
data = data.dict()
# Convert the dictionary into a dataframe
my_data = pd.DataFrame([data])
# Predicting using pycaret
prediction = predict_model(model, my_data)
return prediction
# Only use below 2 lines when testing on localhost -- remove when deploying
if __name__ == '__main__':
uvicorn.run(app, host='127.0.0.1', port=8000)
When I test this out I get the Internal Server Error from the OpenAPI interface so I check my cmd and the error says
ValueError: [TypeError("'numpy.int64' object is not iterable"), TypeError('vars() argument must have __dict__ attribute')]
How can I have the data that is passed into the predict function successfully convert into a dataframe. Thank you.

Ok so I fixed this by changing the customer_input class. Any int types I changed to a float and that fixed it. I don't understand why though. Can anyone explain?
Fundamentally those int values are only meant to be an integer because they are all discrete values (i.e choosing number of dependents in a bank) but I guess I could put a constrain on the front-end.

Related

How can I save some json files generated in a for loop as csv?

Sorry, I am new in coding in Python, I would need to save a json file generated in a for loop as csv for each iteration of the loop.
I wrote a code that works fine to generate the first csv file but then it is overwritten and I did not find a solution yet. Can anyone help me? many thanks
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop, and is incremented in each iteration. This will produce a list of files like output_file_1.csv, output_file_2.csv, output_file_3.csv and so on.
counter = 0
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file_' + str(counter) + '.csv')
counter += 1
We convert the integer to a string, and paste it inbetween the name of your file and its extension.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for idx, user in enumerate(user_objects):
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv(f'output_file{str(idx)}.csv')

pyflink kafka connector deserializes received json data to null

I am creating a stream processor using PyFlink. When I connect Kafka to Flink, everything works fine. But when I send json data to kafka, PyFlink receives it but the deserialiser converts it to null. PyFlink code is
from pyflink.common.serialization import Encoder
from pyflink.datastream.connectors import StreamingFileSink
from pyflink.common.serialization import JsonRowDeserializationSchema
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer
from pyflink.common import Row
# Starting Flink app
def start_flink_app():
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
env.add_jars(
"file:///Users/samprabin/Documents/xealei_fall_detector/dataProcessorAndClassifier/jar/flink-sql-connector-kafka_2.11-1.12.3.jar")
deserialization_schema = JsonRowDeserializationSchema.builder() \
.type_info(type_info=Types.ROW([Types.INT(), Types.STRING()])).build()
kafka_consumer = FlinkKafkaConsumer(
topics='quickstart-events',
deserialization_schema=deserialization_schema,
properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})
ds = env.add_source(kafka_consumer)
ds.print()
env.execute("tutorial_job1")
if __name__ == "__main__":
print('Main program started...')
start_flink_app()
And the kafka producer code is
from kafka import KafkaProducer
from json import dumps
producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
value_serializer=lambda x: dumps(x).encode('utf-8'))
data = {"name":"tom"}
producer.send('quickstart-events', value=data)
please let me know how I can receive the json data in PyFlink
i faced same problem. What I did was use to the same serializer/deserializer given in example of flink kafka producer and generate output in a topic. I found that we have to use the following format :
{"f0": 123, "f1": "ddd"}
and then it works as expected without giving the null null it was giving earlier.
Issue resolved by using Types.ROW_NAMED instead Types.ROW. Then provide your field names.
deserialization_schema = JsonRowDeserializationSchema.builder().type_info(
type_info=Types.ROW_NAMED(
["abc","xyz"], [Types.STRING(), Types.STRING()])).build()

How can i extract information quickly from 130,000+ Json files located in S3?

i have an S3 was over 130k Json Files which i need to calculate numbers based on data in the json files (for example calculate the number of gender of Speakers). i am currently using s3 Paginator and JSON.load to read each file and extract information form. but it take a very long time to process such a large number of file (2-3 files per second). how can i speed up the process? please provide working code examples if possible. Thank you
here is some of my code:
client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
result = paginator.paginate(Bucket='bucket-name',StartAfter='')
for page in result:
if "Contents" in page:
for key in page[ "Contents" ]:
keyString = key[ "Key" ]
s3 = boto3.resource('s3')
content_object = s3.Bucket('bucket-name').Object(str(keyString))
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
x = (json_content['dict-name'])
In order to use the code below, I'm assuming you understand pandas (if not, you may want to get to know it). Also, it's not clear if your 2-3 seconds is on the read or includes part of the number crunching, nonetheless multiprocessing will speed this up dramatically. The gist is to read all the files in (as dataframes), concatenate them, then do your analysis.
To be useful for me, I run this on spot instances that have lots of vCPUs and memory. I've found the instances that are network optimized (like c5n - look for the n) and the inf1 (for machine learning) are much faster at reading/writing than T or M instance types, as examples.
My use case is reading 2000 'directories' with roughly 1200 files in each and analyzing them. The multithreading is orders of magnitude faster than single threading.
File 1: your main script
# create script.py file
import os
from multiprocessing import Pool
from itertools import repeat
import pandas as pd
import json
from utils_file_handling import *
ufh = file_utilities() #instantiate the class functions - see below (second file)
bucket = 'your-bucket'
prefix = 'your-prefix/here/' # if you don't have a prefix pass '' (empty string or function will fail)
#define multiprocessing function - get to know this to use multiple processors to read files simultaneously
def get_dflist_multiprocess(keys_list, num_proc=4):
with Pool(num_proc) as pool:
df_list = pool.starmap(ufh.reader_json, zip(repeat(bucket), keys_list), 15)
pool.close()
pool.join()
return df_list
#create your master keys list upfront; you can loop through all or slice the list to test
keys_list = ufh.get_keys_from_prefix(bucket, prefix)
# keys_list = keys_list[0:2000] # as an exampmle
num_proc = os.cpu_count() #tells you how many processors your machine has; function above defaults to 4 unelss given
df_list = get_dflist_multiprocess(keys_list, num_proc=num_proc) #collect dataframes for each file
df_new = pd.concat(df_list, sort=False)
df_new = df_new.reset_index(drop=True)
# do your analysis on the dataframe
File 2: class functions
#utils_file_handling.py
# create this in a separate file; name as you wish but change the import in the script.py file
import boto3
import json
import pandas as pd
#define client and resource
s3sr = boto3.resource('s3')
s3sc = boto3.client('s3')
class file_utilities:
"""file handling function"""
def get_keys_from_prefix(self, bucket, prefix):
'''gets list of keys and dates for given bucket and prefix'''
keys_list = []
paginator = s3sr.meta.client.get_paginator('list_objects_v2')
# use Delimiter to limit search to that level of hierarchy
for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
keys = [content['Key'] for content in page.get('Contents')]
print('keys in page: ', len(keys))
keys_list.extend(keys)
return keys_list
def read_json_file_from_s3(self, bucket, key):
"""read json file"""
bucket_obj = boto3.resource('s3').Bucket(bucket)
obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
data = obj['Body'].read().decode('utf-8')
return data
# you may need to tweak this for your ['dict-name'] example; I think I have it correct
def reader_json(self, bucket, key):
'''returns dataframe'''
return pd.DataFrame(json.loads(self.read_json_file_from_s3(bucket, key))['dict-name'])

How to convert json file into table structure in redshift using python

How can I convert JSON file into a table structure in Redshift? I tried the below python code.
import boto3
import json
import os
import sys
import psycopg2
import csv
from collections import defaultdict
def jsonfile(path):
session = boto3.Session(
aws_access_key_id='dfjfkgj',
aws_secret_access_key='sdfg',
region_name='us-west-2')
s3 = session.resource('s3')
bucket= s3.Bucket('ag-redshift-poc')
with open(path, 'rb') as data:
res=json.load(data)
f = open('data.csv','wb')
output = csv.writer(f)
output.writerow(res[0].keys())
for row in res:
output.writerow(row.values())
bucket.put_object(Key=('C:\Python27\data.csv'),Body=res)
print 'success'
def redshift():
co=psycopg2.connect(dbname= 'redshiftpoc', host='shdjf',
port= '5439', user= 'admin', password= 'snd')
curr = co.cursor()
curr.execute("""copy sample from 's3://ag-redshift-poc/testfile/json.txt'
CREDENTIALS 'aws_access_key_id=fdfd;aws_secret_access_key=sxhd'
""")
co.commit()
print 'success'
curr.close()
co.close()
jsonfile('C:\Python27\json.txt')
redshift()
Redshift can directly absorb JSON to COPY into your table. (Though not very efficient).
In your case, modify the COPY query to,
COPY sample FROM 's3://<bucket_name>/<path_to_json>'
CREDENTIALS 'aws_access_key_id=xxxx;aws_secret_access_key=xxxx'
JSON 'auto' ACCEPTINVCHARS;
Please note JSON 'auto' in query. This maps every column in table with keys in JSON file.
More details here in the COPY examples

Scrapy / Pipeline not inserting data to MySQL database

I'm making a pipeline in scrapy to store scraped data in a mysql database. When the spider is run in terminal it works perfectly. Even the pipeline is opened. However the data is not being sent to the database. Any help appreciated! :)
here's the pipeline code:
import sys
import MySQLdb
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
from tutorial.items import TutorialItem
class MySQLTest(object):
def __init__(self):
db = MySQLdb.connect(user='root', passwd='', host='localhost', db='python')
cursor = db.cursor()
def process_item(self, spider, item):
try:
cursor.execute("INSERT INTO info (venue, datez) VALUES (%s, %s)", (item['artist'], item['date']))
self.conn.commit()
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
return item
and heres the spider code
import scrapy # Import required libraries.
from scrapy.selector import HtmlXPathSelector # Allows for path detection in a websites code.
from scrapy.spider import BaseSpider # Used to create a simple spider to extract data.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor # Needed for the extraction of href links in HTML to crawl further pages.
from scrapy.contrib.spiders import CrawlSpider # Needed to make the crawl spider.
from scrapy.contrib.spiders import Rule # Allows specified rules to affect what the link
import spotipy
import soundcloud
import mysql.connector
from tutorial.items import TutorialItem
class AllGigsSpider(CrawlSpider):
name = "allGigs" # Name of the Spider. In command promt, when in the correct folder, enter "scrapy crawl Allgigs".
allowed_domains = ["www.allgigs.co.uk"] # Allowed domains is a String NOT a URL.
start_urls = [
"http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
"http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
"http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
"http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
"http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html"
] # Specify the starting points for the web crawler.
rules = [
Rule(SgmlLinkExtractor(restrict_xpaths='//div[#class="more"]'), # Search the start URL's for
callback="parse_me",
follow=True),
]
def parse_me(self, response):
for info in response.xpath('//div[#class="entry vevent"]|//div[#class="resultbox"]'):
item = TutorialItem() # Extract items from the items folder.
item ['artist'] = info.xpath('.//span[#class="summary"]//text()').extract() # Extract artist information.
item ['date'] = info.xpath('.//span[#class="dates"]//text()').extract() # Extract date information.
#item ['endDate'] = info.xpath('.//abbr[#class="dtend"]//text()').extract() # Extract end date information.
#item ['startDate'] = info.xpath('.//abbr[#class="dtstart"]//text()').extract() # Extract start date information.
item ['genre'] = info.xpath('.//div[#class="header"]//text()').extract()
yield item # Retreive items in item.
client = soundcloud.Client(client_id='401c04a7271e93baee8633483510e263')
tracks = client.get('/tracks', limit=1, license='cc-by-sa', q= item['artist'])
for track in tracks:
print(tracks)
I believe the problem was in my settings.py file where i had missed a comma... yawn.
ITEM_PIPELINES = {
'tutorial.pipelines.MySQLTest': 300,
}