How to fill in missing column value? - json

# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import ast
start_time = time.time()
s = requests.Session()
#Get URL and extract content
page=1
traits = []
accessories, backgrounds, shoes = [], [], []
while page != 100:
    params = {
        ('arg', f"Qmer3VzaeFhb7c5uiwuHJbRuVCaUu72DcnSoUKb1EvnB2x/{page}"),
    }
    content = s.get('https://ipfs.infura.io:5001/api/v0/cat', params=params, auth=('', ''))
    soup = BeautifulSoup(content.text, 'html.parser')
    page = page + 1
    traits = ast.literal_eval(soup.text)['attributes']
    df = pd.DataFrame(traits)
    df1 = df[df['trait_type'] == 'ACCESSORIES']
    accessories.append(df1['value'].values[0])
When I run the above code I get the following error:
IndexError: index 0 is out of bounds for axis 0 with size 0
This happens because not every item has an "ACCESSORIES" trait data point. So how would I go about adding or filling in an ACCESSORIES trait, with an empty, NaN, or 0 value, for those items that don't have one?

The following code solves this issue:
# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import ast
start_time = time.time()
s = requests.Session()
#Get URL and extract content
page=1
traits = []
accessories, backgrounds, shoes = [], [], []
while page != 100:
    params = {
        ('arg', f"Qmer3VzaeFhb7c5uiwuHJbRuVCaUu72DcnSoUKb1EvnB2x/{page}"),
    }
    content = s.get('https://ipfs.infura.io:5001/api/v0/cat', params=params, auth=('', ''))
    soup = BeautifulSoup(content.text, 'html.parser')
    page = page + 1
    traits = ast.literal_eval(soup.text)['attributes']
    df = pd.DataFrame(traits)
    df1 = df[df['trait_type'] == 'ACCESSORIES']
    try:
        accessories.append(df1['value'].values[0])
    except IndexError:
        # No ACCESSORIES trait for this item; record a placeholder value instead
        accessories.append('NONE')
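Alternatively, instead of catching the IndexError, you can look each trait up with a default so every item gets a value. A minimal sketch of that idea for the body of the while loop, assuming traits is the list of {'trait_type': ..., 'value': ...} dicts parsed above (the BACKGROUND and SHOES trait names are hypothetical, matching the unused lists in the question):

# Build a trait_type -> value mapping for this item, then read each
# trait with a default for the items that lack it.
trait_map = {t['trait_type']: t['value'] for t in traits}
accessories.append(trait_map.get('ACCESSORIES', 'NONE'))
backgrounds.append(trait_map.get('BACKGROUND', 'NONE'))  # hypothetical trait name
shoes.append(trait_map.get('SHOES', 'NONE'))             # hypothetical trait name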


Plotly dashboard hangs while loading - likely bug

There is likely a bug in the following code that causes the dashboard not to load, but I don't see where it is:
from dash import Dash, html, dcc, Input, Output, State
import plotly.express as px
import plotly.graph_objects as go
import dash_bootstrap_components as dbc
import pandas as pd
from pandas_datareader import data
import yfinance as yf
yf.pdr_override()
from datetime import date

start = pd.to_datetime('2022-01-01')
end = pd.to_datetime(date.today())

def update_data():
    # !! reset_index because otherwise plotly doesn't recognize the index as a x input in go.Figure
    df = data.DataReader('USDJPY%3DX', data_source='yahoo', start=start, end=end).reset_index()
    return df

app = Dash(__name__, external_stylesheets=[dbc.themes.LITERA])

app.layout = dbc.Container(
    [
        dbc.Row(
            [
                dbc.Col(
                    [
                        html.H1("Daily Price", style={"textAlign": "center"}),
                        dcc.Graph(id="price-chart", figure={}),
                    ],
                    width=12, lg=6,
                ),
                dbc.Col(
                    [
                        html.H1("10 Day SMA of Daily Range", style={"textAlign": "center"}),
                        dcc.Graph(id="volatility-chart", figure={}),
                    ],
                    width=12, lg=6,
                ),
            ]
        ),
        dbc.Row(
            dbc.Col(
                dcc.Dropdown(
                    id="dropdown",
                    options=["AAPL", "TSLA", "MSFT"],
                    value=["TSLA"],
                    style={"color": "green"},
                ),
                className="three columns",
            ),
        ),
        dcc.Store(id="storage", storage_type="memory", data={}),
        dcc.Interval(id="timer", interval=1000 * 60, n_intervals=0),
    ]
)

@app.callback(Output(component_id="storage", component_property="data"),
              Input(component_id="timer", component_property="n_intervals"))
def store_data(n_time):
    df = update_data()
    return df.to_dict("records")

@app.callback(Output(component_id="price-chart", component_property="figure"),
              Input(component_id="storage", component_property="data"))
def display_data(stored_dataframe):
    df = pd.DataFrame.from_records(stored_dataframe)
    fig = go.Figure(data=[go.Candlestick(x=df['Date'],
                                         open=df['Open'],
                                         high=df['High'],
                                         low=df['Low'],
                                         close=df['Close'])])
    return fig

@app.callback(Output(component_id="volatility-chart", component_property="figure"),
              Input(component_id="storage", component_property="data"))
def modify_data(stored_dataframe):
    df = pd.DataFrame.from_records(stored_dataframe)
    df['range'] = df.High - df.Low
    df['range_sma'] = df.range.rolling(10).mean()
    fig = px.line(df.range_sma)
    return fig

if __name__ == "__main__":
    app.run_server(debug=True)
I would prefer creating the app with app = dash.Dash(__name__), and would also choose some port:
if __name__ == "__main__":
    app.run_server(debug=True, port=8050)
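One way to localize the problem: if the data.DataReader call inside update_data blocks or raises (the pandas_datareader Yahoo endpoint has been unreliable), every chart stays stuck in its loading state, since all three callbacks depend on the store. A minimal sketch of the same Interval -> Store -> Graph wiring with dummy data in place of the download can confirm whether the layout and callbacks themselves are fine:

from dash import Dash, dcc, html, Input, Output
import plotly.graph_objects as go
import pandas as pd

app = Dash(__name__)
app.layout = html.Div([
    dcc.Graph(id="chart"),
    dcc.Store(id="storage", storage_type="memory"),
    dcc.Interval(id="timer", interval=1000 * 60, n_intervals=0),
])

@app.callback(Output("storage", "data"), Input("timer", "n_intervals"))
def store_data(n_intervals):
    # Dummy frame standing in for the Yahoo download, just to prove the wiring works
    df = pd.DataFrame({"Date": pd.date_range("2022-01-01", periods=5).strftime("%Y-%m-%d"),
                       "Close": range(5)})
    return df.to_dict("records")

@app.callback(Output("chart", "figure"), Input("storage", "data"))
def display_data(records):
    df = pd.DataFrame.from_records(records)
    return go.Figure(go.Scatter(x=df["Date"], y=df["Close"]))

if __name__ == "__main__":
    app.run_server(debug=True, port=8050)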

Unable to use method of a class in a different class - missing 2 required positional arguments

I have two Python classes: one class (CloudLink) is responsible for sending JSON events to the app, and another (ReadData) is responsible for building the JSON data.
The ReadData class will be using the CloudLink methods to send the JSON data to the app, but I'm getting the error: _buildJSONdata() missing 1 required positional argument: 'Data'.
ReadData class
from pyspark.sql import SparkSession
import functools
from pyspark.sql import DataFrame
from pyspark.sql.functions import explode
from cosmosconnect import azurecosmos

class ReadData:
    @exception(logger)
    def __init__(self):
        self.spark_session = (
            SparkSession.builder
            .appName("readData")
            .getOrCreate()
        )
        mssparkutils.fs.unmount('/mnt/test')
        logger.info("Drive unmounted")
        mssparkutils.fs.mount(
            'abfss://abc@transl.dfs.core.windows.net/',
            '/mnt/test',
            {'linkedService': "linkCosmos"}
        )
        logger.info("Mounted Successfully")
        self.input_directory = f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/input_path"
        self.output_directory = f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/output_path"

    '''
    Reading the schema from csv file
    '''
    @exception(logger)
    def readConfig(self):
        try:
            logger.info(f"Reading the Config present in {self.input_directory} ")
            dfConfig = self.spark_session.read.option("multiline", "true") \
                .json(self.input_directory)
            #for f in dfConfig.select("Entity","Query","Business_Rule").collect():
            dfConfig = dfConfig.select(explode('Input').alias('Input_Data')) \
                .select('Input_Data.Validation_Type', 'Input_Data.Entity', 'Input_Data.Query', 'Input_Data.Business_Rule')
            for f in dfConfig.rdd.toLocalIterator():
                #for index, f in dfConfig.toPandas().iterrows():
                self.Validation_Type = f[0]
                self.container = f[1]
                self.query = f[2]
                self.rule = f[3]
                self.readCosmos(self)
        except:
            raise ValueError("")

    @exception(logger)
    def readCosmos(self, *params):
        #from cosmosconnect import azurecosmos
        #a=[]
        linkedService = 'fg'
        df = azurecosmos.cosmosConnect(linkedService, self.query, self.container)
        df.cache()
        if len(df.head(1)) > 0:
            outputpath = self.output_directory + '/' + self.container
            df.coalesce(1).write.mode('overwrite').parquet(outputpath)
            Status = "Validation Failure"
            Data = {"Validation_Type": [], "Status": [], "Container": [], "Business_Rule": []}
            Data["Validation_Type"].append(self.Validation_Type)
            Data["Status"].append(Status)
            Data["Container"].append(self.container)
            Data["Business_Rule"].append(self.rule)
            CloudLink._buildJSONdata(Data)

if __name__ == "__main__":
    p = ReadData()
    p.readConfig()
CloudLink class
import json
import datetime
import hashlib
import json
import sys
import traceback
import adal
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import logging
from functools import wraps
import sys

def create_logger():
    #create a logger object
    #logger = logging.getLogger()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logfile = logging.FileHandler('exc_logger.log')
    #logfile = logging.StreamHandler(sys.stdout)
    fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(fmt)
    logfile.setFormatter(formatter)
    logger.addHandler(logfile)
    return logger

logger = create_logger()

def exception(logger):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except:
                issue = "exception in " + func.__name__ + "\n"
                issue = issue + "-------------------------------------------------------------------------\n"
                logger.exception(issue)
                raise
        return wrapper
    return decorator

class CloudLink(object):
    _token = None
    _instance = None
    http = None
    cloudclient = TokenLibrary.getSecret("xxxx", "rtrt")
    clientid = TokenLibrary.getSecret("xxxx", "tyty")
    clientcredentials = TokenLibrary.getSecret("xxxx", "abcabc")
    authority_url = TokenLibrary.getSecret("xxxx", "abab")
    cloudtest = TokenLibrary.getSecret("xxxx", "yyyy")

    @staticmethod
    def getInstance():
        if not CloudLink._instance:
            CloudLink._instance = CloudLink()
        return CloudLink._instance

    def __init__(self):
        retry_strategy = Retry(
            total=3,
            backoff_factor=0,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "OPTIONS"],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.http = requests.Session()
        self.http.mount("https://", adapter)
        self.http.mount("http://", adapter)
        print("Inside init")

    def parseJSON(self, t):
        try:
            eventData = json.loads(t)
            logger.info(f"Sending {eventData} to cloud")
            self.sendToCloud(eventData)
        except ValueError as e:
            print("Error: %s Please validate JSON in https://www.jsonschemavalidator.net/" % e)
            return None  # or: raise

    def sendToCloud(self, eventData):
        cloudData = {"eventData": eventData, "metadata": self._buildMetadata()}
        logger.info(f"Raising alert with data=({cloudData}")
        response = self.http.post(
            self.cloudtest, headers=self._buildHeaders(), json=cloudData
        )
        logger.info(f"cloud alert response={response}")
        if response.status_code == 202 or response.status_code == 200:
            logger.info("Mail sent to Cloud")
        else:
            raise Exception(f"Cloud reporting failed with Error {response}")

    def _buildJSONdata(self, Data):
        if len(Data) == 0:
            raise Exception("JSON is empty")
        else:
            t = json.dumps(self.Data)
            self.parseJSON(t)

    def _buildMetadata(self):
        return {
            "messageType": "Send Email",
            "messageVersion": "0.0.1",
            "sender": "Send Email",
        }

    def _buildHeaders(self):
        self._refreshADToken()
        headers = {
            "Authorization": "Bearer {}".format(self._token["accessToken"]),
            "Content-type": "application/json",
            "Accept": "text/plain",
        }
        return headers

    def _refreshADToken(self):
        def shouldRenew(token):
            """Returns True if the token should be renewed"""
            expiresOn = datetime.datetime.strptime(
                token["expiresOn"], "%Y-%m-%d %H:%M:%S.%f"
            )
            now = datetime.datetime.now()
            return (expiresOn - now) < datetime.timedelta(minutes=5)

        if not self._token or shouldRenew(self._token):
            logger.info("Renewing credentials for Alerting")
            result = None
            try:
                context = adal.AuthenticationContext(CloudLink.authority_url)
                result = context.acquire_token_with_client_credentials(CloudLink.cloudclient, CloudLink.clientid, CloudLink.clientcredentials)
            except Exception as e:
                error = "Failed to renew client credentials."
                logger.info(error)
                raise
            if result and "accessToken" in result:
                self._token = result
            else:
                logger.error(
                    "Failed to acquire bearer token. accessToken not found in result object on renewing credentials."
                )
                raise Exception("Could not acquire a bearer token")

Why doesn't json.dump work in my code?

I'm trying to put Python objects into a JSON file by fetching an API from one of the sites, but when I run the code nothing is written to the JSON file. The API is working well, and when I print the data with json.load I get the output, but I have no idea why dump doesn't work.
Here is my code:
from django.shortcuts import render
import requests
import json
import datetime
import re

def index(request):
    now = datetime.datetime.now()
    format = "{}-{}-{}".format(now.year, now.month, now.day)
    source = []
    author = []
    title = []
    date = []
    url = "http://newsapi.org/v2/everything"
    params = {
        'q': 'bitcoin',
        'from': format,
        'sortBy': 'publishedAt',
        'apiKey': '1186d3b0ccf24e6a91ab9816de603b90'
    }
    response = requests.request("GET", url, params=params)
    for news in response.json()['articles']:
        matching = re.match(r"\d+-\d+-\d+", news['publishedAt'])
        if format == matching.group():
            source.append(news['source'])
            author.append(news['author'])
            title.append(news['title'])
            date.append(news['publishedAt'])
    data = {
        'source': source,
        'author': author,
        'title': title,
        'date': date
    }
    with open('data.json', "a+") as fp:
        x = json.dump(data, fp, indent=4)
    return render(request, 'news/news.html', {'response': response})
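One likely culprit (an assumption, since the file contents aren't shown): "{}-{}-{}".format(now.year, now.month, now.day) produces a non-zero-padded date such as 2022-1-5, while the API's publishedAt dates are zero-padded (2022-01-05), so the equality check never matches and data is dumped with four empty lists. strftime avoids this:

import datetime

now = datetime.datetime.now()
# Zero-padded date, e.g. "2022-01-05", matching the API's publishedAt format
today = now.strftime("%Y-%m-%d")

Also note that opening data.json with "a+" appends a new JSON document on every request, which quickly produces a file that json.load can no longer parse as a whole; "w" is probably what's intended here.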

AWS Sagemaker batch transform with JSON input filter

I have a custom Sagemaker instance for an NLP task and am trying to run a batch transform on the following JSON file:
{"id":123, "features":"This is a test message"}
and I'm looking to output the following:
{"id":123,"SageMakerOutput":"spam"}
Here's my batch transform code:
transformer = sklearn.transformer(instance_count=1,
                                  instance_type='local',
                                  accept='application/json',
                                  output_path="s3://spam-detection-messages-output/json_examples")
transformer.transform("s3://spam-detection-messages/json_examples",
                      content_type='application/json',
                      input_filter="$.features",
                      join_source="Input",
                      output_filter="$['features','SageMakerOutput']")
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()
According to this document:
https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#batch-transform-data-processing-examples
I should be able to grab the "features" object using input_filter; however, it grabs the entire JSON payload and only outputs the prediction.
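One thing worth checking, as an assumption rather than a confirmed fix: the data-processing filters are applied per record, which for JSON input effectively means one object per line with split_type='Line', and the output_filter above selects 'features' rather than the 'id' that the desired output contains. A sketch of the transform call along those lines:

transformer.transform(
    "s3://spam-detection-messages/json_examples",  # same input prefix as above
    content_type='application/json',
    split_type='Line',            # treat each line as one record for filtering
    input_filter='$.features',    # send only the features string to the model
    join_source='Input',          # join the prediction back onto the input record
    output_filter="$['id','SageMakerOutput']",  # keep id plus the prediction
)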
I'm also including my training code:
import argparse
import pandas as pd
import os
import glob
import io
import json
from sklearn import tree
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()

def remove_stop_words(words):
    result = [i for i in words if i not in ENGLISH_STOP_WORDS]
    return result

def word_stemmer(words):
    return [stemmer.stem(o) for o in words]

def word_lemmatizer(words):
    return [lemmatizer.lemmatize(o) for o in words]

def remove_characters(words):
    return [word for word in words if len(word) > 1]

def clean_token_pipeline(words):
    cleaning_utils = [remove_stop_words, word_lemmatizer]
    for o in cleaning_utils:
        words = o(words)
    return words

def process_text(X_train, X_test, y_train, y_test):
    X_train = [word_tokenize(o) for o in X_train]
    X_test = [word_tokenize(o) for o in X_test]
    X_train = [clean_token_pipeline(o) for o in X_train]
    X_test = [clean_token_pipeline(o) for o in X_test]
    X_train = [" ".join(o) for o in X_train]
    X_test = [" ".join(o) for o in X_test]
    return X_train, X_test, y_train, y_test

def convert_to_feature(raw_tokenize_data):
    raw_sentences = [' '.join(o) for o in raw_tokenize_data]
    return vectorizer.transform(raw_sentences)

def _npy_loads(data):
    """
    Deserializes npy-formatted bytes into a numpy array
    """
    stream = io.BytesIO(data)
    return np.load(stream)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Sagemaker specific arguments. Defaults are set in the environment variables.
    parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    args = parser.parse_args()
    train_data = pd.read_csv(args.train + "/spamAssassin_min.csv", index_col=0)
    train_data.dropna(inplace=True)
    print(train_data.head())
    X_train, X_test, y_train, y_test = train_test_split(train_data['message'], train_data['label'], test_size=0.2, random_state=1)
    X_train, X_test, y_train, y_test = process_text(X_train, X_test, y_train, y_test)
    X_train = [o.split(" ") for o in X_train]
    X_test = [o.split(" ") for o in X_test]
    vectorizer = TfidfVectorizer()
    raw_sentences = [' '.join(o) for o in X_train]
    vectorizer.fit(raw_sentences)
    # print("saving transformer to {}".format(args.model_dir))
    joblib.dump(vectorizer, os.path.join(args.model_dir, "vectorizer.joblib"))
    x_train_features = convert_to_feature(X_train)
    x_test_features = convert_to_feature(X_test)
    clf = GaussianNB()
    clf.fit(x_train_features.toarray(), y_train)
    y_true, y_pred = y_test, clf.predict(x_test_features.toarray())
    print(classification_report(y_true, y_pred))
    joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))

def model_fn(model_dir):
    """Deserialized and return fitted model
    Note that this should have the same name as the serialized model in the main method
    """
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    # print("model loaded {}".format(clf))
    return clf

def input_fn(request_body, request_content_type):
    print("** input_fn**")
    print("request_body:{} request_content_type:{}".format(request_body, request_content_type))
    if request_content_type == "text/plain":
        # convert to string
        message = str(request_body)
        return message
    elif request_content_type == "application/json":
        request_body_json = json.loads(request_body)
        # print("json {}".format(request_body_json))
        return request_body_json['features']
    elif request_content_type == "application/x-npy":
        return " ".join(_npy_loads(request_body))
    else:
        # Handle other content-types here or raise an Exception
        # if the content type is not supported.
        return request_body

def predict_fn(input_data, model):
    print("** predict_fn**")
    print("input_data: {} model:{}".format(input_data, model))
    print("\n")
    prefix = '/opt/ml/'
    model_path = os.path.join(prefix, 'model')
    my_vect = joblib.load(os.path.join(model_path, "vectorizer.joblib"))
    message = "".join(clean_token_pipeline(input_data))
    print("processed message: {}".format(message))
    message = my_vect.transform([message])
    message = message.toarray()
    prediction = model.predict(message)
    return prediction
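A related detail, again an assumption: once input_filter="$.features" is actually applied, the body the container receives is just the JSON string "This is a test message" rather than the full object, in which case request_body_json['features'] in input_fn would fail. A tolerant input_fn sketch that handles both shapes:

def input_fn(request_body, request_content_type):
    if request_content_type == "application/json":
        body = json.loads(request_body)
        if isinstance(body, dict):
            return body['features']  # full object: {"id":..., "features":...}
        return body                  # already filtered down to the bare string
    return request_body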

Why are field names sliced, raising an error, in Python CSV writing?

While practicing Selenium, I failed to write a dictionary to a CSV file. I have searched for solutions to this problem, but they did not help me. My problem is that when I want to write a Python dictionary into a CSV file using DictWriter, I hit this exception:
ValueError: dict contains fields not in fieldnames: u'S', u'k', u'u'
but the field name is
Sku
Why is it sliced into single characters, giving this queer exception, when I supplied proper fieldnames to the DictWriter?
My experimenting code is:
import os,sys,bs4,random,codecs,requests
import unicodecsv as csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from contextlib import contextmanager
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.webdriver.support import expected_conditions as EC

current_file = sys.argv[0]
link_dir = os.path.dirname(current_file)
link_path = os.path.join(link_dir, 'lnks.txt')
Image_folder = os.path.join(link_dir, "images") + "\\"
urls = [line.strip() for line in open(link_path, 'r')]
urls = list(set(urls))
url = urls[0]
driver = webdriver.Firefox()  #Chrome()##chromedriver)##
base_url = 'http://www.hotleathers.com'
Header = [u'Url', u'Name', u'Sku', u'Price', u'Color', u'Size']
#def get_data(url):
#try:
print "Scraping : %s" % url
driver.get(url)
driver.implicitly_wait(3)
detpage_lnks = driver.find_elements_by_xpath("//div[@style='margin-top:0px;margin-bottom:5px']/a")
detpage_lnks = map(lambda x: x.get_attribute('href'), detpage_lnks)
for i in detpage_lnks:
    Data = []
    #try:
    driver.get(i)
    driver.implicitly_wait(3)
    Name_v = driver.find_element_by_xpath("//table[@class='showproductpage']/tbody/tr/td/h1").text
    Sku_v = driver.find_element_by_xpath("(//table[@cellspacing = '0'])[3]//td[@style='padding-left:5px; font-size:16px; font-weight:bold;']").text
    image_name = Sku_v + ".jpg"
    image_url = "http://www.hotleathers.com/Assets/ProductImages/large/" + image_name
    res = requests.get(image_url)
    if res.status_code == requests.codes.ok:
        out = open(Image_folder + image_name, 'wb')
        out.write(res.content)
    Price_v = driver.find_element_by_xpath("((//table[@cellspacing = '0'])[3]//tr)[2]//span").text
    Color = driver.find_elements_by_xpath("(//table[@class='buyProductForm'])//tr[2]/td/select/option")
    Color_v = '"' + ':'.join([i.text for i in Color[1:]]) + '"'
    Size = driver.find_elements_by_xpath("(//table[@class='buyProductForm'])//tr[3]/td/select/option")
    Size_v = '"' + ':'.join([i.text for i in Size[1:]]) + '"'
    temp = [driver.current_url, Name_v, Sku_v, Price_v, Color_v, Size_v]
    Data.append(zip(Header, temp))
    Data = [item for sublst in Data for item in sublst]
    my_dict = dict(Data)
    with codecs.open(os.path.join(link_dir, "Image_info.csv"), 'wb', encoding="utf-8") as f:
        # Using dictionary keys as fieldnames for the CSV file header
        writer = csv.DictWriter(f, delimiter=",", fieldnames=Header, lineterminator='\n')
        writer.writeheader()
        for d in my_dict:
            writer.writerow(d)
driver.close()
I tried both unicodecsv and csv, but with no success.
After many tries I found the solution below.
I did not understand that writerow expects a dictionary!
import os,sys,bs4,random,codecs,requests
import unicodecsv as csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from contextlib import contextmanager
from selenium.webdriver.support.expected_conditions import staleness_of
from selenium.webdriver.support import expected_conditions as EC

current_file = sys.argv[0]
link_dir = os.path.dirname(current_file)
link_path = os.path.join(link_dir, 'lnks.txt')
Image_folder = os.path.join(link_dir, "images") + "\\"
urls = [line.strip() for line in open(link_path, 'r')]
urls = list(set(urls))
url = urls[0]
driver = webdriver.Firefox()  #Chrome()##chromedriver)##
base_url = 'http://www.hotleathers.com'
Header = [u'Url', u'Name', u'Sku', u'Price', u'Color', u'Size']
#def get_data(url):
#try:
print "Scraping : %s" % url
driver.get(url)
driver.implicitly_wait(3)
detpage_lnks = driver.find_elements_by_xpath("//div[@style='margin-top:0px;margin-bottom:5px']/a")
detpage_lnks = map(lambda x: x.get_attribute('href'), detpage_lnks)
for i in detpage_lnks:
    Data = []
    #try:
    driver.get(i)
    driver.implicitly_wait(3)
    Name_v = driver.find_element_by_xpath("//table[@class='showproductpage']/tbody/tr/td/h1").text
    Sku_v = driver.find_element_by_xpath("(//table[@cellspacing = '0'])[3]//td[@style='padding-left:5px; font-size:16px; font-weight:bold;']").text
    image_name = Sku_v + ".jpg"
    image_url = "http://www.hotleathers.com/Assets/ProductImages/large/" + image_name
    res = requests.get(image_url)
    if res.status_code == requests.codes.ok:
        out = open(Image_folder + image_name, 'wb')
        out.write(res.content)
    Price_v = driver.find_element_by_xpath("((//table[@cellspacing = '0'])[3]//tr)[2]//span").text
    Color = driver.find_elements_by_xpath("(//table[@class='buyProductForm'])//tr[2]/td/select/option")
    Color_v = '"' + ':'.join([i.text for i in Color[1:]]) + '"'
    Size = driver.find_elements_by_xpath("(//table[@class='buyProductForm'])//tr[3]/td/select/option")
    Size_v = '"' + ':'.join([i.text for i in Size[1:]]) + '"'
    temp = [driver.current_url, Name_v, Sku_v, Price_v, Color_v, Size_v]
    Data.append(zip(Header, temp))
    Data = [item for sublst in Data for item in sublst]
    my_dict = dict(Data)
    with codecs.open(os.path.join(link_dir, "Image_info.csv"), 'ab', encoding="utf-8") as f:
        # Using dictionary keys as fieldnames for the CSV file header
        writer = csv.DictWriter(f, fieldnames=my_dict.keys())
        writer.writerow(my_dict)
driver.close()
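The key change is writer.writerow(my_dict): DictWriter.writerow takes a single dictionary mapping field names to values, whereas the first version looped over my_dict, which yields its keys, so each key string (e.g. u'Sku') was passed to writerow and its individual characters were treated as field names. A minimal sketch of the usual pattern, with hypothetical row data, opening the file once so the header is written exactly once:

import unicodecsv as csv

Header = [u'Url', u'Name', u'Sku', u'Price', u'Color', u'Size']
# Hypothetical scraped rows; in the real script each row comes from dict(zip(Header, temp))
rows = [dict(zip(Header, [u'http://example.com', u'Jacket', u'ABC123', u'$10', u'Black', u'M']))]

# unicodecsv expects a binary file object and handles encoding itself.
with open('Image_info.csv', 'wb') as f:
    writer = csv.DictWriter(f, fieldnames=Header)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)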