Widget does not get proper size when added to a layout

I need to get the real width() of a widget when it is dynamically added to a layout, because I need to do some painting on the widget based on its width(). But the code below does not work as I expected: w.width() is always 640, which is obviously not the real width.
Any idea?
# -*- coding: utf-8 -*-
import os, sys
from PyQt4 import QtGui, QtCore
from PyQt4.QtGui import *
from PyQt4.QtCore import *


class MainWidget(QWidget):
    def __init__(self, parent=None):
        super(MainWidget, self).__init__(parent)
        self.setupUI()

    def setupUI(self):
        self.mainLayout = QVBoxLayout(self)


class MyWidget(QWidget):
    def __init__(self, parent=None):
        super(MyWidget, self).__init__(parent)
        layout = QVBoxLayout(self)
        layout.addWidget(QLabel('label'))

    def minimumSizeHint(self):
        return QSize(30, 30)

    def sizeHint(self):
        return QSize(100, 100)


if __name__ == '__main__':
    app = QtGui.QApplication(sys.argv)
    main = MainWidget()
    main.show()

    l = QGridLayout()
    l.addWidget(MyWidget(), 0, 0)
    l.addWidget(MyWidget(), 0, 1)
    l.addWidget(MyWidget(), 1, 0)
    l.addWidget(MyWidget(), 1, 1)
    main.mainLayout.addLayout(l)

    w = l.itemAtPosition(0, 0).widget()
    print w.width(), w.height()

    sys.exit(app.exec_())

Just reimplement resizeEvent() and repaint whenever the size changes:
# -*- coding: utf-8 -*-
import os, sys
from PyQt4 import QtGui, QtCore
from PyQt4.QtGui import *
from PyQt4.QtCore import *


class MainWidget(QWidget):
    def __init__(self, parent=None):
        super(MainWidget, self).__init__(parent)
        self.setupUI()

    def setupUI(self):
        self.mainLayout = QVBoxLayout(self)


class MyWidget(QWidget):
    def __init__(self, parent=None):
        super(MyWidget, self).__init__(parent)
        layout = QVBoxLayout(self)
        layout.addWidget(QLabel('label'))

    def minimumSizeHint(self):
        return QSize(30, 30)

    def sizeHint(self):
        return QSize(100, 100)

    def resizeEvent(self, event):
        # width()/height() now reflect the size assigned by the layout
        print self.width(), self.height()
        # call painting here


if __name__ == '__main__':
    app = QtGui.QApplication(sys.argv)
    main = MainWidget()
    main.show()

    l = QGridLayout()
    l.addWidget(MyWidget(), 0, 0)
    l.addWidget(MyWidget(), 0, 1)
    l.addWidget(MyWidget(), 1, 0)
    l.addWidget(MyWidget(), 1, 1)
    main.mainLayout.addLayout(l)

    w = l.itemAtPosition(0, 0).widget()
    sys.exit(app.exec_())
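Alternatively, if you only need the size once after the layout has been applied (rather than on every resize), deferring the query with a zero-timeout single-shot timer seems to work in practice, because it runs only after the event loop has processed the pending show/layout events. A minimal sketch, meant to replace the print line in the __main__ block above:

def report_size():
    # Runs after the pending layout/resize events triggered by addLayout() and show().
    w = l.itemAtPosition(0, 0).widget()
    print w.width(), w.height()

QTimer.singleShot(0, report_size)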

Related

Plotly dashboard hangs in loading - likely bug

There is likely a bug in the following code that causes the dashboard not to load, but I don't see where it is:
from dash import Dash, html, dcc, Input, Output, State
import plotly.express as px
import plotly.graph_objects as go
import dash_bootstrap_components as dbc
import pandas as pd
from pandas_datareader import data
import yfinance as yf
yf.pdr_override()
from datetime import date
start = pd.to_datetime('2022-01-01')
end = pd.to_datetime(date.today())
def update_data():
# !! reset_index because otherwise plotly doesn't recognize the index as a x input in go.Figure
df = data.DataReader('USDJPY%3DX', data_source='yahoo', start=start, end=end).reset_index()
return df
app = Dash(__name__, external_stylesheets=[dbc.themes.LITERA])
app.layout = dbc.Container(
[
dbc.Row(
[dbc.Col([html.H1(
"Daily Price",
style={"textAlign": "center"},
),
dcc.Graph(id="price-chart", figure={})],
width=12,lg=6),
dbc.Col([html.H1(
"10 Day SMA of Daily Range",
style={"textAlign": "center"},
),
dcc.Graph(id="volatility-chart", figure={})],
width=12,lg=6)
]
),
dbc.Row(
dbc.Col(dcc.Dropdown(
id="dropdown",
options=["AAPL", "TSLA", "MSFT"],
value=["TSLA"],
style={"color": "green"}
),
className="three columns"),
),
dcc.Store(id="storage", storage_type="memory", data={}),
dcc.Interval(id="timer", interval=1000 * 60, n_intervals=0),
]
)
@app.callback(Output(component_id = "storage", component_property = "data"),
              Input(component_id = "timer", component_property = "n_intervals"))
def store_data(n_time):
df = update_data()
return df.to_dict("records")
@app.callback(Output(component_id = "price-chart", component_property = "figure"),
              Input(component_id = "storage", component_property = "data"))
def display_data(stored_dataframe):
df = pd.DataFrame.from_records(stored_dataframe)
fig = go.Figure(data=[go.Candlestick(x=df['Date'],
open=df['Open'],
high=df['High'],
low=df['Low'],
close=df['Close'])])
return fig
@app.callback(Output(component_id = "volatility-chart", component_property = "figure"),
              Input(component_id = "storage", component_property = "data"))
def modify_data(stored_dataframe):
df = pd.DataFrame.from_records(stored_dataframe)
df['range'] = df.High - df.Low
df['range_sma'] = df.range.rolling(10).mean()
fig = px.line(df.range_sma)
return fig
if __name__ == "__main__":
app.run_server(debug=True)
I would prefer creating the app with app = dash.Dash(__name__) and would also choose a port explicitly:
if __name__ == "__main__":
    app.run_server(debug=True, port=8050)

Unable to use method of a class in a different class - missing 2 required positional arguments

I have two Python classes: one class (CloudLink) is responsible for sending JSON events to the app, and another (ReadData) is responsible for building the JSON data.
The ReadData class uses the CloudLink methods to send the JSON data to the app, but I'm getting the error _buildJSONdata() missing 1 required positional argument: 'Data'.
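For context, this is the generic way that error arises when an instance method is called on the class itself instead of on an instance (hypothetical names, not the actual classes):

class Sender:
    def send(self, data):
        print("sending", data)

Sender.send({"k": 1})    # TypeError: send() missing 1 required positional argument: 'data'
Sender().send({"k": 1})  # OK: the instance fills in 'self'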
ReadData class
from pyspark.sql import SparkSession
import functools
from pyspark.sql import DataFrame
from pyspark.sql.functions import explode
from cosmosconnect import azurecosmos
class ReadData:
#exception(logger)
def __init__(self):
self.spark_session = (
SparkSession.builder
.appName("readData")
.getOrCreate()
)
mssparkutils.fs.unmount('/mnt/test')
logger.info("Drive unmounted")
mssparkutils.fs.mount(
'abfss://abc@transl.dfs.core.windows.net/',
'/mnt/test',
{'linkedService': "linkCosmos"}
)
logger.info("Mounted Successfully")
self.input_directory = (f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/input_path"
)
self.output_directory = (f"synfs:/{mssparkutils.env.getJobId()}/mnt/test/output_path"
)
'''
Reading the schema from csv file
'''
#exception(logger)
def readConfig(self):
try:
logger.info(f"Reading the Config present in {self.input_directory} ")
dfConfig = self.spark_session.read.option("multiline","true") \
.json(self.input_directory)
#for f in dfConfig.select("Entity","Query","Business_Rule").collect():
dfConfig=dfConfig.select(explode('Input').alias('Input_Data'))\
.select('Input_Data.Validation_Type','Input_Data.Entity','Input_Data.Query','Input_Data.Business_Rule')
for f in dfConfig.rdd.toLocalIterator():
#for index, f in dfConfig.toPandas().iterrows():
self.Validation_Type=f[0]
self.container=f[1]
self.query=f[2]
self.rule=f[3]
self.readCosmos(self)
except:
raise ValueError("")
#exception(logger)
def readCosmos(self,*params):
#from cosmosconnect import azurecosmos
#a=[]
linkedService='fg'
df=azurecosmos.cosmosConnect(linkedService,self.query,self.container)
df.cache()
if len(df.head(1)) >0:
outputpath=self.output_directory+'/'+self.container
df.coalesce(1).write.mode('overwrite').parquet(outputpath)
Status="Validation Failure"
Data= {"Validation_Type":[],"Status":[],"Container":[],"Business_Rule":[]}
Data["Validation_Type"].append(self.Validation_Type)
Data["Status"].append(Status)
Data["Container"].append(self.container)
Data["Business_Rule"].append(self.rule)
CloudLink._buildJSONdata(Data)
if __name__ == "__main__":
p = ReadData()
p.readConfig()
CloudLink class
import json
import datetime
import hashlib
import json
import sys
import traceback
import adal
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import logging
from functools import wraps
import sys
def create_logger():
#create a logger object
#logger = logging.getLogger()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logfile = logging.FileHandler('exc_logger.log')
#logfile = logging.StreamHandler(sys.stdout)
fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
formatter = logging.Formatter(fmt)
logfile.setFormatter(formatter)
logger.addHandler(logfile)
return logger
logger = create_logger()
def exception(logger):
def decorator(func):
@wraps(func)
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except:
issue = "exception in "+func.__name__+"\n"
issue = issue+"-------------------------\
------------------------------------------------\n"
logger.exception(issue)
raise
return wrapper
return decorator
class CloudLink(object):
_token = None
_instance = None
http = None
cloudclient = TokenLibrary.getSecret("xxxx", "rtrt")
clientid = TokenLibrary.getSecret("xxxx", "tyty")
clientcredentials = TokenLibrary.getSecret("xxxx", "abcabc")
authority_url = TokenLibrary.getSecret("xxxx", "abab")
cloudtest = TokenLibrary.getSecret("xxxx", "yyyy")
@staticmethod
def getInstance():
if not CloudLink._instance:
CloudLink._instance = CloudLink()
return CloudLink._instance
def __init__(self):
retry_strategy = Retry(
total=3,
backoff_factor=0,
status_forcelist=[429, 500, 502, 503, 504],
allowed_methods=["HEAD", "GET", "OPTIONS"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.http = requests.Session()
self.http.mount("https://", adapter)
self.http.mount("http://", adapter)
print("Inside init")
def parseJSON(self, t):
try:
eventData = json.loads(t)
logger.info(f"Sending {eventData} to cloud")
self.sendToCloud(eventData)
except ValueError as e:
print("Error: %s Please validate JSON in https://www.jsonschemavalidator.net/"% e)
return None # or: raise
def sendToCloud(self, eventData):
cloudData = {"eventData": eventData, "metadata": self._buildMetadata()}
logger.info(f"Raising alert with data=({cloudData}")
response = self.http.post(
self.cloudtest, headers=self._buildHeaders(), json=cloudData
)
logger.info(f"cloud alert response={response}")
if response.status_code == 202 or response.status_code == 200:
logger.info("Mail sent to Cloud")
else:
raise Exception(f"Cloud reporting failed with Error {response}")
def _buildJSONdata(self,Data):
if len(Data) == 0:
raise Exception("JSON is empty")
else:
t = json.dumps(self.Data)
self.parseJSON(t)
def _buildMetadata(self):
return {
"messageType": "Send Email",
"messageVersion": "0.0.1",
"sender": "Send Email",
}
def _buildHeaders(self):
self._refreshADToken()
headers = {
"Authorization": "Bearer {}".format(self._token["accessToken"]),
"Content-type": "application/json",
"Accept": "text/plain",
}
return headers
def _refreshADToken(self):
def shouldRenew(token):
"""Returns True if the token should be renewed"""
expiresOn = datetime.datetime.strptime(
token["expiresOn"], "%Y-%m-%d %H:%M:%S.%f"
)
now = datetime.datetime.now()
return (expiresOn - now) < datetime.timedelta(minutes=5)
if not self._token or shouldRenew(self._token):
logger.info("Renewing credentials for Alerting")
result = None
try:
context = adal.AuthenticationContext(CloudLink.authority_url)
result = context.acquire_token_with_client_credentials(CloudLink.cloudclient, CloudLink.clientid,CloudLink.clientcredentials)
except Exception as e:
error = "Failed to renew client credentials."
logger.info(error)
raise
if result and "accessToken" in result:
self._token = result
else:
logger.error(
"Failed to acquire bearer token. accessToken not found in result object on renewing credentials."
)
raise Exception("Could not acquire a bearer token")

load_model() doesn't load a model

I have Python version 3.6 and tensorflow version 2.0.1 installed. I created a model and tried to load it, but ran into a problem:
Traceback (most recent call last):
File "C:/Users/Irina/Documents/PYTHON4/main.py", line 86, in <module>
main()
File "C:/Users/Irina/Documents/PYTHON4/main.py", line 73, in main
predictions = get_predictions(data, model)
File "C:/Users/Irina/Documents/PYTHON4/main.py", line 59, in get_predictions
predictions = [predict(np.array([image]), model) for image in data]
File "C:/Users/Irina/Documents/PYTHON4/main.py", line 59, in <listcomp>
predictions = [predict(np.array([image]), model) for image in data]
File "C:\Users\Irina\Documents\PYTHON4\eval.py", line 53, in predict
pred = model.predict(data)
AttributeError: 'list' object has no attribute 'predict'
Process finished with exit code 1
I suspect that the error is in the non-working load_model() function, although it may be due to an incompatibility between Python and TensorFlow.
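One way to check is to surface the real exception instead of swallowing it: the bare except in eval.py (shown below) prints "Error" and returns a list, which is exactly what later produces 'list' object has no attribute 'predict'. A minimal sketch of that loader with the original error re-raised:

from tensorflow.keras.models import load_model

MODEL_FILE_NAME = 'EasyNet.h5'

def load_final_model():
    try:
        model = load_model(MODEL_FILE_NAME)
    except Exception as e:
        # Show the real reason (bad path, h5py missing, version mismatch, ...)
        print("load_model failed:", repr(e))
        raise
    return model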
Here's main.py:
# -*- coding: utf-8 -*-
import cv2 # computer vision library
import os
from sklearn.metrics import f1_score
import numpy as np
from imutils import paths
from eval import standardize_input, predict, load_final_model
def unison_shuffled_copies(a, b):
assert len(a) == len(b)
p = np.random.permutation(len(a))
return a[p], b[p]
# Image data directories
def one_hot_encode(label):
dictAnimal = {'cats': 0, 'dogs': 1}
return dictAnimal[label]
def load_data():
IMAGE_DIR_VALIDATION = "animalsval"
imagePaths = sorted(list(paths.list_images(IMAGE_DIR_VALIDATION)))
data = []
labels = []
for imagePath in imagePaths:
data.append(standardize_input(imagePath))
label = imagePath.split(os.path.sep)[-2]
labels.append(one_hot_encode(label))
data = np.array(data, dtype="float")
labels = np.array(labels)
data, labels = unison_shuffled_copies(data, labels)
return data, labels
def get_predictions(data, model):
predictions = [predict(np.array([image]), model) for image in data]
return predictions
def main():
data, labels = load_data()
try:
model = load_final_model()
except:
print('The model is not loaded, we use a constant classifier')
model = None
predictions = get_predictions(data, model)
try:
f1 = f1_score(labels, predictions)
print('F1-Classifier measure:', f1)
except Exception as e:
print('Error: ', e)
file = open("score.txt", "w")
file.write(str(f1))
file.close()
if __name__ == '__main__':
main()
Here's eval.py:
# -*- coding: utf-8 -*-
import numpy as np
import cv2
from tensorflow.keras.models import load_model
def standardize_input(image):
standard_im = cv2.imread(image)
standard_im = cv2.resize(standard_im, (32, 32))
standard_im = np.reshape(standard_im, (1, 32, 32, 3))
return standard_im
MODEL_FILE_NAME = 'EasyNet.h5'
def load_final_model():
try:
model = load_model(MODEL_FILE_NAME)
print(type(model))
except Exception:
print("Error")
model = []
return model
def predict(image, model):
data = image
label = 0
pred = model.predict(data)
if pred[0][0] >= pred[0][1]:
label = 0
else:
label = 1
return label
Here's the model training script:
import matplotlib
matplotlib.use("Agg")
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import random
import pickle
import cv2
import os
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
ImagePaths = list(paths.list_images("/content/drive/My Drive/GoogleColab/dogs/"))
random.seed(42)
ImagePaths = ImagePaths[:1800] + list(paths.list_images("/content/drive/My Drive/GoogleColab/cats/"))
print(len(ImagePaths))
random.shuffle(ImagePaths)
data = []
labels = []
i=0
for imagepath in ImagePaths:
if i % 100 == 0:
print(str(i)+"/3555")
i+=1
image = cv2.imread(imagepath)
image = cv2.resize(image, (32, 32))
data.append(image)
label = imagepath.split(os.path.sep)[-2]
if label == "cats":
label = [1,0]
else:
label = [0,1]
labels.append(label)
print(labels)
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)
with open("/content/drive/My Drive/GoogleColab/Lesson_4/data.pickle", 'wb') as f:
pickle.dump(data, f)
print("Data saved")
with open("/content/drive/My Drive/GoogleColab/Lesson_4/labels.pickle", 'wb') as f:
pickle.dump(labels, f)
print("Labels saved")
with open("/content/drive/My Drive/GoogleColab/Lesson_4/data.pickle", 'rb') as f:
data = pickle.load(f)
print("Data loaded")
with open("/content/drive/My Drive/GoogleColab/Lesson_4/labels.pickle", 'rb') as f:
labels = pickle.load(f)
print("Labels loaded")
(trainX, testX, trainY, testY) = train_test_split(data, labels,
test_size=0.15,
random_state=42)
print("Dataset prepared")
from tensorflow.keras.layers import Conv2D, Flatten, Dropout, Activation, MaxPooling2D
from tensorflow.keras.optimizers import Adam
model = Sequential()
model.add(Conv2D(32, (3, 3), padding="same", input_shape=(32, 32, 3)))
model.add(Activation("relu"))
model.add(Conv2D(32, (3, 3), padding="same", activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(64, (3, 3), padding="same", activation="relu"))
model.add(Conv2D(64, (3, 3), padding="same", activation="relu"))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
#model.add(Dense(1024,input_shape=(3072,), activation='sigmoid'))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
INIT_LR = 0.01
#opt = SGD(lr=INIT_LR)
opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(loss="binary_crossentropy",optimizer=opt,
metrics=["accuracy"]) #categorial_crosentropy
print ("Model compiled")
model.summary()
EPOCHS = 30
checkpointer = ModelCheckpoint(filepath='/content/drive/My Drive/GoogleColab/Lesson_4/ConvNN.h5', verbose=1, save_best_only=True)
H = model.fit(trainX, trainY, validation_data=(testX,testY),
epochs=EPOCHS, batch_size=32,
shuffle=True,
callbacks=[checkpointer])
print("Model trained")
predictions = model.predict(testX, batch_size=32)
print(predictions)
print(classification_report(testY.argmax(axis=1),
predictions.argmax(axis=1), target_names=("cats","dogs")))
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="vall_loss")
plt.plot(N, H.history["accuracy"], label="train_acc")
plt.plot(N, H.history["val_accuracy"], label="val_acc")
plt.title("Results")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy/")
plt.legend()
plt.savefig("/content/drive/My Drive/GoogleColab/Lesson_4/Loss.png")
model.save("/content/drive/My Drive/GoogleColab/Lesson_4/EasyNet.h5")
print("End")

Stream multiple videos using OpenCV Python Flask

I'm trying to stream 2 webcams at once using Flask in Python, but I'm not able to do so. When I run my code, both webcams light up, but only one of the cameras shows on the webpage, and I'm not sure why.
Here is the code I'm using:
from flask import Flask, Response, render_template
from vCamera import VideoCamera
import pdb
app = Flask(__name__)
@app.route('/')
def index():
return render_template('index.html')
def gen(vCamera0):
while True:
frame0 = vCamera0.get_frame0()
yield (b'--frame0\r\n'
b'Content-Type: image/jpeg\r\n\r\n' + frame0 + b'\r\n\r\n')
frame2 = vCamera0.get_frame2()
yield (b'--frame2\r\n'
b'Content-Type: image/jpeg\r\n\r\n' + frame2 + b'\r\n\r\n')
@app.route('/video_feed0')
def video_feed0():
return Response(gen(VideoCamera()),
mimetype='multipart/x-mixed-replace; boundary=frame0')
@app.route('/video_feed2')
def video_feed2():
return Response(gen(VideoCamera()),
mimetype='multipart/x-mixed-replace; boundary=frame2')
if __name__ == '__main__':
app.run(host='127.0.0.1', debug=True)
And this is my camera file:
import pdb
import cv2
fullbody_cascade = cv2.CascadeClassifier('haarcascade_fullbody.xml')
upperbody_cascade = cv2.CascadeClassifier('haarcascade_upperbody.xml')
class VideoCamera(object):
def __init__(self):
self.video0 = cv2.VideoCapture(0)
self.video2 = cv2.VideoCapture(2)
def __del__(self):
self.video0.release()
def get_frame0(self):
success0, frame0 = self.video0.read()
gray0 = cv2.cvtColor(frame0, cv2.COLOR_BGR2GRAY)
fullbody0 = fullbody_cascade.detectMultiScale(gray0)
upperbody0 = upperbody_cascade.detectMultiScale(gray0)
for (x,y,w,h) in fullbody0:
cv2.rectangle(frame0, (x,y), (x+w, y+h), (255,0,0), 2)
for (x,y,w,h) in upperbody0:
cv2.rectangle(frame0, (x,y), (x+w, y+h), (255,0,0), 2)
ret0, jpeg0 = cv2.imencode('.jpg', frame0)
return jpeg0.tobytes()
def get_frame2(self):
success2, frame2 = self.video2.read()
gray2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
fullbody2 = fullbody_cascade.detectMultiScale(gray2)
upperbody2 = upperbody_cascade.detectMultiScale(gray2)
for (x,y,w,h) in fullbody2:
cv2.rectangle(frame2, (x,y), (x+w, y+h), (255,0,0), 2)
for (x,y,w,h) in upperbody2:
cv2.rectangle(frame2, (x,y), (x+w, y+h), (255,0,0), 2)
ret2, jpeg2 = cv2.imencode('.jpg', frame2)
return jpeg2.tobytes()
I am very new to Flask so I'm not quite sure what the issue with the code I have written is. Any advice would be helpful!
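For what it's worth, a common pattern is to share one VideoCamera instance and give each route its own generator, so each multipart response only ever uses the boundary declared in its own mimetype; index.html would then embed two <img> tags, one per feed URL. A rough sketch against the classes above (an untested assumption, not a verified fix):

camera = VideoCamera()  # open both capture devices once, shared by both routes

def gen_feed(read_frame, boundary):
    # Emit a single-boundary multipart stream for one camera.
    while True:
        frame = read_frame()
        yield (b'--' + boundary + b'\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n\r\n')

@app.route('/video_feed0')
def video_feed0():
    return Response(gen_feed(camera.get_frame0, b'frame0'),
                    mimetype='multipart/x-mixed-replace; boundary=frame0')

@app.route('/video_feed2')
def video_feed2():
    return Response(gen_feed(camera.get_frame2, b'frame2'),
                    mimetype='multipart/x-mixed-replace; boundary=frame2')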

AWS Sagemaker batch transform with JSON input filter

I have a custom SageMaker instance for an NLP task and I'm trying to run a batch transform on the following JSON file:
{"id":123, "features":"This is a test message"}
and I'm looking to output the following:
{"id":123,"SageMakerOutput":"spam"}
Here's my batch transform code:
transformer = sklearn.transformer(instance_count=1,
instance_type='local',
accept='application/json',
output_path="s3://spam-detection-messages-output/json_examples")
transformer.transform("s3://spam-detection-messages/json_examples", content_type='application/json', input_filter="$.features", join_source="Input", output_filter="$['features', SageMakerOutput']")
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()
According to this document,
https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#batch-transform-data-processing-examples
I should be able to grab the "features" object using input_filter; however, it grabs the entire JSON payload and only outputs the prediction.
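For reference, the filters on that page are JSONPath expressions applied per record; something along these lines is what I would expect to send only "features" to the model, join the prediction back onto the original record, and keep just "id" and "SageMakerOutput" in the output (parameter values here are assumptions based on that documentation, not a tested call):

transformer.transform(
    "s3://spam-detection-messages/json_examples",
    content_type="application/json",
    input_filter="$.features",                 # what is actually sent to the model
    join_source="Input",                       # join the original record with the prediction
    output_filter="$['id','SageMakerOutput']", # keep only these fields in the output
)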
I'm also including my training code:
import argparse
import pandas as pd
import os
import glob
import io
import json
from sklearn import tree
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()
def remove_stop_words(words):
result = [i for i in words if i not in ENGLISH_STOP_WORDS]
return result
def word_stemmer(words):
return [stemmer.stem(o) for o in words]
def word_lemmatizer(words):
return [lemmatizer.lemmatize(o) for o in words]
def remove_characters(words):
return [word for word in words if len(word)> 1]
def clean_token_pipeline(words):
cleaning_utils = [remove_stop_words, word_lemmatizer]
for o in cleaning_utils:
words = o(words)
return words
def process_text(X_train, X_test, y_train, y_test):
X_train = [word_tokenize(o) for o in X_train]
X_test = [word_tokenize(o) for o in X_test]
X_train = [clean_token_pipeline(o) for o in X_train]
X_test = [clean_token_pipeline(o) for o in X_test]
X_train = [" ".join(o) for o in X_train]
X_test = [" ".join(o) for o in X_test]
return X_train, X_test, y_train, y_test
def convert_to_feature(raw_tokenize_data):
raw_sentences = [' '.join(o) for o in raw_tokenize_data]
return vectorizer.transform(raw_sentences)
def _npy_loads(data):
"""
Deserializes npy-formatted bytes into a numpy array
"""
stream = io.BytesIO(data)
return np.load(stream)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# Sagemaker specific arguments. Defaults are set in the environment variables.
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
args = parser.parse_args()
train_data = pd.read_csv(args.train+"/spamAssassin_min.csv", index_col=0)
train_data.dropna(inplace=True)
print(train_data.head())
X_train, X_test, y_train, y_test = train_test_split(train_data['message'], train_data['label'], test_size = 0.2, random_state = 1)
X_train, X_test, y_train, y_test = process_text(X_train, X_test, y_train, y_test)
X_train = [o.split(" ") for o in X_train]
X_test = [o.split(" ") for o in X_test]
vectorizer = TfidfVectorizer()
raw_sentences = [' '.join(o) for o in X_train]
vectorizer.fit(raw_sentences)
# print("saving transformer to {}".format(args.model_dir))
joblib.dump(vectorizer, os.path.join(args.model_dir, "vectorizer.joblib"))
x_train_features = convert_to_feature(X_train)
x_test_features = convert_to_feature(X_test)
clf = GaussianNB()
clf.fit(x_train_features.toarray(),y_train)
y_true, y_pred = y_test, clf.predict(x_test_features.toarray())
print(classification_report(y_true, y_pred))
joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))
def model_fn(model_dir):
"""Deserialized and return fitted model
Note that this should have the same name as the serialized model in the main method
"""
clf = joblib.load(os.path.join(model_dir, "model.joblib"))
# print("model loaded {}".format(clf))
return clf
def input_fn(request_body, request_content_type):
print("** input_fn**")
print("request_body:{} request_content_type:{}".format(request_body, request_content_type))
if request_content_type == "text/plain":
#convert to string
message = str(request_body)
return message
elif request_content_type == "application/json":
request_body_json = json.loads(request_body)
# print("json {}".format(request_body_json))
return request_body_json['features']
elif request_content_type == "application/x-npy":
return " ".join(_npy_loads(request_body))
else:
# Handle other content-types here or raise an Exception
# if the content type is not supported.
return request_body
def predict_fn(input_data, model):
print("** predict_fn**")
print("input_data: {} model:{}".format(input_data, model))
print("\n")
prefix = '/opt/ml/'
model_path = os.path.join(prefix, 'model')
my_vect = joblib.load(os.path.join(model_path, "vectorizer.joblib"))
message = "".join(clean_token_pipeline(input_data))
print("processed message: {}".format(message))
message = my_vect.transform([message])
message = message.toarray()
prediction = model.predict(message)
return prediction