Cannot load exact model from Hugging Face pipeline - deep-learning

I tried to load a model, which I found on HuggingFace:
https://huggingface.co/deepset/gelectra-large-germanquad
The pipeline shows different (but correct) results than loading the model directly. What do I need to do to load the exact model that the pipeline uses?
Pipeline:
from transformers import pipeline
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/gelectra-base-germanquad",
    tokenizer="deepset/gelectra-base-germanquad",
)
qa_pipeline({
    "context": "Ein Geschäftsbericht kostet über 100.000 Euro",
    "question": "Wie teuer ist ein Geschäftsbericht?",
})
>>> out[0]: "über 100.000 Euro"
Loading the model:
from transformers import ElectraTokenizer, ElectraForQuestionAnswering
import torch

tokenizer = ElectraTokenizer.from_pretrained("deepset/gelectra-base-germanquad")
model = ElectraForQuestionAnswering.from_pretrained("deepset/gelectra-base-germanquad")

question = "Wie teuer ist ein Geschäftsbericht?"
doc = "Ein Geschäftsbericht kostet über 100.000 Euro"

encoding = tokenizer(question, doc, add_special_tokens=True, return_tensors="pt")
outputs = model(**encoding)
start = outputs.start_logits
end = outputs.end_logits
all_tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())  # tokens of the encoded input
answer_tokens = all_tokens[torch.argmax(start):torch.argmax(end) + 1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
print(answer)
>>> out[1]: ""
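For reference, the empty answer in the manual path is most likely because a plain argmax over all logits can land on the question or a special token (often giving end < start), while the question-answering pipeline restricts the search to the context tokens before decoding. Below is a rough sketch of that idea (not part of the original post); it masks the question and special tokens via token_type_ids, which only approximates the pipeline's internal post-processing:
import torch

mask = encoding["token_type_ids"][0] == 0  # question + special tokens (assumption: BERT-style segment ids)
start_logits = outputs.start_logits[0].masked_fill(mask, float("-inf"))
end_logits = outputs.end_logits[0].masked_fill(mask, float("-inf"))
start_idx = torch.argmax(start_logits)
end_idx = torch.argmax(end_logits)
answer_ids = encoding["input_ids"][0][start_idx:end_idx + 1]
print(tokenizer.decode(answer_ids))  # should now decode a span from the context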

Related

How to register the logged model in MLflow?

I have tried to log the deep learning model with MLflow, and it logs fine, but the model is not stored in the model registry. Can anyone guide me on how to register the model and its dataset for inference?
Thanks
import mlflow
from mlflow import MlflowClient

experiment_name = "nlp_model"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)  # set the experiment id
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

with mlflow.start_run(experiment_id=exp_id):
    run_id = mlflow.active_run().info.run_id
    print(run_id)
    mlflow.sklearn.autolog(log_models=True)
    print(mlflow.tracking.get_tracking_uri())
    model = Demucs(**args.demucs, sample_rate=args.sample_rate)  # fitting the model
    """
    loaded model on mlflow
    """
    mlflow.sklearn.log_model(model, "nlp_model")
    """
    saved model on mlflow model registry
    """
    client = MlflowClient()
    model = client.create_model_version(
        name="denoiser_nlp_model",
        source=f"./mlruns/{exp_id}/{run_id}/artifacts/nlp_model",
        run_id=mlflow.active_run().info.run_id,
    )
Here is the error message (screenshot): https://i.stack.imgur.com/LNAsc.png
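The question itself has no answer here, but as a hedged sketch (not from the original post): mlflow.sklearn.log_model on its own only logs the model as a run artifact; to get it into the model registry you can pass registered_model_name when logging, or call mlflow.register_model with the runs:/ URI afterwards. Note that the registry generally needs a database-backed tracking store (e.g. sqlite), not the default ./mlruns file store:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")  # assumption: a local sqlite-backed store that supports the registry

with mlflow.start_run() as run:
    # log the fitted model as a run artifact and register it in one step
    mlflow.sklearn.log_model(
        model,
        artifact_path="nlp_model",
        registered_model_name="denoiser_nlp_model",
    )

# alternatively, register an already-logged model by its runs:/ URI
# mlflow.register_model(f"runs:/{run.info.run_id}/nlp_model", "denoiser_nlp_model")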

How to download a trained model as a pickle file using Streamlit Download Button?

How do I download a trained model as a pickle file using Streamlit Download Button?
You can use io.BytesIO to store the pickled data as bytes in RAM. Then pass those bytes as the data argument to the st.download_button function.
import io
import pickle

import streamlit as st


def create_model():
    """Create an sklearn model so that we will have
    something interesting to pickle.

    Example taken from here:
    https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
    """
    import pandas as pd
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression

    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
    names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
    dataframe = pd.read_csv(url, names=names)
    array = dataframe.values
    X = array[:, 0:8]
    Y = array[:, 8]
    test_size = 0.33
    seed = 7
    X_train, _, Y_train, _ = model_selection.train_test_split(X, Y, test_size=test_size, random_state=seed)
    # Fit the model on training set
    model = LogisticRegression()
    model.fit(X_train, Y_train)
    return model


def pickle_model(model):
    """Pickle the model inside bytes. In our case, it is the "same" as
    storing a file, but in RAM.
    """
    f = io.BytesIO()
    pickle.dump(model, f)
    return f


st.title("My .pkl downloader")
model = create_model()
data = pickle_model(model)
st.download_button("Download .pkl file", data=data, file_name="my-pickled-model.pkl")
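As a quick sanity check (not part of the original answer), the downloaded file is an ordinary pickle, so it can be loaded back the usual way:
import pickle

with open("my-pickled-model.pkl", "rb") as f:
    restored_model = pickle.load(f)
print(type(restored_model))  # e.g. the LogisticRegression class pickled above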

tfa.optimizers.MultiOptimizer - TypeError: 'Not JSON Serializable:'

I'm trying to use tfa.optimizers.MultiOptimizer(). I did everything according to the docs (https://www.tensorflow.org/addons/api_docs/python/tfa/optimizers/MultiOptimizer) yet I'm getting the following error:
TypeError: ('Not JSON Serializable:', <tf.Tensor 'gradient_tape/model_80/dense_3/Tensordot/MatMul/MatMul:0' shape=(1, 1) dtype=float32>)
Below is a minimal working example that reproduces the error; just copy and paste it. The error occurs when the first epoch is finished and the callback tries to save the model.
##############################################################################
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.layers as l
import tensorflow_addons.layers as la
import tensorflow.keras as ke
import numpy as np
##############################################################################

def build_model_1():
    model_input = l.Input(shape=(32, 1))
    x = l.Dense(1)(model_input)
    model = ke.Model(inputs=model_input, outputs=x)
    ##########
    optimizers = [tf.keras.optimizers.Adam(),
                  tf.keras.optimizers.Adam()]
    optimizers_and_layers = [(optimizers[0], model.layers[:5]), (optimizers[1], model.layers[5:])]
    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
    model.compile(optimizer=optimizer, loss='mse', metrics='mse')
    test = tf.keras.optimizers.serialize(optimizer)
    return model

##############################################################################
input_data = np.arange(0, 10000, 1).reshape(10000, 1)
target_data = np.arange(-10000, 0, 1).reshape(10000, 1)

model = build_model_1()

model_checkpoint = ke.callbacks.ModelCheckpoint('best_model.h5',
                                                monitor='val_mse',
                                                mode='min',
                                                save_best_only=True,
                                                verbose=1)

training_history = model.fit(x=input_data,
                             y=target_data,
                             validation_split=0.2,
                             epochs=5,
                             verbose=1,
                             callbacks=[model_checkpoint])
##############################################################################
When saving a complete Keras model (with its own structure in the .h5 file), the tf.keras.Model object is fully serialized as JSON: this means that every property of the model should be JSON serializable.
NOTE: tf.Tensor objects are NOT JSON serializable.
When using this multi-optimizer from tfa, you're adding properties to the model that the JSON serializer will try (and fail) to serialize.
In particular, there's this attribute gv, which I think comes from the custom optimizer used:
'gv': [(<tf.Tensor 'gradient_tape/model/dense/Tensordot/MatMul/MatMul:0' shape=(1, 1) dtype=float32>, <tf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[-0.55191684]], dtype=float32)>), (<tf.Tensor 'gradient_tape/model/dense/BiasAdd/BiasAddGrad:0' shape=(1,) dtype=float32>, <tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([-0.23444518], dtype=float32)>)]},
None of these tf.Tensor objects are JSON serializable, which is why saving fails.
The only option is not to save the model completely (with all its attributes, which would have to be defined as Keras layers, which is not possible here) but to save only the model parameters.
In short, if you add save_weights_only=True to the callback, your training (and checkpointing of the weights) will work fine.
model_checkpoint = ke.callbacks.ModelCheckpoint(
    "best_model.h5",
    monitor="val_mse",
    mode="min",
    save_best_only=True,
    verbose=1,
    save_weights_only=True,
)
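One caveat worth noting (a minimal sketch, not from the original answer): with save_weights_only=True the checkpoint file contains only the weights, so at inference time you rebuild the architecture first and then load them:
# rebuild the (compiled) architecture, then restore the checkpointed weights
best_model = build_model_1()
best_model.load_weights("best_model.h5")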

Twitter streaming script is throwing a KeyError on the location field of the tweet

I have written a Python script that streams tweets using the tweepy module. After streaming for around 3 minutes, I dump the tweets into a .json file and try to populate a pandas dataframe with the location and text fields of each tweet. The text field gets populated, but not for every tweet in the .json file (problem 1), and for the location field a KeyError is thrown (problem 2). May I know what exactly is going wrong?
twitter_stream_dump.py
import time
import json
import pandas as pd
import re
# tweepy based modules
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

# initializing authentication credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def __init__(self, time_limit):
        self.start_time = time.time()
        self.limit = time_limit
        self.saveFile = open('requests.json', 'a')
        super(StdOutListener, self).__init__()

    def on_data(self, data):
        if (time.time() - self.start_time) < self.limit:
            self.saveFile.write(data)
            self.saveFile.write('\n')
            return True
        else:
            self.saveFile.close()
            return False

    def on_error(self, status):
        print(status)

def getwords(string):
    return re.findall(r"[\w'#]+|[.,!?;]", string)

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    time_limit = input("Enter the time limit in minutes : ")
    time_limit *= 60
    stream = Stream(auth, listener=StdOutListener(time_limit))
    string = raw_input("Enter the list of keywords/hashtags to be compared : ")
    keyword_list = getwords(string)
    # This line filters Twitter Streams to capture data by the given keywords
    stream.filter(track=keyword_list)

    tweets_data_path = 'requests.json'
    tweets_data = []
    tweet_list = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweet_list.append(tweet)
        except:
            continue
    num_tweets_collected = len(tweet_list)

    # Creates a data frame structure
    tweet_dataframe = pd.DataFrame()
    text_dump = open('text_dump.txt', 'w')
    # Populating the location field of the data frame
    # tweet_dataframe['location'] = map(lambda tweet: tweet['location'], tweet_list)
    tweet_dataframe['text'] = map(lambda tweet: tweet['text'], tweet_list)
    print(tweet_dataframe['text'])
Errors :
abhijeet-mohanty-2:Desktop SubrataMohanty$ python twitter_stream_dump.py
Enter the time limit in minutes : 3
Enter the list of keywords/hashtags to be compared : python ruby scala
Traceback (most recent call last):
File "twitter_stream_dump.py", line 81, in <module>
tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
File "twitter_stream_dump.py", line 81, in <lambda>
tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
KeyError: 'location'
requests.json (My .json file)
https://drive.google.com/file/d/0B1p05OszaBkXLWFsQ2VmeWVjbDQ/view?usp=sharing
The location field is a user-defined value and will sometimes not be present.
That's why you're getting the KeyError.
Note that location is part of the "user profile" metadata that comes with a tweet. It's intended to describe a user's location (like their hometown), and not the geotagged location of a given tweet.
In case you're interested in geotags, first check a tweet to see if the geo_enabled field is true. If so, the geo, coordinates, and place fields may contain geotagged information.
As for missing text entries, I don't see the same issue when using the data you provided. It's possible the issue was caused by your try/except clause when reading in lines of data. Consider this approach:
for i, line in enumerate(tweets_file):
    if line.rstrip():
        tweet = json.loads(line)
        tweet_list.append(tweet)

num_tweets_collected = len(tweet_list)

texts = [tweet['text'] for tweet in tweet_list]
tweet_dataframe = pd.DataFrame(texts, columns=['text'])
Sample output:
print(tweet_dataframe.head())
# text
# 0 Tweets and python BFF <3 15121629.976126991
# 1 RT #zeroSteiner: Can now write more post modul...
# 2 •ruby• #MtvInstagLSelena #MtvColabTaylors
# 3 Ruby Necklace July Birthstone Jewelry Rosary...
# 4 #ossia I didn't see any such thing as Python. ...
A few quick summary stats show that no lines are missing, and no entries are null:
print("N tweets: {}".format(num_tweets_collected))
# N tweets: 286
print("N rows in dataframe: {}".format(tweet_dataframe.shape[0]))
# N rows in dataframe: 286
null_count = tweet_dataframe.text.isnull().sum()
print("Tweets with no text field extracted: {}".format(null_count))
# Tweets with no text field extracted: 0
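If you still want a location column, remember (as noted above) that location lives inside the tweet's user object and can be missing or empty, so a hedged sketch would use dict.get instead of direct indexing:
# location sits under the "user" object in the standard v1.1 payload and may be empty
locations = [tweet.get('user', {}).get('location') for tweet in tweet_list]
tweet_dataframe['location'] = locations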

How to separate text from twitter streaming JSON responses and run analysis on text with python?

I am trying to use the Twitter API to run sentiment analysis on the text. I am running into the issue that I do not understand how to separate the text from each tweet and run the sentiment polarity analysis provided by the TextBlob library. Furthermore, I would like this to only pull back English tweets. The output is in JSON.
Here is the code to produce the tweets based on keywords (in this case "usd", "euro", "loonie") and my lame attempt at storing the text and using the result in a variable:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import re
import pandas as pd
import matplotlib.pyplot as plt

# Variables that contain the user credentials to access the Twitter API
access_token = "xxxx"
access_token_secret = "xxxx"
consumer_key = "xxxx"
consumer_secret = "xxxx"

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        print data
        return True

    def on_error(self, status):
        print status

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)
    # This line filters Twitter Streams to capture data by the keywords: 'euro', 'dollar', 'loonie'
    stream.filter(track=['euro', 'dollar', 'loonie', ])

    tweets_data_path = stream.filter
    tweets_data = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweets_data.append(tweet)
        except:
            continue
    print len(tweets_data)
    tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
    wiki = TextBlob(tweets['text'])
    r = wiki.sentiment.polarity
    print r
This is what the output looks like:
{"created_at":"Sun Jun 14 23:43:31 +0000 2015","id":610231121016524801,"id_str":"610231121016524801","text":"RT #amirulimannn: RM6 diperlukan utk tukar kpd 1Pound.\nRM3 diperlukan utk tukar kpd 1S'pore Dollar.\n\nGraf matawang jatuh. Tak sedih ke? htt\u2026","source":"\u003ca href=\"http://twitter.com/download/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42642877,"id_str":"42642877","name":"Wny","screen_name":"waaannnyyy","location":"Dirgahayu Darul Makmur","url":null,"description":"Aku serba tiada, aku kekurangan.","protected":false,"verified":false,"followers_count":320,"friends_count":239,"listed_count":1,"favourites_count":4344,"statuses_count":34408,"created_at":"Tue May 26 15:10:28 +0000 2009","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_tile":true,"profile_link_color":"DD2E44","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/42642877/1415486321","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sat Jun 13 03:33:29 +0000 2015","id":609564219495706624,"id_str":"609564219495706624","text":"RM6 diperlukan utk tukar kpd 1Pound.\nRM3 diperlukan utk tukar kpd 1S'pore Dollar.\n\nGraf matawang jatuh. Tak sedih ke? 
http://t.co/dum4skb6uK","source":"\u003ca href=\"http://twitter.com/download/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":481856658,"id_str":"481856658","name":"seorang iman","screen_name":"amirulimannn","location":"+06MY","url":"http://instagram.com/amirulimannn","description":"I wanna drown myself in a bottle of her perfume","protected":false,"verified":false,"followers_count":723,"friends_count":834,"listed_count":2,"favourites_count":4810,"statuses_count":50981,"created_at":"Fri Feb 03 07:49:55 +0000 2012","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"AD0A20","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_tile":false,"profile_link_color":"E36009","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"24210E","profile_text_color":"89B5A2","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/481856658/1428379855","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweet_count":1321,"favorite_count":229,"entities":{"hashtags":[],"trends":[],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[118,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[118,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"in"},"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"trends":[],"urls":[],"user_mentions":[{"screen_name":"amirulimannn","name":"seorang 
iman","id":481856658,"id_str":"481856658","indices":[3,16]}],"symbols":[],"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[139,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}},"source_status_id":609564219495706624,"source_status_id_str":"609564219495706624"}]},"extended_entities":{"media":[{"id":609564142886760448,"id_str":"609564142886760448","indices":[139,140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dum4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340,"h":340,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":600,"h":600,"resize":"fit"},"large":{"w":1024,"h":1024,"resize":"fit"}},"source_status_id":609564219495706624,"source_status_id_str":"609564219495706624"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"in","timestamp_ms":"1434325411453"}
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json

# Variables that contain the user credentials to access the Twitter API
access_token = ''
access_token_secret = ''
consumer_key = ''
consumer_secret = ''

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        json_load = json.loads(data)
        texts = json_load['text']
        coded = texts.encode('utf-8')
        s = str(coded)
        print(s[2:-1])
        return True

    def on_error(self, status):
        print(status)

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener())

# This line filters Twitter Streams to capture data by the keywords: 'euro', 'dollar', 'loonie'
stream.filter(track=['euro', 'dollar', 'loonie', ], languages=['en'])
For your original question about the JSON: you can load the data as it streams using json.loads(). The extra encoding steps are there so you don't get a charmap error when extracting the data from Twitter into Python, and the s[2:-1] slice gets rid of the extra characters left over from encoding to utf-8.
For English-only tweets, you can also filter directly from the stream using languages=['en'].
I'm not familiar with the TextBlob library, but you can store the text in several ways: just write the information to a file and, when you run TextBlob, read directly from that file. You can replace print(s[2:-1]) or add to it:
myFile = open('text.csv', 'a')
myFile.write(s[2:-1])
myFile.write('\n')  # adds a line between tweets
myFile.close()
You can read it using file = open('text.csv', 'r') to do your sentiment analysis. Don't forget to add file.close() anytime you open a file.
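To connect this back to the sentiment part of the question, here is a minimal sketch (assuming the textblob package is installed) that reads the saved file back and prints a polarity score for each stored tweet:
from textblob import TextBlob

with open('text.csv', 'r') as myFile:
    for line in myFile:
        text = line.strip()
        if text:
            # polarity ranges from -1.0 (negative) to 1.0 (positive)
            print(TextBlob(text).sentiment.polarity, text)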