Save TFIDF vocab and transformation and use on new dataset - pickle

I am trying to save all the vocab words and the tfidf vectorizer from the train/test set so that I can use it on a new set of text at a later time. I got the vocab and idf dictionary using this code:
cvec_tfidf = TfidfVectorizer(analyzer="word", tokenizer=nltk.word_tokenize, strip_accents='unicode', min_df = .01, max_df = .99, ngram_range=(1,3))
cvec_tfidf.fit(X_train['answer'])
vocab_tfidf = cvec_tfidf.get_feature_names()
def tfidf(tokens, vocab, cvec):
    cvec_counts = cvec.transform(tokens)
    cvec_matrix = cvec_counts.toarray()
    tfidf_model = pd.DataFrame(cvec_matrix, columns=cvec.vocabulary_)
    idf = dict(zip(vocab, cvec.idf_))
    return tfidf_model, idf
X_train, X_train_idf = tfidf(X_train['answer'], vocab_tfidf, cvec_tfidf)
X_test, X_test_idf = tfidf(X_test['answer'], vocab_tfidf, cvec_tfidf)
I think I have saved and loaded the vocab with
import pickle
pickle.dump(cvec_tfidf.vocabulary_, open("feature.pkl", "wb"))
## LOAD TFIDF
savedtfidf = pickle.load(open("feature.pkl", 'rb'))
I tried to run it on new text but got an error
## USE TFIDF ON NEW DATA
newtext = savedtfidf.fit_transform(text['newtext'])
File "<ipython-input-573-4d2aef685725>", line 1, in <module>
newtext = savedtfidf.fit_transform(text['PSW_Attention_3_cl'])
AttributeError: 'dict' object has no attribute 'fit_transform'
Any idea what I am doing wrong?

The issue is that you are serializing and deserializing only the vectorizer's vocabulary. As the error says, the vocabulary is just a dictionary, and a dictionary has no fit_transform method.
What you want to do is initialize a new TfidfVectorizer with your serialized vocabulary:
saved_vocabulary = pickle.load(open("feature.pkl", 'rb'))
cvec_tfidf = TfidfVectorizer(analyzer="word", tokenizer=nltk.word_tokenize, strip_accents='unicode', min_df = .01, max_df = .99, ngram_range=(1,3), vocabulary=saved_vocabulary)
cvec_tfidf.fit_transform(text['newtext'])
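Alternatively (not part of the answer above, just a sketch): pickling the entire fitted vectorizer keeps the learned idf_ weights together with the vocabulary, so new text can be transformed directly without refitting. The file name tfidf_vectorizer.pkl is only illustrative:
import pickle

# Save the whole fitted vectorizer (vocabulary + idf weights), not just the vocabulary dict.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(cvec_tfidf, f)

# Later: load it back and transform new text directly, without calling fit again.
with open("tfidf_vectorizer.pkl", "rb") as f:
    loaded_tfidf = pickle.load(f)

newtext_matrix = loaded_tfidf.transform(text['newtext'])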

Related

How can I use the decoder part of a T5 transformer as the decoder for TrOCR, for a fine-tuning task?

# after importing the libraries
from transformers import AutoModel, AutoTokenizer, TrOCRProcessor, VisionEncoderDecoderModel

T5model = AutoModel.from_pretrained("t5-small")
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
processor.tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
model.config.decoder = T5model.decoder
model.config.pad_token_id = processor.tokenizer.pad_token_id
After loading the IAM dataset, setting up the config, and running training with the Hugging Face Trainer class, I get:
output["decoder"] = self.decoder.to_dict()
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1130, in __getattr__
raise AttributeError("'{}' object has no attribute '{}'".format(
AttributeError: 'T5Stack' object has no attribute 'to_dict'
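The traceback hints at the root cause: config serialization calls self.decoder.to_dict(), so model.config.decoder must hold a config object, while the code above assigns an nn.Module (the T5Stack). A minimal, unverified sketch that keeps the module and the config in separate slots (whether T5's decoder stack is actually compatible with the TrOCR encoder and generation loop is a separate question):
# Continuing from the snippet above: put the module on the model and a config
# object in the config slot; to_dict() exists on PretrainedConfig, not on T5Stack.
model.decoder = T5model.decoder
model.config.decoder = T5model.config
model.config.pad_token_id = processor.tokenizer.pad_token_id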

Loading multiple csvs with mixed dtypes in tensorflow for training

I have hundreds of CSVs (with headers) in a directory. I am trying to build a feedforward NN with TensorFlow for regression.
What's the best way to import these CSVs and train on them with tf?
Could you also check whether my preprocessing is correct?
Note: my features have mixed datatypes (int, float, string); my target is a float.
I cannot concatenate the CSVs and load them with pandas: my data is >50 GB, so it can't be loaded in memory and has to be read iteratively from disk.
Directory Path:
./data/train/ -> 100s of csvs
./data/test -> 100s of csvs
./data/valid -> 100s of csvs
Methodology:
1. Create a generator
2. Use the Dataset API to load the data
3. Preprocess the data (embedding, one-hot, etc.)
4. Train with fit
But in the generator I was only able to specify output formats where the inputs/outputs have homogeneous dtypes.
Code:
def data_generator(file_list, batch_size=2):
    i = 0
    while True:
        if i * batch_size >= len(file_list):  # This check is used to run the generator indefinitely.
            i = 0
            np.random.shuffle(file_list)
        else:
            file_chunk = file_list[i * batch_size:(i + 1) * batch_size]
            data = []
            labels = []
            for file in file_chunk:
                temp = pd.read_csv(open(file, 'r'))  # Change this line to read any other type of file
                file_labels = temp.pop('ACTUAL_BOXES')  # separate the target column from the features
                data.append(temp.values)  # Convert column data to matrix-like data with one channel
                labels.append(file_labels)
            data = np.asarray(data)
            labels = np.asarray(labels)
            yield data, labels  # Here data will be a mixed-dtype array & labels will be a float array
            i = i + 1
# getting the list of files inside the directory
train_file_list = np.sort(glob.glob('././data/train/*.csv'))
test_file_list = np.sort(glob.glob('././data/test/*.csv'))
val_file_list = np.sort(glob.glob('././data/val/*.csv'))

train_dataset = tf.data.Dataset.from_generator(data_generator, args=[train_file_list, 2],
                                                output_types=(tf.float32, tf.float32))  # This is where I am stuck
# my sample data and labels will look like this:
# data = ['a', 'b', 1, 2, 3.14, 2]   # mixed dtypes
# labels = [1.0]                     # float

val_dataset = tf.data.Dataset.from_generator(data_generator, args=[val_file_list, 2],
                                              output_types=(tf.float32, tf.float32))  # This is where I am stuck
# Preprocessing part:
def encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES):
    '''Function for encoding the features'''
    encoded_features = []
    for feature_name in EMBEDDING_FEATURES:
        # Getting the unique vocab list
        vocabulary = np.array(list(flatten(vocab_list[feature_name])))
        # categorical columns using the lists created above:
        cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # create an embedding from the categorical column:
        cat_emb = tf.feature_column.embedding_column(cat_col, 8)  # dimension=embedding_dims
        # add the embedding to the list of feature columns
        encoded_features.append(cat_emb)
    for feature_name in INDICATOR_FEATURES:
        # Getting the unique vocab list
        vocabulary = list(flatten(vocab_list[feature_name]))
        # indicator columns using the lists created above:
        ind_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        # create a one-hot indicator column from the categorical column:
        cat_one_hot = tf.feature_column.indicator_column(ind_col)
        # add the one-hot column to the list of feature columns
        encoded_features.append(cat_one_hot)
    # create the input layer for the model
    feature_layer = tf.keras.layers.DenseFeatures(encoded_features)
    return feature_layer

# Opening the JSON file that contains the vocab list for string columns
f = open('./vocab_list.json')  # File that contains the unique values of each feature
vocab_list = json.load(f)
features_layer = encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES)
# Model part
model = tf.keras.models.Sequential([
    features_layer,
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1)
])
m_loss = tf.keras.losses.mean_squared_error
m_optimizer = tf.keras.optimizers.SGD(lr=1e-3)
batch_size = 32
model.compile(loss=m_loss, optimizer=m_optimizer, metrics=['accuracy'])
model.fit(train_dataset, epochs=10, validation_data=val_dataset)
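One commonly suggested way around the homogeneous-dtype limitation of Dataset.from_generator is to let tf.data parse the CSVs column by column. A minimal sketch (not a tested solution) using tf.data.experimental.make_csv_dataset, assuming every file has a header and the target column is named ACTUAL_BOXES as in the generator above:
import tensorflow as tf

# make_csv_dataset reads the files lazily from disk, infers a dtype per column,
# and yields (features_dict, label) batches, so mixed int/float/string columns
# stay separate instead of being forced into a single array.
train_dataset = tf.data.experimental.make_csv_dataset(
    file_pattern='./data/train/*.csv',
    batch_size=32,
    label_name='ACTUAL_BOXES',
    num_epochs=1,
    shuffle=True)

# Each element is a dict of per-column tensors plus a float label tensor,
# which is the structure tf.feature_column / DenseFeatures expects.
for features, label in train_dataset.take(1):
    print({name: tensor.dtype for name, tensor in features.items()}, label.dtype)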

tfa.optimizers.MultiOptimizer - TypeError: 'Not JSON Serializable:'

I'm trying to use tfa.optimizers.MultiOptimizer(). I did everything according to the docs (https://www.tensorflow.org/addons/api_docs/python/tfa/optimizers/MultiOptimizer), yet I'm getting the following error:
TypeError: ('Not JSON Serializable:', <tf.Tensor 'gradient_tape/model_80/dense_3/Tensordot/MatMul/MatMul:0' shape=(1, 1) dtype=float32>)
Below is a minimal working example that reproduces the error; just copy and paste it. The error occurs when the first epoch finishes and the callback tries to save the model.
##############################################################################
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.layers as l
import tensorflow_addons.layers as la
import tensorflow.keras as ke
import numpy as np
##############################################################################
def build_model_1():
    model_input = l.Input(shape=(32, 1))
    x = l.Dense(1)(model_input)
    model = ke.Model(inputs=model_input, outputs=x)
    ##########
    optimizers = [tf.keras.optimizers.Adam(),
                  tf.keras.optimizers.Adam()]
    optimizers_and_layers = [(optimizers[0], model.layers[:5]), (optimizers[1], model.layers[5:])]
    optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)
    model.compile(optimizer=optimizer, loss='mse', metrics='mse')
    test = tf.keras.optimizers.serialize(optimizer)
    return model
##############################################################################
input_data = np.arange( 0, 10000, 1).reshape(10000,1)
target_data = np.arange(-10000, 0, 1).reshape(10000,1)
model = build_model_1()
model_checkpoint = ke.callbacks.ModelCheckpoint('best_model.h5',
                                                monitor='val_mse',
                                                mode='min',
                                                save_best_only=True,
                                                verbose=1)
training_history = model.fit(x=input_data,
                             y=target_data,
                             validation_split=0.2,
                             epochs=5,
                             verbose=1,
                             callbacks=[model_checkpoint])
##############################################################################
When saving a complete Keras model (with its own structure in the .h5 file), the tf.keras.Model object is fully serialized as JSON: this means every property of the model must be JSON serializable.
NOTE: tf.Tensor objects are NOT JSON serializable.
When you use this multi-optimizer from tfa, you add properties to the model that the JSON serializer will try (and fail) to serialize.
In particular, there is a gv attribute that appears to come from the custom optimizer:
'gv': [(<tf.Tensor 'gradient_tape/model/dense/Tensordot/MatMul/MatMul:0' shape=(1, 1) dtype=float32>, <tf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[-0.55191684]], dtype=float32)>), (<tf.Tensor 'gradient_tape/model/dense/BiasAdd/BiasAddGrad:0' shape=(1,) dtype=float32>, <tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([-0.23444518], dtype=float32)>)]},
These tf.Tensor objects are not JSON serializable, which is why it fails.
The only option is not to save the complete model (with all of its attributes, which would have to be defined as Keras layers, which is not possible here), but to save only the model's parameters.
In short, if you add save_weights_only=True to the callback, your training (and the checkpointing of the weights) will work fine.
model_checkpoint = ke.callbacks.ModelCheckpoint(
    "best_model.h5",
    monitor="val_mse",
    mode="min",
    save_best_only=True,
    verbose=1,
    save_weights_only=True,
)
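A follow-up worth noting: since only the weights are checkpointed, the architecture has to be rebuilt before restoring them, e.g. by calling build_model_1 from the example again:
# Rebuild the same architecture, then load the checkpointed weights into it.
model = build_model_1()
model.load_weights("best_model.h5")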

keep getting error message when trying to read json from s3

I keep getting this error in my lambda function:
{"errorMessage": "module initialization error"}
This happens when I try to turn the following string of JSON data into a Python dictionary.
"{\n\"main\": {\n \"PART_NAME\": \"Genuine Cardboard Honda Wing\",\n \"BRAND\": \"Honda\",\n \"MJR_CAT\": \"Aero\",\n \"CAT\": \"Rear Wing\",\n \"SUB_CAT\": \"NA\",\n \"Power_Increase\": \"0\"\n},\n\"forza\":\n{\n \"power\": \"[0, True]\",\n \"Torque\": \"[0, True]\",\n \"Traction\": \"[50, True]\",\n \"Handling\": \"[100, True]\",\n \"Breaking\": \"[40, True]\"\n},\n\"custom\": {\n\"length\": 120,\n\"car max height[m]\": 2,\n\"RICER RANK\": -10\n\n}\n"
Here is my code to replicate this error:
client = boto3.client('s3')
result = client.get_object(Bucket=BUCKET, Key=FILE_TO_READ)
text = result['Body'].read().decode('utf-8')
text = json.load(text)
print(text)
Without the json.load line, print(text) produces the string above.
Thanks :)
Here is the full lambda function (though not commented) if you are interested.
import json
import boto3

print('got this far')

BUCKET = '******'
FILE_TO_READ = 'example_honda_wing.json'
client = boto3.client('s3')
result = client.get_object(Bucket=BUCKET, Key=FILE_TO_READ)
text = result['Body'].read().decode('utf-8')
#text = str(text).replace("\n","")
#text = text.replace('\"',' ')
#text = json.load(text)
print(text)  # Use your desired JSON key for your value

def lambda_handler(event, context):
    # TODO implement
    return text
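Two things stand out in this code: json.load expects a file-like object, while the S3 body has already been decoded to a str, so json.loads is the call for parsing a string; and because all of the S3 work runs at module import time, any exception there surfaces as the generic "module initialization error". A minimal sketch (an assumption, not a confirmed fix) that moves the work into the handler and parses the string with json.loads:
import json
import boto3

BUCKET = '******'
FILE_TO_READ = 'example_honda_wing.json'
client = boto3.client('s3')

def lambda_handler(event, context):
    result = client.get_object(Bucket=BUCKET, Key=FILE_TO_READ)
    text = result['Body'].read().decode('utf-8')
    data = json.loads(text)  # loads() parses a string; load() expects a file object
    return data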

Is num_epochs limited in tensorflow's csv file reader string_input_producer()?

I have a dummy csv file (y=-x+1)
x,y
1,0
2,-1
3,-2
I try to feed that into a linear regression model. Since I have so few examples, I want to iterate the training over that file about 1000 times, so I set num_epochs=1000.
However, it seems that TensorFlow limits this number. It works fine if I use num_epochs=5 or 10, but beyond 33 it is capped at 33 epochs. Is that true, or am I doing something wrong?
# model = W*x+b
...
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss)

# reading input from csv
filename_queue = tf.train.string_input_producer(["/tmp/testinput.csv"], num_epochs=1000)
reader = tf.TextLineReader(skip_header_lines=1)
...
col_x, col_label = tf.decode_csv(csv_row, record_defaults=record_defaults)

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    while True:
        try:
            input_x, input_y = sess.run([col_x, col_label])
            sess.run(train, feed_dict={x: input_x, y: input_y})
            ...
Side question, do I need to do:
input_x, input_y = sess.run([col_x, col_label])
sess.run(train, feed_dict={x:input_x, y:input_y})
I have tried sess.run(train, feed_dict={x: col_x, y: col_y}) directly to avoid the extra step, but it doesn't work (they are graph nodes, and feed_dict expects regular data).
The following snippet works perfectly (with your input):
import tensorflow as tf

filename_queue = tf.train.string_input_producer(["/tmp/input.csv"], num_epochs=1000)
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
col_x, col_label = tf.decode_csv(csv_row, record_defaults=[[0], [0]])

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    num = 0
    try:
        while True:
            sess.run([col_x, col_label])
            num += 1
    except tf.errors.OutOfRangeError:  # raised once the num_epochs limit is reached
        print(num)
Which gives the following output (3 data rows × 1000 epochs = 3000 reads, so num_epochs is not capped at 33):
edb#lapelidb:/tmp$ python csv.py
3000