Transforming a Dataset for Multi-Label Text Classification - deep-learning

I am conducting some experiments on multi-label classification via deep learning models.
But I face a problem with the dataset.
I use Keras, TensorFlow 2.0, NumPy, and pandas.
I have a dataset in this form:
[Image: dataset in the form that I have it]
To apply multi-label classification (6 labels) I need my dataset to be in this form:
[Image: dataset in the form that I need it]
How is it possible to achieve this? Are there any functions making this transformation easier?
Try:
comments_df[['abusive','hateful','offensive','disrespectful','fearful','normal']] = comments_df['sentiment'].str.split('_', -1, expand=True)
This gives me an error:
ValueError: Columns must be same length as key
Regarding the DL model I will use: it's a bi-LSTM, but that doesn't have anything to do with the question per se.

Try this:
df = pd.get_dummies(data = df, columns = ['sentiment'])

I found this to work (not an optimal solution):
# Create a column for each target label, holding that row's split sentiment data.
def split_sentiment_outputs(output_label, sentiment_col="sentiment"):
    comments_df[output_label] = comments_df[sentiment_col].str.split('_')

# Transform each label column into a binary (0/1) indicator.
def transform_data_for_multilabel(output_label):
    rows = comments_df[output_label]
    for index, row in rows.items():
        # row is the list of labels for this comment, e.g. ['abusive', 'hateful']
        z = 0
        while z < len(row):
            if row[z] == output_label:
                comments_df.at[index, output_label] = 1
                break
            else:
                comments_df.at[index, output_label] = 0
            z = z + 1

# Applying the data transformation
output_labels = ["abusive", "hateful", "offensive", "disrespectful", "fearful", "normal"]
MAX_OUT = len(output_labels)  # number of target labels
for i in range(MAX_OUT):
    split_sentiment_outputs(output_labels[i])
for i in range(MAX_OUT):
    transform_data_for_multilabel(output_labels[i])
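For reference, the same 0/1 label columns can likely be produced with a single pandas call, assuming the sentiment column holds underscore-separated label names (as the split on '_' above suggests). This is just a sketch, not part of the workaround above:
import pandas as pd

# Hypothetical frame mirroring the described layout
example_df = pd.DataFrame({
    "comment": ["text a", "text b"],
    "sentiment": ["abusive_hateful", "normal"],
})

# One binary column per label, split on '_'
label_dummies = example_df["sentiment"].str.get_dummies(sep="_")
example_df = pd.concat([example_df, label_dummies], axis=1)
Series.str.get_dummies handles rows carrying several labels, which plain pd.get_dummies on the raw column would treat as a single combined category.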


Loading multiple csvs with mixed dtypes in tensorflow for training

I have hundreds of CSVs in a directory, all with headers. I am trying to create a feedforward NN with TensorFlow for regression.
What's the best way to import these CSVs and train on them with tf.data? Could you also check whether my preprocessing is done right?
Note: my features have mixed datatypes (int, float, string); my target is a float.
I cannot concatenate the CSVs and import them with pandas: the data size is over 50 GB, so it cannot be loaded in memory and has to be read iteratively from disk.
Directory Path:
./data/train/ -> 100s of csvs
./data/test -> 100s of csvs
./data/valid -> 100s of csvs
Methodology:
Create a generator
Use the Dataset API to load the data
Preprocess the data (embedding, one-hot, etc.)
Train with fit
But in the generator I was only able to declare output formats where the inputs/outputs have homogeneous dtypes.
Code:
def data_generator(file_list, batch_size=2):
    i = 0
    while True:  # run the generator indefinitely
        if i * batch_size >= len(file_list):  # wrap around once every file has been served
            i = 0
            np.random.shuffle(file_list)
        else:
            file_chunk = file_list[i * batch_size:(i + 1) * batch_size]
            data = []
            labels = []
            for file in file_chunk:
                temp = pd.read_csv(open(file, 'r'))   # change this line to read any other type of file
                label_col = temp.pop('ACTUAL_BOXES')  # separate the target column
                data.append(temp.values)              # remaining columns become the feature matrix
                labels.append(label_col.values)
            data = np.asarray(data)
            labels = np.asarray(labels)
            yield data, labels  # data holds mixed-dtype arrays, labels is a float dtype array
            i = i + 1
#getting list of files inside the directory
train_file_list = np.sort(glob.glob('././data/train/*.csv'))
test_file_list = np.sort(glob.glob('././data/test/*.csv'))
val_file_list = np.sort(glob.glob('././data/val/*.csv'))
train_dataset = tf.data.Dataset.from_generator(
    data_generator, args=[train_file_list, 2],   # batch_size = 2
    output_types=(tf.float32, tf.float32),       # This is where I am stuck
    # My sample data and labels will look like this:
    #   data   = ['a', 'b', 1, 2, 3.14, 2]   # mixed dtypes
    #   labels = [1.0]                        # float
)
val_dataset = tf.data.Dataset.from_generator(
    data_generator, args=[val_file_list, 2],     # batch_size = 2
    output_types=(tf.float32, tf.float32),       # This is where I am stuck
)
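One way around the homogeneous-dtype restriction (a sketch added for illustration, not code from the question): tf.data.Dataset.from_generator accepts nested structures such as dicts for output_types, so the generator can yield one entry per column, each with its own dtype; a dict of features is also what tf.keras.layers.DenseFeatures expects downstream. The column names and toy generator below are hypothetical, assuming TF 2.x eager execution:
import numpy as np
import tensorflow as tf

def dict_generator():
    # Hypothetical batches: a dict of per-column arrays plus a float label vector
    for _ in range(4):
        features = {
            "city": np.array([b"a", b"b"]),                    # string column
            "rooms": np.array([1, 2], dtype=np.int32),         # int column
            "area": np.array([3.14, 2.0], dtype=np.float32),   # float column
        }
        labels = np.array([1.0, 0.5], dtype=np.float32)
        yield features, labels

ds = tf.data.Dataset.from_generator(
    dict_generator,
    output_types=({"city": tf.string, "rooms": tf.int32, "area": tf.float32},
                  tf.float32),
)
for features, labels in ds.take(1):
    print(features["city"].numpy(), labels.numpy())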
# Preprocessing part:
def encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES):
    '''Function for encoding the features.'''
    encoded_features = []
    for feature_name in EMBEDDING_FEATURES:
        # Getting the unique vocab list
        vocabulary = np.array(list(flatten(vocab_list[feature_name])))
        # Categorical column using the list created above:
        cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding from the categorical column:
        cat_emb = tf.feature_column.embedding_column(cat_col, 8)  # or dimension=embedding_dims
        # Add the embedding to the list of feature columns
        encoded_features.append(cat_emb)
    for feature_name in INDICATOR_FEATURES:
        # Getting the unique vocab list
        vocabulary = list(flatten(vocab_list[feature_name]))
        # Categorical column using the list created above:
        ind_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        # Create a one-hot indicator column from the categorical column:
        cat_one_hot = tf.feature_column.indicator_column(ind_col)
        # Add the indicator column to the list of feature columns
        encoded_features.append(cat_one_hot)
    # Create the input layer for the model
    feature_layer = tf.keras.layers.DenseFeatures(encoded_features)
    return feature_layer

# Opening the JSON file that contains the vocab list for the string columns
f = open('./vocab_list.json')  # File that contains the unique values of each feature
vocab_list = json.load(f)
features_layer = encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES)
# Model part
model = tf.keras.models.Sequential([
    features_layer,
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1)
])
m_loss = tf.keras.losses.mean_squared_error
m_optimizer = tf.keras.optimizers.SGD(lr=1e-3)
batch_size = 32
model.compile(loss=m_loss, optimizer=m_optimizer, metrics=['accuracy'])
model.fit(train_dataset, epochs=10, validation_data=val_dataset)

PyTorch: Multi-class segmentation loss value != 0 when using target image as the prediction

I was performing semantic segmentation using PyTorch. There are a total of 103 different classes in the dataset, and the targets are RGB images where only the red channel contains the labels. I was using nn.CrossEntropyLoss as my loss function. As a sanity check, I wanted to verify that nn.CrossEntropyLoss is correct for this problem and that it has the expected behaviour.
I pick a random mask from my dataset and create a categorical version of it using this custom transform:
class ToCategorical:
    def __init__(self, n_classes: int) -> None:
        self.n_classes = n_classes

    def __call__(self, sample: torch.Tensor):
        mask = sample.permute(1, 2, 0)
        categories = torch.unique(mask).tolist()[1:]  # get all categories other than 0
        # Build a tensor with `n_classes` channels
        one_hot_image = torch.zeros(self.n_classes, *mask.shape[:-1])
        for category in categories:
            # Get the spatial locations where the category is present
            rows, cols, _ = torch.where(mask == category)
            # At the same spatial locations, but in the `category` channel, fill in 1
            one_hot_image[category, rows, cols] = 1
        return one_hot_image
I then feed this image to the loss function as the output (prediction) and use the ground-truth mask as the target.
import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image

mask = T.PILToTensor()(Image.open("path_to_image").convert("RGB"))
categorical_mask = ToCategorical(103)(mask).unsqueeze(0)
mask = mask[0].unsqueeze(0)  # keep only the red channel, add a fake batch dim
loss_fn = nn.CrossEntropyLoss()
target = mask
output = categorical_mask
print(output.shape, target.shape)
print(loss_fn(output, target.to(torch.long)))
I expected the loss to be zero, but to my surprise the output is as follows:
torch.Size([1, 103, 600, 800]) torch.Size([1, 600, 800])
tensor(4.2836)
I verified with other samples in the dataset and I obtained similar values for other masks as well. Am I doing something wrong? I expect the loss to be = 0 when the output is the same as the target.
P.S. I also know that nn.CrossEntropyLoss is the same as applying log_softmax followed by nn.NLLLoss(), but I obtained the same value using that approach as well.
For Reference
Dataset used: UECFoodPixComplete
I would like to address this:
I expect the loss to be = 0 when the output is the same as the target.
Even if the prediction matches the target, i.e. the prediction corresponds to a one-hot encoding of the labels contained in the dense target tensor, the loss itself is not supposed to equal zero. Actually, it can never be equal to zero because the nn.CrossEntropyLoss function is always positive by definition.
Let us take a minimal example with #C classes, a target y_true, and a prediction y_pred consisting of perfect predictions:
As a quick reminder:
The softmax is applied on the logits q_i as p_i = exp(q_i)/sum_j(exp(q_j)):
>>> import torch.nn.functional as F
>>> p = F.softmax(y_pred, 1)
Similarly if you are using the log-softmax, defined as logp_i = log(p_i):
>>> logp = F.log_softmax(y_pred, 1)
Then comes the negative log-likelihood function, computed between an input x and a target y as -y*x. Combined with the softmax this comes down to -y*p, or -y*logp respectively. In any case, whether you apply the log or not, only the predictions corresponding to the true classes remain, since the other ones are zeroed out.
That being said, applying NLLLoss directly on y_pred would indeed result in 0, as you expected in your question. However, here it is applied on the probability distribution or log-probability: p, or logp respectively!
In our specific case, the logit is 1 for the true class and 0 for all other classes (there are #C - 1 of those). The softmax value associated with the true class therefore equals exp(1)/sum_j(exp(q_j)), and since sum_j(exp(q_j)) = (#C - 1)*exp(0) + exp(1) = #C - 1 + e, we have:
softmax(y_pred)_true = e / (#C - 1 + e)
Similarly for the log-softmax:
log-softmax(y_pred)_true = log(e / (#C - 1 + e)) = 1 - log(#C - 1 + e)
If we proceed by applying the negative log-likelihood we simply get cross-entropy(y_pred, y_true) = (nllloss o log-softmax)(y_pred, y_true). This results in:
loss = -(1 - log(#C - 1 + e)) = log(#C - 1 + e) - 1
This effectively corresponds to the minimum loss reachable when the prediction is restricted to a one-hot encoding; the loss only approaches zero as the true-class logit is made arbitrarily large.
Regarding your specific case where #C = 103, you may have an issue in your code... since the average loss should equal log(102 + e) - 1, i.e. around 3.65.
>>> y_true = torch.randint(0,103,(1,1,2,5))
>>> y_pred = torch.zeros(1,103,2,5).scatter(1, y_true, value=1)
You can verify this yourself with any of the following methods:
the built-in function nn.functional.cross_entropy:
>>> F.cross_entropy(y_pred, y_true[:,0])
tensor(3.6513)
manually computing the quantity:
>>> logp = F.log_softmax(y_pred, 1)
>>> -logp.gather(1, y_true).mean()
tensor(3.6513)
analytical result:
>>> log(102 + e) - 1
3.6513
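As a side note (an addition, not part of the original answer): the cross entropy does go toward zero once the true-class logit dominates the others, for instance by scaling the one-hot prediction by a large factor. Reusing the y_true / y_pred tensors defined above:
>>> F.cross_entropy(1000 * y_pred, y_true[:,0])  # effectively 0
This is why a trained network can drive the loss toward zero even though a plain 0/1 one-hot prediction cannot.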

How to normalize pytorch model output to be in range [0,1]

Let's say I have a model called UNet:
output = UNet(input)
The output is a batch of grayscale images with shape (batch_size, 1, 128, 128).
What I want to do is to normalize each image to be in the range [0, 1].
I did it like this:
for i in range(batch_size):
    output[i,:,:,:] = output[i,:,:,:] / torch.amax(output, dim=(1,2,3))[i]
Now every image in the output is normalized, but when I train such a model, PyTorch complains that it cannot calculate the gradients in this procedure, and I understand why.
My question is: what is the right way to normalize the images without killing the backpropagation flow?
something like
output = UNet(input)
output = output.normalize
output2 = some_model(output)
loss = ..
loss.backward()
optimize.step()
My only option right now is adding a sigmoid activation at the end of the UNet, but I don't think it's a good idea...
Update - code (gen2 and disc are the UNet and discriminator models; est_bias is some output):
Update 2 - code:
with torch.no_grad():
    est_bias_for_disc = gen2(input_img)
    est_bias_for_disc /= est_bias_for_disc.amax(dim=(1,2,3), keepdim=True)
disc_fake_hat = disc(est_bias_for_disc.detach())
disc_fake_loss = BCE(disc_fake_hat, torch.zeros_like(disc_fake_hat))
disc_real_hat = disc(bias_ref)
disc_real_loss = BCE(disc_real_hat, torch.ones_like(disc_real_hat))
disc_loss = (disc_fake_loss + disc_real_loss) / 2
if epoch <= epochs_till_gen2_stop:
    disc_loss.backward(retain_graph=True)  # Update gradients
    opt_disc.step()  # Update optimizer
Then there's separate training:
opt_gen2.zero_grad()
est_bias = gen2(input_img)
est_bias /= est_bias.amax(dim=(1,2,3), keepdim=True)
disc_fake = disc(est_bias)
ADV_loss = BCE(disc_fake, torch.ones_like(disc_fake))
gen2_loss = ADV_loss
gen2_loss.backward()
opt_gen2.step()
You can use the normalize function:
>>> import torch
>>> import torch.nn.functional as F
>>> x = torch.tensor([[3.,4.],[5.,6.],[7.,8.]])
>>> x = F.normalize(x, dim = 0)
>>> print(x)
tensor([[0.3293, 0.3714],
[0.5488, 0.5571],
[0.7683, 0.7428]])
This will give a differentiable tensor as long as the out argument is not used. (Note that F.normalize rescales to unit Lp norm along the given dimension, not to the range [0, 1].)
You are overwriting the tensor's values in place because of the indexing on the batch dimension, which breaks autograd. Instead, you can perform the operation in vectorized form:
output = output / output.amax(dim=(1,2,3), keepdim=True)
The keepdim=True argument keeps the number of dimensions of torch.Tensor.amax's output equal to that of its input, so the division broadcasts correctly over each image instead of requiring the per-index loop above.
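If the goal is to map each image exactly onto [0, 1] rather than only dividing by its maximum, a per-image min-max rescaling can be done in the same vectorized, differentiable way. This is a sketch extending the answer above (not part of it), assuming a PyTorch version where Tensor.amin/amax accept a tuple of dims; the small eps guards against constant images:
amin = output.amin(dim=(1, 2, 3), keepdim=True)
amax = output.amax(dim=(1, 2, 3), keepdim=True)
output = (output - amin) / (amax - amin + 1e-8)  # each image now lies in [0, 1]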

Pytorch: Overfitting on a small batch: Debugging

I am building a multi-class image classifier.
There is a debugging trick of overfitting on a single batch to check whether there are any deeper bugs in the program.
How can I design the code so that this can be done in a portable way?
One arduous and not very smart way is to build a holdout train/test folder for a small batch, where the test set consists of two distributions - seen data and unseen data - and if the model performs well on the seen data and poorly on the unseen data, then we can conclude that the network doesn't have any deeper structural bug.
But this does not seem like a smart or portable approach, and it has to be redone for every problem.
Currently, I have a dataset class where I partition the data into train/dev/test in the way below:
def split_equal_into_val_test(csv_file=None, stratify_colname='y',
                              frac_train=0.6, frac_val=0.15, frac_test=0.25,
                              ):
    """
    Split a Pandas dataframe into three subsets (train, val, and test).

    Following the fractional ratios provided by the user, the val and
    test sets have the same number of samples of each class, while the
    train set keeps the remaining samples.

    Parameters
    ----------
    csv_file : Input data csv file to be passed
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val : float
    frac_test : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0.
    random_state : int, None, or RandomStateInstance
        Value to be passed to train_test_split().

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.
    """
    df = pd.read_csv(csv_file).iloc[:, 1:]

    if frac_train + frac_val + frac_test != 1.0:
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))
    if stratify_colname not in df.columns:
        raise ValueError('%s is not a column in the dataframe' %
                         (stratify_colname))

    df_input = df
    no_of_classes = 4
    sfact = int((0.1 * len(df)) / no_of_classes)

    # Shuffling the data frame
    df_input = df_input.sample(frac=1)

    df_temp_1 = df_input[df_input['labels'] == 1][:sfact]
    df_temp_2 = df_input[df_input['labels'] == 2][:sfact]
    df_temp_3 = df_input[df_input['labels'] == 3][:sfact]
    df_temp_4 = df_input[df_input['labels'] == 4][:sfact]

    dev_test_df = pd.concat([df_temp_1, df_temp_2, df_temp_3, df_temp_4])
    dev_test_y = dev_test_df['labels']

    # Split the temp dataframe into val and test dataframes.
    df_val, df_test, dev_Y, test_Y = train_test_split(
        dev_test_df, dev_test_y,
        stratify=dev_test_y,
        test_size=0.5,
    )

    df_train = df[~df['img'].isin(dev_test_df['img'])]

    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test


def train_val_to_ids(train, val, test, stratify_columns='labels'):  # noqa
    """
    Convert the stratified dataset into dictionaries: partition['train_set'] and labels.

    To generate the parallel code according to https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel

    Parameters
    ----------
    csv_file : Input data csv file to be passed
    stratify_columns : The label column

    Returns
    -------
    partition, labels :
        partition dictionary containing train and validation ids and label dictionary containing ids and their labels  # noqa
    """
    train_list, val_list, test_list = train['img'].to_list(), val['img'].to_list(), test['img'].to_list()  # noqa
    partition = {"train_set": train_list,
                 "val_set": val_list,
                 }
    labels = dict(zip(train.img, train.labels))
    labels.update(dict(zip(val.img, val.labels)))
    return partition, labels
P.S. I know about PyTorch Lightning and that it has an overfitting feature which can be used easily, but I don't want to move to PyTorch Lightning.
I don't know how portable it will be, but a trick that I use is to modify the __len__ function in the Dataset.
If I modify it from
def __len__(self):
    return len(self.data_list)
to
def __len__(self):
    return 20
it will only output the first 20 elements of the dataset (regardless of shuffling). You only need to change one line of code and the rest should work just fine, so I think it's pretty neat.
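A minimal sketch of the same idea wrapped in a reusable class, so the cap can be toggled without editing the original Dataset (the class and the overfit_n parameter are made up for this example):
from torch.utils.data import Dataset, DataLoader

class CappedDataset(Dataset):
    """Wraps any Dataset and pretends it only contains the first `overfit_n` items."""
    def __init__(self, base, overfit_n=20):
        self.base = base
        self.overfit_n = overfit_n

    def __len__(self):
        return min(self.overfit_n, len(self.base))

    def __getitem__(self, idx):
        return self.base[idx]

# Usage: the loader now cycles over the same 20 samples, which a healthy model
# should be able to overfit almost perfectly.
# small_loader = DataLoader(CappedDataset(full_dataset, 20), batch_size=4, shuffle=True)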

How do I get CSV files into an Estimator in Tensorflow 1.6

I am new to TensorFlow (and this is my first question on Stack Overflow).
As a learning exercise, I am trying to do something simple (four days later I am still confused).
I have one CSV file with 36 columns (3500 records) of 0s and 1s.
I am envisioning each record as a flattened 6x6 matrix.
I have another CSV file with one column of ground truth, 0 or 1 (3500 records), which indicates whether at least 4 of the 6 elements on the 6x6 matrix's diagonal are 1s.
I am not sure I have processed the CSV files correctly.
I am confused as to how I create the features dictionary and labels and how that fits into the DNNClassifier.
I am using TensorFlow 1.6, Python 3.6
Below is the small amount of code I have so far.
import tensorflow as tf
import os

def x_map(line):
    rDefaults = [[] for cl in range(36)]
    x_row = tf.decode_csv(line, record_defaults=rDefaults)
    return x_row

def y_map(line):
    line = tf.string_to_number(line, out_type=tf.int32)
    y_row = tf.one_hot(line, depth=2)
    return y_row

x_path_file = os.path.join('D:', 'Diag', '6x6_train.csv')
y_path_file = os.path.join('D:', 'Diag', 'HasDiag_train.csv')

filenames = [x_path_file]
x_dataset = tf.data.TextLineDataset(filenames)
x_dataset = x_dataset.map(x_map)
x_dataset = x_dataset.batch(1)
x_iter = x_dataset.make_one_shot_iterator()
x_next_el = x_iter.get_next()

filenames = [y_path_file]
y_dataset = tf.data.TextLineDataset(filenames)
y_dataset = y_dataset.map(y_map)
y_dataset = y_dataset.batch(1)
y_iter = y_dataset.make_one_shot_iterator()
y_next_el = y_iter.get_next()

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    x_el = (sess.run(x_next_el))
    y_el = (sess.run(y_next_el))
The output for x_el is:
(array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([0.] ... it goes on...
The output for y_el is:
[[1. 0.]]
You're pretty much there for a minimal working model. The main issue I see is that tf.decode_csv returns a tuple of tensors, whereas I expect you want a single tensor with all values. Easy fix:
x_row = tf.stack(tf.decode_csv(line, record_defaults=rDefaults))
That should work... but it fails to take advantage of many of the awesome things the tf.data.Dataset API has to offer, like shuffling and parallel threading. For example, if you shuffle each dataset, those shuffling operations won't be consistent. This is because you've created two separate datasets and manipulated them independently. If you create them independently, zip them together, and then manipulate the combined dataset, those manipulations will be consistent.
Try something along these lines:
def get_inputs(
        count=None, shuffle=True, buffer_size=1000, batch_size=32,
        num_parallel_calls=8, x_paths=[x_path_file], y_paths=[y_path_file]):
    """
    Get x, y inputs.

    Args:
        count: number of epochs. None indicates infinite epochs.
        shuffle: whether or not to shuffle the dataset
        buffer_size: used in shuffle
        batch_size: size of batch. See outputs below
        num_parallel_calls: used in map. Note if > 1, intra-batch ordering
            will be shuffled
        x_paths: list of paths to x-value files.
        y_paths: list of paths to y-value files.

    Returns:
        x: (batch_size, 6, 6) tensor
        y: (batch_size, 2) tensor of 1-hot labels
    """

    def x_map(line):
        rDefaults = [[] for cl in range(n_dims**2)]  # n_dims = 6 for the 6x6 case
        x_row = tf.stack(tf.decode_csv(line, record_defaults=rDefaults))
        return x_row

    def y_map(line):
        line = tf.string_to_number(line, out_type=tf.int32)
        y_row = tf.one_hot(line, depth=2)
        return y_row

    def xy_map(x, y):
        return x_map(x), y_map(y)

    x_ds = tf.data.TextLineDataset(x_paths)
    y_ds = tf.data.TextLineDataset(y_paths)

    combined = tf.data.Dataset.zip((x_ds, y_ds))
    combined = combined.repeat(count=count)
    if shuffle:
        combined = combined.shuffle(buffer_size)
    combined = combined.map(xy_map, num_parallel_calls=num_parallel_calls)
    combined = combined.batch(batch_size)
    x, y = combined.make_one_shot_iterator().get_next()
    return x, y
To experiment/debug:
x, y = get_inputs()
with tf.Session() as sess:
    xv, yv = sess.run((x, y))
print(xv.shape, yv.shape)
For use in an estimator, pass the function itself.
estimator.train(get_inputs, max_steps=10000)
def get_eval_inputs():
    return get_inputs(
        count=1, shuffle=False,
        x_paths=[x_eval_paths],
        y_paths=[y_eval_paths])

estimator.evaluate(get_eval_inputs)
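To tie this back to the DNNClassifier part of the question: pre-made estimators expect the input function to return a dict of features plus labels. Here is a sketch of how the pipeline above might be wired into one, assuming TF 1.6; the feature key 'x' and the hidden-unit sizes are arbitrary choices, and the one-hot labels are converted to integer class ids because DNNClassifier expects sparse labels:
def get_train_inputs():
    x, y = get_inputs()
    # Flatten the features to a (batch, 36) vector and use integer class ids
    return {'x': tf.reshape(x, [-1, 36])}, tf.argmax(y, axis=1)

feature_columns = [tf.feature_column.numeric_column('x', shape=[36])]
estimator = tf.estimator.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[32, 16],
    n_classes=2)
estimator.train(get_train_inputs, max_steps=10000)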