PyTorch: Apply cross entropy loss with a custom weight map - deep-learning

I am solving a multi-class segmentation problem using the U-Net architecture in PyTorch.
As specified in the U-Net paper, I am trying to implement custom weight maps to counter class imbalance.
Below is the operation I want to apply: the pixel-wise weight map from the paper, w(x) = w_c(x) + w0 * exp(-(d1(x) + d2(x))^2 / (2 * sigma^2)).
Also, I reduced the batch size to 1 so that I can remove that dimension while passing it to the precompute_for_image function.
I tried the below approach:
def precompute_for_image(masks):
    masks = masks.cpu()
    cls = masks.unique()
    res = torch.stack([torch.where(masks == cls_val, torch.tensor(1), torch.tensor(0)) for cls_val in cls])
    return res

def train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path):
    ###################
    # train the model #
    ###################
    model.train()
    train_loss = 0.0
    for batch_idx, (data, target) in enumerate(final_train_loader):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        temp_target = precompute_for_image(target)
        w = weight_map(temp_target)
        loss = criterion(output, target)
        loss = w * loss
        loss.backward()
        optimizer.step()
        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.data - train_loss))
    return model
where weight_map is the function that calculates the weight mask, which I got from here.
The issue I am facing is that I get a memory error when I apply this method, even though I am using 61 GB of RAM and a Tesla V100 GPU.
I really think I am applying it in an incorrect way.
How should I do it?
I am omitting the non-essential details from the training loop.
Below is my make_weight_map function:
import numpy as np
from skimage.segmentation import find_boundaries

w0 = 10
sigma = 5

def make_weight_map(masks):
    """
    Generate the weight maps as specified in the U-Net paper
    for a set of binary masks.

    Parameters
    ----------
    masks: array-like
        A 3D array of shape (n_masks, image_height, image_width),
        where each slice of the matrix along the 0th axis represents one binary mask.

    Returns
    -------
    array-like
        A 2D array of shape (image_height, image_width)
    """
    nrows, ncols = masks.shape[1:]
    masks = (masks > 0).astype(int)
    distMap = np.zeros((nrows * ncols, masks.shape[0]))
    X1, Y1 = np.meshgrid(np.arange(nrows), np.arange(ncols))
    X1, Y1 = np.c_[X1.ravel(), Y1.ravel()].T
    for i, mask in enumerate(masks):
        # find the boundary of each mask,
        # compute the distance of each pixel from this boundary
        bounds = find_boundaries(mask, mode='inner')
        X2, Y2 = np.nonzero(bounds)
        xSum = (X2.reshape(-1, 1) - X1.reshape(1, -1)) ** 2
        ySum = (Y2.reshape(-1, 1) - Y1.reshape(1, -1)) ** 2
        distMap[:, i] = np.sqrt(xSum + ySum).min(axis=0)
    ix = np.arange(distMap.shape[0])
    if distMap.shape[1] == 1:
        d1 = distMap.ravel()
        border_loss_map = w0 * np.exp((-1 * (d1) ** 2) / (2 * (sigma ** 2)))
    else:
        if distMap.shape[1] == 2:
            d1_ix, d2_ix = np.argpartition(distMap, 1, axis=1)[:, :2].T
        else:
            d1_ix, d2_ix = np.argpartition(distMap, 2, axis=1)[:, :2].T
        d1 = distMap[ix, d1_ix]
        d2 = distMap[ix, d2_ix]
        border_loss_map = w0 * np.exp((-1 * (d1 + d2) ** 2) / (2 * (sigma ** 2)))
    xBLoss = np.zeros((nrows, ncols))
    xBLoss[X1, Y1] = border_loss_map
    # class weight map
    loss = np.zeros((nrows, ncols))
    w_1 = 1 - masks.sum() / loss.size
    w_0 = 1 - w_1
    loss[masks.sum(0) == 1] = w_1
    loss[masks.sum(0) == 0] = w_0
    ZZ = xBLoss + loss
    return ZZ
Traceback of the error:
MemoryError Traceback (most recent call last)
<ipython-input-30-f0a595b8de7e> in <module>
1 # train the model
2 model_scratch = train(20, final_train_loader, unet, optimizer,
----> 3 criterion, train_on_gpu, 'model_scratch.pt')
<ipython-input-29-b481b4f3120e> in train(n_epochs, loaders, model, optimizer, criterion, use_cuda, save_path)
24 loss = criterion(output,target)
25 target.requires_grad = False
---> 26 w = make_weight_map(target)
27 loss = W*loss
28 loss.backward()
<ipython-input-5-e75a6281476f> in make_weight_map(masks)
33 X2, Y2 = np.nonzero(bounds)
34 xSum = (X2.reshape(-1, 1) - X1.reshape(1, -1)) ** 2
---> 35 ySum = (Y2.reshape(-1, 1) - Y1.reshape(1, -1)) ** 2
36 distMap[:, i] = np.sqrt(xSum + ySum).min(axis=0)
37 ix = np.arange(distMap.shape[0])
MemoryError:

Your final_train_loader provides you with the input image data and the expected pixel-wise labeling target. I assume (following PyTorch's conventions) that data is of shape B-3-H-W and of dtype=torch.float.
More importantly, target is of shape B-H-W and of dtype=torch.long.
On the other hand, make_weight_map expects its input to be C-H-W (with C = the number of classes, NOT the batch size), as a numpy array.
Try providing make_weight_map with the input mask as it expects it and see if you get similar errors.
I also recommend that you visualize the resulting weight map, to make sure your function does what you expect it to do.
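For example, a rough sketch of that conversion (assuming batch_size == 1 and an integer-labelled target tensor, as in the question; target_to_binary_masks is a hypothetical helper, not part of the original post):
import numpy as np

def target_to_binary_masks(target):
    # target: torch tensor of shape (1, H, W) holding integer class labels
    target_np = target.squeeze(0).cpu().numpy()
    classes = np.unique(target_np)
    # one binary (H, W) mask per class present in this image -> (C, H, W)
    return np.stack([(target_np == c).astype(np.uint8) for c in classes])

# masks = target_to_binary_masks(target)   # (C, H, W) numpy array
# w = make_weight_map(masks)               # (H, W) weight map to visualize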

Related

Runtime error on WGan-gp algorithm when running on GPU

I am a newbie in PyTorch, and I am running the WGAN-GP algorithm on Google Colab using the GPU runtime. I encountered the error below. The algorithm works fine on the None runtime, i.e. CPU.
Error generated during training
0%| | 0/3 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-18-7e1d4849a60a> in <module>
19 # Calculate gradient penalty on real and fake images
20 # generated by generator
---> 21 gp = gradient_penalty(netCritic, real_image, fake, device)
22 critic_loss = -(torch.mean(critic_real_pred)
23 - torch.mean(critic_fake_pred)) + LAMBDA_GP * gp
<ipython-input-15-f84354d74f37> in gradient_penalty(netCritic, real_image, fake_image, device)
8 # image
9 # interpolated image ← alpha *real image + (1 − alpha) fake image
---> 10 interpolated_image = (alpha*real_image) + (1-alpha) * fake_image
11
12 # calculate the critic score on the interpolated image
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!
Snippet of my WGan-gp code
def gradient_penalty(netCritic, real_image, fake_image, device=device):
    batch_size, channel, height, width = real_image.shape
    # alpha is selected randomly between 0 and 1
    alpha = torch.rand(batch_size, 1, 1, 1).repeat(1, channel, height, width)
    # interpolated image = randomly weighted average between a real and fake image
    # interpolated image ← alpha * real image + (1 − alpha) * fake image
    interpolated_image = (alpha * real_image) + (1 - alpha) * fake_image
    # calculate the critic score on the interpolated image
    interpolated_score = netCritic(interpolated_image)
    # take the gradient of the score wrt the interpolated image
    gradient = torch.autograd.grad(inputs=interpolated_image,
                                   outputs=interpolated_score,
                                   retain_graph=True,
                                   create_graph=True,
                                   grad_outputs=torch.ones_like(interpolated_score)
                                   )[0]
    gradient = gradient.view(gradient.shape[0], -1)
    gradient_norm = gradient.norm(2, dim=1)
    gradient_penalty = torch.mean((gradient_norm - 1) ** 2)
    return gradient_penalty

n_epochs = 2000
cur_step = 0
LAMBDA_GP = 10
display_step = 50
CRITIC_ITERATIONS = 5
nz = 100

for epoch in range(n_epochs):
    # Dataloader returns the batches
    for real_image, _ in tqdm(dataloader):
        cur_batch_size = real_image.shape[0]
        real_image = real_image.to(device)

        for _ in range(CRITIC_ITERATIONS):
            fake_noise = get_noise(cur_batch_size, nz, device=device)
            fake = netG(fake_noise)
            critic_fake_pred = netCritic(fake).reshape(-1)
            critic_real_pred = netCritic(real_image).reshape(-1)
            # Calculate gradient penalty on real and fake images
            # generated by generator
            gp = gradient_penalty(netCritic, real_image, fake, device)
            critic_loss = -(torch.mean(critic_real_pred)
                            - torch.mean(critic_fake_pred)) + LAMBDA_GP * gp
            netCritic.zero_grad()
            # To make a backward pass and retain the intermediary results
            critic_loss.backward(retain_graph=True)
            optimizerCritic.step()

        # Train Generator: max E[critic(gen_fake)] <-> min -E[critic(gen_fake)]
        gen_fake = netCritic(fake).reshape(-1)
        gen_loss = -torch.mean(gen_fake)
        netG.zero_grad()
        gen_loss.backward()
        # Update optimizer
        optimizerG.step()

        # Visualization code ##
        if cur_step % display_step == 0 and cur_step > 0:
            print(f"Step{cur_step}: GenLoss: {gen_loss}: CLoss: {critic_loss}")
            display_images(fake)
            display_images(real_image)
            gen_loss = 0
            critic_loss = 0
        cur_step += 1
I tried introducing cuda() at lines 10 and 21 indicated in the error output, but it is not working.
Here is one approach to solve this kind of error:
Read the error message and locate the exact line where it occurred:
... in gradient_penalty(netCritic, real_image, fake_image, device)
8 # image
9 # interpolated image ← alpha *real image + (1 − alpha) fake image
---> 10 interpolated_image = (alpha*real_image) + (1-alpha) * fake_image
11
12 # calculate the critic score on the interpolated image
RuntimeError: Expected all tensors to be on the same device,
but found at least two devices, cuda:0 and cpu!
Look for input tensors that have not been properly transferred to the correct device. Then look for intermediate tensors that have not been transferred.
Here alpha is assigned to a random tensor but no transfer is done!
>>> alpha = torch.rand(batch_size, 1, 1, 1) \
.repeat(1, channel, height, width)
Fix the issue and test:
>>> alpha = torch.rand(batch_size, 1, 1, 1, device=fake_image.device) \
.repeat(1, channel, height, width)
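An equivalent fix (a sketch, not from the original answer) is to create the tensor first and then move it onto the same device as the inputs:
>>> alpha = torch.rand(batch_size, 1, 1, 1).to(real_image.device) \
            .repeat(1, channel, height, width)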

predicting simple autoregressive model with fully connected

The question is at the end; you can jump straight to it. I just wanted to share my process in case someone wants to give me general advice.
I started learning how to use LSTM layers and tried to build a simple predictor for the following AR model:
import random

class AR_model:
    def __init__(self, length=100):
        self.time = 0
        self.first_value = 0
        self.a1 = 0.6
        self.a2 = -0.5
        self.a3 = -0.2
        self.Xt = self.first_value
        self.Xt_minus_1 = 0
        self.Xt_minus_2 = 0
        self.length = length

    def __iter__(self):
        return self

    def __next__(self):  # raises StopIteration when the series ends
        if self.time == self.length:
            raise StopIteration
        new_value = self.a1 * self.Xt + \
                    self.a2 * self.Xt_minus_1 + \
                    self.a3 * self.Xt_minus_2 + \
                    random.uniform(0, 0.1)
        self.Xt_minus_2 = self.Xt_minus_1
        self.Xt_minus_1 = self.Xt
        self.Xt = new_value
        self.time += 1
        return new_value
which basically means the following series:
Xt = a1 * Xt−1 + a2 * Xt−2 + a3 * Xt−3 + Ut
where: a1 = 0.6, a2 = −0.5, a3 = −0.2 and Ut (i.i.d.) ∼ Uniform(0, 0.1)
using the following forward method:
def forward(self, input):
    # input: [Batch x seq_length x input_size]
    x, _ = self.lstm(input)
    # x: [Batch x seq_length x hidden_state]
    x = x[:, -1, :]
    # taking only the last step: x: [Batch x hidden_state]
    x = self.linear(x)
    # x: [Batch x 1]
    return x
The best result seems OK:
picture of results, 91 steps
with the following hyper-parameters:
signal_count = 50
signal_length = 200
hidden_state = 200
learning_rate = 0.1
I also tried it on sine and triangle waves:
sin wave, 20 steps
tri wave, 75 steps
The triangle wave might have worked with a deeper network, but I didn't bother to try.
Question 1
It makes sense that for a simple AR model, such as:
Xt = a1 * Xt−1 + a2 * Xt−2 + a3 * Xt−3 + Ut
where: a1 = 0.6, a2 = −0.5, a3 = −0.2 and Ut (i.i.d.) ∼ Uniform(0, 0.1),
it should be possible to get a good prediction with a simple three-input, one-layer fully connected network, where the inputs are the last three values of the AR series.
But I just get terrible results. Even when I remove the noise from the AR model I still get bad results. Am I wrong to think this?
I didn't post the code because I think it's a concept problem. If someone asks, I will post it.
Question 2
For the above AR model, what simple predictor would you recommend, not necessarily based on deep learning?
Asking friends, I was recommended a Kalman filter and Markov-based approaches.
I haven't really checked them out yet.
Thank you for reading
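For illustration only (this is not the poster's code): a minimal sketch of the three-input, single-layer fully connected predictor that Question 1 describes, fitted on sliding windows of the last three values of the series produced by the AR_model class above.
import torch
import torch.nn as nn

# Build (last three values -> next value) training pairs from the AR series.
series = torch.tensor(list(AR_model(length=1000)), dtype=torch.float32)
X = torch.stack([series[i:i + 3] for i in range(len(series) - 3)])  # [N, 3]
y = series[3:].unsqueeze(1)                                         # [N, 1]

model = nn.Linear(3, 1)  # a single fully connected layer: three inputs, one output
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

for epoch in range(500):
    optimizer.zero_grad()
    loss = loss_fn(model(X), y)
    loss.backward()
    optimizer.step()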

Pytorch - Loss for Object Localization

I am trying to perform an object localization task with MNIST based on Andrew Ng's lecture here. I take the MNIST digits, randomly place them into a 90x90 image, and predict the digit and its center point. When I train, I get very poor results, and my question is about whether or not my loss function is set up correctly. I basically just take the cross entropy for the digit, the MSE for the coordinates, and add them all up. Is this correct? I don't get any errors, but the performance is just horrendous.
My dataset is defined as follows (it returns the label and the x and y coordinates of the center of the digit):
import numpy as np
from random import randrange
from torch.utils.data import Dataset

class CustomMnistDataset_OL(Dataset):
    def __init__(self, df, test=False):
        '''
        df is a pandas dataframe with 28x28 columns for each pixel value in MNIST
        '''
        self.df = df
        self.test = test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.test:
            image = np.reshape(np.array(self.df.iloc[idx, :]), (28, 28)) / 255.
        else:
            image = np.reshape(np.array(self.df.iloc[idx, 1:]), (28, 28)) / 255.
        # create the new image
        new_img = np.zeros((90, 90))  # images will be 90x90
        # randomly select a bottom left corner to use for img
        x_min, y_min = randrange(90 - image.shape[0]), randrange(90 - image.shape[0])
        x_max, y_max = x_min + image.shape[0], y_min + image.shape[0]
        x_center = x_min + (x_max - x_min) / 2
        y_center = y_min + (y_max - x_min) / 2
        new_img[x_min:x_max, y_min:y_max] = image
        # the label consists of the digit and the center of the number
        label = [int(self.df.iloc[idx, 0]), x_center, y_center]
        sample = {"image": new_img, "label": label}
        return sample['image'], sample['label']
My training function is set up as follows:
loss_fn = nn.CrossEntropyLoss()
loss_mse = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(dataloader, model, loss_fn, loss_mse, optimizer):
    model.train()  # very important... This turns the model back to training mode
    size = len(train_dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y0, y1, y2 = X.to(device), y[0].to(device), y[1].to(device), y[2].to(device)
        pred = model(X.float())
        # DEFINE LOSS HERE -------
        loss = loss_fn(pred[0], y0) + loss_mse(pred[1], y1.float()) + loss_mse(pred[2], y2.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

What does Lambda do in this code (Python Keras)?

def AdaIN(x):
    # Normalize x[0] (image representation)
    mean = K.mean(x[0], axis=[1, 2], keepdims=True)
    std = K.std(x[0], axis=[1, 2], keepdims=True) + 1e-7
    y = (x[0] - mean) / std

    # Reshape scale and bias parameters
    pool_shape = [-1, 1, 1, y.shape[-1]]
    scale = K.reshape(x[1], pool_shape)
    bias = K.reshape(x[2], pool_shape)

    # Multiply by x[1] (GAMMA) and add x[2] (BETA)
    return y * scale + bias

def g_block(input_tensor, latent_vector, filters):
    gamma = Dense(filters, bias_initializer='ones')(latent_vector)
    beta = Dense(filters)(latent_vector)

    out = UpSampling2D()(input_tensor)
    out = Conv2D(filters, 3, padding='same')(out)
    out = Lambda(AdaIN)([out, gamma, beta])
    out = Activation('relu')(out)
    return out
Please see the code above. I am currently studying StyleGAN. I am trying to convert this code into PyTorch, but I can't seem to understand what Lambda does in g_block. AdaIN needs only one input based on its declaration, but somehow gamma and beta are also used as inputs? Please tell me what Lambda does in this code.
Thank you very much.
Lambda layers in Keras are used to call custom functions inside the model. In g_block, Lambda calls the AdaIN function and passes out, gamma, beta as arguments inside a list. The AdaIN function receives these 3 tensors encapsulated within a single list as x, and the tensors are accessed inside AdaIN by indexing the list x (x[0], x[1], x[2]).
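As a tiny illustration of that pattern (hypothetical tensors a and b, assuming the TensorFlow Keras API):
from tensorflow.keras.layers import Input, Lambda

# Lambda wraps an ordinary Python function so it can be used as a layer.
# When the layer is called with a list of tensors, the whole list arrives
# as the single argument of the wrapped function.
def add_pair(x):  # x is the list [a, b]
    return x[0] + x[1]

a = Input(shape=(4,))
b = Input(shape=(4,))
summed = Lambda(add_pair)([a, b])  # both tensors are passed inside one list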
Here's pytorch equivalent:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaIN(nn.Module):
    def forward(self, out, gamma, beta):
        bs, ch = out.size()[:2]
        mean = out.reshape(bs, ch, -1).mean(dim=2).reshape(bs, ch, 1, 1)
        std = out.reshape(bs, ch, -1).std(dim=2).reshape(bs, ch, 1, 1) + 1e-7
        y = (out - mean) / std
        bias = beta.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        scale = gamma.unsqueeze(-1).unsqueeze(-1).expand_as(out)
        return y * scale + bias

class g_block(nn.Module):
    def __init__(self, filters, latent_vector_shape, input_tensor_channels):
        super().__init__()
        self.gamma = nn.Linear(in_features=latent_vector_shape, out_features=filters)
        # Initializes all bias to 1
        self.gamma.bias.data = torch.ones(filters)
        self.beta = nn.Linear(in_features=latent_vector_shape, out_features=filters)
        # calculate appropriate padding
        self.conv = nn.Conv2d(input_tensor_channels, filters, 3, 1, padding=1)  # calc padding
        self.adain = AdaIN()

    def forward(self, input_tensor, latent_vector):
        gamma = self.gamma(latent_vector)
        beta = self.beta(latent_vector)
        # check default interpolation mode in keras and replace mode below if different
        out = F.interpolate(input_tensor, scale_factor=2, mode='nearest')
        out = self.conv(out)
        out = self.adain(out, gamma, beta)
        out = torch.relu(out)
        return out

# Sample:
input_tensor = torch.randn((1, 3, 10, 10))
latent_vector = torch.randn((1, 5))
g = g_block(3, latent_vector.shape[1], input_tensor.shape[1])
out = g(input_tensor, latent_vector)
print(out)
Note: you need to pass latent_vector and input_tensor shapes while creating g_block.

Losses are increasing in Binary classification using gradient descent optimization method

This is my program for binary classification using the gradient descent optimization method. I am not sure about my loss function. The error in my case is increasing when plotted.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def sigmoid_activation(x):
    return 1.0 / (1 + np.exp(-x))

def predict(testX, W):
    preds = sigmoid_activation(np.dot(testX, W))
    # apply a step function to threshold (=0.5) the outputs to binary class labels
    # start your code here
    p = []  # collects the thresholded predictions
    for i in range(len(preds)):
        if preds[i] < 0.5:
            p.append(0)
        if preds[i] >= 0.5:
            p.append(1)
    return p

epochs = 50
alpha = 0.01

(X, y) = make_moons(n_samples=1000, noise=0.15)
y = y.reshape(y.shape[0], 1)
X = np.c_[X, np.ones((X.shape[0]))]
(trainX, testX, trainY, testY) = train_test_split(X, y, test_size=0.5, random_state=42)

print("[INFO] training...")
W = np.random.randn(X.shape[1], 1)
losses = []

for epoch in np.arange(0, epochs):
    # start your code here
    Z = np.dot(trainX, W)
    yhat = sigmoid_activation(Z)
    error = trainY - yhat
    loss = np.sum(error ** 2)
    losses.append(loss)
    gradient = trainX.T.dot(error) / trainX.shape[0]
    W = W - alpha * gradient  # moving in -ve direction
    # check to see if an update should be displayed
    if epoch == 0 or (epoch + 1) % 5 == 0:
        print("[INFO] epoch={}, loss={:.7f}".format(int(epoch + 1), loss))

# evaluate our model
print("[INFO] evaluating...")
preds = predict(testX, W)
print(classification_report(testY, preds))

# plot the (testing) classification data
plt.style.use("ggplot")
plt.figure()
plt.title("Data")
plt.scatter(testX[:, 0], testX[:, 1], marker="o", c=testY[:, 0], s=30)

# construct a figure that plots the loss over time
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, epochs), losses)
plt.title("Training Loss")
plt.xlabel("Epoch #")
plt.ylabel("Loss")
plt.show()