I have an issue with multiprocessing in pytorch in AWS sagemaker. Essentially my GAN model runs fine when using single processing (num workers = 0) in AWS sagemaker, but produces error when using multiprocessing (num workers > 0). One thing weird is that my GAN model runs fine with multiprocessing when I'm using a local environment, but not in AWS Sagemaker, which or may or may not be due to different OS I think.
Traceback (most recent call last):
File "main.py", line 371, in <module>
g_scaler=g_scaler, d_scaler=d_scaler, runtime_log_folder=runtime_log_folder, runtime_log_file_name=runtime_log_file_name)
File "main.py", line 78, in train_fn
for idx, (x, y) in enumerate(loop):
File "/opt/conda/lib/python3.6/site-packages/tqdm/std.py", line 1171, in __iter__
for obj in iterable:
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 525, in __next__
(data, worker_id) = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1252, in _next_data
return (self._process_data(data), w_id)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1299, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
File "/opt/conda/lib/python3.6/site-packages/botocore/exceptions.py", line 84, in __init__
super(HTTPClientError, self).__init__(**kwargs)
File "/opt/conda/lib/python3.6/site-packages/botocore/exceptions.py", line 40, in __init__
msg = self.fmt.format(**kwargs)
KeyError: 'error'
---------------------------------------------------------------------------
UnexpectedStatusException Traceback (most recent call last)
<ipython-input-1-81655136a841> in <module>
58 py_version='py3')
59
---> 60 pytorch_estimator.fit({'train': Runtime.dataset_path}, job_name=Runtime.job_name)
61
62 #print(pytorch_estimator.latest_job_tensorboard_artifacts_path())
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
955 self.jobs.append(self.latest_training_job)
956 if wait:
--> 957 self.latest_training_job.wait(logs=logs)
958
959 def _compilation_job_name(self):
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py in wait(self, logs)
1954 # If logs are requested, call logs_for_jobs.
1955 if logs != "None":
-> 1956 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
1957 else:
1958 self.sagemaker_session.wait_for_job(self.job_name)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type)
3751
3752 if wait:
-> 3753 self._check_job_status(job_name, description, "TrainingJobStatus")
3754 if dot:
3755 print()
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
3304 ),
3305 allowed_statuses=["Completed", "Stopped"],
-> 3306 actual_status=status,
3307 )
3308
UnexpectedStatusException: Error for Training job 2022-06-03-05-16-49-pix2pix-U12239-2022-05-09-14-39-18-training: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 main.py --runtime_var dataset_name=U12239-2022-05-09-14-39-18,job_name=2022-06-03-05-16-49-pix2pix-U12239-2022-05-09-14-39-18-training,model_name=pix2pix"
0%| | 0/248 [00:00<?, ?it/s]
0%| | 1/248 [00:30<2:07:28, 30.97s/it]
0%| | 1/248 [00:30<2:07:28, 30.97s/it]
Traceback (most recent call last):
File "main.py", line 371, in <module>
g_scaler=g_scaler, d_scaler=d_scaler, runtime_log_folder=runtime_log_folder, runtime_log_file_name=runtime_log_file_name)
File "main.py", line 78, in train_fn
for idx, (x, y) in enumerate(loop):
File "/opt/conda/lib/python3.6/site-packages/tqdm/std.py", line 1171, in __iter__
for obj in iterable:
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 525, in __next__
(data, worker_id) = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1252, in _next_data
return (self
This is the code that produces the error
def train_fn(disc, gen, loader, opt_disc, opt_gen, l1, bce, g_scaler, d_scaler,runtime_log_folder,runtime_log_file_name):
total_output=''
loop = tqdm(loader, leave=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Loop")
print(loop)
print("Length loop")
print(len(loop))
for idx, (x, y) in enumerate(loop): #<--error happens here
print("Loop index")
print(idx)
print("Loop item")
print(x,y)
x = x.to(device)
y = y.to(device)
# train discriminator
with torch.cuda.amp.autocast():
y_fake = gen(x)
D_real = disc(x, y)
D_fake = disc(x, y_fake.detach())
# use detach so as to avoid breaking computational graph when do optimizer.step on discriminator
# can use detach, or when do loss.backward put loss.backward(retain_graph = True)
D_real_loss = bce(D_real, torch.ones_like(D_real))
D_fake_loss = bce(D_fake, torch.ones_like(D_fake))
D_loss = (D_real_loss + D_fake_loss) / 2
# log tensorboard
disc.zero_grad()
d_scaler.scale(D_loss).backward()
d_scaler.step(opt_disc)
d_scaler.update()
# train generator
with torch.cuda.amp.autocast():
D_fake = disc(x, y_fake)
# compute fake loss
# trick discriminator to believe these are real, hence send in torch.oneslikedfake
G_fake_loss = bce(D_fake, torch.ones_like(D_fake))
# compute L1 loss
L1 = l1(y_fake, y) * args.l1_lambda
G_loss = G_fake_loss + L1
# log tensorboard
opt_gen.zero_grad()
g_scaler.scale(G_loss).backward()
g_scaler.step(opt_gen)
g_scaler.update()
# print epoch, generator loss, discriminator loss
print(f'[Epoch {epoch}/{args.num_epochs} (b: {idx})] [D loss: {D_loss}, D real loss: {D_real_loss}, D fake loss: {D_fake_loss}] [G loss: ##{G_loss}, G fake loss: {G_fake_loss}, L1 loss: {L1}]')
output = f'[Epoch {epoch}/{args.num_epochs} (b: {idx})] [D loss: {D_loss}, D real loss: {D_real_loss}, D fake loss: {D_fake_loss}] [G loss: ##{G_loss}, G fake loss: {G_fake_loss}, L1 loss: {L1}]\n'
total_output+=output
runtime_log = get_json_file_from_s3(runtime_log_folder, runtime_log_file_name)
runtime_log += total_output
upload_json_file_to_s3(runtime_log_folder,runtime_log_file_name,json.dumps(runtime_log))
def __getitem__(self, index):
print("Index ",index)
pair_key = self.list_files[index]
print("Pair key ",pair_key)
pair = Boto.s3_client.list_objects(Bucket=Boto.bucket_name, Prefix=pair_key, Delimiter='/')
input_image_key = pair.get('Contents')[1].get('Key')
input_image_path = f's3://{Boto.bucket_name}/{input_image_key}'
print("Input image path ",input_image_path)
input_image_s3_source = get_file_from_filepath(input_image_path)
input_image = np.array(Image.open(input_image_s3_source))
target_image_key = pair.get('Contents')[0].get('Key')
target_image_path = f's3://{Boto.bucket_name}/{target_image_key}'
print("Target image path ",target_image_path)
target_image_s3_source = get_file_from_filepath(target_image_path)
target_image = np.array(Image.open(target_image_s3_source))
augmentations = config.both_transform(image=input_image, image0=target_image)
# get input image and target image by doing augmentations of images
input_image, target_image = augmentations['image'], augmentations['image0']
input_image = config.transform_only_input(image=input_image)['image']
target_image = config.transform_only_mask(image=target_image)['image']
print("Input image size ",input_image.size())
print("Target image size ",target_image.size())
return input_image, target_image
Here are a summary of traces of the failure points from logs
i) 2022-06-03-05-00-04-pix2pix-U12239-2022-05-09-14-39-18-training
No index shown
[Epoch 0/100 (b: 0)]
ii) 2022-06-03-05-16-49-pix2pix-U12239-2022-05-09-14-39-18-training
Index 160
[Epoch 0/100 (b: 0)]
iii) 2022-06-03-05-44-46-pix2pix-U12239-2022-05-09-14-39-18-training
Index 160
[Epoch 0/100 (b: 0)]
iv) 2022-06-03-06-08-33-pix2pix-U12239-2022-05-09-14-39-18-training
Index 160
[Epoch 1/100 (b: 0)]
v) 2022-06-15-02-49-20-pix2pix-U12239-2022-05-09-14-39-18-training
Index 160
Pair key datasets/training-data/testing/2022-05-09-14-39-18/match-raws-finals/U12239/P423712/Pair_71/
[Epoch 0/100 (b: 0)
vi) 2022-06-15-02-59-43-pix2pix-U12239-2022-05-09-14-39-18-training
Index 64
Pair key datasets/training-data/testing/2022-05-09-14-39-18/match-raws-finals/U12239/P425642/Pair_27/
[Epoch 0/100 (b: 247)]
vii) 2022-06-15-04-49-33-pix2pix-U12239-2022-05-09-14-39-18-training
Index 64
Pair key datasets/training-data/testing/2022-05-09-14-39-18/match-raws-finals/U12239/P415414/Pair_124/
No specific epoch
Essentially it seems to fail either at the start of the epoch(batch 0) or end of epoch(batch 247), and due to multiprocessing. Does anyone have suggestions on how to fix it please?
Related
So, the questionn is this:
What I am doing wrong when defining the neural net architecture? Look at sections Define the neural network model and Define the learning rate scheduler train the model
Details:
I have written the code of this where revenue_data shape is (1749, 2) while weather_data shape is (86990, 10) X_train shape is ([69010, 14]), y_train is ([69010]), X_val is ([17253, 14]), y_val = ([17253]) and have done the preprocesing, scaling, removing oputliers and splitting the data as here:
Convert date and time columns to datetime format
revenue_data['Date'] = pd.to_datetime(revenue_data['Date'], format='%Y%m%d')
weather_data['dt'] = pd.to_datetime(weather_data['dt'], format='%Y%m%d')
weather_data['time'] = pd.to_datetime(weather_data['time'], format='%H:%M:%S')
Convert wind and condition columns to embeddings
wind_embeddings = nn.Embedding(len(weather_data['wind'].unique()), 5)
weather_data['wind_code'] = weather_data['wind'].astype('category').cat.codes
wind_vectors = wind_embeddings(torch.tensor(weather_data['wind_code'].values, dtype=torch.long))
weather_data['wind_x'] = wind_vectors[:, 0].detach().numpy()
weather_data['wind_y'] = wind_vectors[:, 1].detach().numpy()
weather_data['wind_z'] = wind_vectors[:, 2].detach().numpy()
weather_data['wind_t'] = wind_vectors[:, 3].detach().numpy()
weather_data['wind_u'] = wind_vectors[:, 4].detach().numpy()
condition_embeddings = nn.Embedding(len(weather_data['condition'].unique()), 3)
weather_data['condition_code'] = weather_data['condition'].astype('category').cat.codes
condition_vectors = condition_embeddings(torch.tensor(weather_data['condition_code'].values, dtype=torch.long))
weather_data['condition_x'] = condition_vectors[:, 0].detach().numpy()
weather_data['condition_y'] = condition_vectors[:, 1].detach().numpy()
weather_data['condition_z'] = condition_vectors[:, 2].detach().numpy()
Group the weather data by date and hour and calculate the mean for each date and hour
weather_data = weather_data.groupby(['dt', 'time']).mean()
weather_data = weather_data.reset_index()
weather_data['Date'] = weather_data['dt']
weather_data.drop(['dt', 'time', 'wind_code', 'condition_code'], axis=1, inplace=True)
Merge the revenue and weather data on the 'Date' column and drop 'Date'
merged_data = pd.merge(revenue_data, weather_data, on='Date')
merged_data.drop('Date', axis=1, inplace=True)
merged_data.head()
Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(merged_data)
Split the data into input and target sets
X = scaled_data[:, 1:]
y = scaled_data[:, 0]
from scipy.stats import zscore
Calculate z-scores for each feature | Remove outliers that have z-scor bigger that 3
z_scores = zscore(X)
Identify rows where any feature has a z-score > 3
mask = (z_scores > 3).any(axis=1)
Remove rows with high z-scores from the x and y
features = X[~mask, :]
target = y[~mask]
Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
Convert the data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)
but the I am struggling to realise what is wrong with the neural net architecture defined:
Define the neural network model
class RevenuePredictor(nn.Module):
def __init__(self):
super().__init__()
self.lstm = nn.LSTM(input_size=14, hidden_size=32, num_layers=1, batch_first=True)
self.fc1 = nn.Linear(32, 16)
self.fc2 = nn.Linear(16, 1)
def forward(self, x, lengths):
print('x shape:', x.shape)
Get the lengths of the input sequences
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lengths = lengths.to(device)
lengths = lengths.cpu()
print('lengths shape:', lengths.shape)
Sort the input sequences by length
sorted_lengths, sorted_idx = torch.sort(lengths, descending=True)
sorted_x = x[sorted_idx]
Pack the sorted input sequences
packed_x = nn.utils.rnn.pack_padded_sequence(sorted_x, sorted_lengths, batch_first=True)
Convert the packed sequence to a tensor with two dimensions
x_data, batch_sizes = nn.utils.rnn.pad_packed_sequence(packed_x, batch_first=True)
Convert the packed sequence to a tensor with two dimensions
x_data, batch_sizes = x.data, x.batch_sizes
seq_len = batch_sizes[0]
batch_size = len(batch_sizes)
x = x_data.new_zeros((batch_size, seq_len, 14))
s = 0
for i, l in enumerate(batch_sizes):
x[i, :l] = x_data[s:(s+l)]
s += l
Pass the packed input sequences through the LSTM
lstm_output, (h, c) = self.lstm(packed_x)
Unpack the LSTM output sequences
unpacked_output, _ = nn.utils.rnn.pad_packed_sequence(lstm_output, batch_first=True)
Re-sort the output sequences to their original order
unsorted_idx = sorted_idx.sort(0)
output = unpacked_output[unsorted_idx]
Pass the output sequences through the fully connected layers
output = nn.functional.relu(self.fc1(output[:, -1, :]))
output = self.fc2(output)
return output
Then Create the model
model = RevenuePredictor()
followed by loss and metrics
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
metrics = {
'mse': MeanSquaredError(),
'mae': MeanAbsoluteError(),
'r2': R2Score(),
}
Define the learning rate scheduler train the model
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
best_val_loss = np.inf
for epoch in range(num_epochs):
# Set the model to training mode
model.train()
train_loss = 0.0
num_batches = 0
for X_train, y_train in train_loader:
lengths = torch.ones(X_train.shape[0], dtype=torch.long)
optimizer.zero_grad()
output = model(X_train, lengths)
loss = loss_fn(output, y_train)
loss.backward()
optimizer.step()
train_loss += loss.item()
num_batches += 1
val_loss = 0.0
for X_val, y_val in val_loader:
lengths = torch.ones(X_val.shape[0], dtype=torch.long)
output = model(X_val, lengths)
loss = loss_fn(output, y_val)
val_loss += loss.item()
scheduler.step(val_loss)
val_loss /= len(val_loader)
val_mse = metrics['mse'].compute()
val_mae = metrics['mae'].compute()
val_r2 = metrics['r2'].compute()
for metric in metrics.values():
metric.reset()
if (epoch+1) % 100 == 0:
print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, MSE: {:.4f}, MAE: {:.4f}, R2: {:.4f}'
.format(epoch+1, num_epochs, train_loss/num_batches, val_loss, val_mse, val_mae, val_r2))
I get this error which I think is because of something being wwrong in defining neural netwrok model:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-164-e20b93c25048>", line 3, in <module>
output = model(X_train, lengths)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "<ipython-input-163-43b2ef5c15db>", line 38, in forward
lstm_output, (h, c) = self.lstm(packed_x)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/rnn.py", line 772, in forward
self.check_forward_args(input, hx, batch_sizes)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/rnn.py", line 697, in check_forward_args
self.check_input(input, batch_sizes)
File "/usr/local/lib/python3.8/dist-packages/torch/nn/modules/rnn.py", line 206, in check_input
raise RuntimeError(
# RuntimeError: input must have 2 dimensions, got 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py", line 2040, in showtraceback
stb = value._render_traceback_()
AttributeError: 'RuntimeError' object has no attribute '_render_traceback_'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 319, in wrapped
return f(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/IPython/core/ultratb.py", line 353, in _fixed_getinnerframes
records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
File "/usr/lib/python3.8/inspect.py", line 1515, in getinnerframes
frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
File "/usr/lib/python3.8/inspect.py", line 1473, in getframeinfo
filename = getsourcefile(frame) or getfile(frame)
File "/usr/lib/python3.8/inspect.py", line 708, in getsourcefile
if getattr(getmodule(object, filename), '__loader__', None) is not None:
File "/usr/lib/python3.8/inspect.py", line 737, in getmodule
file = getabsfile(object, _filename)
File "/usr/lib/python3.8/inspect.py", line 721, in getabsfile
return os.path.normcase(os.path.abspath(_filename))
File "/usr/lib/python3.8/posixpath.py", line 379, in abspath
cwd = os.getcwd()
FileNotFoundError: [Errno 2] No such file or directory
---------------------------------------------------------------------------
I tried coverting the packed sequence to a tensor with two dimensions in different way:
x_data, ba
tch_sizes = x.data, x.batch_sizes
seq_len = batch_sizes[0]
batch_size = len(batch_sizes)
x = x_data.new_zeros((batch_size, seq_len, 14))
s = 0
for i, l in enumerate(batch_sizes):
x[i, :l] = x_data[s:(s+l)]
s += l
Didnt work.
Then tried rehsaping x to have three dimensions like:
batch_size, seq_len, input_size = x.shape
Didn't work and finally tried:
unsqueze(-1) on output after I defined model like:
model = REvenuePredictor()
output = model(X_train, lengths).unsqueeze(-1)
everyone!
I am working on an object detection project using the VGG network in the PASCAL VOC dataset. I used custom dataset loading to load the PASCAL VOC dataset. And coded network from scratch. (They are similar to PyTorch's Vision method).
Currently, I'm getting the following error:
Traceback (most recent call last):
File "/home/khushi/Documents/deep-learning/benchmarking-deep-neural-networks/vgg/main.py", line 59, in <module>
main()
File "/home/khushi/Documents/deep-learning/benchmarking-deep-neural-networks/vgg/main.py", line 55, in main
train(data, model, num_epochs, criteria, optimizer)
File "/home/khushi/Documents/deep-learning/benchmarking-deep-neural-networks/vgg/main.py", line 21, in train
outputs = model(image)
File "/home/khushi/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/khushi/Documents/deep-learning/benchmarking-deep-neural-networks/vgg/vgg_torch.py", line 39, in forward
x = self.features(x)
File "/home/khushi/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/khushi/.local/lib/python3.9/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/khushi/.local/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/khushi/.local/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 446, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/khushi/.local/lib/python3.9/site-packages/torch/nn/modules/conv.py", line 442, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
TypeError: conv2d() received an invalid combination of arguments - got (Image, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (Image, Parameter, Parameter, tuple, tuple, tuple, int)
* (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
didn't match because some of the arguments have invalid types: (Image, Parameter, Parameter, tuple, tuple, tuple, int)
The code I am implementing:
import vgg_torch
import voc_loader
import time
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
num_epochs = 5
learning_rate = 0.01
# https://github.com/khushi-411/tutorials/pytorch
def train(data, model, num_epochs, criteria, optimizer):
steps = len(data)
for epochs in range(num_epochs):
for i, (image, target) in enumerate(data):
# forward pass
# https://stackoverflow.com/questions/57237381
#outputs = model(image[None, ...])
outputs = model(image)
loss = criteria(outputs, target)
# backward pass and optimization
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' .format(epochs+1, num_epochs, i+1, steps, loss.item()))
def main():
# give absolute path to dataset
# https://stackoverflow.com/questions/56741108
data = voc_loader.VOCDetection('/home/khushi/Documents/deep-learning/datasets/pascal-voc/')
""",
transform=transforms.Compose([
transforms.ToTensor(),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
)
"""
# Load model: vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn
model = vgg_torch.vgg11()
print(model)
# Loss function and optimizer
criteria = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# training
start = time.time()
train(data, model, num_epochs, criteria, optimizer)
print("Total time taken to train: ", time.time() - start)
if __name__ == "__main__":
main()
Dependencies
PyTorch: 1.10.0+cu102
OS: Manjaro Distro
RAM: 16GB
Will anyone please help me out to resolve this error? Thanks!
Im working on a Multiclass Classification (Bengali Language Sentiment Analysis) on a pretrained Huggingface (BertForMaskedLM) model.
When the error occured I knew I have to change the label(output) size to match the input. But do not know how. Im adding the code snippents below.
MAX_LEN = 200
BATCH_SIZE = 16
The pretrained models used:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
model = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
tokenizer = BertTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
Code to make the pytorch dataset:
class GPReviewDataset(Dataset):
def __init__(self, reviews, targets, tokenizer, max_len):
self.reviews = reviews
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.reviews)
def __getitem__(self, item):
review = str(self.reviews[item])
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
review,
add_special_tokens=True,
max_length=self.max_len,
truncation = True,
return_token_type_ids=False,
padding='max_length',
return_attention_mask=True,
return_tensors='pt',
)
return {
'review_text': review,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
The input dimentions are:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)
Which Outputs:
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
Training Class
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
model = model.train() # tells your model that we are training
losses = []
correct_predictions = 0
for d in data_loader:
input_ids = d["input_ids"].to(device)
attention_mask = d["attention_mask"].to(device)
targets = d["targets"].to(device)
loss, logits = model(
input_ids=input_ids,
attention_mask=attention_mask,
labels = targets
)
#logits = classification scores befroe softmax
#loss = classification loss
logits = logits.view(-1, 28*28).detach().cpu().numpy()
label_ids = targets.to('cpu').numpy()
preds = np.argmax(logits, axis=1).flatten() #returns indices of maximum logit
targ = label_ids.flatten()
correct_predictions += np.sum(preds == targ)
losses.append(loss.item())
loss.backward() # performs backpropagation(computes derivates of loss w.r.t to parameters)
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) #clipping gradients so they dont explode
optimizer.step() #After gradients are computed by loss.backward() this makes the optimizer iterate over all parameters it is supposed to update and use internally #stored grad to update their values
scheduler.step() # this will make sure learning rate changes. If we dont provide this learning rate stays at initial value
optimizer.zero_grad() # clears old gradients from last step
return correct_predictions / n_examples, np.mean(losses)
Where the training Starts (Where the error triggers):
%%time
# standard block
# used accuracy as metric here
history = defaultdict(list)
best_acc = 0
for epoch in range(EPOCHS):
print(f'Epoch {epoch + 1}/{EPOCHS}')
print('-' * 10)
train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))
print(f'Train loss {train_loss} Accuracy {train_acc}')
val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))
print(f'Val loss {val_loss} Accuracy {val_acc}')
print()
history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)
if val_acc > best_acc:
torch.save(model.state_dict(), 'best_model_state_a5.bin')
best_acc = val_acc
The error:
Epoch 1/5
----------
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-41-fb5a4d77ce37> in <module>()
----> 1 get_ipython().run_cell_magic('time', '', "# standard block\n# used accuracy as metric here\nhistory = defaultdict(list)\n\nbest_acc = 0\n\nfor epoch in range(EPOCHS):\n\n print(f'Epoch {epoch + 1}/{EPOCHS}')\n print('-' * 10)\n\n train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))\n\n print(f'Train loss {train_loss} Accuracy {train_acc}')\n\n val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))\n\n print(f'Val loss {val_loss} Accuracy {val_acc}')\n print()\n\n history['train_acc'].append(train_acc)\n history['train_loss'].append(train_loss)\n history['val_acc'].append(val_acc)\n history['val_loss'].append(val_loss)\n\n if val_acc > best_acc:\n torch.save(model.state_dict(), 'best_model_state_a5.bin')\n best_acc = val_acc\n\n# We are storing state of best model indicated by highest validation accuracy")
8 frames
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
2115 magic_arg_s = self.var_expand(line, stack_depth)
2116 with self.builtin_trap:
-> 2117 result = fn(magic_arg_s, cell)
2118 return result
2119
<decorator-gen-53> in time(self, line, cell, local_ns)
/usr/local/lib/python3.7/dist-packages/IPython/core/magic.py in <lambda>(f, *a, **k)
186 # but it's overkill for just that one bit of state.
187 def magic_deco(arg):
--> 188 call = lambda f, *a, **k: f(*a, **k)
189
190 if callable(arg):
/usr/local/lib/python3.7/dist-packages/IPython/core/magics/execution.py in time(self, line, cell, local_ns)
1191 else:
1192 st = clock2()
-> 1193 exec(code, glob, local_ns)
1194 end = clock2()
1195 out = None
<timed exec> in <module>()
<ipython-input-39-948eefef2f8d> in train_epoch(model, data_loader, optimizer, device, scheduler, n_examples)
13 input_ids=input_ids,
14 attention_mask=attention_mask,
---> 15 labels = targets
16 )
17
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, output_attentions, output_hidden_states, return_dict)
1327 if labels is not None:
1328 loss_fct = CrossEntropyLoss() # -100 index = padding token
-> 1329 masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1330
1331 if not return_dict:
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
1119 def forward(self, input: Tensor, target: Tensor) -> Tensor:
1120 return F.cross_entropy(input, target, weight=self.weight,
-> 1121 ignore_index=self.ignore_index, reduction=self.reduction)
1122
1123
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
2822 if size_average is not None or reduce is not None:
2823 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2824 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2825
2826
ValueError: Expected input batch_size (3200) to match target batch_size (16).
i am trying to run this custom loss function in keras and i always run into this error bellow. This pairwaise constraint loss
def loss(y_true, y_pred):
pw = pairwise_distances(y_true, squared=False)
n, d = y_pred.get_shape()
# generate constraint data points
c1 = y_pred[pw[:, 0], :]
c2 = y_pred[pw[:, 1], :]
loss = np.zeros(dtype=np.float32, shape=(pw.shape[0], d * 2))
loss[:, :d] = np.abs(c1 - c2)
loss[:, d:] = (c1 + c2) / 2
return loss
Bellow is the error i get when i try to implement this loss function
File "C:\Users\Benji\Anaconda2\envs\ben\lib\site-packages\keras\engine\training.py", line 692, in _prepare_total_loss
y_true, y_pred, sample_weight=sample_weight)
File "C:\Users\Benji\Anaconda2\envs\ben\lib\site-packages\keras\losses.py", line 71, in __call__
losses = self.call(y_true, y_pred)
File "C:\Users\Benji\Anaconda2\envs\ben\lib\site-packages\keras\losses.py", line 132, in call
return self.fn(y_true, y_pred, **self._fn_kwargs)
File "C:/Users/Benji/PycharmProjects/Code/NEWWORK6.py", line 73, in loss
c1 = y_pred[pw[:, 0], :]
File "C:\Users\Benji\Anaconda2\envs\ben\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 766, in _slice_helper
_check_index(s)
File "C:\Users\Benji\Anaconda2\envs\ben\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 655, in _check_index
raise TypeError(_SLICE_TYPE_ERROR + ", got {!r}".format(idx))
TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got <tf.Tensor 'loss/activation_6_loss/loss/strided_slice:0' shape=(?,) dtype=float32>
Process finished with exit code 1
I'm learn tensorflow2.0 from official tutorials.I can understand the result from below code.
def square_if_positive(x):
return [i ** 2 if i > 0 else i for i in x]
square_if_positive(range(-5, 5))
# result
[-5, -4, -3, -2, -1, 0, 1, 4, 9, 16]
But if I change the inputs with tensor not python code, just like this
def square_if_positive(x):
return [i ** 2 if i > 0 else i for i in x]
square_if_positive(tf.range(-5, 5))
I get below error!!
OperatorNotAllowedInGraphError Traceback (most recent call last)
<ipython-input-39-6c17f29a3443> in <module>
2 def square_if_positive(x):
3 return [i**2 if i > 0 else i for i in x]
----> 4 square_if_positive(tf.range(10))
5 # measure_graph_size(square_if_positive, range(10))
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in __call__(self, *args, **kwds)
437 # This is the first call of __call__, so we have to initialize.
438 initializer_map = {}
--> 439 self._initialize(args, kwds, add_initializers_to=initializer_map)
440 if self._created_variables:
441 try:
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
380 self._concrete_stateful_fn = (
381 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 382 *args, **kwds))
383
384 def invalid_creator_scope(*unused_args, **unused_kwds):
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
1793 if self.input_signature:
1794 args, kwargs = None, None
-> 1795 graph_function, _, _ = self._maybe_define_function(args, kwargs)
1796 return graph_function
1797
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _maybe_define_function(self, args, kwargs)
2093 graph_function = self._function_cache.primary.get(cache_key, None)
2094 if graph_function is None:
-> 2095 graph_function = self._create_graph_function(args, kwargs)
2096 self._function_cache.primary[cache_key] = graph_function
2097 return graph_function, args, kwargs
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
1984 arg_names=arg_names,
1985 override_flat_arg_shapes=override_flat_arg_shapes,
-> 1986 capture_by_value=self._capture_by_value),
1987 self._function_attributes,
1988 # Tell the ConcreteFunction to clean up its graph once it goes out of
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
851 converted_func)
852
--> 853 func_outputs = python_func(*func_args, **func_kwargs)
854
855 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/eager/def_function.py in wrapped_fn(*args, **kwds)
323 # __wrapped__ allows AutoGraph to swap in a converted function. We give
324 # the function a weak reference to itself to avoid a reference cycle.
--> 325 return weak_wrapped_fn().__wrapped__(*args, **kwds)
326 weak_wrapped_fn = weakref.ref(wrapped_fn)
327
~/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/framework/func_graph.py in wrapper(*args, **kwargs)
841 except Exception as e: # pylint:disable=broad-except
842 if hasattr(e, "ag_error_metadata"):
--> 843 raise e.ag_error_metadata.to_exception(type(e))
844 else:
845 raise
OperatorNotAllowedInGraphError: in converted code:
<ipython-input-37-6c17f29a3443>:3 square_if_positive *
return [i**2 if i > 0 else i for i in x]
/Users/zhangpan/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:547 __iter__
self._disallow_iteration()
/Users/zhangpan/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:540 _disallow_iteration
self._disallow_when_autograph_enabled("iterating over `tf.Tensor`")
/Users/zhangpan/tf2_workspace/tf2.0/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:518 _disallow_when_autograph_enabled
" decorating it directly with #tf.function.".format(task))
OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed: AutoGraph did not convert this function. Try decorating it directly with #tf.function.
I can't find any specifications about this error. I think the real reason is not "iterating over tf.Tensor is not allowed" . Becase I can write like this.
#tf.function
def square_if_positive(x):
for i in x:
if i>0:
tf.print(i**2)
else:
tf.print(i)
square_if_positive(tf.range(10))
I iterate over tensor just like above code.
So my question is what's the real reason about this error? Any suggestions will help me. I really can't understand this error through I read a lot of materials.
The root cause is that autograph doesn't yet support list comprehensions (primarily because it's difficult to determine the dtype of the result in all cases)
As a workaround, you can use tf.map_fn for the comprehension:
return tf.map_fn(lambda i: i ** 2 if i > 0 else i, x)
For more information please take a look at this issue
In case it helps someone.
I had the same problem with a code that did:
for index, image in enumerate(inputs):
... My code ...
The solution was just to do:
index = 0
for image in inputs:
.... My code ...
index += 1
I had a similar issue when using tf.range() instead of python's range() for a list comprehension inside a tensorflow graph function. I was training a 3D segmentation neural net and had to use range() for the code to work.
Check the pseudo code below:-
Y = # [Batch,Height,Width,Depth,Channels]
y_predict = # [B,H,W,D,C,MC_Runs] ; MC_Runs=Monte Carlo Runs
#tf.function
def train_loss(Y,y_predict):
# calulate loss and return scalar value
#tf.function
def train_step():
loss = [train_loss(Y, y_predict[:,:,:,:,:,id_])) for id_ in range(MC_RUNS)]
loss = tf.math.reduce_mean(loss)