I've got some images for training and testing a TensorFlow model, encoded in CSV format. Is there a way to extract those images and/or save them in a JPG-like format?
Part of the file can be seen in the screenshot above, opened in Excel. If you prefer text to the linked screenshot, here is part of it as text:
label pixel1 pixel2 ...
6 149 149 ...
5 126 128 ...
10 85 88 ...
0 203 205 ...
There are 785 columns and 7173 rows in total. I have no idea how to deal with that.
You can do it like this:
# imports (make_classification is only used here to build a dummy dataset)
import pandas as pd
import tensorflow as tf
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# first I create a dummy dataset to work on
data = make_classification(10000, n_features=784, random_state=1234)
df = pd.DataFrame(data[0], columns=[str(f'col_{i}') for i in range(784)])
df['label'] = data[1]
# Now we create an img_vector and a labels array from the dataframe
img_vector = df[[str(f'col_{i}') for i in range(784)]].values
labels = df['label'].values
# splitting the data into train and validation sets
train_mat, valid_mat, train_label, valid_label = train_test_split(
    img_vector, labels, test_size=0.2, random_state=1234)
# Now we create the dataset
def get_img(inputs, labels):
    # here you have 784 pixels which usually represent a 28*28 image with 1 channel
    # hence I reshape it that way
    img = tf.reshape(inputs, (28,28,1))
    # you can also add some augmentation
    img = tf.image.flip_left_right(img)
    img = tf.image.flip_up_down(img)
    return img, labels
# We pass the img_vector and labels to make the dataset
train_dataset = tf.data.Dataset.from_tensor_slices((train_mat, train_label))
# Map the dataset to get images from it.
train_dataset = train_dataset.map(get_img).batch(16)
# same for valid dataset
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_mat, valid_label))
valid_dataset = valid_dataset.map(get_img).batch(16)
# A sanity check
import matplotlib.pyplot as plt
sample = None
for i in train_dataset:
    sample = i
    break
plt.imshow(sample[0][0])
# Creating a model
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(3, 3, input_shape=(28,28,1)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# Finally train the model
model.fit(train_dataset,
          epochs=10,
          validation_data=valid_dataset)
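Since the original question is about saving the rows as JPG files, here is a minimal sketch (not part of the original answer) of how you could dump each row to disk with TensorFlow. It assumes the pixel values are 0-255 integers as in the CSV excerpt (the dummy make_classification data is float, so cast your real data accordingly), and the folder/file names are just placeholders:
import os
import tensorflow as tf

out_dir = "extracted_images"   # hypothetical output folder
os.makedirs(out_dir, exist_ok=True)

for i, (row, label) in enumerate(zip(img_vector, labels)):
    # reshape the 784 values to a 28x28x1 image and cast to uint8 (assumes 0-255 pixel values)
    img = tf.reshape(tf.cast(row, tf.uint8), (28, 28, 1))
    jpg = tf.io.encode_jpeg(img)   # JPEG-encoded bytes
    tf.io.write_file(os.path.join(out_dir, f"img_{i}_label_{label}.jpg"), jpg)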
Also, if you ever take a dataset from Kaggle you will usually find a sample notebook for that dataset in the code section.
You can read any row, plot it and save it as image like this:
import numpy as np
import pandas as pd
# read csv file
df = pd.read_csv("data.csv")
# read pixels
images = np.array(df.iloc[:,1:])
labels = np.array(df.iloc[:,0])
# pick a row index between 0 and 7172 (it could also be chosen at random)
index = 2
# reshape the 784 pixel values to 28 height x 28 width
sample_image = images[index,:].reshape(28,28)
# import plt for displaying image
from matplotlib import pyplot as plt
# plot image
plt.imshow(sample_image)
plt.axis('off')
# print its label
print(labels[index])
# save image (matplotlib infers the format from the extension)
plt.savefig("./image{}_label{}.jpg".format(index, labels[index]))
I am trying to get the output stored in a variable so that it can be used later on for more processing, but to get to that stage I am facing a challenge with this code:
######INFERENCE ON P6 MODELS*****************************************************************************
import torch
import glob
from natsort import natsorted
import cv2
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import pandas as pd
import os
model = torch.hub.load('/Users/yolov5', 'custom', path='/User/yolov5/runs/train/exp11/weights/best.pt', source='local', force_reload=True) # custom trained model
model.conf = 0.25 # NMS confidence threshold
Path = 'User/yolov5/data/images/'
imgs = [cv2.imread(file) for file in natsorted(glob.glob(Path+"/*.jpg"))]
# Inference
results = model(imgs,size=640)
# Results:
#results.save() # or .print() .show(), .save(), .crop(), .pandas(), etc.
results.pandas()
#print(results.print())
#print(results.pandas().xyxy[:])
# results.show()
#results.pandas().xyxy[0]
#print(results)
#print(results.pandas().xyxy[0])
# dfm = pd.DataFrame(results.pandas().xyxy[0])#, columns = ['Loss','Accuracy']
# # #dfm['Classes'] = classes.tolist()
# predict_labs = 'pred_yolo_individual.csv'
# with open(predict_labs, mode='w') as fd:
# dfm.to_csv(fd)
#results.print() # or .show(), .save(), .crop(), .pandas(), etc.
#results.render()
results.xyxy[0] # im predictions (tensor)
results.pandas().xyxy[0]
results.print()
# pred = results.pandas().xyxy[0]
# for index, row in pred.iterrows():
# print(row['class'], row['confidence'], row['name'])
As you can see, I am trying lots of things to get this going, but I am missing some major detail and not getting the desired output.
I would like to get output in the format below for the folder of images that I have.
# Results
results.print() # or .show(), .save(), .crop(), .pandas(), etc.
results.xyxy[0] # im predictions (tensor)
results.pandas().xyxy[0] # im predictions (pandas)
# xmin ymin xmax ymax confidence class name
# 0 749.50 43.50 1148.0 704.5 0.874023 0 person
# 2 114.75 195.75 1095.0 708.0 0.624512 0 person
# 3 986.00 304.00 1028.0 420.0 0.286865 27 tie
The issue is: when I use the same code, I am only getting one output.
If I do a
print(results.pandas().xyxy[0:])
I am seeing the output as demonstrated below but not in the structured format as above:
YOLOv5 🚀 v7.0-72-g064365d Python-3.10.6 torch-1.13.1 CPU
Fusing layers...
Model summary: 212 layers, 20856975 parameters, 0 gradients, 47.9 GFLOPs
Adding AutoShape...
[ xmin ymin xmax ymax confidence class name
0 539.859314 119.92907 602.884216 245.533752 0.353711 1 Stabbing, Empty DataFrame
Columns: [xmin, ymin, xmax, ymax, confidence, class, name]
Index: [], Empty DataFrame
Columns: [xmin, ymin, xmax, ymax, confidence, class, name]
Index: [], xmin ymin xmax ymax confidence class name
0 709.833496 66.843300 1025.770752 800.782593 0.771696 1 Stabbing
1 84.628845 4.153772 461.863617 833.189636 0.632551 1 Stabbing]
Please assist, and thank you in advance.
I would suggest using the following code to get the desired output:
# Inference
results = model(imgs, size=640)
# Results:
preds = results.pandas().xyxy[0] # im predictions (pandas)
# print(preds)
# Create dataframe and write to file
dfm = pd.DataFrame(preds)
dfm.columns = ['xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name']
predict_labs = 'pred_yolo_individual.csv'
dfm.to_csv(predict_labs, index=False)
This will create a CSV file containing the predictions for the first image in the desired format.
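If you need the predictions for every image in the folder rather than just the first one, a small extension (a sketch, not part of the original answer) can concatenate the per-image DataFrames; results.pandas().xyxy is a list with one DataFrame per input image, in the same order as imgs, and the image_id column is just an added convenience:
# collect predictions for all images into a single DataFrame
frames = []
for i, det in enumerate(results.pandas().xyxy):
    det = det.copy()
    det['image_id'] = i   # index into the imgs list passed to the model
    frames.append(det)
all_preds = pd.concat(frames, ignore_index=True)
all_preds.to_csv('pred_yolo_all.csv', index=False)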
I am following this tutorial from PyTorch for fine-tuning a pre-trained model on my own dataset. I have my annotations in COCO format in a JSON file, so I first implemented the dataloader as follows:
import torch
import json
from torch.utils.data import Dataset
from pycocotools.coco import COCO
from PIL import Image
import os
import numpy as np
from torchvision import transforms
import Config
import transforms as T
from torchvision.transforms import functional as F
class CustomDataset(Dataset):
    def __init__(self, root, json_file, transform=None):
        self.root = root
        with open(json_file) as f:
            self.data = json.load(f)
        self.transform = transform
        self.image_ids = [img["id"] for img in self.data["images"]]
        self.imgs = list(sorted(os.listdir(os.path.join(root, "Images"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "Masks"))))

    def __getitem__(self, idx):
        # Get image ID
        img_id = self.image_ids[idx]
        img = next(image for image in self.data["images"] if image["id"] == img_id)
        img_path = os.path.join(self.root, "Images")
        mask_path = os.path.join(self.root, "Masks")
        # Load image
        image = Image.open(os.path.join(img_path, img['file_name'])).convert("RGB")
        # extract annotations from the json file
        annotations = [ann for ann in self.data["annotations"] if ann["image_id"] == img_id]
        # extract labels from annotations
        labels = [ann["label"] for ann in annotations]
        # convert labels to integers
        labels = [label for label in labels]
        labels = torch.as_tensor(labels, dtype=torch.int64)
        # extract boxes and convert them to format [x1, y1, x2, y2]
        boxes = [ann["bbox"] for ann in annotations]
        boxes = [[bbox[0], bbox[1], bbox[2], bbox[3]] for bbox in boxes]
        num_objects = len(boxes)
        # read the mask and include the number of objects in the first dimension
        mask = np.array(Image.open(os.path.join(mask_path, img['file_name'])).convert("L"))
        # Check if mask is empty
        if mask.size == 0:
            mask = np.zeros((num_objects, 1, 1), dtype=np.uint8)
        else:
            mask = np.expand_dims(mask, axis=0)
            mask = np.repeat(mask, num_objects, axis=0)
        # convert the binary mask array to a torch tensor
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objects,), dtype=torch.int64)
        # convert bboxes to tensors
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # calculate the area of the bounding box
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # convert id to tensor
        image_id = torch.tensor([idx])
        # create target dictionary
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = mask
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd
        # apply the transform if any
        if self.transform is not None:
            image, target = self.transform(image, target)
        return image, target

    def __len__(self):
        return len(self.imgs)
and I am using this code for training:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from engine import train_one_epoch
import utils
import transforms as T
from dataloader import CustomDataset
import Config
import torch
import utils
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torchvision.transforms import functional as F
def get_instance_segmentation_model(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    # get the number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)
    return model

def get_transform(train):
    transforms = []
    # converts the image, a PIL image, into a PyTorch Tensor
    transforms.append(T.PILToTensor())
    if train:
        # during training, randomly flip the training images
        # and ground-truth for data augmentation
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)
json_path = 'annotations.json'
# use our dataset and defined transformations
dataset = CustomDataset(root = Config.Dataset_dir, json_file=json_path, transform = get_transform(train=True))
# for image, target in dataset:
# print(image.shape)
# split the dataset in train and test set
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-500])
dataset_test = torch.utils.data.Subset(dataset, indices[-500:])
# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=1, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)
data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=1, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)
device = Config.DEVICE
# # our dataset has two classes only - background and person
num_classes = 2
# get the model using our helper function
model = get_instance_segmentation_model(num_classes)
# move model to the right device
model.to(device)
# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.1,
                            momentum=0.9, weight_decay=0.0005)
# and a learning rate scheduler which decreases the learning rate by
# 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=3,
                                               gamma=0.1)
# let's train it for 10 epochs
num_epochs = 10
for epoch in range(num_epochs):
    # train for one epoch, printing every 10 iterations
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)
This training code, as stated in the tutorial, uses some helper functions which can be accessed from here. I have run the training code and the training works for the first 10 samples in the data, but then it gives the following error:
Epoch: [0] [ 0/2759] eta: 13:29:50 lr: 0.000200 loss: -136.8811 (-136.8811) loss_classifier: 0.9397 (0.9397) loss_box_reg: 0.0017 (0.0017) loss_mask: -137.9142 (-137.9142) loss_objectness: 0.0859 (0.0859) loss_rpn_box_reg: 0.0057 (0.0057) time: 17.6117 data: 10.0775
Loss is nan, stopping training
{'loss_classifier': tensor(nan, grad_fn=<NllLossBackward0>), 'loss_box_reg': tensor(nan, grad_fn=<DivBackward0>), 'loss_mask': tensor(nan, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_objectness': tensor(nan, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), 'loss_rpn_box_reg': tensor(nan, grad_fn=<DivBackward0>)}
An exception has occurred, use %tb to see the full traceback.
SystemExit: 1
This error is raised from the train_one_epoch function in engine.py, specifically from this part of the function:
with torch.cuda.amp.autocast(enabled=scaler is not None):
    loss_dict = model(images, targets)
    losses = sum(loss for loss in loss_dict.values())
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
loss_value = losses_reduced.item()
if not math.isfinite(loss_value):
    print(f"Loss is {loss_value}, stopping training")
    print(loss_dict_reduced)
    sys.exit(1)
This indicates that the losses returned after the first iteration are NaN ... What could be wrong here, please? I am running out of ideas and don't know what's going wrong anymore.
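A quick way to narrow this down (a diagnostic sketch, not a definitive answer) is to inspect what the dataset actually returns before training: torchvision's Mask R-CNN expects per-instance binary masks (values 0/1) and boxes satisfying x2 > x1 and y2 > y1, and values outside those ranges are a common source of negative or non-finite losses. Using the dataset built above:
# sanity-check one sample from the dataset before training
image, target = dataset[0]
print(image.shape, image.dtype)
print("mask values:", target["masks"].unique())   # expected: only 0 and 1
print("boxes:", target["boxes"])                  # expected: x2 > x1, y2 > y1
widths = target["boxes"][:, 2] - target["boxes"][:, 0]
heights = target["boxes"][:, 3] - target["boxes"][:, 1]
print("degenerate boxes present:", bool((widths <= 0).any() or (heights <= 0).any()))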
I have a .txt file with 683,500 rows; every 7 rows is a different person, containing:
ID
Name
Work position
Date 1 (year - month)
Date 2 (year - month)
Gross payment
Service time
I would like to read that .txt and output (could be JSON, CSV, TXT, or even a database) every person as a single row of 7 columns, for example:
ID Name Work position Date 1 Date 2 Gross payment Service time
ID Name Work position Date 1 Date 2 Gross payment Service time
ID Name Work position Date 1 Date 2 Gross payment Service time
ID Name Work position Date 1 Date 2 Gross payment Service time
Example in the txt:
00000000886
MANUEL DE JESUS SUBERVI PEÑA
MAESTRO MEDIA GENERAL
2006-08
2021-09
30,556.04
15.7
00000000086
MANUEL DE JESUS SUBERVI PEÑA
MAESTRO MEDIA GENERAL
2006-01
2021-09
30,556.04
15.7
00100000086
MANUEL DE JESUS SUBERVI PEÑA
MAESTRO MEDIA GENERAL
2006-01
2021-09
30,556.04
15.7
import csv
#opening file
file = open (r"C:\Users\Redford\Documents\Proyecto automatizacion\data1.txt") #open file
counter = 0
total_lines = len(file.readlines()) #count lines
#print('Total lines:', x)
#reading from file
content = file.read()
colist = content.split ()
print(colist)
#read data from data1.txt and write in data2.txt
lines = open (r"C:\Users\Redford\Documents\Proyecto automatizacion\data1.txt")
arr = []
with open('data2.txt', 'w') as f:
    for line in lines:
        #arr.append(line)
        f.write(line)
I'm new to programming and I don't know how to translate my logic into code.
Your code does not collect multiple lines to write them into one.
Use this approach:
read your file line by line
collect each line without a \n into a list
if list reaches 7 length, write into csv and clear list
repeat until done
Create data file:
with open ("t.txt","w") as f:
f.write("""00000000886\nMANUEL DE JESUS SUBERVI PEÑA\nMAESTRO MEDIA GENERAL\n2006-08\n2021-09\n30,556.04\n15.7
00000000086\nMANUEL DE JESUS SUBERVI PEÑA\nMAESTRO MEDIA GENERAL\n2006-01\n2021-09\n30,556.04\n15.7
00100000086\nMANUEL DE JESUS SUBERVI PEÑA\nMAESTRO MEDIA GENERAL\n2006-01\n2021-09\n30,556.04\n15.7""")
Program:
import csv
with open("t.csv","w",newline="") as wr, open("t.txt") as r:
# create a csv writer
writer = csv.writer(wr)
# uncomment if you want a header over your data
# h = ["ID","Name","Work position","Date 1","Date 2",
# "Gross payment","Service time"]
# writer.writerow(h)
person = []
for line in r: # could use enumerate as well, this works ok
# collect line data minus the \n into list
person.append(line.strip())
# this person is finished, write, clear list
if len(person) == 7:
# leveraged the csv module writer, look it up if you need
# to customize it further regarding quoting etc
writer.writerow(person)
person = [] # reset list for next person
# something went wrong, your file is inconsistent, write remainder
if person:
writer.writerow(person)
print(open("t.csv").read())
Output:
00000000886,MANUEL DE JESUS SUBERVI PEÑA,MAESTRO MEDIA GENERAL,2006-08,2021-09,"30,556.04",15.7
00000000086,MANUEL DE JESUS SUBERVI PEÑA,MAESTRO MEDIA GENERAL,2006-01,2021-09,"30,556.04",15.7
00100000086,MANUEL DE JESUS SUBERVI PEÑA,MAESTRO MEDIA GENERAL,2006-01,2021-09,"30,556.04",15.7
Readup: csv module - writer
The "Gross payment" needs to be quoted because it contain s a ',' wich is the delimiter for csv - the module does this automagically.
On top of the excellent answer from #PatrickArtner, I would like to propose an itertools-based solution:
import csv
import itertools
def file_grouper_itertools(
        in_filepath="t.txt",
        out_filepath="t.csv",
        size=7):
    with open(in_filepath, 'r') as in_file,\
            open(out_filepath, 'w') as out_file:
        writer = csv.writer(out_file)
        args = [iter(in_file)] * size
        for block in itertools.zip_longest(*args, fillvalue=' '):
            # equivalent, for the given input, to:
            # block = [x.rstrip('\n') for x in block]
            block = ''.join(block).rstrip('\n').split('\n')
            writer.writerow(block)
The idea there is to loop in blocks of the required size.
For larger group sizes this gets faster simply because the main loop executes fewer times.
Running some micro-benchmarking shows that your use case should benefit from this approach compared to the manual looping (adapted into a function):
import csv
def file_grouper_manual(
        in_filepath="t.txt",
        out_filepath="t.csv",
        size=7):
    with open(in_filepath, 'r') as in_file,\
            open(out_filepath, 'w') as out_file:
        writer = csv.writer(out_file)
        block = []
        for line in in_file:
            block.append(line.rstrip('\n'))
            if len(block) == size:
                writer.writerow(block)
                block = []
        if block:
            writer.writerow(block)
Benchmarking:
n = 100_000
k = 7
with open ("t.txt", "w") as f:
for i in range(n):
f.write("\n".join(["0123456"] * k))
%timeit file_grouper_manual()
# 1 loop, best of 5: 325 ms per loop
%timeit file_grouper_itertools()
# 1 loop, best of 5: 230 ms per loop
Alternatively, you could use Pandas, which is very convenient, but requires that all the input fit into available memory (which should not be a problem in your case, but can be for larger inputs):
import numpy as np
import pandas as pd
def file_grouper_pandas(in_filepath="t.txt", out_filepath="t.csv", size=7):
    with open(in_filepath) as in_file:
        data = [x.rstrip('\n') for x in in_file.readlines()]
    df = pd.DataFrame(np.array(data).reshape((-1, size)), columns=list(range(size)))
    # consistent with the other solutions
    df.to_csv(out_filepath, header=False, index=False)
%timeit file_grouper_pandas()
# 1 loop, best of 5: 666 ms per loop
If you do a lot of work with tables and data, NumPy and Pandas are really useful libraries to get comfortable with.
import numpy as np
import pandas as pd
columns = ['ID', 'Name' , 'Work position', 'Date 1 (year - month)', 'Date 2 (year - month)',
'Gross payment', 'Service time']
with open('oldfile.txt', 'r') as stream:
    # read file into a list of lines
    lines = stream.readlines()
# remove newline character from each element of the list.
lines = [line.strip('\n') for line in lines]
# Figure out how many rows there will be in the table
number_of_people = len(lines) // 7
# Split data into rows
data = np.array_split(lines, number_of_people)
# Convert data to pandas dataframe
df = pd.DataFrame(data, columns=columns)
Once you have converted the data to a Pandas Dataframe, you can easily output it to any of the formats you listed. For example to output to csv you can do:
df.to_csv('newfile.csv')
Or for JSON it would be:
df.to_json('newfile.json')
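One small follow-up worth noting (not in the original answer): the 'Gross payment' column comes back as a string because of the thousands separator, so if you need it as a number you can clean it with pandas, for example:
# strip the thousands separator and convert to float
df['Gross payment'] = df['Gross payment'].str.replace(',', '', regex=False).astype(float)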
With MNIST I have a single file with the labels and a single file for the training data, so I simply do:
self.data = datasets.MNIST(root='./data', train=True, download=True)
Basically I create a set of labels (from 0-9) and save the index of each image in the data structure, to create my custom tasks:
def make_tasks(self):
    self.task_to_examples = {}  # task 0-9
    self.all_tasks = set(self.data.train_labels.numpy())
    for i, digit in enumerate(self.data.train_labels.numpy()):
        if str(digit) not in self.task_to_examples:
            self.task_to_examples[str(digit)] = []
        self.task_to_examples[str(digit)].append(i)
I don't understand how to do the same thing using CIFAR10, because it is divided into 5 batches; I would like all the data in a single structure.
If your desired structure is {"class_id": [indices of the samples]}, then for CIFAR10 you can do something like this:
import numpy as np
import torchvision
# set root accordingly
cifar = torchvision.datasets.CIFAR10(root=".", train=True, download=True)
# CIFAR10 stores its labels as a plain Python list, so convert them to an array first
targets = np.array(cifar.targets)
task_to_examples = {
    str(task_id): np.where(targets == task_id)[0].tolist()
    for task_id in np.unique(targets)
}
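As a purely illustrative usage example (not part of the original answer), you can then build a per-task loader from those indices with torch.utils.data.Subset; the transform is added so the DataLoader can collate the samples:
import torch
import torchvision
from torchvision import transforms
# re-create the dataset with a tensor transform so it can be batched
cifar = torchvision.datasets.CIFAR10(root=".", train=True, download=True,
                                     transform=transforms.ToTensor())
task_id = "3"  # any key of task_to_examples
task_subset = torch.utils.data.Subset(cifar, task_to_examples[task_id])
task_loader = torch.utils.data.DataLoader(task_subset, batch_size=32, shuffle=True)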
I am working on a time series problem. Different training time series are stored in a large JSON file with a size of 30 GB. In TensorFlow I know how to use TFRecords. Is there a similar way in PyTorch?
I suppose IterableDataset (docs) is what you need, because:
you probably want to traverse files without random access;
number of samples in jsons is not pre-computed.
I've made a minimal usage example under the assumption that every line of the dataset file is itself a JSON object, but you can change the logic.
import json
from torch.utils.data import DataLoader, IterableDataset
class JsonDataset(IterableDataset):
    def __init__(self, files):
        self.files = files

    def __iter__(self):
        for json_file in self.files:
            with open(json_file) as f:
                for sample_line in f:
                    sample = json.loads(sample_line)
                    yield sample['x'], sample['time'], ...

...

dataset = JsonDataset(['data/1.json', 'data/2.json', ...])
dataloader = DataLoader(dataset, batch_size=32)

for batch in dataloader:
    y = model(batch)
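One caveat worth adding (my note, not part of the original answer): with num_workers > 0 every DataLoader worker runs the same __iter__, so each file would be read once per worker. A common fix is to shard the file list per worker inside __iter__, roughly like this:
import json
from torch.utils.data import IterableDataset, get_worker_info

class ShardedJsonDataset(IterableDataset):
    def __init__(self, files):
        self.files = files

    def __iter__(self):
        worker = get_worker_info()
        files = self.files
        if worker is not None:
            # each worker only reads its own slice of the file list
            files = files[worker.id::worker.num_workers]
        for json_file in files:
            with open(json_file) as f:
                for sample_line in f:
                    yield json.loads(sample_line)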
Generally, you do not need to change/overload the default data.DataLoader.
What you should look into is how to create a custom data.Dataset.
Once you have your own Dataset that knows how to extract item-by-item from the json file, you feed it to the "vanilla" data.DataLoader and all the batching/multi-processing etc. is done for you based on the dataset you provided.
If, for example, you have a folder with several json files, each containing several examples, you can have a Dataset that looks like:
import bisect
class MyJsonsDataset(data.Dataset):
    def __init__(self, jfolder):
        super(MyJsonsDataset, self).__init__()
        self.filenames = []  # keep track of the jfiles you need to load
        self.cumulative_sizes = [0]  # keep track of number of examples viewed so far
        # this is not actually python code - just pseudo code for you to follow
        for each jsonfile in jfolder:
            self.filenames.append(jsonfile)
            l = number of examples in jsonfile
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + l)
        # discard the first element
        self.cumulative_sizes.pop(0)

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        # first you need to know which of the files holds the idx example
        jfile_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if jfile_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[jfile_idx - 1]
        # now you need to retrieve the `sample_idx` example from self.filenames[jfile_idx]
        return retrieved_example
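For completeness, a runnable version of that pseudocode might look like the sketch below (my adaptation, not the original author's code); it assumes each json file holds a list of examples, so adjust the loading and the per-example extraction to your actual format:
import bisect
import glob
import json
import os
from torch.utils import data

class MyJsonsDataset(data.Dataset):
    def __init__(self, jfolder):
        super().__init__()
        self.filenames = []
        self.cumulative_sizes = []
        total = 0
        for jsonfile in sorted(glob.glob(os.path.join(jfolder, "*.json"))):
            self.filenames.append(jsonfile)
            with open(jsonfile) as f:
                total += len(json.load(f))   # assumes each file contains a list of examples
            self.cumulative_sizes.append(total)

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        # find which file holds the idx-th example
        jfile_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        sample_idx = idx if jfile_idx == 0 else idx - self.cumulative_sizes[jfile_idx - 1]
        with open(self.filenames[jfile_idx]) as f:
            examples = json.load(f)   # reloading per item is simple but slow; cache if needed
        return examples[sample_idx]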