Loading multiple CSVs with mixed dtypes in TensorFlow for training

I have hundreds of CSVs (with headers) in a directory, and I am trying to build a feedforward NN with TensorFlow for regression.
What's the best way to import these CSVs and train on them with tf?
Could you also check whether my preprocessing is done right?
Note: my features have mixed datatypes (int, float, string); my target is a float.
I cannot concatenate the CSVs and import them with pandas: the data is >50 GB, so it won't fit in memory and has to be read iteratively from disk.
Directory Path:
./data/train/ -> 100s of csvs
./data/test -> 100s of csvs
./data/valid -> 100s of csvs
Methodology:
Create a generator
Use the Dataset API to load the data
Preprocess the data (embedding, one-hot, etc.)
Train with fit
But in the generator I was only able to specify output formats where the inputs/outputs have homogeneous dtypes.
Code:
def data_generator(file_list, batch_size=2):
    i = 0
    while True:  # run the generator indefinitely
        if i * batch_size >= len(file_list):  # all files consumed: restart
            i = 0
            np.random.shuffle(file_list)
        else:
            file_chunk = file_list[i * batch_size:(i + 1) * batch_size]
            data = []
            labels = []
            for file in file_chunk:
                temp = pd.read_csv(open(file, 'r'))  # change this line to read any other type of file
                label = temp.pop('ACTUAL_BOXES')  # separate name, so the labels list is not clobbered
                data.append(temp.values)  # convert column data to matrix-like data with one channel
                labels.append(label.values)
            data = np.asarray(data)
            labels = np.asarray(labels)
            yield data, labels  # data holds mixed-dtype arrays; labels is a float-dtype array
            i = i + 1
# getting the list of files inside each directory
train_file_list = np.sort(glob.glob('./data/train/*.csv'))
test_file_list = np.sort(glob.glob('./data/test/*.csv'))
val_file_list = np.sort(glob.glob('./data/valid/*.csv'))
# my sample data and labels will look like this:
# data = ['a','b',1,2,3.14,2]  # mixed dtypes
# labels = [1.0]               # float
train_dataset = tf.data.Dataset.from_generator(data_generator, args=[train_file_list, 2],
                                               output_types=(tf.float32, tf.float32))  # This is where I am stuck
val_dataset = tf.data.Dataset.from_generator(data_generator, args=[val_file_list, 2],
                                             output_types=(tf.float32, tf.float32))  # This is where I am stuck
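One way around the homogeneous-dtype limitation (a sketch, not tested on your data: 'str_col', 'int_col' and 'float_col' are placeholder column names, and pandas reads ints/floats as int64/float64 by default) is to yield a dict of per-column arrays and declare a matching dict of output_types, so string and numeric columns each keep their own dtype:

def dict_generator(file_list, batch_size=2):
    for file in file_list:
        for chunk in pd.read_csv(file, chunksize=batch_size):
            labels = chunk.pop('ACTUAL_BOXES').to_numpy(dtype='float32')
            features = {}
            for name in chunk.columns:
                col = chunk[name]
                # hand TF a unicode array for string columns, numerics as-is
                features[name] = col.to_numpy(dtype='U') if col.dtype == object else col.to_numpy()
            yield features, labels

output_types = (
    {'str_col': tf.string, 'int_col': tf.int64, 'float_col': tf.float64},  # placeholders
    tf.float32,
)
# a lambda avoids args=..., which would hand the generator filenames as numpy bytes
train_dataset = tf.data.Dataset.from_generator(
    lambda: dict_generator(train_file_list, batch_size=2),
    output_types=output_types)

A dict of features is also exactly what the DenseFeatures layer built in encode_inputs below expects as model input.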
# Preprocessing part:
def encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES):
    '''Function for encoding the features'''
    encoded_features = []
    for feature_name in EMBEDDING_FEATURES:
        # getting the unique vocab list
        vocabulary = np.array(list(flatten(vocab_list[feature_name])))
        # categorical column using the list created above:
        cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # create an embedding from the categorical column:
        cat_emb = tf.feature_column.embedding_column(cat_col, 8)  # , dimension=embedding_dims
        # add the embedding to the list of feature columns
        encoded_features.append(cat_emb)
    for feature_name in INDICATOR_FEATURES:
        # getting the unique vocab list
        vocabulary = list(flatten(vocab_list[feature_name]))
        # indicator column using the list created above:
        ind_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        # create a one-hot indicator from the categorical column:
        cat_one_hot = tf.feature_column.indicator_column(ind_col)
        # add the one-hot column to the list of feature columns
        encoded_features.append(cat_one_hot)
    # create the input layer for the model
    feature_layer = tf.keras.layers.DenseFeatures(encoded_features)
    return feature_layer

# Opening the JSON file that contains the vocab list for the string columns
f = open('./vocab_list.json')  # file that contains the unique values of each feature
vocab_list = json.load(f)
features_layer = encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES)
# Model part
model = tf.keras.models.Sequential([
    features_layer,
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1)
])
m_loss = tf.keras.losses.mean_squared_error
m_optimizer = tf.keras.optimizers.SGD(lr=1e-3)
batch_size = 32
model.compile(loss=m_loss, optimizer=m_optimizer, metrics=['accuracy'])
model.fit(train_dataset, epochs=10, validation_data=val_dataset)
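For what it's worth, tf.data also ships a CSV reader that handles many files and mixed dtypes without a hand-written generator. A minimal sketch (the label column name is taken from the generator above; everything else is an assumption):

# make_csv_dataset infers per-column dtypes from the files and streams from
# disk, so the >50 GB never has to fit in memory
train_dataset = tf.data.experimental.make_csv_dataset(
    file_pattern='./data/train/*.csv',
    batch_size=32,
    label_name='ACTUAL_BOXES',
    num_epochs=1,
    shuffle=True)
# each element is (dict of column-name -> tensor, label tensor), which is
# exactly the structure the DenseFeatures layer above expects as input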

Related

NMT: 'KerasTensor' object is not callable

Here I share a code snippet for training an encoder-decoder model for machine translation. While reusing the previously trained Embedding layer in inference mode (on test_data), it threw the following error:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(eng_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as the initial state.
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(deu_vocab_size, latent_dim, mask_zero=True)(decoder_inputs)
# The decoder returns full output sequences and its internal states as well.
# We don't use the return states in the training model,
# but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(deu_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# The tensors below will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb(decoder_inputs)  # "reusing" the embedding -- this line triggers the error below
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)  # reusing the LSTM layer
decoder_outputs2 = decoder_dense(decoder_outputs2)  # softmax layer to generate a prob. dist. over the target vocab
# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2])
ERROR
      8 decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
---> 10 dec_emb2 = dec_emb(decoder_inputs) # reusing embedding layer
     11
     12 decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs) # reusing lstm layer
TypeError: 'KerasTensor' object is not callable
I read through the various solutions available for this issue, but couldn't understand what two modes of the model they were talking about, or what their solution was effectively doing.
Please explain in detail. Thanks in advance.
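For what it's worth, the "two modes" those answers refer to are simply two Models built over the same layers: one wired for training (with teacher forcing) and one re-wired for step-by-step inference; the layers and their weights are shared, only the graph plumbing differs. The usual fix for the error above (a sketch against the variable names in the snippet) is to keep a handle to the Embedding layer and call that, because dec_emb is the layer's output tensor (a KerasTensor), not the layer itself, and tensors are not callable:

# keep a handle to the Embedding *layer*; `dec_emb` above is its output tensor
dec_emb_layer = Embedding(deu_vocab_size, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)   # used when building the training model
# later, when building the inference decoder, call the layer again:
dec_emb2 = dec_emb_layer(decoder_inputs)  # reuses the trained weights
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_outputs2 = decoder_dense(decoder_outputs2)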

How do I apply NLP to the search engine I’m building using MySQL as data storage

I’m working on a search engine project for my country. I have the country’s list of site domains to crawl, and I have built a Python bot that crawls some of the sites. When a crawl succeeds, the crawler commits the crawled content to a MySQL database, so there is already data that people can search for on the remote MySQL server.
Now I want to implement NLP in the search, such that when a user enters a keyword in the search box, relevant results from the MySQL database are shown based on that keyword. I’m using Python 3.8 and NLTK for this project. I haven’t done any NLP before; this is my first time, though I have read about it. I also want to ask whether MySQL is the right storage choice for a search engine. If not, why not, and what should I use instead? I’m currently using MySQL because I’m much more familiar with it and enjoy using it for data storage. I’ve been struggling with this since last December. What I really need is the right NLP approach for selecting relevant results from the MySQL database. I know NLP is difficult to implement, but I would appreciate any help.
Here’s the code I have so far. I copied it from the Kaggle notebook at https://www.kaggle.com/amitkumarjaiswal/nlp-search-engine/notebook, but I still haven’t been able to make it work for my own project.
import pandas as pd
import numpy as np
import string
import random
import nltk
import os
import re
#import nltk.corpus
import csv
#nltk.download('all')
#print(os.listdir(nltk.data.find("corpora")))
#pip install --upgrade nltk
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer

#load 10k reuters news documents
len(reuters.fileids())
#view text from one document
reuters.raw(fileids=['test/14826'])[0:201]

exclude = set(string.punctuation)
alldocslist = []
for index, i in enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    text = ''.join(ch for ch in text if ch not in exclude)
    alldocslist.append(text)
print(alldocslist[1])
#tokenize words in all docs
#note: the original notebook used `plot_data = [[]] * len(alldocslist)`, which
#aliases every sublist and relies on a stale `index`; building the list of
#token lists directly is equivalent and less fragile
plot_data = [[word_tokenize(doc) for doc in alldocslist]]
print(plot_data[0][1])
# Navigation: first index gives all documents, second index gives a specific document, third index gives the words of that doc
plot_data[0][1][0:10]

#make all words lower case in all docs
for x in range(len(reuters.fileids())):
    lowers = [word.lower() for word in plot_data[0][x]]
    plot_data[0][x] = lowers
plot_data[0][1][0:10]
# remove stop words from all docs
stop_words = set(stopwords.words('english'))
for x in range(len(reuters.fileids())):
    filtered_sentence = [w for w in plot_data[0][x] if not w in stop_words]
    plot_data[0][x] = filtered_sentence
plot_data[0][1][0:10]

#stem words EXAMPLE (could try other stemmers / lemmatizers)
snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [snowball_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]
porter_stemmer = PorterStemmer()
stemmed_sentence = [porter_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]

# Create an inverse index which gives, for each word, the documents and positions where it appears
# first we need to create a list of all words
l = plot_data[0]
flatten = [item for sublist in l for item in sublist]
words = flatten
wordsunique = list(set(words))
import math
from textblob import TextBlob as tb

def tf(word, doc):
    return doc.count(word) / len(doc)

def n_containing(word, doclist):
    return sum(1 for doc in doclist if word in doc)

def idf(word, doclist):
    return math.log(len(doclist) / (0.01 + n_containing(word, doclist)))

def tfidf(word, doc, doclist):
    return tf(word, doc) * idf(word, doclist)

# THIS ONE-TIME INDEXING IS THE MOST PROCESSOR-INTENSIVE STEP AND WILL TAKE TIME TO RUN (BUT ONLY NEEDS TO BE RUN ONCE)
plottest = plot_data[0][0:1000]
worddic = {}
for doc in plottest:
    for word in wordsunique:
        if word in doc:
            word = str(word)
            index = plottest.index(doc)
            positions = list(np.where(np.array(plottest[index]) == word)[0])
            idfs = tfidf(word, doc, plottest)
            try:
                worddic[word].append([index, positions, idfs])
            except KeyError:
                worddic[word] = []
                worddic[word].append([index, positions, idfs])
# the index is a dict with each word as a KEY and a list of [doc index, word positions, tf-idf score] as VALUES
worddic['china']
# pickle (save) the dictionary to avoid re-calculating
np.save('worddic_1000.npy', worddic)
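Since the point of saving is to skip re-indexing on the next run, note that loading the dict back needs allow_pickle (np.save wraps a dict in a 0-d object array):

worddic = np.load('worddic_1000.npy', allow_pickle=True).item()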
# create a word search which takes multiple words and finds documents that contain them, along with metrics for ranking:
## (1) Number of occurrences of the search words
## (2) TF-IDF score of the search words
## (3) Percentage of search terms present
## (4) Word-ordering score
## (5) Exact match bonus
from collections import Counter

def search(searchsentence):
    try:
        # split the sentence into individual words
        searchsentence = searchsentence.lower()
        try:
            words = searchsentence.split(' ')
        except:
            words = list(words)
        enddic = {}
        idfdic = {}
        closedic = {}
        # remove words that are not in worddic
        realwords = []
        for word in words:
            if word in list(worddic.keys()):
                realwords.append(word)
        words = realwords
        numwords = len(words)
        # metric: number of occurrences of all words in each doc & largest total IDF
        for word in words:
            for indpos in worddic[word]:
                index = indpos[0]
                amount = len(indpos[1])
                idfscore = indpos[2]
                enddic[index] = amount
                idfdic[index] = idfscore
        fullcount_order = sorted(enddic.items(), key=lambda x: x[1], reverse=True)
        fullidf_order = sorted(idfdic.items(), key=lambda x: x[1], reverse=True)
        # metric: what percentage of the search words appear in each doc
        combo = []
        alloptions = {k: worddic.get(k, None) for k in words}
        for worddex in list(alloptions.values()):
            for indexpos in worddex:
                for indexz in indexpos:
                    combo.append(indexz)
        comboindex = combo[::3]
        combocount = Counter(comboindex)
        for key in combocount:
            combocount[key] = combocount[key] / numwords
        combocount_order = sorted(combocount.items(), key=lambda x: x[1], reverse=True)
        # metric: whether the words appear in the same order as in the search
        if len(words) > 1:
            x = []
            y = []
            for record in [worddic[z] for z in words]:
                for index in record:
                    x.append(index[0])
            for i in x:
                if x.count(i) > 1:
                    y.append(i)
            y = list(set(y))
            closedic = {}
            for wordbig in [worddic[x] for x in words]:
                for record in wordbig:
                    if record[0] in y:
                        index = record[0]
                        positions = record[1]
                        try:
                            closedic[index].append(positions)
                        except KeyError:
                            closedic[index] = []
                            closedic[index].append(positions)
            x = 0
            fdic = {}
            for index in y:
                csum = []
                for seqlist in closedic[index]:
                    while x > 0:
                        secondlist = seqlist
                        x = 0
                        sol = [1 for i in firstlist if i + 1 in secondlist]
                        csum.append(sol)
                        fsum = [item for sublist in csum for item in sublist]
                        fsum = sum(fsum)
                        fdic[index] = fsum
                        fdic_order = sorted(fdic.items(), key=lambda x: x[1], reverse=True)
                    while x == 0:
                        firstlist = seqlist
                        x = x + 1
        else:
            fdic_order = 0
        # the metric above should also get a big boost if ALL words are found together
        # could add another metric for words that are not adjacent but still close
        return (searchsentence, words, fullcount_order, combocount_order, fullidf_order, fdic_order)
    except:
        return ("")
search('indonesia crude palm oil')[1]
# index 0 of the return gives back the search term; the rest give back the metrics (see above)
search('indonesia crude palm oil')[1][1:10]

# save metrics to a dataframe for use in ranking and machine learning
result1 = search('china daily says what')
result2 = search('indonesia crude palm oil')
result3 = search('price of nickel')
result4 = search('north yemen sugar')
result5 = search('nippon steel')
result6 = search('China')
result7 = search('Gold')
result8 = search('trade')
df = pd.DataFrame([result1, result2, result3, result4, result5, result6, result7, result8])
df.columns = ['search term', 'actual_words_searched', 'num_occur', 'percentage_of_terms', 'tf-idf', 'word_order']
df
# look to see if the top documents seem to make sense
alldocslist[1]
# create a simple (non-machine-learning) rank-and-return function
def rank(term):
    results = search(term)
    # get the metrics
    num_score = results[2]
    per_score = results[3]
    tfscore = results[4]
    order_score = results[5]
    final_candidates = []
    # rule 1: if high word-order score & 100% of the terms are present, put at the top position
    try:
        first_candidates = []
        for candidates in order_score:
            if candidates[1] > 1:
                first_candidates.append(candidates[0])
        second_candidates = []
        for match_candidates in per_score:
            if match_candidates[1] == 1:
                second_candidates.append(match_candidates[0])
            if match_candidates[1] == 1 and match_candidates[0] in first_candidates:
                final_candidates.append(match_candidates[0])
        # rule 2: next add the other word-order scores greater than 1
        t3_order = first_candidates[0:3]
        for each in t3_order:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)
        # rule 3: next add the top tf-idf results
        final_candidates.insert(len(final_candidates), tfscore[0][0])
        final_candidates.insert(len(final_candidates), tfscore[1][0])
        # rule 4: next add the other high percentage scores
        t3_per = second_candidates[0:3]
        for each in t3_per:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)
        # rule 5: next add any other top results for the metrics
        othertops = [num_score[0][0], per_score[0][0], tfscore[0][0], order_score[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)
    # unless a single term was searched, in which case just return the top results
    except:
        othertops = [num_score[0][0], num_score[1][0], num_score[2][0], per_score[0][0], tfscore[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)
    for index, results in enumerate(final_candidates):
        if index < 5:
            print("RESULT", index + 1, ":", alldocslist[results][0:100], "...")

# example of output
rank('indonesia palm oil')
# example of output
rank('china')
# Create a pseudo-truth set using the first 5 words of each doc
# Because I don't have a truth set I will generate a pseudo one by pulling terms from the documents - this is far from perfect
# as it may not approximate people's actual queries well, but it will serve to build the ML architecture
df_truth = pd.DataFrame()
for doc in plottest:
    first_five = doc[0:5]
    test_sentence = ' '.join(first_five)
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth = pd.concat([df_truth, df_temp])
df_truth['truth'] = range(0, len(plottest))

# create another pseudo-truth set using a random 3-word sequence from each doc
df_truth1 = pd.DataFrame()
seqlen = 3
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth1 = pd.concat([df_truth1, df_temp])
df_truth1['truth'] = range(0, len(plottest))

# create another pseudo-truth set using a random 4-word sequence from each doc
df_truth2 = pd.DataFrame()
seqlen = 4
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth2 = pd.concat([df_truth2, df_temp])
df_truth2['truth'] = range(0, len(plottest))

# create another pseudo-truth set using a random 2-word sequence from each doc
df_truth3 = pd.DataFrame()
seqlen = 2
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth3 = pd.concat([df_truth3, df_temp])
df_truth3['truth'] = range(0, len(plottest))

# combine the truth sets and save to disk
truth_set = pd.concat([df_truth, df_truth1, df_truth2, df_truth3])
truth_set.columns = ['search term', 'actual_words_searched', 'num_occur', 'percentage_of_terms', 'tf-idf', 'word_order', 'truth']
truth_set.to_csv("truth_set_final.csv")
truth_set[0:10]
truth_set
test_set = truth_set[0:3]
test_set

# convert to long format for ML
# WARNING: AGAIN THIS IS A SLOW PROCESS DUE TO RAM ILOC - COULD BE OPTIMISED FOR FASTER PERFORMANCE
# BUG: when min(maxnum, len(truth_set.iloc[row][col])) hits an int rather than a list (because of a very short variable length)
# row is the row
# col is the variable
# i is the result
final_set = pd.DataFrame()
test_set = truth_set[1:100]
maxnum = 5
for row in range(0, len(test_set.index)):
    test_set = truth_set[1:100]
    for col in range(2, 6):
        for i in range(0, min(maxnum, len(truth_set.iloc[row][col]))):
            x = pd.DataFrame([truth_set.iloc[row][col][i]])
            x['truth'] = truth_set.iloc[row]['truth']
            x.columns = [(str(truth_set.columns[col]), "index", i), (str(truth_set.columns[col]), "score", i), 'truth']
            test_set = test_set.merge(x, on='truth')
    final_set = pd.concat([final_set, test_set])
final_set.head()
final_set.to_csv("ML_set_100.csv")
final_set2 = final_set.drop(['actual_words_searched', 'num_occur', 'percentage_of_terms', 'search term', 'tf-idf', 'word_order'], axis=1)
final_set2.to_csv("ML_set_100_3.csv")
final_set2.head()
final_set3 = final_set2
final_set3[0:10]
Obviously, the code above searches the Reuters corpus rather than returning keyword results from my MySQL database. I hope that makes the problem clear. Thank you very much!
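A sketch of the missing bridge (the connection settings, the pages table, and its id/content columns are made-up placeholders for whatever your crawler's schema is): fetch the crawled documents out of MySQL into alldocslist, and the tokenising/indexing/searching code above then runs unchanged over your own data instead of the Reuters corpus.

import mysql.connector  # pip install mysql-connector-python

# hypothetical connection and schema -- substitute your own
conn = mysql.connector.connect(
    host='localhost', user='user', password='password', database='search_engine')
cursor = conn.cursor()
cursor.execute("SELECT id, content FROM pages")

doc_ids = []
alldocslist = []  # this feeds straight into the pipeline above
for page_id, content in cursor:
    doc_ids.append(page_id)  # remember which MySQL row each doc index maps to
    alldocslist.append(content)
conn.close()
# from here, run the same tokenise / stop-word / index-building steps,
# and translate a ranked doc index back to a row via doc_ids[index]

On the storage question: MySQL is fine as the document store, and its built-in FULLTEXT indexes (or a dedicated engine such as Elasticsearch) may serve you better at scale than a hand-rolled index, but the sketch above keeps your current NLTK pipeline intact.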

How to handle large JSON file in Pytorch?

I am working on a time series problem. The various training time series are stored in a large JSON file of about 30 GB. In TensorFlow I know how to use TFRecords. Is there a similar way in PyTorch?
I suppose IterableDataset (docs) is what you need, because:
you probably want to traverse the files without random access;
the number of samples in the JSONs is not pre-computed.
I've made a minimal usage example under the assumption that every line of the dataset file is a JSON object itself, but you can change the logic.
import json
from torch.utils.data import DataLoader, IterableDataset

class JsonDataset(IterableDataset):
    def __init__(self, files):
        self.files = files

    def __iter__(self):
        for json_file in self.files:
            with open(json_file) as f:
                for sample_line in f:
                    sample = json.loads(sample_line)
                    yield sample['x'], sample['time'], ...

dataset = JsonDataset(['data/1.json', 'data/2.json', ...])
dataloader = DataLoader(dataset, batch_size=32)

for batch in dataloader:
    y = model(batch)
Generally, you do not need to change/overload the default data.DataLoader.
What you should look into is how to create a custom data.Dataset.
Once you have your own Dataset that knows how to extract items one by one from the json file, you feed it to the "vanilla" data.DataLoader, and all the batching/multi-processing etc. is done for you based on the dataset you provide.
If, for example, you have a folder with several json files, each containing several examples, you can have a Dataset that looks like:
import bisect
import glob
import json
import os
from torch.utils import data

class MyJsonsDataset(data.Dataset):
    def __init__(self, jfolder):
        super(MyJsonsDataset, self).__init__()
        self.filenames = []          # keep track of the jfiles you need to load
        self.cumulative_sizes = [0]  # keep track of the number of examples seen so far
        # the original answer sketched this loop as pseudo-code; glob + json.load
        # is one concrete way to fill it in, assuming each file holds a JSON array of examples
        for jsonfile in sorted(glob.glob(os.path.join(jfolder, '*.json'))):
            self.filenames.append(jsonfile)
            with open(jsonfile) as f:
                l = len(json.load(f))  # number of examples in jsonfile
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + l)
        # discard the first element
        self.cumulative_sizes.pop(0)

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        # first you need to know which of the files holds the idx example
        jfile_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if jfile_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[jfile_idx - 1]
        # now retrieve the `sample_idx` example from self.filenames[jfile_idx]
        # (re-opening the file per item is simple but slow; cache if needed)
        with open(self.filenames[jfile_idx]) as f:
            return json.load(f)[sample_idx]
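Usage is then the standard map-style recipe (the folder name is an assumption); the default DataLoader takes care of batching, shuffling and workers:

from torch.utils.data import DataLoader

dataset = MyJsonsDataset('data/')  # hypothetical folder of json files
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)
for batch in loader:
    pass  # feed `batch` to your model here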

How do I get CSV files into an Estimator in Tensorflow 1.6

I am new to tensorflow (and this is my first question on StackOverflow).
As a learning tool, I am trying to do something simple. (4 days later I am still confused.)
I have one CSV file with 36 columns (3500 records) of 0s and 1s.
I am envisioning this file as a flattened 6x6 matrix.
I have another CSV file with 1 column of ground truth, 0 or 1 (3500 records), which indicates whether at least 4 of the 6 elements on the 6x6 matrix's diagonal are 1s.
I am not sure I have processed the CSV files correctly.
I am confused as to how I create the features dictionary and labels, and how they fit into the DNNClassifier.
I am using TensorFlow 1.6 and Python 3.6.
Below is the small amount of code I have so far.
import tensorflow as tf
import os

def x_map(line):
    rDefaults = [[] for cl in range(36)]
    x_row = tf.decode_csv(line, record_defaults=rDefaults)
    return x_row

def y_map(line):
    line = tf.string_to_number(line, out_type=tf.int32)
    y_row = tf.one_hot(line, depth=2)
    return y_row

x_path_file = os.path.join('D:', 'Diag', '6x6_train.csv')
y_path_file = os.path.join('D:', 'Diag', 'HasDiag_train.csv')

filenames = [x_path_file]
x_dataset = tf.data.TextLineDataset(filenames)
x_dataset = x_dataset.map(x_map)
x_dataset = x_dataset.batch(1)
x_iter = x_dataset.make_one_shot_iterator()
x_next_el = x_iter.get_next()

filenames = [y_path_file]
y_dataset = tf.data.TextLineDataset(filenames)
y_dataset = y_dataset.map(y_map)
y_dataset = y_dataset.batch(1)
y_iter = y_dataset.make_one_shot_iterator()
y_next_el = y_iter.get_next()

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    x_el = (sess.run(x_next_el))
    y_el = (sess.run(y_next_el))
The output for x_el is:
(array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([1.], dtype=float32), array([0.] ... it goes on...
The output for y_el is:
[[1. 0.]]
You're pretty much there for a minimal working model. The main issue I see is that tf.decode_csv returns a tuple of tensors, whereas I expect you want a single tensor with all values. Easy fix:
x_row = tf.stack(tf.decode_csv(line, record_defaults=rDefaults))
That should work... but it fails to take advantage of many of the awesome things the tf.data.Dataset API has to offer, like shuffling, parallel threading, etc. For example, if you shuffle each dataset, those shuffling operations won't be consistent. This is because you've created two separate datasets and manipulated them independently. If you create them, zip them together, and then manipulate them, those manipulations will be consistent.
Try something along these lines:
def get_inputs(
        count=None, shuffle=True, buffer_size=1000, batch_size=32,
        num_parallel_calls=8, x_paths=[x_path_file], y_paths=[y_path_file]):
    """
    Get x, y inputs.

    Args:
        count: number of epochs. None indicates infinite epochs.
        shuffle: whether or not to shuffle the dataset
        buffer_size: used in shuffle
        batch_size: size of batch. See outputs below
        num_parallel_calls: used in map. Note if > 1, intra-batch ordering
            will be shuffled
        x_paths: list of paths to x-value files.
        y_paths: list of paths to y-value files.

    Returns:
        x: (batch_size, 6, 6) tensor
        y: (batch_size, 2) tensor of 1-hot labels
    """

    def x_map(line):
        rDefaults = [[] for cl in range(n_dims**2)]
        x_row = tf.stack(tf.decode_csv(line, record_defaults=rDefaults))
        return x_row

    def y_map(line):
        line = tf.string_to_number(line, out_type=tf.int32)
        y_row = tf.one_hot(line, depth=2)
        return y_row

    def xy_map(x, y):
        return x_map(x), y_map(y)

    x_ds = tf.data.TextLineDataset(x_paths)
    y_ds = tf.data.TextLineDataset(y_paths)
    combined = tf.data.Dataset.zip((x_ds, y_ds))
    combined = combined.repeat(count=count)
    if shuffle:
        combined = combined.shuffle(buffer_size)
    combined = combined.map(xy_map, num_parallel_calls=num_parallel_calls)
    combined = combined.batch(batch_size)
    x, y = combined.make_one_shot_iterator().get_next()
    return x, y
To experiment/debug:
x, y = get_inputs()
with tf.Session() as sess:
    xv, yv = sess.run((x, y))
    print(xv.shape, yv.shape)
For use in an estimator, pass the function itself:
estimator.train(get_inputs, max_steps=10000)

def get_eval_inputs():
    return get_inputs(
        count=1, shuffle=False,
        x_paths=[x_eval_paths],
        y_paths=[y_eval_paths])

estimator.evaluate(get_eval_inputs)

Is num_epochs limited in tensorflow's csv file reader string_input_producer()?

I have a dummy csv file (y = -x + 1):
x,y
1,0
2,-1
3,-2
I try to feed that into a linear regression model. Since I have so few examples, I want to iterate training over that file about 1000 times, so I set num_epochs=1000.
However, it seems that Tensorflow limits this number. It works fine if I use num_epochs=5 or 10, but anything beyond 33 is capped to 33 epochs. Is that true, or am I doing something wrong?
# model = W*x + b
...
optimizer = tf.train.GradientDescentOptimizer(0.01)
train = optimizer.minimize(loss)

# reading input from csv
filename_queue = tf.train.string_input_producer(["/tmp/testinput.csv"], num_epochs=1000)
reader = tf.TextLineReader(skip_header_lines=1)
...
col_x, col_label = tf.decode_csv(csv_row, record_defaults=record_defaults)

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    while True:
        try:
            input_x, input_y = sess.run([col_x, col_label])
            sess.run(train, feed_dict={x: input_x, y: input_y})
            ...
Side question: do I need to do the following?
input_x, input_y = sess.run([col_x, col_label])
sess.run(train, feed_dict={x: input_x, y: input_y})
I have tried sess.run(train, feed_dict={x: col_x, y: col_label}) directly to avoid the friction, but it doesn't work (they are graph nodes, and feed_dict expects regular data).
The following snippet works perfectly (with your input):
import tensorflow as tf

filename_queue = tf.train.string_input_producer(["/tmp/input.csv"], num_epochs=1000)
reader = tf.TextLineReader(skip_header_lines=1)
_, csv_row = reader.read(filename_queue)
col_x, col_label = tf.decode_csv(csv_row, record_defaults=[[0], [0]])

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    num = 0
    try:
        while True:
            sess.run([col_x, col_label])
            num += 1
    except:
        print(num)
Which gives the following output:
edb@lapelidb:/tmp$ python csv.py
3000
That is 3 rows x 1000 epochs = 3000 records read, so num_epochs is not actually capped at 33.
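As for the side question: the usual way to avoid the feed_dict round trip (a sketch; the model was elided in the question, so the W*x + b definition here is an assumption) is to build the model directly on the pipeline tensors, so there are no placeholders to feed:

# define the model on the queue's output tensors rather than on placeholders
W = tf.Variable(0.0)
b = tf.Variable(0.0)
pred = W * tf.to_float(col_x) + b
loss = tf.square(pred - tf.to_float(col_label))
train = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
# now sess.run(train) alone pulls a fresh row from the queue at each step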