How to read json file and fit lstm model? - json

I am trying to apply an LSTM to the HuffPost news dataset. The data is in JSON format (https://www.kaggle.com/rmisra/news-category-dataset). I tried the code below and got errors, and I don't know what is wrong with it.
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import json
from sklearn.preprocessing import LabelBinarizer
with open('News_Category_Dataset_v2.json', 'r') as f:
train = json.load(f)
Y_train = list(train.values())
lb = LabelBinarizer()
X_train = lb.fit_transform(list(train.keys()))
##
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)
##
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
def RNN():
    """Build the LSTM text classifier used by this script.

    Architecture: token-id sequences of length ``max_len`` -> 50-dimensional
    embedding over a ``max_words`` vocabulary -> LSTM(64) -> Dense(256) with
    ReLU -> Dropout(0.5) -> one sigmoid output unit.

    Reads the module-level ``max_words`` and ``max_len`` settings.

    Returns:
        An uncompiled Keras ``Model``.
    """
    # ``Model`` is not among the script's top-level imports, so bring it
    # into scope here to avoid a NameError.
    from keras.models import Model

    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    # Softmax over a single unit is always exactly 1.0, so nothing could be
    # learned; sigmoid gives a real probability and matches the
    # binary_crossentropy loss the model is compiled with.
    # NOTE(review): for a multi-class target, use one unit per class with
    # softmax and categorical_crossentropy instead - confirm against labels.
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model
model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])
model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])
Got these errors
Traceback (most recent call last):
Traceback (most recent call last):
File ".\Hpnews.py", line 30, in <module>
train = json.load(f)
File "C:\Users\a\Anaconda3\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
File "C:\Users\a\Anaconda3\lib\json\__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "C:\Users\a\Anaconda3\lib\json\decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 366)
this is my json file format
"root":{6 items
"category":string"CRIME"
"headline":string"There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV"
"authors":string"Melissa Jeltsen"
"link":string"huffingtonpost.com/entry/…" "short_description":string"She left her husband. He killed their children. Just another day in America."
"date":string"2018-05-26" }

The file is not a single JSON document but ndJSON ("newline-delimited JSON"): every line is its own JSON object, so json.load fails with "Extra data" as soon as it reaches the second line.
You should use pandas to load your data:
import pandas as pd
data = pd.read_json('News_Category_Dataset_v2.json', lines=True)

Related

How to download a trained model as a pickle file using Streamlit Download Button?

How do I download a trained model as a pickle file using Streamlit Download Button?
You can use io.BytesIO to store the pickled data inside bytes in RAM. Then, give these bytes as data argument in the st.download_button function.
import io
import pickle
import streamlit as st
def create_model():
    """Train a small scikit-learn classifier so there is something to pickle.

    Downloads the Pima Indians diabetes CSV, holds out 33% of the rows
    (random seed 7) and fits a ``LogisticRegression`` on the remaining rows.

    Example taken from here:
    https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    import pandas as pd
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression

    url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
    names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
    values = pd.read_csv(url, names=names).values

    # First eight columns are the inputs, the ninth is the label.
    features = values[:, 0:8]
    target = values[:, 8]

    features_train, _, target_train, _ = model_selection.train_test_split(
        features, target, test_size=0.33, random_state=7
    )

    # Fit the model on the training split only
    classifier = LogisticRegression()
    classifier.fit(features_train, target_train)
    return classifier
def pickle_model(model):
    """Pickle ``model`` into an in-memory binary buffer.

    In our case this is the "same" as storing a file, but in RAM.

    Args:
        model: Any picklable object.

    Returns:
        io.BytesIO: Buffer holding the pickled bytes, rewound to the start
        so it can be read (or unpickled) directly by the caller.
    """
    f = io.BytesIO()
    pickle.dump(model, f)
    # dump() leaves the cursor at end-of-stream; without rewinding, a plain
    # read() on the returned buffer would yield b"".
    f.seek(0)
    return f
st.title("My .pkl downloader")
model = create_model()
data = pickle_model(model)
st.download_button("Download .pkl file", data=data, file_name="my-pickled-model.pkl")

FileNotFoundError: [Errno 2] File b'Downloads/BetterLifeIndex2015.csv' does not exist: b'Downloads/BetterLifeIndex2015.csv'

Resolved
Answer: I changed the path; it was in fact an incorrect path after all. I used the absolute path (Alt+D, then copy from File Explorer) and put an "r" before the path so that it is treated as a raw string.
# load the data
BetterLifeIndex = pd.read_csv(r"C:\Users\brede\OneDrive\Dokumenter\Downloads\BetterLifeIndex2015.csv", thousands = ',')
gdp_per_capita = pd.read_csv(r"C:\Users\brede\OneDrive\Dokumenter\Downloads\gdpcapita.csv", thousands= ',', delimiter ='\t',
encoding = 'latin1' , na_values="n/a")
I'm new to Python and I'm running an example from a machine learning book. I can't get Python to read my CSV file.
Code:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model
def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Join life-satisfaction data with GDP per capita, one row per country.

    Args:
        oecd_bli: DataFrame with "INEQUALITY", "Country", "Indicator" and
            "Value" columns; only rows where INEQUALITY == "TOT" are used.
        gdp_per_capita: DataFrame with a "Country" column and a "2015"
            column holding the GDP figure. It is not modified.

    Returns:
        DataFrame indexed by country with the columns "GDP per capita" and
        "Life satisfaction", sorted by GDP per capita, with the rows at
        positions 0, 1, 6, 8, 33, 34 and 35 removed.

    Raises:
        IndexError: If the merged table has fewer than 36 rows, because
            fixed positions up to 35 are selected.
    """
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")

    # Build new frames rather than renaming/re-indexing in place: mutating
    # the caller's frame made a second call fail (no "Country" column left).
    gdp_per_capita = gdp_per_capita.rename(columns={"2015": "GDP per capita"})
    gdp_per_capita = gdp_per_capita.set_index("Country")

    full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                                  left_index=True, right_index=True)
    full_country_stats = full_country_stats.sort_values(by="GDP per capita")

    remove_indices = [0, 1, 6, 8, 33, 34, 35]
    # Explicitly ordered list; do not rely on set iteration order.
    keep_indices = [i for i in range(36) if i not in remove_indices]
    return full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]
# load the data
oecd_bli = pd.read_csv("Downloads/BetterLifeIndex2015.csv", thousands = ',')
gdp_per_capita = pd.read_csv("C:/Users/brede/Downloads/gdpcapita.csv", thousands= ',', delimiter ='\t',
encoding = 'latin1' , na_values="n/a")
#prepare the data
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
# Column labels are case-sensitive: the prepared frame's columns are
# "GDP per capita" and "Life satisfaction", so lower-cased keys raise KeyError.
x = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]
#visualize the data
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
#select a linear model
model = sklearn.linear_model.LinearRegression()
#train the model
model.fit(x, y)
#make a prediction for Cyprus
X_new = [[22587]] #Cyprus GDP per capita
print(model.predict(X_new)) #outputs[[5.96242338]]
The output is:
runfile('C:/Users/brede/Downloads/practice_gdp.py', wdir='C:/Users/brede/Downloads')
Traceback (most recent call last):
File "<ipython-input-59-2f130edd277c>", line 1, in <module>
runfile('C:/Users/brede/Downloads/practice_gdp.py', wdir='C:/Users/brede/Downloads')
File "C:\Users\brede\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\brede\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/brede/Downloads/practice_gdp.py", line 31, in <module>
oecd_bli = pd.read_csv("Downloads/BetterLifeIndex2015.csv", thousands = ',')
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 457, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 895, in __init__
self._make_engine(self.engine)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1135, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1917, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__
File "pandas\_libs\parsers.pyx", line 689, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: [Errno 2] File b'Downloads/BetterLifeIndex2015.csv' does not exist: b'Downloads/BetterLifeIndex2015.csv'
I have triplechecked the path to the file, and I can't seem to figure this out! All help is appreciated.
This is done in Spyder, also tried in Jupyter with same result. I've even copied the path etc.
help...
I think you have to use '/' as the separator in the file path. Try 'C:/Users/brede/OneDrive....'

Skyfield year is out of range

I'm trying to use Skyfield to plot an orbit, but it doesn't work.
Here's the code:
import numpy as np
import matplotlib.pyplot as plt
from skyfield.api import Loader, Topos, EarthSatellite
text = """
GOCE
1 34602U 09013A 13314.96046236 .14220718 20669-5 50412-4 0 930
2 34602 096.5717 344.5256 0009826 296.2811 064.0942 16.58673376272979
"""
lines = text.strip().splitlines()
sat = EarthSatellite(lines[1], lines[2], lines[0])
print(sat.epoch.utc_jpl())
Here's the error I get:
File "orbit_preditor.py", line 21, in <module>
ISS = EarthSatellite(L1, L2)
File "C:\Python\Python36\lib\site-packages\skyfield\sgp4lib.py", line 86, in __init__
EarthSatellite.timescale = load.timescale()
File "C:\Python\Python36\lib\site-packages\skyfield\iokit.py", line 232, in timescale
preds = self('deltat.preds')
File "C:\Python\Python36\lib\site-packages\skyfield\iokit.py", line 142, in __call__
expiration_date, data = parser(f)
File "C:\Python\Python36\lib\site-packages\skyfield\iokit.py", line 309, in parse_deltat_preds
expiration_date = date(year[0] + 2, month[0], 1)
ValueError: year 58668 is out of range
Any ideas?
Try upgrading to the new version of Skyfield with pip install -U skyfield. A third party data file changed formats and so we made a new Skyfield release to fix it.

Convert numbers from mathematica csv export to numpy complex array

I have exported data from Mathematica to a CSV file. The file structure looks as follows:
"x","y","Ex","Ey"
0.,0.,0.+0.*I,-3.0434726787506006*^-12+3.4234894344189825*^-12*I
0.,0.,0.+0.*I,-5.0434726787506006*^-12+10.4234894344189825*^-13*I
...
I'm reading in the data with pandas, but I get an error
import csv
import pandas as pd
import numpy as np
df=pd.read_csv('filename.csv')
df.columns=['x', 'y', 'Ex','Ey']
df['Ey'] = df['Ey'].str.replace('*^','E')
df['Ey'] = df['Ey'].str.replace('I','1j').apply(lambda x: np.complex(x))
Edit: I'm getting the following error in the second last line of my code:
Traceback (most recent call last):
File "plot.py", line 6, in <module>
df['Ey'] = df['Ey'].str.replace('*^','E')
File "/home/.../.local/lib/python2.7/site-packages/pandas/core/strings.py", line 1579, in replace
flags=flags)
File "/home/.../.local/lib/python2.7/site-packages/pandas/core/strings.py", line 424, in str_replace
regex = re.compile(pat, flags=flags)
File "/usr/lib/python2.7/re.py", line 194, in compile
return _compile(pattern, flags)
File "/usr/lib/python2.7/re.py", line 251, in _compile
raise error, v # invalid expression
sre_constants.error: nothing to repeat
When I write instead
df['Ey'] = df['Ey'].str.replace('*','E')
or
df['Ey'] = df['Ey'].str.replace('^','E')
I'm not getting an error. It seems like only a single character can be given as the pattern to replace?
Why beat yourself up messing with ascii encoded floats?
here is how to exchange complex arrays between python and mathematica using raw binary files.
in mathematica:
cdat = RandomComplex[{0, 1 + I}, 5]
{0.0142816 + 0.0835513 I, 0.434109 + 0.977644 I,
0.579678 + 0.337286 I, 0.426271 + 0.166166 I, 0.363249 + 0.0867334 I}
f = OpenWrite["test", BinaryFormat -> True]
BinaryWrite[f, cdat, "Complex64"]
Close[f]
or:
Export["test", cdat, "Binary", "DataFormat" -> "Complex64"]
in python:
import numpy as np
x=np.fromfile('test',np.complex64)
print x
[ 0.01428160+0.0835513j 0.43410850+0.97764391j 0.57967812+0.3372865j
0.42627081+0.16616575j 0.36324903+0.08673338j]
going the other way:
y=np.array([[1+2j],[3+4j]],np.complex64)
y.tofile('test')
f = OpenRead["test", BinaryFormat -> True]
BinaryReadList[f, "Complex64"]
Close[f]
note this will be several orders of magnitude faster than exchanging data by csv.

Keras fit_generator throwing ValueError

So I'm trying to create a generator to iterate through a data set for use in training with Keras's fit_generator. Here's the definition of the generator, the model, and the call to fit_generator:
import numpy as np
from queue import Queue, deque
from keras.models import Sequential
from keras.layers import Dense
num_features = 40
len_data = 100
data = np.random.rand(len_data, num_features)
def train_generator(train_idxs):
    """Yield single-row autoencoder batches forever.

    Each step takes the next row index from ``train_idxs`` without blocking,
    views that row of the module-level ``data`` array as a ``(1, n)`` batch
    and yields it as both input and target.

    Args:
        train_idxs: Queue of row indices into ``data``.

    Raises:
        queue.Empty: As soon as ``train_idxs`` has no indices left.
    """
    while True:
        idx = train_idxs.get(block=False)
        row = data[idx, :]
        # Reshape in place so the row becomes a one-sample batch.
        row.shape = (1, len(row))
        yield (row, row)
layer0_size = num_features
layer1_size = layer0_size / 2
layer2_size = layer1_size / 2
layers = []
layers.append(
Dense(input_dim=layer0_size, output_dim=layer1_size, activation='relu'))
layers.append(
Dense(input_dim=layer1_size, output_dim=layer2_size, activation='relu'))
layers.append(
Dense(input_dim=layer2_size, output_dim=layer1_size, activation='relu'))
layers.append(
Dense(input_dim=layer1_size, output_dim=layer0_size, activation='sigmoid'))
model = Sequential()
for layer in layers:
model.add(layer)
model.compile(optimizer='adam', loss='binary_crossentropy')
train_idxs = Queue()
train_idxs.queue = deque(range(len_data))
train_gen = train_generator(train_idxs)
max_q_size = 2
model.fit_generator(train_gen, samples_per_epoch=len(data), max_q_size=max_q_size, nb_epoch=1)
Keras then successfully trains on 98 of the 100 training examples and throws this error:
98/100 [============================>.] - ETA: 0s - loss: 0.6930Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
self.run()
File "/usr/lib/python3.5/threading.py", line 862, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 429, in data_generator_task
generator_output = next(self._generator)
File "scrap.py", line 12, in train_generator
i = train_idxs.get(block=False)
File "/usr/lib/python3.5/queue.py", line 161, in get
raise Empty
queue.Empty
Traceback (most recent call last):
File "scrap.py", line 43, in <module>
model.fit_generator(train_gen, samples_per_epoch=len(data), max_q_size=max_q_size, nb_epoch=1)
File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 935, in fit_generator
initial_epoch=initial_epoch)
File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 1528, in fit_generator
str(generator_output))
ValueError: output of generator should be a tuple (x, y, sample_weight) or (x, y). Found: None
It seems like what's happening is that the generator has popped off all of the train_idxs and is still trying to get more until Keras exhausts the training examples in its internal queue. Is there a way to get it to stop trying to get more training examples from the generator?