PySpark: dropping the header row in a DataFrame, AttributeError: _jdf - csv

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession
avg_calc = spark.read.csv("quiz2_algo.csv", header=True, inferSchema=True)
header = avg_calc.first()
no_header = avg_calc.subtract(header)
no_header
avg_calc contains 2 columns, and I am trying to remove the 1st row from both columns; however, I am receiving the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-50-24671d91e691> in <module>()
----> 1 no_header = avg_calc.subtract(header)
C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\sql\dataframe.pyc in subtract(self, other)
1391
1392 """
-> 1393 return DataFrame(getattr(self._jdf, "except")(other._jdf), self.sql_ctx)
1394
1395 #since(1.4)
C:\spark\spark-2.3.0-bin-hadoop2.7\python\pyspark\sql\types.pyc in __getattr__(self, item)
1559 raise AttributeError(item)
1560 except ValueError:
-> 1561 raise AttributeError(item)
1562
1563 def __setattr__(self, key, value):
AttributeError: _jdf
If anyone can help I would appreciate it!
Example of the data: avg_calc.show(5)

first() returns a Row object rather than a DataFrame, but subtract requires a DataFrame. See http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.first
You could try something like:
avg_calc.subtract(avg_calc.limit(1))
For example:
>>> from pyspark.sql import Row
>>> df = spark.createDataFrame([Row(x=1), Row(x=2)])
>>> print(df.subtract(df.limit(1)).toPandas())
   x
0  2
Apply an ordering to your dataframe to ensure the row you would like dropped is in the expected location:
>>> from pyspark.sql import functions as F
>>> df = df.orderBy(F.col('CS202 Quiz#2').desc())
>>> df = df.subtract(df.limit(1))
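Note that subtract also de-duplicates rows, so if the data may contain repeated rows a positional approach is safer. A minimal sketch, assuming the same avg_calc dataframe as above:
# drop the first row by position; unlike subtract, this keeps duplicate rows
rdd_no_header = avg_calc.rdd.zipWithIndex().filter(lambda t: t[1] > 0).map(lambda t: t[0])
no_header = spark.createDataFrame(rdd_no_header, avg_calc.schema)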

Related

How to read json file and fit lstm model?

I am trying to apply an LSTM to the HuffPost news dataset. The data is in JSON format (https://www.kaggle.com/rmisra/news-category-dataset). I have tried this code and got errors; I don't know what's wrong with it.
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Model
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import json
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

with open('News_Category_Dataset_v2.json', 'r') as f:
    train = json.load(f)

Y_train = list(train.values())
lb = LabelBinarizer()
X_train = lb.fit_transform(list(train.keys()))
##
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15)
##
max_words = 1000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

def RNN():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('softmax')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)])
I got these errors:
Traceback (most recent call last):
File ".\Hpnews.py", line 30, in <module>
train = json.load(f)
File "C:\Users\a\Anaconda3\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
File "C:\Users\a\Anaconda3\lib\json\__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "C:\Users\a\Anaconda3\lib\json\decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 366)
This is my JSON file format (one record shown):
{
    "category": "CRIME",
    "headline": "There Were 2 Mass Shootings In Texas Last Week, But Only 1 On TV",
    "authors": "Melissa Jeltsen",
    "link": "huffingtonpost.com/entry/…",
    "short_description": "She left her husband. He killed their children. Just another day in America.",
    "date": "2018-05-26"
}
The file is not typical JSON but ndJSON ("newline-delimited JSON"), which json.load cannot parse as a single document.
You should use pandas to load your data:
import pandas as pd
data = pd.read_json('News_Category_Dataset_v2.json', lines=True)
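If you'd rather stay in the standard library, a minimal sketch that parses each line separately (assuming the same file name) is:
import json

with open('News_Category_Dataset_v2.json', 'r') as f:
    # each line of an ndJSON file is a complete JSON document
    records = [json.loads(line) for line in f if line.strip()]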

Setting column on empty dataframe

I'm reading JSON records from a text file and then creating an empty dataframe. I want to add a new column 'id' to the empty dataframe; 'id' comes from the JSON records in the text file.
The error message reads "Cannot set a frame with no defined index and a value that cannot be converted to a series". I tried to overcome this error by defining the dataframe size upfront, which did not help. Any ideas?
import json
import pandas as pd

path = 'my/path'
mydata = []
myfile = open(path, "r")
for line in myfile:
    try:
        myline = json.loads(line)
        mydata.append(myline)
    except:
        continue
mydf = pd.DataFrame()
mydf['id'] = map(lambda myline: myline['id'], mydata)
I think it is better to use:
for line in myfile:
    try:
        # extract only id to list
        myline = json.loads(line)['id']
        mydata.append(myline)
    except:
        continue

print (mydata)
[10, 5]

# create DataFrame by constructor
mydf = pd.DataFrame({'id': mydata})
print (mydf)
   id
0  10
1   5
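As an aside, the original assignment fails because map() returns a lazy iterator on Python 3 and an empty dataframe has no index to align it against; materializing the values first, a hedged variant of the original line, also works:
mydf = pd.DataFrame()
# list(...) forces the map iterator so pandas can build the column
mydf['id'] = list(map(lambda rec: rec['id'], mydata))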

tweepy json getting AttributeError: 'int' object has no attribute 'items'

I have collected some 12,000 tweets following the code from http://adilmoujahid.com/posts/2014/07/twitter-analytics/
The problem is that I get an error once the number of tweets grows larger; a smaller number doesn't cause this problem.
#adding columns
from pandas.io.json import json_normalize
tweets = json_normalize(tweet_data)[["text", "lang", "created_at", "user.time_zone", "user.location"]]
This gives me the following result:
AttributeError Traceback (most recent call last)
<ipython-input-21-19596361d3f0> in <module>()
1 #adding columns
2 from pandas.io.json import json_normalize
----> 3 tweets = json_normalize(tweet_data)[["text", "lang", "created_at", "user.time_zone", "user.location"]]
/usr/lib/python2.7/dist-packages/pandas/io/json.pyc in json_normalize(data, record_path, meta, meta_prefix, record_prefix)
713 # TODO: handle record value which are lists, at least error
714 # reasonably
--> 715 data = nested_to_record(data)
716 return DataFrame(data)
717 elif not isinstance(record_path, list):
/usr/lib/python2.7/dist-packages/pandas/io/json.pyc in nested_to_record(ds, prefix, level)
612
613 new_d = copy.deepcopy(d)
--> 614 for k, v in d.items():
615 # each key gets renamed with prefix
616 if level == 0:
AttributeError: 'int' object has no attribute 'items'
Is there any way to get out of this? I am a total novice at handling Pandas and JSON.
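The traceback shows nested_to_record calling .items() on each record, so at least one element of tweet_data is a plain int rather than a dict (streams sometimes interleave non-tweet payloads with tweets). A hedged first step is to filter the list before normalizing:
# keep only dict entries; json_normalize expects every record to support .items()
tweet_data_dicts = [t for t in tweet_data if isinstance(t, dict)]
tweets = json_normalize(tweet_data_dicts)[["text", "lang", "created_at", "user.time_zone", "user.location"]]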

Convert json file into csv encode/decode problems

My semester project is about classification using Naive Bayes. I've decided to use the Yelp dataset. While turning the JSON file into a CSV file I ran into a couple of problems, such as:
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
It's because of incorrect usage of json.loads(). I tried a couple of different usages of the function to manage this part of the program; unfortunately, none of them worked. I put my code below. If you have any idea how to handle this, can you please explain it to me?
import json
import pandas as pd
from glob import glob
import codecs

global df
global s
global count

def convert(x):
    ob = json.loads(x)
    for k, v in ob.items():
        if isinstance(v, list):
            ob[k] = ','.join(v)
        elif isinstance(v, dict):
            for kk, vv in v.items():
                ob['%s_%s' % (k, kk)] = vv
            del ob[k]
    return ob

s = ""
count = 0
for json_filename in glob('*.json'):
    csv_filename = '%s.csv' % json_filename[:-5]
    print('Converting %s to %s' % (json_filename, csv_filename))
    with open('yelp_dataset_challenge_round9.json', 'rb') as f:  # open in binary mode
        for line in f:
            for cp in ('cp1252', 'cp850'):
                try:
                    if count is 0:
                        count = 1
                    else:
                        s = str(line.decode('utf-8'))
                except UnicodeDecodeError:
                    pass
            df = pd.DataFrame([convert(s)])
    df.to_csv(csv_filename, encoding='utf-8', index=False)
Thanks in advance :)
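For reference, the Yelp dump is newline-delimited JSON (one object per line), so a hedged sketch that feeds each decoded line through the convert() function above and writes a single CSV could look like:
import json
import pandas as pd

rows = []
with open('yelp_dataset_challenge_round9.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines so json.loads never sees an empty string
            rows.append(convert(line))

pd.DataFrame(rows).to_csv('yelp.csv', index=False, encoding='utf-8')  # output name is illustrative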

How to convert hierarchical DataFrame back from json? [duplicate]

I'm trying to read a dataframe created via df.to_json() back in via pd.read_json, but I'm getting a ValueError. I think it may have to do with the fact that the index is a MultiIndex, but I'm not sure how to deal with that.
The original dataframe of 55k rows is called psi and I created test.json via:
psi.head().to_json('test.json')
Here is the output of print psi.head().to_string() if you want to use that.
When I do it on this small set of data (5 rows), I get a ValueError.
! wget --no-check-certificate https://gist.githubusercontent.com/olgabot/9897953/raw/c270d8cf1b736676783cc1372b4f8106810a14c5/test.json
import pandas as pd
pd.read_json('test.json')
Here's the full stack:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-14-1de2f0e65268> in <module>()
1 get_ipython().system(u' wget https://gist.githubusercontent.com/olgabot/9897953/raw/c270d8cf1b736676783cc1372b4f8106810a14c5/test.json')
2 import pandas as pd
----> 3 pd.read_json('test.json')
/home/obot/virtualenvs/envy/lib/python2.7/site-packages/pandas/io/json.pyc in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit)
196 obj = FrameParser(json, orient, dtype, convert_axes, convert_dates,
197 keep_default_dates, numpy, precise_float,
--> 198 date_unit).parse()
199
200 if typ == 'series' or obj is None:
/home/obot/virtualenvs/envy/lib/python2.7/site-packages/pandas/io/json.pyc in parse(self)
264
265 else:
--> 266 self._parse_no_numpy()
267
268 if self.obj is None:
/home/obot/virtualenvs/envy/lib/python2.7/site-packages/pandas/io/json.pyc in _parse_no_numpy(self)
481 if orient == "columns":
482 self.obj = DataFrame(
--> 483 loads(json, precise_float=self.precise_float), dtype=None)
484 elif orient == "split":
485 decoded = dict((str(k), v)
ValueError: No ':' found when decoding object value
> /home/obot/virtualenvs/envy/lib/python2.7/site-packages/pandas/io/json.py(483)_parse_no_numpy()
482 self.obj = DataFrame(
--> 483 loads(json, precise_float=self.precise_float), dtype=None)
484 elif orient == "split":
But when I do it on the whole dataframe (55k rows), I get an invalid pointer error and the IPython kernel dies. Any ideas?
EDIT: added how the json was generated in the first place.
This is not implemented at the moment; see the issue here: https://github.com/pydata/pandas/issues/4889.
You can simply reset the index first, e.g.
df.reset_index().to_json(...)
and it will work.
Or you can just write the JSON with orient='table':
df.to_json(path_or_buf='test.json', orient='table')
and read the MultiIndex JSON back with:
pd.read_json('test.json', orient='table')
If you want to restore the MultiIndex structure:
# save MultiIndex indexes names
indexes_names = df.index.names
df.reset_index().to_json('dump.json')
# return back MultiIndex structure:
loaded_df = pd.read_json('dump.json').set_index(indexes_names)
This was my simple, dirty fix for encoding/decoding a MultiIndex pandas dataframe, which seems to also work for datetimes in the index/columns... not optimized!
Here is the encoder to JSON: I encode the dataframe, index and columns into a dict to create the JSON.
import json
import pandas as pd

def to_json_multiindex(df):
    # encode the values, the index and the columns as three separate JSON strings
    dfi = df.index.to_frame()
    dfc = df.columns.to_frame()
    d = dict(
        df=df.to_json(),
        di=dfi.to_json(),
        dc=dfc.to_json(),
    )
    return json.dumps(d)
Meanwhile, here is the decoder, which reads the JSON dict and re-creates the dataframe:
def read_json_multiindex(j):
    d = json.loads(j)
    # rebuild the index: a multi-column frame becomes a MultiIndex,
    # a single column becomes a plain named index
    di = pd.read_json(d['di'])
    if di.shape[1] > 1:
        di = pd.MultiIndex.from_frame(di)
    else:
        _name = di.columns[0]
        di = di.index
        di.name = _name
    # rebuild the columns the same way
    dc = pd.read_json(d['dc'])
    if dc.shape[1] > 1:
        dc = pd.MultiIndex.from_frame(dc)
    else:
        _name = dc.columns[0]
        dc = dc.index
        dc.name = _name
    # reassemble values, index and columns
    df = pd.read_json(d['df']).values
    return pd.DataFrame(
        data=df,
        index=di,
        columns=dc,
    )
And here is a test for MultiIndex columns and index... it seems to preserve the dataframe. A couple of issues: 1) it is probably inefficient, and 2) it does not seem to work for datetimes in a MultiIndex (but works when it isn't a MultiIndex).
df = pd.DataFrame(
    data=[[0, 1, 2], [2, 3, 4], [5, 6, 7]],
    index=pd.MultiIndex.from_tuples(
        (('aa', 'bb'), ('aa', 'cc'), ('bb', 'cc')),
        names=['AA', 'BB'],
    ),
    columns=pd.MultiIndex.from_tuples(
        (('XX', 'YY'), ('XX', 'ZZ'), ('YY', 'ZZ')),
        names=['YY', 'ZZ'],
    ),
)
j = to_json_multiindex(df)
d = read_json_multiindex(j)
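A quick usage check (hedged: dtypes can shift across a JSON round trip, hence check_dtype=False) is to assert that the decoded frame matches the original:
import pandas.testing as pdt

# compares values, index and columns while tolerating dtype changes from JSON
pdt.assert_frame_equal(df, d, check_dtype=False)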