Convert a key value pair in a column as new column in python - json

I want to parse a column, and get the key-value pair as column
Input:
I have a dataframe (called df) with the following structure:
ID data
A1 {"userMatch": "{"match":{"phone":{"name":{"score":1}},"name":{"score":1}}}"}
A2 {"userMatch": "{"match":{"phone":{"name":{"score":0.934}},"name":{"score":0.952}}}"}
Expected Output:
I wanted to create new column called 'score' and get the value from the key value pair
ID score1 score2
A1 1 1
A2 0.934 0.952
Attempted Solution:
data_json = df['data'].transform(lambda x: json.loads(x))
df['score1'] = data_json.str.get('userMatch').str.get('match').str.get('phone').str.get('name').str.get('score')
df['score2'] = data_json.str.get('userMatch').str.get('match').str.get('phone').str.get('name').str.get('name').str.get('score')
Error:
TypeError: the JSON object must be str, bytes or bytearray, not Series
Notes:
I am not even sure how to get the next score2

Following up on my earlier thought about using a regex, this is how I would approach your problem:
import re
def getOffset(row, offset):
    """Return the ``offset``-th number found in the row's ``data`` field.

    Args:
        row: A mapping-like record with a ``'data'`` entry, for example a
            DataFrame row passed in by ``df.apply(..., axis=1)``. The entry
            is either a dict holding the text under ``'userMatch'`` or the
            raw text itself.
        offset: Zero-based position of the wanted number.

    Returns:
        The matched number as a string, or ``None`` when the text holds
        ``offset`` numbers or fewer.
    """
    # Item access works for plain mappings as well as DataFrame rows.
    payload = row['data']
    # A dict payload keeps its text under 'userMatch'; anything else is
    # scanned as text directly.
    text = payload['userMatch'] if isinstance(payload, dict) else str(payload)
    # The alternation is grouped so the optional sign applies to integers as
    # well as decimals; ungrouped, "-5" would be reported as "5".
    vals = re.findall(r"[-+]?(?:\d*\.\d+|\d+)", text)
    if len(vals) > offset:
        return vals[offset]
    return None
df['score1'] = df.apply(lambda row: getOffset(row, 0), axis= 1)
df['score2'] = df.apply(lambda row: getOffset(row, 1), axis = 1)
df.drop(['data'], axis= 1, inplace=True)
This yields a dataframe of the form:
ID score1 score2
0 A1 1 1
1 A2 0.934 0.952

This isn't pretty, but works with split(). Couldn't get a dictionary to be read, kept getting invalid syntax or missing delimiter.
df = pd.read_csv(io.StringIO('''ID data
A1 {"userMatch": "{"match":{"phone":{"name":{"score":1}},"name":{"score":1}}}"}
A2 {"userMatch": "{"match":{"phone":{"name":{"score":0.934}},"name":{"score":0.952}}}"}'''), sep=' ', engine='python')
df['score1'] = df['data'].apply(lambda x: x.split('{"userMatch": "{"match":{"phone":{"name":{"score":')[1].split('}', 1)[0])
df['score2'] = df['data'].apply(lambda x: x.split('{"userMatch": "{"match":{"phone":{"name":{"score":')[1].split(',"name":{"score":')[1].split('}', 1)[0])
Output:
ID data score1 score2
0 A1 {"userMatch": "{"match":{"phone":{"name":{"score":1}},"name":{"score":1}}}"} 1 1
1 A2 {"userMatch": "{"match":{"phone":{"name":{"score":0.934}},"name":{"score":0.952}}}"} 0.934 0.952

Related

pandas Dataframe: efficiently expanding column containing json into multiple columns

I have a dataframe where a column is a json string with a dictionary, and I need to expand the json into separate columns. Example:
c1 c2
0 a1 {'x1': 1, 'x3': 3, 'x2': 2}
1 a2 {'x1': 21, 'x3': 23, 'x2': 22}
should become:
c1 x1 x2 x3
0 a1 1.0 2.0 3.0
1 a2 21.0 22.0 23.0
My problem is very similar to this thread, except that I have strings, not dictionaries (although the strings evaluate to a dictionary), and the simple, optimized solution proposed there doesn't work in my case.
I have a working solution, but it is clearly horribly inefficient. Here's a snippet with my code and the solution proposed in that thread:
import json
import pandas as pd
def expandFeatures(df, columnName):
    """Expand a column of JSON-object strings into one column per feature.

    Every cell of ``columnName`` is parsed with ``json.loads``. The keys of
    the first row define the feature columns, which are appended in sorted
    order; a later row that lacks one of those keys gets ``0.0`` for it.
    Keys that only occur in later rows become extra columns after the sorted
    ones. Numeric feature columns are returned as floats.

    Args:
        df: Source DataFrame; it is not modified.
        columnName: Name of the column holding the JSON strings.

    Returns:
        A new DataFrame with ``columnName`` removed and the feature columns
        appended.
    """
    remaining = df.drop(columns=[columnName])
    if df.empty:
        # No first row to take the feature names from: nothing to expand.
        return remaining

    parsed = [json.loads(raw) for raw in df[columnName]]
    featureNames = sorted(parsed[0])

    # Seed each record with the 0.0 defaults so the first row's features are
    # always present, then let the row's own values override them.
    rows = [{**dict.fromkeys(featureNames, 0.0), **features} for features in parsed]

    # Build all feature columns in one go instead of writing cell by cell.
    expanded = pd.DataFrame(rows, index=df.index)
    numericCols = expanded.select_dtypes(include='number').columns
    expanded = expanded.astype({name: float for name in numericCols})

    return pd.concat([remaining, expanded], axis=1)
def expandFeatures1(df, columnName):
    """Expand a column of dicts into one column per key.

    Args:
        df: Source DataFrame; it is not modified.
        columnName: Name of the column whose cells are dicts.

    Returns:
        A new DataFrame with ``columnName`` removed and one column per dict
        key joined on.
    """
    # Reuse df's index for the expanded frame; with a default 0..n-1 index
    # the join would misalign whenever df is indexed differently.
    expanded = pd.DataFrame(df[columnName].tolist(), index=df.index)
    return df.drop(columns=[columnName]).join(expanded)
df_json = pd.DataFrame([['a1', '{"x1": 1, "x2": 2, "x3": 3}'], ['a2', '{"x1": 21, "x2": 22, "x3": 23}']],
columns=['c1', 'c2'])
df_dict = pd.DataFrame([['a1', {'x1': 1, 'x2': 2, 'x3': 3}], ['a2', {'x1': 21, 'x2': 22, 'x3': 23}]],
columns=['c1', 'c2'])
# correct result, but inefficient
print("expandFeatures, df_json")
df = df_json.copy()
print(df)
df = expandFeatures(df, 'c2')
print(df)
# this gives an error because expandFeatures expects a string, not a dictionary
# print("expandFeatures, df_dict")
# df = df_dict.copy()
# print(df)
# df = expandFeatures(df, 'c2')
# print(df)
# WRONG, doesn't expand anything
print("expandFeatures1, df_json")
df = df_json.copy()
print(df)
df = expandFeatures1(df, 'c2')
print(df)
# correct and efficient, but not my use case (I have strings not dicts)
print("expandFeatures1, df_dict")
df = df_dict.copy()
print(df)
df = expandFeatures1(df, 'c2')
print(df)
I'm sure there is some obvious way to improve the efficiency of my code, make it more similar to the single line proposed in the other thread, but I can't really see it myself... Thanks in advance for any help.
If your JSON strings are also valid Python dict literals, you can use ast.literal_eval to parse them:
import pandas as pd
from ast import literal_eval

# Sample frame: 'c2' holds each record's features as a dict-literal string.
records = [
    ['a1', '{"x1": 1, "x2": 2, "x3": 3}'],
    ['a2', '{"x1": 21, "x2": 22, "x3": 23}'],
]
df_json = pd.DataFrame(records, columns=['c1', 'c2'])

# Evaluate every string into a dict, then let pandas turn the dicts into columns.
parsed = df_json["c2"].apply(literal_eval)
expanded = pd.DataFrame(parsed.to_list())

# Put the new columns next to the originals and discard the raw string column.
combined = pd.concat([df_json, expanded], axis=1)
print(combined.drop("c2", axis=1))
#
c1 x1 x2 x3
0 a1 1 2 3
1 a2 21 22 23

Grouping CSV file by ID and extracting JSON column

I currently have a CSV like this:
A B C
1 10 {"a":"one","b":"two","c":"three"}
1 10 {"a":"four","b":"five","c":"six"}
1 10 {"a":"seven","b":"eight","c":"nine"}
1 10 {"a":"ten","b":"eleven","c":"twelve"}
2 10 {"a":"thirteen","b":"fourteen","c":"fifteen"}
2 10 {"a":"sixteen","b":"seventeen","c":"eighteen"}
2 10 {"a":"nineteen","b":"twenty","c":"twenty-one"}
3 10 {"a":"twenty-two","b":"twenty-three","c":"twenty-four"}
3 10 {"a":"twenty-five","b":"twenty-six","c":"twenty-seven"}
3 10 {"a":"twenty-eight","b":"twenty-nine","c":"thirty"}
3 10 {"a":"thirty-one","b":"thirty-two","c":"thirty-three"}
I want to group by column A, ignore column B, and take only the "b" field in C, and get an output like:
A C
1 ['two','five','eight','eleven']
2 ['fourteen','seventeen','twenty']
3 ['twenty-three','twenty-six','twenty-nine','thirty-two']
Can I do this? I have pandas if that will be useful! Also I would like the output file to be tab delimited.
Try this:
import pandas as pd
import json

# read file that looks exactly as given above; the regex separator splits on
# any run of whitespace and replaces the deprecated ``delim_whitespace=True``
df = pd.read_csv("file.csv", sep=r"\s+")
# drop the 'B' column
df = df.drop(columns="B")
# 'C' starts life as a JSON string: parse it and keep only the "b" field
df["C"] = df["C"].map(lambda raw: json.loads(raw)["b"])
# 'C' now holds just the 'b' values. collect them into one list per 'A' group
df = df.groupby("A")["C"].apply(list)
print(df)
This returns:
A
1 [two, five, eight, eleven]
2 [fourteen, seventeen, twenty]
3 [twenty-three, twenty-six, twenty-nine, thirty...
IIUC
df.groupby('A').C.apply(lambda x : [y['b'] for y in x ])
A
1 [two, five, eight, eleven]
2 [fourteen, seventeen, twenty]
3 [twenty-three, twenty-six, twenty-nine, thirty...
Name: C, dtype: object

Scala Spark - For loop in Data Frame and compare date

I have a Data Frame which has 3 columns like this:
---------------------------------------------
| x(string) | date(date) | value(int) |
---------------------------------------------
I want to SELECT all the rows [i] that satisfy all 4 conditions:
1) row [i] and row [i - 1] have the same value in column 'x'
AND
2) 'date' at row [i] == 'date' at row [i - 1] + 1 (two consecutive days)
AND
3) 'value' at row [i] > 5
AND
4) 'value' at row [i - 1] <= 5
I think maybe I need a For loop, but don't know how exactly! Please help me!
Every help is much appreciated!
It can be very easily done with Window functions, look at lag function:
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import sqlContext.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
// test data
val list = Seq(
("x", "2016-12-13", 1),
("x", "2016-12-14", 7)
);
val df = sc.parallelize(list).toDF("x", "date", "value");
// add lags - so read previous value from dataset
// One window per value of "x", ordered by date, so lag() reads the preceding
// row of the SAME x rather than whichever row comes before it globally.
val byXOrderedByDate = Window.partitionBy($"x").orderBy($"date")
val withPrevs = df
  .withColumn("prevX", lag('x, 1).over(byXOrderedByDate))
  .withColumn("prevDate", lag('date, 1).over(byXOrderedByDate))
  .withColumn("prevValue", lag('value, 1).over(byXOrderedByDate))
// filter values and select only needed fields
withPrevs
  .where('x === 'prevX)                    // condition 1; also drops the first row of each x (null lag)
  .where('date === date_add('prevDate, 1)) // condition 2: consecutive days
  .where('value > lit(5))                  // condition 3
  .where('prevValue <= lit(5))             // condition 4: "<= 5", not "< 5"
  .select('x, 'date, 'value)
  .show()
Note that this cannot be done without an ordering (here, by date). A Dataset has no meaningful order of its own, so you must specify the order explicitly.
If you already have a DataFrame, all you need to do is call the filter function on it with all your conditions.
For example:
df1.filter($"Column1" === 2 || $"Column2" === 3)
You can pass as many conditions as you want. It will return you a new DataFrame with filtered data.

Python. If value on column 1 (row X) = value from column 2 (row Y), print row Y of column 3

I have a .csv file, df, with 3 columns (C1, C2 and C3). All columns are of the same length (aprox. 600000 rows) and have unique values. Values in C1, which represent SNPs (single nucleotide polymorphisms) are ordered according to their location on chromosomes. C2 has the same values as C1 but they are disordered. Values in C2 are coupled to corresponding values (chromosome locations) in the same row on C3. What I want to do is to couple the chromosomal locations on C3 to the values in C1 keeping the column order of C1. In other words, generate another column with chromosome locations for the ordered SNPs on C1. So far, I tried to create a dictionary with keys from C2 and values from C3 and then using a for loop to match values on C1 and print the ordered chromosome positions, but I get C3. I understand why I get that but I don't manage to get what I want.
Any suggestions or help would be welcome. I am new to programming.
import csv
from collections import OrderedDict # to save keys order
import sys
sys.stdout = open("output1.csv", "w")
# C1= rows[0], C2= rows[1], C3= rows[2]
with open('df1.csv', 'rU') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
next(reader) #skip header
d = OrderedDict((rows[1], rows[2]) for rows in reader)
for rows in reader:
if rows[0] in d:
print rows[2]
Input example:
C1 C2 C3
12082473 2980300 785989
11240776 4245756 799463
2980300 12082473 740857
2905036 2341354 918573
4245756 3748597 888659
3748597 11240776 765269
2341354 2905036 792480
2465126 2465126 947034
Desired output:
C1 C4
12082473 740857
11240776 765269
2980300 785989
2905036 792480
4245756 799463
3748597 888659
2341354 918573
2465126 947034
I am not entirely sure I understand what you are trying to do.
I think your error is from using the generator expression d = OrderedDict((rows[0], rows[3]) for rows in reader1) and then referring to it after the file has been closed at the end of the with block.
You might try something along these lines:
import csv
from collections import OrderedDict

d = OrderedDict()
# newline='' is the mode the csv module expects for files handed to
# csv.reader; it replaces 'rU', which Python 3.11+ rejects.
with open('df1.csv', newline='') as csv1, open('df2.csv', newline='') as csv2:
    reader1 = csv.reader(csv1, delimiter=',')
    reader2 = csv.reader(csv2, delimiter=',')
    next(reader1)  # skip header
    next(reader2)  # skip header
    # Fill the lookup eagerly, while df1.csv is still open.
    for row in reader1:
        d[row[0]] = row[3]
    # Print the stored value for every df2 key that df1 also contains.
    for row in reader2:
        if row[0] in d:
            print(d[row[0]])
I do not see any reason you need an OrderedDict since this is just a mapping between row[0] and row[3] as written. You are not using the order currently.

write items from a list to csv file column by column using pandas dataframe.to_csv

I have a list named items
items=['a' , 'b','c']
Code is:
df = pandas.DataFrame(items)
# The keyword is ``header`` (singular); to_csv has no ``headers`` argument.
df.to_csv("myfile.csv", header=False, index=False)
the values written to the file are in different rows but same column.(vertically written)
But
I want the values to be written as : a b c ie. in same row but different column.
Help please
You get each element in a different row because that is how the DataFrame was built from the list.
If you want the elements in different columns, I would suggest transposing it,
df = df.T
or you can load as one row like below,
items=[['a' , 'b','c']]
df = pd.DataFrame(items)
df
Out[22]:
0 1 2
0 a b c
And then write the output to csv,
eg:
df = pandas.DataFrame(items)
# Transpose so the list elements become columns of a single row.
df = df.T
# The keyword is ``header`` (singular); to_csv has no ``headers`` argument.
df.to_csv("myfile.csv", header=False, index=False)
df = pd.DataFrame(items)
df
Out[5]:
0
0 a
1 b
2 c
df.T
Out[11]:
0 1 2
0 a b c