validation of csv using json schema in python

I want to perform validation on my data. I have written the code below using pandas_schema. Instead of hard-coding the rules in the pandas_schema, how can I pass a JSON file that contains all the validation rules and then apply it to the CSV file?
In other words, which rule to apply to which column should be taken from the JSON file rather than from the pandas_schema, and the error file should still be generated.
from decimal import Decimal, InvalidOperation

import pandas as pd
import pandas_schema
from pandas_schema import Column
from pandas_schema.validation import CustomElementValidation

def check_decimal(dec):
    try:
        Decimal(dec)
    except InvalidOperation:
        return False
    return True

def check_int(num):
    try:
        int(num)
    except ValueError:
        return False
    return True

def do_validation():
    # read the data
    data = pd.read_csv('data.csv')
    # define validation elements
    decimal_validation = [CustomElementValidation(lambda d: check_decimal(d), 'is not decimal')]
    int_validation = [CustomElementValidation(lambda i: check_int(i), 'is not integer')]
    null_validation = [CustomElementValidation(lambda d: d is None, 'this field cannot be null')]
    # define validation schema
    schema = pandas_schema.Schema([
        Column('dec1', decimal_validation + null_validation),
        Column('dec2', decimal_validation),
        Column('dec3', decimal_validation),
        Column('dec4', decimal_validation),
        Column('dec5', decimal_validation),
        Column('dec6', decimal_validation),
        Column('dec7', decimal_validation),
        Column('company_id', int_validation + null_validation),
        Column('currency_id', int_validation + null_validation),
        Column('country_id', int_validation + null_validation)])
    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)
    # save data
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')

So, I don't really know anything about pandas_schema, but if you have the columns and their validators in a JSON file like this:
{
    "dec1": ["decimal", "null"],
    "dec2": ["decimal"],
    "dec3": ["decimal"],
    "dec4": ["decimal"],
    "dec5": ["decimal"],
    "dec6": ["decimal"],
    "dec7": ["decimal"],
    "company_id": ["int", "null"],
    "currency_id": ["int", "null"],
    "country_id": ["int", "null"]
}
Then you can use a dict of validators and a list comprehension to generate your Column objects for the Schema:
import json
from decimal import Decimal, InvalidOperation

import pandas as pd
import pandas_schema
from pandas_schema import Column
from pandas_schema.validation import CustomElementValidation

def check_decimal(dec):
    try:
        Decimal(dec)
    except InvalidOperation:
        return False
    return True

def check_int(num):
    try:
        int(num)
    except ValueError:
        return False
    return True

VALIDATORS = {
    'decimal': CustomElementValidation(lambda d: check_decimal(d), 'is not decimal'),
    'int': CustomElementValidation(lambda i: check_int(i), 'is not integer'),
    'null': CustomElementValidation(lambda d: d is None, 'this field cannot be null'),
}

def do_validation():
    # read the data
    data = pd.read_csv('data.csv')
    # read the validation rules from the JSON schema file
    with open('my_json_schema.json', 'r') as my_json:
        json_schema = json.load(my_json)
    # build one Column per entry, looking each rule name up in VALIDATORS
    column_list = [Column(k, [VALIDATORS[v] for v in vals]) for k, vals in json_schema.items()]
    schema = pandas_schema.Schema(column_list)
    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)
    # save data
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')
EDIT:
To use validators that take arguments defined in the JSON, you will need to change both the JSON format and the code a bit. The following should work, but I can't test it myself.
{
    "dec1": [["decimal"], ["null"]],
    "dec2": [["decimal"], ["range", 0, 10]],
    "dec3": [["decimal"]],
    "dec4": [["decimal"]],
    "dec5": [["decimal"]],
    "dec6": [["decimal"]],
    "dec7": [["decimal"]],
    "company_id": [["int"], ["null"]],
    "currency_id": [["int"], ["null"]],
    "country_id": [["int"], ["null"]]
}
def get_validator(opts):
    # map a rule name to its validation class plus any fixed leading arguments
    VALIDATORS = {
        'decimal': (CustomElementValidation, [lambda d: check_decimal(d), 'is not decimal']),
        'int': (CustomElementValidation, [lambda i: check_int(i), 'is not integer']),
        'null': (CustomElementValidation, [lambda d: d is None, 'this field cannot be null']),
        'range': (InRangeValidation, []),
    }
    func, args = VALIDATORS[opts[0]]
    # any extra values from the JSON (e.g. the range bounds) become constructor arguments
    args.extend(opts[1:])
    return func(*args)

def do_validation():
    # read the data
    data = pd.read_csv('data.csv')
    with open('my_json_schema.json', 'r') as my_json:
        json_schema = json.load(my_json)
    column_list = [Column(k, [get_validator(v) for v in vals]) for k, vals in json_schema.items()]
    schema = pandas_schema.Schema(column_list)
    # apply validation
    errors = schema.validate(data)
    errors_index_rows = [e.row for e in errors]
    data_clean = data.drop(index=errors_index_rows)
    # save data
    pd.DataFrame({'col': errors}).to_csv('errors55.csv')
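If you want the error file to be a bit more descriptive than dumping the raw warning objects into a single column, a small sketch building on the function above (using only the .row attribute already relied on there, plus each warning's string form) could write one line per warning:

errors = schema.validate(data)
# one CSV row per validation warning: the offending data row index plus the warning text
pd.DataFrame({'row': [e.row for e in errors],
              'error': [str(e) for e in errors]}).to_csv('errors55.csv', index=False)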

Related

recursive search json elements

I'm having trouble finding JSON elements in a nested JSON. It seems that my code only finds elements at the root level; it is not able to find elements recursively.
import json
import pandas as pd

jsonString = '{"airplane": {"wings": {}, "wheels": {}, "cockpit": {}}}'
jsonObj = json.loads(jsonString)
data = ['airplane', 'wings', 'wheels', 'cockpit']
dfProp = pd.DataFrame(data, columns=['object'])
# find elements in JSON
for index, row in dfProp.iterrows():
    if row['object'] in jsonObj:
        print(row['object'] + ' ' + 'FOUND')
    else:
        print(row['object'] + ' ' + 'NOT FOUND')
I want to find all elements regardless of how many nesting levels there are in json files.
Can someone point me in the right direction?
If I understand you correctly, you want to check whether every value from the list data is found as a key somewhere in jsonObj:
import json

jsonString = '{"airplane": {"wings": {}, "wheels": {}, "cockpit": {}}}'
jsonObj = json.loads(jsonString)
data = ["airplane", "wings", "wheels", "cockpit"]

def find(o):
    # recursively yield every key found in nested dicts (and in dicts inside lists)
    if isinstance(o, dict):
        for k, v in o.items():
            yield k
            yield from find(v)
    elif isinstance(o, list):
        for v in o:
            yield from find(v)

s = set(data).difference(find(jsonObj))
if not s:
    print("All values from data found in jsonObj")
else:
    print("Not all values from data found in jsonObj", s)
Prints:
All values from data found in jsonObj
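As a small aside (not part of the original answer), the same generator can be used to simply enumerate every key it visits:

print(list(find(jsonObj)))  # ['airplane', 'wings', 'wheels', 'cockpit']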

Bizarre Environment-dependent Bad Request 400 error

I'm writing a program to convert a repository into a Docker container with an API based on some specification files. When I run the app on my MacBook's base environment, the computer-generated API works perfectly with both gunicorn and uwsgi. However, within the miniconda-based Docker container, it fails with Bad Request 400: The browser (or proxy) sent a request that this server could not understand. My goal is to eliminate this error. Obviously, this has to do with the versions of some dependency or set of dependencies. Interestingly, the last endpoint in the API, which has a request parser with no arguments and lives in its own namespace, works perfectly, unlike the two other endpoints in the default namespace that do have arguments.
The API is built on flask_restx and uses reqparse.
The API code is here:
from flask_restx import Api, Resource, Namespace, reqparse, inputs
import flask

import process
from load_data import store_data

app = flask.Flask("restful_api")
api = Api(app, title="My API", description="This is an extremely useful API for performing tasks you would do with an API.", version="3.14")
data = {}
data.update(store_data())

class DefaultClass():
    def __init__(self):
        self.data = data

    def _replace_get(self, **args):
        default_args = {}
        args = {**default_args, **args}
        return process.replace(**args)

    def _find_get(self, **args):
        default_args = {"data": self.data["data"]}
        args = {**default_args, **args}
        return process.find_in_data_string(**args)

def set_up_worker():
    global defaultClass
    defaultClass = DefaultClass()

set_up_worker()

_replaceGetParser = reqparse.RequestParser()
_replaceGetParser.add_argument("txt",
                               type=str,
                               required=True,
                               help="Text to search ")
_replaceGetParser.add_argument("old",
                               type=str,
                               required=True,
                               help="Substring to replace ")
_replaceGetParser.add_argument("new",
                               type=str,
                               required=True,
                               help="Replacement for old ")
_replaceGetParser.add_argument("irrelevant_parameter",
                               type=int,
                               required=False,
                               default=5,
                               help="")
_replaceGetParser.add_argument("smart_casing",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               help="True if we should infer replacement capitalization from original casing. ")
_replaceGetParser.add_argument("case_sensitive",
                               type=inputs.boolean,
                               required=False,
                               default=True,
                               help="True if we should only replace case-sensitive matches ")

_findGetParser = reqparse.RequestParser()
_findGetParser.add_argument("window",
                            type=int,
                            required=False,
                            default=5,
                            help="Number of characters before and after first match to return ")
_findGetParser.add_argument("txt",
                            type=str,
                            required=False,
                            default="quick",
                            help="Your search term ")

@api.route('/replace', endpoint='replace', methods=['GET'])
@api.doc('defaultClass')
class ReplaceFrontend(Resource):
    @api.expect(_replaceGetParser)
    def get(self):
        args = _replaceGetParser.parse_args()
        return defaultClass._replace_get(**args)

@api.route('/find', endpoint='find', methods=['GET'])
@api.doc('defaultClass')
class FindFrontend(Resource):
    @api.expect(_findGetParser)
    def get(self):
        args = _findGetParser.parse_args()
        return defaultClass._find_get(**args)

retrievalNamespace = Namespace("retrieval", description="Data retrieval operations")

class RetrievalNamespaceClass():
    def __init__(self):
        self.data = data

    def _retrieval_retrieve_data_get(self, **args):
        default_args = {"data": self.data["data"]}
        args = {**default_args, **args}
        return process.return_data(**args)

def set_up_retrieval_worker():
    global retrievalNamespaceClass
    retrievalNamespaceClass = RetrievalNamespaceClass()

set_up_retrieval_worker()

_retrieval_retrieve_dataGetParser = reqparse.RequestParser()

@retrievalNamespace.route('/retrieval/retrieve_data', endpoint='retrieval/retrieve_data', methods=['GET'])
@retrievalNamespace.doc('retrievalNamespaceClass')
class Retrieval_retrieve_dataFrontend(Resource):
    @retrievalNamespace.expect(_retrieval_retrieve_dataGetParser)
    def get(self):
        args = _retrieval_retrieve_dataGetParser.parse_args()
        return retrievalNamespaceClass._retrieval_retrieve_data_get(**args)

api.add_namespace(retrievalNamespace)
I have had this problem with both pip-installed gunicorn and conda-installed uwsgi. I'm putting the file imported by the API at the end, since I think it is likely irrelevant what the function definitions are.
import re
from subprocess import Popen, PIPE

import numpy as np
import pandas as pd
from flask_restx import abort

def replace(txt: str = '',  # apireq
            old: str = '',  # apireq
            new: str = '',  # apireq
            case_sensitive: bool = True,
            smart_casing: bool = True,
            irrelevant_parameter: int = 5):
    """
    Search and replace within a string, as long as the string and replacement
    contain no four letter words.
    arguments:
        txt: Text to search
        old: Substring to replace
        new: Replacement for old
        case_sensitive: True if we should only replace case-sensitive matches
        smart_casing: True if we should infer replacement capitalization
            from original casing.
    return
        return value
    """
    four_letter_words = [re.match('[a-zA-Z]{4}$', word).string
                         for word in ('%s %s' % (txt, new)).split()
                         if re.match('[a-zA-Z]{4}$', word)]
    if four_letter_words:
        error_message = ('Server refuses to process four letter word(s) %s'
                         % ', '.join(four_letter_words[:5])
                         + (', etc' if len(four_letter_words) > 5 else ''))
        abort(403, custom=error_message)
    return_value = {}
    if not case_sensitive:
        return_value['output'] = txt.replace(old, new)
    else:
        lowered = txt.replace(old, old.lower())
        return_value['output'] = lowered.replace(old.lower(), new)
    return return_value

def find_in_data_string(txt: str = "quick",  # req
                        window: int = 5,
                        data=None):  # noapi
    """
    Check if there is a match for your search string in our extensive database,
    and return the position of the first match with the surrounding text.
    arguments:
        txt: Your search term
        data: The server's text data
        window: Number of characters before and after first match to return
    """
    return_value = {}
    if txt in data:
        idx = data.find(txt)
        min_idx = max(idx - window, 0)
        max_idx = min(idx + len(txt) + window, len(data) - 1)
        return_value['string_found'] = True
        return_value['position'] = idx
        return_value['surrounding_string'] = data[min_idx:max_idx]
        return_value['surrounding_string_indices'] = [min_idx, max_idx]
    else:
        return_value = {'string_found': False}
    return return_value

def return_data(data=None):  # noapi
    """
    Return all the data in our text database.
    """
    with Popen(['which', 'aws'], shell=True, stdout=PIPE) as p:
        output = p.stdout.read()
    try:
        assert not output.strip()
    except AssertionError:
        abort(503, custom='The server is incorrectly configured.')
    return_value = {'data': data}
    return return_value
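One environment-dependent thing that may be worth ruling out (an editorial aside, not part of the original post, and only a hypothesis): flask_restx's reqparse looks at the request body as well as the query string by default, and stricter body/JSON handling in newer Werkzeug/Flask releases inside the container could surface as exactly this kind of 400 on the endpoints that parse arguments. Pinning each argument to the query string is a cheap experiment:

# hypothetical tweak: parse only from the query string so body handling cannot interfere
_findGetParser.add_argument("txt",
                            type=str,
                            required=False,
                            default="quick",
                            location="args",
                            help="Your search term ")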

Take Input Dynamically from user in Python Dictionary

I've created a Python Dictionary Structure as below:
import pprint

log_data = {
    'Date': '',
    'Prayers': {
        'Fajr': '',
        'Dhuhr/Jumu\'ah': '',
        'Asr': '',
        'Maghrib': '',
        'Isha\'a': ''
    },
    'Task List': [{
        'Task': '',
        'Timeline': '',
        'Status': ''
    }],
    'Meals': {
        'Breakfast': {
            'Menu': '',
            'Place': '',
            'Time': ''
        },
        'Lunch': {
            'Menu': '',
            'Place': '',
            'Time': ''
        },
        'Evening Snacks': {
            'Menu': '',
            'Place': '',
            'Time': ''
        },
        'Dinner': {
            'Menu': '',
            'Place': '',
            'Time': ''
        }
    },
    'Exercises': [{
        'Exercise': '',
        'Duration': ''
    }]
}
pprint.pprint(log_data)
As you can see, this is just a dictionary structure without data. I want to iterate over all the keys and take the values as input from the user using input().
Then I would like to save this dictionary as a JSON file.
Could you please help with how I can iterate over all keys and take the input from the user?
Thanks.
I searched but couldn't find the exact type of help that I need.
For this kind of thing, one needs to use recursion.
This is not fancy, but will get the job done:
from copy import deepcopy
import json
import pprint

log_data = {
    'Date': '',
    'Prayers': {
        'Fajr': '',
        'Dhuhr/Jumu\'ah': '',
        'Asr': '',
        'Maghrib': '',
        'Isha\'a': ''
    },
    'Task List': [{
        'Task': '',
        'Timeline': '',
        'Status': ''
    }],
    # ...
}

def input_fields(substruct, path=""):
    print(f"Inputting values '{path}':")
    for fieldname, value in substruct.items():
        if isinstance(value, (str, int)):
            substruct[fieldname] = input(f"{path}.{fieldname}: ")
        elif isinstance(value, dict):
            input_fields(value, f"{path}.{fieldname}")
        elif isinstance(value, list):
            # use the first item as a template and let the user add as many copies as needed
            original = value[0]
            value.pop()
            counter = 0
            if not isinstance(original, dict):
                raise ValueError("Not supported: A list should contain a dictionary-substructure")
            while True:
                item = deepcopy(original)
                input_fields(item, f"{path}.{fieldname}.[{counter}]")
                value.append(item)
                continue_ = input(f"Enter one more {path}.{fieldname} item? (y/n) ").lower().strip()[0] == "y"
                if not continue_:
                    break
                counter += 1
    return substruct

def main():
    values = input_fields(deepcopy(log_data))
    json.dump(values, open("myfile.json", "wt"), indent=4)

if __name__ == "__main__":
    main()

JSON serialization using Marshmallow - skip None attributes

I am using Marshmallow to dump an instance of my Decision class to JSON. However, this also dumps the attributes which are None, e.g. my attribute score translates to null in JSON. After that I am unable to read the JSON back in using the same approach.
https://repl.it/repls/VoluminousMulticoloredFacts
The last line is where it currently fails. I need to either NOT dump None to JSON or skip null during loading:
import json
from marshmallow import Schema, fields, post_load

json_data = """{
    "appid": "2309wfjwef",
    "strategy": "First Strategy"
}"""

# Output class definition
class Decision(object):
    def __init__(self, appid=None, strategy=None, score=None):
        self.appid = appid
        self.strategy = strategy
        self.score = score

class DecisionSchema(Schema):
    appid = fields.Str()
    strategy = fields.Str()
    score = fields.Int()

    @post_load
    def make_decision(self, data):
        return Decision(**data)

# Deserialization into object
dec_json = json.loads(json_data)
schema = DecisionSchema()
dec = schema.load(dec_json).data
print(dec.strategy)

# Dump results back to JSON
schema = DecisionSchema()
out = schema.dumps(dec)
print(out.data)

# Load back from dump
schema = DecisionSchema()
dec = schema.load(out).data
#print(dec.strategy)  # returns error currently
An "official" answer from the marshmallow development team can be found in this comment in the bug tracker:
Use a post_dump method.
from marshmallow import Schema, fields, post_dump

class BaseSchema(Schema):
    SKIP_VALUES = set([None])

    @post_dump
    def remove_skip_values(self, data, **kwargs):
        return {
            key: value for key, value in data.items()
            if value not in self.SKIP_VALUES
        }

class MySchema(BaseSchema):
    foo = fields.Field()
    bar = fields.Field()

sch = MySchema()
sch.dump({'foo': 42, 'bar': None}).data  # {'foo': 42}
As I point out in a further comment, there's a shortcoming: it will also remove None when the field's allow_none is True.
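If that shortcoming matters for your schema, a minimal sketch of a workaround (my addition, not from the linked comment, and assuming the declared fields are reachable via self.fields with an allow_none attribute, and that the output keys match the field names) is to skip None only for fields that do not allow it:

from marshmallow import Schema, fields, post_dump

class BaseSchema(Schema):
    @post_dump
    def remove_skip_values(self, data, **kwargs):
        # assumption: serialized keys match the declared field names
        nullable = {name for name, field in self.fields.items() if field.allow_none}
        return {key: value for key, value in data.items()
                if value is not None or key in nullable}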
As I pointed out in my comment above, this messes with the field order if you use
class Meta:
    fields = (
        'field1', 'field2'
    )
    ordered = True
To fix this I used this:
# Remove None fields
@pre_dump
def remove_skip_values(self, data):
    return {
        key: value for key, value in data.items()
        if value is not None
    }
This works for my dictionary of objects.

Inverse of Pandas json_normalize

I just discovered the json_normalize function which works great in taking a JSON object and giving me a pandas Dataframe. Now I want the reverse operation which takes that same Dataframe and gives me a json (or json-like dictionary which I can easily turn to json) with the same structure as the original json.
Here's an example: https://hackersandslackers.com/json-into-pandas-dataframes/.
They take a JSON object (or JSON-like python dictionary) and turn it into a dataframe, but I now want to take that dataframe and turn it back into a JSON-like dictionary (to later dump to json file).
I implemented it with a couple functions
def set_for_keys(my_dict, key_arr, val):
    """
    Set val at path in my_dict defined by the string (or serializable object) array key_arr
    """
    current = my_dict
    for i in range(len(key_arr)):
        key = key_arr[i]
        if key not in current:
            if i == len(key_arr) - 1:
                current[key] = val
            else:
                current[key] = {}
        else:
            if type(current[key]) is not dict:
                print("Given dictionary is not compatible with key structure requested")
                raise ValueError("Dictionary key already occupied")
        current = current[key]
    return my_dict

def to_formatted_json(df, sep="."):
    result = []
    for _, row in df.iterrows():
        parsed_row = {}
        for idx, val in row.items():
            keys = idx.split(sep)
            parsed_row = set_for_keys(parsed_row, keys, val)
        result.append(parsed_row)
    return result

# Where df was parsed from a json-dict using json_normalize
to_formatted_json(df, sep=".")
A simpler approach, using only one function...
def df_to_formatted_json(df, sep="."):
    """
    The opposite of json_normalize
    """
    result = []
    for idx, row in df.iterrows():
        parsed_row = {}
        for col_label, v in row.items():
            keys = col_label.split(sep)
            current = parsed_row
            for i, k in enumerate(keys):
                if i == len(keys) - 1:
                    current[k] = v
                else:
                    if k not in current.keys():
                        current[k] = {}
                    current = current[k]
        # save
        result.append(parsed_row)
    return result
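A quick round-trip sanity check (my own sketch with made-up data, assuming a pandas version where pd.json_normalize is available):

import pandas as pd

records = [{"id": 1, "location": {"city": "Oslo", "zip": "0150"}}]
df = pd.json_normalize(records)   # columns: id, location.city, location.zip
print(df_to_formatted_json(df))   # nested structure restored (numbers may come back as numpy scalars)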
df.to_json(path)
or
df.to_dict()
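Worth noting (an editorial aside, with a made-up one-row frame): these built-ins keep the flattened, dotted column names as keys rather than rebuilding the nesting, which is exactly what the other answers here address:

import pandas as pd

df = pd.json_normalize([{"a": {"b": 1}}])
print(df.to_dict(orient="records"))  # the dotted key 'a.b' stays flat instead of being re-nested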
I just implemented this using 2 functions.
Get a full list of fields from the DataFrame that are part of a nested field, keeping only the parent: if location.city.code fits the criteria, we only care about location.city. Sort it by the deepest level of nesting, i.e. location.city is nested deeper than location.
Starting with the most deeply nested parent field, find all of its child fields by searching in the column names. Create a field in the DataFrame for the parent field, which is a combination of all child fields (renamed so that they lose the nesting structure, e.g. location.city.code becomes code) converted to JSON and then loaded into a dictionary value. Finally, drop all of the child fields.
import json
from typing import List

import pandas as pd

def _get_nested_fields(df: pd.DataFrame) -> List[str]:
    """Return a list of nested fields, sorted by the deepest level of nesting first."""
    nested_fields = [*{field.rsplit(".", 1)[0] for field in df.columns if "." in field}]
    nested_fields.sort(key=lambda record: len(record.split(".")), reverse=True)
    return nested_fields

def df_denormalize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert a normalised DataFrame into a nested structure.
    Fields separated by '.' are considered part of a nested structure.
    """
    nested_fields = _get_nested_fields(df)
    for field in nested_fields:
        list_of_children = [column for column in df.columns if field in column]
        rename = {
            field_name: field_name.rsplit(".", 1)[1] for field_name in list_of_children
        }
        renamed_fields = df[list_of_children].rename(columns=rename)
        df[field] = json.loads(renamed_fields.to_json(orient="records"))
        df.drop(list_of_children, axis=1, inplace=True)
    return df
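A small usage sketch (my own, with invented column names) to show the shape of the result:

df = pd.json_normalize([{"id": 1, "location": {"city": "Oslo", "code": "NO"}}])
print(df_denormalize(df).to_dict(orient="records"))
# roughly: [{'id': 1, 'location': {'city': 'Oslo', 'code': 'NO'}}]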
Let me throw in my two cents.
After the backward conversion you might need to drop empty columns from your generated jsons, so I wanted to check whether val != np.nan. You can't do that directly; instead you need to check whether val == val, because np.nan is not equal to itself.
My version:
def to_formatted_json(df, sep="."):
    result = []
    for _, row in df.iterrows():
        parsed_row = {}
        for idx, val in row.items():
            # val == val is False only for NaN, so this skips missing values
            if val == val:
                keys = idx.split(sep)
                parsed_row = set_for_keys(parsed_row, keys, val)
        result.append(parsed_row)
    return result
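Equivalently (an editorial aside, not from the answer), pandas ships a helper that treats both NaN and None as missing, which can stand in for the val == val trick on scalars:

import numpy as np
import pandas as pd

for val in [1.5, np.nan, None]:
    print(val, pd.isna(val))  # True only for the missing values (np.nan and None)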
This is a solution which seems to work for me. It is designed to work on a dataframe with one row, but it can easily be looped over larger dataframes.
import numpy as np
import pandas as pd

class JsonRecreate():
    def __init__(self, df):
        self.df = df

    def pandas_to_json(self):
        df = self.df
        # determine the number of nesting levels
        number_levels = np.max([len(i.split('.')) for i in df.columns])
        # put all the nesting levels in a list
        levels = []
        for level_idx in np.arange(number_levels):
            levels.append(np.array([i.split('.')[level_idx] if len(i.split('.')) > level_idx else ''
                                    for i in df.columns.tolist()]))
        self.levels = levels
        return self.create_dict(upper_bound=self.levels[0].shape[0])

    def create_dict(self, level_idx=0, lower_bound=0, upper_bound=100):
        ''' Function to create the dictionary starting from a pandas dataframe generated by json_normalize '''
        levels = self.levels
        dict_ = {}
        # current nesting level
        level = levels[level_idx]
        # loop over all the relevant elements of the level (relevant w.r.t. its parent)
        for key in [i for i in np.unique(level[lower_bound: upper_bound]) if i != '']:
            # find where a particular key occurs in the level
            correspondence = np.where(level[lower_bound: upper_bound] == key)[0] + lower_bound
            # check if the value(s) corresponding to the key appears once (multiple times)
            if correspondence.shape[0] == 1:
                # if the occurrence is unique, append the value to the dictionary
                dict_[key] = self.df.values[0][correspondence[0]]
            else:
                # otherwise, redefine the relevant bounds and call the function recursively
                lower_bound_, upper_bound_ = correspondence.min(), correspondence.max() + 1
                dict_[key] = self.create_dict(level_idx + 1, lower_bound_, upper_bound_)
        return dict_
I tested it with a simple dataframe such as:
df = pd.DataFrame({'a.b': [1], 'a.c.d': [2], 'a.c.e': [3], 'a.z.h1': [-1], 'a.z.h2': [-2], 'f': [4], 'g.h': [5], 'g.i.l': [6], 'g.i.m': [7], 'g.z.h1': [-3], 'g.z.h2': [-4]})
The key order of the original json is not exactly preserved in the resulting json, but that can easily be handled if needed.
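For completeness, a usage sketch on that sample dataframe (my own addition; the leaf values come back as numpy scalars):

result = JsonRecreate(df).pandas_to_json()
print(result)
# roughly: {'a': {'b': 1, 'c': {'d': 2, 'e': 3}, 'z': {'h1': -1, 'h2': -2}},
#          'f': 4, 'g': {'h': 5, 'i': {'l': 6, 'm': 7}, 'z': {'h1': -3, 'h2': -4}}}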