Parsing incomplete / doubled / damaged JSON String

Parsing incomplete / doubled / damaged JSON String - json

In Python, how would one retrieve information from an incomplete or "overcomplete" JSON string?
Example (incomplete):
msg = '{"MESSAGE": {"MESSAGE_SIZE": "2230", "IMAGE_NUM'
Example (overcomplete):
msg = '{"MESSAGE": {"MESSAGE_SIZE": "2230", "IMAGE_NUMBER": "16227"}}{"MESSA'
Specifying the key MESSAGE_SIZE, I want to retrieve the integer 2230.
The position of the key inside the string may vary.
One working solution I found is the following (ugly) piece of code. I'm sure there are better solutions though:
import re

key = "\"MESSAGE_SIZE\":"
# Locate the key, skip whatever non-digit characters follow it (whitespace,
# the opening quote, ...) and capture the first run of digits.  The search
# works on the raw text, so it copes with truncated or doubled JSON, and it
# leaves ``msg`` itself untouched.
match = re.search(re.escape(key) + r"\D*(\d+)", msg)
if match is None:
    # Without this check a missing key would make the scan start at an
    # arbitrary offset and pick up unrelated digits (or fail in int("")).
    raise ValueError("no numeric value found for key " + key)
len_str = match.group(1)
len_int = int(len_str)

Interestingly, there is an npm module to untruncate JSON files, but I couldn't find anything similar for Python.
It shouldn't be too complicated to implement one though. This should work in most cases (also closing lists):
import json
import re
# Maps every opening bracket to the bracket that closes it.
closing_chars = {
    '{': '}',
    '[': ']',
}


def close_structs(json_string):
    """Return *json_string* with closers appended for every open object/array.

    The text is scanned once while tracking whether the scanner is inside a
    string literal, so brackets that are part of a string value (and escaped
    quotes inside it) are not mistaken for structure.  A closing bracket that
    has no matching opener is ignored instead of raising ``IndexError``.

    Args:
        json_string: (possibly truncated) JSON text.

    Returns:
        The input followed by the missing ``}`` / ``]`` characters, innermost
        structure first.  Balanced input is returned unchanged.
    """
    stack = []
    in_string = False
    escaped = False
    for char in json_string:
        if in_string:
            if escaped:
                # The character after a backslash never ends the string.
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char in closing_chars:
            stack.append(char)
        elif char in ('}', ']') and stack:
            stack.pop()
    return json_string + ''.join(closing_chars[opener] for opener in reversed(stack))
def untruncate_json(json_string):
    """Parse JSON text that was cut off, or that has trailing extra data.

    The text is handed to ``json.loads``; on failure the decoder's message
    selects one repair step, and parsing is retried:

    * "Extra data": keep only the first complete document (the
      "overcomplete" case, e.g. two messages glued together).
    * "Expecting ':' delimiter" at the end of the text: append ``: null``.
    * "Unterminated string starting at": append a closing quote.
    * "Expecting property name ..." / "Expecting value": drop a dangling
      trailing ``,`` or ``:``.
    * "Expecting ',' delimiter": close the open objects/arrays.

    Args:
        json_string: the damaged JSON text.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: if the error is not one of the above, or if a
            repair step would not change the text (the damage is not a plain
            truncation).  The current text is printed before re-raising.
    """
    while True:
        try:
            return json.loads(json_string)
        except json.JSONDecodeError as e:
            repaired = json_string
            if "Extra data" in e.msg:
                # e.pos is where the surplus text starts.
                repaired = json_string[:e.pos]
            elif "Expecting ':' delimiter" in e.msg:
                # Only a key at the very end can be completed; appending to a
                # mid-text error would grow the string forever.
                if not json_string[e.pos:].strip():
                    repaired = json_string + ": null"
            elif "Unterminated string starting at" in e.msg:
                repaired = json_string + '"'
            elif ("Expecting property name enclosed in double quotes" in e.msg
                    or "Expecting value" in e.msg):
                repaired = re.sub(r'[:,]\s*$', '', json_string)
            elif "Expecting ',' delimiter" in e.msg:
                repaired = close_structs(json_string)
            if repaired == json_string:
                # No progress is possible: give up instead of looping.
                print(json_string)
                raise
            json_string = repaired
# Example input: the payload is cut off in the middle of the key "TE...".
msg = '{"MESSAGE": {"MESSAGE_SIZE": "2230", "IMAGE_NUMBER": "16227", "TE'
d = untruncate_json(msg)
# getting the desired value (stored as a string in the JSON, hence int()):
print(int(d["MESSAGE"]["MESSAGE_SIZE"]))
Edit: handling cases "Expecting property name enclosed in double quotes" and "Expecting value" + stripping all kinds of whitespace chars with re.sub (in order to include \n)

Related

Python 2 to 3 = TypeError: descriptor 'find' for 'str' objects doesn't apply to a 'bytes' object

Hello we try to convert python 2 to 3 but we are stuck with an error.
Maybe someone has an idea.
Thanks
if episode_num is not None:
episode_num = str.encode(str(episode_num), 'ascii','ignore')
if str.find(episode_num, ".") != -1:
splitted = str.split(episode_num, ".")
if splitted[0] != "":
#TODO fix dk format
try:
season = int(splitted[0]) + 1
is_movie = None # fix for misclassification
if str.find(splitted[1], "/") != -1:
episode = int(splitted[1].split("/")[0]) + 1
elif splitted[1] != "":
episode = int(splitted[1]) + 1
except:
episode = ""
season = ""
if str.find(episode_num, ".") != -1:
TypeError: descriptor 'find' for 'str' objects doesn't apply to a 'bytes' object
https://www.dropbox.com/s/viszyzlpbl92yj0/source.py?dl=1

Python 3 is much more strict about mixing str and bytes strings. Just be consistent. When you use encode you create a bytes string.
if bytes.find(episode_num, b".") != -1:
Better, learn to use in:
if b"." in episode_num:

python 3 : deserialize nested dictionaries from sqlite

I have this sqlite3.register_converter function :
def str_to_dict(s: ByteString) -> Dict:
if s and isinstance(s, ByteString):
s = s.decode('UTF-8').replace("'", '"')
return json.loads(s)
raise TypeError(f'value : "{s}" should be a byte string')
which returns this exception text :
File "/usr/lib64/python3.7/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 30 (char 29)
when encounter with this string :
s = b"{'foo': {'bar': [('for', 'grid')]}}"
It seems that the issue comes from the nested list/tuple/dictionary but what I don't understand is that in the sqlite shell, the value is correctly returned with a select command :
select * from table;
whereas the same command issued from a python script returned the exception above :
class SqliteDb:
def __init__(self, file_path: str = '/tmp/database.db'):
self.file_path = file_path
self._db = sqlite3.connect(self.file_path, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
if self._db:
self._cursor = self._db.cursor()
else:
raise ValueError
# register data types converters and adapters
sqlite3.register_adapter(Dict, dict_to_str)
sqlite3.register_converter('Dict', str_to_dict)
sqlite3.register_adapter(List, list_to_str)
sqlite3.register_converter('List', str_to_list)
def __del__(self):
self._cursor.close()
self._db.close()
def select_from(self, table_name: str):
with self._db:
query = f'SELECT * FROM {table_name}'
self._cursor.execute(query)
if __name__ == '__main__':
try:
sq = SqliteDb()
selection_item = sq.select_from("table")[0]
print(f'selection_item : {selection_item}')
except KeyboardInterrupt:
print('\n')
sys.exit(0)
Note that the value is already saved in the database with no issue; only the selection causes this problem.
So, does anybody have a clue why?

Your input is really a Python dict literal, and contains structures such as the tuple ('for', 'grid') that cannot be directly parsed as JSON even after you replace single quotes with double quotes.
You can use ast.literal_eval instead to parse the input:
from ast import literal_eval
def str_to_dict(s: ByteString) -> Dict:
    """Convert a stored byte string holding a Python literal back to an object.

    The stored text is a Python literal (single quotes, tuples, ...), not
    JSON, so it is parsed with ``ast.literal_eval``, which only evaluates
    literals and never executes code.

    Args:
        s: the raw column value (``bytes`` or ``bytearray``).

    Returns:
        The evaluated literal, e.g. a dict with nested lists/tuples.

    Raises:
        TypeError: if *s* is not a byte string.
        ValueError, SyntaxError: if the decoded text is not a valid literal.
    """
    if not isinstance(s, (bytes, bytearray)):
        raise TypeError(f'value : "{s}" should be a byte string')
    return literal_eval(s.decode())

How to compare a character at a specific location in a string to identify the processing path

I'm trying to identify if the first character in a .txt/string is either a "{" or a "<". Depending on which will determine how the .txt is handled.
I'm working with two systems where one takes xml and the other takes json. So, as a file comes from one system it's converted and sent to the other. I've worked out the conversion for the files if they have the correct file extension but now I'm needing to be able to identify if a file is json or xml based off the content of a .txt file. I don't know why this would occur but was asked to include it.
Best way, as far as I can tell, is based off the first character within the file. If it's "<" then it is xml; if it's "{" then it's json. I'm not aware of a character that is only in json or only in xml that I can search for to identify the format that way.
The code below the # txt to xml and json is searching the whole file for the string which can give false positives which is why I'm trying to look at just the first character.
start_path = 'fileLocation'
for path,dirs,files in os.walk(start_path):
for fileName in files:
filePath = os.path.join(path,fileName)
# xml2json
if re.match('.*\.xml',fileName):
with open(filePath) as x:
xStr = x.read()
jStr = json.dumps(xmltodict.parse(xStr), indent=4)
with open("jsonOutput.json", 'w') as j:
j.write(jStr)
# json2xml
elif re.match('.*\.json',fileName):
with open(filePath) as j:
jStr = j.read()
xStr = xmltodict.unparse(json.loads(jStr), pretty=True)
with open('xmlOutput.xml', 'w') as x:
x.write(xStr)
# **Where I'm Having Trouble**
# txt to xml and json
elif re.match('.*\.txt',fileName):
with open(filePath) as t:
tStr = t.read()
if 'xml' in tStr:
with open('xmlOutput.xml', 'w') as x:
x.write(tStr)
elif '{' in tStr:
with open('jsonOutput.xml', 'w') as j:
j.write(tStr)
The ideal solution would replace the 'xml' and '{' full txt search with '<' and '{' checking the first character.
Any help is greatly appreciated and thank you.

If anyone is interested, I found a solution using readline(). This reads only the first line and if '{' is found it will process as a json, if there's an '<' it will process as xml. Thanks everyone for the help.
# unk to json & xml
else:
with open(filePath) as u:
fLine = u.readline() #This is only reading the first line.
uStr = u.read()
if '<' in fLine:
time = strftime('%Y%b%d %H%M', gmtime())
fName = fileName + ' ' + time + ".xml"
with open(fName, 'w') as x:
x.write(uStr)
elif '{' in fLine:
time = strftime('%Y%b%d %H%M', gmtime())
fName = fileName + ' ' + time + ".json"
with open(fName, 'w') as j:
j.write(uStr)

Inserting cipher text into mysql using python

So I have a program which encrypts a string using AES and generates the ciphertext as bytes.
I wish to store this cipher as it is in mysql database.
I found we could use VARBINARY data type in mysql to do so.
In what ways we could achieve so.
Here is my try to do so :
import ast
import mysql.connector
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
def encrypt(key, msg):
iv = get_random_bytes(16)
cipher = AES.new(key, AES.MODE_CFB, iv)
ciphertext = cipher.encrypt(msg) # Use the right method here
db = iv + ciphertext
print(db)
cursor.executemany(sql_para_query,db)
print(cursor.fetchone())
connection.commit()
return iv + ciphertext
def decrypt(key, ciphertext):
iv = ciphertext[:16]
ciphertext = ciphertext[16:]
cipher = AES.new(key, AES.MODE_CFB, iv)
msg = cipher.decrypt(ciphertext)
return msg.decode("utf-8")
if __name__ == "__main__":
connection = mysql.connector.connect(host = "localhost", database = "test_db", user = "sann", password = "userpass",use_pure=True)
cursor = connection.cursor(prepared = True)
sql_para_query = """insert into test1 values(UNHEX(%s)) """
ed = input("(e)ncrypt or (d)ecrypt: ")
key = str(1234567899876543)
if ed == "e":
msg = input("message: ")
s= encrypt(key, msg)
print("Encrypted message: ", s)
file = open("e_tmp","wb+")
file.write(s)
print(type(s))
elif ed == "d":
#smsg = input("encrypted message: ")
#file = open("e_tmp","rb")
#smsg = file.read()
#print(type(smsg))
sql_para_query = """select * from test1"""
cursor.execute(sql_para_query)
row = cursor.fetchone()
print(row)
#smsg = str(smsg)
#msg = ast.literal_eval(smsg)
#print(msg)
#print(type(msg))
#s=decrypt(key, msg)
#print("Decrypted message: ", s)
#print(type(s))
Error I'm getting :
Traceback (most recent call last): File
"/home/mr_pool/.local/lib/python3.6/site-packages/mysql/connector/cursor.py",
line 1233, in executemany
self.execute(operation, params) File "/home/mr_pool/.local/lib/python3.6/site-packages/mysql/connector/cursor.py",
line 1207, in execute
elif len(self._prepared['parameters']) != len(params): TypeError: object of type 'int' has no len()
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "tmp1.py", line 36, in
s= encrypt(key, msg) File "tmp1.py", line 14, in encrypt
cursor.executemany(sql_para_query,db) File "/home/mr_pool/.local/lib/python3.6/site-packages/mysql/connector/cursor.py",
line 1239, in executemany
"Failed executing the operation; {error}".format(error=err)) mysql.connector.errors.InterfaceError: Failed executing the operation;
object of type 'int' has no len()
Any other alternatives are also welcome.
My ultimate goal is to store the encrypted text in database.

I reproduced your error, but it seems there are more errors in your code.
The key as well as the message are strings, therefore I got this error:
TypeError: Object type <class 'str'> cannot be passed to C code
Which I fixed by encoding them in utf-8:
# line 38:
key = str(1234567899876543).encode("utf8")
# .... line 41:
s= encrypt(key, msg.encode("utf8"))
The UNHEX function in your SQL Query is not needed because we are entering the data as VARBINARY. You can change your statement to:
"""insert into test1 values(%s) """
The function executemany() can be replaced by execute() because you are only entering one statement. However I will write the solution for using both, execute or executemany.
insert with execute():
From the documentation:
cursor.execute(operation, params=None, multi=False)
iterator = cursor.execute(operation, params=None, multi=True)
This method executes the given database operation (query or command). The parameters found in the tuple or dictionary params are bound to the variables in the operation. Specify variables using %s or %(name)s parameter style (that is, using format or pyformat style). execute() returns an iterator if multi is True.
https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysqlcursor-execute.html
So we need just to build a tuple with your parameters by changing the cursor.execute line to:
cursor.execute(sql_para_query, (db, ))
insert with executemany():
From the documentation:
cursor.executemany(operation, seq_of_params)
This method prepares a database operation (query or command) and executes it against all parameter sequences or mappings found in the sequence seq_of_params.
https://dev.mysql.com/doc/connector-python/en/connector-python-api-mysqlcursor-executemany.html
Therefore we need to build a sequence with values you'd like to insert. In your case just one value:
cursor.executemany(sql_para_query, [(db, )])
To insert multiple values, you can add as many tuples into your sequence as you want.
full code:
import ast
import mysql.connector
from Crypto.Cipher import AES
from Crypto.Random import get_random_bytes
def encrypt(key, msg):
    """Encrypt *msg* with AES-CFB, store the result in the database, return it.

    A fresh random 16-byte IV is generated per call and prepended to the
    ciphertext, so the stored/returned value is ``iv + ciphertext``.  The
    insert uses the module-level ``cursor``, ``sql_para_query`` and
    ``connection`` and is committed immediately.

    Args:
        key: AES key as bytes.
        msg: plaintext as bytes.

    Returns:
        The IV followed by the ciphertext, as bytes.
    """
    nonce = get_random_bytes(16)
    payload = nonce + AES.new(key, AES.MODE_CFB, nonce).encrypt(msg)
    # The single value is wrapped in a one-element tuple, as execute() expects.
    cursor.execute(sql_para_query, (payload, ))
    connection.commit()
    return payload
def decrypt(key, ciphertext):
    """Decrypt a value produced by ``encrypt`` and return it as text.

    Args:
        key: AES key as bytes (the one used for encryption).
        ciphertext: the 16-byte IV followed by the AES-CFB ciphertext.

    Returns:
        The plaintext decoded as UTF-8.
    """
    # The first 16 bytes are the IV; everything after is the encrypted body.
    iv, body = ciphertext[:16], ciphertext[16:]
    plain = AES.new(key, AES.MODE_CFB, iv).decrypt(body)
    return plain.decode("utf-8")
if __name__ == "__main__":
    # NOTE(review): credentials and the AES key are hard-coded for the demo;
    # load them from configuration or a secret store in real use.
    connection = mysql.connector.connect(host = "localhost", database = "test_db", user = "sann", password = "userpass",use_pure=True)
    cursor = connection.cursor(prepared = True)
    sql_para_query = """insert into test1 values(%s) """
    ed = input("(e)ncrypt or (d)ecrypt: ")
    # AES needs the key as bytes; this 16-character string gives a 16-byte key.
    key = str(1234567899876543).encode("utf8")
    if ed == "e":
        msg = input("message: ")
        s = encrypt(key, msg.encode("utf8"))
        print("Encrypted message: ", s)
        # The context manager closes (and flushes) the file; it was left open before.
        with open("e_tmp", "wb+") as out_file:
            out_file.write(s)
        print(type(s))
    elif ed == "d":
        sql_para_query = """select * from test1"""
        cursor.execute(sql_para_query)
        row = cursor.fetchone()
        msg = row[0] # row is a tuple, therefore get first element of it
        print("Unencrypted message: ", msg)
        s = decrypt(key, msg)
        print("Decrypted message: ", s)
output:
#encrypt:
(e)ncrypt or (d)ecrypt: e
message: this is my test message !!
Encrypted message: b"\x8f\xdd\xe6f\xb1\x8e\xb51\xc1'\x9d\xbf\xb5\xe1\xc7\x87\x99\x0e\xd4\xb2\x06;g\x85\xc4\xc1\xd2\x07\xb5\xc53x\xb9\xbc\x03+\xa2\x95\r4\xd1*"
<class 'bytes'>
#decrypt:
(e)ncrypt or (d)ecrypt: d
Unencrypted message: bytearray(b"\x8f\xdd\xe6f\xb1\x8e\xb51\xc1\'\x9d\xbf\xb5\xe1\xc7\x87\x99\x0e\xd4\xb2\x06;g\x85\xc4\xc1\xd2\x07\xb5\xc53x\xb9\xbc\x03+\xa2\x95\r4\xd1*")
Decrypted message: this is my test message !!

Reading binary .SAVE files?

I was wondering how to open or read a binary file that has been saved in octave, with the extension .SAVE? I have tried opening it with MATLAB, using the 'load' function in octave, but nothing seems to be working. I'm trying to understand someone else's code and they have saved the output of a simulation in this file.

The Octave binary format is briefly described in the comments before the function read_binary_data() in load-save.cc.
Are you sure the file is in "Octave binary format"? The file extension ".SAVE" can be chosen arbitrarily, so this could also be CSV, gzipped, ...
You can run "file yourfile.SAVE" and paste the output, or check whether the first bytes of your file are "Octave-1-L" or "Octave-1-B".
If you want to use these files from a program other than GNU Octave, I would suggest loading them in Octave and saving them in another format. See "help save" for a list of supported formats.
EDIT:
Because the initial poster asked: Of course you can use GNU Octave from the terminal (there is no need for the GUI, and I don't know which part of the software you mean by "octave GUI"; see the Octave FAQ). Just install it for your platform (see the install instructions on wiki.octave.org) and run it.

Code for reading octave binary save files in python 2/3.
Tested on:
strings
single and double precision real and complex floats
various integer types
scalar, matrix and array
Unsupported:
struct
cell array
...
Python code:
# This code is public domain
from __future__ import print_function
import sys
from collections import OrderedDict
import numpy as np
if sys.version_info[0] > 2:
def tostr(s):
return s.decode('utf8')
def decode(s, encoding='utf8'):
return s.decode(encoding)
STR_ENCODING = 'utf8'
else:
def tostr(s):
return s
def decode(s, encoding='utf8'):
return unicode(s, encoding)
STR_ENCODING = None
DATA_TYPES = {
1: "scalar",
2: "matrix",
3: "complex scalar",
4: "complex matrix",
5: "old_string",
6: "range",
7: "string",
}
TYPE_CODES = {
0: "u1",
1: "u2",
2: "u4",
3: "i1",
4: "i2",
5: "i4",
6: "f4",
7: "f8",
8: "u8",
9: "i8",
}
DTYPES = {k: np.dtype(v) for k, v in TYPE_CODES.items()}
def loadoct(fd, encoding=STR_ENCODING):
    """
    Read an octave binary file from the file handle fd, returning
    an OrderedDict that maps each variable name to its value, in
    file order.  If encoding is not None then convert
    strings from bytes to unicode.  Default is STR_ENCODING, which
    is utf8 for python 3 and None for python 2, yielding arrays
    of type str in each dialect.

    fd must be opened in binary mode.  Raises NotImplementedError
    for a type name this reader does not handle.
    """
    # 10-byte magic header; its last byte (L or B) selects the byte order.
    magic = fd.read(10)
    assert(magic == b"Octave-1-L" or magic == b"Octave-1-B")
    endian = "<" if magic[-1:] == b"L" else ">"
    # Float type is 0: IEEE-LE, 1: IEEE-BE, 2: VAX-D, 3: VAX-G, 4: Cray
    # Not used since Octave assumes IEEE format floats.
    _float_format = fd.read(1)
    # All length fields are 4-byte signed integers in the file's byte order.
    len_dtype = np.dtype(endian + "i4")
    def read_len():
        # Read one length field; None signals that no bytes were left (EOF).
        len_bytes = fd.read(4)
        if not len_bytes:
            return None
        return np.frombuffer(len_bytes, len_dtype)[0]
    table = OrderedDict()
    while True:
        # Record header: name, doc string, global flag byte, type byte.
        name_length = read_len()
        if name_length is None:  # EOF
            break
        name = tostr(fd.read(name_length))
        doc_length = read_len()
        # doc and is_global are read to advance the stream; they are not
        # used further in this function.
        doc = tostr(fd.read(doc_length)) if doc_length else ''
        is_global = bool(ord(fd.read(1)))
        data_type = ord(fd.read(1))
        if data_type == 255:
            # Type byte 255: a length-prefixed type name follows.
            type_str = tostr(fd.read(read_len()))
        else:
            type_str = DATA_TYPES[data_type]
        #print("reading", name, type_str)
        if type_str.endswith("scalar"):
            if type_str == "scalar":
                # One byte selects the stored element type via DTYPES.
                dtype = DTYPES[ord(fd.read(1))]
            elif type_str == "complex scalar":
                # One byte is read and discarded before the value.
                _ = fd.read(1)
                dtype = np.dtype('complex128')
            elif type_str == "float complex scalar":
                _ = fd.read(1)
                dtype = np.dtype('complex64')
            else:
                # Strip the 7-character " scalar" suffix and use the rest as
                # a numpy dtype name.
                dtype = np.dtype(type_str[:-7])
            dtype = dtype.newbyteorder(endian)
            data = np.frombuffer(fd.read(dtype.itemsize), dtype)
            table[name] = data[0]
        elif type_str.endswith("matrix"):
            ndims = read_len()
            if ndims < 0:
                # Negative value: -ndims dimension lengths follow.
                ndims = -ndims
                dims = np.frombuffer(fd.read(4*ndims), len_dtype)
            else:
                # Non-negative value: it is used as the first dimension and
                # one more length is read as the second.
                dims = (ndims, read_len())
            count = np.prod(dims)
            if type_str == "matrix":
                dtype = DTYPES[ord(fd.read(1))]
            elif type_str == "complex matrix":
                _ = fd.read(1)
                dtype = np.dtype('complex128')
            elif type_str == "float complex matrix":
                _ = fd.read(1)
                dtype = np.dtype('complex64')
            else:
                # Strip the 7-character " matrix" suffix, as for scalars.
                dtype = np.dtype(type_str[:-7])
            dtype = dtype.newbyteorder(endian)
            data = np.frombuffer(fd.read(count*dtype.itemsize), dtype)
            # Note: Use data.copy() to make a modifiable array.
            # The flat data is reshaped in column-major (Fortran) order.
            table[name] = data.reshape(dims, order='F')
        elif type_str == "old_string":
            # A single length-prefixed string.
            data = fd.read(read_len())
            if encoding is not None:
                data = decode(data, encoding)
            table[name] = data
        elif type_str in ("string", "sq_string"):
            nrows = read_len()
            if nrows < 0:
                # Negative value: an N-d character array.  -nrows dimension
                # lengths follow, then the bytes in column-major order.
                ndims = -nrows
                dims = np.frombuffer(fd.read(4*ndims), len_dtype)
                count = np.prod(dims)
                fortran_order = np.frombuffer(fd.read(count), dtype='uint8')
                c_order = np.ascontiguousarray(fortran_order.reshape(dims, order='F'))
                # View the last axis as fixed-width byte strings, so each
                # run of dims[-1] bytes becomes one string element.
                data = c_order.view(dtype='|S'+str(dims[-1]))
                if encoding is not None:
                    data = np.array([decode(s, encoding) for s in data.flat])
                table[name] = data.reshape(dims[:-1])
            else:
                # nrows separate length-prefixed strings.
                data = [fd.read(read_len()) for _ in range(nrows)]
                if encoding is not None:
                    data = [decode(s, encoding) for s in data]
                table[name] = np.array(data)
        else:
            raise NotImplementedError("unknown octave type "+type_str)
        #print("read %s:%s"%(name, type_str), table[name])
    return table
def _dump(filename, encoding=STR_ENCODING):
    """Load an octave binary file and print each variable name with its value.

    Files whose name ends in ``.gz`` are opened through ``gzip``; all others
    are opened as plain binary files.

    Args:
        filename: path of the file to read.
        encoding: passed through to ``loadoct``.
    """
    import gzip
    # Both openers take the same (name, mode) arguments, so pick one up front.
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rb') as fd:
        table = loadoct(fd, encoding)
    for name, value in table.items():
        print(name, value)
if __name__ == "__main__":
#_dump(sys.argv[1], encoding='utf8') # unicode
#_dump(sys.argv[1], encoding=None) # bytes
_dump(sys.argv[1]) # str, encoding=STR_ENCODING

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Parsing incomplete / doubled / damaged JSON String - json

Related

Python 2 to 3 = TypeError: descriptor 'find' for 'str' objects doesn't apply to a 'bytes' object

python 3 : deserialize nested dictionaries from sqlite

How to compare a character at a specific location in a string to identify the processing path

Inserting cipher text into mysql using python

Reading binary .SAVE files?

Categories

Resources