Hello, getting an error parsing a JSON file - json

I need some help parsing JSON data.
My JSON file contains this:
{
"message": "{\"gender\":\"MADAME\",\"Polo\":\"POTA\",\"last_name\":\"pato\",\"email\":\"pato.pota#mailler.com\",\"subject\":\"toto claim\",\"sub_subject\":\"Claim insurance car\",\"question\":\"claim for red car\",\"store\":\"claiming for incident\"}",
"context": [
],
"level": 200,
"level_name": "INFO",
"channel": "mailer",
"datetime": {
"date": "2016-09-19 11:00:26.795353",
"timezone_type": 3,
},
"extra": [
]
}
Python Code.
import os
import json
def Get_running_dir():
path = os.getcwd()
file = path + "\json_data.txt"
print(file)
with open(file, 'r') as f:
data = f.read()
data_json = json.loads(data)
print(data_json)
print(type(data_json))
Get_running_dir()
The issue: print(type(data_json)) shows that this is a dict, right?
But once I call print(data_json['message']['gender']), I get:
<class 'dict'>
Traceback (most recent call last):
File "Extract_log.py", line 29, in <module>
Get_running_dir()
File "Extract_log.py", line 25, in Get_running_dir
print(data_json['message']['gender'])
TypeError: string indices must be integers
I need some help to parse this file please help me.
Thanking you in advance.
Regards,

I figured out how to work with the JSON today.
import os
import json
def Get_running_dir():
    """Load ``json_data.txt`` from the working directory and print message fields.

    The file holds a JSON object whose ``"message"`` value is itself a JSON
    document stored as a string, so it must be decoded a second time before
    its keys can be used.

    Raises:
        FileNotFoundError: ``json_data.txt`` is not in the working directory.
        json.JSONDecodeError: the file or the embedded message is not valid JSON.
        KeyError: ``"message"`` or one of the printed fields is missing.
    """
    # os.path.join inserts the platform's separator; the literal
    # "\json_data.txt" relied on the invalid "\j" escape and a Windows path.
    json_path = os.path.join(os.getcwd(), "json_data.txt")
    print(json_path)
    with open(json_path, 'r') as f:
        data_json = json.loads(f.read())

    # My error was here: data_json['message'] is still a plain string ...
    print(data_json['message'])
    # ... so decode it once more to turn that string into a dict.
    msg = json.loads(data_json['message'])
    # This way its keys can be accessed, like this.
    # NOTE(review): each lookup raises KeyError if the message lacks that
    # key -- confirm 'first_name' is present in your data.
    print(msg['gender'], msg['first_name'], msg['last_name'])

Related

Download PDFs from multiple JSON URLs using Python

I have been tasked to create a method to download multiple PDFs from URLs included in JSON files. Probably 1 URL per JSON file, with approx 500k JSON files to process in any one batch.
Here's a sample of the JSON file:
{
"from": null,
"id": "sfm_c4kjatol7u8psvqfati0",
"imb_code": "897714123456789",
"mail_date": null,
"mail_type": "usps_first_class",
"object": "self_mailer",
"press_proof": "https://lob-assets.com/sid-self_mailers/sfm_c4kjatol7u8psvqfati0.pdf?version=v1&expires=1635274615&signature=AZlb0MSzZPuCjtKFkXRr_OoHzDzEy23UqzmKFWs5bycKCEcIyfe2od58zHzfP1a-iW5d9azFYUT1PnosqKcvBg",
"size": "11x9_bifold",
"target_delivery_date": null,
"to": {
"address_city": "SAN FRANCISCO",
"address_country": "UNITED STATES",
"address_line1": "185 BERRY ST STE 6100",
"address_line2": null,
"address_state": "CA",
"address_zip": "94107-1741",
"company": "Name.COM",
"name": "EMILE ILES"
}
}
The JSON file is converted to CSV and the URL is downloaded.
Here's what I have been trying to use but it is not working. What am I missing?
Import urllib.request, json, requests, os, csvkit
from itertools import islice
from pathlib import Path
path = Path("/Users/MyComputer/Desktop/self_mailers")
paths = [i.path for i in islice(os.scandir(path), 100)]
in2csv data.json > data.csv
with open('*.json', 'r') as f:
urls_dict = json.load(f)
urls_dict = urls_dict[0]
itr = iter(urls_dict)
len(list(itr))
f.write(r.pdf)
Why are you converting your JSON to a CSV?
By the way, if you are unsure where the URLs are in the JSON files, I would do this:
import os
import json
from rethreader import Rethreader
from urllib.parse import urlparse
from urllib.request import urlretrieve
def download_pdf(url):
# use urlparse to find the pdf name
filename = urlparse(url).path.rsplit('/')[-1]
urlretrieve(url, filename)
# use multi-threading for faster downloads
downloader = Rethreader(download_pdf).start()
def verify_url(value):
    """Return True if *value* is a string that parses as a URL.

    A value counts as a URL only when ``urlparse`` yields a non-empty
    scheme, network location and path.

    Args:
        value: any JSON value (str, number, dict, list, None, ...).

    Returns:
        bool: True when *value* looks like a downloadable URL, else False.
    """
    if not isinstance(value, str):
        # if the value is not a string, it's not a URL either
        return False
    try:
        parsed_url = urlparse(value)
    except (AttributeError, ValueError):
        # urlparse raises ValueError for malformed input such as an
        # unbalanced IPv6 bracket ("http://[::1/x"); treat it as "not a URL"
        # instead of aborting the whole scan
        return False
    # a URL needs all three parts: scheme, host and path
    return bool(parsed_url.scheme and parsed_url.netloc and parsed_url.path)
def parse_data(data):
for value in data.values():
if verify_url(value):
downloader.add(value)
for file in os.listdir():
with open(file) as fp:
try:
json_data = json.load(fp)
except (json.JSONDecodeError, UnicodeDecodeError):
# this file is not a json; let's skip to the next one
continue
parse_data(json_data)
# quit the downloader after downloading the files
downloader.quit()
If you know which keys may hold the URLs, I would do it like this:
# The other parts same as before
def parse_data(data):
for key in ['possible_key', 'another_possible_key']:
if key in data and verify_url(data[key]):
downloader.add(data[key])

Trying to parse access.log

Good afternoon, I'm trying to find the top 10 IP addresses in access.log (the standard log of the Apache server).
There is a code like this:
import argparse
import json
import re
from collections import defaultdict, Counter
parser = argparse.ArgumentParser(description='parser script')
parser.add_argument('-f', dest='logfile', action='store', default='access.log')
args = parser.parse_args()
regul_ip = (r"^(?P<ips>.*?)")
regul_method = (r"\"(?P<request_method>GET|POST|PUT|DELETE|HEAD)")
def req_by_method():
dict_ip = defaultdict(lambda: {"GET": 0, "POST": 0, "PUT": 0, "DELETE": 0, "HEAD": 0})
with open(args.logfile) as file:
for index, line in enumerate(file.readlines()):
try:
ip = re.search(regul_ip, line).group()
method = re.search(regul_method, line).groups()[0]
return Counter(dict_ip).most_common(10)
except AttributeError:
pass
dict_ip[ip][method] += 1
print(json.dumps(dict_ip, indent=4))
with open("final_log.json", "w") as jsonfile:
json.dump(dict_ip, jsonfile, indent=5)
When the code is executed, I only get: []
How can I fix this code to make it work?
I also need to output to the final json file a set of such lines: "ip", "method", "status code", "url" and the duration of the request

Reading a JSON file using Python - JSONDecodeError Extra Data

I'm following along with https://realpython.com/python-json/. I'm using Python 3.8, on a Windows 10 machine, using IDLE.
I deviated a bit from the example.
>>> import json
>>>
>>> data1 = {
'president': {
'name': 'dumb-bell beetlebox',
'species': 'betelgusian'
}
}
>>> data2 = {
'emperor': {
'name': 'Ezekiel Wheel',
'species': 'klingon'
}
}
>>> data3 = {
'king': {
'name': 'tech shiny',
'species': 'two hundred'
}
}
>>>
>>> with open('data1_file.json', 'w') as wf:
json.dump(data1, wf)
>>> with open('data1_file.json', 'a') as af:
af.write('\n')
json.dump(data2, af)
af.write('\n')
json.dump(data3, af)
1
1
This created the json file, with the data per line.
I then tried to read it.
>>> with open('data1_file.json', 'r') as rf:
data = json.load(rf)
Traceback (most recent call last):
File "<pyshell#139>", line 2, in <module>
data4 = json.load(rf)
File "D:\Program Files (x86)\Python38-32\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
File "D:\Program Files (x86)\Python38-32\lib\json\__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "D:\Program Files (x86)\Python38-32\lib\json\decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 73)
On the advice from a friend, who said there may have been extraneous data in the file -
>>> print(repr(open('data1_file.json').read()))
'{"president": {"name": "dumb-bell beetlebox", "species": "betelgusian"}}\n{"emperor": {"name": "Ezekiel Wheel", "species": "klingon"}}\n{"king": {"name": "tech shiny", "species": "two hundred"}}'
Any help would be appreciated. Thank you.
The problem is that json.load does not decode multiple JSON objects from one file. You'll probably want to place the data in a single array, or read the file line by line and call json.loads on each line. Check out this link for more info

Unable to get the content of Amazon S3 file and edit that file with python and boto3

I am trying to get the data from a file in Amazon S3, manipulate the content and then save it to another bucket.
import json
import urllib.parse
import boto3
print('Loading function')
s3 = boto3.client('s3')
def lambda_handler(event, context):
bucket = event['Records'][0]['s3']['bucket']['name']
file_name = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
s3_object = s3.get_object(Bucket=bucket, Key=file_name)
file_content = s3_object['Body'].read()
initial_data = json.load(file_content)
# some file manipulation comes here
data=json.dumps(initial_data, ensure_ascii=False)
s3.put_object(Bucket="new bucket name", Body=data, Key=file_name)
The error message leads me to think that this has something to do with encoding:
Response:
{
"errorMessage": "'bytes' object has no attribute 'read'",
"errorType": "AttributeError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 25, in lambda_handler\n data_initlal = json.load(file_content)\n",
" File \"/var/lang/lib/python3.8/json/__init__.py\", line 293, in load\n return loads(fp.read(),\n"
]
}
Additionally, if I remove the following line from my code:
initial_data = json.load(file_content)
I get the error:
Response:
{
"errorMessage": "Object of type bytes is not JSON serializable",
"errorType": "TypeError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 29, in lambda_handler\n data=json.dumps(file_content, ensure_ascii=False)\n",
" File \"/var/lang/lib/python3.8/json/__init__.py\", line 234, in dumps\n return cls(\n",
" File \"/var/lang/lib/python3.8/json/encoder.py\", line 199, in encode\n chunks = self.iterencode(o, _one_shot=True)\n",
" File \"/var/lang/lib/python3.8/json/encoder.py\", line 257, in iterencode\n return _iterencode(o, 0)\n",
" File \"/var/lang/lib/python3.8/json/encoder.py\", line 179, in default\n raise TypeError(f'Object of type {o.__class__.__name__} '\n"
]
}
The file that I am trying to edit is in JSON format, and the output should also be JSON.
This line:
initial_data = json.load(file_content)
Should be:
initial_data = json.loads(file_content)
Alternatively, replace these two lines:
file_content = s3_object['Body'].read()
initial_data = json.load(file_content)
with:
initial_data = json.load(s3_object['Body'])
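The difference is json.load() vs json.loads(): json.load() expects a file-like object with a .read() method, while json.loads() takes the already-read str or bytes.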
The file_content that you are trying to read is utf-8 encoded. You need to decode that before converting it to json.
Try this:
initial_data = json.loads(file_content.decode('utf-8'))

Why am I getting error 'TypeError: string indices must be integers'

I have the JSON file below, and I am getting an error
Traceback (most recent call last):
File "test11.py", line 10, in <module>
print(driver['id'])
TypeError: string indices must be integers
{"drivers":
[
{
"id": "91907",
"groupId": "9039",
"vehicleId": "11111",
"currentVehicleId": "11111",
"username": "ablahblah",
"name": "Andrew Blahblah"
}
]
}
I have written the following code to extract values from the
file:
import json
from pprint import pprint
with open('driver.json', 'r') as f:
drivers_dict = json.load(f)
for driver in drivers_dict:
print(driver['id'])
print(driver['groupId'])
print(driver['vehicleId'])
print(driver['username'])
print(driver['name'])
I need help to understand why I am getting the error and how to fix it.
Ultimately, the problem is that looping over a dict gives you the keys.
>>> [i for i in drivers_dict]
['drivers']
I think you just got your json layout confused. This works:
import json
# Load the whole document; the top level is a dict with a single
# "drivers" key, not the list of drivers itself.
with open('driver.json') as f:
    j = json.load(f)

# Iterate the list stored under "drivers" -- looping over the dict itself
# would yield its keys (strings), which is what caused the TypeError.
drivers_list = j["drivers"]
for driver in drivers_list:
    # BTW you can DRY this part:
    for key in ['id', 'groupId', 'vehicleId', 'username', 'name']:
        print(driver[key])
Also consider checking whether the id is a string or an integer:
isinstance(s, str)