tinydb: Empty query was evaluated

Among other things, the code below stores student IDs in a TinyDB database after checking whether the new ID is already present.
Here is the code:
#enroll.py
# USAGE
# python enroll.py --id S1901 --name somename --conf config/config.json
# import the necessary packages
from pyimagesearch.utils import Conf
from imutils.video import VideoStream
from tinydb import TinyDB
from tinydb import where
import face_recognition
import argparse
import imutils
import pyttsx3
import time
import cv2
import os
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--id", required=True,
help="Unique student ID of the student")
ap.add_argument("-n", "--name", required=True,
help="Name of the student")
ap.add_argument("-c", "--conf", required=True,
help="Path to the input configuration file")
args = vars(ap.parse_args())
# load the configuration file
conf = Conf(args["conf"])
# initialize the database and student table objects
db = TinyDB(conf["db_path"])
studentTable = db.table("student")
# retrieve student details from the database
student = studentTable.search(where(args["id"]))
# check if an entry for the student id does *not* exist, if so, then
# enroll the student
if len(student) == 0:
    # initialize the video stream and allow the camera sensor to warmup
    print("[INFO] warming up camera...")
    vs = VideoStream(src=0).start()
    time.sleep(2.0)

    # initialize the number of face detections and the total number
    # of images saved to disk
    faceCount = 0
    total = 0

    # ask the student to stand in front of the camera
    print("{} please stand in front of the camera until you" \
        "receive further instructions".format(args["name"]))

    # initialize the status as detecting
    status = "detecting"

    # create the directory to store the student's data
    os.makedirs(os.path.join(conf["dataset_path"], conf["class"],
        args["id"]), exist_ok=True)

    # loop over the frames from the video stream
    while True:
        # grab the frame from the threaded video stream, resize it (so
        # face detection will run faster), flip it horizontally, and
        # finally clone the frame (just in case we want to write the
        # frame to disk later)
        frame = vs.read()
        frame = imutils.resize(frame, width=400)
        frame = cv2.flip(frame, 1)
        orig = frame.copy()

        # convert the frame from BGR (OpenCV ordering) to dlib
        # ordering (RGB) and detect the (x, y)-coordinates of the
        # bounding boxes corresponding to each face in the input image
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        boxes = face_recognition.face_locations(rgb,
            model=conf["detection_method"])

        # loop over the face detections
        for (top, right, bottom, left) in boxes:
            # draw the face detections on the frame
            cv2.rectangle(frame, (left, top), (right, bottom),
                (0, 255, 0), 2)

            # check if the total number of face detections are less
            # than the threshold, if so, then skip the iteration
            if faceCount < conf["n_face_detection"]:
                # increment the detected face count and set the
                # status as detecting face
                faceCount += 1
                status = "detecting"
                continue

            # save the frame to correct path and increment the total
            # number of images saved
            p = os.path.join(conf["dataset_path"], conf["class"],
                args["id"], "{}.png".format(str(total).zfill(5)))
            cv2.imwrite(p, orig[top:bottom, left:right])
            total += 1

            # set the status as saving frame
            status = "saving"

        # draw the status on to the frame
        cv2.putText(frame, "Status: {}".format(status), (10, 20),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        # show the output frame
        cv2.imshow("Frame", frame)
        cv2.waitKey(1)

        # if the required number of faces are saved then break out from
        # the loop
        if total == conf["face_count"]:
            # let the student know that face enrolling is over
            print("Thank you {} you are now enrolled in the {} " \
                "class.".format(args["name"], conf["class"]))
            break

    # insert the student details into the database
    studentTable.insert({args["id"]: [args["name"], "enrolled"]})

    # print the total faces saved and do a bit of cleanup
    print("[INFO] {} face images stored".format(total))
    print("[INFO] cleaning up...")
    cv2.destroyAllWindows()
    vs.stop()

# otherwise, an entry for the student id exists
else:
    # get the name of the student
    name = student[0][args["id"]][0]
    print("[INFO] {} has already been enrolled...".format(
        name))

# close the database
db.close()
ISSUE:
When I run this code for the first time, everything works fine.
>> python3 enroll.py --id S1111 --name thor --conf config/config.json
I get my ID in my JSON file as shown below:
{"student": {"1": {"S1111": ["thor", "enrolled"]}}}
But when I try to add another ID:
python3 enroll.py --id S1112 --name hulk --conf config/config.json
I get the following error:
ERROR:
Traceback (most recent call last):
File "enroll.py", line 35, in <module>
student = studentTable.search(where(args["id"]))
File "/usr/lib/python3.5/site-packages/tinydb/table.py", line 222, in search
docs = [doc for doc in self if cond(doc)]
File "/usr/lib/python3.5/site-packages/tinydb/table.py", line 222, in <listcomp>
docs = [doc for doc in self if cond(doc)]
File "/usr/lib/python3.5/site-packages/tinydb/queries.py", line 59, in __call__
return self._test(value)
File "/usr/lib/python3.5/site-packages/tinydb/queries.py", line 136, in notest
raise RuntimeError('Empty query was evaluated')
RuntimeError: Empty query was evaluated
If I change my table name from "student" to something else, it again stores an ID only the first time and then gives the same error. I'm not sure what's wrong here.
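For what it's worth, the error points at the query itself: where(args["id"]) only selects a field and never attaches a test, so it is an incomplete query. On the very first run the table is empty, so search() never evaluates the condition and nothing fails; once a document exists, the condition is called and TinyDB raises "Empty query was evaluated". A minimal sketch of one way around it, keeping the {id: [name, "enrolled"]} document layout from the script, is to test explicitly for the key's existence:
# retrieve student details from the database
# where(args["id"]) alone is an incomplete query; .exists() turns it into a real condition
student = studentTable.search(where(args["id"]).exists())
Alternatively, storing records as {"id": ..., "name": ...} and searching with where("id") == args["id"] avoids using the student ID itself as a field name.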

Related

results.pandas().xyxy[0] is only outputting data for one image rather than four

I am trying to store the output in a variable so that it can be used later for more processing, but to get to that stage I am facing a challenge with this code:
######INFERENCE ON P6 MODELS*****************************************************************************
import torch
import glob
from natsort import natsorted
import cv2
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import pandas as pd
import os
model = torch.hub.load('/Users/yolov5', 'custom', path='/User/yolov5/runs/train/exp11/weights/best.pt', source='local', force_reload=True) # custom trained model
model.conf = 0.25 # NMS confidence threshold
Path = 'User/yolov5/data/images/'
imgs = [cv2.imread(file) for file in natsorted(glob.glob(Path+"/*.jpg"))]
# Inference
results = model(imgs,size=640)
# Results:
#results.save() # or .print() .show(), .save(), .crop(), .pandas(), etc.
results.pandas()
#print(results.print())
#print(results.pandas().xyxy[:])
# results.show()
#results.pandas().xyxy[0]
#print(results)
#print(results.pandas().xyxy[0])
# dfm = pd.DataFrame(results.pandas().xyxy[0])#, columns = ['Loss','Accuracy']
# # #dfm['Classes'] = classes.tolist()
# predict_labs = 'pred_yolo_individual.csv'
# with open(predict_labs, mode='w') as fd:
# dfm.to_csv(fd)
#results.print() # or .show(), .save(), .crop(), .pandas(), etc.
#results.render()
results.xyxy[0] # im predictions (tensor)
results.pandas().xyxy[0]
results.print()
# pred = results.pandas().xyxy[0]
# for index, row in pred.iterrows():
# print(row['class'], row['confidence'], row['name'])
As you can see, I am trying lots of things to get this going, but I am missing some major detail and not getting the desired output.
I would like to get output in the format below for the folder of images that I have:
# Results
results.print() # or .show(), .save(), .crop(), .pandas(), etc.
results.xyxy[0] # im predictions (tensor)
results.pandas().xyxy[0] # im predictions (pandas)
# xmin ymin xmax ymax confidence class name
# 0 749.50 43.50 1148.0 704.5 0.874023 0 person
# 2 114.75 195.75 1095.0 708.0 0.624512 0 person
# 3 986.00 304.00 1028.0 420.0 0.286865 27 tie
*** THE ISSUE IS ***
When I use the same code, I am only getting one output.
If I do a
print(results.pandas().xyxy[0:])
I see the output demonstrated below, but not in the structured format as above:
YOLOv5 🚀 v7.0-72-g064365d Python-3.10.6 torch-1.13.1 CPU
Fusing layers...
Model summary: 212 layers, 20856975 parameters, 0 gradients, 47.9 GFLOPs
Adding AutoShape...
[ xmin ymin xmax ymax confidence class name
0 539.859314 119.92907 602.884216 245.533752 0.353711 1 Stabbing, Empty DataFrame
Columns: [xmin, ymin, xmax, ymax, confidence, class, name]
Index: [], Empty DataFrame
Columns: [xmin, ymin, xmax, ymax, confidence, class, name]
Index: [], xmin ymin xmax ymax confidence class name
0 709.833496 66.843300 1025.770752 800.782593 0.771696 1 Stabbing
1 84.628845 4.153772 461.863617 833.189636 0.632551 1 Stabbing]
Please assist, and thank you in advance for acknowledging my issues.
I would suggest using the following code to get the desired output:
# Inference
results = model(imgs, size=640)
# Results:
preds = results.pandas().xyxy[0] # im predictions (pandas)
# print(preds)
# Create dataframe and write to file
dfm = pd.DataFrame(preds)
dfm.columns = ['xmin', 'ymin', 'xmax', 'ymax', 'confidence', 'class', 'name']
predict_labs = 'pred_yolo_individual.csv'
dfm.to_csv(predict_labs, index=False)
This will create a CSV file containing all the predictions in the desired format.
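Note that this writes only results.pandas().xyxy[0], i.e. the detections for the first image. Since results.pandas().xyxy is a list with one DataFrame per input image, a small extension of the same idea (a sketch, assuming the same results object from model(imgs, size=640)) collects every image's predictions into a single CSV:
import pandas as pd

# one DataFrame per input image; tag each row with its image index
frames = []
for i, df in enumerate(results.pandas().xyxy):
    df = df.copy()
    df["image_index"] = i
    frames.append(df)

all_preds = pd.concat(frames, ignore_index=True)
all_preds.to_csv("pred_yolo_all_images.csv", index=False)
print(all_preds)
The image_index column is an assumption added here purely so rows can be traced back to their source image.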

Python script that can auto-annotate the images

I am using the https://github.com/mdhmz1/Auto-Annotate repo. I have tried to custom-train my own dataset, which has its own COCO JSON format file.
When I try to run
python3 customTrain.py train --dataset=path/to/dir --weights=coco
I get the following error:
Traceback (most recent call last):
File "customTrain.py", line 279, in
train(model)
File "customTrain.py", line 179, in train
dataset_train.load_custom(args.dataset, "train")
File "customTrain.py", line 87, in load_custom
annotations = [a for a in annotations if a['regions']]
File "customTrain.py", line 87, in
annotations = [a for a in annotations if a['regions']]
TypeError: list indices must be integers or slices, not str
My customTrain.py looks like the following:
import os
import sys
import json
import datetime
import numpy as np
import skimage.draw
# Root directory of the project
ROOT_DIR = "/home/hiwi/Auto-Annotate"
# Import Mask RCNN
sys.path.append(ROOT_DIR) # To find local version of the library
from mrcnn.config import Config
from mrcnn import model as modellib, utils
# Path to trained weights file
COCO_WEIGHTS_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Directory to save logs and model checkpoints, if not provided
# through the command line argument --logs
DEFAULT_LOGS_DIR = os.path.join(ROOT_DIR, "logs")
############################################################
# Configurations
############################################################
class CustomConfig(Config):
"""Configuration for training on the toy dataset.
Derives from the base Config class and overrides some values.
"""
# Give the configuration a recognizable name
NAME = "custom"
IMAGES_PER_GPU = 1
# Number of classes (including background)
NUM_CLASSES = 1 + 2 # Background + 2 classes
# Number of training steps per epoch
STEPS_PER_EPOCH = 100
# Skip detections with < 90% confidence
DETECTION_MIN_CONFIDENCE = 0.9
############################################################
# Dataset
############################################################
class CustomDataset(utils.Dataset):
def load_custom(self, dataset_dir, subset):
"""Load a subset of the Custom dataset.
dataset_dir: Root directory of the dataset.
subset: Subset to load: train or val
"""
# Add classes. We have only one class to add.
self.add_class("custom", 0, "Primary_Track")
self.add_class("custom", 1, "Secondary_Track")
# Train or validation dataset?
assert subset in ["train", "val"]
dataset_dir = os.path.join(dataset_dir, subset)
# Load annotations
# VGG Image Annotator (up to version 1.6) saves each image in the form:
# { 'filename': '28503151_5b5b7ec140_b.jpg',
# 'regions': {
# '0': {
# 'region_attributes': {},
# 'shape_attributes': {
# 'all_points_x': [...],
# 'all_points_y': [...],
# 'name': 'polygon'}},
# ... more regions ...
# },
# 'size': 100202
# }
# We mostly care about the x and y coordinates of each region
# Note: In VIA 2.0, regions was changed from a dict to a list.
annotations1 = json.load(open(os.path.join(dataset_dir, "train.json")))
annotations = list(annotations1.values()) # don't need the dict keys
# The VIA tool saves images in the JSON even if they don't have any
# annotations. Skip unannotated images.
annotations = [a for a in annotations if a['regions']]
# Add images
for a in annotations:
# Get the x, y coordinaets of points of the polygons that make up
# the outline of each object instance. These are stores in the
# shape_attributes (see json format above)
# The if condition is needed to support VIA versions 1.x and 2.x.
if type(a['regions']) is dict:
polygons = [r['shape_attributes'] for r in a['regions'].values()]
else:
polygons = [r['shape_attributes'] for r in a['regions']]
#labelling each class in the given image to a number
custom = [s['region_attributes'] for s in a['regions']]
num_ids=[]
#Add the classes according to the requirement
for n in custom:
try:
if n['name']=="Primary_Track":
num_ids.append(0)
elif n['name']=='Secondary_Track':
num_ids.append(1)
except:
pass
# load_mask() needs the image size to convert polygons to masks.
# Unfortunately, VIA doesn't include it in JSON, so we must read
# the image. This is only managable since the dataset is tiny.
image_path = os.path.join(dataset_dir, a['filename'])
image = skimage.io.imread(image_path)
height, width = image.shape[:2]
self.add_image(
"custom",
image_id=a['filename'], # use file name as a unique image id
path=image_path,
width=width, height=height,
polygons=polygons,
num_ids=num_ids)
def load_mask(self, image_id):
"""Generate instance masks for an image.
Returns:
masks: A bool array of shape [height, width, instance count] with
one mask per instance.
class_ids: a 1D array of class IDs of the instance masks.
"""
# If not a custom dataset image, delegate to parent class.
image_info = self.image_info[image_id]
if image_info["source"] != "custom":
return super(self.__class__, self).load_mask(image_id)
num_ids = image_info['num_ids']
#print("Here is the numID",num_ids)
# Convert polygons to a bitmap mask of shape
# [height, width, instance_count]
info = self.image_info[image_id]
mask = np.zeros([info["height"], info["width"], len(info["polygons"])],
dtype=np.uint8)
for i, p in enumerate(info["polygons"]):
if p['name'] == 'polygon':
# Get indexes of pixels inside the polygon and set them to 1
rr, cc = skimage.draw.polygon(p['all_points_y'], p['all_points_x'])
else:
rr, cc = skimage.draw.rectangle((p['y'], p['x']), extent=(p['height'], p['width']))
rr[rr > mask.shape[0]-1] = mask.shape[0]-1
cc[cc > mask.shape[1]-1] = mask.shape[1]-1
mask[rr, cc, i] = 1
# Return mask, and array of class IDs of each instance. Since we have
# one class ID only, we return an array of 1s
num_ids = np.array(num_ids, dtype=np.int32)
return mask.astype(np.bool), num_ids.astype(np.bool), np.ones([mask.shape[-1]], dtype=np.int32)
#return mask.astype(np.bool), np.ones([mask.shape[-1]], dtype=np.int32)
def image_reference(self, image_id):
"""Return the path of the image."""
info = self.image_info[image_id]
if info["source"] == "Railtrack":
return info["path"]
else:
super(self.__class__, self).image_reference(image_id)
def train(model):
"""Train the model."""
# Training dataset.
dataset_train = CustomDataset()
dataset_train.load_custom(args.dataset, "train")
dataset_train.prepare()
# Validation dataset
dataset_val = CustomDataset()
dataset_val.load_custom(args.dataset, "val")
dataset_val.prepare()
# *** This training schedule is an example. Update to your needs ***
# Since we're using a very small dataset, and starting from
# COCO trained weights, we don't need to train too long. Also,
# no need to train all layers, just the heads should do it.
print("Training network heads")
model.train(dataset_train, dataset_val,
learning_rate=config.LEARNING_RATE,
epochs=30,
layers='heads')
############################################################
# Training
############################################################
if __name__ == '__main__':
import argparse
# Parse command line arguments
parser = argparse.ArgumentParser(
description='Train Mask R-CNN to detect custom objects.')
parser.add_argument("command",
metavar="<command>",
help="'train' or 'splash'")
parser.add_argument('--dataset', required=False,
metavar="/path/to/custom/dataset/",
help='Directory of the Custom dataset')
parser.add_argument('--weights', required=True,
metavar="/path/to/weights.h5",
help="Path to weights .h5 file or 'coco'")
parser.add_argument('--logs', required=False,
default=DEFAULT_LOGS_DIR,
metavar="/path/to/logs/",
help='Logs and checkpoints directory (default=logs/)')
parser.add_argument('--image', required=False,
metavar="path or URL to image",
help='Image to apply the color splash effect on')
parser.add_argument('--video', required=False,
metavar="path or URL to video",
help='Video to apply the color splash effect on')
args = parser.parse_args()
# Validate arguments
if args.command == "train":
assert args.dataset, "Argument --dataset is required for training"
elif args.command == "splash":
assert args.image or args.video,\
"Provide --image or --video to apply color splash"
print("Weights: ", args.weights)
print("Dataset: ", args.dataset)
print("Logs: ", args.logs)
# Configurations
if args.command == "train":
config = CustomConfig()
# Create model
if args.command == "train":
model = modellib.MaskRCNN(mode="training", config=config,
model_dir=args.logs)
# Select weights file to load
if args.weights.lower() == "coco":
weights_path = COCO_WEIGHTS_PATH
# Download weights file
if not os.path.exists(weights_path):
utils.download_trained_weights(weights_path)
elif args.weights.lower() == "last":
# Find last trained weights
weights_path = model.find_last()
elif args.weights.lower() == "imagenet":
# Start from ImageNet trained weights
weights_path = model.get_imagenet_weights()
else:
weights_path = args.weights
# Load weights
print("Loading weights ", weights_path)
if args.weights.lower() == "coco":
# Exclude the last layers because they require a matching
# number of classes
model.load_weights(weights_path, by_name=True, exclude=[
"mrcnn_class_logits", "mrcnn_bbox_fc",
"mrcnn_bbox", "mrcnn_mask"])
else:
model.load_weights(weights_path, by_name=True)
# Train or evaluate
if args.command == "train":
train(model)
else:
print("'{}' is not recognized. "
"Use 'train' or 'splash'".format(args.command))

How can I extract information quickly from 130,000+ JSON files located in S3?

I have an S3 bucket with over 130k JSON files, and I need to calculate numbers based on the data in those files (for example, counting how many speakers there are of each gender). I am currently using the S3 paginator and json.loads to read each file and extract information from it, but it takes a very long time to process such a large number of files (2-3 files per second). How can I speed up the process? Please provide working code examples if possible. Thank you.
Here is some of my code:
client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
result = paginator.paginate(Bucket='bucket-name',StartAfter='')
for page in result:
    if "Contents" in page:
        for key in page["Contents"]:
            keyString = key["Key"]
            s3 = boto3.resource('s3')
            content_object = s3.Bucket('bucket-name').Object(str(keyString))
            file_content = content_object.get()['Body'].read().decode('utf-8')
            json_content = json.loads(file_content)
            x = (json_content['dict-name'])
In order to use the code below, I'm assuming you understand pandas (if not, you may want to get to know it). Also, it's not clear whether your 2-3 files per second is just the read or includes part of the number crunching; nonetheless, multiprocessing will speed this up dramatically. The gist is to read all the files in (as dataframes), concatenate them, then do your analysis.
To be useful for me, I run this on spot instances that have lots of vCPUs and memory. I've found that network-optimized instances (like c5n - look for the n) and inf1 instances (for machine learning) are much faster at reading/writing than T or M instance types, as examples.
My use case is reading 2000 'directories' with roughly 1200 files in each and analyzing them. The multiprocessing is orders of magnitude faster than single-threaded processing.
File 1: your main script
# create script.py file
import os
from multiprocessing import Pool
from itertools import repeat
import pandas as pd
import json
from utils_file_handling import *
ufh = file_utilities() #instantiate the class functions - see below (second file)
bucket = 'your-bucket'
prefix = 'your-prefix/here/' # if you don't have a prefix pass '' (empty string or function will fail)
#define multiprocessing function - get to know this to use multiple processors to read files simultaneously
def get_dflist_multiprocess(keys_list, num_proc=4):
    with Pool(num_proc) as pool:
        df_list = pool.starmap(ufh.reader_json, zip(repeat(bucket), keys_list), 15)
        pool.close()
        pool.join()
    return df_list
#create your master keys list upfront; you can loop through all or slice the list to test
keys_list = ufh.get_keys_from_prefix(bucket, prefix)
# keys_list = keys_list[0:2000] # as an example
num_proc = os.cpu_count() #tells you how many processors your machine has; function above defaults to 4 unless given
df_list = get_dflist_multiprocess(keys_list, num_proc=num_proc) #collect dataframes for each file
df_new = pd.concat(df_list, sort=False)
df_new = df_new.reset_index(drop=True)
# do your analysis on the dataframe
File 2: class functions
#utils_file_handling.py
# create this in a separate file; name as you wish but change the import in the script.py file
import boto3
import json
import pandas as pd
#define client and resource
s3sr = boto3.resource('s3')
s3sc = boto3.client('s3')
class file_utilities:
    """file handling function"""

    def get_keys_from_prefix(self, bucket, prefix):
        '''gets list of keys and dates for given bucket and prefix'''
        keys_list = []
        paginator = s3sr.meta.client.get_paginator('list_objects_v2')
        # use Delimiter to limit search to that level of hierarchy
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
            keys = [content['Key'] for content in page.get('Contents')]
            print('keys in page: ', len(keys))
            keys_list.extend(keys)
        return keys_list

    def read_json_file_from_s3(self, bucket, key):
        """read json file"""
        bucket_obj = boto3.resource('s3').Bucket(bucket)
        obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
        data = obj['Body'].read().decode('utf-8')
        return data

    # you may need to tweak this for your ['dict-name'] example; I think I have it correct
    def reader_json(self, bucket, key):
        '''returns dataframe'''
        return pd.DataFrame(json.loads(self.read_json_file_from_s3(bucket, key))['dict-name'])
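As a follow-up on the '# do your analysis on the dataframe' step in the first file: once df_new holds one row per JSON record, the kind of count the question asks about is a single pandas call. A small sketch, assuming the records expose a 'gender' field (adjust the column name to your data):
# count speakers per gender across all files
gender_counts = df_new['gender'].value_counts()
print(gender_counts)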

Working with coroutines in Python Tornado Web Server

I am working on an autonomous car implementation for a web browser game with Python 2.x. I use the Tornado web server to run the game on localhost, and I post and receive JSON data from the game in the function called "FrameHandler"; I determine what the car should do in the "to_dict_faster()" function.
Here, my problem is that, with the help of a coroutine, I can write the data held in the speed_data variable to a text file at a specific time interval. However, I can't dump JSON data to the function at that same interval, because "FrameHandler" acts like a while True loop and always requests data to dump. What I am trying to do is send the desired actions, just like writing to the text file, at a specific time interval, without changing the flow of the frame handler, because that affects the FPS of the game.
I have been trying to figure out how to do this for a long time; any help would be great:
@gen.coroutine
def sampler():
    io_loop = tornado.ioloop.IOLoop.current()
    start = time.time()
    while True:
        with open("Sampled_Speed.txt", "a") as text_file:
            text_file.write("%d,%.2f\n" % (speed_data, ((time.time() - start))))
        yield gen.Task(io_loop.add_timeout, io_loop.time() + period)

class MainHandler(tornado.web.RequestHandler):
    def get(self):
        self.redirect("/static/v2.curves.html")

class FrameHandler(tornado.web.RequestHandler):
    def post(self):
        global speed_data
        data = json.loads(self.get_arguments("telemetry")[0])
        ar = np.fromstring(base64.decodestring(self.request.body), dtype=np.uint8)
        image = ar.reshape(hp.INPUT_SIZE, hp.INPUT_SIZE, hp.NUM_CHANNELS)
        left, right, faster, slower = data["action"]
        terminal, action, all_data, was_start = (
            data["terminal"],
            Action(left=left, right=right, faster=faster, slower=slower),
            data["all_data"],
            data["was_start"]
        )
        for i in range(len(all_data)):
            data_dict = all_data[i]
            speed_data = data_dict[u'speed']
            position_data = data_dict[u'position']
        result_action = agent.steps(image, 0.1, terminal, was_start, action, all_data)
        if speed_data < 4000:
            self.write(json.dumps(result_action.to_dict_faster()))
        else:
            self.write(json.dumps(result_action.to_dict_constant()))

def make_app():
    return tornado.web.Application([
        (r"/", MainHandler),
        (r"/frame", FrameHandler),
        (r"/static/(.*)", tornado.web.StaticFileHandler, {"path": static_path})
    ], debug=True)

if __name__ == "__main__":
    app = make_app()
    if "SERVER_PORT" in os.environ:
        port = int(os.environ["SERVER_PORT"])
    else:
        port = 8880
    print "LISTENING ON PORT: %d" % port
    app.listen(port)
    tornado.ioloop.IOLoop.current().run_sync(sampler)
    tornado.ioloop.IOLoop.current().start()
You can move the file writing to a different thread (using Tornado's run_on_executor, for example), so the Python interpreter will automatically switch from the Sampler to the main thread with FrameHandler on write. But you have to use a thread-safe speed_data variable; I've used the stdlib Queue.Queue as an example:
# imports implied by the snippet (added for completeness)
import concurrent.futures
import Queue

import tornado.web
from tornado import gen
from tornado.concurrent import run_on_executor
from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
from tornado.web import Application

class Handler(tornado.web.RequestHandler):
    @gen.coroutine
    def get(self):
        global speed_data
        speed_data.put("REALLY BIG TEST DATA\n")
        self.finish("OK")

class Sampler():
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

    def __init__(self, queue):
        self._q = queue

    @run_on_executor
    def write_sample(self):
        with open("foobar.txt", "w") as f:
            while True:
                data = self._q.get()
                f.write(data)

if __name__ == '__main__':
    application = Application(
        [("/status", Handler)]
    )
    server = HTTPServer(application)
    server.listen(8888)
    speed_data = Queue.Queue()
    smp = Sampler(speed_data)
    IOLoop.current().add_callback(smp.write_sample)
    IOLoop.current().start()
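To connect this back to the question's code, FrameHandler.post() would then hand the telemetry to the queue instead of assigning the global; a hypothetical sketch of that change:
# hypothetical adaptation of the question's FrameHandler to the queue approach
class FrameHandler(tornado.web.RequestHandler):
    def post(self):
        data = json.loads(self.get_arguments("telemetry")[0])
        for data_dict in data["all_data"]:
            speed_data.put("%d\n" % data_dict[u'speed'])  # thread-safe hand-off to Sampler
        # ... rest of the handler unchanged ...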

Parse from log file in Python

I have a log file with an arbitrary number of lines and JSON strings. All I need to extract is the one piece of JSON data in the log file that comes AFTER '_____GP D_____'. I do not want any other lines or JSON data from the file.
This is how my input file looks:
INFO:modules.gp.helpers.parameter_getter:_____GP D_____
{'from_time': '2017-07-12 19:57', 'to_time': '2017-07-12 20:57', 'consig_number': 'dup1', 'text': 'r155', 'mobile': None, 'email': None}
ERROR:modules.common.actionexception:ActionError: [{'other': 'your request already crossed threshold time'}]
{'from_time': '2016-07-12 16:57', 'to_time': '2016-07-12 22:57', 'consig_number': 'dup2', 'text': 'r15', 'mobile': None, 'email': None}
How do I find the JSON string only after '_____GP D_____'?
You can read your file line by line until you encounter _____GP D_____ at the end of a line, and when you do, pick up just the next line:
found_json = None

with open("input.log", "r") as f:  # open your log file
    for line in f:  # read it line by line
        if line.rstrip()[-14:] == "_____GP D_____":  # if a line ends with our string...
            found_json = next(f).rstrip()  # grab the next line
            break  # stop reading of the file, nothing more of interest
Then you can do with your found_json whatever you want, including parsing it, printing it, etc.
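One note on the parsing step: the line in the sample log is a Python dict literal (single quotes, None) rather than strict JSON, so json.loads() would reject it; ast.literal_eval handles it safely. A small sketch, assuming found_json was captured as above:
import ast

if found_json is not None:
    record = ast.literal_eval(found_json)  # safely evaluate the dict-style string
    print(record["consig_number"])  # 'dup1' for the sample input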
UPDATE - If you want to continuously 'follow' your log file (akin to the tail -f command), you can open it in read mode and keep the file handle open while reading it line by line with a reasonable delay added between reads (that's largely how tail -f does it, too). Then you can use the same procedure to discover when your desired line occurs and capture the next line to process, send to some other process, or do whatever you plan to do with it. Something like:
import time

capture = False  # a flag to use to signal the capture of the next line
found_lines = []  # a list to store our found lines, just as an example

with open("input.log", "r") as f:  # open the file for reading...
    while True:  # loop indefinitely
        line = f.readline()  # grab a line from the file
        if line != '':  # if there is some content on the current line...
            if capture:  # capture the current line
                found_lines.append(line.rstrip())  # store the found line
                # instead, you can do whatever you want with the captured line
                # i.e. to print it: print("Found: {}".format(line.rstrip()))
                capture = False  # reset the capture flag
            elif line.rstrip()[-14:] == "_____GP D_____":  # if it ends in '_____GP D_____'..
                capture = True  # signal that the next line should be captured
        else:  # an empty buffer encountered, most probably EOF...
            time.sleep(1)  # ... let's wait for a second before attempting to read again...
import json
from ast import literal_eval

KEY_STRING = '''_____GP D_____'''

text = """INFO:modules.gp.helpers.parameter_getter:_____GP D_____
{'from_time': '2017-07-12 19:57', 'to_time': '2017-07-12 20:57', 'consig_number': 'dup1', 'text': 'r155', 'mobile': None, 'email': None}
ERROR:modules.common.actionexception:ActionError: [{'other': 'your request already crossed threshold time'}]
{'from_time': '2016-07-12 16:57', 'to_time': '2016-07-12 22:57', 'consig_number': 'dup2', 'text': 'r15', 'mobile': None, 'email': None}"""

lines = text.split("\n")  # load log text into a list.
# loading from a real log would be more like:
# with open("/var/log/syslog.log", 'r') as f:
#     lines = f.readlines()

# set "gate" flag to False
flag = False

for loop in lines:
    line = loop.strip()
    if flag:  # "gate" opened
        # depends on how the dictionary is streamed to the log:
        # you could use json.loads(line), but if it is not sent to the log with
        # json.dumps then you have a pythonic dictionary, so use literal_eval
        # to load that dictionary into a variable
        target_json = literal_eval(line)
        print json.dumps(target_json, indent=4)
    if KEY_STRING in line:
        flag = True  # KEY_STRING found, open "gate"
    else:
        flag = False  # close "gate"
Output:
{
    "consig_number": "dup1",
    "text": "r155",
    "email": null,
    "mobile": null,
    "to_time": "2017-07-12 20:57",
    "from_time": "2017-07-12 19:57"
}