Converting COCO Format to LabelMe Format - json

I am trying to convert a COCO JSON file to a LabelMe JSON file. I used a Python script called "coco2labelme.py" to do the conversion.
The conversion itself succeeds; the only problem is that I get an error every time I try to load the converted JSON file in LabelMe. The error concerns the 'imageData' field of the file.
Does anyone have an idea how to convert from COCO to LabelMe format including the image data?
Below is the code for coco2labelme.py.
[Source: https://gist.github.com/travishsu/6efa5c9fb92ece37b4748036026342f6]
import os
import json
import subprocess
import numpy as np
import pandas as pd
from skimage.measure import find_contours

class CocoDatasetHandler:
    def __init__(self, jsonpath, imgpath):
        with open(jsonpath, 'r') as jsonfile:
            ann = json.load(jsonfile)

        images = pd.DataFrame.from_dict(ann['images']).set_index('id')
        annotations = pd.DataFrame.from_dict(ann['annotations']).set_index('id')
        categories = pd.DataFrame.from_dict(ann['categories']).set_index('id')

        # Join image and category metadata onto each annotation row,
        # then convert every COCO segmentation into LabelMe-style shapes.
        annotations = annotations.merge(images, left_on='image_id', right_index=True)
        annotations = annotations.merge(categories, left_on='category_id', right_index=True)
        annotations = annotations.assign(
            shapes=annotations.apply(self.coco2shape, axis=1))
        self.annotations = annotations

        self.labelme = {}
        self.imgpath = imgpath
        self.images = pd.DataFrame.from_dict(ann['images']).set_index('file_name')

    def coco2shape(self, row):
        if row.iscrowd == 1:
            shapes = self.rle2shape(row)
        elif row.iscrowd == 0:
            shapes = self.polygon2shape(row)
        return shapes

    def rle2shape(self, row):
        rle, shape = row['segmentation']['counts'], row['segmentation']['size']
        mask = self._rle_decode(rle, shape)
        padded_mask = np.zeros(
            (mask.shape[0] + 2, mask.shape[1] + 2),
            dtype=np.uint8,
        )
        padded_mask[1:-1, 1:-1] = mask
        points = find_contours(mask, 0.5)
        shapes = [
            [[int(point[1]), int(point[0])] for point in polygon]
            for polygon in points
        ]
        return shapes

    def _rle_decode(self, rle, shape):
        # np.bool was removed from newer NumPy releases; the builtin bool behaves the same here.
        mask = np.zeros([shape[0] * shape[1]], dtype=bool)
        for idx, r in enumerate(rle):
            if idx < 1:
                s = 0
            else:
                s = sum(rle[:idx])
            e = s + r
            if e == s:
                continue
            assert 0 <= s < mask.shape[0]
            assert 1 <= e <= mask.shape[0], "shape: {} s {} e {} r {}".format(shape, s, e, r)
            if idx % 2 == 1:
                mask[s:e] = 1
        # Reshape and transpose
        mask = mask.reshape([shape[1], shape[0]]).T
        return mask

    def polygon2shape(self, row):
        # shapes: (n_polygons, n_points, 2)
        shapes = [
            [[int(points[2*i]), int(points[2*i+1])] for i in range(len(points)//2)]
            for points in row.segmentation
        ]
        return shapes

    def coco2labelme(self):
        fillColor = [255, 0, 0, 128]
        lineColor = [0, 255, 0, 128]
        groups = self.annotations.groupby('file_name')
        for file_idx, (filename, df) in enumerate(groups):
            record = {
                'imageData': None,
                'fillColor': fillColor,
                'lineColor': lineColor,
                'imagePath': filename,
                'imageHeight': int(self.images.loc[filename].height),
                'imageWidth': int(self.images.loc[filename].width),
            }
            record['shapes'] = []
            instance = {
                'line_color': None,
                'fill_color': None,
                'shape_type': "polygon",
            }
            for inst_idx, (_, row) in enumerate(df.iterrows()):
                for polygon in row.shapes:
                    copy_instance = instance.copy()
                    copy_instance.update({
                        'label': row['name'],
                        'group_id': inst_idx,
                        'points': polygon
                    })
                    record['shapes'].append(copy_instance)
            if filename not in self.labelme.keys():
                self.labelme[filename] = record

    def save_labelme(self, file_names, dirpath, save_json_only=False):
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        else:
            raise ValueError(f"{dirpath} already exists")
        for file in file_names:
            filename = os.path.basename(os.path.splitext(file)[0])
            with open(os.path.join(dirpath, filename + '.json'), 'w') as jsonfile:
                json.dump(self.labelme[file], jsonfile, ensure_ascii=True, indent=2)
            if not save_json_only:
                subprocess.call(['cp', os.path.join(self.imgpath, file), dirpath])

ds = CocoDatasetHandler('cocodataset/annotations/instances_train2014.json', 'cocodataset/train2014/')
ds.coco2labelme()
ds.save_labelme(ds.labelme.keys(), 'cocodataset/labelme/train2014')
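A note on the 'imageData' error: recent LabelMe releases expect the imageData field to hold a base64-encoded copy of the image, or to be null while imagePath resolves to an image file sitting next to the JSON (which is why the script copies the images into dirpath). If you need the image data embedded instead, here is a minimal sketch of a helper that could be applied to each record before saving (fill_image_data is a hypothetical name, not part of the gist):
import base64

def fill_image_data(record, image_dir):
    # Embed the referenced image as base64 so LabelMe can open the JSON
    # even when the image file is not stored next to it.
    img_file = os.path.join(image_dir, record['imagePath'])
    with open(img_file, 'rb') as f:
        record['imageData'] = base64.b64encode(f.read()).decode('utf-8')
    return record

# e.g. inside coco2labelme(), after building `record`:
#     record = fill_image_data(record, self.imgpath)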

Related

AWS Sagemaker batch transform with JSON input filter

I have a custom SageMaker instance for an NLP task and am trying to run a batch transform on the following JSON file:
{"id":123, "features":"This is a test message"}
and I'm looking to output the following:
{"id":123,"SageMakerOutput":spam}
Here's my batch transform code:
transformer = sklearn.transformer(instance_count=1,
                                  instance_type='local',
                                  accept='application/json',
                                  output_path="s3://spam-detection-messages-output/json_examples")
transformer.transform("s3://spam-detection-messages/json_examples",
                      content_type='application/json',
                      input_filter="$.features",
                      join_source="Input",
                      output_filter="$['features', SageMakerOutput']")
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()
According to this document,
https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#batch-transform-data-processing-examples
I should be able to grab the "features" object using input_filter; however, it grabs the entire JSON payload and only outputs the prediction.
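For reference, the data-processing examples on the linked AWS page quote each key in the output filter separately, e.g. "$['id','SageMakerOutput']", whereas the output_filter above has a stray quote inside the brackets. A hedged sketch of what the call might look like (field names and the split_type setting depend on how the input file is laid out, so treat this as an assumption rather than a fix):
# Sketch only: assumes one JSON object per line in the S3 input.
transformer.transform(
    "s3://spam-detection-messages/json_examples",
    content_type='application/json',
    split_type='Line',
    input_filter='$.features',                  # send only the message text to the model
    join_source='Input',                        # merge the prediction back into each input record
    output_filter="$['id','SageMakerOutput']",  # each key quoted separately
)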
I'm also including my training code:
import argparse
import pandas as pd
import os
import glob
import io
import json
from sklearn import tree
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()
def remove_stop_words(words):
result = [i for i in words if i not in ENGLISH_STOP_WORDS]
return result
def word_stemmer(words):
return [stemmer.stem(o) for o in words]
def word_lemmatizer(words):
return [lemmatizer.lemmatize(o) for o in words]
def remove_characters(words):
return [word for word in words if len(word)> 1]
def clean_token_pipeline(words):
cleaning_utils = [remove_stop_words, word_lemmatizer]
for o in cleaning_utils:
words = o(words)
return words
def process_text(X_train, X_test, y_train, y_test):
X_train = [word_tokenize(o) for o in X_train]
X_test = [word_tokenize(o) for o in X_test]
X_train = [clean_token_pipeline(o) for o in X_train]
X_test = [clean_token_pipeline(o) for o in X_test]
X_train = [" ".join(o) for o in X_train]
X_test = [" ".join(o) for o in X_test]
return X_train, X_test, y_train, y_test
def convert_to_feature(raw_tokenize_data):
raw_sentences = [' '.join(o) for o in raw_tokenize_data]
return vectorizer.transform(raw_sentences)
def _npy_loads(data):
"""
Deserializes npy-formatted bytes into a numpy array
"""
stream = io.BytesIO(data)
return np.load(stream)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# Sagemaker specific arguments. Defaults are set in the environment variables.
parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
args = parser.parse_args()
train_data = pd.read_csv(args.train+"/spamAssassin_min.csv", index_col=0)
train_data.dropna(inplace=True)
print(train_data.head())
X_train, X_test, y_train, y_test = train_test_split(train_data['message'], train_data['label'], test_size = 0.2, random_state = 1)
X_train, X_test, y_train, y_test = process_text(X_train, X_test, y_train, y_test)
X_train = [o.split(" ") for o in X_train]
X_test = [o.split(" ") for o in X_test]
vectorizer = TfidfVectorizer()
raw_sentences = [' '.join(o) for o in X_train]
vectorizer.fit(raw_sentences)
# print("saving transformer to {}".format(args.model_dir))
joblib.dump(vectorizer, os.path.join(args.model_dir, "vectorizer.joblib"))
x_train_features = convert_to_feature(X_train)
x_test_features = convert_to_feature(X_test)
clf = GaussianNB()
clf.fit(x_train_features.toarray(),y_train)
y_true, y_pred = y_test, clf.predict(x_test_features.toarray())
print(classification_report(y_true, y_pred))
joblib.dump(clf, os.path.join(args.model_dir, "model.joblib"))
def model_fn(model_dir):
"""Deserialized and return fitted model
Note that this should have the same name as the serialized model in the main method
"""
clf = joblib.load(os.path.join(model_dir, "model.joblib"))
# print("model loaded {}".format(clf))
return clf
def input_fn(request_body, request_content_type):
print("** input_fn**")
print("request_body:{} request_content_type:{}".format(request_body, request_content_type))
if request_content_type == "text/plain":
#convert to string
message = str(request_body)
return message
elif request_content_type == "application/json":
request_body_json = json.loads(request_body)
# print("json {}".format(request_body_json))
return request_body_json['features']
elif request_content_type == "application/x-npy":
return " ".join(_npy_loads(request_body))
else:
# Handle other content-types here or raise an Exception
# if the content type is not supported.
return request_body
def predict_fn(input_data, model):
print("** predict_fn**")
print("input_data: {} model:{}".format(input_data, model))
print("\n")
prefix = '/opt/ml/'
model_path = os.path.join(prefix, 'model')
my_vect = joblib.load(os.path.join(model_path, "vectorizer.joblib"))
message = "".join(clean_token_pipeline(input_data))
print("processed message: {}".format(message))
message = my_vect.transform([message])
message = message.toarray()
prediction = model.predict(message)
return prediction

How to update Kivy labels dynamically after button is pressed

I'm trying to make an app that uses data from a MySQL server. So far I was doing fine, until I stumbled across the need to update the Labels.
This is what I have so far:
from kivy.uix.button import Button
from kivy.lang import Builder
from kivy.uix.screenmanager import ScreenManager, Screen
from kivy.uix.label import Label
from kivy.uix.scrollview import ScrollView
from kivy.clock import Clock
import MySQLdb
class MainView(ScrollView):
def qchange(self):
query = 'SELECT * FROM `citiestovisit` ORDER BY `idcitiestovisit`'
self.db_data(query)
q = 'SELECT * FROM `citiestovisit` ORDER BY `Name`'
def db_data(self, query=q):
#vector = ListProperty()
vector = []
con = MySQLdb.connect(host="localhost", user="root", passwd="", db="cities")
cur = con.cursor()
cur.execute('SET NAMES `utf8`')
cur.execute(query)
result = cur.fetchall()
for row in result:
string = str(row[0]) + " " + str(row[1]) + " " + str(row[2])
vector.append(string)
print vector
return vector
def __init__(self, **kwargs):
kwargs['cols'] = 2
super(MainView, self).__init__(**kwargs)
GL = GridLayout(cols = 3, spacing=10, size_hint_y=None)
GL.bind(minimum_height=GL.setter('height'))
for row in self.db_data():
splitRow = row.split(" ")
for data in splitRow:
GL.add_widget(Label(text=data,size_hint_y=None, font_size='20sp'))
self.add_widget(GL)
Builder.load_string("""
<MenuScreen>:
BoxLayout:
GridLayout:
cols: 1
Button:
text: 'Goto settings'
on_press:
root.manager.transition.direction = 'left'
root.manager.current = 'settings'
Button:
text: 'Quit'
Label:
font_name: 'C:\Anonymous\Anonymous.ttf'
text: "picture here"
<SettingsScreen>:
""")
# Declare both screens
class MenuScreen(Screen):
pass
class SettingsScreen(Screen):
pass
ss = SettingsScreen(name='settings')
layout = BoxLayout(orientation='vertical')
BL = BoxLayout()
layout.add_widget(BL)
#Instance of a MainView class
MV = MainView()
def callback(instance):
sm.transition.direction = 'right'
sm.current = 'menu'
def callback2(instance):
MV.qchange()
btn = Button(text="Back to Menu")
btn.bind(on_press=callback)
btn.size_hint = (1, 0.3)
BL.add_widget(btn)
btn2 = Button(text="Sort by ID")
btn2.size_hint = (1, 0.3)
btn2.bind(on_press=callback2)
BL.add_widget(btn2)
layout.add_widget(MainView())
sublayout = GridLayout(cols=3)
sublayout.add_widget(Label(text="hello"))
sublayout.add_widget(Label(text="World"))
sublayout.add_widget(Label(text="Python"))
layout.add_widget(sublayout)
ss.add_widget(layout)
# Create the screen manager
sm = ScreenManager()
sm.add_widget(MenuScreen(name='menu'))
sm.add_widget(ss)
class MyApp(App):
def build(self):
return sm
if __name__ == '__main__':
MyApp().run()
I'm particularly interested in the def qchange(self) method, as it passes a new query into def db_data(self, query=q); as a result, a request is sent to the database and an array of strings is returned. However, this array is not processed any further and the labels in the GL widget are not updated. I think I need to add a clock that would call __init__ in MainView, but that is only a guess, as I've also read about using properties (which I don't know how to use here either).
I've edited my code. Now it looks like this:
class MainView(ScrollView):
def qchange(self):
query = 'SELECT * FROM `citiestovisit` ORDER BY `idcitiestovisit`'
#self.db_data(query)
#LG = self.LabelsGrid(self.GL)
q = 'SELECT * FROM `citiestovisit` ORDER BY `Name`'
def db_data(self, query=q):
vector = []
con = MySQLdb.connect(host="localhost", user="root", passwd="", db="cities")
cur = con.cursor()
cur.execute('SET NAMES `utf8`')
cur.execute(query)
result = cur.fetchall()
for row in result:
string = str(row[0]) + " " + str(row[1]) + " " + str(row[2])
vector.append(string)
print vector
return vector
class LabelsGrid(GridLayout):
def __init__(self, **kwargs):
self.cols = 3
self.spacing = 10
self.size_hint_y = None
def show_labels(self, strings):
self.clear_widgets()
for row in strings:
splitRow = row.split(" ")
for data in splitRow:
label = Label(text=data, size_hint_y=None, font_size='20sp')
self.add_widget(label)
GL = LabelsGrid()
def __init__(self, **kwargs):
kwargs['cols'] = 2
super(MainView, self).__init__(**kwargs)
self.GL=self.LabelsGrid()
# GL = GridLayout(cols = 3, spacing=10, size_hint_y=None)
self.GL.bind(minimum_height=self.GL.setter('height'))
self.GL.show_labels(self.db_data(self.q))
self.add_widget(self.GL)
#self.GL.clear_widgets()
Builder.load_string("""
<MenuScreen>:
BoxLayout:
GridLayout:
cols: 1
Button:
text: 'Goto settings'
on_press:
root.manager.transition.direction = 'left'
root.manager.current = 'settings'
Button:
text: 'Quit'
Label:
font_name: 'C:\Anonymous\Anonymous.ttf'
text: "picture here"
<SettingsScreen>:
""")
# Declare both screens
class MenuScreen(Screen):
pass
class SettingsScreen(Screen):
pass
ss = SettingsScreen(name='settings')
layout = BoxLayout(orientation='vertical')
BL = BoxLayout()
layout.add_widget(BL)
#Instance of a MainView class
MV = MainView()
def callback(instance):
sm.transition.direction = 'right'
sm.current = 'menu'
def callback2(instance):
MV.qchange()
btn = Button(text="Back to Menu")
btn.bind(on_press=callback)
btn.size_hint = (1, 0.3)
BL.add_widget(btn)
btn2 = Button(text="Sort by ID")
btn2.size_hint = (1, 0.3)
btn2.bind(on_press=callback2)
BL.add_widget(btn2)
layout.add_widget(MainView())
sublayout = GridLayout(cols=3)
sublayout.add_widget(Label(text="hello"))
sublayout.add_widget(Label(text="World"))
sublayout.add_widget(Label(text="Python"))
layout.add_widget(sublayout)
ss.add_widget(layout)
# Create the screen manager
sm = ScreenManager()
sm.add_widget(MenuScreen(name='menu'))
sm.add_widget(ss)
class MyApp(App):
def build(self):
return sm
if __name__ == '__main__':
MyApp().run()
By adding
class LabelsGrid(GridLayout):
    def __init__(self, **kwargs):
        self.cols = 3
        self.spacing = 10
        self.size_hint_y = None

    def show_labels(self, strings):
        self.clear_widgets()
        for row in strings:
            splitRow = row.split(" ")
            for data in splitRow:
                label = Label(text=data, size_hint_y=None, font_size='20sp')
                self.add_widget(label)
I wanted to add a custom GridLayout according to the advice given; however, now I get an error saying:
AttributeError: 'LabelsGrid' object has no attribute '_trigger_layout'
Any ideas on how to handle this?
Create a custom grid layout, let's say LabelsGrid, and in the class implement a method show_labels. Example:
class LabelsGrid(GridLayout):
    def show_labels(self, strings):
        self.clear_widgets()
        for text in strings:
            label = Label(text=text)
            self.add_widget(label)
This way, each time you call the method with names of labels in a list, it will update itself.
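As for the AttributeError in the edited code: it most likely comes from LabelsGrid.__init__ never calling the GridLayout constructor, so the widget's internals (including _trigger_layout) are never set up. A minimal sketch of the constructor with the missing super() call added:
class LabelsGrid(GridLayout):
    def __init__(self, **kwargs):
        # Initialize the Kivy widget first; skipping this is what leads to
        # "'LabelsGrid' object has no attribute '_trigger_layout'".
        super(LabelsGrid, self).__init__(**kwargs)
        self.cols = 3
        self.spacing = 10
        self.size_hint_y = None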

TypeError: string indices must be integers

Hi, I have a problem with my code: I get an error in a loop that works a few times but then throws a TypeError: string indices must be integers.
I want to call an API to get JSON back and extract some parts of the JSON response. Here's the code:
class API(object):
    def __init__(self, api_key):
        self.api_key = api_key

    def _request(self, api_url, params={}):
        args = {'api_key': self.api_key}
        for key, value in params.items():
            if key not in args:
                args[key] = value
        response = requests.get(
            Consts.URL['base'].format(
                url=api_url
            ),
            params=args
        )
        if response.status_code == requests.codes.ok:
            return response.json()
        else:
            return "not possible"
        print(response.url)

    def get_list(self):
        excel = EXCEL('s6.xlsx')
        api_url = Consts.URL['list'].format(
            version = Consts.API_VERSIONS['matchversion'],
            start = excel.get_gamenr()
        )
        return self._request(api_url)

    def get_match(self, matchid):
        idlist = matchid
        api_url = Consts.URL['match'].format(
            version = Consts.API_VERSIONS['matchversion'],
            matchId = idlist
        )
        return self._request(api_url)

    def match_ids(self):
        api = API('c6ea2f68-7ed6-40fa-9b99-fd591c55c05f')
        x = api.get_list()
        y = x['matches']
        count = len(y)
        ids = []
        while count > 0:
            count = count - 1
            temp = y[0]
            ids.append(temp['matchId'])
            del y[0]
        return ids

    def match_info(self):
        matchids = self.match_ids()
        print(matchids)
        matchinfolist = {}
        counter = 1
        for gameids in matchids:
            info = self.get_match(gameids)
            myid = self.find_partid(info['participantIdentities'])
            prepdstats = info['participants'][myid-1]
            print(prepdstats)
            matchinfolist['stats' + str(counter)] = prepdstats
        return matchinfolist

    def find_partid(self, partlist):
        partid = 0
        idlist = partlist
        while partid < 10:
            partid = partid + 1
            tempplayer = idlist[0]['player']
            if tempplayer['summonerId'] == 19204660:
                playernr = partid
                partid = 500
            del idlist[0]
        return playernr
When I run the match_info() function I get this error:
Traceback (most recent call last):
  File "C:\Users\Niklas\Desktop\python riot\main.py", line 17, in <module>
    main()
  File "C:\Users\Niklas\Desktop\python riot\main.py", line 10, in main
    print(api.match_info())
  File "C:\Users\Niklas\Desktop\python riot\api.py", line 78, in match_info
    myid = self.find_partid(info['participantIdentities'])
TypeError: string indices must be integers
but only after the loop in the function has run a few times. I have no idea what I'm doing wrong. Any help would be appreciated.
Here is a link to the json: https://euw.api.pvp.net/api/lol/euw/v2.2/match/2492271473?api_key=c6ea2f68-7ed6-40fa-9b99-fd591c55c05f
The error shows up on
myid = self.find_partid(info['participantIdentities'])
For this line to execute, info must be a mapping with string keys, not a string itself. info is
info = self.get_match(gameids)
get_match ends with
return self._request(api_url)
_request ends with
if response.status_code == requests.codes.ok:
    return response.json()
else:
    return "not possible"
For the loop to ever run, response.json() must be a dict with the key 'participantIdentities'. Your bug is expecting that to always be true: as soon as one request fails, _request returns the string "not possible", and indexing that string with 'participantIdentities' raises the TypeError.
One fix might be to make the expectation always true. If there is a satisfactory default value, return {'participantIdentities': <default value>}. Otherwise, return None and change the loop to:
info = self.get_match(gameids)
if info is not None:
    # as before
else:
    # whatever default action you want
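Following the second suggestion, here is a minimal sketch of what _request could look like if it returned None on failure (the request and URL construction are copied from the question; only the failure branch changes):
def _request(self, api_url, params={}):
    args = {'api_key': self.api_key}
    for key, value in params.items():
        if key not in args:
            args[key] = value
    response = requests.get(Consts.URL['base'].format(url=api_url), params=args)
    if response.status_code == requests.codes.ok:
        return response.json()
    # Returning None instead of the string "not possible" lets callers
    # test `if info is not None:` as shown above.
    return None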

Serialize Gtk TreeStore / ListStore using JSON

I made a new example which shows much better what I am trying to do. The new example gives the following output. Is there a way that the data can go into the respective store key (the {} brackets)?
{
"copy": [
[
[
5.0,
8.0,
9.0
]
],
[
[
4.0,
0.0,
1.0
]
]
],
"name": "dataset1",
"sets": [
{
"store": {},
"type": "vector"
},
{
"store": {},
"type": "vector"
}
]
}
New example
from gi.repository import Gtk
import json
import random

class Vector(object):
    def __init__(self, data):
        self.store = Gtk.ListStore(float, float, float)
        self.store.append([data[0], data[1], data[2]])
        self.type = "vector"

    def return_data(self):
        store_data = []

        def iterate_over_data(model, path, itr):
            row = model[path]
            store_data.append([row[0], row[1], row[2]])

        self.store.foreach(iterate_over_data)
        return store_data

class DataSet(object):
    def __init__(self, name):
        self.name = name
        self.sets = []

    def add_vector(self):
        data = [random.randint(0,9) for x in range(3)]
        self.sets.append(Vector(data))

    def to_json(self):
        self.copy = []
        for s in self.sets:
            self.copy.append(s.return_data())
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)

obj1 = DataSet("dataset1")
for x in range(2):
    obj1.add_vector()
print(obj1.to_json())
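The "store" key comes out as {} because json.dumps only understands basic Python types; for everything else it calls the default hook, and lambda o: o.__dict__ yields an empty dict for a Gtk.ListStore. One way to get the row data into that key is a default hook that special-cases the ListStore. A hedged sketch, assuming the three float columns from the example:
def encode(obj):
    # Pull the rows out of a ListStore instead of serializing the Gtk object;
    # everything else falls back to the instance __dict__ as before.
    if isinstance(obj, Gtk.ListStore):
        return [row[:] for row in obj]
    return obj.__dict__

# in DataSet.to_json():
#     return json.dumps(self, default=encode, sort_keys=True, indent=4)
With something like this in place, the manually maintained self.copy list in to_json would no longer be needed.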
Old example
I am currently figuring out how to serialize a Gtk ListStore that is nested in a Gtk TreeStore. I got a small example to work, but am not sure whether this approach will scale for programs that have more data attached (for example, the layer object could hold a color or a creation date). Is there maybe another way to do this?
My current approach is to gather the data in list and dictionary form myself and then just create the JSON dump. I have the feeling that this would be rather difficult to maintain if I need to attach 25 values to each layer object.
from gi.repository import Gtk, Gdk
import json
import random
class LayerTreeView(Gtk.TreeView):
def __init__(self, store):
Gtk.TreeView.__init__(self, store)
renderer = Gtk.CellRendererText()
column = Gtk.TreeViewColumn("Name", renderer, text=0)
self.append_column(column)
class DataTreeView(Gtk.TreeView):
def __init__(self, store):
Gtk.TreeView.__init__(self, store)
self.store = store
renderer = Gtk.CellRendererText()
column = Gtk.TreeViewColumn("Data", renderer, text=0)
self.append_column(column)
class MainWindow(Gtk.Window):
def __init__(self):
Gtk.Window.__init__(self, title="TreeView Serialize")
self.connect("delete-event", Gtk.main_quit)
self.set_border_width(10)
self.set_default_size(400, 300)
vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6, expand=True)
self.add(vbox)
self.clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
hbox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6)
button = Gtk.Button("Cut")
button.connect("clicked", self.on_cut_clicked)
hbox.pack_start(button, True, True, 0)
button = Gtk.Button(stock=Gtk.STOCK_COPY)
button.connect("clicked", self.on_copy_clicked)
hbox.pack_start(button, True, True, 0)
button = Gtk.Button(stock=Gtk.STOCK_PASTE)
button.connect("clicked", self.on_paste_clicked)
hbox.pack_start(button, True, True, 0)
vbox.add(hbox)
self.layer_store = Gtk.TreeStore(str, object, object)
self.layer_view = LayerTreeView(self.layer_store)
self.layer_sw = Gtk.ScrolledWindow()
self.data_sw = Gtk.ScrolledWindow()
self.layer_sw.add(self.layer_view)
treebox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6, expand=True)
treebox.pack_start(self.layer_sw, True, True, 0)
treebox.pack_start(self.data_sw, True, True, 0)
vbox.add(treebox)
self.select = self.layer_view.get_selection()
self.select.connect("changed", self.on_selection_changed)
self.add_test_data()
def add_test_data(self):
for x in range(3):
data_store = Gtk.ListStore(str)
data_view = DataTreeView(data_store)
for y in range(5):
data_store.append([str(y+x)])
self.layer_store.append(None, ["Data {}".format(x), data_store, data_view])
def on_selection_changed(self, selection):
"""
When layer is switched load respective data
"""
model, treeiter = selection.get_selected()
if treeiter != None:
data_view = model[treeiter][2]
child = self.data_sw.get_child()
if child != None:
self.data_sw.remove(self.data_sw.get_child())
self.data_sw.add(data_view)
self.show_all()
def on_cut_clicked(self, button):
pass
def on_copy_clicked(self, button):
copy_list = ["safe-to-paste"]
data_dict = {}
for row in self.layer_store:
name = row[0]
data_obj = row[1]
value_list = []
for datarow in data_obj:
value = datarow[0]
value_list.append(value)
data_dict[name] = value_list
copy_list.append(data_dict)
data = json.dumps(copy_list)
self.clipboard.set_text(data, -1)
def on_paste_clicked(self, button):
paste_str = self.clipboard.wait_for_text()
try:
parse = json.loads(paste_str)
json_str = True
except:
json_str = False
if json_str is False:
return
keyword = parse[0]
if keyword != "safe-to-paste":
return
data_dict = parse[1]
for x in data_dict:
data_list = data_dict[x]
data_store = Gtk.ListStore(str)
data_view = DataTreeView(data_store)
for y in data_list:
data_store.append([str(y)])
self.layer_store.append(None, [x, data_store, data_view])
win = MainWindow()
win.show_all()
Gtk.main()
I have an improved version of your code with a dict comprehension and @staticmethod that makes the signal callbacks more readable and shorter. Nevertheless, this does not really solve your problem, as it still generates the JSON manually. If the ListStore gets more complex, it would probably be better to let the DataListStore class generate its own JSON with a corresponding method.
from gi.repository import Gtk, Gdk
import json

class LayerTreeView(Gtk.TreeView):
    def __init__(self, store):
        Gtk.TreeView.__init__(self, store)
        renderer = Gtk.CellRendererText()
        column = Gtk.TreeViewColumn("Name", renderer, text=0)
        self.append_column(column)

class DataTreeView(Gtk.TreeView):
    def __init__(self):
        Gtk.TreeView.__init__(self)
        renderer = Gtk.CellRendererText()
        column = Gtk.TreeViewColumn("Data", renderer, text=0)
        self.append_column(column)

class DataListStore(Gtk.ListStore):
    @staticmethod
    def from_json(*args, values=[]):
        store = DataListStore(*args)
        for value in values:
            store.append((value,))
        return store

class MainWindow(Gtk.Window):
    def __init__(self):
        Gtk.Window.__init__(self, title="TreeView Serialize")
        self.connect("delete-event", Gtk.main_quit)
        self.set_border_width(10)
        self.set_default_size(400, 300)
        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6, expand=True)
        self.add(vbox)
        self.clipboard = Gtk.Clipboard.get(Gdk.SELECTION_CLIPBOARD)
        hbox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6)
        button = Gtk.Button("Cut")
        button.connect("clicked", self.on_cut_clicked)
        hbox.pack_start(button, True, True, 0)
        button = Gtk.Button(stock=Gtk.STOCK_COPY)
        button.connect("clicked", self.on_copy_clicked)
        hbox.pack_start(button, True, True, 0)
        button = Gtk.Button(stock=Gtk.STOCK_PASTE)
        button.connect("clicked", self.on_paste_clicked)
        hbox.pack_start(button, True, True, 0)
        vbox.add(hbox)
        self.layer_store = Gtk.TreeStore(str, object)
        self.layer_view = LayerTreeView(self.layer_store)
        self.data_view = DataTreeView()
        layer_sw = Gtk.ScrolledWindow()
        layer_sw.add(self.layer_view)
        data_sw = Gtk.ScrolledWindow()
        data_sw.add(self.data_view)
        treebox = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=6, expand=True)
        treebox.pack_start(layer_sw, True, True, 0)
        treebox.pack_start(data_sw, True, True, 0)
        vbox.add(treebox)
        select = self.layer_view.get_selection()
        select.connect("changed", self.on_selection_changed)
        self.add_test_data()

    def add_test_data(self):
        for x in range(3):
            data_list = [str(y+x) for y in range(5)]
            self.layer_store.append(None, ["Data {}".format(x), data_list])

    def on_selection_changed(self, selection):
        """
        When layer is switched load respective data
        """
        model, treeiter = selection.get_selected()
        if treeiter != None:
            self.data_view.set_model(
                DataListStore.from_json(str, values=model[treeiter][1])
            )

    def on_cut_clicked(self, button):
        pass

    def on_copy_clicked(self, button):
        copy_list = [
            'safe-to-paste',
            {row[0]: row[1] for row in self.layer_store},
        ]
        data = json.dumps(copy_list)
        self.clipboard.set_text(data, -1)

    def on_paste_clicked(self, button):
        paste_str = self.clipboard.wait_for_text()
        try:
            parse = json.loads(paste_str)
        except:
            return
        if parse[0] != "safe-to-paste":
            return
        data_dict = parse[1]
        for x in data_dict:
            self.layer_store.append(None, [x, data_dict[x]])

win = MainWindow()
win.show_all()
Gtk.main()
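As for letting DataListStore generate its own JSON, a sketch of what that could look like (my assumption about the intent; the row slice would need extending if more columns are added later):
class DataListStore(Gtk.ListStore):
    @staticmethod
    def from_json(*args, values=[]):
        store = DataListStore(*args)
        for value in values:
            store.append((value,))
        return store

    def to_json(self):
        # Collect every column of every row into plain lists.
        return json.dumps([row[:] for row in self])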

scrapy unhandled exception

I am using scrapy version 0.16.2 on Linux. I'm running:
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
I'm getting this error, which blocks scrapy (it hangs and doesn't finish automatically; only ^C stops it):
2012-11-20 15:04:51+0000 [-] Unhandled Error
Traceback (most recent call last):
  File "/usr/lib/python2.7/site-packages/scrapy/commands/crawl.py", line 45, in run
    self.crawler.start()
  File "/usr/lib/python2.7/site-packages/scrapy/crawler.py", line 80, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1169, in run
    self.mainLoop()
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 1178, in mainLoop
    self.runUntilCurrent()
--- <exception caught here> ---
  File "/usr/lib/python2.7/site-packages/twisted/internet/base.py", line 800, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/site-packages/scrapy/utils/reactor.py", line 41, in __call__
    return self._func(*self._a, **self._kw)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 116, in _next_request
    self.crawl(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 172, in crawl
    self.schedule(request, spider)
  File "/usr/lib/python2.7/site-packages/scrapy/core/engine.py", line 176, in schedule
    return self.slots[spider].scheduler.enqueue_request(request)
  File "/usr/lib/python2.7/site-packages/scrapy/core/scheduler.py", line 48, in enqueue_request
    if not request.dont_filter and self.df.request_seen(request):
exceptions.AttributeError: 'NoneType' object has no attribute 'dont_filter'
BTW this worked in version 0.14
Here is the code:
class MySpider(CrawlSpider):
name = 'alrroya'
NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
NEW_IGNORED_EXTENSIONS.remove('pdf')
download_delay = 0.05
# Stay within these domains when crawling
allowed_domains = []
all_domains = {}
start_urls = []
# Add our callback which will be called for every found link
rules = [
Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
]
# How many pages crawled
crawl_count = 0
# How many PDFs we have found
pdf_count = 0
def __init__(self, *args, **kwargs):
CrawlSpider.__init__(self, *args, **kwargs)
dispatcher.connect(self._spider_closed, signals.spider_closed)
dispatcher.connect(self._spider_opened, signals.spider_opened)
self.load_allowed_domains_and_start_urls()
def allowed_to_start(self):
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = self.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
reason = True
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
reason = True
else:
reason = False
else:
reason = True
return reason
def _spider_opened(self, spider):
if spider is not self:
return
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
crawler.engine.close_spider(self, 'finished')
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
os._exit(1)
else:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
def _spider_closed(self, spider, reason):
if spider is not self:
return
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
if 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('shutdown\n')
f.write(str(date.today()))
f.close()
else:
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
def _requests_to_follow(self, response):
if getattr(response, 'encoding', None) != None:
return CrawlSpider._requests_to_follow(self, response)
else:
return []
def make_requests_from_url(self, url):
http_client = httplib2.Http()
try:
headers = {
'content-type': 'text/html',
'user-agent': random.choice(USER_AGENT_LIST)
}
response, content = http_client.request(url, method='HEAD', headers=headers)
#~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
if self.allowed_to_start():
self.get_pdf_link(url)
else:
return CrawlSpider.make_requests_from_url(self, url)
except Exception as ex:
return CrawlSpider.make_requests_from_url(self, url)
def get_pdf_link(self, url):
source = self.__class__.name
parsed_url = urlparse(url)
url_domain = parsed_url.netloc
url_path = parsed_url.path
if url_domain:
for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
if url_domain.endswith(domain):
pre_and = False
pre_or = False
and_cond = True
or_cond = False
for path in paths:
if path[0:1] == '!':
pre_and = True
if path[1:] not in url_path:
and_cond = and_cond and True
else:
and_cond = and_cond and False
else:
pre_or = True
if path in url_path:
or_cond = or_cond or True
else:
or_cond = or_cond or False
if pre_and and pre_or:
if and_cond and or_cond:
self.pdf_process(source, url)
return
elif pre_and:
if and_cond:
self.pdf_process(source, url)
return
elif pre_or:
if or_cond:
self.pdf_process(source, url)
return
else:
self.pdf_process(source, url)
return
def parse_crawled_page(self, response):
self.__class__.crawl_count += 1
crawl_count = self.__class__.crawl_count
if crawl_count % 100 == 0:
print 'Crawled %d pages' % crawl_count
if 'pdf' in response.headers.get('content-type', '').lower():
self.get_pdf_link(response.url)
return Item()
def load_allowed_domains_and_start_urls(self):
day = timedelta(days=1)
currdate = date.today()
alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
self.__class__.all_domains = {
'alrroya': {
'start_urls': alrroya,
'allow_domains': {
'epaper.alrroya.com': frozenset(()),
}
}
}
for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
self.__class__.allowed_domains.append(domain)
self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])
def pdf_process(self, source, url):
print '!!! ' + source + ' ' + url
This appears to be a bug in Scrapy. The current version doesn't seem to accept lists returned from make_requests_from_url(). I was able to modify the Scrapy code in the following way to work around the issue.
In the file Scrapy-0.16.5-py2.7.egg/scrapy/spider.py
Change:
def start_requests(self):
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
To:
def start_requests(self):
    for url in self.start_urls:
        requests = self.make_requests_from_url(url)
        if type(requests) is list:
            for request in requests:
                yield request
        else:
            yield requests
I expect that the official Scrapy people will fix this eventually.
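In the meantime, a less invasive variant of the same workaround (my suggestion, not part of the original answer) is to override start_requests inside the spider instead of patching the installed egg; this also guards against make_requests_from_url returning None, which it does in the question's code when it only records a PDF link:
class MySpider(CrawlSpider):
    # ... existing attributes and methods unchanged ...

    def start_requests(self):
        for url in self.start_urls:
            requests = self.make_requests_from_url(url)
            if isinstance(requests, list):
                for request in requests:
                    yield request
            elif requests is not None:
                # Skip None so the scheduler never receives a non-Request object.
                yield requests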