How can I use a compressed connection between Django and MySQL? - mysql

I have compression on my MySQL server, and I'd like to ensure Django is making compressed connections. How can I do this?

Trial, error and inference suggest the solution is to use a compress field set to True in the OPTIONS dict:
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.mysql', # Add 'postgresql_psycopg2', 'mysql', 'sqlite3' or 'oracle'.
...
'OPTIONS': {
'compress': True
}
}
}
I can't confirm the connection is actually compressed though.

A cursory (no pun intended) examination of /django/db/backends/mysql/base.py in Django 1.3 shows the following:
298 def _cursor(self):
299 if not self._valid_connection():
300 kwargs = {
301 'conv': django_conversions,
302 'charset': 'utf8',
303 'use_unicode': True,
304 }
305 settings_dict = self.settings_dict
306 if settings_dict['USER']:
307 kwargs['user'] = settings_dict['USER']
308 if settings_dict['NAME']:
309 kwargs['db'] = settings_dict['NAME']
310 if settings_dict['PASSWORD']:
311 kwargs['passwd'] = settings_dict['PASSWORD']
312 if settings_dict['HOST'].startswith('/'):
313 kwargs['unix_socket'] = settings_dict['HOST']
314 elif settings_dict['HOST']:
315 kwargs['host'] = settings_dict['HOST']
316 if settings_dict['PORT']:
317 kwargs['port'] = int(settings_dict['PORT'])
318 # We need the number of potentially affected rows after an
319 # "UPDATE", not the number of changed rows.
320 kwargs['client_flag'] = CLIENT.FOUND_ROWS
321 kwargs.update(settings_dict['OPTIONS'])
322 self.connection = Database.connect(**kwargs)
323 self.connection.encoders[SafeUnicode] = self.connection.encoders[unicode]
324 self.connection.encoders[SafeString] = self.connection.encoders[str]
325 connection_created.send(sender=self.__class__, connection=self)
326 cursor = CursorWrapper(self.connection.cursor())
327 return cursor
When creating a connection on line 322, the code does not seem to pass the compress argument in kwargs, not by default anyway.
Passing 'compress': True through OPTIONS should let you create a compressed connection when it's available, because this dictionary is merged into kwargs on line 321.
There do not seem to be any other calls to the MySQLdb.connect() method in the rest of the backend. Note that MySQLdb is imported as: import MySQLdb as Database in that file.

Related

Error while using cenpy library in python

I am working on a project where I need to use census data for a couple of towns in MA. For that, I am using the cenpy library to fetch ACS data, but I got a KeyError. The same error happens even when I try the example code described for Chicago. Here is the example code I use and the error I see:
chicago = products.ACS(2017).from_place('Chicago, IL', level='tract',
variables=['B00002*', 'B01002H_001E'])
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\tiger.py:192, in ESRILayer.query(self, raw, strict, **kwargs)
191 try:
--> 192 features = datadict["features"]
193 except KeyError:
KeyError: 'features'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
Input In [4], in <cell line: 1>()
----> 1 chicago = products.ACS(2017).from_place('Chicago, IL', level='tract',
2 variables=['B00002*', 'B01002H_001E'])
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\products.py:791, in ACS.from_place(self, place, variables, level, return_geometry, place_type, strict_within, return_bounds, replace_missing)
788 variables = self._preprocess_variables(variables)
789 variables.append("GEO_ID")
--> 791 geoms, variables, *rest = super(ACS, self).from_place(
792 place,
793 variables=variables,
794 level=level,
795 return_geometry=return_geometry,
796 place_type=place_type,
797 strict_within=strict_within,
798 return_bounds=return_bounds,
799 replace_missing=replace_missing,
800 )
801 variables["GEOID"] = variables.GEO_ID.str.split("US").apply(lambda x: x[1])
802 return_table = geoms[["GEOID", "geometry"]].merge(
803 variables.drop("GEO_ID", axis=1), how="left", on="GEOID"
804 )
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\products.py:200, in _Product.from_place(self, place, variables, place_type, level, return_geometry, geometry_precision, strict_within, return_bounds, replace_missing)
197 else:
199 placer = "STATE={} AND PLACE={}".format(placerow.STATEFP, placerow.TARGETFP)
--> 200 env = env_layer.query(where=placer)
202 print(
203 "Matched: {} to {} "
204 "within layer {}".format(
(...)
208 )
209 )
211 geoms, data = self._from_bbox(
212 env.to_crs(epsg=4326).total_bounds,
213 variables=variables,
(...)
219 replace_missing=replace_missing,
220 )
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\tiger.py:198, in ESRILayer.query(self, raw, strict, **kwargs)
196 if details is []:
197 details = "Mapserver provided no detailed error"
--> 198 raise KeyError(
199 (
200 r"Response from API is malformed. You may have "
201 r"submitted too many queries, formatted the request incorrectly, "
202 r"or experienced significant network connectivity issues."
203 r" Check to make sure that your inputs, like placenames, are spelled"
204 r" correctly, and that your geographies match the level at which you"
205 r" intend to query. The original error from the Census is:\n"
206 r"(API ERROR {}:{}({}))".format(code, msg, details)
207 )
208 )
209 todf = []
210 for i, feature in enumerate(features):
KeyError: 'Response from API is malformed. You may have submitted too many queries, formatted the request incorrectly, or experienced significant network connectivity issues. Check to make sure that your inputs, like placenames, are spelled correctly, and that your geographies match the level at which you intend to query. The original error from the Census is:\\n(API ERROR 400:Unable to complete operation.([]))'

Load custom package model to get model vocabulary in AllenNLP python interface

I'm trying to get the vocabulary from some publicly-available pre-trained models (that aren't mine) using the python interface of AllenNLP, using self.vocab. However, I'm running into problems trying to load in the model. I'm looking to get the vocabulary from the dygiepp models, using the following code:
from allennlp.models.model import Model
scierc_model = Model.from_archive('https://s3-us-west-2.amazonaws.com/ai2-s2-research/dygiepp/master/scierc.tar.gz')
However, I get the following error:
---------------------------------------------------------------------------
ConfigurationError Traceback (most recent call last)
/tmp/local/63381207/ipykernel_7616/3549263982.py in <module>
----> 1 scierc_model = Model.from_archive('https://s3-us-west-2.amazonaws.com/ai2-s2-research/dygiepp/master/scierc.tar.gz')
~/anaconda3/envs/dygiepp/lib/python3.7/site-packages/allennlp/models/model.py in from_archive(cls, archive_file, vocab)
480 from allennlp.models.archival import load_archive # here to avoid circular imports
481
--> 482 model = load_archive(archive_file).model
483 if vocab:
484 model.vocab.extend_from_vocab(vocab)
~/anaconda3/envs/dygiepp/lib/python3.7/site-packages/allennlp/models/archival.py in load_archive(archive_file, cuda_device, overrides, weights_file)
231 # Instantiate model and dataset readers. Use a duplicate of the config, as it will get consumed.
232 dataset_reader, validation_dataset_reader = _load_dataset_readers(
--> 233 config.duplicate(), serialization_dir
234 )
235 model = _load_model(config.duplicate(), weights_path, serialization_dir, cuda_device)
~/anaconda3/envs/dygiepp/lib/python3.7/site-packages/allennlp/models/archival.py in _load_dataset_readers(config, serialization_dir)
267
268 dataset_reader = DatasetReader.from_params(
--> 269 dataset_reader_params, serialization_dir=serialization_dir
270 )
271 validation_dataset_reader = DatasetReader.from_params(
~/anaconda3/envs/dygiepp/lib/python3.7/site-packages/allennlp/common/from_params.py in from_params(cls, params, constructor_to_call, constructor_to_inspect, **extras)
586 "type",
587 choices=as_registrable.list_available(),
--> 588 default_to_first_choice=default_to_first_choice,
589 )
590 subclass, constructor_name = as_registrable.resolve_class_name(choice)
~/anaconda3/envs/dygiepp/lib/python3.7/site-packages/allennlp/common/params.py in pop_choice(self, key, choices, default_to_first_choice, allow_class_names)
322 """{"model": "my_module.models.MyModel"} to have it imported automatically."""
323 )
--> 324 raise ConfigurationError(message)
325 return value
326
ConfigurationError: dygie not in acceptable choices for dataset_reader.type: ['babi', 'conll2003', 'interleaving', 'multitask', 'multitask_shim', 'sequence_tagging', 'sharded', 'text_classification_json']. You should either use the --include-package flag to make sure the correct module is loaded, or use a fully qualified class name in your config file like {"model": "my_module.models.MyModel"} to have it imported automatically.
The error describes how to fix the error from the command line, but not in the python interface. I additionally tried adding the line import dygie to my code to import the missing package, but that didn't solve the problem.
Wondering if anyone knows how to get around this?
To run this model, you'll need to have the code from this repo: https://github.com/dwadden/dygiepp.
In particular, you need to import the DyGIE dataset reader from here: https://github.com/dwadden/dygiepp/blob/master/dygie/data/dataset_readers/dygie.py#L29

Python OCR Tesseract, find a certain word in the image and return me the coordinates

I need your help: I've been trying for a few months to write code that finds a word in an image and returns the coordinates where that word appears in the image.
I tried this using OpenCV and Tesseract OCR, but I was not successful. Could someone here in the community help me?
I'll leave an image here as an example:
Here is something you can start with:
import pytesseract
from PIL import Image
pytesseract.pytesseract.tesseract_cmd = r'C:\<path-to-your-tesseract>\Tesseract-OCR\tesseract.exe'
img = Image.open("img.png")
data = pytesseract.image_to_data(img, output_type='dict')
boxes = len(data['level'])
for i in range(boxes):
if data['text'][i] != '':
print(data['left'][i], data['top'][i], data['width'][i], data['height'][i], data['text'][i])
If you have difficulties with installing pytesseract see: https://stackoverflow.com/a/53672281/18667225
Output:
153 107 277 50 Palavras
151 197 133 37 com
309 186 154 48 R/RR
154 303 126 47 Rato
726 302 158 47 Resto
154 377 144 50 Rodo
720 379 159 47 Arroz
152 457 160 48 Carro
726 457 151 46 Ferro
154 532 142 50 Rede
726 534 159 47 Barro
154 609 202 50 Parede
726 611 186 47 Barata
154 690 124 47 Faro
726 685 288 50 Beterraba
154 767 192 47 Escuro
726 766 151 47 Ferro
I managed to find the solution and I'll post it here for you:
import pytesseract
import cv2
from pytesseract import Output
pytesseract.pytesseract.tesseract_cmd = r'C:\<path-to-your-tesseract>\Tesseract-OCR\tesseract.exe'
filepath = 'image.jpg'
image = cv2.imread(filepath, 1)
# converting image to grayscale image
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# converting to binary image by Thresholding
# this step is necessary if you have a color image because if you skip this part
# then the tesseract will not be able to detect the text correctly and it will give an incorrect result
threshold_img = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# displays the image
cv2.imshow('threshold image', threshold_img)
# Holds the output window until the user presses a key
cv2.waitKey(0)
# Destroying windows present on the screen
cv2.destroyAllWindows()
# setting parameters for tesseract
custom_config = r'--oem 3 --psm 6'
# now feeding image to tesseract
details = pytesseract.image_to_data(threshold_img, output_type=Output.DICT, config=custom_config, lang='eng')
# Color
vermelho = (0, 0, 255)
# Display all the keys found
print(details.keys())
print(details['text'])
# Loop over all detected texts
for i in range(len(details['text'])):
# If it finds the text "UNIVERSIDADE" it will print the coordinates, and draw a rectangle around the word
if details['text'][i] == 'UNIVERSIDADE':
print(details['text'][i])
print(f"left: {details['left'][i]}")
print(f"top: {details['top'][i]}")
print(f"width: {details['width'][i]}")
print(f"height: {details['height'][i]}")
cv2.rectangle(image, (details['left'][i], details['top'][i]), (details['left'][i]+details['width'][i], details['top'][i]+details['height'][i]), vermelho)

How to scrape json data from an interactive chart?

I have a specific section of a website that I want to scrape data from and here's the screenshot of the section -
I inspected the elements of that particular section and noticed that it's within a canvas tag. However, I also checked the source code of the website and I found that the data lies within the source code in a format I'm not familiar with. Here's a sample of that data
JSON.parse('\x5B\x7B\x22id\x22\x3A\x2232522\x22,\x22minute\x22\x3A\x2222\x22,\x22result\x22\x3A\x22MissedShots\x22,
\x22X\x22\x3A\x220.7859999847412109\x22,\x22Y\x22\x3A\x220.52\x22,\x22xG\x22\x3A\x220.03867039829492569\x22,
\x22player\x22\x3A\x22Lionel\x20Messi\x22,
\x22h_a\x22\x3A\x22h\x22,
\x22player_id\x22\x3A\x222097\x22,\x22situation\x22\x3A\x22OpenPlay\x22,
\x22season\x22\x3A\x222014\x22,\x22shotType\x22\x3A\x22LeftFoot\x22,
\x22match_id\x22\x3A...);
How do I parse through this data to give me the x,y co-ordinates of every shot from the map in the screenshot?
Yes, the issue is with the encoding/decoding.
You can pull that string out, and then you essentially need to decode the escape characters. Once you do that, you can use json.loads() to read it in and then navigate the JSON structure.
Now I only looked quickly, but did not see the data in there to show where the plot is on the shot chart. But you can have a look to see if you can find it. The data does however have a shotZones key.
import requests
from bs4 import BeautifulSoup
import json
import codecs
url = 'https://understat.com/player/2097'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
for script in scripts:
if 'var groupsData = JSON.parse' in script.text:
encoded_string = script.text
encoded_string = encoded_string .split("var groupsData = JSON.parse('")[-1]
encoded_string = encoded_string.rsplit("'),",1)[0]
jsonStr = codecs.getdecoder('unicode-escape')(encoded_string)[0]
jsonObj = json.loads(jsonStr)
Edit
Actually I found it. Here you go:
import requests
from bs4 import BeautifulSoup
import json
import codecs
from pandas.io.json import json_normalize
url = 'https://understat.com/player/2097'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
scripts = soup.find_all('script')
# I noticed the data was embedded in the script tag that started with `var shotsData`
for script in scripts:
if 'var shotsData' in script.text:
# I store that text, then trim off the string on the ends so that
# it's in a valid json format
encoded_string = script.text
encoded_string = encoded_string .split("JSON.parse('", 1)[-1]
encoded_string = encoded_string.rsplit("player_info =",1)[0]
encoded_string = encoded_string.rsplit("'),",1)[0]
# Have it ignore the escape characters so it can decode the ascii
# and be able to use json.loads
jsonStr = codecs.getdecoder('unicode-escape')(encoded_string)[0]
jsonObj = json.loads(jsonStr)
df = json_normalize(jsonObj)
Output:
print (df)
X ... xG
0 0.7859999847412109 ... 0.03867039829492569
1 0.8619999694824219 ... 0.06870150566101074
2 0.86 ... 0.15034306049346924
3 0.8180000305175781 ... 0.045503295958042145
4 0.8690000152587891 ... 0.06531666964292526
5 0.7230000305175781 ... 0.054804932326078415
6 0.9119999694824219 ... 0.0971858948469162
7 0.885 ... 0.11467907577753067
8 0.875999984741211 ... 0.10627452284097672
9 0.9540000152587891 ... 0.3100203275680542
10 0.8969999694824219 ... 0.12571729719638824
11 0.8959999847412109 ... 0.04122981056571007
12 0.8730000305175781 ... 0.09942527115345001
13 0.769000015258789 ... 0.025321772322058678
14 0.885 ... 0.7432776093482971
15 0.86 ... 0.4680374562740326
16 0.7619999694824219 ... 0.05699075385928154
17 0.919000015258789 ... 0.10647356510162354
18 0.9530000305175781 ... 0.571601390838623
19 0.8280000305175781 ... 0.07561512291431427
20 0.9030000305175782 ... 0.4600500166416168
21 0.9469999694824218 ... 0.3132372796535492
22 0.92 ... 0.2869703769683838
23 0.7659999847412109 ... 0.07576987147331238
24 0.9640000152587891 ... 0.3824153244495392
25 0.8590000152587891 ... 0.1282796859741211
26 0.9330000305175781 ... 0.42914989590644836
27 0.9230000305175782 ... 0.4968196153640747
28 0.8240000152587891 ... 0.08198583126068115
29 0.965999984741211 ... 0.4309735596179962
.. ... ... ...
843 0.9159999847412109 ... 0.4672183692455292
844 0.7430000305175781 ... 0.04068271815776825
845 0.815 ... 0.07300572842359543
846 0.8980000305175782 ... 0.06551901996135712
847 0.7680000305175781 ... 0.028392281383275986
848 0.885 ... 0.7432776093482971
849 0.875999984741211 ... 0.4060465097427368
850 0.7880000305175782 ... 0.09496577084064484
851 0.7190000152587891 ... 0.05071594566106796
852 0.7680000305175781 ... 0.090679831802845
853 0.7440000152587891 ... 0.06875557452440262
854 0.9069999694824219 ... 0.45824503898620605
855 0.850999984741211 ... 0.06454816460609436
856 0.935 ... 0.5926618576049805
857 0.9219999694824219 ... 0.16091874241828918
858 0.73 ... 0.05882067605853081
859 0.9080000305175782 ... 0.3522365391254425
860 0.8209999847412109 ... 0.1690768003463745
861 0.850999984741211 ... 0.11893663555383682
862 0.88 ... 0.11993970721960068
863 0.8119999694824219 ... 0.15579797327518463
864 0.7019999694824218 ... 0.011425728909671307
865 0.7530000305175781 ... 0.06945621967315674
866 0.850999984741211 ... 0.08273076266050339
867 0.8180000305175781 ... 0.06529481709003448
868 0.86 ... 0.10793478786945343
869 0.8190000152587891 ... 0.061923813074827194
870 0.8130000305175781 ... 0.05294585973024368
871 0.799000015258789 ... 0.06358513236045837
872 0.9019999694824219 ... 0.5841030478477478
[873 rows x 20 columns]

Keras --- About Masking Layer followed by a Reshape Layer

I want to use masking before the LSTM, but the output of the LSTM must be reshaped to 4 dimensions.
So my code:
main_input = Input(shape=(96,1000), name='main_input')
pre_input = BatchNormalization()(main_input)
aaa= Masking(mask_value=0)(pre_input)
recurrent1 = LSTM(256,return_sequences=True)(aaa)
r_out= Reshape((1,96,256))(recurrent1)
But it runs with error:
[![enter image description here][1]][1]
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-2-d1107015501b> in <module>()
17 recurrent1 = LSTM(256,return_sequences=True)(aaa)
18
---> 19 r_out= Reshape((1,96,256))(recurrent1)
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in __call__(self, x, mask)
512 if inbound_layers:
513 # this will call layer.build() if necessary
--> 514 self.add_inbound_node(inbound_layers, node_indices, tensor_indices)
515 input_added = True
516
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in add_inbound_node(self, inbound_layers, node_indices, tensor_indices)
570 # creating the node automatically updates self.inbound_nodes
571 # as well as outbound_nodes on inbound layers.
--> 572 Node.create_node(self, inbound_layers, node_indices, tensor_indices)
573
574 def get_output_shape_for(self, input_shape):
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in create_node(cls, outbound_layer, inbound_layers, node_indices, tensor_indices)
148 if len(input_tensors) == 1:
149 output_tensors = to_list(outbound_layer.call(input_tensors[0], mask=input_masks[0]))
--> 150 output_masks = to_list(outbound_layer.compute_mask(input_tensors[0], input_masks[0]))
151 # TODO: try to auto-infer shape if exception is raised by get_output_shape_for
152 output_shapes = to_list(outbound_layer.get_output_shape_for(input_shapes[0]))
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in compute_mask(self, input, input_mask)
605 else:
606 raise Exception('Layer ' + self.name + ' does not support masking, ' +
--> 607 'but was passed an input_mask: ' + str(input_mask))
608 # masking not explicitly supported: return None as mask
609 return None
Exception: Layer reshape_1 does not support masking, but was passed an input_mask: Any{2}.0
I printed it out: the output shape of recurrent1 is (96,256).
How could I make it right?