I'm using the NLTK wrapper for NER tagging with the Stanford 3-class model. On raw BBC news text written in English, I get a UnicodeDecodeError.
Here is my code:
import nltk
from nltk.tag import StanfordNERTagger

st1 = StanfordNERTagger('/home/saurabh/saurabh-cair/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                        '/home/saurabh/saurabh-cair/stanford-ner-2018-10-16/stanford-ner.jar',
                        encoding='utf-8')

file = open('/home/saurabh/saurabh-cair/model_training/bbc/data.txt', 'rt')
text = file.read()
file.close()

words = nltk.word_tokenize(text)
xyz = st1.tag(words)

for i in xyz:
    print(i)
I got this error:
Traceback (most recent call last):
File "model_english.py", line 26, in <module>
words = nltk.word_tokenize(text)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 128, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 95, in sent_tokenize
return tokenizer.tokenize(text)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1241, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1291, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1281, in span_tokenize
for sl in slices:
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1322, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 314, in _pair_iter
for el in it:
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1297, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1343, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1478, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 313, in _pair_iter
prev = next(it)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 584, in _annotate_first_pass
for aug_tok in tokens:
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 550, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 1: ordinal not in range(128)
I tried utf-8, ascii, and the default encoding as well, but it didn't resolve my problem.
The text data contains sentences like:
General Motors of the US is to pay Fiat 1.55bn euros ($2bn; £1.1bn) to get out of a deal which could have forced it to buy the Italian car maker outright.
I'm using Anaconda Python 2.7.
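Note that the £ sign in the example sentence above is encoded in UTF-8 as the two bytes 0xc2 0xa3, which matches the 0xc2 byte in the traceback. A minimal sketch of a likely fix, assuming Python 2 semantics: plain open() returns a byte string, and the Punkt tokenizer then tries to decode it as ASCII, so reading the file as unicode with io.open avoids the error:

import io
import nltk

# io.open with an explicit encoding returns a unicode string in Python 2,
# so the tokenizer never has to decode raw bytes itself.
with io.open('/home/saurabh/saurabh-cair/model_training/bbc/data.txt', 'rt', encoding='utf-8') as f:
    text = f.read()

words = nltk.word_tokenize(text)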
Related
I'm trying to parse the messages sent on an MQTT topic as JSON, but when I try to call json.loads() on the decoded string, the code raises an exception:
File "/usr/lib/python3.7/json/decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 153 (char 152)
The code is the following:
def on_message_mqtt(self, client, userdata, message):
    m_decode = message.payload.decode('utf-8')
    logger.info(m_decode)
    m_in = json.loads(m_decode)
    logger.info(m_in)
The payload of the messages has the following format:
{"Timestamp":"2021-05-24-13:27:13.450","AI":[1204,7,0,4,18,2,1176,802,11,0,381,2496,0,0,810,1282],"DI":[false,false,false,false,false,false,false,false]}
I tried pasting the text into a string literal, and loads() was able to parse it, so I suspect the problem is in the initial string decoding.
Here you can see a print of the decoded string, and then the error on the call to json.loads():
2021-05-24 16:31:47 - INFO - {"Timestamp":"2021-05-24-14:31:44.790","AI":[912,9,0,2,16,2,886,605,11,0,321,1924,1,963,620,9],"DI":[false,false,false,false,false,false,false,false]}
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python3.7/threading.py", line 917, in _bootstrap_inner
self.run()
File "/usr/lib/python3.7/threading.py", line 865, in run
self._target(*self._args, **self._kwargs)
File "router.py", line 136, in run
self.mqtt.loop_forever()
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 1779, in loop_forever
rc = self.loop(timeout, max_packets)
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 1181, in loop
rc = self.loop_read(max_packets)
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 1572, in loop_read
rc = self._packet_read()
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 2310, in _packet_read
rc = self._packet_handle()
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 2940, in _packet_handle
return self._handle_pubrel()
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 3246, in _handle_pubrel
self._handle_on_message(message)
File "/opt/control/env/lib/python3.7/site-packages/paho/mqtt/client.py", line 3444, in _handle_on_message
self.on_message(self, self._userdata, message)
File "router.py", line 87, in on_message_mqtt
logger.info(json.loads(m_decode), exc_info=True)
File "/usr/lib/python3.7/json/__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "/usr/lib/python3.7/json/decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 151 (char 150)
Judging from the error, character 152 is the square bracket ] closing the array value of the "DI" key, which would imply that the payload.decode call is producing extra data after the JSON object.
Can you check if this works?

def on_message_mqtt(self, client, userdata, message):
    m_in = json.loads(message.payload.decode("utf-8"))
    logger.info(m_in)
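If that still fails with "Extra data", there are probably trailing bytes after the first JSON object in the payload (a NUL terminator appended by the publisher, for example). A sketch of a more tolerant parse, using json's raw_decode to read only the first object and ignore whatever follows (parse_first_json is just an illustrative helper):

import json

def parse_first_json(payload_bytes):
    # raw_decode returns the first JSON value plus the index where it
    # ended, so trailing garbage (NULs, a second object, ...) is ignored.
    text = payload_bytes.decode("utf-8")
    obj, end = json.JSONDecoder().raw_decode(text)
    return obj

# Example: trailing NUL bytes no longer break the parse.
print(parse_first_json(b'{"DI":[false,false]}\x00\x00'))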
I am trying to stream DeepFace from my webcam, but I keep getting this error after running this command:
DeepFace.stream('desktop/deepf/my_dataset')
vgg_face_weights.h5 will be downloaded...
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\user\AppData\Roaming\Python\Python37\site-packages\deepface\DeepFace.py", line 735,
in stream
, source = source, time_threshold = time_threshold, frame_threshold = frame_threshold)
File "C:\Users\user\AppData\Roaming\Python\Python37\site-packages\deepface\commons\realtime.py", line 41, in analysis
model = DeepFace.build_model(model_name)
File "C:\Users\user\AppData\Roaming\Python\Python37\site-packages\deepface\DeepFace.py", line 46, in build_model
model = model()
File "C:\Users\user\AppData\Roaming\Python\Python37\site-packages\deepface\basemodels\VGGFace.py", line 77, in loadModel
gdown.download(url, output, quiet=False)
File "C:\Users\user\AppData\Roaming\Python\Python37\site-packages\gdown\download.py", line 97, in download
cookies = json.load(f)
File "C:\Program Files\Python37\lib\json\__init__.py", line 296, in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
File "C:\Program Files\Python37\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\Program Files\Python37\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Program Files\Python37\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
DeepFace downloads pretrained weights from Google Drive, but sometimes the file exceeds the daily download limit. In this case, download the pre-trained weights manually from this URL: https://drive.google.com/uc?id=1CPSeum3HpopfomUEK1gybeuIVoeJT_Eo
Then copy the file to the HOME_FOLDER/.deepface/weights folder.
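A minimal sketch of that manual step, assuming DeepFace's default weights location under the home directory:

import os
import shutil

# DeepFace looks for weights under ~/.deepface/weights by default.
weights_dir = os.path.join(os.path.expanduser("~"), ".deepface", "weights")
os.makedirs(weights_dir, exist_ok=True)

# 'vgg_face_weights.h5' is the file downloaded manually from the URL above.
shutil.move("vgg_face_weights.h5", os.path.join(weights_dir, "vgg_face_weights.h5"))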
I am on Django 1.11, Python 3.6 and mysqlclient 1.4.6. I need to convert a Django queryset into a list. The objects in the queryset have a field with a unicode emoji value (see: field=u'✅ This is the field value'). That particular table and field in MySQL use the utf8mb4_unicode_ci collation.
When I run qs_list = list(qs), MySQL throws a Unicode error which I think is caused by the emoji in the field when the queryset is evaluated.
File encoding:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
Queryset:
qs = Fake.objects.filter(..)
qs_list = list(qs)
Error:
<class 'UnicodeDecodeError'>
Exception: 'utf-8' codec can't decode byte 0xed in position 11: invalid continuation byte
File "/home/mysite/lib/python3.6/site-packages/django/db/models/query.py", line 250, in __iter__
self._fetch_all()
File "/home/mysite/lib/python3.6/site-packages/django/db/models/query.py", line 1121, in _fetch_all
self._result_cache = list(self._iterable_class(self))
File "/home/mysite/lib/python3.6/site-packages/django/db/models/query.py", line 53, in __iter__
results = compiler.execute_sql(chunked_fetch=self.chunked_fetch)
File "/home/mysite/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 899, in execute_sql
raise original_exception
File "/home/mysite/lib/python3.6/site-packages/django/db/models/sql/compiler.py", line 889, in execute_sql
cursor.execute(sql, params)
File "/home/mysite/lib/python3.6/site-packages/django/db/backends/utils.py", line 64, in execute
return self.cursor.execute(sql, params)
File "/home/mysite/lib/python3.6/site-packages/django/db/backends/mysql/base.py", line 101, in execute
return self.cursor.execute(query, args)
File "/home/mysite/lib/python3.6/site-packages/MySQLdb/cursors.py", line 209, in execute
res = self._query(query)
File "/home/mysite/lib/python3.6/site-packages/MySQLdb/cursors.py", line 317, in _query
self._post_get_result()
File "/home/mysite/lib/python3.6/site-packages/MySQLdb/cursors.py", line 352, in _post_get_result
self._rows = self._fetch_row(0)
File "/home/mysite/lib/python3.6/site-packages/MySQLdb/cursors.py", line 325, in _fetch_row
return self._result.fetch_row(size, self._fetch_type)
Is there a simple solution or a workaround for this? How can I get the list of objects in the queryset without the queryset being evaluated?
Most probably this is related to the encoding, and you need to check other types of encoding, as suggested in this article.
Regarding the list of objects, try iterating over the result to get (for example) each name and store the values in a list:
qs = Fake.objects.filter(..)
qs_list = [x.name for x in qs]
where 'name' is the model field you need to iterate over.
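If the column really is utf8mb4, another thing worth checking is whether the client connection speaks utf8mb4 too. A sketch of the settings change, assuming Django's MySQL backend (the database name is a placeholder):

# settings.py
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': 'mydb',  # placeholder
        'OPTIONS': {
            # Make the connection use utf8mb4 so 4-byte characters
            # (emoji) survive the round trip to MySQL.
            'charset': 'utf8mb4',
        },
    }
}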
Resolved
Answer: Changed the path; it was in fact an incorrect path after all. Used an absolute path (Alt+D, then copy from File Explorer). Also used "r" before the path so the path is treated as a raw string.
# load the data
BetterLifeIndex = pd.read_csv(r"C:\Users\brede\OneDrive\Dokumenter\Downloads\BetterLifeIndex2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(r"C:\Users\brede\OneDrive\Dokumenter\Downloads\gdpcapita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")
I'm new to Python, and I'm running an example from a machine learning book. I can't get Python to read my CSV file.
Code:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

def prepare_country_stats(oecd_bli, gdp_per_capita):
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
    gdp_per_capita.rename(columns={"2015": "GDP per capita"}, inplace=True)
    gdp_per_capita.set_index("Country", inplace=True)
    full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                                  left_index=True, right_index=True)
    full_country_stats.sort_values(by="GDP per capita", inplace=True)
    remove_indices = [0, 1, 6, 8, 33, 34, 35]
    keep_indices = list(set(range(36)) - set(remove_indices))
    return full_country_stats[["GDP per capita", "Life satisfaction"]].iloc[keep_indices]

# load the data
oecd_bli = pd.read_csv("Downloads/BetterLifeIndex2015.csv", thousands=',')
gdp_per_capita = pd.read_csv("C:/Users/brede/Downloads/gdpcapita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")

# prepare the data
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
x = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# visualize the data
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')

# select a linear model
model = sklearn.linear_model.LinearRegression()

# train the model
model.fit(x, y)

# make a prediction for Cyprus
X_new = [[22587]]  # Cyprus GDP per capita
print(model.predict(X_new))  # outputs [[5.96242338]]
The output is:
runfile('C:/Users/brede/Downloads/practice_gdp.py', wdir='C:/Users/brede/Downloads')
Traceback (most recent call last):
File "<ipython-input-59-2f130edd277c>", line 1, in <module>
runfile('C:/Users/brede/Downloads/practice_gdp.py', wdir='C:/Users/brede/Downloads')
File "C:\Users\brede\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\Users\brede\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/brede/Downloads/practice_gdp.py", line 31, in <module>
oecd_bli = pd.read_csv("Downloads/BetterLifeIndex2015.csv", thousands = ',')
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 685, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 457, in _read
parser = TextFileReader(fp_or_buf, **kwds)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 895, in __init__
self._make_engine(self.engine)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1135, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "C:\Users\brede\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1917, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas\_libs\parsers.pyx", line 382, in pandas._libs.parsers.TextReader.__cinit__
File "pandas\_libs\parsers.pyx", line 689, in pandas._libs.parsers.TextReader._setup_parser_source
FileNotFoundError: [Errno 2] File b'Downloads/BetterLifeIndex2015.csv' does not exist: b'Downloads/BetterLifeIndex2015.csv'
I have triple-checked the path to the file, and I can't seem to figure this out! All help is appreciated.
This is done in Spyder; I also tried Jupyter with the same result. I've even copied the path, etc.
help...
I think you have to include the full path in the file name. Try 'C:/Users/brede/OneDrive/...'.
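A quick way to check, as a sketch (the filename is the one from the question): print the working directory that the relative path is resolved against, then pass an absolute path so read_csv no longer depends on where Spyder launched the script:

import os
import pandas as pd

# A relative path like "Downloads/..." is resolved against the current
# working directory, which in Spyder/Jupyter is often not your home folder.
print(os.getcwd())

# An absolute raw-string path removes that ambiguity.
oecd_bli = pd.read_csv(r"C:\Users\brede\Downloads\BetterLifeIndex2015.csv",
                       thousands=',')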
I have a file of JSON objects, and at the beginning of the file there is a text header: date, time, and an IP address.
I've tried f.readlines()[5:] to no avail.
I've tried next(f).
I wish to skip the 5 or 6 lines of text and go directly to the JSON data.
Here is an example.
import jsonlines
import json

data_file = input("Enter a file to parse: ")

with jsonlines.open(data_file) as file:
    for obj in file:
        try:
            jsonparse = json.loads(obj)
        except Exception as e:
            pass
        print(obj)
Error:
jsonlines.jsonlines.InvalidLineError: line contains invalid json: Expecting value: line 1 column 1 (char 0) (line 1)
Top of the JSON file:
Start: 07/02/2019 14:59:40.686
Connected To:
192.168.11.203
Here is the full error:
Enter a file to parse: Play.raw
Traceback (most recent call last):
File "C:\Users\sdickey\AppData\Local\Programs\Python\Python37\lib\site-packages\jsonlines\jsonlines.py", line 159, in read
value = self._loads(line)
File "C:\Users\sdickey\AppData\Local\Programs\Python\Python37\lib\json\__init__.py", line 348, in loads
return _default_decoder.decode(s)
File "C:\Users\sdickey\AppData\Local\Programs\Python\Python37\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\sdickey\AppData\Local\Programs\Python\Python37\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/sdickey/PycharmProjects/Python Testing/Actall Data/testing.py", line 6, in <module>
for obj in file:
File "C:\Users\sdickey\AppData\Local\Programs\Python\Python37\lib\site-packages\jsonlines\jsonlines.py", line 204, in iter
skip_empty=skip_empty)
File "C:\Users\sdickey\AppData\Local\Programs\Python\Python37\lib\site-packages\jsonlines\jsonlines.py", line 164, in read
six.raise_from(exc, orig_exc)
File "<string>", line 3, in raise_from
jsonlines.jsonlines.InvalidLineError: line contains invalid json: Expecting value: line 1 column 1 (char 0) (line 1)
Process finished with exit code 1
It is always preferable not to read the whole file into memory, but one line at a time.
Assuming your input file contains:
first line
second line
third line
fourth line
fifth line
{ "k1": "val1", "k2": "val2" }
{ "k3": "val3", "k4": "val4" }
If you just want to skip 5 lines, you could do it brutally as:
import json

with open("test.txt") as f:
    for _ in range(5):
        next(f)
    for line in f:
        obj = json.loads(line)
        print(obj)
or using enumerate:
import json

with open("test.txt") as f:
    for i, line in enumerate(f):
        if i < 5:
            continue
        obj = json.loads(line)
        print(obj)
or use itertools' dropwhile:
import itertools as it
import json

with open("test.txt") as f:
    for i, line in it.dropwhile(lambda x: x[0] < 5, enumerate(f)):
        obj = json.loads(line)
        print(obj)
I think you are trying to convert to JSON line by line. You need to join all the lines with \n, ignoring the first 5 lines, and then load it:
import json

with open("test.txt") as f:
    json_obj = "\n".join(f.readlines()[5:])

jsonparse = json.loads(json_obj)
print(jsonparse)
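Alternatively, if you want to keep using jsonlines: the reader's iter() method (the one visible in the traceback) accepts a skip_invalid flag, so the non-JSON header lines can simply be skipped. A sketch, assuming the header lines are the only invalid ones:

import jsonlines

# skip_invalid=True makes the reader ignore lines that are not valid
# JSON, which covers the date/time/IP header at the top of the file.
with jsonlines.open("Play.raw") as reader:
    for obj in reader.iter(skip_invalid=True):
        print(obj)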