OSError Traceback (most recent call last) ~\AppData\Local\Temp/ipykernel_8264/243236269.py in <module> - ocr

I am trying to run code that converts an image to text. Code below:
from PIL import Image
import pytesseract
import os

image = Image.open(r"C:\Datasets\demo.jpg")
image = image.resize((300, 150))
custom_config = r'-l eng --oem 3 --psm 6'
text = pytesseract.image_to_string(image, config=custom_config)
print(text)

# To save the text in a text file
filename = r"C:\demo.txt"
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w") as f:
    f.write(text)
Error:
OSError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8264/243236269.py in <module>
14 image = image.resize((300,150))
15 custom_config = r'-l eng --oem 3 --psm 6'
---> 16 text = pytesseract.image_to_string(image,config=custom_config)
17 print(text)
18
How can I get this to run and save the extracted text to a txt file?
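On Windows, this OSError from pytesseract most often means the Tesseract executable itself cannot be found, not that the image is bad. A minimal sketch, assuming Tesseract was installed to its default location (the path below is an assumption; adjust it to wherever tesseract.exe actually lives on your machine):

import pytesseract

# Assumed install path; change to your actual Tesseract location.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"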

Related

How to use transformations of my choice during inference with Test Time Augmentation in fastai?

I am using Test Time Augmentation during inference, like so:
file_path = '/path/to/file.jpg'
dl = learn_.dls.test_dl([file_path])
pred, _ = learn_.tta(dl=dl, n=N_IMAGES)
When I try to add additional transformations of my choice, I am unable to do so. If I try to add transforms using either the item_tfms or batch_tfms parameters, following the docs, like this:
pred, _ = learn_.tta(dl=dl,
                     n=N_IMAGES,
                     item_tfms=Resize(256),
                     batch_tfms=Zoom(p=1, draw=2.0))
I get this error:
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'fastai.vision.core.PILImage'>
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-86c798126984> in <module>()
1 # tta
2 dl = learn_.dls.test_dl([file_path])
----> 3 pred, _ = learn_.tta(dl=dl, n=N_IMAGES, item_tfms=Resize(256), batch_tfms=Zoom(p=1, draw=2.0))
4 cat = learn_.dls.vocab[torch.argmax(pred).item()]
5 cat.lstrip()
9 frames
/usr/local/lib/python3.7/dist-packages/torch/_utils.py in reraise(self)
423 # have message field
424 raise self.exc_type(message=msg)
--> 425 raise self.exc_type(msg)
426
427
TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 34, in fetch
data = next(self.dataset_iter)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 118, in create_batches
yield from map(self.do_batch, self.chunkify(res))
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 144, in do_batch
def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 143, in create_batch
def create_batch(self, b): return (fa_collate,fa_convert)[self.prebatched](b)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 50, in fa_collate
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 50, in <listcomp>
else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
File "/usr/local/lib/python3.7/dist-packages/fastai/data/load.py", line 51, in fa_collate
else default_collate(t))
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/collate.py", line 86, in default_collate
raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'fastai.vision.core.PILImage'>
Is there any way I can use additional transformations during inference time with tta?
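One possible workaround, stated as an assumption about fastai's internals: tta appears to replace the DataLoader's item and batch pipelines with the transforms you pass rather than appending to them, so the default ToTensor/IntToFloatTensor steps are lost and raw PILImage objects reach the collate function. A hedged sketch that re-adds them (file_path, N_IMAGES and learn_ as defined above):

from fastai.vision.all import *

dl = learn_.dls.test_dl([file_path])
# Re-add the tensor-conversion transforms so collation sees tensors,
# not PILImage objects (assumption: the passed lists replace the defaults).
pred, _ = learn_.tta(dl=dl, n=N_IMAGES,
                     item_tfms=[Resize(256), ToTensor()],
                     batch_tfms=[IntToFloatTensor(), Zoom(p=1, draw=2.0)])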

Error while setting up roBERTa model in colab notebook

I am getting an error while merging the vocabulary and merges txt files for tokenizers designed for TensorFlow roBERTa.
Code:
import tokenizers

tokenizer = tokenizers.ByteLevelBPETokenizer(vocab_file='vocab_roberta_base.json',
                                             merges_file='merges_roberta_base.txt',
                                             lowercase=True, add_prefix_space=True)
ERROR:
Exception Traceback (most recent call last)
<ipython-input-9-5dab9f2389e4> in <module>()
1 MAX_LEN = 96
----> 2 tokenizer = tokenizers.ByteLevelBPETokenizer(vocab_file='vocab_roberta_base.json',merges_file='merges_roberta_base.txt')
3 sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
/usr/local/lib/python3.6/dist-packages/tokenizers/implementations/byte_level_bpe.py in __init__(self, vocab_file, merges_file, add_prefix_space, lowercase, dropout, unicode_normalizer, continuing_subword_prefix, end_of_word_suffix)
31 dropout=dropout,
32 continuing_subword_prefix=continuing_subword_prefix or "",
---> 33 end_of_word_suffix=end_of_word_suffix or "",
34 )
35 )
Exception: expected ident at line 1 column 2
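The message "expected ident at line 1 column 2" comes from the JSON parser inside the tokenizers library, which suggests vocab_roberta_base.json is not actually valid JSON. A common cause (an assumption here, not confirmed by the question) is a failed or partial download that saved an HTML error page or a Git LFS pointer file under the .json name. A quick sanity check:

# The vocab file should be a JSON object, so the first byte should be '{'.
with open('vocab_roberta_base.json', 'rb') as f:
    print(f.read(64))  # an HTML page or LFS pointer starts with other text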

UnicodeDecodeError in StanfordNERTagger compilation

I'm using the NLTK wrapper for NER tagging with the Stanford 3-class model. On raw BBC news text written in English, I am getting a UnicodeDecodeError.
Here is my code:
from nltk.tag import StanfordNERTagger
st1 = StanfordNERTagger('/home/saurabh/saurabh-cair/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz', '/home/saurabh/saurabh-cair/stanford-ner-2018-10-16/stanford-ner.jar', encoding='utf-8')
file=open('/home/saurabh/saurabh-cair/model_training/bbc/data.txt','rt')
text=file.read()
file.close()
import nltk
words = nltk.word_tokenize(text)
xyz=st1.tag(words)
for i in xyz:
    print(i)
I got this error:
Traceback (most recent call last):
File "model_english.py", line 26, in <module>
words = nltk.word_tokenize(text)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 128, in word_tokenize
sentences = [text] if preserve_line else sent_tokenize(text, language)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/__init__.py", line 95, in sent_tokenize
return tokenizer.tokenize(text)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1241, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1291, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1281, in span_tokenize
for sl in slices:
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1322, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 314, in _pair_iter
for el in it:
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1297, in _slices_from_text
if self.text_contains_sentbreak(context):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1343, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1478, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 313, in _pair_iter
prev = next(it)
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 584, in _annotate_first_pass
for aug_tok in tokens:
File "/home/saurabh/anaconda2/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 550, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 in position 1: ordinal not in range(128)
I tried utf-8, ascii, and the default encoding as well, but it didn't resolve my problem.
Text data contains sentences like:
General Motors of the US is to pay Fiat 1.55bn euros ($2bn; £1.1bn) to get out of a deal which could have forced it to buy the Italian car maker outright.
I'm using Anaconda Python 2.7.
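Note that the traceback dies inside nltk.word_tokenize, before the Stanford tagger is ever reached: in Python 2.7, file.read() returns raw bytes, and punkt implicitly tries to decode them as ascii. Byte 0xc2 is a UTF-8 lead byte; the "£" in your sample sentence is encoded as 0xc2 0xa3. A minimal sketch, assuming data.txt is UTF-8 encoded, that decodes to unicode before tokenizing:

import io
import nltk

# Decode to unicode up front so NLTK never has to guess the encoding.
with io.open('/home/saurabh/saurabh-cair/model_training/bbc/data.txt',
             'r', encoding='utf-8') as f:
    text = f.read()  # a unicode object, not bytes

words = nltk.word_tokenize(text)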

CSV error opening file

I am getting an error opening a file that I can't resolve. I am able to open
this exact file with no issues using another small program I wrote.
First Program (doesn't work):
import csv

passwd = "f:\mark\python\etc_password.txt"
output = "f:\mark\python\output.txt"
with open(passwd, 'r') as passwd1, open(output, 'w') as output1:
    ro = csv.reader(passwd1, delimiter=':')
    wo = csv.writer(output1, delimiter='\t')
for record in ro:
    # if not record[0].startswith('#'):
    if len(record) > 1:
        wo.writerow((record[0], record[2]))
Error:
Traceback (most recent call last):
File "C:/Users/Mark/PycharmProjects/main/main.py", line 11, in <module>
for record in ro:
ValueError: I/O operation on closed file.
Second Program (works):
etcfile = "f:\mark\python\etc_password.txt"
users = {}
with open(etcfile, "r") as datafile:
    for line in datafile:
        if not line.startswith("#"):
            info = line.split(':')
            users[info[0]] = info[2]

for username in sorted(users):
    print("{}:{}".format(username, users[username]))
The first program has the issue that I can't figure out. The second program works just fine opening the same file.
The error ValueError: I/O operation on closed file. is telling you that you cannot read from a closed file. If you look at the indentation of your first program, you are opening a csv reader on a file which is then closed at the end of the with block. A simpler example of this behavior would be:
In [1]: import csv
In [2]: file = open('test.csv')
In [3]: ro = csv.reader(file)
In [4]: file.close()
In [5]: for record in ro:
   ...:     print(record)
   ...:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-5-1f7adaf76d31> in <module>()
----> 1 for record in ro:
2 print(record)
3
ValueError: I/O operation on closed file.
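For completeness, here is a corrected version of the first program; the only change is that the loop is indented into the with block, so it runs while both files are still open:

import csv

passwd = "f:\mark\python\etc_password.txt"
output = "f:\mark\python\output.txt"
with open(passwd, 'r') as passwd1, open(output, 'w') as output1:
    ro = csv.reader(passwd1, delimiter=':')
    wo = csv.writer(output1, delimiter='\t')
    # Iterate while the files are still open.
    for record in ro:
        if len(record) > 1:
            wo.writerow((record[0], record[2]))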

Error loading JSON using Topsy

When I load a single record, the JSON is created just fine; when I try to load multiple records, I get this error. Sorry, I am new to Python. http://tny.cz/ce1baaba
Traceback (most recent call last):
File "TweetGraber.py", line 26, in <module>
get_tweets_by_query(topic)
File "TweetGraber.py", line 15, in get_tweets_by_query
json_tree = json.loads(source)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/json/decoder.py", line 368, in decode
raise ValueError(errmsg("Extra data", s, end, len(s)))
ValueError: Extra data: line 2 column 1 - line 11 column 1 (char 2380 - 46974)
Here is my code:

import json
import urllib
from pprint import pprint

def get_tweets_by_query(query, count=10):
    """A function that gets the tweets by a query."""
    Tweets = []
    queryEncoded = urllib.quote(query)
    api_key = "xxxxx"
    source = urllib.urlopen("http://api.topsy.com/v2/content/bulktweets.json?q=%s&type=tweet&offset=0&perpage=%s&window=realtime&apikey=%s" % (queryEncoded, count, api_key)).read()
    json_tree = json.loads(source)
    pprint(json_tree)

topic = raw_input("Please enter a topic: ")
get_tweets_by_query(topic)
Thanks Timusan, I was able to correct my JSON. The problem with the original was that it was missing the root element "[", which indicates an array is expected, and a "," was missing after the end of each object. Here is the fixed code:
import json
import urllib
from pprint import pprint

def get_tweets_by_query(query, count=10):
    """A function that gets the tweets by a query."""
    Tweets = []
    queryEncoded = urllib.quote(query)
    api_key = "xx"
    source = urllib.urlopen("http://api.topsy.com/v2/content/bulktweets.json?q=%s&type=tweet&offset=0&perpage=%s&window=realtime&apikey=%s" % (queryEncoded, count, api_key)).read()
    source = "[" + source + "]"
    source = source.replace("}\n{", "},{")
    json_tree = json.loads(source)
    pprint(json_tree)
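As a side note, a hedged alternative sketch (assuming the API really returns one JSON object per line, i.e. newline-delimited JSON): parsing each line separately avoids the string surgery and is more robust to formatting changes. parse_ndjson is a hypothetical helper, not part of the original code:

import json

def parse_ndjson(source):
    # One JSON object per non-empty line.
    return [json.loads(line) for line in source.splitlines() if line.strip()]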