pytesseract support for chi_tra_vert - ocr

I'm trying to recognize an image that contains vertical Traditional Chinese text with the following code.
import cv2
import pytesseract
# Read image
img = cv2.imread('image3.png')
custom_config = r'--oem 2 --psm 5'
result = pytesseract.image_to_string(img, lang='chi_tra_vert', config=custom_config)
print(result)
I got the following error:
File "ic03.py", line 49, in <module>
result = pytesseract.image_to_string(img, lang='chi_tra_vert', config=custom_config)
File "/mnt/data1/home/ray/pyenv/venv/lib/python3.8/site-packages/pytesseract/pytesseract.py", line 423, in image_to_string
return {
File "/mnt/data1/home/ray/pyenv/venv/lib/python3.8/site-packages/pytesseract/pytesseract.py", line 426, in <lambda>
Output.STRING: lambda: run_and_get_output(*args),
File "/mnt/data1/home/ray/pyenv/venv/lib/python3.8/site-packages/pytesseract/pytesseract.py", line 288, in run_and_get_output
run_tesseract(**kwargs)
File "/mnt/data1/home/ray/pyenv/venv/lib/python3.8/site-packages/pytesseract/pytesseract.py", line 264, in run_tesseract
raise TesseractError(proc.returncode, get_errors(error_string))
pytesseract.pytesseract.TesseractError: (1, "Error: Tesseract (legacy) engine requested, but components are not present in /usr/share/tesseract-ocr/4.00/tessdata/chi_tra_vert.traineddata!! Failed loading language 'chi_tra_vert'
The same code works for lang='chi_tra' (which recognizes horizontal Chinese text) and for lang='eng'.
I have already downloaded and updated the traineddata file from the repo https://github.com/tesseract-ocr/tessdata.
Please advise what else I can try. Thanks.
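One thing worth trying (a guess based on the error text, not a confirmed fix): the message says the legacy engine components are missing from chi_tra_vert.traineddata, which suggests the downloaded file is LSTM-only and cannot satisfy --oem 2 (legacy + LSTM). A minimal sketch that switches to the LSTM-only engine and lists the languages Tesseract can actually see:
import cv2
import pytesseract
img = cv2.imread('image3.png')
# --oem 1 selects the LSTM engine only, so the missing legacy components are not needed;
# --psm 5 still treats the image as a single uniform block of vertically aligned text.
custom_config = r'--oem 1 --psm 5'
result = pytesseract.image_to_string(img, lang='chi_tra_vert', config=custom_config)
print(result)
# Sanity check: confirm chi_tra_vert is among the languages Tesseract can find.
print(pytesseract.get_languages(config=''))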

Related

How to resolve coreferences without Internet using AllenNLP and coref-spanbert-large?

I want to resolve coreferences without the Internet using AllenNLP and the coref-spanbert-large model.
I am trying to do it the way described here: https://demo.allennlp.org/coreference-resolution
My code:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
predictor = Predictor.from_path(r"C:\Users\aap\Desktop\coref-spanbert-large-2021.03.10.tar.gz")
example = 'Paul Allen was born on January 21, 1953, in Seattle, Washington, to Kenneth Sam Allen and Edna Faye Allen.Allen attended Lakeside School, a private school in Seattle, where he befriended Bill Gates, two years younger, with whom he shared an enthusiasm for computers.'
pred = predictor.predict(document=example)
coref_res = predictor.coref_resolved(example)
print(pred)
print(coref_res)
When I have access to the internet, the code works correctly.
But when I don't have access to the internet, I get the following errors:
Traceback (most recent call last):
File "C:/Users/aap/Desktop/CoreNLP/Coref_AllenNLP.py", line 14, in <module>
predictor = Predictor.from_path(r"C:\Users\aap\Desktop\coref-spanbert-large-2021.03.10.tar.gz")
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\predictors\predictor.py", line 361, in from_path
load_archive(archive_path, cuda_device=cuda_device, overrides=overrides),
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\models\archival.py", line 206, in load_archive
config.duplicate(), serialization_dir
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\models\archival.py", line 232, in _load_dataset_readers
dataset_reader_params, serialization_dir=serialization_dir
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 604, in from_params
**extras,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 632, in from_params
kwargs = create_kwargs(constructor_to_inspect, cls, params, **extras)
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 200, in create_kwargs
cls.__name__, param_name, annotation, param.default, params, **extras
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 307, in pop_and_construct_arg
return construct_arg(class_name, name, popped_params, annotation, default, **extras)
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 391, in construct_arg
**extras,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 341, in construct_arg
return annotation.from_params(params=popped_params, **subextras)
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 604, in from_params
**extras,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\from_params.py", line 634, in from_params
return constructor_to_call(**kwargs) # type: ignore
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\data\token_indexers\pretrained_transformer_mismatched_indexer.py", line 63, in __init__
**kwargs,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\data\token_indexers\pretrained_transformer_indexer.py", line 58, in __init__
model_name, tokenizer_kwargs=tokenizer_kwargs
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\data\tokenizers\pretrained_transformer_tokenizer.py", line 71, in __init__
model_name, add_special_tokens=False, **tokenizer_kwargs
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\allennlp\common\cached_transformers.py", line 110, in get_tokenizer
**kwargs,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\transformers\models\auto\tokenization_auto.py", line 362, in from_pretrained
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\transformers\models\auto\configuration_auto.py", line 368, in from_pretrained
config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\transformers\configuration_utils.py", line 424, in get_config_dict
use_auth_token=use_auth_token,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\transformers\file_utils.py", line 1087, in cached_path
local_files_only=local_files_only,
File "C:\Users\aap\Desktop\CoreNLP\corenlp\lib\site-packages\transformers\file_utils.py", line 1268, in get_from_cache
"Connection error, and we cannot find the requested files in the cached path."
ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.
Process finished with exit code 1
Please tell me what I need to do so that my code works without the Internet.
You will need a local copy of the transformer model's configuration file and vocabulary so that the tokenizer and token indexer don't need to download them:
from transformers import AutoConfig, AutoTokenizer
# transformer_model_name is the Hugging Face name of the transformer used inside the archive;
# local_config_path is a local directory where the files will be saved.
tokenizer = AutoTokenizer.from_pretrained(transformer_model_name)
config = AutoConfig.from_pretrained(transformer_model_name)
tokenizer.save_pretrained(local_config_path)
config.to_json_file(local_config_path + "/config.json")
You will then need to override the transformer model name in the configuration file to the local directory (local_config_path) where you saved these things:
predictor = Predictor.from_path(
    r"C:\Users\aap\Desktop\coref-spanbert-large-2021.03.10.tar.gz",
    overrides={
        "dataset_reader.token_indexers.tokens.model_name": local_config_path,
        "validation_dataset_reader.token_indexers.tokens.model_name": local_config_path,
        "model.text_field_embedder.tokens.model_name": local_config_path,
    },
)
I ran into a similar problem when using structured-prediction-srl-bert without internet, and I saw four items being downloaded in the logs:
dataset_reader.bert_model_name = bert-base-uncased, Downloading 4 files
model INFO vocabulary.py - Loading token dictionary from data/structured-prediction-srl-bert.2020.12.15/vocabulary. Downloading... 4x smaller files
Spacy models 'en_core_web_sm' not found
later on, [nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary failure in name resolution> [nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary failure in name resolution>
I have solved it with these steps:
structured-prediction-srl-bert:
I have downloaded the structured-prediction-srl-bert.2020.12.15.tar.gz from the https://demo.allennlp.org/semantic-role-labeling (Model Card tab) -
https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz
I have unzipped it into ./data/structured-prediction-srl-bert.2020.12.15
The code:
pip install allennlp==2.10.0 allennlp-models==2.10.0
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("./data/structured-prediction-srl-bert.2020.12.15/")
bert-base-uncased
I have created a folder ./data/bert-base-uncased and downloaded these files into it from https://huggingface.co/bert-base-uncased/tree/main:
config.json
tokenizer.json
tokenizer_config.json
vocab.txt
pytorch_model.bin
Additionally, I had to change the "bert_model_name" from "bert-base-uncased" to the path "./data/bert-base-uncased"; the former causes the download. This has to be done in ./data/structured-prediction-srl-bert.2020.12.15/config.json, and there are two occurrences.
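A minimal sketch of that edit done in Python, assuming a plain text replacement of the value is acceptable (the exact nesting of "bert_model_name" inside config.json is not shown here, so the replacement targets the value string wherever it appears):
from pathlib import Path
config_path = Path("./data/structured-prediction-srl-bert.2020.12.15/config.json")
text = config_path.read_text()
# Point both occurrences of the model name at the local folder instead of the Hub name.
text = text.replace('"bert-base-uncased"', '"./data/bert-base-uncased"')
config_path.write_text(text)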
python -m spacy download en_core_web_sm
python -c 'import nltk; nltk.download("punkt"); nltk.download("wordnet")'
After these steps, AllenNLP did not need the internet anymore.

Upload Pandas dataframe as a JSON object in Cloud Storage

I have been trying to upload a Pandas dataframe as a JSON object to Cloud Storage using a Cloud Function. Following is my code:
import pandas as pd
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_file(source_file_name)
    print('File {} uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))

# df and df_second are dataframes built earlier (not shown in the question).
final_file = pd.concat([df, df_second], axis=0)
final_file.to_json('/tmp/abc.json')

with open('/tmp/abc.json', 'r') as file_obj:
    upload_blob('test-bucket', file_obj, 'abc.json')
I am getting the following error at the line blob.upload_from_file(source_file_name):
Deployment failure:
Function failed on loading user code. Error message: Code in file main.py
can't be loaded.
Detailed stack trace: Traceback (most recent call last):
File "/env/local/lib/python3.7/site-
packages/google/cloud/functions/worker.py", line 305, in
check_or_load_user_function
_function_handler.load_user_function()
File "/env/local/lib/python3.7/site-
packages/google/cloud/functions/worker.py", line 184, in load_user_function
spec.loader.exec_module(main)
File "<frozen importlib._bootstrap_external>", line 728, in exec_module
File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
File "/user_code/main.py", line 6, in <module>
import datalab.storage as gcs
File "/env/local/lib/python3.7/site-packages/datalab/storage/__init__.py",
line 16, in <module>
from ._bucket import Bucket, Buckets
File "/env/local/lib/python3.7/site-packages/datalab/storage/_bucket.py",
line 21, in <module>
import datalab.context
File "/env/local/lib/python3.7/site-packages/datalab/context/__init__.py",
line 15, in <module>
from ._context import Context
File "/env/local/lib/python3.7/site-packages/datalab/context/_context.py",
line 20, in <module>
from . import _project
File "/env/local/lib/python3.7/site-packages/datalab/context/_project.py",
line 18, in <module>
import datalab.utils
File "/env/local/lib/python3.7/site-packages/datalab/utils/__init__.py",
line 15
from ._async import async, async_function, async_method
^
SyntaxError: invalid syntax
What could the error possibly be?
You are passing a string to blob.upload_from_file(), but this method requires a file object. You probably want to use blob.upload_from_filename() instead. Check the sample in the GCP docs.
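For reference, a minimal sketch of the upload_from_filename() variant, reusing the bucket and file names from the question (illustrative only, not verified against your deployment):
from google.cloud import storage

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a local file to the bucket by its path."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)

upload_blob('test-bucket', '/tmp/abc.json', 'abc.json')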
Alternatively, you could get the file object and keep using blob.upload_from_file(), but it adds unnecessary extra lines.
with open('/tmp/abc.json', 'r') as file_obj:
    upload_blob('test-bucket', file_obj, 'abc.json')
Use a bucket object instead of a string,
something like upload_blob(conn.get_bucket(mybucket), '/tmp/abc.json', 'abc.json')
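A hedged sketch of that variant, assuming upload_blob is rewritten to take an already-constructed bucket object and a local file path (conn here stands for a storage.Client() instance; the names are illustrative):
from google.cloud import storage

def upload_blob(bucket, source_file_name, destination_blob_name):
    """Uploads a local file to an already-constructed bucket object."""
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)

conn = storage.Client()
upload_blob(conn.get_bucket('test-bucket'), '/tmp/abc.json', 'abc.json')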

JSON Parsing with Nao robot - AttributeError

I'm using a NAO robot with naoqi version 2.1 and Choregraphe on Windows. I want to parse JSON from a file attached to the behavior. I attached the file as described in that link.
Code:
def onLoad(self):
    self.filepath = os.path.join(os.path.dirname(ALFrameManager.getBehaviorPath(self.behaviorId)), "fileName.json")

def onInput_onStart(self):
    with open(self.filepath, "r") as f:
        self.data = self.json.load(f.get_Response())
    self.dataFromFile = self.data['value']
    self.log("Data from file: " + str(self.dataFromFile))
But when I run this code on the robot (connected via a router), I get the following error:
[ERROR] behavior.box :_safeCallOfUserMethod:281 _Behavior__lastUploadedChoregrapheBehaviorbehavior_1136151280__root__AbfrageKontostand_3__AuslesenJSONDatei_1: Traceback (most recent call last):
File "/usr/lib/python2.7/site-packages/naoqi.py", line 271, in _safeCallOfUserMethod
func()
File "<string>", line 20, in onInput_onStart
File "/usr/lib/python2.7/site-packages/inaoqi.py", line 265, in <lambda>
__getattr__ = lambda self, name: _swig_getattr(self, behavior, name)
File "/usr/lib/python2.7/site-packages/inaoqi.py", line 55, in _swig_getattr
raise AttributeError(name)
AttributeError: json
I have already tried to understand the code at the corresponding lines, but I couldn't fix the error. I do know that the type of my object f is 'file'. How can I open the JSON file as a JSON file?
Your problem comes from this:
self.json.load(f.get_Response())
... there is no such thing as "self.json" on a Choregraphe box; import json and then call json.load. And what is get_Response? That method doesn't exist on anything in Python that I know of.
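A minimal sketch of what the box code might look like with those two changes, assuming the attached file is plain JSON with a 'value' key as in the question (a guess at intent, not a verified Choregraphe box):
import json

def onInput_onStart(self):
    # json.load() accepts the open file object directly; no get_Response() is needed.
    with open(self.filepath, "r") as f:
        self.data = json.load(f)
    self.dataFromFile = self.data['value']
    self.log("Data from file: " + str(self.dataFromFile))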
You might want to first try making a standalone Python script (that doesn't use the robot) that can read your JSON file before you try it with Choregraphe. It will be easier.

error while using nltk.pos_tag

I have been trying to use nltk.pos_tag in my code, but I get an error when I do so. I have already downloaded the Penn Treebank and max_ent_treebank_pos resources, but the error persists. Here is my code:
import nltk
from nltk import tag
from nltk import *
a = "Alan Shearer is the first player to score over a hundred Premier League goals."
a_sentences = nltk.sent_tokenize(a)
a_words = [nltk.word_tokenize(sentence) for sentence in a_sentences]
a_pos = [nltk.pos_tag(sentence) for sentence in a_words]
print(a_pos)
and this is the error I get:
Traceback (most recent call last):
File "<pyshell#9>", line 1, in <module>
print (nltk.pos_tag(text))
File "C:\Python34\lib\site-packages\nltk\tag\__init__.py", line 110, in pos_tag
tagger = PerceptronTagger()
File "C:\Python34\lib\site-packages\nltk\tag\perceptron.py", line 140, in __init__
AP_MODEL_LOC = 'file:'+str(find('taggers/averaged_perceptron_tagger/'+PICKLE))
File "C:\Python34\lib\site-packages\nltk\data.py", line 641, in find
raise LookupError(resource_not_found)
LookupError:
Resource 'taggers/averaged_perceptron_tagger/averaged_perceptron
_tagger.pickle' not found. Please use the NLTK Downloader to
obtain the resource: >>> nltk.download()
Searched in:
- 'C:\\Users\\T01142/nltk_data'
- 'C:\\nltk_data'
- 'D:\\nltk_data'
- 'E:\\nltk_data'
- 'C:\\Python34\\nltk_data'
- 'C:\\Python34\\lib\\nltk_data'
- 'C:\\Users\\T01142\\AppData\\Roaming\\nltk_data'
Call this from Python:
nltk.download('averaged_perceptron_tagger')
I had the same problem in a Flask server. nltk used a different path when running under the server config, so I resorted to adding nltk.data.path.append("/home/yourusername/whateverpath/") inside the server code right before the pos_tag call.
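A minimal sketch of that workaround, assuming the NLTK data was downloaded to a custom directory (the path is illustrative):
import nltk

# Point NLTK at the directory that actually holds the downloaded nltk_data folders.
nltk.data.path.append("/home/yourusername/whateverpath/")

# pos_tag expects a list of tokens.
print(nltk.pos_tag(["Alan", "Shearer", "scored", "over", "a", "hundred", "goals", "."]))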
Note there is some replication of this question:
How to config nltk data directory from code?
nltk doesn't add $NLTK_DATA to search path?
POS tagging with NLTK. Can't locate averaged_perceptron_tagger
To resolve this error, run the following commands at a Python prompt:
import nltk
nltk.download('averaged_perceptron_tagger')

Pandas: read_csv() with engine=C issue (bug or feature?)

I am using pandas 0.18 with Python 2.7.9 on SUSE Linux Enterprise 11.
I have a file that contains multiple tables:
TABLE_A
col1,col2,...,col8
...
TABLE_B
col1,col2,...,col7
...
Table A is about 7300 lines and Table B is about 100 lines. I make an initial pass through the file to determine the start/end positions of each table. Then I use read_csv() in pandas with the skiprows and nrows options to read the appropriate table into memory. I'm using engine='c'.
I'm seeing weird behavior with engine='c'. I'm able to read the first 4552 or so lines of TABLE_A without any issues, but if I try to read 4553 lines, I get an error:
>>> df = pd.read_csv(f,engine='c',skiprows=1,nrows=4552)
>>> df.shape
(4552, 7)
>>> df = pd.read_csv(f,engine='c',skiprows=1,nrows=4553)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/python_pkgs/lib/python2.7/site-packages/pandas-0.18.0-py2.7-linux-x86_64.egg/pandas/io/parsers.py", line 529, in parser_f
return _read(filepath_or_buffer, kwds)
File "/python_pkgs/lib/python2.7/site-packages/pandas-0.18.0-py2.7-linux-x86_64.egg/pandas/io/parsers.py", line 301, in _read
return parser.read(nrows)
File "/python_pkgs/lib/python2.7/site-packages/pandas-0.18.0-py2.7-linux-x86_64.egg/pandas/io/parsers.py", line 763, in read
ret = self._engine.read(nrows)
File "/python_pkgs/lib/python2.7/site-packages/pandas-0.18.0-py2.7-linux-x86_64.egg/pandas/io/parsers.py", line 1213, in read
data = self._reader.read(nrows)
File "pandas/parser.pyx", line 766, in pandas.parser.TextReader.read (pandas/parser.c:7988)
File "pandas/parser.pyx", line 800, in pandas.parser.TextReader._read_low_memory (pandas/parser.c:8444)
File "pandas/parser.pyx", line 842, in pandas.parser.TextReader._read_rows (pandas/parser.c:8970)
File "pandas/parser.pyx", line 829, in pandas.parser.TextReader._tokenize_rows (pandas/parser.c:8838)
File "pandas/parser.pyx", line 1833, in pandas.parser.raise_parser_error (pandas/parser.c:22649)
pandas.parser.CParserError: Error tokenizing data. C error: Expected 7 fields in line 7421, saw 8
From the error message it seems like the C parser has continued reading well past the specified rows and has encountered TABLE_B, which has only 7 columns (TABLE_A has 8 columns).
However, reading with engine='python' works OK:
>>> df = pd.read_csv(f,engine='python',skiprows=1,nrows=6000)
>>> df.shape
(6000, 7)
>>>
So is this a bug or a feature/limitation? Perhaps it has to do with the way the C parser reads ahead in chunks? Thanks.
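Not an explanation of the C parser internals, but a hedged workaround sketch, assuming the table boundaries found in the initial pass are available: slice the exact lines out of the file before handing them to read_csv, so the parser never sees the next table (the file name and line positions are illustrative; on Python 2, StringIO.StringIO would replace io.StringIO):
import itertools
from io import StringIO

import pandas as pd

def read_table(path, header_line, nrows):
    """Read `nrows` data rows whose header sits on 0-based line `header_line`."""
    with open(path) as fh:
        # Take exactly the header plus nrows data lines, nothing more.
        lines = itertools.islice(fh, header_line, header_line + nrows + 1)
        return pd.read_csv(StringIO(''.join(lines)))

# Illustrative positions: TABLE_A's header on line 1, followed by 4553 data rows.
df_a = read_table('tables.txt', 1, 4553)
print(df_a.shape)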