Python OCR Tesseract, find a certain word in the image and return me the coordinates - ocr

I would like your help: for a few months I have been trying to write code that finds a word in an image and returns the coordinates of that word within the image.
I tried to do this using OpenCV and Tesseract OCR, but I was not successful. Could someone here in the community help me?
I'll leave an image here as an example:

Here is something you can start with:
import pytesseract
from PIL import Image
# Point pytesseract at the Tesseract executable (Windows example path;
# adjust it to your own installation).
pytesseract.pytesseract.tesseract_cmd = r'C:\<path-to-your-tesseract>\Tesseract-OCR\tesseract.exe'

img = Image.open("img.png")

# With output_type='dict' the result is a dict of parallel lists; this script
# reads its 'left', 'top', 'width', 'height' and 'text' columns.
data = pytesseract.image_to_data(img, output_type='dict')

# Walk the parallel columns together instead of indexing each list by position.
for left, top, width, height, text in zip(
    data['left'], data['top'], data['width'], data['height'], data['text']
):
    # Skip entries whose text is empty or consists only of whitespace.
    if text.strip():
        print(left, top, width, height, text)
If you have difficulties with installing pytesseract see: https://stackoverflow.com/a/53672281/18667225
Output:
153 107 277 50 Palavras
151 197 133 37 com
309 186 154 48 R/RR
154 303 126 47 Rato
726 302 158 47 Resto
154 377 144 50 Rodo
720 379 159 47 Arroz
152 457 160 48 Carro
726 457 151 46 Ferro
154 532 142 50 Rede
726 534 159 47 Barro
154 609 202 50 Parede
726 611 186 47 Barata
154 690 124 47 Faro
726 685 288 50 Beterraba
154 767 192 47 Escuro
726 766 151 47 Ferro

I managed to find the solution and I'll post it here for you:
import pytesseract
import cv2
from pytesseract import Output
# Point pytesseract at the Tesseract executable (Windows example path;
# adjust it to your own installation).
pytesseract.pytesseract.tesseract_cmd = r'C:\<path-to-your-tesseract>\Tesseract-OCR\tesseract.exe'

filepath = 'image.jpg'
# Word to locate in the OCR output (exact, case-sensitive match).
target_word = 'UNIVERSIDADE'

image = cv2.imread(filepath, 1)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail early with a clear message rather than inside cvtColor.
if image is None:
    raise FileNotFoundError(f"Could not read image: {filepath}")

# Convert the image to grayscale.
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Binarize with Otsu thresholding. According to the author, skipping this step
# on a color image makes Tesseract detect the text incorrectly.
threshold_img = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

# Show the thresholded image and wait for a key press before continuing.
cv2.imshow('threshold image', threshold_img)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Tesseract parameters: --oem 3 selects the default OCR engine mode,
# --psm 6 assumes a single uniform block of text.
custom_config = r'--oem 3 --psm 6'

# Run OCR on the binarized image; the result is a dict of parallel lists.
details = pytesseract.image_to_data(threshold_img, output_type=Output.DICT, config=custom_config, lang='eng')

# Rectangle color; OpenCV uses BGR channel order, so this is red.
red = (0, 0, 255)

# Show all keys returned by Tesseract and every recognized text entry.
print(details.keys())
print(details['text'])

# For each recognized entry equal to the target word, print its bounding box
# and draw a rectangle around it on the original image.
for i, text in enumerate(details['text']):
    if text != target_word:
        continue
    left, top = details['left'][i], details['top'][i]
    width, height = details['width'][i], details['height'][i]
    print(text)
    print(f"left: {left}")
    print(f"top: {top}")
    print(f"width: {width}")
    print(f"height: {height}")
    cv2.rectangle(image, (left, top), (left + width, top + height), red)

# Display the annotated image; without this the drawn rectangles are never seen.
cv2.imshow('result', image)
cv2.waitKey(0)
cv2.destroyAllWindows()

Related

Error while using cenpy library in python

I am working on a project where I need to use census data for a couple of towns in MA. For that, I am using cenpy library ASC data, but I got a key error. The same error happens even when I try the example code described for Chicago. Here is the example code I use and the error I see:
# Request 2017 ACS data for Chicago, IL at census-tract level for the listed
# variables. `products` is cenpy's products module (see the file paths in the
# traceback below); this is the call that raises the KeyError shown below.
chicago = products.ACS(2017).from_place('Chicago, IL', level='tract',
variables=['B00002*', 'B01002H_001E'])
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\tiger.py:192, in ESRILayer.query(self, raw, strict, **kwargs)
191 try:
--> 192 features = datadict["features"]
193 except KeyError:
KeyError: 'features'
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
Input In [4], in <cell line: 1>()
----> 1 chicago = products.ACS(2017).from_place('Chicago, IL', level='tract',
2 variables=['B00002*', 'B01002H_001E'])
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\products.py:791, in ACS.from_place(self, place, variables, level, return_geometry, place_type, strict_within, return_bounds, replace_missing)
788 variables = self._preprocess_variables(variables)
789 variables.append("GEO_ID")
--> 791 geoms, variables, *rest = super(ACS, self).from_place(
792 place,
793 variables=variables,
794 level=level,
795 return_geometry=return_geometry,
796 place_type=place_type,
797 strict_within=strict_within,
798 return_bounds=return_bounds,
799 replace_missing=replace_missing,
800 )
801 variables["GEOID"] = variables.GEO_ID.str.split("US").apply(lambda x: x[1])
802 return_table = geoms[["GEOID", "geometry"]].merge(
803 variables.drop("GEO_ID", axis=1), how="left", on="GEOID"
804 )
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\products.py:200, in _Product.from_place(self, place, variables, place_type, level, return_geometry, geometry_precision, strict_within, return_bounds, replace_missing)
197 else:
199 placer = "STATE={} AND PLACE={}".format(placerow.STATEFP, placerow.TARGETFP)
--> 200 env = env_layer.query(where=placer)
202 print(
203 "Matched: {} to {} "
204 "within layer {}".format(
(...)
208 )
209 )
211 geoms, data = self._from_bbox(
212 env.to_crs(epsg=4326).total_bounds,
213 variables=variables,
(...)
219 replace_missing=replace_missing,
220 )
File ~\anaconda3\envs\oxe\lib\site-packages\cenpy\tiger.py:198, in ESRILayer.query(self, raw, strict, **kwargs)
196 if details is []:
197 details = "Mapserver provided no detailed error"
--> 198 raise KeyError(
199 (
200 r"Response from API is malformed. You may have "
201 r"submitted too many queries, formatted the request incorrectly, "
202 r"or experienced significant network connectivity issues."
203 r" Check to make sure that your inputs, like placenames, are spelled"
204 r" correctly, and that your geographies match the level at which you"
205 r" intend to query. The original error from the Census is:\n"
206 r"(API ERROR {}:{}({}))".format(code, msg, details)
207 )
208 )
209 todf = []
210 for i, feature in enumerate(features):
KeyError: 'Response from API is malformed. You may have submitted too many queries, formatted the request incorrectly, or experienced significant network connectivity issues. Check to make sure that your inputs, like placenames, are spelled correctly, and that your geographies match the level at which you intend to query. The original error from the Census is:\\n(API ERROR 400:Unable to complete operation.([]))'

Pandas local HTML errors

Download File — Hi! I am trying to read local HTML files with pandas, and one field does not come through as the numeric value but as a string that is not displayed in the browser yet is present in the HTML code. How can I read the table with the values as they are shown in the rendered HTML?
In the picture below you can see that I should be getting the value 00:21:44, but instead I am getting the string
"document.Write(Timefactor("0:19:46","raster"))
Any help ?
I am attaching the file.
Your problem is that you are reading raw HTML, but the browser also renders Javascript that it contains. You need to render HTML the same way the browser does.
For that you will need to install requests_html and html5lib packages. Now load and render your HTML. Then you can proceed as usual.
import pandas as pd
from requests_html import HTML
from io import StringIO

# Path of the local HTML file to load (placeholder: replace with your own file).
html_path = "<< your file here >>"

# Read the raw markup using the encoding from the original snippet.
with open(html_path, 'r', encoding='ISO-8859-1') as fi:
    html_orig = fi.read()

# Render the document so the embedded JavaScript runs, as a browser would do;
# the table cells then hold the displayed values instead of script source.
html_rendered = HTML(html=html_orig)
html_rendered.render()

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas versions.
df = pd.read_html(StringIO(html_rendered.html))
I would also suggest cleaning the rendered HTML a little before feeding it to pandas, for example:
import re
from io import StringIO

# Keep only the last <table> element of the rendered document.
last_table = html_rendered.find('table')[-1].html

# Remove <script>...</script> elements before parsing. re.DOTALL lets the
# pattern span line breaks; re.MULTILINE only changes the meaning of ^ and $,
# which this pattern does not use, so multi-line scripts were left in place.
last_table_noscript = re.sub(
    r'<script\b[^>]*>.*?</script>',
    '',
    last_table,
    flags=re.DOTALL | re.IGNORECASE,
)

# Wrap the markup in StringIO: passing a literal HTML string to read_html is
# deprecated in recent pandas versions.
df2 = pd.read_html(StringIO(last_table_noscript))
df2
[ ASS. Programa T Ferramenta Ø RC ID Cone H Total H RESP. ZMin ap/ae STK(xy/z) Comentário F RPM Tempo WKPL Notas
0 NaN 5414TR20112 2 TR32R1.6 des 32 16 M16L100 37 NaN 12793 0,2/17 0,15/ Desbaste Raster 3500 1800 00:09:46 (3+2) 2POS NaN
1 NaN 5414TR20113 3 TR35R1 35 1 M16L100 34 NaN -957 0,2/16 0/ Desbaste Raster 2000 2500 00:03:50 (3+2) 2POS NaN
2 NaN 5414TR20114 3 TR35R1 35 1 M16L100 34 NaN 12591 0,2/17 0/ Desbaste Raster 2000 2500 00:01:36 (3+2) 2POS NaN
3 NaN 5414TR20115 2 TR32R1.6 des 32 16 M16L100 37 NaN -1865 0,2/ 0/ Z Constante 3500 1800 00:34:55 (3+2) 2POS NaN
4 NaN 5414TR20116 160 EHHB-4120-Ap 12 6 CT12L75 60 36.0 505 /0,3 0/ Raster 3500 6200 00:21:44 (3+2) 2POS NaN]

How to improve OCR accuracy?

I have 2 images like the ones shown below. A.png is read perfectly by Tesseract, but the accuracy on B.png is terrible even though B.png is similar to A.png. How can I improve the accuracy? I have no idea where to start debugging.
A.png
B.png
Run OCR
# tesseract -v
tesseract 4.1.1-rc2-22-g08899
# tesseract A.png stdout -l jpn --psm 6
Warning: Invalid resolution 0 dpi. Using 70 instead.
第 3 期 決算 公告 令 和 2 年 2 月 7 日
大 阪 市 中 央 区 南 新町 一 丁目 3 番 10 号
株 式 会 社 Link_Mobile
代表 取締 役 佐々 木 勉
貸借 対照 表 の 要旨 (平成 31 年 3 月 31 日 現在 }
# tesseract B.png stdout -l jpn --psm 6
Warning: Invalid resolution 0 dpi. Using 70 instead.
。 人 加計
区 三 6 番 12 号
中 野 駅 前 ビル 5 | 、
am 人 mw
に て
貸借 対照 表 の 要旨 ( 令 和 元 年 11 月 30 日 現在 }
Update 1
Were both scanned using the same scanner, and at the same resolution?
Yes. The images that were originally included in the same PDF were cut out.
Are you taking advantage of any APIs which Tesseract exposes for pre-processing the images before doing OCR?
No. I did not know that. I am checking now about it.
It improved. I read "Tesseract documentation" and rescaled the image.
Rescaling
Tesseract works best on images which have a DPI of at least 300 dpi, so it may be beneficial to resize images. For more information see the FAQ.
Rescaled image
Run OCR
# tesseract B2.png stdout -l jpn --psm 6
第 54 期 決 算 公 告 _ 令 和 2 年 1 月 29 日
東京 都 中 野 区 中 野 三 丁目 36 番 12 号
中 野 駅 前 ビル 5 F
株 式 会 社 コ ー エ ー テ クニ カ
代表 取締 役 小 空 _ 修
貸借 対照 表 の 要旨 ( 令 和 元 年 11 月 30 日 現在 )

Keras --- About Masking Layer followed by a Reshape Layer

I want to use a Masking layer before the LSTM, but the output of the LSTM must be reshaped to 4 dimensions.
So my code:
# Model input; presumably (timesteps=96, features=1000) per sample — confirm.
main_input = Input(shape=(96,1000), name='main_input')
# NOTE(review): BatchNormalization runs before Masking, so padded values may
# no longer equal mask_value=0 when Masking sees them — confirm the ordering.
pre_input = BatchNormalization()(main_input)
aaa= Masking(mask_value=0)(pre_input)
recurrent1 = LSTM(256,return_sequences=True)(aaa)
# This is the line that raises: Reshape is passed the input mask coming from
# the layers above, and in this Keras version it does not support masking.
r_out= Reshape((1,96,256))(recurrent1)
But it runs with error:
[![enter image description here][1]][1]
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-2-d1107015501b> in <module>()
17 recurrent1 = LSTM(256,return_sequences=True)(aaa)
18
---> 19 r_out= Reshape((1,96,256))(recurrent1)
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in __call__(self, x, mask)
512 if inbound_layers:
513 # this will call layer.build() if necessary
--> 514 self.add_inbound_node(inbound_layers, node_indices, tensor_indices)
515 input_added = True
516
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in add_inbound_node(self, inbound_layers, node_indices, tensor_indices)
570 # creating the node automatically updates self.inbound_nodes
571 # as well as outbound_nodes on inbound layers.
--> 572 Node.create_node(self, inbound_layers, node_indices, tensor_indices)
573
574 def get_output_shape_for(self, input_shape):
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in create_node(cls, outbound_layer, inbound_layers, node_indices, tensor_indices)
148 if len(input_tensors) == 1:
149 output_tensors = to_list(outbound_layer.call(input_tensors[0], mask=input_masks[0]))
--> 150 output_masks = to_list(outbound_layer.compute_mask(input_tensors[0], input_masks[0]))
151 # TODO: try to auto-infer shape if exception is raised by get_output_shape_for
152 output_shapes = to_list(outbound_layer.get_output_shape_for(input_shapes[0]))
/usr/local/lib/python2.7/dist-packages/keras/engine/topology.pyc in compute_mask(self, input, input_mask)
605 else:
606 raise Exception('Layer ' + self.name + ' does not support masking, ' +
--> 607 'but was passed an input_mask: ' + str(input_mask))
608 # masking not explicitly supported: return None as mask
609 return None
Exception: Layer reshape_1 does not support masking, but was passed an input_mask: Any{2}.0
I printed it out: the output shape of recurrent1 is (96,256).
How could I make it right?

Problems using "AND" Logical Expression for defining a Mapserver class

I can't seem to get past this hurdle. Mapserver isn't throwing any errors...but it isn't returning anything either...I suspect my logical expression (... in the absence of any errors...I really have little clue what is going down here).
Ideally, I'd like to filter by my shapefile using these two columns: '[YODA] (text)' AND '[ZOOM] (Integer)'.
Currently my code reads as:
# Annotation layer "yoda_awesome" (group "yoda") reading from the
# "yoda_graphics" data source; a single CLASS selects the features to draw.
LAYER
# Zoom Level 11-16
TYPE ANNOTATION
STATUS ON
GROUP "yoda"
DATA "yoda_graphics"
NAME "yoda_awesome"
# Visible in map from zoom level 11 onwards
# (layer is drawn only between scale denominators 5078 and 325008)
MAXSCALEDENOM 325008
MINSCALEDENOM 5078
# Attribute whose value is used as the label text
LABELITEM "label"
CLASS
# Yoda Head
# Intended filter: [YODA] matches the regex ^I (case-insensitive, via ~*)
# AND [Zoom] is greater than 8.
# NOTE(review): the pattern is quoted together with slashes ('/^I/') while the
# explicit ~* operator is used; the slashes are then likely taken as literal
# characters, so no feature matches — confirm.
# NOTE(review): the column is written [Zoom] here but [ZOOM] in the
# description; confirm the case matches the attribute name in the data.
EXPRESSION (('[YODA]' ~* '/^I/') AND ([Zoom]>8)) ## where things are suspect...
# yoda shell symbol w/ label
# Three stacked STYLE blocks (sizes 16, 15, 15) follow.
STYLE
SYMBOL 'yoda_red_top_shell'
#COLOR 255 255 255
#COLOR 218 218 203
COLOR 184 184 156
SIZE 16
END
STYLE
SYMBOL 'yoda_red_top_shell'
#COLOR 225 104 104
#COLOR 204 184 181
COLOR 214 214 169
SIZE 15
END
STYLE
SYMBOL 'yoda_blue_shell'
#COLOR 80 101 123
#COLOR 183 192 221
COLOR 241 241 226
SIZE 15
END
# Label text: TrueType font "deja-bold", size 5, centered on the feature (CC),
# wrapped at spaces, and not drawn when it would only partially fit.
LABEL
TYPE truetype
FONT "deja-bold"
SIZE 5
#COLOR 255 255 255
COLOR 184 184 156
PARTIALS FALSE
WRAP " "
ALIGN center
POSITION CC
ANGLE 0
END # end label
END #end class
END # layer
You shouldn't surround your regular expression with slashes when using an explicit regular expression operator.
This is correct:
CLASSITEM "Yoda"
CLASS
EXPRESSION /^I/
In your case, use:
EXPRESSION (('[YODA]' ~* '^I') AND ([Zoom]>8))