ModuleNotFoundError: No module named 'nltk'
# Question code: POS-tag a Russian sentence with NLTK and keep the verbs
# (tag 'V') and nouns (tag 'S').  Fails with ModuleNotFoundError because
# nltk is not installed (see the answer below for installation steps).
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

sentence = "Какой был шквал?"
tokenized = word_tokenize(sentence)
pos_tagged = pos_tag(tokenized, lang='rus')
print(pos_tagged)

key_words = []
for word, pos in pos_tagged:
    if pos == 'V' or pos == 'S':
        key_words.append(word)
print(key_words)

Open shell and install nltk.
pip install nltk
Open the python interactive shell by typing python in terminal. Install the punkt and averaged_perceptron_tagger_ru packages.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_ru')
Run the following code.
# Tag a Russian sentence and collect its verbs ('V') and nouns ('S').
# Requires: pip install nltk, then nltk.download('punkt') and
# nltk.download('averaged_perceptron_tagger_ru').
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

sentence = "Какой был шквал?"
tokenized = word_tokenize(sentence)
pos_tagged = pos_tag(tokenized, lang='rus')
print(pos_tagged)

# In the Russian tagset verbs are tagged 'V' and nouns 'S'.
key_words = [word for word, pos in pos_tagged if pos in ('V', 'S')]
print(key_words)

Related

unable to use pytesseract on mac, after downloading tesseract through homebrew in terminal

the code i would like to run:
# Read an image with OpenCV and OCR its text with pytesseract.
import cv2
import pytesseract
# NOTE(review): if this raises TesseractNotFoundError, the tesseract binary
# is not on PATH; set pytesseract.pytesseract.tesseract_cmd to its location
# (e.g. "/usr/local/bin/tesseract" for a Homebrew install) before calling.
img = cv2.imread("the path to the png file")
print(pytesseract.image_to_string(img))
The error I get:
TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.
Have you set this variable?
pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"
I use pytesseract on Mac.
here is an example of code :
# Example pytesseract usage on macOS.  Point pytesseract at the tesseract
# binary explicitly so it is found even when it is not on PATH.
import cv2
import pytesseract
from pytesseract import Output

# BUG FIX: the original snippet was missing the closing quote on this string.
pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"

# The custom config removes unwanted characters (char blacklist) and selects
# page-segmentation mode 6 and OCR engine mode 3.
custom_config = r'-c tessedit_char_blacklist=|[]{}?!&§()$*:ùûÿ --psm 6 --oem 3'

# NOTE(review): `path` is not defined in this snippet -- presumably the folder
# containing the image; confirm before running.
test = path + "image.png"
img = cv2.imread(test)

# output_type='data.frame' returns a pandas DataFrame so empty rows can be
# dropped with dropna().
d = pytesseract.image_to_data(img, lang='fra', config=custom_config,
                              output_type='data.frame').dropna()
test1 = pytesseract.image_to_string(img, lang='fra', config=custom_config)

JINA#4428[C]:Can not fetch the URL of Hubble from `api.jina.ai`

I was trying out the Semantic Wikipedia Search from jina-ai.
This is the error I am getting after running python app.py -t index.
app.py is used to index the data.
JINA#4489[C]:Can not fetch the URL of Hubble from api.jina.ai
HubIO#4489[E]:Error while pulling jinahub+docker://TransformerTorchEncoder:
JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
This is app.py:
__copyright__ = "Copyright (c) 2021 Jina AI Limited. All rights reserved."
__license__ = "Apache-2.0"
import os
import sys
import click
import random
from jina import Flow, Document, DocumentArray
from jina.logging.predefined import default_logger as logger
MAX_DOCS = int(os.environ.get('JINA_MAX_DOCS', 10000))
def config(dataset: str):
    """Set the environment variables the flow reads.

    Picks the data file for the 'toy' or 'full' dataset (existing env values
    win), fixes the serving port, and points the workspace (and its docker
    mount) at ./workspace next to this file.
    """
    if dataset == 'toy':
        os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/toy-input.txt')
    elif dataset == 'full':
        os.environ['JINA_DATA_FILE'] = os.environ.get('JINA_DATA_FILE', 'data/input.txt')
    os.environ['JINA_PORT'] = os.environ.get('JINA_PORT', str(45678))
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    os.environ.setdefault('JINA_WORKSPACE', os.path.join(cur_dir, 'workspace'))
    # The mount maps the host workspace into the executor containers.
    os.environ.setdefault('JINA_WORKSPACE_MOUNT',
                          f'{os.environ.get("JINA_WORKSPACE")}:/workspace/workspace')
def print_topk(resp, sentence):
    """Pretty-print each match of *sentence* from a search response.

    Walks resp.data.docs and, per document, prints every match with its
    rank and cosine-similarity score.
    """
    for doc in resp.data.docs:
        print(f"\n\n\nTa-Dah🔮, here's what we found for: {sentence}")
        for idx, match in enumerate(doc.matches):
            score = match.scores['cosine'].value
            print(f'> {idx:>2d}({score:.2f}). {match.text}')
def input_generator(num_docs: int, file_path: str):
    """Yield up to *num_docs* Documents, one per line of *file_path*.

    Lines are shuffled first, so repeated runs index a random sample when
    num_docs is smaller than the file.
    """
    with open(file_path) as file:
        lines = file.readlines()
    num_lines = len(lines)
    random.shuffle(lines)
    for i in range(min(num_docs, num_lines)):
        yield Document(text=lines[i])
def index(num_docs):
    """Index *num_docs* documents through the flow defined in flows/flow.yml."""
    flow = Flow().load_config('flows/flow.yml')
    # NOTE(review): if JINA_DATA_FILE is unset this passes None to
    # os.path.join and raises TypeError -- config() is expected to have run.
    data_path = os.path.join(os.path.dirname(__file__), os.environ.get('JINA_DATA_FILE', None))
    with flow:
        flow.post(on='/index', inputs=input_generator(num_docs, data_path),
                  show_progress=True)
def query(top_k):
    """Prompt for a sentence and print its *top_k* matches from the index."""
    flow = Flow().load_config('flows/flow.yml')
    with flow:
        text = input('Please type a sentence: ')
        doc = Document(content=text)
        result = flow.post(on='/search', inputs=DocumentArray([doc]),
                           parameters={'top_k': top_k},
                           line_format='text',
                           return_results=True,
                           )
        print_topk(result[0], text)
# BUG FIX: the pasted code had the click decorators mangled into `#click...`
# comments, leaving the option arguments as orphaned (syntactically invalid)
# top-level lines.  Restored to proper `@click` decorators.
@click.command()
@click.option(
    '--task',
    '-t',
    type=click.Choice(['index', 'query'], case_sensitive=False),
)
@click.option('--num_docs', '-n', default=MAX_DOCS)
@click.option('--top_k', '-k', default=5)
@click.option('--dataset', '-d', type=click.Choice(['toy', 'full']), default='toy')
def main(task, num_docs, top_k, dataset):
    """CLI entry point: configure the dataset, then run the index or query task."""
    config(dataset)
    if task == 'index':
        # Refuse to index into an existing workspace; stale state breaks a re-index.
        if os.path.exists(os.environ.get("JINA_WORKSPACE")):
            logger.error(f'\n +---------------------------------------------------------------------------------+ \
\n |                                   🤖🤖🤖                                        | \
\n | The directory {os.environ.get("JINA_WORKSPACE")} already exists. Please remove it before indexing again. | \
\n |                                   🤖🤖🤖                                        | \
\n +---------------------------------------------------------------------------------+')
            sys.exit(1)
        index(num_docs)
    elif task == 'query':
        query(top_k)


if __name__ == '__main__':
    main()
This is flow.yml
version: '1' # This is the yml file version
with: # Additional arguments for the flow
workspace: $JINA_WORKSPACE # Workspace folder path
port_expose: $JINA_PORT # Network Port for the flow
executors: # Now, define the executors that are run on this flow
- name: transformer # This executor computes an embedding based on the input text documents
uses: 'jinahub+docker://TransformerTorchEncoder' # We use a Transformer Torch Encoder from the hub as a docker container
- name: indexer # Now, index the text documents with the embeddings
uses: 'jinahub://SimpleIndexer' # We use the SimpleIndexer for this purpose
And when I try to execute app.py -t index
This is the error:
JINA#3803[C]:Can not fetch the URL of Hubble from `api.jina.ai` HubIO#3803[E]:Error while pulling jinahub+docker://TransformerTorchEncoder: JSONDecodeError('Expecting value: line 1 column 1 (char 0)')
I think this just happened because the API was down. It should work now.

How to create python executable with Cython? (Segmentation fault)

I'm trying to set up a cython setup that will compile
a python source code to an executable (It should embed the main method within) - currently I have managed to set it up as an importable module but not as a standalone executable.
I saw that there is a compiler option Options.embed that should handle this. (In this post it said that it should be set to the function that the interpreter should call - main)
This is the module code:
def main():
    """Demo entry point; named in compile.py via Options.embed = "main"."""
    print('Cython Demo')


if __name__ == '__main__':
    main()
This is the setup "compile.py" code
# Cython build script: compiles mymod/moduleA.py into a C extension module.
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
from Cython.Build import cythonize
from Cython.Compiler import Options
# Strip docstrings and generated-code comments from the output.
Options.docstrings = False
Options.emit_code_comments = False
# Ask Cython to emit a C main() that boots the interpreter and calls main().
Options.embed = "main"
# NOTE(review): this still builds an importable extension module (a .so);
# executing the .so directly segfaults (see the question above).  A true
# standalone executable must be compiled/linked from the generated C file
# with the embedded main -- confirm against the Cython embedding docs.
ext_modules = cythonize([
Extension("cython_demo.mymod.moduleA",["/home/myuser/PycharmProjects/cython_demo/mymod/moduleA.py"])],
compiler_directives=dict(always_allow_keywords=True,language_level = '3'))
setup(
name = 'My Program Name',
cmdclass = {'build_ext': build_ext},
ext_modules = ext_modules
)
Unfortunately after compiling the python code and trying to run the executable by calling:
./moduleA.cpython-36m-x86_64-linux-gnu.so
i get the segmentation error.
Segmentation fault (core dumped)
I saw that the main function is there by running grep "int main" on the file. What may be the problem?
When I'm importing the module from somewhere else and running main directly - it works:
import moduleA
moduleA.main()
Thanks!

Selenium in python is giving error

I have written a script in Selenium using the Chrome driver. It works fine on some days, and on other days it raises an error. Below is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import os
import time
import csv

# Scrape the MahaRERA search page: iterate every Division -> District ->
# Project combination, run the search, and collect the result-table rows
# into data.csv.  (Indentation reconstructed from the broken paste.)
driver = webdriver.Chrome("chromedriver.exe")
driver.get('https://maharerait.mahaonline.gov.in/searchlist/searchlist')

time.sleep(1)
driver.find_element_by_id('Promoter').click()

divisionLength = len(Select(driver.find_element_by_id('Division')).options)
print('*********{}'.format(divisionLength))

firstRow = 0        # flips to 1 once the table header has been captured
titleRow = []       # header cell texts
contentRows = []    # one list of cell texts/links per data row
gdistName = ""
gdivName = ""

for divisionElement in range(1, divisionLength):
    selectDivision = Select(driver.find_element_by_id('Division'))
    selectDivision.options
    selectDivision.select_by_index(divisionElement)
    time.sleep(1)
    districtLength = len(Select(driver.find_element_by_id('District')).options)
    gdivName = (selectDivision.options)[divisionElement].text
    # NOTE(review): districtLength is never refreshed inside this loop, so it
    # spins forever if the District dropdown has not populated yet.
    while districtLength == 1:
        print("43")
        print(districtLength)
    for districtElement in range(1, districtLength):
        selectDistrict = Select(driver.find_element_by_id('District'))
        selectDistrict.options
        selectDistrict.select_by_index(districtElement)
        gdistName = (selectDistrict.options)[districtElement].text
        time.sleep(2)
        projectLength = len(Select(driver.find_element_by_id('Project')).options)
        print('/------------------------------/')
        print('/-----project number: {}-------/'.format(projectLength))
        print('/------------------------------/')
        if projectLength == 1:
            continue
        for projectElement in range(1, projectLength):
            # Re-select the district because the page resets after a search.
            # NOTE(review): selecting by a fixed index fails intermittently
            # (NoSuchElementException: "Could not locate element with index")
            # when the option list changes -- selecting by value, as the
            # answer suggests, is more robust.
            selectDistrict = Select(driver.find_element_by_id('District'))
            selectDistrict.select_by_index(0)
            selectDistrict.select_by_index(districtElement)
            time.sleep(2)
            gdistName = (selectDistrict.options)[districtElement].text
            selectProject = Select(driver.find_element_by_id('Project'))
            time.sleep(2)
            selectProject.select_by_index(projectElement)
            driver.find_element_by_id('btnSearch').click()
            tableRows = driver.find_element_by_class_name('table').find_elements_by_tag_name('tr')
            if firstRow == 0:
                # Capture the header row once, from the <th>/<span> cells.
                headRow = tableRows[0].find_elements_by_tag_name('th')
                for headRowData in range(0, len(headRow)):
                    text = headRow[headRowData].find_element_by_tag_name('span').text
                    titleRow.append(text)
                firstRow = firstRow + 1
            for dataRowsNumbers in range(1, len(tableRows)):
                dataRow = tableRows[dataRowsNumbers].find_elements_by_tag_name('td')
                tempList = []
                for dataRowContents in range(0, len(dataRow)):
                    # Prefer the link target when the cell contains an <a>,
                    # otherwise keep the cell's plain text.
                    try:
                        a_link = dataRow[dataRowContents].find_element_by_tag_name('a').get_attribute('href')
                        tempList.append(str(a_link))
                    except NoSuchElementException:
                        tempList.append(str(dataRow[dataRowContents].text))
                tempList.append(gdivName)
                tempList.append(gdistName)
                print(tempList)
                contentRows.append(tempList)

with open("./data.csv", 'w') as csvfile:
    csvfile = csv.writer(csvfile, delimiter=',')
    csvfile.writerow(titleRow)
    csvfile.writerow("")
    for i in range(0, len(contentRows)):
        csvfile.writerow(contentRows[i])

driver.close()
Please excuse the lost indentation; it was stripped when pasting.
so i receive this error when i run it..
Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\Crawlers\Maha_Rera1.py", line 68, in
<module>
selectDistrict.select_by_index(districtElement)
File
"C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-
packages\selenium\webdriver\support\select.py", line 103, in select_by_index
raise NoSuchElementException("Could not locate element with index %d" %
index)
selenium.common.exceptions.NoSuchElementException: Message: Could not locate
element with index 2
Please suggest what to change — it worked fine yesterday, but now it does not. The script requires the Chrome driver to run.
this is the website: https://maharerait.mahaonline.gov.in/searchlist/searchlist
try using select.select_by_value(districtElement) instead of index

How to soup a browser response

I've got a program that sends a lot of requests to a website using RoboBrowser and collects the answers. Now I need to filter those answers down to the ones that do not contain the string "Case Status Not Available". I tried to use BeautifulSoup for this, but it returns an error.
Here's the code so far:
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import csv
import pickle
import requests
from robobrowser import RoboBrowser
def rename_files():
    """Strip spaces from every filename in C:\\PROJECT\\pdfs."""
    file_list = os.listdir(r"C:\\PROJECT\\pdfs")
    print(file_list)
    saved_path = os.getcwd()
    print('Current working directory is ' + saved_path)
    os.chdir(r'C:\\PROJECT\\pdfs')
    for file_name in file_list:
        # NOTE(review): str.translate(None, " ") is Python 2 only; under
        # Python 3 use file_name.replace(" ", "") instead.
        os.rename(file_name, file_name.translate(None, " "))
    os.chdir(saved_path)
rename_files()
def run(command):
    """Run *command* and return (succeeded, stdout_bytes, stderr_bytes).

    On POSIX the command string is tokenized with shlex so Popen receives an
    argv list; on Windows the raw string is handed to Popen directly.
    """
    if platform.system() != 'Windows':
        args = shlex.split(command)
    else:
        args = command
    s = subprocess.Popen(args,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output, errors = s.communicate()
    return s.returncode == 0, output, errors
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
print(match)
writer.writerow(match)
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0] # Get the first form on the page
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
# Use BeautifulSoup to parse this data
print(browser.response.text)
souptwo = BeautifulSoup(browser.response.text)
texttwo = soup.get_text()
matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
if not matchtwo:
soupthree = BeautifulSoup(browser.response.text)
print soupthree
The error that returns is:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\converterpluspa.py", line 87, in <module>
matchtwo = soup.findall('<td class="fieldData">Case Status Not Available</TD>')
TypeError: 'NoneType' object is not callable
Line 87 includes an attempt to call the method findall of soup. soup was defined in line 65 where BeautifulSoup was called to parse the contents of a file. Since the error diagnostic says that soup is None this means that BeautifulSoup was unable to parse that file.