I want to create an LDA topic model and am using spaCy to do so, following a tutorial. The error I receive when I try to use spaCy is one I cannot find on Google, so I'm hoping someone here knows what it's about.
I'm running this code on Anaconda:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
df = pd.DataFrame(data)
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])
And I receive the following error:
File "C:\Users\maart\AppData\Local\Continuum\anaconda3\lib\site-packages\_regex_core.py", line 1880, in get_firstset
raise _FirstSetError()
_FirstSetError
The error must occur somewhere after the lemmatization, because the other parts work fine.
Thanks a bunch!
I had this same issue and I was able to resolve it by uninstalling regex (I had the wrong version installed) and then running python -m spacy download en again. This will reinstall the correct version of regex.
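For reference, the fix amounts to roughly these two commands, run inside the same Anaconda environment (the exact package manager may differ on your setup; this is just a sketch of the steps described above):
pip uninstall regex
python -m spacy download en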
I'm trying to build my own desktop assistant and I've got a problem with the first line. I checked whether I have an extra space or line, but everything looks OK. Could you please check if anything is wrong?
I've added my script in progress below. Thank you all!
import speech_recognition as sr
import os
import sys
import re
import webbrowser
import smtplib
import requests
import subprocess
from pyowm import OWM
import youtube_dl
import vlc
import urllib
import urllib2
import json
from bs4 import BeautifulSoup as soup
from urllib2 import urlopen
import wikipedia
import random
from time import strftime
def sofiaResponse (audio);
    "speaks audio passed as argument"
    print(audio)
    for line in audio.splitlines():
        os.system("say" + audio)

def myCommand ():
    "listens for commands"
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print('Say something...')
        r.pause_threshold = 1
        r.adjust_for_ambient_noise(source, duration=1)
        audio = r.listen(source)
    try:
        command = r.recognize_google(audio).lower()
        print('You said: ' + command + '\n')
        #loop back to continue listening
    except sr.UnknownValueError:
        print('Error, help me error')
        command = myCommand();
    return command

def assistant (command):
    "if statements for executing commands"
On line 22, you made a typo:
def sofiaResponse (audio);
should be
def sofiaResponse (audio):
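With the colon in place the definition parses. For completeness, a corrected sketch of that function could look like the following (it also adds the space after "say" and speaks each line, which the original body appears to intend; the say command assumes macOS):
def sofiaResponse(audio):
    "speaks audio passed as argument"
    print(audio)
    for line in audio.splitlines():
        os.system("say " + line)  # macOS text-to-speech; note the space after "say"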
# USAGE
# python train_simple_nn.py --dataset animals --model output/simple_nn.model --label-bin output/simple_nn_lb.pickle --plot output/simple_nn_plot.png
# set the matplotlib backend so figures can be saved in the background
import matplotlib
matplotlib.use("Agg")
# import the necessary packages
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from imutils import paths
import matplotlib.pyplot as plt
import numpy as np
import argparse
import random
import pickle
import cv2
import os
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True, help="path to input dataset of images")
ap.add_argument("-m", "--model", required=True, help="path to output trained model")
ap.add_argument("-l", "--label-bin", required=True, help="path to output label binarizer")
ap.add_argument("-p", "--plot", required=True, help="path to output accuracy/loss plot")
args = vars(ap.parse_args())
# initialize the data and labels
print("[INFO] loading images...")
data = []
labels = []
# grab the image paths and randomly shuffle them
imagePaths = sorted(list(paths.list_images(args["dataset"])))
random.seed(42)
random.shuffle(imagePaths)
# loop over the input images
for imagePath in imagePaths:
    # load the image, resize it to 32x32 pixels (ignoring aspect ratio),
    # flatten it into a 32x32x3 = 3072-element vector, and store it in the data list
    image = cv2.imread(imagePath)
    image = cv2.resize(image, (32, 32)).flatten()
    data.append(image)
    # extract the class label from the image path and update the labels list
    label = imagePath.split(os.path.sep)[-2]
    labels.append(label)
# scale the raw pixel intensities to the range [0, 1]
data = np.array(data, dtype="float") / 255.0
labels = np.array(labels)
I found some test code for studying deep learning and tried to run it in PyCharm, but I got this error message.
Actually, I couldn't understand what the argument parser is doing here.
Could you explain that part of the code and the error?
--- error I got in PyCharm ---
C:\Users\giyeo\anaconda3\envs\tf\python.exe "D:/GiyeonLee/09. Machine Learning/Pycharm/Tutorial/keras-tutorial/train_simple_nn.py"
2020-07-06 13:56:28.409237: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll
usage: train_simple_nn.py [-h] -d DATASET -m MODEL -l LABEL_BIN -p PLOT
train_simple_nn.py: error: the following arguments are required: -d/--dataset, -m/--model, -l/--label-bin, -p/--plot
Process finished with exit code 2
Thanks for reading my question.
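For what it's worth, the argparse error just means the script was launched without the flags it declares as required; the usage comment at the top of the script shows the intended invocation, roughly:
python train_simple_nn.py --dataset animals --model output/simple_nn.model --label-bin output/simple_nn_lb.pickle --plot output/simple_nn_plot.png
In PyCharm, these arguments can be supplied in the run configuration's Parameters field; the paths are whatever your own dataset and output locations happen to be.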
I am currently working on a project dealing with a bunch of social media posts.
Some of these posts are in English and some in Spanish.
My current code runs quite smoothly. However, I am asking myself: does spaCy/NLTK automatically detect which language stemmer/stopwords/etc. it has to use for each post (depending on whether it is an English or a Spanish post)? At the moment, I am just passing each post to a stemmer without explicitly specifying the language.
This is a snippet of my current script:
import re
import pandas as pd
!pip install pyphen
import pyphen
!pip install spacy
import spacy
!pip install nltk
import nltk
from nltk import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
!pip install spacy-langdetect
from spacy_langdetect import LanguageDetector
!pip install textblob
from textblob import TextBlob
# Download Stopwords
nltk.download('stopwords')
stop_words_eng = set(stopwords.words('english'))
stop_words_es = set(stopwords.words('spanish'))
# Import Stemmer
p_stemmer = PorterStemmer()
#Snowball (Porter2): Nearly universally regarded as an improvement over porter, and for good reason.
snowball_stemmer = SnowballStemmer("english")
dic = pyphen.Pyphen(lang='en')
# Load Data
data = pd.read_csv("mergerfile.csv", error_bad_lines=False)
pd.set_option('display.max_columns', None)
posts = data.loc[data["ad_creative"] != "NONE"]
# Functions
def get_number_of_sentences(text):
    sentences = [sent.string.strip() for sent in text.sents]
    return len(sentences)

def get_average_sentence_length(text):
    number_of_sentences = get_number_of_sentences(text)
    tokens = [token.text for token in text]
    return len(tokens) / number_of_sentences

def get_token_length(text):
    tokens = [token.text for token in text]
    return len(tokens)

def text_analyzer(data_frame):
    content = []
    label = []
    avg_sentence_length = []
    number_sentences = []
    number_words = []
    for string in data_frame:
        string.join("")
        if len(string) <= 4:
            print(string)
            print("filtered")
            content.append(string)
            avg_sentence_length.append("filtered")
            number_sentences.append("filtered")
            number_words.append("filtered")
        else:
            # print list
            print(string)
            content.append(string)
            ## Average sentence length
            result = get_average_sentence_length(nlp(string))
            avg_sentence_length.append(result)
            print("avg sentence length:", result)
            ## Number of sentences
            result = get_number_of_sentences(nlp(string))
            number_sentences.append(result)
            print("#sentences:", result)
            ## Number of words
            result = get_token_length(nlp(string))
            number_words.append(result)
            print("#Words", result)
    # return the collected results so they can be unpacked below
    return content, avg_sentence_length, number_sentences, number_words

content, avg_sentence_length, number_sentences, number_words = text_analyzer(data["posts"])
The short answer is no: neither NLTK nor spaCy will automatically determine the language and apply the appropriate algorithms to a text.
SpaCy has separate language models with their own methods, part-of-speech and dependency tagsets. It also has a set of stopwords for each available language.
NLTK is more modular; for stemming there is RSLPStemmer (Portuguese), ISRIStemmer (Arabic), and SnowballStemmer (Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish).
Once you determine the language of a post through spacy_langdetect, the next thing you should do is explicitly instruct your code to use the appropriate spaCy language model or NLTK module.
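For example, a minimal sketch of that routing step, assuming the standard small English and Spanish models (en_core_web_sm and es_core_news_sm) are installed:
import spacy

nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

def nlp_for(lang_code):
    # pick the pipeline that matches the detected language, defaulting to English
    return nlp_es if lang_code == "es" else nlp_en

# lang_code would come from whatever detection step you use (spacy_langdetect, etc.)
doc = nlp_for("es")("Esta es una publicación en español.")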
Use the googletrans library for this:
#!/usr/bin/python
from googletrans import Translator
translator = Translator()
translator.detect('이 문장은 한글로 쓰여졌습니다.')
This returns:
<Detected lang=ko confidence=0.27041003>
So this is the best way to do it if you have an internet connection, and in most cases it works better than spaCy, as Google Translate is more mature and has better algorithms. ;)
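As a follow-up, the detected code can then drive the choice of stopword list and stemmer from the question's own setup; a hypothetical glue snippet, where post_text is one social media post:
lang = translator.detect(post_text).lang  # e.g. "en" or "es"
if lang == "es":
    stops = stop_words_es
    stemmer = SnowballStemmer("spanish")
else:
    stops = stop_words_eng
    stemmer = SnowballStemmer("english")
tokens = [stemmer.stem(w) for w in post_text.split() if w.lower() not in stops]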
I am trying to open a file with the extension .csv in Python, but it keeps saying that the file is not found. I am copying the path from the sidebar, so I don't believe that's the problem.
I have tried inserting / and ./ before the path of the file,
and r in front of the file name.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
bkgrnd = pd.read_csv('/Desktop/Sro/Natrium22.csv')
No matter what I've tried, it keeps saying FileNotFoundError
You can use the csv module if the file will always be a .csv:
import csv
with open(r'C:\Users\user\Desktop\Sro\Natrium22.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        print(row)           # process each row here
        line_count += 1
Specifically on Windows, the pathname may need normalization; maybe that's the issue.
Try the following; it should work:
import os
import pandas as pd
cwd = os.getcwd()
filePath = 'C:/Users/user/Desktop/Sro/Natrium22.csv'
data = pd.read_csv(os.path.normcase(os.path.join(cwd, filePath)))
print(data)
You can even try:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
bkgrnd = pd.read_csv(r'C:\Users\user\Desktop\Sro\Natrium22.csv')
print(bkgrnd)
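If the error persists, it can also help to confirm that the path actually exists before reading it; a quick check using the path from the answers above:
import os
path = r'C:\Users\user\Desktop\Sro\Natrium22.csv'
print(os.path.exists(path))  # False means the folder or filename is wrong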
I'm fairly new to using IPython, so I still get confused quite easily. Here is my code so far. After loading, I have to display only the first 5 rows of the file.
# Import useful packages for data science
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load concerts.csv
path1 = 'C:\\Users\\Cathal\\Documents\\concerts.csv'
concerts = pd.read_csv(path1)
Thanks in advance for any help.
Try:
concerts = pd.read_csv(path1, encoding = 'utf8')
If that doesn't work, try:
concerts = pd.read_csv(path1, encoding = "ISO-8859-1")