I have followed a few solutions found on Stack Overflow and GIS Stack Exchange, but cannot seem to get past this error message. I don't have much experience working with GeoPandas, so any help would be greatly appreciated. My script is below:
# Key Module Import(s):
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import geopandas as gp
import rioxarray
from shapely.geometry import mapping
# Primary Workspace
cID = xr.open_dataset('E:\PSU\CCAR REU\Python Codex\MASKS\countrymask_0.1x0.1.nc',) # NetCDF
cID[['cell_area', 'CountryID']].transpose('lat', 'lon', 'time')
cID.rio.set_spatial_dims(x_dim = "lon", y_dim = "lat", inplace = True)
cID.rio.write_crs("EPSG:4326", inplace = True)
# Location of Each Shapefile
BorNA_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\BorNA\BorNA.shp')
TemNA_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\TemNA\TemNA.shp')
TroSA_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\TroSA\TroSA.shp')
TemSA_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\TemSA\TemSA.shp')
Eur_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\Eur\Eur.shp')
BorEuras_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\BorEuras\BorEuras.shp')
TemEuras_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\TemEuras\TemEuras.shp')
TroAs_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\TroAs\TroAs.shp')
Aus_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\Aus\Aus.shp')
NAfr_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\\NAfr\\NAfr.shp')
SAfr_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\SAfr\SAfr.shp')
OTH_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\OTH\OTH.shp')
# Clipping Procedure Using RioXarray
C_BorNA = cID.rio.clip(BorNA_sp.geometry.apply(mapping), BorNA_sp.crs)
C_TemNA = cID.rio.clip(TemNA_sp.geometry.apply(mapping), TemNA_sp.crs)
C_TroSA = cID.rio.clip(TroSA_sp.geometry.apply(mapping), TroSA_sp.crs)
C_TemSA = cID.rio.clip(TemSA_sp.geometry.apply(mapping), TemSA_sp.crs)
C_Eur = cID.rio.clip(Eur_sp.geometry.apply(mapping), Eur_sp.crs)
C_BorEuras = cID.rio.clip(BorEuras_sp.geometry.apply(mapping), BorEuras_sp.crs)
C_TemEuras = cID.rio.clip(TemEuras_sp.geometry.apply(mapping), TemEuras_sp.crs)
C_TroAs = cID.rio.clip(TroAs_sp.geometry.apply(mapping), TroAs_sp.crs)
C_Aus = cID.rio.clip(Aus_sp.geometry.apply(mapping), Aus_sp.crs)
C_NAfr = cID.rio.clip(NAfr_sp.geometry.apply(mapping), NAfr_sp.crs)
C_SAfr = cID.rio.clip(SAfr_sp.geometry.apply(mapping), SAfr_sp.crs)
C_OTH = cID.rio.clip(OTH_sp.geometry.apply(mapping), OTH_sp.crs)
# Save Clipped Output(s)
NC_BorNA = C_BorNA.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\BorNA.nc')
NC_TemNA = C_TemNA.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\TemNA.nc')
NC_TroSA = C_TroSA.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\TroSA.nc')
NC_TemSA = C_TemSA.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\TemSA.nc')
NC_Eur = C_Eur.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\Eur.nc')
NC_BorEuras = C_BorEuras.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\BorEuras.nc')
NC_TemEuras = C_TemEuras.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\TemEuras.nc')
NC_TroAs = C_TroAs.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\TroAs.nc')
NC_Aus = C_Aus.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\Aus.nc')
NC_NAfr = C_NAfr.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\\NAfr.nc')
NC_SAfr = C_SAfr.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\SAfr.nc')
NC_OTH = C_OTH.to_netcdf(path = 'E:\PSU\CCAR REU\Python Codex\MASKS\Outputs\OTH.nc')
Error received (raised at line 28 of my script, i.e. the BorNA_sp read_file call): ValueError: Assigning CRS to a GeoDataFrame without a geometry column is not supported. Supply geometry using the 'geometry=' keyword argument, or by providing a DataFrame with column name 'geometry'
Thank you all in advance for the assistance, and apologies if this reads as a repeat question.
**FULL TRACEBACK:**
File "C:\Users\Jordan\anaconda3\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "e:\psu\ccar reu\python codex\masking scratch space.py", line 28, in <module>
BorNA_sp = gp.read_file('E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\BorNA\BorNA.shp')
File "C:\Users\Jordan\anaconda3\lib\site-packages\geopandas\io\file.py", line 253, in _read_file
return _read_file_fiona(
File "C:\Users\Jordan\anaconda3\lib\site-packages\geopandas\io\file.py", line 340, in _read_file_fiona
df = GeoDataFrame.from_features(
File "C:\Users\Jordan\anaconda3\lib\site-packages\geopandas\geodataframe.py", line 656, in from_features
return cls(rows, columns=columns, crs=crs)
File "C:\Users\Jordan\anaconda3\lib\site-packages\geopandas\geodataframe.py", line 192, in __init__
raise ValueError(
ValueError: Assigning CRS to a GeoDataFrame without a geometry column is not supported. Supply geometry using the 'geometry=' keyword argument, or by providing a DataFrame with column name 'geometry'```
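For reference, here is a minimal diagnostic sketch (an assumption on my part, not a confirmed fix): it opens the first shapefile directly with Fiona, the reader GeoPandas uses in the traceback above, to check whether the layer actually declares a geometry, which is what the ValueError complains about. The raw-string path just avoids backslash-escape surprises like the \\N I already had to double up.
import fiona

# Minimal diagnostic sketch: inspect the first shapefile the script reads.
shp_path = r'E:\PSU\CCAR REU\Python Codex\MASKS\SP_Files\BorNA\BorNA.shp'
with fiona.open(shp_path) as src:
    print(src.schema)   # expect a 'geometry' entry such as 'Polygon'
    print(src.crs)      # the CRS that GeoPandas then tries to assign
    print(len(src))     # number of features in the layer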
Related
import csv
import matplotlib.pyplot as plt

f = open('age.csv', encoding='cp949')
data = csv.reader(f)
next(data)
m = []
f = []
name = input('찾고 싶은 지역의 이름을 알려주세요')  # "Please tell me the name of the region you want to look up"
for row in data:
    for i in row[3:104]:
        m.append(-int(i))
    for i in row[106:]:
        f.append(int(i))
plt.style.use('ggplot')
plt.figure(figsize=(10, 5), dpi=300)
plt.rc('font', family='Malgun Gothic')
plt.rcParams['axes.unicode_minus'] = False
plt.title('men and women live in some region')
plt.barh(range(101), m, label='남성')  # male
plt.barh(range(101), f, label='여성')  # female
plt.legend()
plt.show()
The error I get is:
invalid literal for int() with base 10: '311,415'
How should I fix this error? Please let me know what you think.
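For what it is worth, the message itself points at the cause: some cells hold numbers with thousands separators, such as '311,415', and int() cannot parse those directly. A minimal sketch of one workaround (assuming commas are the only non-digit characters in those cells) is to strip them before converting:
# Hypothetical helper: strip thousands separators before converting to int.
def to_int(cell):
    return int(cell.replace(',', ''))

print(to_int('311,415'))  # 311415

# Inside the loops, the appends would then become, for example:
#   m.append(-to_int(i))
#   f.append(to_int(i))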
I am trying to scrape a website that has multiple pages rendered by JavaScript, using BeautifulSoup and Selenium. I have a script that works, but only for the first page of the site. Is it possible to scrape multiple JavaScript-rendered pages, or do I need to scrape them individually? Here is my script:
import time
from bs4 import BeautifulSoup as soup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/rawlins/Downloads/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
#chrome_options.add_argument('--window-size=1920x1080')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path,
chrome_options = chrome_options)
# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
browser.get(url)
# to ensure that the page has loaded completely.
time.sleep(3)
data = []
n = 2
for i in range(1, n+1):
    if (i == 1):
        # handle first page
        response = requests.get(url)
    response = requests.get(url + "&page=" + str(i))
    #response = requests.get(url + "&page=" + str(i),headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    # Parse HTML, close browser
    page_soup = soup(browser.page_source, 'lxml')
    containers = page_soup.findAll("tr")
    browser.quit()
    for container in containers:
        item = {}
        item['type'] = "Course Material"
        if container.find('td', {'class' : 'title'}):
            item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
        else:
            item['title'] = ""
        if container.find('td', {'class' : 'authors'}):
            item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
        else:
            item['author'] = ""
        if container.find('td', {'class' : 'title'}):
            item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
        else:
            item['link'] = ""
        if container.find('td', {'class' : 'title'}):
            item['description'] = container.find('td', {'class' : 'title'}).span.text
        else:
            item['description'] = ""
        item['subject'] = "Arts"
        item['source'] = "OpenStax CNX"
        item['base_url'] = "https://cnx.org/browse"
        item['license'] = "Attribution"
        data.append(item) # add the item to the list
    with open("js-webscrape.json", "w") as writeJSON:
        json.dump(data, writeJSON, ensure_ascii=False)
Thanks in advance.
Couple of issues here:
You're mixing requests.get() in with browser.get(). No need for the requests module at all here since you're getting the page via the headless browser.
No need to have a special case for the first page. https://cnx.org/search?q=subject:Arts&page=1 works fine.
time.sleep() should be between browser.get() and the parsing, to allow the page to fully load before feeding it to BeautifulSoup.
You should write data to the JSON file outside the for loop, once all pages have been scraped.
Quit the browser outside the for loop as well, not after a single iteration.
To avoid encoding errors, specify the encoding when writing to the JSON file: with open("js-webscrape.json", "w", encoding="utf-8")
Here's a working implementation that scrapes all 7 pages:
import time
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/Gebruiker/Downloads/chromedriver_win32/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path, options = chrome_options)
# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
data = []
n = 7
for i in range(1, n+1):
    response = browser.get(url + "&page=" + str(i))
    time.sleep(5)
    # Parse HTML
    page_soup = soup(browser.page_source, 'lxml')
    containers = page_soup.findAll("tr")
    for container in containers:
        item = dict()
        item['type'] = "Course Material"
        if container.find('td', {'class' : 'title'}):
            item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
        else:
            item['title'] = ""
        if container.find('td', {'class' : 'authors'}):
            item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
        else:
            item['author'] = ""
        if container.find('td', {'class' : 'title'}):
            item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
        else:
            item['link'] = ""
        if container.find('td', {'class' : 'title'}):
            item['description'] = container.find('td', {'class' : 'title'}).span.text
        else:
            item['description'] = ""
        item['subject'] = "Arts"
        item['source'] = "OpenStax CNX"
        item['base_url'] = "https://cnx.org/browse"
        item['license'] = "Attribution"
        data.append(item) # add the item to the list
# write data to file and quit browser when done
print(data)
with open("js-webscrape.json", "w", encoding="utf-8") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)
browser.quit()
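As a side note, the fixed time.sleep(5) can be replaced with an explicit wait so each page is parsed as soon as its rows are present. This is a sketch using Selenium's WebDriverWait; the "tr" locator is simply the same table-row tag the scraper already parses:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for at least one result row to appear,
# then hand the rendered page source to BeautifulSoup as before.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, "tr")))
page_soup = soup(browser.page_source, 'lxml')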
I'm working on facial expression recognition with Keras. I've collected several datasets, applied data augmentation to the images, and now have about 500,000 images saved as pixel values in a .csv file (same format as fer2013.csv).
This is the code I'm using:
def Zerocenter_ZCA_whitening_Global_Contrast_Normalize(list):
    Intonumpyarray = numpy.asarray(list)
    data = Intonumpyarray.reshape(img_width, img_height)
    data2 = ZeroCenter(data)
    data3 = zca_whitening(flatten_matrix(data2)).reshape(img_width, img_height)
    data4 = global_contrast_normalize(data3)
    data5 = numpy.rot90(data4, 3)
    return data5

def load_data():
    train_x = []
    train_y = []
    val_x = []
    val_y = []
    test_x = []
    test_y = []
    f = open('ALL.csv')
    csv_f = csv.reader(f)
    for row in csv_f:
        if str(row[2]) == "Training":
            temp_list_train = []
            for pixel in row[1].split():
                temp_list_train.append(int(pixel))
            data = Zerocenter_ZCA_whitening_Global_Contrast_Normalize(temp_list_train)
            train_y.append(int(row[0]))
            train_x.append(data.reshape(data_resh).tolist())
        elif str(row[2]) == "PublicTest":
            temp_list_validation = []
            for pixel in row[1].split():
                temp_list_validation.append(int(pixel))
            data = Zerocenter_ZCA_whitening_Global_Contrast_Normalize(temp_list_validation)
            val_y.append(int(row[0]))
            val_x.append(data.reshape(data_resh).tolist())
        elif str(row[2]) == "PrivateTest":
            temp_list_test = []
            for pixel in row[1].split():
                temp_list_test.append(int(pixel))
            data = Zerocenter_ZCA_whitening_Global_Contrast_Normalize(temp_list_test)
            test_y.append(int(row[0]))
            test_x.append(data.reshape(data_resh).tolist())
    return train_x, train_y, val_x, val_y, test_x, test_y
Then I load the data and feed it to the generator:
Train_x, Train_y, Val_x, Val_y, Test_x, Test_y = load_data()
Train_x = numpy.asarray(Train_x)
Train_x = Train_x.reshape(Train_x.shape[0],img_rows,img_cols)
Test_x = numpy.asarray(Test_x)
Test_x = Test_x.reshape(Test_x.shape[0],img_rows,img_cols)
Val_x = numpy.asarray(Val_x)
Val_x = Val_x.reshape(Val_x.shape[0],img_rows,img_cols)
Train_x = Train_x.reshape(Train_x.shape[0], img_rows, img_cols, 1)
Test_x = Test_x.reshape(Test_x.shape[0], img_rows, img_cols, 1)
Val_x = Val_x.reshape(Val_x.shape[0], img_rows, img_cols, 1)
Train_x = Train_x.astype('float32')
Test_x = Test_x.astype('float32')
Val_x = Val_x.astype('float32')
Train_y = np_utils.to_categorical(Train_y, nb_classes)
Test_y = np_utils.to_categorical(Test_y, nb_classes)
Val_y = np_utils.to_categorical(Val_y, nb_classes)
datagen = ImageDataGenerator(
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    shear_range=0.03,
    zoom_range=0.03,
    vertical_flip=False)

datagen.fit(Train_x)

model.fit_generator(datagen.flow(Train_x, Train_y,
                                 batch_size=batch_size),
                    samples_per_epoch=Train_x.shape[0],
                    nb_epoch=nb_epoch,
                    validation_data=(Val_x, Val_y))
When I run the code, RAM usage grows and grows until the PC freezes (I have 16 GB). It gets stuck when load_data() is called. Is there a solution to this problem that fits my code?
Seems to be a duplicate of this question. Basically, you'll have to use fit_generator() instead of fit() and pass in a function that loads the data into your model one batch at a time instead of all at once.
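To make that concrete, below is a minimal sketch of a batch generator that reads ALL.csv lazily instead of building the full train_x list in memory. It assumes the fer2013-style column layout from the question (label, space-separated pixels, usage) and 48x48 images, and it leaves out the custom ZCA/contrast preprocessing for brevity:
import csv
import numpy as np
from keras.utils import np_utils

def csv_batch_generator(path, usage, batch_size, nb_classes,
                        img_rows=48, img_cols=48):
    """Yield (x, y) batches read on the fly from the fer2013-style CSV."""
    while True:  # Keras generators are expected to loop forever
        with open(path) as f:
            reader = csv.reader(f)
            batch_x, batch_y = [], []
            for row in reader:
                if row[2] != usage:  # keep only Training / PublicTest / PrivateTest rows
                    continue
                pixels = np.asarray(row[1].split(), dtype='float32')
                batch_x.append(pixels.reshape(img_rows, img_cols, 1))
                batch_y.append(int(row[0]))
                if len(batch_x) == batch_size:
                    yield (np.asarray(batch_x),
                           np_utils.to_categorical(batch_y, nb_classes))
                    batch_x, batch_y = [], []

# Hypothetical usage with the old Keras 1 API from the question
# (n_train_samples is whatever your Training row count is):
# model.fit_generator(csv_batch_generator('ALL.csv', 'Training', batch_size, nb_classes),
#                     samples_per_epoch=n_train_samples, nb_epoch=nb_epoch)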
I am having trouble obtaining the href links for the next pages of the URL. I can already get all of the text and other content the tag holds, but I can't wrap my head around stripping out the text I don't need so that I keep only the href and can navigate through the pages.
Here is my code:
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
    pages = prettify.find_all('div', {'class':'pagination'})
    for next_page in pages:
        next_page.find_all('a')
        nextpages.append(next_page)
        print(next_page)

all_next_pages()
Here is a way to get the links of the search result items: find the 'row result' class, then find the a tag inside it; it contains all the information you need.
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.text
prettify = BeautifulSoup(rcontent, "lxml")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
    pages = prettify.find_all('div', {'class':' row result'})
    for next_page in pages:
        info = next_page.find('a')
        url = info.get('href')
        title = info.get('title')
        print(title, url)

all_next_pages()
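Since the original question was specifically about the next-page links, here is a small sketch that pulls the href out of every anchor inside the pagination block; it assumes the page still uses the div with class 'pagination' that the question's own find_all call targets:
# Sketch: collect the href of every anchor in the pagination block.
page_links = []
for a in prettify.select('div.pagination a'):
    href = a.get('href')
    if href:
        page_links.append(base_url.rstrip('/') + href)
print(page_links)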
I have been writing a simple plugin to compile and run code for a HackerRank problem from my own system.
For example, I need to test code for this problem: https://www.hackerrank.com/challenges/solve-me-first
So I ran my script like this:
python hackerrank.py https://www.hackerrank.com/challenges/solve-me-first solve-me-first.cpp
I get the following output:
Traceback (most recent call last):
  File "hackerrank.py", line 126, in <module>
    h.run()
  File "hackerrank.py", line 113, in run
    if self.compile_and_test() == "NOT_FOUND":
  File "hackerrank.py", line 51, in compile_and_test
    j = self.r.json()
  File "/usr/lib/python2.7/dist-packages/requests/models.py", line 741, in json
    return json.loads(self.text, **kwargs)
  File "/usr/lib/python2.7/dist-packages/simplejson/__init__.py", line 488, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python2.7/dist-packages/simplejson/decoder.py", line 370, in decode
    obj, end = self.raw_decode(s)
  File "/usr/lib/python2.7/dist-packages/simplejson/decoder.py", line 389, in raw_decode
    return self.scan_once(s, idx=_w(s, idx).end())
simplejson.scanner.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Here is my hackerrank.py file:
import requests
import time
import sys
import os.path
class HackerRank:
    def __init__(self,url,code="",ext=".py"):
        self.code = code
        self.ext = ext
        self.set_language()
        self.problem_url = url
        self.s = requests.session()
        self.set_post_url()

    def set_post_url(self):
        #rootUrl = "https://www.hackerrank.com/"
        #l = self.problem_url.split("/")
        #print l
        #if l[3] == "challenges":
        #    contestUrl = "/contests/master/challenges/" + l[4]
        #else:
        #    # it's a contest
        #    contestUrl = "/".join(l[3:])
        #self.post_url = rootUrl + "/rest/" + contestUrl + "/compile_tests/"
        self.post_url = self.problem_url

    def set_language(self):
        if self.ext == ".py":
            self.language = "python"
        if self.ext == ".c":
            self.language = "c"
        if self.ext == ".cpp":
            self.language = "cpp"
        if self.ext == ".java":
            self.language = "java"
        # TODO : find out the language value for other exts

    def set_code(self,code):
        self.code = code

    def generate_payload(self):
        self.payload = {'code' : self.code, 'language' : self.language}

    def compile_and_test(self):
        self.generate_payload()
        self.r = self.s.post(self.post_url, params=self.payload)
        if self.r.status_code == 404:
            print "not found 1"
            return "NOT_FOUND"
        print "yes"
        j = self.r.json()
        print j
        self.submission_id = j['model']['id']
        self.get_url = self.post_url + "/submissions/code/" + str(self.submission_id)
        self.rr = self.s.get(self.get_url, cookies = self.s.cookies)
        return self.rr

    def fetch(self,last_status):
        if self.r.status_code == 404:
            return
        self.rr = self.s.get(self.get_url, cookies = self.s.cookies)
        self.res = self.rr.json()
        if self.res['model']['status'] == 0:
            new_status = self.res['model']['status_string']
            if new_status != last_status:
                print(new_status)
            time.sleep(1)
            self.fetch(new_status)
        else:
            return

    def compiler_message(self):
        return self.res['model']['compilemessage']

    def testcase_message(self):
        return self.res['model']['testcase_message']

    def expected_output(self):
        return self.res['model']['expected_output']

    def stdin(self):
        return self.res['model']['stdin']

    def stdout(self):
        return self.res['model']['stdout']

    def dump(self):
        cm = self.compiler_message()
        tm = self.testcase_message()
        eo = self.expected_output()
        stdin = self.stdin()
        stdout = self.stdout()
        s = ""
        for i in range(len(tm)):
            s += (cm + "\n\n")
            s += ("Testcase# " + str(i) + "\n")
            s += ("Sample Input:\n\n")
            s += (stdin[i])
            s += ("\n\n")
            s += ("Your Output:\n\n")
            s += (stdout[i])
            s += ("\n\n")
            s += ("Expected Output:\n\n")
            s += (eo[i])
            s += ("\n\n")
            s += ("Compiler Message:\n\n")
            s += (tm[i])
            s += ("\n\n")
        print(s)
        return s

    def run(self):
        if self.compile_and_test() == "NOT_FOUND":
            print "not found"
            return "404 : NOT_FOUND"
        self.fetch("")
        return self.dump()

if __name__=="__main__":
    url = sys.argv[1]
    codefile = sys.argv[2]
    print url, codefile
    ext = os.path.splitext(codefile)[1]
    code = open(codefile).read()
    h = HackerRank(url,code,ext)
    h.run()
I am a newbie to JSON and plugins. Can you help me out?
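One way to narrow down the JSONDecodeError is to look at what the POST actually returns before calling .json() on it; if the body is HTML, the URL being posted to is a regular page rather than a JSON endpoint. This is a standalone sketch using only the requests API (the inline code string is just a placeholder):
import requests

s = requests.session()
# Same URL and payload fields the script uses; the code string is a dummy value.
r = s.post("https://www.hackerrank.com/challenges/solve-me-first",
           params={'code': 'int main() { return 0; }', 'language': 'cpp'})
print(r.status_code)
print(r.headers.get('Content-Type'))  # .json() only makes sense for a JSON content type
print(r.text[:200])                   # an HTML fragment here explains the JSONDecodeError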