Webscraping Multiple JS pages at once

Webscraping Multiple JS pages at once - json

I am trying to webscrape a website that has multiple pages that are rendered by Javascript. I am using BeautifulSoup and Selenium. I have a script that works but only for the first page of the website. Is it possible to webscrape multiple javascript rendered pages or do I need to do them individually? Here is my script:
import time
from bs4 import BeautifulSoup as soup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/rawlins/Downloads/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
#chrome_options.add_argument('--window-size=1920x1080')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path,
chrome_options = chrome_options)
# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
browser.get(url)
# to ensure that the page has loaded completely.
time.sleep(3)
data = []
n = 2
for i in range(1, n+1):
if (i == 1):
# handle first page
response = requests.get(url)
response = requests.get(url + "&page=" + str(i))
#response = requests.get(url + "&page=" + str(i),headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
# Parse HTML, close browser
page_soup = soup(browser.page_source, 'lxml')
containers = page_soup.findAll("tr")
browser.quit()
for container in containers:
item = {}
item['type'] = "Course Material"
if container.find('td', {'class' : 'title'}):
item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
else:
item['title'] = ""
if container.find('td', {'class' : 'authors'}):
item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
else:
item['author'] = ""
if container.find('td', {'class' : 'title'}):
item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
else:
item['link'] = ""
if container.find('td', {'class' : 'title'}):
item['description'] = container.find('td', {'class' : 'title'}).span.text
else:
item['description'] = ""
item['subject'] = "Arts"
item['source'] = "OpenStax CNX"
item['base_url'] = "https://cnx.org/browse"
item['license'] = "Attribution"
data.append(item) # add the item to the list
with open("js-webscrape.json", "w") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
Thanks in advance.

Couple of issues here:
You're mixing requests.get() in with browser.get(). No need for the requests module at all here since you're getting the page via the headless browser.
No need to have a special case for the first page. https://cnx.org/search?q=subject:Arts&page=1 works fine.
time.sleep() should be between browser.get() and the parsing, to allow the page to fully load before feeding it to BeautifulSoup.
You should write data to the JSON file outside the for loop, once all pages have been scraped.
Quit the browser outside the for loop as well, not after a single iteration.
To avoid encoding errors, specify the encoding when writing to the JSON file: with open("js-webscrape.json", "w", encoding="utf-8")
Here's a working implementation that scrapes all 7 pages:
import time
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json
# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/Gebruiker/Downloads/chromedriver_win32/chromedriver'
# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path, options = chrome_options)
# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
data = []
n = 7
for i in range(1, n+1):
response = browser.get(url + "&page=" + str(i))
time.sleep(5)
# Parse HTML
page_soup = soup(browser.page_source,'lxml')
containers = page_soup.findAll("tr")
for container in containers:
item = dict()
item['type'] = "Course Material"
if container.find('td', {'class' : 'title'}):
item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
else:
item['title'] = ""
if container.find('td', {'class' : 'authors'}):
item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
else:
item['author'] = ""
if container.find('td', {'class' : 'title'}):
item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
else:
item['link'] = ""
if container.find('td', {'class' : 'title'}):
item['description'] = container.find('td', {'class' : 'title'}).span.text
else:
item['description'] = ""
item['subject'] = "Arts"
item['source'] = "OpenStax CNX"
item['base_url'] = "https://cnx.org/browse"
item['license'] = "Attribution"
data.append(item) # add the item to the list
# write data to file and quit browser when done
print(data)
with open("js-webscrape.json", "w", encoding="utf-8") as writeJSON:
json.dump(data, writeJSON, ensure_ascii=False)
browser.quit()

Related

Selenium MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError

class gmarket_sales():
def __init__(self):
chrome_driver = Service(ChromeDriverManager().install())
options = Options()
options.add_experimental_option('detach',True)
options.add_experimental_option('excludeSwitches',['enable-logging'])
# options.add_argument('--headless')
# options.add_argument('--window-size = x, y')
# options.add_argument('--start-maximazed')
# options.add_argument('--start-fullscreen')
# options.add_argument('--mute-audio')
self.driver = webdriver.Chrome(options=options,service=chrome_driver)
self.now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S (%a)')
self.hour = datetime.datetime.now().strftime('%H시_%M_분')
self.today = date.today()
self.folder = None
self.today_file = None
self.kakao_talk = kakao()
def connect(self):
url = 'http://minishop.gmarket.co.kr/meritblog'
# url = 'http://minishop.gmarket.co.kr/hanvitis'
self.driver.get(url)
return url
def shopping_mall(self):
mall_name = self.driver.find_element(By.CSS_SELECTOR,'a.shop_title_ui_txt').text
self.folder = f'./메리트몰_데이터베이스/지마켓'
self.today_file = f'{self.today}_{mall_name}_지마켓.json'
return mall_name
def soup(self,url_param):
try:
response = requests.get(url_param)
if response.status_code == 200:
sp = BeautifulSoup(response.text, 'html.parser')
return sp
except requests.packages.urllib3.exceptions.MaxRetryError as e:
print(str(e))
def total_product(self):
total_items = 0
products = self.driver.find_element(By.ID,'ulCategory').find_elements(By.CSS_SELECTOR,'span.data_num')
for product in products:
items = int(product.text.replace('(',"").replace(')',""))
total_items += items
# 391개
return total_items
def paging(self,total_items,url):
page_list = []
# 전체상품보기 클릭
self.driver.execute_script('arguments[0].click();',self.driver.find_element(By.CSS_SELECTOR,'.allList_view > a'))
time.sleep(2)
# 한 페이지의 상품 수
view_limit = int(self.driver.find_element(By.CSS_SELECTOR,'div.limit').text.replace("개씩",""))
# 페이지 수 구하기
if total_items % view_limit == 0:
page = total_items // view_limit
else:
page = total_items // view_limit + 1
# 페이지 리스트
for cnt in range(page):
page_url = f'{url}/List?CategoryType=General&SortType=FocusRank&DisplayType=SmallImage&Page={cnt+1}&PageSize=60'
page_list.append(page_url)
# self.driver.quit()
return page_list
def data_one(self,page_list):
"""상품 url 리스트
정상가/할인가/할인율 딕셔너리"""
url_list = []
price_dic = {}
for page in page_list:
html = self.soup(page)
for items in html.find('ul',class_='type2').find_all('li'):
# url
item_url = items.find('a')['href']
# 상품코드
item_code = item_url[-10:]
# 가격 및 할인율
if items.find('p',class_='prd_price').find('span',class_='del_important'):
original_price = items.find('p',class_='prd_price').find('span',class_='del_important').text.replace("원","")
discount_price = items.find('p',class_='prd_price').find('strong').text.replace("원","")
sale_rate = items.find('p',class_='prd_price').find('span',class_='splt_ico usr_clr').text
else:
original_price = items.find('p',class_='prd_price').find('strong').text.replace("원","")
discount_price = "없음"
sale_rate = "없음"
url_list.append(item_url)
price_dic[item_code]={"정상가":original_price,"할인가":discount_price,"할인율":sale_rate}
time.sleep(randint(1,10))
self.driver.quit()
return url_list , price_dic
def check_start(self):
url = self.connect()
mall_name = self.shopping_mall()
total_items = self.total_product()
page_list = self.paging(total_items,url)
url_list,price_dic = self.data_one(page_list)
if __name__ == "__main__":
g_market = gmarket_sales()
# g_market.check_start()
schedule.every().hour.at(":20").do(g_market.check_start)
while True:
schedule.run_pending()
time.sleep(1)
Hello, I am a student practicing web page crawling.
I'm making a code that scrapes data by parsing a website with selenium.
I wrote the program so that it runs at regular intervals using the final schedule module.
However, if chrome_driver is initialized in the init of the class and the driver.quit() command is put in the execution process, the MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError code is displayed when the second code is executed..
Below is the code I wrote.
I would really appreciate it if you could point out any problems.

Django webscraping JSONDecodeError

I'm trying to scrape data and it works fine if the {fplid} for url is like 30 for example. How do I fix this method, so it gets the user input and gets the data from the url without a decode error. This is the traceback
'''
C:\Users\krish\OneDrive\Desktop\FPLHangout\scrape\views.py, line 31, in home
data = get_html_content(fplid) …
Local vars
C:\Users\krish\OneDrive\Desktop\FPLHangout\scrape\views.py, line 9, in get_html_content
managerdata = json.loads(r.text)
def get_html_content(fplid):
url = 'https://fantasy.premierleague.com/api/entry/{fplid}/event/30/picks/'
r = requests.get(url)
managerdata = json.loads(r.text)
bootstrap = 'https://fantasy.premierleague.com/api/bootstrap-static/'
bootstrapdata = requests.get(bootstrap)
bootstrapjson = json.loads(bootstrapdata.text)
for pick in managerdata['picks']:
pick = (pick['element']) #correct id
location = 0
for player in bootstrapjson['elements']:
if player.get('id') == pick:
break
location += 1
#position = (pick['position'])
firstname = bootstrapjson['elements'][location]['first_name']
secondname = bootstrapjson['elements'][location]['second_name']
return firstname + " " + secondname
def home(request):
if 'fplid' in request.GET: #
fplid = request.GET.get('fplid')
data = get_html_content(fplid)
return render(request, 'scrape/home.html', {'fpldata': data})
return render(request, 'scrape/home.html')

PyTorch does not make initial weights random

I created a Neural Network that takes two greyscale images 14x14 pixels portraying a digit (from MNIST database) and returns 1 if the first digit is less or equal to the second digit, returns 0 otherwise. The code runs, but every time the initial weights are the same. They should be random
Forcing the initial weights to be random, by using the following line of code in the Net class, does not help.
torch.nn.init.normal_(self.layer1.weight, mean=0.0, std=0.01)
Here is the code of the "main.py" file:
import os; os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import torch
import torch.nn as nn
from dlc_practical_prologue import *
class Net(nn.Module):
def __init__(self):
super().__init__()
self.layer1 = nn.Linear(2*14*14, 32)
#torch.nn.init.normal_(self.layer1.weight, mean=0.0, std=0.01)
#self.layer2 = nn.Linear(100, 100)
#self.layer3 = nn.Linear(100, 100)
self.layer2 = nn.Linear(32, 1)
def forward(self, x):
x = torch.relu(self.layer1(x))
#x = torch.relu(self.layer2(x))
#x = torch.relu(self.layer3(x))
x = torch.sigmoid(self.layer2(x))
return x
if __name__ == '__main__':
# Data initialization
N = 1000
train_input, train_target, train_classes, _, _, _, = generate_pair_sets(N)
_, _, _, test_input, test_target, test_classes = generate_pair_sets(N)
train_input = train_input.view(-1, 2*14*14)
test_input = test_input.view(-1, 2*14*14)
train_target = train_target.view(-1, 1)
test_target = test_target.view(-1, 1)
# I convert the type to torch.float32
train_input, train_target, train_classes, test_input, test_target, test_classes = \
train_input.type(torch.float32), train_target.type(torch.float32), train_classes.type(torch.long), \
test_input.type(torch.float32), test_target.type(torch.float32), test_classes.type(torch.long)
# Create the neural network
net = Net()
# Training
learning_rate = 0.01
# Use MSELoss
loss = nn.MSELoss()
# Use Adam optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
EPOCHS = 50
for param in net.parameters():
print(param)
for epoch in range(EPOCHS):
target_predicted = net(train_input)
l = loss(train_target, target_predicted) #loss = nn.MSELoss()
#l = loss(target_predicted, train_target)
l.backward()
optimizer.step()
optimizer.zero_grad()
#print(l)
# Testing
total = 1000
correct = 0
with torch.no_grad():
correct = ( test_target == net(test_input).round() ).sum()
print("Accuracy %.2f%%" % (correct / total * 100))
Here is the code for "dlc_practical_monologue.py":
import os; os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
from torchvision import datasets
import argparse
import os
import urllib
######################################################################
parser = argparse.ArgumentParser(description='DLC prologue file for practical sessions.')
parser.add_argument('--full',
action='store_true', default=False,
help = 'Use the full set, can take ages (default False)')
parser.add_argument('--tiny',
action='store_true', default=False,
help = 'Use a very small set for quick checks (default False)')
parser.add_argument('--seed',
type = int, default = 0,
help = 'Random seed (default 0, < 0 is no seeding)')
parser.add_argument('--cifar',
action='store_true', default=False,
help = 'Use the CIFAR data-set and not MNIST (default False)')
parser.add_argument('--data_dir',
type = str, default = None,
help = 'Where are the PyTorch data located (default $PYTORCH_DATA_DIR or \'./data\')')
# Timur's fix
parser.add_argument('-f', '--file',
help = 'quick hack for jupyter')
args = parser.parse_args()
if args.seed >= 0:
torch.manual_seed(args.seed)
######################################################################
# The data
def convert_to_one_hot_labels(input, target):
tmp = input.new_zeros(target.size(0), target.max() + 1)
tmp.scatter_(1, target.view(-1, 1), 1.0)
return tmp
def load_data(cifar = None, one_hot_labels = False, normalize = False, flatten = True):
if args.data_dir is not None:
data_dir = args.data_dir
else:
data_dir = os.environ.get('PYTORCH_DATA_DIR')
if data_dir is None:
data_dir = './data'
if args.cifar or (cifar is not None and cifar):
print('* Using CIFAR')
cifar_train_set = datasets.CIFAR10(data_dir + '/cifar10/', train = True, download = True)
cifar_test_set = datasets.CIFAR10(data_dir + '/cifar10/', train = False, download = True)
train_input = torch.from_numpy(cifar_train_set.data)
train_input = train_input.transpose(3, 1).transpose(2, 3).float()
train_target = torch.tensor(cifar_train_set.targets, dtype = torch.int64)
test_input = torch.from_numpy(cifar_test_set.data).float()
test_input = test_input.transpose(3, 1).transpose(2, 3).float()
test_target = torch.tensor(cifar_test_set.targets, dtype = torch.int64)
else:
print('* Using MNIST')
######################################################################
# import torchvision
# raw_folder = data_dir + '/mnist/raw/'
# resources = [
# ("https://fleuret.org/dlc/data/train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
# ("https://fleuret.org/dlc/data/train-labels-idx1-ubyte.gz", "d53e105ee54ea40749a09fcbcd1e9432"),
# ("https://fleuret.org/dlc/data/t10k-images-idx3-ubyte.gz", "9fb629c4189551a2d022fa330f9573f3"),
# ("https://fleuret.org/dlc/data/t10k-labels-idx1-ubyte.gz", "ec29112dd5afa0611ce80d1b7f02629c")
# ]
# os.makedirs(raw_folder, exist_ok=True)
# # download files
# for url, md5 in resources:
# filename = url.rpartition('/')[2]
# torchvision.datasets.utils.download_and_extract_archive(url, download_root=raw_folder, filename=filename, md5=md5)
######################################################################
mnist_train_set = datasets.MNIST(data_dir + '/mnist/', train = True, download = True)
mnist_test_set = datasets.MNIST(data_dir + '/mnist/', train = False, download = True)
train_input = mnist_train_set.data.view(-1, 1, 28, 28).float()
train_target = mnist_train_set.targets
test_input = mnist_test_set.data.view(-1, 1, 28, 28).float()
test_target = mnist_test_set.targets
if flatten:
train_input = train_input.clone().reshape(train_input.size(0), -1)
test_input = test_input.clone().reshape(test_input.size(0), -1)
if args.full:
if args.tiny:
raise ValueError('Cannot have both --full and --tiny')
else:
if args.tiny:
print('** Reduce the data-set to the tiny setup')
train_input = train_input.narrow(0, 0, 500)
train_target = train_target.narrow(0, 0, 500)
test_input = test_input.narrow(0, 0, 100)
test_target = test_target.narrow(0, 0, 100)
else:
print('** Reduce the data-set (use --full for the full thing)')
train_input = train_input.narrow(0, 0, 1000)
train_target = train_target.narrow(0, 0, 1000)
test_input = test_input.narrow(0, 0, 1000)
test_target = test_target.narrow(0, 0, 1000)
print('** Use {:d} train and {:d} test samples'.format(train_input.size(0), test_input.size(0)))
if one_hot_labels:
train_target = convert_to_one_hot_labels(train_input, train_target)
test_target = convert_to_one_hot_labels(test_input, test_target)
if normalize:
mu, std = train_input.mean(), train_input.std()
train_input.sub_(mu).div_(std)
test_input.sub_(mu).div_(std)
return train_input, train_target, test_input, test_target
######################################################################
def mnist_to_pairs(nb, input, target):
input = torch.functional.F.avg_pool2d(input, kernel_size = 2)
a = torch.randperm(input.size(0))
a = a[:2 * nb].view(nb, 2)
input = torch.cat((input[a[:, 0]], input[a[:, 1]]), 1)
classes = target[a]
target = (classes[:, 0] <= classes[:, 1]).long()
return input, target, classes
######################################################################
def generate_pair_sets(nb):
if args.data_dir is not None:
data_dir = args.data_dir
else:
data_dir = os.environ.get('PYTORCH_DATA_DIR')
if data_dir is None:
data_dir = './data'
train_set = datasets.MNIST(data_dir + '/mnist/', train = True, download = True)
train_input = train_set.data.view(-1, 1, 28, 28).float()
train_target = train_set.targets
test_set = datasets.MNIST(data_dir + '/mnist/', train = False, download = True)
test_input = test_set.data.view(-1, 1, 28, 28).float()
test_target = test_set.targets
return mnist_to_pairs(nb, train_input, train_target) + \
mnist_to_pairs(nb, test_input, test_target)
######################################################################
Note that I have to add the following line of code to run the code on Windows 10, while it is not necessary to run it on Linux.
import os; os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
Also on Linux I always get the same initial weights.
Please, can you help me?

Correct me if I'm wrong here but only the weights of the first layer should be the same each time you run this. The thing is when you import the dlc_practical_monologue.py there's this thing in it:
if args.seed >= 0:
torch.manual_seed(args.seed)
which fires up if the seed is >=0 (default is 0).
This should only initialize the first layer with the same weights for each run. Check if this is the case.

The solution was to delete the following lines from "dlv_practical_prologue.py":
if args.seed >= 0:
torch.manual_seed(args.seed)

Keras handling large dataset which cannot fit into memory

I'm working on facial expression recognition, and I'm using Keras. I've collected many datasets, and then I have applied data augmentation on the images, I've got about 500 000 images saved (as pixels) on a .csv file (same format as fer2013.csv).
This is the code I'm using :
def Zerocenter_ZCA_whitening_Global_Contrast_Normalize(list):
Intonumpyarray = numpy.asarray(list)
data = Intonumpyarray.reshape(img_width,img_height)
data2 = ZeroCenter(data)
data3 = zca_whitening(flatten_matrix(data2)).reshape(img_width,img_height)
data4 = global_contrast_normalize(data3)
data5 = numpy.rot90(data4,3)
return data5
def load_data():
train_x = []
train_y = []
val_x = []
val_y = []
test_x = []
test_y = []
f = open('ALL.csv')
csv_f = csv.reader(f)
for row in csv_f:
if str(row[2]) == "Training":
temp_list_train = []
for pixel in row[1].split():
temp_list_train.append(int(pixel))
data = Zerocenter_ZCA_whitening_Global_Contrast_Normalize(temp_list_train)
train_y.append(int(row[0]))
train_x.append(data.reshape(data_resh).tolist())
elif str(row[2]) == "PublicTest":
temp_list_validation = []
for pixel in row[1].split():
temp_list_validation.append(int(pixel))
data = Zerocenter_ZCA_whitening_Global_Contrast_Normalize(temp_list_validation)
val_y.append(int(row[0]))
val_x.append(data.reshape(data_resh).tolist())
elif str(row[2]) == "PrivateTest":
temp_list_test = []
for pixel in row[1].split():
temp_list_test.append(int(pixel))
data = Zerocenter_ZCA_whitening_Global_Contrast_Normalize(temp_list_test)
test_y.append(int(row[0]))
test_x.append(data.reshape(data_resh).tolist())
return train_x, train_y, val_x, val_y, test_x, test_y
And then I load data and feed them to the generator :
Train_x, Train_y, Val_x, Val_y, Test_x, Test_y = load_data()
Train_x = numpy.asarray(Train_x)
Train_x = Train_x.reshape(Train_x.shape[0],img_rows,img_cols)
Test_x = numpy.asarray(Test_x)
Test_x = Test_x.reshape(Test_x.shape[0],img_rows,img_cols)
Val_x = numpy.asarray(Val_x)
Val_x = Val_x.reshape(Val_x.shape[0],img_rows,img_cols)
Train_x = Train_x.reshape(Train_x.shape[0], img_rows, img_cols, 1)
Test_x = Test_x.reshape(Test_x.shape[0], img_rows, img_cols, 1)
Val_x = Val_x.reshape(Val_x.shape[0], img_rows, img_cols, 1)
Train_x = Train_x.astype('float32')
Test_x = Test_x.astype('float32')
Val_x = Val_x.astype('float32')
Train_y = np_utils.to_categorical(Train_y, nb_classes)
Test_y = np_utils.to_categorical(Test_y, nb_classes)
Val_y = np_utils.to_categorical(Val_y, nb_classes)
datagen = ImageDataGenerator(
featurewise_center=False,
samplewise_center=False,
featurewise_std_normalization=False,
samplewise_std_normalization=False,
zca_whitening=False,
width_shift_range=0.2,
height_shift_range=0.2,
horizontal_flip=True,
shear_range=0.03,
zoom_range=0.03,
vertical_flip=False)
datagen.fit(Train_x)
model.fit_generator(datagen.flow(Train_x, Train_y,
batch_size=batch_size),
samples_per_epoch=Train_x.shape[0],
nb_epoch=nb_epoch,
validation_data=(Val_x, Val_y))
When I run the code, RAM usage gets bigger and bigger until the pc freezes (I've have 16 Gb). It get stuck when loading_data() is called. Any solution for this problem that can fits my code ?

Seems to be a duplicate of this question. Basically, you'll have to use fit_generator() instead of fit() and pass in a function that loads the data into your model one batch at a time instead of all at once.

How do I get the next pagination 'href'?

So I am having trouble obtaining the href link for the next pages of the url. I got up to obtaining all the text and what not that the tag contains but I can't seem to wrap my head around to removing the text that I don't need and just obtaining the href and navigating through the pages.
Here is my code:
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
pages = prettify.find_all('div', {'class':'pagination'})
for next_page in pages:
next_page.find_all('a')
nextpages.append(next_page)
print(next_page)
all_next_pages()

Here is a way to get the links of the search result items. Find row result class and then find a tag, it contains all the information you need.
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.text
prettify = BeautifulSoup(rcontent, "lxml")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
pages = prettify.find_all('div', {'class':' row result'})
for next_page in pages:
info = next_page.find('a')
url = info.get('href')
title = info.get('title')
print(title,url)
all_next_pages()

We Keep Coding

html mysql json google-apps-script actionscript-3 ms-access google-chrome google-maps reporting-services sql-server-2008

Webscraping Multiple JS pages at once - json

Related

Selenium MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError

Django webscraping JSONDecodeError

PyTorch does not make initial weights random

Keras handling large dataset which cannot fit into memory

How do I get the next pagination 'href'?

Categories

Resources