Web scraping of hotel name and its price? - html

I am a new learner of Python and I am trying to scrape the name and price of a particular hotel from Goibibo. But every time it shows the output "None". I am not able to figure it out.
Code:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import requests
from requests import get

# Render the page in a real browser: the hotel name is injected by
# JavaScript, so the raw HTML from a plain GET would not contain it.
driver=webdriver.Chrome("C:\\Users\\hp\\Desktop\\New folder\\chromedriver.exe")
driver.get("https://www.goibibo.com/hotels/fihalhohi-island-resort-hotel-in-maldives-3641257667928818414/?hquery={%22ci%22:%2220200903%22,%22co%22:%2220200904%22,%22r%22:%221-2-0%22,%22ibp%22:%22v15%22}&hmd=57c9d7df10ce2ccd7c8fa6e25f4961a9e2bfd39eec002b71e73c095ccec678b1755edc988f4cd5487a9cfba712fa5eb237dc2d5536cdeb75fecfaec0bfc0461f0c82c1fe7f95feb466a9a50609566a429b8bf31d8b3a4058c68a373f40919411fcb01f758fc7b1ff0846764e224629a9a423cd882cf963a63765c80233c253db9d1aeee5200a0a7bc860be97a52ef3df77f49aa906fbb53d10dd59707f1a01ced53756ceded90cbdd8ddec83bdaf5a7ce162f86cb0ed7e115362182c4b99d853f16c5f4e80622113ceadf4d80191000a9e84ded0531fde54fb8ab281943bc2bb7ad41d60a81ba59478e75ac61f6a58ace01e071429b0837292a94d8cfd4da1a5ef453856d6f7d46c6b1adb4abaa7a2ca8e955cb316afe5e220000346c85759a750fdee0887c402eb3ded5c1d054fb84df56afc7a64bc2b2f6c98222c948e80ff32bd88820398ec7b055f7bf27c60f31ebe7f2d1427302997b2b9da5db3aef2f81bac4c21729e84002fbe5afd065ea4c248aa115c405e&cc=PF&reviewType=gi")
content=driver.page_source
soup=BeautifulSoup(content,"html.parser")
soup.prettify()

# NOTE(review): these styled-components class names (the "hMoKY" hash in
# particular) are auto-generated and change whenever the site is rebuilt,
# so find() may return None -- guard before dereferencing.
soup_name=soup.find("h3",class_="dwebCommonstyles__SectionHeader-sc-112ty3f-5 HotelName__HotelNameText-sc-1bfbuq5-0 hMoKY")

# BUG FIX: the original printed soup_name.txt. A bs4 Tag has no ".txt"
# attribute -- unknown attribute access falls back to a child-tag lookup
# and returns None, which is exactly the "None" output reported. The
# string content lives in ".text".
if soup_name is not None:
    print(soup_name.text)
else:
    print("hotel name <h3> not found - the class names may have changed")
Output:
> None

Here is the hotel name code. Price depends on the room user chooses from hotel and there is no common price for hotel.
import requests
from bs4 import BeautifulSoup

url = "https://www.goibibo.com/hotels/fihalhohi-island-resort-hotel-in-maldives-3641257667928818414/?hquery={%22ci%22:%2220200903%22,%22co%22:%2220200904%22,%22r%22:%221-2-0%22,%22ibp%22:%22v15%22}&hmd=57c9d7df10ce2ccd7c8fa6e25f4961a9e2bfd39eec002b71e73c095ccec678b1755edc988f4cd5487a9cfba712fa5eb237dc2d5536cdeb75fecfaec0bfc0461f0c82c1fe7f95feb466a9a50609566a429b8bf31d8b3a4058c68a373f40919411fcb01f758fc7b1ff0846764e224629a9a423cd882cf963a63765c80233c253db9d1aeee5200a0a7bc860be97a52ef3df77f49aa906fbb53d10dd59707f1a01ced53756ceded90cbdd8ddec83bdaf5a7ce162f86cb0ed7e115362182c4b99d853f16c5f4e80622113ceadf4d80191000a9e84ded0531fde54fb8ab281943bc2bb7ad41d60a81ba59478e75ac61f6a58ace01e071429b0837292a94d8cfd4da1a5ef453856d6f7d46c6b1adb4abaa7a2ca8e955cb316afe5e220000346c85759a750fdee0887c402eb3ded5c1d054fb84df56afc7a64bc2b2f6c98222c948e80ff32bd88820398ec7b055f7bf27c60f31ebe7f2d1427302997b2b9da5db3aef2f81bac4c21729e84002fbe5afd065ea4c248aa115c405e&cc=PF&reviewType=gi"
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")

# find_all() is the current bs4 spelling; findAll is a deprecated alias.
htag = soup.find_all('section', attrs={"class": "HotelDetailsMain__HotelDetailsContainer-sc-2p7gdu-0 kuBApH"})
for x in htag:
    # The hotel name sits in the first <h3> of the details container.
    print(x.find('h3').text)
Note: There is no need to use the Selenium webdriver if you just want to fetch the contents of a webpage. BeautifulSoup (with requests) does it for you.

Related

When I use the following script with selenium and BeautifulSoup the text is correctly extracted but the JSON file is always the same

I have created the script below to extract the text from the posts of an instagram user profile.
It works perfectly to extract the posts, but there is a problem once I start using the scroll function of Selenium, as the JSON file does not seem to be updating.
I have created a loop for 2 repetitions for test purposes
but there seems to be a problem in the line pageSource=driver.page_source.
I am expecting the script to load the new json file linked to the new page that but when I test it the pageSource file is always the same even if selenium is correctly scrolling through the page.
import requests
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import time                      # BUG FIX: time.sleep() was called but time was never imported
from selenium import webdriver   # BUG FIX: webdriver was used but never imported

# BUG FIX: "url = #instagram url" is a syntax error (the value was
# redacted); use a placeholder string instead.
url = "https://www.instagram.com/<username>/"  # instagram profile URL

driver = webdriver.Firefox()
driver.get(url)
for n in range(2):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # BUG FIX: wait *before* grabbing page_source. The original slept at the
    # very end of the iteration, so page_source was read immediately after
    # the scroll -- before the newly loaded posts (and the embedded
    # window._sharedData JSON) appeared -- which is why the JSON looked
    # identical on every iteration.
    time.sleep(5)
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    body = soup.find('body')
    script = body.find('script')
    # Strip the JS assignment wrapper to leave bare JSON.
    raw = script.text.strip().replace('window._sharedData =', '').replace(';', '')
    json_data = json.loads(raw)
    for post in json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
        # ROBUSTNESS: posts without a caption have an empty "edges" list,
        # which would raise IndexError -- skip them instead of crashing.
        caption_edges = post['node']['edge_media_to_caption']['edges']
        if caption_edges:
            text_src = caption_edges[0]['node']['text']
            print(text_src)

parse a json object in a div while scraping using beautifulsoup python

I am learning scraping. I need to access the JSON string I encounter within a DIV. I am using BeautifulSoup.
This is the JSON string I get in the DIV. I need the value (51.65) of the tag "lastprice". Please help. The JSON object is in json_d
import pip
import requests
import json
from bs4 import BeautifulSoup

# Fetch the NSE quote page and locate the hidden div that carries the
# quote payload; the tag itself ends up in json_d.
print('hi')
resp = requests.get('https://www.nseindia.com/live_market/dynaContent/live_watch/get_quote/GetQuote.jsp?symbol=NBCC&illiquid=0&smeFlag=0&itpFlag=0')
soup = BeautifulSoup(resp.text, 'html.parser')
json_d = soup.find(id='responseDiv')
print('bye')
import bs4
import json

# The quote data is embedded in the page as a JSON payload inside a hidden
# <div id="responseDiv">, so it can be extracted and fed to json.loads().
r= '''
<div id="responseDiv" style="display:none">{"tradedDate":"07DEC2018","data":[{"pricebandupper":"58.35","symbol":"NBCC","applicableMargin":"15.35","bcEndDate":"14-SEP-18","totalSellQuantity":"40,722","adhocMargin":"-","companyName":"NBCC (India) Limited","marketType":"N","exDate":"06-SEP-18","bcStartDate":"10-SEP-18","css_status_desc":"Listed","dayHigh":"53.55","basePrice":"53.05","securityVar":"10.35","pricebandlower":"47.75","sellQuantity5":"-","sellQuantity4":"-","sellQuantity3":"-","cm_adj_high_dt":"08-DEC-17","sellQuantity2":"-","dayLow":"51.55","sellQuantity1":"40,722","quantityTraded":"71,35,742","pChange":"-2.64","totalTradedValue":"3,714.15","deliveryToTradedQuantity":"40.23","totalBuyQuantity":"-","averagePrice":"52.05","indexVar":"-","cm_ffm":"2,424.24","purpose":"ANNUAL GENERAL MEETING\/DIVIDEND RE 0.56 PER SHARE","buyPrice2":"-","secDate":"7DEC2018","buyPrice1":"-","high52":"266.00","previousClose":"53.05","ndEndDate":"-","low52":"50.80","buyPrice4":"-","buyPrice3":"-","recordDate":"-","deliveryQuantity":"28,70,753","buyPrice5":"-","priceBand":"No Band","extremeLossMargin":"5.00","cm_adj_low_dt":"26-OCT-18","varMargin":"10.35","sellPrice1":"51.80","sellPrice2":"-","totalTradedVolume":"71,35,742","sellPrice3":"-","sellPrice4":"-","sellPrice5":"-","change":"-1.40","surv_indicator":"-","ndStartDate":"-","buyQuantity4":"-","isExDateFlag":false,"buyQuantity3":"-","buyQuantity2":"-","buyQuantity1":"-","series":"EQ","faceValue":"1.00","buyQuantity5":"-","closePrice":"51.80","open":"53.15","isinCode":"INE095N01031","lastPrice":"51.65"}],"optLink":"\/marketinfo\/sym_map\/symbolMapping.jsp?symbol=NBCC&instrument=-&date=-&segmentLink=17&symbolCount=2","otherSeries":["EQ"],"futLink":"\/live_market\/dynaContent\/live_watch\/get_quote\/GetQuoteFO.jsp?underlying=NBCC&instrument=FUTSTK&expiry=27DEC2018&type=-&strike=-","lastUpdateTime":"07-DEC-2018 15:59:59"}</div>'''
# BUG FIX: always name the parser explicitly. bs4.BeautifulSoup(r) alone
# makes bs4 guess a parser (emitting GuessedAtParserWarning) and can parse
# differently across machines depending on which parsers are installed.
html = bs4.BeautifulSoup(r, 'html.parser')
# .text of the div is the raw JSON string; decode it and index in.
soup = html.find('div', {'id':'responseDiv'}).text
data = json.loads(soup)
last_price = data['data'][0]['lastPrice']
EDIT:
json_d = soup.find(id='responseDiv')
Try changing to
json_d = soup.find('div', {'id':'responseDiv'})
Then you should be able to do
data = json.loads(json_d)
last_price = data['data'][0]['lastPrice']
See if that helps. I’m currently away from my computer until Tuesday so typing this up on my iPhone, so can’t test/play with it.
The other thing is the site might need to be read in after it's loaded. In that case, I think you'd need to look into the selenium or requests-html packages.
Again, I can’t look until Tuesday when I get back home to my laptop.

beautiful soup findall not returning results

I am trying to webscrape and get the insurance dollars as listed in the below html.
Insurance
Insurance
Used the below code but it is not fetching anything. Can someone help? I am fairly new to python...
import requests
from bs4 import BeautifulSoup

# Fetch the vehicle page and collect every div with the "col-base-6"
# layout class; print whatever matched (empty list if nothing did).
page = requests.get('https://www.kbb.com/ford/escape/2017/s/?vehicleid=415933&intent=buy-new')
parsed = BeautifulSoup(page.content, 'lxml')
test2 = parsed.find_all('div', attrs={"class": "col-base-6"})
print(test2)
Not all the data you see on the page is actually the response to the GET request to this URL. There are a lot of other requests the browser makes in the background, which are initiated by JavaScript code.
Specifically, the request for the insurance data is made to this URL:
https://www.kbb.com/vehicles/hub/_costtoown/?vehicleid=415933
Here is a working code for what you need:
import requests
from bs4 import BeautifulSoup

# Hit the cost-to-own fragment directly, find the div whose text is the
# "Insurance" label, and read the value from the element right after it.
resp = requests.get('https://www.kbb.com/vehicles/hub/_costtoown/?vehicleid=415933')
doc = BeautifulSoup(resp.text, 'html.parser')
label = doc.find('div', string="Insurance")
Insurance = label.find_next().text
print(Insurance)

BeautifulSoup and prettify() function

To parse html codes of a website, I decided to use BeautifulSoup class and prettify() method. I wrote the code below.
import requests
import bs4

# Download the page and print an indented rendering of its markup.
resp = requests.get("https://www.doviz.com")
soup = bs4.BeautifulSoup(resp.content, "html.parser")
print(soup.prettify())
When I execute this code on Mac terminal, indentation of the codes are not set. On the other hand, If I execute this code on windows cmd or PyCharm, all codes are set.
Do you know the reason for this ?
try this code:
import requests
from bs4 import BeautifulSoup

# Same fetch, but parse the decoded text and print the pretty-printed tree.
page = requests.get("https://www.doviz.com")
pretty = BeautifulSoup(page.text, "html.parser").prettify()
print(pretty)

Web scraper software for dynamic pages [duplicate]

I am trying to use Python 3 to return the BibTeX citation generated by http://www.doi2bib.org/. The URLs are predictable, so the script can work out the URL without having to interact with the web page. I have tried using selenium, bs4, etc. but can't get the text inside the box.
import urllib.request
from bs4 import BeautifulSoup

url = "http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9"
# NOTE: everything after "#" is a URL fragment -- it is handled client-side
# by the site's JavaScript and is never sent to the server, so this request
# only returns the empty application shell; the citation is filled in later
# by an XHR call.
# BUG FIX: name the parser explicitly; omitting it triggers bs4's
# parser-guessing warning and machine-dependent parsing.
text = BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")
print(text)
Can anyone suggest a way of returning the bibtex citation as a string (or whatever) in python?
You don't need BeautifulSoup here. There is an additional XHR request sent to the server to fill out the bibtex citation, simulate it, for example, with requests:
import requests

# Replay the XHR the page's JavaScript makes: visit the page once to set up
# the session, then ask the doi2bib endpoint for the citation directly.
doi = '10.1007/s00425-007-0544-9'
page_url = "http://www.doi2bib.org/#/doi/{id}".format(id=doi)
api_url = 'http://www.doi2bib.org/doi2bib'
with requests.Session() as http:
    http.get(page_url)  # prime the session the way a browser visit would
    resp = http.get(api_url, params={'id': doi})
    print(resp.content)
Prints:
#article{Burgert_2007,
doi = {10.1007/s00425-007-0544-9},
url = {http://dx.doi.org/10.1007/s00425-007-0544-9},
year = 2007,
month = {jun},
publisher = {Springer Science $\mathplus$ Business Media},
volume = {226},
number = {4},
pages = {981--987},
author = {Ingo Burgert and Michaela Eder and Notburga Gierlinger and Peter Fratzl},
title = {Tensile and compressive stresses in tracheids are induced by swelling based on geometrical constraints of the wood cell},
journal = {Planta}
}
You can also solve it with selenium. The key trick here is to use an Explicit Wait to wait for the citation to become visible:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9')
# BUG FIX: the XPath attribute axis is "@", not "#". The original
# '//pre[#ng-show="bib"]' is not valid XPath (the "#" is a rendering
# artifact of "@") and would raise InvalidSelectorException. Explicitly
# wait up to 10s for the citation <pre> to become visible.
element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.XPATH, '//pre[@ng-show="bib"]'))
)
print(element.text)
driver.close()
Prints the same as the above solution.