New to web scraping here. I basically want to extract a product link from a web page into my Jupyter notebook.
Following is the code that I tried out:
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as uReq
flipkart_url = "https://www.flipkart.com/search?q=" + 'acer-aspire-7-core-i5'
uClient = uReq(flipkart_url)
flipkartPage = uClient.read()
flipkart_html = bs(flipkartPage, "html.parser")
# Since I am only interested in the class "_1AtVbE col-12-12"
bigboxes = flipkart_html.find_all("div", {"class": "_1AtVbE col-12-12"})
Now here's the thing: I don't exactly understand what bigboxes is storing. Its type is bs4.element.ResultSet and its length is 16.
Now if I run:
box = bigboxes[0]
productlink = "https://www.flipkart.com" + box.div.div.div.a['href']
I get an error. However, when I run:
box = bigboxes[2]
productlink = "https://www.flipkart.com" + box.div.div.div.a['href']
the link is extracted successfully. Can someone please explain why only the third element exposes the link? I have a basic knowledge of HTML (or at least I thought so) and I don't understand the nesting here. What exactly is bigboxes storing? The HTML source doesn't show any such layers.
Your class filter is not very specific.
The first and second elements point to HTML nodes which do not contain the link, which is why you are getting an error.
A more specific class to check could be: _13oc-S
bigboxes = flipkart_html.findAll("div", {"class": "_13oc-S"})
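If you want to keep the broader filter instead, a defensive loop is another option. This is a minimal sketch (it assumes the first anchor inside a result box is the product link, as in the question's box.div.div.div.a): it simply skips boxes that don't contain an anchor at all.

# Sketch: skip result boxes that don't contain the expected nested anchor
for box in bigboxes:
    anchor = box.find("a", href=True)  # first anchor with an href inside this box
    if anchor is None:
        continue  # header/footer boxes carry no product link
    productlink = "https://www.flipkart.com" + anchor['href']
    print(productlink)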
I have created the script below to extract the text from the posts of an Instagram user profile.
It extracts the posts fine, but there is a problem once I start using Selenium's scroll function: the JSON data does not seem to update.
I have set the loop to 2 repetitions for test purposes,
but there seems to be a problem with the line pageSource = driver.page_source.
I expect the script to load the new JSON linked to the newly loaded page, but when I test it, pageSource is always the same even though Selenium is correctly scrolling through the page.
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver

url = ...  # instagram url (redacted in the question)

driver = webdriver.Firefox()
driver.get(url)

for n in range(2):
    # Scroll to the bottom of the page so more posts load
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    body = soup.find('body')
    script = body.find('script')
    # Strip the JS assignment so only the JSON payload remains
    raw = script.text.strip().replace('window._sharedData =', '').replace(';', '')
    json_data = json.loads(raw)
    for post in json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
        text_src = post['node']['edge_media_to_caption']['edges'][0]['node']['text']
        print(text_src)
    time.sleep(5)
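One thing worth checking (a sketch, not a verified fix): page_source is read immediately after the scroll, before the new content has had time to load, because the time.sleep(5) only runs at the end of the loop. Waiting after the scroll and before reading page_source at least gives the page a chance to update. Note also that window._sharedData is embedded in the initial HTML, so posts loaded by scrolling may arrive via separate XHR requests rather than through that blob.

for n in range(2):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # wait for the newly loaded posts before reading the DOM
    pageSource = driver.page_source
    # ... parse pageSource with BeautifulSoup as above ...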
I am learning scraping. I need to access the JSON string I encounter within a DIV. I am using BeautifulSoup.
This is the JSON string I get in the DIV. I need the value (51.65) of the key "lastPrice". Please help. The JSON object is in json_d.
import requests
import json
from bs4 import BeautifulSoup

print('hi')
page = requests.get('https://www.nseindia.com/live_market/dynaContent/live_watch/get_quote/GetQuote.jsp?symbol=NBCC&illiquid=0&smeFlag=0&itpFlag=0')
soup = BeautifulSoup(page.text, 'html.parser')
json_d = soup.find(id='responseDiv')
print('bye')
import bs4
import json
r= '''
<div id="responseDiv" style="display:none">{"tradedDate":"07DEC2018","data":[{"pricebandupper":"58.35","symbol":"NBCC","applicableMargin":"15.35","bcEndDate":"14-SEP-18","totalSellQuantity":"40,722","adhocMargin":"-","companyName":"NBCC (India) Limited","marketType":"N","exDate":"06-SEP-18","bcStartDate":"10-SEP-18","css_status_desc":"Listed","dayHigh":"53.55","basePrice":"53.05","securityVar":"10.35","pricebandlower":"47.75","sellQuantity5":"-","sellQuantity4":"-","sellQuantity3":"-","cm_adj_high_dt":"08-DEC-17","sellQuantity2":"-","dayLow":"51.55","sellQuantity1":"40,722","quantityTraded":"71,35,742","pChange":"-2.64","totalTradedValue":"3,714.15","deliveryToTradedQuantity":"40.23","totalBuyQuantity":"-","averagePrice":"52.05","indexVar":"-","cm_ffm":"2,424.24","purpose":"ANNUAL GENERAL MEETING\/DIVIDEND RE 0.56 PER SHARE","buyPrice2":"-","secDate":"7DEC2018","buyPrice1":"-","high52":"266.00","previousClose":"53.05","ndEndDate":"-","low52":"50.80","buyPrice4":"-","buyPrice3":"-","recordDate":"-","deliveryQuantity":"28,70,753","buyPrice5":"-","priceBand":"No Band","extremeLossMargin":"5.00","cm_adj_low_dt":"26-OCT-18","varMargin":"10.35","sellPrice1":"51.80","sellPrice2":"-","totalTradedVolume":"71,35,742","sellPrice3":"-","sellPrice4":"-","sellPrice5":"-","change":"-1.40","surv_indicator":"-","ndStartDate":"-","buyQuantity4":"-","isExDateFlag":false,"buyQuantity3":"-","buyQuantity2":"-","buyQuantity1":"-","series":"EQ","faceValue":"1.00","buyQuantity5":"-","closePrice":"51.80","open":"53.15","isinCode":"INE095N01031","lastPrice":"51.65"}],"optLink":"\/marketinfo\/sym_map\/symbolMapping.jsp?symbol=NBCC&instrument=-&date=-&segmentLink=17&symbolCount=2","otherSeries":["EQ"],"futLink":"\/live_market\/dynaContent\/live_watch\/get_quote\/GetQuoteFO.jsp?underlying=NBCC&instrument=FUTSTK&expiry=27DEC2018&type=-&strike=-","lastUpdateTime":"07-DEC-2018 15:59:59"}</div>'''
html = bs4.BeautifulSoup(r, 'html.parser')
div_text = html.find('div', {'id': 'responseDiv'}).text  # the raw JSON string inside the div
data = json.loads(div_text)
last_price = data['data'][0]['lastPrice']
EDIT:
json_d = soup.find(id='responseDiv')
Try changing it to
json_d = soup.find('div', {'id': 'responseDiv'})
Then you should be able to do
data = json.loads(json_d.text)
last_price = data['data'][0]['lastPrice']
See if that helps. I’m currently away from my computer until Tuesday so typing this up on my iPhone, so can’t test/play with it.
The other thing is the site might need to be read in after it's loaded. In that case, I think you'd need to look into the selenium or requests-html packages.
Again, I can’t look until Tuesday when I get back home to my laptop.
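Putting the pieces together, a minimal end-to-end sketch (assuming the page still returns responseDiv to a plain requests call, as in the saved HTML above):

import requests
import json
from bs4 import BeautifulSoup

page = requests.get('https://www.nseindia.com/live_market/dynaContent/live_watch/get_quote/GetQuote.jsp?symbol=NBCC&illiquid=0&smeFlag=0&itpFlag=0')
soup = BeautifulSoup(page.text, 'html.parser')
json_d = soup.find('div', {'id': 'responseDiv'})
data = json.loads(json_d.text)        # parse the JSON string held inside the div
print(data['data'][0]['lastPrice'])   # e.g. 51.65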
I made a simple script to retrieve the source code of a page, but I'd like to "isolate" the IP addresses so that I can save them to a proxy.txt file. Any suggestions?
import urllib.request
sourcecode = urllib.request.urlopen("https://www.inforge.net/xi/threads/dichvusocks-us-15h10-pm-update-24-24-good-socks.455588/")
sourcecode = str(sourcecode.read())
out_file = open("proxy.txt","w")
out_file.write(sourcecode)
out_file.close()
I've added a couple of lines to your code. The only problem is that the UI version string (check the page source) also matches the pattern and gets picked up as an IP address.
import urllib.request
import re

sourcecode = urllib.request.urlopen("https://www.inforge.net/xi/threads/dichvusocks-us-15h10-pm-update-24-24-good-socks.455588/")
sourcecode = str(sourcecode.read())

out_file = open("proxy.txt", "w")
out_file.write(sourcecode)
out_file.close()

with open('proxy.txt') as fp:
    for line in fp:
        ip = re.findall(r'(?:\d{1,3})\.(?:\d{1,3})\.(?:\d{1,3})\.(?:\d{1,3})', line)
        for addr in ip:
            print(addr)
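To cut down the false positives, one option (a sketch, not tested against this particular page) is to keep only matches whose four octets are all in the 0-255 range. This filters out strings with out-of-range components, though a four-part version number whose parts are all below 256 would still slip through:

# Keep only matches that are plausible IPv4 addresses (all octets 0-255)
for addr in ip:
    if all(0 <= int(octet) <= 255 for octet in addr.split('.')):
        print(addr)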
UPDATE:
This is what you are looking for: BeautifulSoup can extract only the data we need from the page using CSS classes, though it needs to be installed with pip. You don't need to save the page to a file.
from bs4 import BeautifulSoup
import urllib.request
import re

url = urllib.request.urlopen('https://www.inforge.net/xi/threads/dichvusocks-us-15h10-pm-update-24-24-good-socks.455588/').read()
soup = BeautifulSoup(url, "html.parser")
# Search by the CSS class name
msg_content = soup.find_all("div", class_="messageContent")
ips = re.findall(r'(?:\d{1,3})\.(?:\d{1,3})\.(?:\d{1,3})\.(?:\d{1,3})', str(msg_content))
for addr in ips:
    print(addr)
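If you still want the addresses saved to proxy.txt, as in your original script, you can write the matches out instead of printing them:

# Write one address per line to proxy.txt
with open('proxy.txt', 'w') as out_file:
    out_file.write('\n'.join(ips))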
Why don't you use re?
I'd need to see the source code to say exactly how.
I am trying to use Python 3 to return the bibtex citation generated by http://www.doi2bib.org/. The URLs are predictable, so the script can work out the URL without having to interact with the web page. I have tried using selenium, bs4, etc. but can't get the text inside the box.
url = "http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9"
import urllib.request
from bs4 import BeautifulSoup
text = BeautifulSoup(urllib.request.urlopen(url).read())
print(text)
Can anyone suggest a way of returning the bibtex citation as a string (or whatever) in python?
You don't need BeautifulSoup here. The bibtex citation is filled in by an additional XHR request sent to the server; simulate it, for example, with requests:
import requests

bibtex_id = '10.1007/s00425-007-0544-9'
url = "http://www.doi2bib.org/#/doi/{id}".format(id=bibtex_id)
xhr_url = 'http://www.doi2bib.org/doi2bib'

with requests.Session() as session:
    session.get(url)  # load the page first so the session picks up any cookies
    # the XHR endpoint returns the rendered bibtex for the given DOI
    response = session.get(xhr_url, params={'id': bibtex_id})
    print(response.content)
Prints:
@article{Burgert_2007,
doi = {10.1007/s00425-007-0544-9},
url = {http://dx.doi.org/10.1007/s00425-007-0544-9},
year = 2007,
month = {jun},
publisher = {Springer Science $\mathplus$ Business Media},
volume = {226},
number = {4},
pages = {981--987},
author = {Ingo Burgert and Michaela Eder and Notburga Gierlinger and Peter Fratzl},
title = {Tensile and compressive stresses in tracheids are induced by swelling based on geometrical constraints of the wood cell},
journal = {Planta}
}
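Note that response.content is bytes; if you want the citation as a str, use response.text instead.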
You can also solve it with selenium. The key trick here is to use an Explicit Wait to wait for the citation to become visible:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9')
element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//pre[@ng-show="bib"]')))
print(element.text)
driver.close()
Prints the same as the above solution.
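As a side note, driver.close() only closes the current window; driver.quit() also ends the WebDriver session entirely.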