I have the following Python code:
import requests
import json
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; nl-nl; SAMSUNG GT-I9505 Build/JSS15J) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.5 Chrome/28.0.1500.94 Mobile Safari/537.36'}

chapter = 0
while chapter < 3:
    url = 'http://www.komikgue.com/manga/one-piece/{chapter}/'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    mangas = soup.find_all('img', class_="img-responsive")
    chapter += 1

    def get_manga_details(manga):
        src = manga.find('img', class_="img-responsive").find("img")["src"]
        alt = manga.find('img', class_="img-responsive").find("img")["alt"]
        return {
            "chapter": chapter,
            "src": src, "alt": alt
        }

    all_mangas = [get_manga_details(manga) for manga in mangas]

with open("manga.json", "w") as write_file:
    json.dump(all_mangas, write_file)

print("Success")
This code runs in cmd without errors but produces an empty output file.
What is wrong? Please teach me.
I want it to be:
{
    "chapter": 1,
    "data": [
        {"src": "here", "alt": "here"},
        {"src": "here", "alt": "here"}
    ]
}
Please guide me
There are a lot of things wrong with your code. First, the URL you are trying to access returns a 404: the '{chapter}' placeholder is never actually filled in, and the chapter number needs to be padded with leading zeroes (e.g. 001). Second, your logic and loops don't make much sense, such as defining your function and lists inside the loop and then expecting the output to contain all the chapters. Moreover, you are calling BeautifulSoup's find function again inside your function, which is not needed; each element returned by find_all is already an img tag, so you can access its attributes directly.
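For example, the chapter segment of the URL can be zero-padded like this (a small illustration only; zfill would work just as well as rjust):

chapter = 1
padded = str(chapter).rjust(3, '0')   # gives '001'
url = 'http://www.komikgue.com/manga/one-piece/' + padded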
See my code below, it works on my machine
import requests
import json
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; nl-nl; SAMSUNG GT-I9505 Build/JSS15J) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.5 Chrome/28.0.1500.94 Mobile Safari/537.36'}

chapter = 1
allmangas = []

def get_manga_details(manga, i):
    print(manga)
    src = manga["src"]
    alt = manga["alt"]
    return {
        "number": i,
        "src": src, "alt": alt
    }

while chapter < 3:
    url = 'http://www.komikgue.com/manga/one-piece/' + str(chapter).rjust(3, '0')
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    mangas = soup.find_all('img', class_="img-responsive")
    print(mangas)
    allmangas.append({'chapter': chapter,
                      'data': [get_manga_details(manga, i) for i, manga in enumerate(mangas[:-1])]})
    chapter += 1

with open("manga.json", "w") as write_file:
    json.dump(allmangas, write_file)

print("Success")
They show me this error: ValueError: All arrays must be of the same length. How can I solve this error? Can anyone kindly give me a solution to this problem? I have tried many approaches but I cannot resolve it. How can I handle this error when my arrays are not the same length?
import enum
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break

req_json = json.loads(raw_json)

# with open("text_json.json", "w") as file:
#     x = json.dump(req_json, file, indent=4)

temp = req_json
name = []
specs = []
title = temp['product']['prodBean']['name']
name.append(title)
item = temp['specifications']['MARKETING']
for i in item:
    try:
        get = i['value']
    except:
        pass
    specs.append(get)

temp = {'title': name, 'Specification': specs}
df = pd.DataFrame(temp)
print(df)
While the error is quite clear, the question and the expected result are not.
The way you try to create the DataFrame ends up with columns of different lengths (one title versus many specs), which is why the error comes up. To fix this you could create the DataFrame from a dict:
pd.DataFrame.from_dict(temp, orient='index')
But that looks pretty ugly and cannot be processed well later on, so an alternative would be:
data = [{
    'title': temp['product']['prodBean']['name'],
    'specs': ','.join([s.get('value') for s in temp['specifications']['MARKETING']])
}]
pd.DataFrame(data)
or the following if you would like to have each spec in a new row:
data = {
    'title': temp['product']['prodBean']['name'],
    'specs': [s.get('value') for s in temp['specifications']['MARKETING']]
}
pd.DataFrame.from_dict(data)
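To see why the original code fails, here is a minimal, self-contained sketch with hypothetical values: a dict whose columns have different lengths raises the error, while the approaches above avoid it.

import pandas as pd

broken = {'title': ['only one title'], 'Specification': ['spec A', 'spec B', 'spec C']}
# pd.DataFrame(broken)  # raises ValueError: All arrays must be of the same length

# orient='index' turns each key into a row and pads the shorter one with NaN
print(pd.DataFrame.from_dict(broken, orient='index'))

# a scalar title is broadcast against the list of specs, giving one spec per row
print(pd.DataFrame({'title': 'only one title', 'Specification': ['spec A', 'spec B', 'spec C']}))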
Example
import enum
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}

r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break

temp = json.loads(raw_json)

data = [{
    'title': temp['product']['prodBean']['name'],
    'specs': ','.join([s.get('value') for s in temp['specifications']['MARKETING']])
}]
pd.DataFrame(data)
Output
                                                    title                                                              specs
0  OTR Trailer Air Hose and Electric Cable Assembly, 15'  Spiral wound,Includes hang collar,One bundle for easy management
I am attempting to web-scrape a real estate website using Scrapy and PyCharm, and failing miserably.
Desired Results:
Scrape 1 base URL (https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/), but 5 different internal URLs (https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/{i}-r/), where {i} = 1,2,3,4,5
Crawl all pages in each internal URL, or via the base URL
Get all href links, crawl each href link, and get the span tag data from inside each one.
Scrape around 5,000-7,000 unique listings as efficiently and as fast as possible.
Output data into a CSV file while keeping Cyrillic characters.
Note: I have attempted web-scraping using BeautifulSoup, but it took me around 1-2 min per listing and around 2-3 hours to scrape all listings using a for loop. I was referred to Scrapy as a faster option by a community member. I'm unsure if that is because of the data pipelines or whether I can do multi-threading.
Any and all help is greatly appreciated. ^^
Website sample HTML snippet: This is a picture of the HTML I am trying to scrape.
Current Scrapy Code: This is what I have so far. When I run scrapy crawl unegui_apts, I cannot seem to get the results I want. I'm so lost.
# -*- coding: utf-8 -*-
# Import library
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Request

# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = 'unegui_apts'
    allowed_domains = ['www.unegui.mn']
    custom_settings = {'FEEDS': {'results1.csv': {'format': 'csv'}}}
    start_urls = [
        'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/1-r/,'
        'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/2-r/'
    ]
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
    }

    def parse(self, response):
        self.logger.debug('callback "parse": got response %r' % response)
        cards = response.xpath('//div[@class="list-announcement-block"]')
        for card in cards:
            name = card.xpath('.//meta[@itemprop="name"]/text()').extract_first()
            price = card.xpath('.//meta[@itemprop="price"]/text()').extract_first()
            city = card.xpath('.//meta[@itemprop="areaServed"]/text()').extract_first()
            date = card.xpath('.//*[@class="announcement-block__date"]/text()').extract_first().strip().split(', ')[0]
            request = Request(link, callback=self.parse_details, meta={'name': name,
                                                                       'price': price,
                                                                       'city': city,
                                                                       'date': date})
            yield request
        next_url = response.xpath('//li[@class="pager-next"]/a/@href').get()
        if next_url:
            # go to next page until no more pages
            yield response.follow(next_url, callback=self.parse)

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()
Your code has a number of issues:
The start_urls list contains invalid links
You defined your user_agent string in the headers dictionary but you are not using it when yielding requests
Your xpath selectors are incorrect
The next_url selector is incorrect, hence it does not yield new requests for the next pages
I have updated your code to fix the issues above as follows:
import scrapy
from scrapy.crawler import CrawlerProcess

# Create Spider class
class UneguiApartments(scrapy.Spider):
    name = 'unegui_apts'
    allowed_domains = ['www.unegui.mn']
    custom_settings = {'FEEDS': {'results1.csv': {'format': 'csv'}},
                       'USER_AGENT': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"}
    start_urls = [
        'https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/'
    ]

    def parse(self, response):
        cards = response.xpath(
            '//li[contains(@class,"announcement-container")]')
        for card in cards:
            name = card.xpath(".//a[@itemprop='name']/@content").extract_first()
            price = card.xpath(".//*[@itemprop='price']/@content").extract_first()
            date = card.xpath("normalize-space(.//div[contains(@class,'announcement-block__date')]/text())").extract_first()
            city = card.xpath(".//*[@itemprop='areaServed']/@content").extract_first()
            yield {'name': name,
                   'price': price,
                   'city': city,
                   'date': date}
        next_url = response.xpath("//a[contains(@class,'red')]/parent::li/following-sibling::li/a/@href").extract_first()
        if next_url:
            # go to next page until no more pages
            yield response.follow(next_url, callback=self.parse)

# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(UneguiApartments)
    process.start()
You run the above spider by executing the command python <filename.py> since you are running a standalone script and not a full blown project.
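On the speed concern in the question: Scrapy is fast because its requests are asynchronous rather than multi-threaded, and the concurrency can be tuned in custom_settings. A rough sketch of how the settings above could be extended (the numbers are illustrative assumptions, not recommendations):

custom_settings = {
    'FEEDS': {'results1.csv': {'format': 'csv'}},
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    'CONCURRENT_REQUESTS': 16,  # how many requests Scrapy keeps in flight at once
    'DOWNLOAD_DELAY': 0.25,     # small delay between requests to stay polite to the site
}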
Sample csv results are as shown in the image below. You will need to clean up the data using pipelines and the scrapy item class. See the docs for more details.
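If it helps, here is a rough sketch of what that cleanup could look like (the field names mirror the fields yielded above; the cleanup rules are assumptions, not requirements of the site):

import scrapy

class ApartmentItem(scrapy.Item):
    # fields mirror what the spider yields
    name = scrapy.Field()
    price = scrapy.Field()
    city = scrapy.Field()
    date = scrapy.Field()

class CleanApartmentPipeline:
    def process_item(self, item, spider):
        # strip stray whitespace and cast the price to a number where possible
        if item.get('name'):
            item['name'] = item['name'].strip()
        if item.get('price'):
            try:
                item['price'] = float(item['price'])
            except ValueError:
                pass
        return item

The pipeline still has to be enabled through the ITEM_PIPELINES setting, and the spider would then yield ApartmentItem(...) objects instead of plain dicts.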
I am trying to scrape a player stats table for NBA stats using requests and BeautifulSoup, but the response I am getting is not the same as what I see using "Inspect Element".
The div containing this table has the class attribute class="nba-stat-table__overflow". However, whenever I run the following code I get an empty list:
table = soup.find_all('div', attrs={'class': 'nba-stat-table__overflow'})
Here is my full code:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
url = 'https://stats.nba.com/players/boxscores/?Season=2018-19&SeasonType=Regular%20Season'
response = requests.get(url)
soup = BeautifulSoup(response.content,'html.parser')
table = soup.find_all('div', attrs={'class': 'nba-stat-table__overflow'})
Basically, the page is loaded via JavaScript, so the bs4 or requests modules will not be able to render the JavaScript on the fly.
You could use the selenium or requests_html modules to render the JS, but I noticed that the website is using an API which can be used to fetch the data, so I've called it and extracted the data directly.
Check my previous answer, which explains how to fetch data from the API.
import requests
import pandas as pd

params = {
    "Counter": "1000",
    "DateFrom": "",
    "DateTo": "",
    "Direction": "DESC",
    "LeagueID": "00",
    "PlayerOrTeam": "P",
    "Season": "2018-19",
    "SeasonType": "Regular Season",
    "Sorter": "DATE"
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    "x-nba-stats-origin": "stats",
    "x-nba-stats-token": "true",
    "Referer": "https://stats.nba.com/players/boxscores/?Season=2018-19&SeasonType=Regular%20Season"
}

def main(url):
    r = requests.get(url, params=params, headers=headers).json()
    goal = []
    for item in r['resultSets']:
        df = pd.DataFrame(item['rowSet'], columns=item['headers'])
        goal.append(df)
    new = pd.concat(goal)
    print(new)
    new.to_csv("data.csv", index=False)

main("https://stats.nba.com/stats/leaguegamelog")
Output: View-Online
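For completeness, a minimal sketch of the Selenium route mentioned above (assuming Chrome and the selenium package are installed; the API approach is still the better option here):

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()  # renders the JavaScript that plain requests cannot
driver.get("https://stats.nba.com/players/boxscores/?Season=2018-19&SeasonType=Regular%20Season")
# in practice an explicit wait may be needed here until the table has loaded
soup = BeautifulSoup(driver.page_source, "html.parser")
tables = soup.find_all("div", attrs={"class": "nba-stat-table__overflow"})
driver.quit()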
How do I get the "data" information into a CSV table as shown at the end (and also set the right 'headers' so that the source server doesn't throw me off, thinking I am scraping)? The code I have written so far is below.
import requests, json
headers = {'User-Agent': 'Mozilla/5.0'}
data_json = requests.get('https://www1.nseindia.com/live_market/dynaContent/live_watch/stock_watch/foSecStockWatch.json', headers=headers)
print(data_json)
file = open('make_csv', 'w')
file.write(str(data_json))
file.close()
But the output I receive is as follows:
<Response [200]>
and even the exported/saved file shows the same.
Here is the expected output table that I am trying to achieve:
Symbol,Open,High,Low,Last Traded Price,Change,%Change,Traded Volume(lacs),Traded Value(crs),52 Week High,52 Week Low,365 Days % Change,30 Days % Change
"LUPIN","582.45","665.90","578.00","662.00","82.95","14.33","64.93","411.13","884.00","504.75","-14.88","5.11"
"APOLLOHOSP","1,094.20","1,239.45","1,088.05","1,195.00","106.15","9.75","23.97","280.36","1,813.55","1,047.05","-4.80","-30.87"
"SUNPHARMA","343.95","389.80","340.00","376.45","32.90","9.58","285.51","1,055.40","483.90","312.00","-19.85","1.88"
"CIPLA","425.00","454.70","416.25","448.00","34.25","8.28","179.07","793.22","586.00","355.30","-14.28","11.46"
"CESC","393.00","429.80","386.25","420.00","26.85","6.83","9.30","38.63","851.70","365.25","-42.19","-34.53"
"TORNTPHARM","1,979.00","2,113.00","1,950.00","2,090.00","131.00","6.69","10.13","208.87","2,287.25","1,452.00","10.56","-1.75"
"ITC","167.90","182.75","167.00","177.50","11.10","6.67","628.68","1,100.88","310.00","134.60","-40.42","-9.11"
"OIL","82.25","85.60","80.25","84.50","5.25","6.62","27.05","22.39","189.70","63.50","-53.95","-16.91"
..........
..........
import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url):
    r = requests.get(url, headers=headers).json()
    x = []
    for item in r['data']:
        df = pd.DataFrame.from_dict([item])
        x.append(df)
    new = pd.concat(x, ignore_index=True)
    print(new)
    new.to_csv("Data.csv")

main("https://www1.nseindia.com/live_market/dynaContent/live_watch/stock_watch/foSecStockWatch.json")
Output: view online
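As a side note, since the 'data' key appears to hold a flat list of dicts, the per-item concat loop can probably be collapsed into a single constructor call; a hedged simplification of the same main function:

def main(url):
    r = requests.get(url, headers=headers).json()
    new = pd.DataFrame(r['data'])  # one DataFrame straight from the list of dicts
    print(new)
    new.to_csv("Data.csv", index=False)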
The following code retrieves images and HTML links from a webpage and stores the values in a BeautifulSoup index. I am now using pandas to create an output HTML table for those images and links. I have managed to populate cells manually by calling a specific index value, but I can't seem to find a way to add each indexed image and HTML text to the pandas DataFrame so that all the indexed values are displayed in the table. How could I do this?
from bs4 import BeautifulSoup
import requests
import numpy as np
from pandas import *
import pandas as pd

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 500)

from IPython.display import HTML

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

####################################

title_clean = soup.find('title')
print(title_clean)

image_links = [x['data-img'] for x in soup.find_all('a', rel='popover')]
for link in image_links:
    print(link)

image_links_0 = image_links[0]
print(image_links_0)

mytags = []
tags = soup.find_all('td', width='41%')
for tag in tags:
    image_text = tag.find('h5').text
    mytags.append(image_text)
    print(image_text)

for i in range(len(mytags)):
    mytags[i]

mytags_0 = mytags[0]
image_links_0 = image_links[0]

#df = DataFrame({'foo1' : 'test',
df = DataFrame({'foo1': '<img src="' + image_links_0 + '"/><p>' + mytags_0 + '</p>',
                'foo2': '' + mytags_0 + '',
                'foo3': mytags_0,
                'foo4': np.random.randn(2)})
print(df)

HTML(df.to_html('filename.html', escape=False))

print(tag)
This is the correct way to do it.
If you need any help with storing it and making an HTML file out of it, I'll be happy to provide a solution for that as well. Take care!
Update: everything is now included, with comments, scraping, writing to a file, and creating tags with BeautifulSoup.
from bs4 import BeautifulSoup
import requests

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"

# add header
mozila_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
headers = {'User-Agent': mozila_agent}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")

############################################################

the_whole_table = soup.find('table', width='97%')

datalist = []

for tr in the_whole_table.find_all('tr')[1:]:
    # you want to start from the 1st item not the 0th so [1:]
    # Because the first is the thead i.e. Lot no, Picture, Lot Title...
    index_num = tr.find('td', width='8%')
    picture_link = index_num.next_sibling.a['data-img']
    text_info = tr.find('td', width='41%')
    current_bid = tr.find('td', width='13%')
    time_left = tr.find('td', width='19%')
    datalist.append([index_num.text, picture_link,
                     text_info.text, current_bid.text, time_left.text])
    # for pic do ... print(picture_link) as for partial text only first 20
    # characters

df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']

theads = BeautifulSoup('<table border="1"></table>', 'lxml')
thekeys = BeautifulSoup('<thead></thead>', 'html.parser')

for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)

theads.table.append(thekeys)

###############################################################
# The code above will initiate a table
# after that the for loop will create and populate the first row (thead)

for i in datalist:
    thedata = BeautifulSoup('<tr></tr>', 'html.parser')
    # we loop through the data we collected
    for j in i:
        if j.startswith('https'):
            img_tag = theads.new_tag('img', src=j, height='50', width='50')
            td_tag = theads.new_tag('td')
            td_tag.append(img_tag)
            thedata.tr.append(td_tag)
        else:
            tag = theads.new_tag('td')
            tag.append(j)
            thedata.tr.append(tag)
    theads.table.append(thedata)

with open('asdf.html', 'w+') as f:
    f.write(theads.prettify())

# each of these if you print them you'll get information that you can store
# we use `.prettify()` as we can't write a BeautifulSoup object into a file.
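Since the question originally asked about pandas, here is a rough alternative sketch (my own suggestion, not part of the answer above) that builds the same table from datalist and renders the images via to_html; the output filename is arbitrary:

import pandas as pd

# assumes `datalist` from the scraping loop above: [index, picture_link, text, bid, time_left]
columns = ['Index Number', 'Picture', 'Informational text', 'Current BID', 'Time Left now']
table_df = pd.DataFrame(datalist, columns=columns)

# turn the picture URL into an <img> tag so it renders as an image in the HTML
table_df['Picture'] = table_df['Picture'].apply(
    lambda src: '<img src="{}" height="50" width="50"/>'.format(src))

# escape=False keeps the <img> markup instead of escaping it to plain text
table_df.to_html('asdf_pandas.html', escape=False, index=False)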