Save JSON file contents to a CSV file in Python/pandas

How do I get the "data" information into a CSV table as shown at the end (and also set the right 'headers' so that the source server doesn't block me for scraping)? The code I wrote so far is below.
import requests, json
headers = {'User-Agent': 'Mozilla/5.0'}
data_json = requests.get('https://www1.nseindia.com/live_market/dynaContent/live_watch/stock_watch/foSecStockWatch.json', headers=headers)
print(data_json)
file = open('make_csv', 'w')
file.write(str(data_json))
file.close()
But the output I receive is as follows:
<Response [200]>
and even the exported/saved file shows the same.
Here is the expected output table that I am trying to achieve:
Symbol,Open,High,Low,Last Traded Price,Change,%Change,Traded Volume(lacs),Traded Value(crs),52 Week High,52 Week Low,365 Days % Change,30 Days % Change
"LUPIN","582.45","665.90","578.00","662.00","82.95","14.33","64.93","411.13","884.00","504.75","-14.88","5.11"
"APOLLOHOSP","1,094.20","1,239.45","1,088.05","1,195.00","106.15","9.75","23.97","280.36","1,813.55","1,047.05","-4.80","-30.87"
"SUNPHARMA","343.95","389.80","340.00","376.45","32.90","9.58","285.51","1,055.40","483.90","312.00","-19.85","1.88"
"CIPLA","425.00","454.70","416.25","448.00","34.25","8.28","179.07","793.22","586.00","355.30","-14.28","11.46"
"CESC","393.00","429.80","386.25","420.00","26.85","6.83","9.30","38.63","851.70","365.25","-42.19","-34.53"
"TORNTPHARM","1,979.00","2,113.00","1,950.00","2,090.00","131.00","6.69","10.13","208.87","2,287.25","1,452.00","10.56","-1.75"
"ITC","167.90","182.75","167.00","177.50","11.10","6.67","628.68","1,100.88","310.00","134.60","-40.42","-9.11"
"OIL","82.25","85.60","80.25","84.50","5.25","6.62","27.05","22.39","189.70","63.50","-53.95","-16.91"
..........
..........

import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}

def main(url):
    r = requests.get(url, headers=headers).json()
    x = []
    for item in r['data']:
        df = pd.DataFrame.from_dict([item])
        x.append(df)
    new = pd.concat(x, ignore_index=True)
    print(new)
    new.to_csv("Data.csv")

main("https://www1.nseindia.com/live_market/dynaContent/live_watch/stock_watch/foSecStockWatch.json")
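A shorter variant is possible because the "data" key already holds a list of flat records; below is a sketch assuming that structure and the same URL. Note that str(response) only gives the response repr, which is why the original code wrote '<Response [200]>' to the file.

import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www1.nseindia.com/live_market/dynaContent/live_watch/stock_watch/foSecStockWatch.json'

# .json() parses the response body into a dict; str(response) would only give '<Response [200]>'
payload = requests.get(url, headers=headers).json()

# a list of dicts maps directly to rows of a DataFrame
pd.DataFrame(payload['data']).to_csv('Data.csv', index=False)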


Append data into data frame

I get the error ValueError: All arrays must be of the same length. How can I solve this? I have tried many approaches but cannot get past it; my arrays are not the same length.
import enum
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break
req_json = json.loads(raw_json)

# with open("text_json.json", "w") as file:
#     x = json.dump(req_json, file, indent=4)

temp = req_json
name = []
specs = []
title = temp['product']['prodBean']['name']
name.append(title)
item = temp['specifications']['MARKETING']
for i in item:
    try:
        get = i['value']
    except:
        pass
    specs.append(get)
temp = {'title': name, 'Specification': specs}
df = pd.DataFrame(temp)
print(df)
While the error is quite clear, the question and expected result are not.
The way you try to create the DataFrame has to deal with missing rows; that's why the error comes up. To fix this, you could create the DataFrame from a dict:
pd.DataFrame.from_dict(temp, orient='index')
But that looks pretty ugly and could not be processed well later on, so an alternative would be:
data = [{
    'title': temp['product']['prodBean']['name'],
    'specs': ','.join([s.get('value') for s in temp['specifications']['MARKETING']])
}]
pd.DataFrame(data)
or the following if you would like each spec in a new row:
data = {
    'title': temp['product']['prodBean']['name'],
    'specs': [s.get('value') for s in temp['specifications']['MARKETING']]
}
pd.DataFrame.from_dict(data)
Example
import enum
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break
temp = json.loads(raw_json)

data = [{
    'title': temp['product']['prodBean']['name'],
    'specs': ','.join([s.get('value') for s in temp['specifications']['MARKETING']])
}]
pd.DataFrame(data)
Output

   title                                                    specs
0  OTR Trailer Air Hose and Electric Cable Assembly, 15'   Spiral wound,Includes hang collar,One bundle for easy management

Iterate append json and save as dataframe in Python

I want to iterate and extract tables from the link here, then concatenate or append them and save the result as a dataframe.
I have used a loop to iterate over the tables, but I'm not sure how to append all the JSON or dataframes into one.
Could anyone help? Thank you.
from requests import post
import json
import pandas as pd
import numpy as np

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Referer": "http://zjj.sz.gov.cn/projreg/public/jgys/jgysList.jsp"}

dfs = []
#dfs = pd.DataFrame()
for page in range(0, 5):
    data = {"limit": 100, "offset": page * 100, "pageNumber": page + 1}
    json_arr = requests.post("http://zjj.sz.gov.cn/projreg/public/jgys/webService/getJgysLogList.json", headers = headers, data = data).text
    d = json.loads(json_arr)
    df = pd.read_json(json.dumps(d['rows']), orient='list')
Reference related: Iterate and extract tables from web saving as excel file in Python
Use concat,
import requests
import json
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'http://zjj.sz.gov.cn/projreg/public/jgys/jgysList.jsp'
}

dfs = pd.DataFrame()
for page in range(0, 5):
    data = {'limit': 100, 'offset': page * 100, 'pageNumber': page + 1}
    json_arr = requests.post(
        'http://zjj.sz.gov.cn/projreg/public/jgys/webService/getJgysLogList.json',
        headers=headers,
        data=data).text
    d = json.loads(json_arr)
    df = pd.read_json(json.dumps(d['rows']), orient='list')
    dfs = pd.concat([df, dfs], sort=False)
Or,
import requests
import json
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'http://zjj.sz.gov.cn/projreg/public/jgys/jgysList.jsp'
}

dfs = []
for page in range(0, 5):
    data = {'limit': 100, 'offset': page * 100, 'pageNumber': page + 1}
    json_arr = requests.post(
        'http://zjj.sz.gov.cn/projreg/public/jgys/webService/getJgysLogList.json',
        headers=headers,
        data=data).text
    d = json.loads(json_arr)
    dfs.append(pd.read_json(json.dumps(d['rows']), orient='list'))

df = pd.concat(dfs, sort=False)
PS: The second block is much preferred, as you should never call DataFrame.append or pd.concat inside a for-loop; it leads to quadratic copying. Thanks @parfait!
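If the combined frame should also be written out, a minimal follow-up sketch (the file name here is only an example):

# persist the concatenated result once, after the loop has finished
df.to_csv('all_pages.csv', index=False)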

Web scrape and save to a specific JSON in Python, bs4

I have the following Python code:
import requests
import json
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; nl-nl; SAMSUNG GT-I9505 Build/JSS15J) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.5 Chrome/28.0.1500.94 Mobile Safari/537.36'}

chapter = 0
while chapter < 3:
    url = 'http://www.komikgue.com/manga/one-piece/{chapter}/'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    mangas = soup.find_all('img', class_="img-responsive")
    chapter += 1

    def get_manga_details(manga):
        src = manga.find('img', class_="img-responsive").find("img")["src"]
        alt = manga.find('img', class_="img-responsive").find("img")["alt"]
        return {
            "chapter": chapter,
            "src": src, "alt": alt
        }

    all_mangas = [get_manga_details(manga) for manga in mangas]

with open("manga.json", "w") as write_file:
    json.dump(all_mangas, write_file)
print("Success")
This code runs in cmd but produces empty output.
What is wrong? Please teach me.
I want it to be:
{
"chapter": "number": 1[
{
"src": "here", "alt" : "here",
"src": "here", "alt" : "here"
}]
}
Please guide me
There are a lot of things wrong with your code. First, the URL you are trying to access returns a 404; you need to rjust the chapter number with leading zeroes. Second, your logic and loops don't make much sense, like defining your function and lists inside the loop, then expecting the output to contain all the chapters. Moreover, you're calling BeautifulSoup's find function again inside your function, which is not needed; you can directly access the attributes.
See my code below; it works on my machine.
import requests
import json
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; nl-nl; SAMSUNG GT-I9505 Build/JSS15J) AppleWebKit/537.36 (KHTML, like Gecko) Version/1.5 Chrome/28.0.1500.94 Mobile Safari/537.36'}

chapter = 1
allmangas = []

def get_manga_details(manga, i):
    print(manga)
    src = manga["src"]
    alt = manga["alt"]
    return {
        "number": i,
        "src": src, "alt": alt
    }

while chapter < 3:
    url = 'http://www.komikgue.com/manga/one-piece/' + str(chapter).rjust(3, '0')
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    mangas = soup.find_all('img', class_="img-responsive")
    print(mangas)
    allmangas.append({'chapter': chapter, 'data': [get_manga_details(manga, i) for i, manga in enumerate(mangas[:-1])]})
    chapter += 1

with open("manga.json", "w") as write_file:
    json.dump(allmangas, write_file)
print("Success")

Dealing with hexadecimal values in BeautifulSoup response?

I am using beautiful soup to scrape some data with:
url = "https://www.transfermarkt.co.uk/jorge-molina/profil/spieler/94447"
heads = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
response = requests.get(url,headers=heads)
soup = BeautifulSoup(response.text, "lxml")
Then, I extract a particular piece of information with:
height = soup.find_all("th", string=re.compile("Height:"))[0].findNext("td").text
print(height)
which works as intended, printing
1,74 m
but when I try to evaluate that string with this function:
def format_height(height_string):
    return int(height_string.split(" ")[0].replace(',',''))
I get the following error:
format_height(height)
Traceback (most recent call last):
File "get_player_info.py", line 73, in <module>
player_info = get_player_info(url)
File "get_player_info.py", line 39, in get_player_info
format_height(height)
File "/Users/kompella/Documents/la-segunda/util.py", line 49, in format_height
return int(height_string.split(" ")[0].replace(',',''))
ValueError: invalid literal for int() with base 10: '174\xa0m'
I am wondering how I should evaluate the hexadecimal values I am getting?
Use an [attribute=value] selector to target the height instead, then use the function as is:
import requests
from bs4 import BeautifulSoup as bs

def format_height(height_string):
    return int(height_string.split(" ")[0].replace(',',''))

r = requests.get('https://www.transfermarkt.co.uk/jorge-molina/profil/spieler/94447', headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
height_string = soup.select_one('[itemprop=height]').text
print(format_height(height_string))
Everything is perfectly fine; just deconstruct the values and you can do whatever you want afterwards.
import requests
import re
from bs4 import BeautifulSoup
url = "https://www.transfermarkt.co.uk/jorge-molina/profil/spieler/94447"
heads = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
response = requests.get(url,headers=heads)
soup = BeautifulSoup(response.text, "lxml")
height = soup.find_all("th", string=re.compile("Height:"))[0].findNext("td").text
numerals = [int(s) for s in re.findall(r'\b\d+\b', height)]
print (numerals)
#output: [1, 74]
print ("Height is: " + str(numerals[0]) +"."+ str(numerals[1]) +"m")
#output: Height is: 1.74m
print ("Height is: " + str(numerals[0]) + str(numerals[1]) +"cm")
#output: Height is: 174cm
Anyway, the same question was discussed in this thread. You may take a look:
ValueError: invalid literal for int() with base 10: ''
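For completeness: the \xa0 in the traceback is a non-breaking space (U+00A0), not a hexadecimal value, so another option is to normalize the string and keep the original function. A minimal sketch, assuming the same height string scraped above:

def format_height(height_string):
    # swap the non-breaking space (\xa0) for a regular space before splitting
    cleaned = height_string.replace('\xa0', ' ')
    return int(cleaned.split(' ')[0].replace(',', ''))

print(format_height('1,74\xa0m'))  # 174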

Assigning BeautifulSoup indexed values (HTML links and text) to a pandas HTML DataFrame

The following code retrieves images and HTML links from a webpage and stores the values in a BeautifulSoup index. I am now using pandas to create an output HTML table for those images and links. I have managed to populate cells manually by calling on a specific index value, but I can't seem to find a way to add each indexed image and HTML text to the pandas dataframe so that all the indexed values are displayed in the table. How could I do this?
from bs4 import BeautifulSoup
import requests
import numpy as np
from pandas import *
import pandas as pd

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 500)
from IPython.display import HTML

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"
# add header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
####################################

title_clean = soup.find('title')
print(title_clean)

image_links = [x['data-img'] for x in soup.find_all('a', rel='popover')]
for link in image_links:
    print(link)

image_links_0 = image_links[0]
print(image_links_0)

mytags = []
tags = soup.find_all('td', width='41%')
for tag in tags:
    image_text = tag.find('h5').text
    mytags.append(image_text)
    print(image_text)

for i in range(len(mytags)):
    mytags[i]

mytags_0 = mytags[0]
image_links_0 = image_links[0]

#df = DataFrame({'foo1' : 'test',
df = DataFrame({'foo1' : '<img src="' + image_links_0 + '"/><p>' + mytags_0 + '</p>',
                'foo2' : '' + mytags_0 + '',
                'foo3' : mytags_0,
                'foo4' : np.random.randn(2)})
print(df)
HTML(df.to_html('filename.html', escape=False))
print(tag)
This is the correct way to do it.
If you need any help with storing it and making HTML out of it, I'll be happy to provide a solution for that as well. Take care!
Update: everything is included: comments, scraping, writing to a file, and creating tags with BeautifulSoup.
from bs4 import BeautifulSoup
import requests

urldes = "https://www.johnpyeauctions.co.uk/lot_list.asp?saleid=4729&siteid=1"
# add header
mozila_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
headers = {'User-Agent': mozila_agent}

r = requests.get(urldes, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
############################################################

the_whole_table = soup.find('table', width='97%')

datalist = []
for tr in the_whole_table.find_all('tr')[1:]:
    # you want to start from the 1st item not the 0th so [1:]
    # because the first is the thead i.e. Lot no, Picture, Lot Title...
    index_num = tr.find('td', width='8%')
    picture_link = index_num.next_sibling.a['data-img']
    text_info = tr.find('td', width='41%')
    current_bid = tr.find('td', width='13%')
    time_left = tr.find('td', width='19%')
    datalist.append([index_num.text, picture_link,
                     text_info.text, current_bid.text, time_left.text])
    # for pic do ... print(picture_link) as for partial text only first 20
    # characters

df = ['Index Number', 'Picture', 'Informational text',
      'Current BID', 'Time Left now']

theads = BeautifulSoup('<table border="1"></table>', 'lxml')
thekeys = BeautifulSoup('<thead></thead>', 'html.parser')
for i in df:
    tag = theads.new_tag('th')
    tag.append(i)
    thekeys.thead.append(tag)
theads.table.append(thekeys)
###############################################################
# The code above will initiate a table
# after that the for loop will create and populate the first row (thead)

for i in datalist:
    thedata = BeautifulSoup('<tr></tr>', 'html.parser')
    # we loop through the data we collected
    for j in i:
        if j.startswith('https'):
            img_tag = theads.new_tag('img', src=j, height='50', width='50')
            td_tag = theads.new_tag('td')
            td_tag.append(img_tag)
            thedata.tr.append(td_tag)
        else:
            tag = theads.new_tag('td')
            tag.append(j)
            thedata.tr.append(tag)
    theads.table.append(thedata)

with open('asdf.html', 'w+') as f:
    f.write(theads.prettify())
# each of these, if you print them, will give you information that you can store
# we use `.prettify()` as we can't write a BeautifulSoup object into a file.
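If the goal is still a pandas DataFrame as in the question, the scraped datalist can also be fed straight into pandas and rendered with to_html(escape=False); a sketch along those lines (the column names and output file name are only illustrative):

import pandas as pd

# build a DataFrame from the rows scraped above
df = pd.DataFrame(datalist, columns=['Index Number', 'Picture', 'Informational text',
                                     'Current BID', 'Time Left now'])

# turn each image URL into an <img> tag so it renders inside the HTML table
df['Picture'] = df['Picture'].apply(lambda url: f'<img src="{url}" height="50" width="50"/>')

# escape=False keeps the <img> markup intact in the output file
df.to_html('lots.html', escape=False, index=False)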