Extract data from an unreachable JsonObject() - json

I'm trying to reach a JSON object so I can scrape it.
import requests
from bs4 import BeautifulSoup
url ='https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%20(lite)%203.27.0%3Binstantsearch.js%202.8.0%3BJS%20Helper%202.26.0&x-algolia-application-id=FJRGCWBCBO&x-algolia-api-key=a214a1e7afd822b517723830f05e9449'
jsonObj = requests.get(url).json()
print(jsonObj)
There are four JSON objects behind this URL, but each one seems to be blocked by the API.
Thanks a lot.

The tricky part was figuring out how to feed in the query parameters and the payload, and how the API receives them. But this does the trick:
import requests
import json

url = 'https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
params = {
    'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.8.0;JS Helper 2.26.0',
    'x-algolia-application-id': 'FJRGCWBCBO',
    'x-algolia-api-key': 'a214a1e7afd822b517723830f05e9449'}
data = {"requests":[{"indexName":"CAPI-agents-stage","params":"query=&hitsPerPage=20&maxValuesPerFacet=10&page=0&filters=source.name%3Atremplin%20AND%20NOT%20source.id%3A400%20AND%20NOT%20source.id%3A2662%20AND%20NOT%20source.id%3A3292%20AND%20NOT%20source.id%3A720%20AND%20NOT%20source.id%3A1%20AND%20NOT%20source.id%3A5167%20AND%20NOT%20source.id%3A177%20AND%20NOT%20source.id%3A7907%20AND%20NOT%20source.id%3A4999%20AND%20NOT%20source.id%3A979%20AND%20NOT%20source.id%3A5538%20AND%20NOT%20source.id%3A8062%20AND%20NOT%20source.id%3A7908%20AND%20NOT%20source.id%3A38%20AND%20NOT%20source.id%3A7957%20AND%20NOT%20source.id%3A8084%20AND%20NOT%20source.id%3A8078%20AND%20NOT%20source.id%3A8158%20AND%20NOT%20source.id%3A9298%20AND%20NOT%20source.id%3A9299%20AND%20NOT%20source.id%3A9368%20AND%20NOT%20source.id%3A540%20AND%20NOT%20source.id%3A7905%20AND%20NOT%20source.id%3A10173&facets=%5B%22identity.last_name%22%2C%22geolocation.city%22%2C%22geolocation.postal_code%22%2C%22geolocation.region%22%2C%22geolocation.department%22%2C%22geolocation.country%22%2C%22specialities%22%5D&tagFilters="}]}
jsondata = json.dumps(data)
jsonObj = requests.post(url, data=jsondata, params=params, headers=headers).json()
print(jsonObj['results'][0])
for each in jsonObj['results'][0]['hits']:
    print(each['email_address'])
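If you need more than the first 20 hits from this endpoint, you can page through the results (the browse-based answer further down gets everything in one shot). A rough sketch: it assumes the response carries Algolia's standard nbPages field, and it drops the long filters string for brevity:
page = 0
while True:
    payload = json.dumps({"requests": [{"indexName": "CAPI-agents-stage",
                                        "params": "query=&hitsPerPage=20&page=%d" % page}]})
    result = requests.post(url, data=payload, params=params, headers=headers).json()['results'][0]
    for hit in result['hits']:
        print(hit.get('email_address'))
    page += 1
    if page >= result.get('nbPages', 0):  # stop once all pages are consumed
        break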
Additional:
I tweaked it so that you can alter the query/data parameters more easily.
import requests
import urllib.parse

url = 'https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}
params = {
    'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.8.0;JS Helper 2.26.0',
    'x-algolia-application-id': 'FJRGCWBCBO',
    'x-algolia-api-key': 'a214a1e7afd822b517723830f05e9449'}
# Use a simpler way to input the query parameters
query = {
    "indexName": "CAPI-agents-stage",
    "hitsPerPage": "20",
    "maxValuesPerFacet": "10",
    "page": "0",
    "filters": "source.name:tremplin AND NOT source.id:400 AND NOT source.id:2662 AND NOT source.id:3292 AND NOT source.id:720 AND NOT source.id:1 AND NOT source.id:5167 AND NOT source.id:177 AND NOT source.id:7907 AND NOT source.id:4999 AND NOT source.id:979 AND NOT source.id:5538 AND NOT source.id:8062 AND NOT source.id:7908 AND NOT source.id:38 AND NOT source.id:7957 AND NOT source.id:8084 AND NOT source.id:8078 AND NOT source.id:8158 AND NOT source.id:9298 AND NOT source.id:9299 AND NOT source.id:9368 AND NOT source.id:540 AND NOT source.id:7905 AND NOT source.id:10173",
    "facets": str(["identity.last_name", "geolocation.city", "geolocation.postal_code", "geolocation.region", "geolocation.department", "geolocation.country", "specialities"]),
    "tagFilters": ""}
# Create the string for the data parameter using urllib.parse.quote
data = '''{"requests": [{"indexName": "%s", "params": "query=&hitsPerPage=%s&maxValuesPerFacet=%s&page=%s&filters=%s&facets=%s&tagFilters=%s"}]}''' % (query['indexName'], query['hitsPerPage'], query['maxValuesPerFacet'], query['page'], urllib.parse.quote(query['filters']), urllib.parse.quote(query['facets']), query['tagFilters'])
jsonObj = requests.post(url, data=data, params=params, headers=headers).json()
print(jsonObj['results'][0])
for each in jsonObj['results'][0]['hits']:
    print(each['email_address'])
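One caveat with the str(...) call on the facets list above: it produces single-quoted items, while the Algolia API expects a JSON array with double quotes. If the facet counts come back empty, json.dumps is the safer way to build that value (set it before constructing the data string):
import json
query['facets'] = json.dumps(["identity.last_name", "geolocation.city", "geolocation.postal_code",
                              "geolocation.region", "geolocation.department", "geolocation.country",
                              "specialities"])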

I found another solution to scrape all of the email addresses in one shot.
from algoliasearch import algoliasearch

client = algoliasearch.Client("FJRGCWBCBO", "a214a1e7afd822b517723830f05e9449")
index = client.init_index('CAPI-agents-stage')
for hit in index.browse_all({"query": ""}):
    print(hit['email_address'])
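Note that this uses the old (v1) algoliasearch client. If you have v2+ of the Python client installed, the equivalent call is browse_objects; a sketch, assuming the same credentials and index name still apply:
from algoliasearch.search_client import SearchClient

client = SearchClient.create("FJRGCWBCBO", "a214a1e7afd822b517723830f05e9449")
index = client.init_index('CAPI-agents-stage')
for hit in index.browse_objects({'query': ''}):  # iterates over every record in the index
    print(hit['email_address'])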
Hope it helps someone. Thanks a lot, Chitown88!

Related

Python Request Not Returning Full HTML Source Code

I am trying to retrieve the full HTML source code, but it only returns part of it.
Code:
from bs4 import BeautifulSoup as soup
import requests
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate'}
url = 'https://us.louisvuitton.com/eng-us/products/louis-vuitton-and-nike-air-force-1-by-virgil-abloh-white-white-white-nvprod3690049v/1A9V88'
req = requests.get(url, headers=HEADERS)
page_soup = soup(req.text, "html5lib")
print(page_soup)
If you visit the website and view its source code, you will find it is 181 lines long, but my code only returns 45 lines.
Any idea what I'm doing wrong?
Thank you.
Try doing page_soup.getText() to get the entire text out of the BeautifulSoup object.
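For example, with the variables from the snippet above:
# get_text() is the underscore spelling of the same BeautifulSoup method
print(page_soup.getText())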

Append data into data frame

They show me this error: ValueError: All arrays must be of the same length. How can I solve it? I have tried many approaches but I cannot get rid of it, because my arrays are not the same length. Could anyone kindly give me a solution to this problem?
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break
req_json = json.loads(raw_json)
# with open("text_json.json", "w") as file:
#     x = json.dump(req_json, file, indent=4)
temp = req_json
name = []
specs = []
title = temp['product']['prodBean']['name']
name.append(title)
item = temp['specifications']['MARKETING']
for i in item:
    try:
        get = i['value']
    except:
        pass
    specs.append(get)
temp = {'title': name, 'Specification': specs}
df = pd.DataFrame(temp)
print(df)
While the error is quite clear, the question and expected result are not.
The way you try to create the DataFrame has to deal with missing rows; that's why the error comes up. To fix this you could create the DataFrame from a dict:
pd.DataFrame.from_dict(temp, orient='index')
But that looks pretty ugly and could not be processed well later on, so an alternative would be:
data = [{
    'title': temp['product']['prodBean']['name'],
    'specs': ','.join([s.get('value') for s in temp['specifications']['MARKETING']])
}]
pd.DataFrame(data)
or the following, if you'd like each spec on a new row:
data = {
    'title': temp['product']['prodBean']['name'],
    'specs': [s.get('value') for s in temp['specifications']['MARKETING']]
}
pd.DataFrame.from_dict(data)
Example
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

url = "https://www.fleetpride.com/parts/otr-coiled-air-hose-otr6818"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.3"
}
r = requests.get(url)
soup = BeautifulSoup(r.content, "html5lib")
raw_json = ""
for table_index, table in enumerate(soup.find_all("script")):
    if 'CCRZ.detailData.jsonProductData = {"' in str(table):
        x = str(table).split('CCRZ.detailData.jsonProductData = {"')
        raw_json = "{\"" + str(x[-1]).split('};')[0] + "}"
        break
temp = json.loads(raw_json)
data = [{
    'title': temp['product']['prodBean']['name'],
    'specs': ','.join([s.get('value') for s in temp['specifications']['MARKETING']])
}]
pd.DataFrame(data)
Output
   title                                                   specs
0  OTR Trailer Air Hose and Electric Cable Assembly, 15'   Spiral wound,Includes hang collar,One bundle for easy management

With BeautifulSoup I can't find some elements

I am trying to get what is under "col-12 col-md-3 product-grid-item-container rendered-enhanced" (shown in a screenshot from the website) by running this script:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
r = requests.get('https://www.sneaksup.com/search?q=dunk&pagenumber=1', headers=headers)
soup = BeautifulSoup(r.content, 'lxml')
hrefs = soup.find('div', class_='product-list-inner-container bg-white')
print(hrefs)
but unfortunately the result doesn't include those elements. How can I get all of the info under "col-12 col-md-3 product-grid-item-container rendered-enhanced"? I also tried:
hrefs = soup.find('div', class_='col-12 col-md-3 product-grid-item-container rendered-enhanced')
but that only returned an empty result.
As stated, you can get that data from the API endpoint. Here's how you'd do that:
import requests
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
url = 'https://www.sneaksup.com/search?q=dunk&pagenumber=1&paginationType=20&orderby=0'
jsonData = requests.get(url, headers=headers).json()
df = pd.DataFrame(jsonData['Products'])
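Since the endpoint already exposes pagenumber in its query string, you could extend this to walk the pages. A rough sketch; it assumes the 'Products' list simply comes back empty once you run past the last page:
frames = []
for page in range(1, 11):  # adjust the upper bound as needed
    page_url = 'https://www.sneaksup.com/search?q=dunk&pagenumber=%d&paginationType=20&orderby=0' % page
    products = requests.get(page_url, headers=headers).json().get('Products', [])
    if not products:
        break  # no more results
    frames.append(pd.DataFrame(products))
df = pd.concat(frames, ignore_index=True)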
The data you are looking for might not be visible through that class, but you can manually search for the title of a product and find which script tag contains it:
text = soup.find_all("script")[5].contents[0]
After running the above code, we can extract the data using the re module:
import re
main_data = re.findall(r'\{.*?\}', text)
main_data is returned as a list of dictionary-like strings, from which you can extract whatever data you want.
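As a hedged follow-up: that regex only captures flat (non-nested) {...} spans, so not every match will be valid on its own. One way to keep the usable ones is to attempt json.loads on each candidate:
import json

parsed = []
for candidate in main_data:
    try:
        parsed.append(json.loads(candidate))  # keep candidates that parse as JSON
    except json.JSONDecodeError:
        pass  # skip fragments that are not self-contained JSON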
That URL loads its data from an API endpoint.
You can get everything related to the products from that endpoint.
Here is the endpoint:
https://www.sneaksup.com/search?q=dunk&pagenumber=1&paginationType=20&orderby=0

Iterate append json and save as dataframe in Python

I want to iterate over and extract the tables from the link here, then concatenate or append them to save as a dataframe.
I have used a loop to iterate over the tables, but I'm not sure how I can append all the JSON or dataframes into one.
Could anyone help? Thank you.
import requests
import json
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Referer": "http://zjj.sz.gov.cn/projreg/public/jgys/jgysList.jsp"}
dfs = []
#dfs = pd.DataFrame()
for page in range(0, 5):
    data = {"limit": 100, "offset": page * 100, "pageNumber": page + 1}
    json_arr = requests.post("http://zjj.sz.gov.cn/projreg/public/jgys/webService/getJgysLogList.json", headers=headers, data=data).text
    d = json.loads(json_arr)
    df = pd.read_json(json.dumps(d['rows']), orient='list')
Reference related: Iterate and extract tables from web saving as excel file in Python
Use concat:
import requests
import json
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'http://zjj.sz.gov.cn/projreg/public/jgys/jgysList.jsp'
}
dfs = pd.DataFrame()
for page in range(0, 5):
    data = {'limit': 100, 'offset': page * 100, 'pageNumber': page + 1}
    json_arr = requests.post(
        'http://zjj.sz.gov.cn/projreg/public/jgys/webService/getJgysLogList.json',
        headers=headers,
        data=data).text
    d = json.loads(json_arr)
    df = pd.read_json(json.dumps(d['rows']), orient='list')
    dfs = pd.concat([df, dfs], sort=False)
Or,
import requests
import json
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Referer': 'http://zjj.sz.gov.cn/projreg/public/jgys/jgysList.jsp'
}
dfs = []
for page in range(0, 5):
    data = {'limit': 100, 'offset': page * 100, 'pageNumber': page + 1}
    json_arr = requests.post(
        'http://zjj.sz.gov.cn/projreg/public/jgys/webService/getJgysLogList.json',
        headers=headers,
        data=data).text
    d = json.loads(json_arr)
    dfs.append(pd.read_json(json.dumps(d['rows']), orient='list'))
df = pd.concat(dfs, sort=False)
PS: The second block is much preferred, as you should never call DataFrame.append or pd.concat inside a for loop; it leads to quadratic copying. Thanks @parfait!

Python Scraper - Request Post Function Not Returning Correct Page

I am working on my first website scraper and have run into another issue. Below is my code. The website that is returned is the main page, not the specific page for the parcel number I searched.
Am I using the wrong HTML class to identify the search function? Or is there something missing in the Python code? Any help would be much appreciated.
from bs4 import BeautifulSoup
import requests

web_page = 'https://mcassessor.maricopa.gov/index.php'
web_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
payload = {'homeSearchField': '10218779'}
response = requests.post(web_page, data=payload, headers=web_header)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.prettify())
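One way to check whether the search form really posts to index.php (and whether homeSearchField is the right input name) is to print the form's action and input names from the returned page; a small diagnostic sketch using the soup object above:
form = soup.find('form')
if form is not None:
    print(form.get('action'), form.get('method'))  # where the form actually submits
    for inp in form.find_all('input'):
        print(inp.get('name'))  # the field names the server expects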