Django webscraping JSONDecodeError - json

I'm trying to scrape data and it works fine if the {fplid} for url is like 30 for example. How do I fix this method, so it gets the user input and gets the data from the url without a decode error. This is the traceback
'''
C:\Users\krish\OneDrive\Desktop\FPLHangout\scrape\views.py, line 31, in home
data = get_html_content(fplid) …
Local vars
C:\Users\krish\OneDrive\Desktop\FPLHangout\scrape\views.py, line 9, in get_html_content
managerdata = json.loads(r.text)
def get_html_content(fplid):
url = 'https://fantasy.premierleague.com/api/entry/{fplid}/event/30/picks/'
r = requests.get(url)
managerdata = json.loads(r.text)
bootstrap = 'https://fantasy.premierleague.com/api/bootstrap-static/'
bootstrapdata = requests.get(bootstrap)
bootstrapjson = json.loads(bootstrapdata.text)
for pick in managerdata['picks']:
pick = (pick['element']) #correct id
location = 0
for player in bootstrapjson['elements']:
if player.get('id') == pick:
break
location += 1
#position = (pick['position'])
firstname = bootstrapjson['elements'][location]['first_name']
secondname = bootstrapjson['elements'][location]['second_name']
return firstname + " " + secondname
def home(request):
if 'fplid' in request.GET: #
fplid = request.GET.get('fplid')
data = get_html_content(fplid)
return render(request, 'scrape/home.html', {'fpldata': data})
return render(request, 'scrape/home.html')

Related

Selenium MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError

class gmarket_sales():
def __init__(self):
chrome_driver = Service(ChromeDriverManager().install())
options = Options()
options.add_experimental_option('detach',True)
options.add_experimental_option('excludeSwitches',['enable-logging'])
# options.add_argument('--headless')
# options.add_argument('--window-size = x, y')
# options.add_argument('--start-maximazed')
# options.add_argument('--start-fullscreen')
# options.add_argument('--mute-audio')
self.driver = webdriver.Chrome(options=options,service=chrome_driver)
self.now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S (%a)')
self.hour = datetime.datetime.now().strftime('%H시_%M_분')
self.today = date.today()
self.folder = None
self.today_file = None
self.kakao_talk = kakao()
def connect(self):
url = 'http://minishop.gmarket.co.kr/meritblog'
# url = 'http://minishop.gmarket.co.kr/hanvitis'
self.driver.get(url)
return url
def shopping_mall(self):
mall_name = self.driver.find_element(By.CSS_SELECTOR,'a.shop_title_ui_txt').text
self.folder = f'./메리트몰_데이터베이스/지마켓'
self.today_file = f'{self.today}_{mall_name}_지마켓.json'
return mall_name
def soup(self,url_param):
try:
response = requests.get(url_param)
if response.status_code == 200:
sp = BeautifulSoup(response.text, 'html.parser')
return sp
except requests.packages.urllib3.exceptions.MaxRetryError as e:
print(str(e))
def total_product(self):
total_items = 0
products = self.driver.find_element(By.ID,'ulCategory').find_elements(By.CSS_SELECTOR,'span.data_num')
for product in products:
items = int(product.text.replace('(',"").replace(')',""))
total_items += items
# 391개
return total_items
def paging(self,total_items,url):
page_list = []
# 전체상품보기 클릭
self.driver.execute_script('arguments[0].click();',self.driver.find_element(By.CSS_SELECTOR,'.allList_view > a'))
time.sleep(2)
# 한 페이지의 상품 수
view_limit = int(self.driver.find_element(By.CSS_SELECTOR,'div.limit').text.replace("개씩",""))
# 페이지 수 구하기
if total_items % view_limit == 0:
page = total_items // view_limit
else:
page = total_items // view_limit + 1
# 페이지 리스트
for cnt in range(page):
page_url = f'{url}/List?CategoryType=General&SortType=FocusRank&DisplayType=SmallImage&Page={cnt+1}&PageSize=60'
page_list.append(page_url)
# self.driver.quit()
return page_list
def data_one(self,page_list):
"""상품 url 리스트
정상가/할인가/할인율 딕셔너리"""
url_list = []
price_dic = {}
for page in page_list:
html = self.soup(page)
for items in html.find('ul',class_='type2').find_all('li'):
# url
item_url = items.find('a')['href']
# 상품코드
item_code = item_url[-10:]
# 가격 및 할인율
if items.find('p',class_='prd_price').find('span',class_='del_important'):
original_price = items.find('p',class_='prd_price').find('span',class_='del_important').text.replace("원","")
discount_price = items.find('p',class_='prd_price').find('strong').text.replace("원","")
sale_rate = items.find('p',class_='prd_price').find('span',class_='splt_ico usr_clr').text
else:
original_price = items.find('p',class_='prd_price').find('strong').text.replace("원","")
discount_price = "없음"
sale_rate = "없음"
url_list.append(item_url)
price_dic[item_code]={"정상가":original_price,"할인가":discount_price,"할인율":sale_rate}
time.sleep(randint(1,10))
self.driver.quit()
return url_list , price_dic
def check_start(self):
url = self.connect()
mall_name = self.shopping_mall()
total_items = self.total_product()
page_list = self.paging(total_items,url)
url_list,price_dic = self.data_one(page_list)
if __name__ == "__main__":
g_market = gmarket_sales()
# g_market.check_start()
schedule.every().hour.at(":20").do(g_market.check_start)
while True:
schedule.run_pending()
time.sleep(1)
Hello, I am a student practicing web page crawling.
I'm making a code that scrapes data by parsing a website with selenium.
I wrote the program so that it runs at regular intervals using the final schedule module.
However, if chrome_driver is initialized in the init of the class and the driver.quit() command is put in the execution process, the MaxRetryError(_pool, url, error or ResponseError(cause)) urllib3.exceptions.MaxRetryError code is displayed when the second code is executed..
Below is the code I wrote.
I would really appreciate it if you could point out any problems.

I have a record template, it is creating well and when I open the edit template what I do is create a new record

I have the saveremission method, where the form that is in HTML creates it perfectly and loads it to the DB.
def guardarremision(request):
if request.method == 'POST':
fecharemi = request.POST['fecharemi']
fechaenvio = request.POST['fechaenvio']
clienteremi = request.POST['clienteremi']
enviadoa = request.POST['enviadoa']
fecharecibido = request.POST['fecharecibido']
telefonoremi = request.POST['telefonoremi']
formaenvio = request.POST['formaenvio']
statusremi = request.POST['statusremi']
preparadaremi = request.POST['preparadaremi']
guiaremi = request.POST['guiaremi']
remisiones = Remisiones(
fecha = fecharemi,
fecha_envio = fechaenvio,
clientes_id = clienteremi,
contacto = enviadoa,
fecha_recibido = fecharecibido,
telefono = telefonoremi,
id_transporte_id = formaenvio,
id_status = statusremi,
elaborado = preparadaremi,
guia = guiaremi
)
remisiones.save()
return HttpResponse("Remision Creada")
else:
return HttpResponse("Remision no puede ser creada")
When I go to edit I have the following method, I have another template with a form that brings all the information through the ID, but when I save the edition it creates a new record.
def editar_remision(request, id):
remision = Remisiones.objects.get(pk=id)
return render(request, "editar_remision.html", {
'remision' : remision
})
I try to upload the edit template but I get an error.
Instead of save, use update method
# fetch the queryset containing the object that has to be edited.
qs = Remisiones.objects.filter(id = id)
qs.update(
fecha = fecharemi,
fecha_envio = fechaenvio,
clientes_id = clienteremi,
contacto = enviadoa,
fecha_recibido = fecharecibido,
telefono = telefonoremi,
id_transporte_id = formaenvio,
id_status = statusremi,
elaborado = preparadaremi,
guia = guiaremi
)
Or
# fetch the object related to the passed id
obj = get_object_or_404(Remisiones, id = id)
obj.fecha = fecharemi,
obj.fecha_envio = fechaenvio,
obj.clientes_id = clienteremi,
obj.contacto = enviadoa,
obj.fecha_recibido = fecharecibido,
obj.telefono = telefonoremi,
obj.id_transporte_id = formaenvio,
obj.id_status = statusremi,
obj.elaborado = preparadaremi,
obj.guia = guiaremi
obj.save()
Or If you are using the model form, pass the instance into the form
# fetch the object related to the passed id
obj = get_object_or_404(Remisiones, id = id)
# pass the object as an instance in the form
form = RemisionesForm(request.POST or None, instance = obj)
# save the data from the form and redirect to detail_view
if form.is_valid():
form.save()
return HttpResponse("Referencia actualizada")

Javascript Base64 Decoding to Invalid JSON Output

I'm using a Firebase Cloud function to receive a JSON payload in an http request from the App Store (server-to-server notifications) that contains a Base64 encoded string. I'm decoding using:
const latestReceipt = request.body.latest_receipt ? Buffer.from(request.body.latest_receipt, 'base64').toString() : null;
This is works fine in another function to decode a message coming from the Google Play Store. The decoded string coming from Apple, however, is an invalid JSON object.
{
"original-purchase-date-pst" = "2019-09-20 16:40:20 America/Los_Angeles";
"quantity" = "1";
"subscription-group-identifier" = "****";
"unique-vendor-identifier" = "****";
"original-purchase-date-ms" = "1569022820000";
"expires-date-formatted" = "2019-09-24 02:55:47 Etc/GMT";
"is-in-intro-offer-period" = "false";
"purchase-date-ms" = "1569293447000";
"expires-date-formatted-pst" = "2019-09-23 19:55:47 America/Los_Angeles";
"is-trial-period" = "false";
"item-id" = "1478806339";
"unique-identifier" = "******";
"original-transaction-id" = "1000000570864027";
"expires-date" = "1569293747000";
"transaction-id" = "1000000571530192";
"bvrs" = "11";
"web-order-line-item-id" = "1000000047099385";
"version-external-identifier" = "0";
"bid" = "****";
"product-id" = "storage_increase";
"purchase-date" = "2019-09-24 02:50:47 Etc/GMT";
"purchase-date-pst" = "2019-09-23 19:50:47 America/Los_Angeles";
"original-purchase-date" = "2019-09-20 23:40:20 Etc/GMT";
}
I'm assuming that I'm doing something wrong, but I'm not sure what exactly. I wouldn't expect the App Store to send invalid JSON.

How do I get the next pagination 'href'?

So I am having trouble obtaining the href link for the next pages of the url. I got up to obtaining all the text and what not that the tag contains but I can't seem to wrap my head around to removing the text that I don't need and just obtaining the href and navigating through the pages.
Here is my code:
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
pages = prettify.find_all('div', {'class':'pagination'})
for next_page in pages:
next_page.find_all('a')
nextpages.append(next_page)
print(next_page)
all_next_pages()
Here is a way to get the links of the search result items. Find row result class and then find a tag, it contains all the information you need.
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.text
prettify = BeautifulSoup(rcontent, "lxml")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
pages = prettify.find_all('div', {'class':' row result'})
for next_page in pages:
info = next_page.find('a')
url = info.get('href')
title = info.get('title')
print(title,url)
all_next_pages()

simplejson.scanner.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

I have been writing a simple plugin for hackerrank to compile and run code, of a problem on hackerrank from my system.
For ex: I need to test code for this problem https://www.hackerrank.com/challenges/solve-me-first
So, I ran my script like:
python hackerrank.py https://www.hackerrank.com/challenges/solve-me-first solve-me-first.cpp
I get following output:
Traceback (most recent call last):
File "hackerrank.py", line 126, in <module>
h.run()
File "hackerrank.py", line 113, in run
if self.compile_and_test() == "NOT_FOUND":
File "hackerrank.py", line 51, in compile_and_test
j = self.r.json()
File "/usr/lib/python2.7/dist-packages/requests/models.py", line 741, in json
return json.loads(self.text, **kwargs)
File "/usr/lib/python2.7/dist-packages/simplejson/__init__.py", line 488, in loads
return _default_decoder.decode(s)
File "/usr/lib/python2.7/dist-packages/simplejson/decoder.py", line 370, in decode
obj, end = self.raw_decode(s)
File "/usr/lib/python2.7/dist-packages/simplejson/decoder.py", line 389, in raw_decode
return self.scan_once(s, idx=_w(s, idx).end())
simplejson.scanner.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
Here is my hackerrank.py file:
import requests
import time
import sys
import os.path
class HackerRank:
def __init__(self,url,code="",ext=".py"):
self.code = code
self.ext = ext
self.set_language()
self.problem_url = url
self.s = requests.session()
self.set_post_url()
def set_post_url(self):
#rootUrl = "https://www.hackerrank.com/"
#l = self.problem_url.split("/")
#print l
#if l[3] == "challenges":
# contestUrl = "/contests/master/challenges/" + l[4]
#else:
# # it's a contest
# contestUrl = "/".join(l[3:])
#self.post_url = rootUrl + "/rest/" + contestUrl + "/compile_tests/"
self.post_url = self.problem_url
def set_language(self):
if self.ext == ".py":
self.language = "python"
if self.ext == ".c":
self.language = "c"
if self.ext == ".cpp":
self.language = "cpp"
if self.ext == ".java":
self.language = "java"
# TODO : find out the language value for other exts
def set_code(self,code):
self.code = code
def generate_payload(self):
self.payload = {'code' : self.code, 'language' : self.language}
def compile_and_test(self):
self.generate_payload()
self.r = self.s.post(self.post_url, params=self.payload)
if self.r.status_code == 404:
print "not found 1"
return "NOT_FOUND"
print "yes"
j = self.r.json()
print j
self.submission_id = j['model']['id']
self.get_url = self.post_url + "/submissions/code/" + str(self.submission_id)
self.rr = self.s.get(self.get_url, cookies = self.s.cookies)
return self.rr
def fetch(self,last_status):
if self.r.status_code == 404:
return
self.rr = self.s.get(self.get_url, cookies = self.s.cookies)
self.res = self.rr.json()
if self.res['model']['status'] == 0:
new_status = self.res['model']['status_string']
if new_status != last_status:
print(new_status)
time.sleep(1)
self.fetch(new_status)
else:
return
def compiler_message(self):
return self.res['model']['compilemessage']
def testcase_message(self):
return self.res['model']['testcase_message']
def expected_output(self):
return self.res['model']['expected_output']
def stdin(self):
return self.res['model']['stdin']
def stdout(self):
return self.res['model']['stdout']
def dump(self):
cm = self.compiler_message()
tm = self.testcase_message()
eo = self.expected_output()
stdin = self.stdin()
stdout = self.stdout()
s = ""
for i in range(len(tm)):
s += (cm + "\n\n")
s += ("Testcase# " + str(i) + "\n")
s += ("Sample Input:\n\n")
s += (stdin[i])
s += ("\n\n")
s += ("Your Output:\n\n")
s += (stdout[i])
s += ("\n\n")
s += ("Expected Output:\n\n")
s += (eo[i])
s += ("\n\n")
s += ("Compiler Message:\n\n")
s += (tm[i])
s += ("\n\n")
print(s)
return s
def run(self):
if self.compile_and_test() == "NOT_FOUND":
print "not found"
return "404 : NOT_FOUND"
self.fetch("")
return self.dump()
if __name__=="__main__":
url = sys.argv[1]
codefile = sys.argv[2]
print url,codefile
ext = os.path.splitext(codefile)[1]
code = open(codefile).read()
h = HackerRank(url,code,ext)
h.run()
I am newbie to json and plugins. Can you help me out.