Getting blank file trying to Import data with Beautifulsoup into csv - csv

Don't know what is going on. It was working perfectly with the other codes importing to csv file. But somehow in this one doesn't pass the data. It creates the file but is blank.
If anyone could give a tip would be appreciated. Quite possible that is just a simple solution because I'm a newbie.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import re
import csv
filename = "test.csv"
f = open(filename, "a")
headers = "location, country, type, level, deep, vision, water,
access, life kind \n"
f.write(headers)
my_url = "http://www.example.com"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
links = page_soup.select('sea > a[href]')
link = [tag.get('href') for tag in links]
for url in link:
Client = uReq(url)
pageHtml = Client.read()
Client.close()
pSoup = soup(pageHtml, "html.parser")
linkeas = pSoup.findAll(href=re.compile(my_url))
def linkas(href):
return href and re.compile("html").search(href) and re.compile(my_url).search(href)
linka = pSoup.findAll(href=linkas)
if linka != []:
linkia = [tag.get('href') for tag in linka]
for curl in linkia:
cClient = uReq(curl)
pageHtml = cClient.read()
cClient.close()
Soup = soup(pageHtml, "html.parser")
info = Soup.select('.val')
if info != None:
location = Soup.select('.last')
if location[0].string != 'Page 2':
country = Soup.select('.cru > a:nth-of-type(3)')
countri = country[0].text.strip()
print(countri)
locat = location[0].text.strip()
print(locat)
tipo = info[0].text.strip()
print(tipo)
expe = info[1].text.strip()
print(expe)
depth = info[2].text.strip()
print(depth)
avg = info[3].text.strip()
print(avg)
cur = info[4].text.strip()
print(tipo)
acc = info[5].text.strip()
print(acc)
life = info[6].text.strip()
print(life)
f.write(locat.replace(",", " |") + "," + countri.replace(",", " |") + "," + tipo.replace(",", " |") + "," + expe.replace(",", " |") + "," + depth.replace(",", " |") + "," + avg.replace(",", " |") + "," + cur.replace(",", " |") + "," + acc.replace(",", " |") + "," + life.replace(",", " |") + "\n")
continue
else:
continue

Related

Django webscraping JSONDecodeError

I'm trying to scrape data and it works fine if the {fplid} for url is like 30 for example. How do I fix this method, so it gets the user input and gets the data from the url without a decode error. This is the traceback
'''
C:\Users\krish\OneDrive\Desktop\FPLHangout\scrape\views.py, line 31, in home
data = get_html_content(fplid) …
Local vars
C:\Users\krish\OneDrive\Desktop\FPLHangout\scrape\views.py, line 9, in get_html_content
managerdata = json.loads(r.text)
def get_html_content(fplid):
url = 'https://fantasy.premierleague.com/api/entry/{fplid}/event/30/picks/'
r = requests.get(url)
managerdata = json.loads(r.text)
bootstrap = 'https://fantasy.premierleague.com/api/bootstrap-static/'
bootstrapdata = requests.get(bootstrap)
bootstrapjson = json.loads(bootstrapdata.text)
for pick in managerdata['picks']:
pick = (pick['element']) #correct id
location = 0
for player in bootstrapjson['elements']:
if player.get('id') == pick:
break
location += 1
#position = (pick['position'])
firstname = bootstrapjson['elements'][location]['first_name']
secondname = bootstrapjson['elements'][location]['second_name']
return firstname + " " + secondname
def home(request):
if 'fplid' in request.GET: #
fplid = request.GET.get('fplid')
data = get_html_content(fplid)
return render(request, 'scrape/home.html', {'fpldata': data})
return render(request, 'scrape/home.html')

Why is this element of an html generated in the wrong order?

I'm trying to replace the element lienconj by new_label with the following code
import requests
session = requests.Session()
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
r = session.get('https://www.larousse.fr/dictionnaires/francais/aimer/1925', headers = headers)
soup = BeautifulSoup(r.content, 'html.parser')
temp2 = [tag.attrs['href'] for tag in soup.select('.lienconj')]
for i in range(len(temp2)):
new_label_1 = '<label for = "' + str(i) + '">' + '<div class="boxed">Conjugaison</div>' + '</label>'
new_label_2 = '<input id = "' + str(i) + '" type = "checkbox" class = "trigger">'
new_label = new_label_1 + new_label_2
soup.select('.lienconj')[0].replace_with(BeautifulSoup(new_label))
format = open(r'E:\\test.html', 'w+', encoding = 'utf8')
format.write(str(soup.select_one('.wrapper')))
format.close()
It's clear from new_label_1 that <div class="boxed">Conjugaison</div> is inside <label for and </label>. Could you please explain why the resulted html does not has this correct order?
Here is the excerpt from the resulted html test.html:
<p class="CatgramDefinition">verbe transitif <label for="10"></label></p>

How do I get the next pagination 'href'?

So I am having trouble obtaining the href link for the next pages of the url. I got up to obtaining all the text and what not that the tag contains but I can't seem to wrap my head around to removing the text that I don't need and just obtaining the href and navigating through the pages.
Here is my code:
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
pages = prettify.find_all('div', {'class':'pagination'})
for next_page in pages:
next_page.find_all('a')
nextpages.append(next_page)
print(next_page)
all_next_pages()
Here is a way to get the links of the search result items. Find row result class and then find a tag, it contains all the information you need.
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.text
prettify = BeautifulSoup(rcontent, "lxml")
filter_words = ['engineering', 'instrumentation', 'QA']
all_job_url = []
nextpages = []
filtered_job_links = []
http_flinks = []
flinks = []
def all_next_pages():
pages = prettify.find_all('div', {'class':' row result'})
for next_page in pages:
info = next_page.find('a')
url = info.get('href')
title = info.get('title')
print(title,url)
all_next_pages()

Converting html to Json object

I'm currently working on a project where I need to convert some older code into a json object. We're taking the result set from a sql query and returning the categories it gives back as json. I'm not that well versed in javascript let alone json so I'm not sure what's the simplest way to go about this. Here is the function I need to change into JSON:
function createOutputCategories(){
try
{
output =
"<html>" +
"<head>" +
"<title>" +
"You can find it!" +
"</title>" +
"</head>" +
"<body bgcolor='#CED3F3'>" +
"<a href='" + url + "file.xsjs?parent=1'>" +
"</a>" +
"<br><br>";
if(parent === "1"){
output = output + "<h3><font color='#AAAAAA'>Home</font>";
}else{
output = output +"<a href='javascript:history.back()'>" +
"<h3>Back";
}
output = output +
"</h3>" +
"</a>" +
"<h1>" +
"Categories:" +
"</h1>";
while(rs.next()){
if(rs.getString(3) === 0 || rs.getString(3) === null || rs.getString(3) === undefined || rs.getString(3) === "0" ){
output = output + "<br><a href='" + url + "yeti.xsjs?parent=" + rs.getString(1) + "'>" + rs.getString(2) + "</a>";
}else{
output = output + "<br><a href='" + url + "yeti.xsjs?parent=" + rs.getString(1) + "'>" + rs.getString(3) + "</a>";
}
}
}catch(Exception){
$.response.contentType = "text/plain";
$.response.setBody( "Failed to retreive data" );
$.response.status = $.net.http.INTERNAL_SERVER_ERROR;
}
Here is what I have so far but I am not returning a valid JSON object:
function createOutputCategories(){
try{
output =
"category: {name = \"" + parent + "\"; description = \"\"}";
output = output +
"subcategories: [ ";
while(rs.next()){
output = output +
"{ catid = \"" + rs.getString(1) + "\"; catname = \"" + rs.getString(2) + "\"; altname = \"" + rs.getString(3) + "\"; description = \"" + rs.getString(4) + "\"}";
}
output = output +
"];";
}
catch(Exception){
$.response.contentType = "text/plain";
$.response.setBody( "Failed to retreive data" );
$.response.status = $.net.http.INTERNAL_SERVER_ERROR;
}
If I need to provide anything else please let me know! Thanks!
Do you want to output a javascript object to a string?
Construct the object:
var category=new Object();
category.name="Name";
category.description="My lovely description";
category.subcategories=[];
var subCat=new Object();
subCat.catid=1;
subCat.catname="My subcat";
category.subcategories.push(subCat);
Alternatively, you could construct the object using literals:
var category={
name:"Name",
description:"My lovely description",
subcategories:[
{catid:1,catname:"My subcat"}
]
};
Then return the object as string.
return JSON.stringify(category);
A reference to Javascript objects if you need more help:
http://www.w3schools.com/js/js_objects.asp

dict[obj] pulling wrong info at 35-36 xml node why?

I'm not sure how to explain this, but why isn't this code pulling the correct image and icon file? It continues to pull the wrong info at the line 35, which makes all other data one node behind. Even when I delete the three locations below, it still gets the xml wrong. Why would it be doing this?
I verified the xml data and made sure all the files were in the correct folder.
For example, this is a result from a trace:
pm#: 35 xmlnodename: Causey's Pharmacy iconFL: http://www.mysite.com/folder1/folder2/media/marker.png imgFL: http://www.mysite.com/folder1/folder2/media/century21_img.swf
pm#: 36 xmlnodename: Century 21 Property Shoppe iconFL: http://www.mysite.com/folder1/folder2/media/century21_icon.gif imgFL: http://www.mysite.com/folder1/folder2/media/century21_img.swf
pm#: 37 xmlnodename: Century 21 Property Shoppe iconFL: http://www.mysite.com/folder1/folder2/media/century21_icon.gif imgFL: http://www.mysite.com/folder1/folder2/media/type1.swf
Here is my xml at lines 34-36
<location>
<name>Bobby&apos;s Pharmacy</name>
<street>123 Steet St.</street>
<city>SomeCity</city>
<state>ZZ</state>
<zip>12345</zip>
<lat>45.7520099</lat>
<long>-80.0816701</long>
<iconFile>marker.png</iconFile>
<imageFile>type1.swf</imageFile>
<motion>no</motion>
<featured>no</featured>
<category>none</category>
</location>
<location>
<name>Century 21 Property Shoppe</name>
<street>456 SomeOther St</street>
<city>SomeCity</city>
<state>ZZ</state>
<zip>12345</zip>
<lat>45.5683603</lat>
<long>-80.483271</long>
<iconFile>century21_icon.gif</iconFile>
<imageFile>century21_img.swf</imageFile>
<motion>no</motion>
<featured>yes</featured>
<category>none</category>
</location>
<location>
<name>Century 21 Property Shoppe</name>
<street>none</street>
<city>none</city>
<state>none</state>
<zip>none</zip>
<lat>45.4949689</lat>
<long>-80.6597955</long>
<iconFile>century21_icon.gif</iconFile>
<imageFile>century21_img.swf</imageFile>
<motion>no</motion>
<featured>yes</featured>
<category>none</category>
</location>
Here is my flex code:
private var mediaLoc:String = "http://www.mysite.com/folder1/folder2/media/";
public function addMarkers():void
{
var dict:Object = new Object();
var i:Number = 0;
var e:Number = locXML..location.length()+1;
trace("add markers: " + locXML..location.length());
for(i;i<e;i++){
if (locXML.location.name[i.toString()]== "mapLogo"){
trace(i + " logo");
//get lat and long from xml
dict["locPointMarker_lat" + i.toString()] = locXML.location.lat[i.toString()];
dict["locPointMarker_long" + i.toString()] = locXML.location.long[i.toString()];
//get iconfile name from xml and append it to mediaLoc--filepath
dict["iconFileLink" + i.toString()] = mediaLoc + locXML.location.iconFile[i.toString()];
//create a new url request using the dict["iconFileLink" + i.toString()] filepath
dict["iconFileReq" + i.toString()] = new URLRequest(dict["iconFileLink"+i.toString()]);
//create a new pointmarker for the map
dict["locPointMarker"+i.toString()] = new PointMarker();
//apply buttonmode and handcursor to the new pointmarker
dict["locPointMarker"+i.toString()].buttonMode = dict["locPointMarker"+i.toString()].useHandCursor = true;
//create a loader to load the icon file
dict["icon"+i.toString()] = new Loader();
//load the icon file
dict["icon"+i.toString()].load(dict["iconFileReq"+i.toString()], context);
//put the marker on the map
_map.putMarker(new Location(locXML.location.lat[i.toString()], locXML.location.long[i.toString()]), dict["locPointMarker"+i.toString()]);
trace("pm#: " + i + " xmlnodename: " + locXML.location.name[i.toString()] + " iconFL: " + dict["iconFileLink" + i.toString()] + " imgFL: " + dict["imageFileLink" + i.toString()]);
dict["locPointMarker"+i.toString()].mouseChildren=false;
}else{
//the following else does the same thing except it also adds the image file
dict["locPointMarker_lat" + i.toString()] = locXML.location.lat[i.toString()];
dict["locPointMarker_long" + i.toString()] = locXML.location.long[i.toString()];
dict["iconFileLink" + i.toString()] = mediaLoc + locXML.location.iconFile[i.toString()];
dict["iconFileReq" + i.toString()] = new URLRequest(dict["iconFileLink"+i.toString()]);
dict["locPointMarker"+i.toString()] = new PointMarker();
dict["locPointMarker"+i.toString()].buttonMode = dict["locPointMarker"+i.toString()].useHandCursor = true;
dict["icon"+i.toString()] = new Loader();
dict["icon"+i.toString()].load(dict["iconFileReq"+i.toString()], context);
dict["icon"+i.toString()].x = -iconHeight/2;
dict["icon"+i.toString()].y = -iconWidth/2;
_map.putMarker(new Location(locXML.location.lat[i.toString()], locXML.location.long[i.toString()]), dict["locPointMarker"+i.toString()]);
trace("pm#: " + i + " xmlnodename: " + locXML.location.name[i.toString()] + " iconFL: " + dict["iconFileLink" + i.toString()] + " imgFL: " + dict["imageFileLink" + i.toString()]);
dict["locPointMarker"+i.toString()].mouseChildren=false;
}
}
}
the object class stores the value hashed by the toString() property of the key. So if two key renders as the same string the values will collide.
Use Dictionary instead.
here some more info: http://livedocs.adobe.com/flash/9.0/ActionScriptLangRefV3/flash/utils/Dictionary.html