So, I am trying to get the location text in the profile of a given Twitter account
handles = ['IndieWire' , 'AFP', 'UN']
for x in handles:
url= "https://twitter.com/" + x
try:
html = req.get(url)
except Exception as e:
print(f"Failed to fetch page for url {url} due to: {e}")
continue
soup = BeautifulSoup(html.text,'html.parser')
try:
label = soup.find('span',{'class':"ProfileHeaderCard-locationText"})
label_formatted = label.string.lstrip()
label_formatted = label_formatted.rstrip()
if label_formatted != "":
location_list.append(label_formatted)
print(x + ' : ' + label_formatted)
else:
location_list.append(label_formatted)
print(x + ' : ' + 'Not found')
except AttributeError:
try:
label2 = soup.findAll('span',{"class":"ProfileHeaderCard-locationText"})[0].get_text()
label2 = str(label2)
label2_formatted = label2.lstrip()
label2_formatted = label2_formatted.rstrip()
location_list.append(label_formatted)
print(x + ' : ' + label2_formatted)
except:
print(x + ' : ' + 'Not found')
except:
print(x + ' : ' + 'Not found')
This code used to work when I used it a few months ago. I changed it a little bit now after checking the Twitter page source but I still cant get the locations. Hope you can help
Use mobile version of Twitter to get location.
For example:
import requests
from bs4 import BeautifulSoup
handles = ['IndieWire' , 'AFP', 'UN']
ref = 'https://twitter.com/{h}'
headers = {'Referer': '',}
url = 'https://mobile.twitter.com/i/nojs_router?path=/{h}'
for h in handles:
headers['Referer'] = ref.format(h=h)
soup = BeautifulSoup( requests.post(url.format(h=h), headers=headers).content, 'html.parser' )
loc = soup.select_one('.location')
if loc:
print(h, loc.text)
else:
print(h, 'Not Found')
Prints:
IndieWire New York, NY
AFP France
UN New York, NY
Related
I am trying to set up an automated alert emailing system for a project and I can format the email properly when it's plain text but when I try to add variables it won't format properly. I am pulling data from a JSON file and I can pull the data I just can't combine it with the email inside the triple quotes (""").
Here is the plain text
message = """\
Subject: AMLD Alert
Something is wrong."""
But when I add in my variables it will not fill in the body, but the subject will.
message = 'Subject: AMLD Alert for ' + d[0]['Project Name'] + ' Project'
'Car: ' + d[0]['CarID'] +' Driven by: ' + d[0]['DriverID'] + ' is sending alert '+ d[0]['Message']
Here is the entire script if it will help.
import json
import smtplib
import ssl
import socket
socket.getaddrinfo('localhost', 8080)
#Webiste use to help me build script
#https://realpython.com/python-send-email/
port = 587 # For starttls
smtp_server = "smtp.outlook.com"
sender_email = "MyWorkOutlookEmail" # Enter your address
password = 'Password'
receiver_email = "MyPersonalGmail" # Enter receiver address
context = ssl.create_default_context()
#Start of ITTT Code
with open('C:/Python/Messaging/Mes_V1.json') as f:
d = json.load(f)
if d[0]['Alert'] == "High":
print('Sending high alert email...\n')
message = 'Subject: AMLD Alert for ' + d[0]['Project Name'] + ' Project'
'Car: ' + d[0]['CarID'] +' Driven by: ' + d[0]['DriverID'] + ' is sending alert '+ d[0]['Message']
elif d[0]['Alert'] == "Medium":
print('Sending medium alert email...\n')
else:
print('Sending low alert email...\n')
context = ssl.create_default_context()
with smtplib.SMTP(smtp_server, port) as server:
print('Connecting to Server...\n')
server.starttls(context=context)
print('Logging in...\n')
server.login(sender_email, password)
server.sendmail(sender_email, receiver_email, message)
print('Email sent.\n')
I'm not sure if Outlook is affecting it in any way or if I need to format the message differently, any help would be very appreciated!
I'm trying to scrape images from a website. In the website's html code, the srcset sections exist and are of the form
srcset="https://...."
For example,
srcset="https://secure.img1-fg.wfcdn.com/im/80458162/resize-h300-w300%5Ecompr-r85/1068/106844956/Derry+84%2522+Reversible+Sectional.jpg 300w,https://secure.img1-fg.wfcdn.com/im/19496430/resize-h400-w400%5Ecompr-r85/1068/106844956/Derry+84%2522+Reversible+Sectional.jpg 400w,https://secure.img1-fg.wfcdn.com/im/75516274/resize-h500-w500%5Ecompr-r85/1068/106844956/Derry+84%2522+Reversible+Sectional.jpg 500w"
However, when I try to get these srcset link using selenium and beautiful soup, I get the following:
""
Moreover, every time the srcset fails to get a valid link, the string that it gets is always
""
I tried a bunch of different lines of code, but haven't had success with any of it. Here is the full code I currently have:
def get_info_from_product_link(product_link): #get the price and correctly filtered image link
info = dict()
driver = webdriver.Chrome('C:/Users/Brian/Downloads/chromedriver_win32/chromedriver.exe')
driver.implicitly_wait(200)
try:
driver.get(product_link)
soup = BeautifulSoup(driver.page_source, 'html.parser')
time.sleep(60)
image_carousel = soup.find_all('li', {"class" : "ProductDetailImageCarouselVariantB-carouselItem"})
print("Number of images in gallery: ", len(image_carousel))
#deal with captcha
while len(image_carousel) <= 0:
print("CAPTCHA ENCOUNTERED. FIX")
soup = BeautifulSoup(driver.page_source, 'html.parser')
image_carousel = soup.find_all('li', {"class" : "ProductDetailImageCarouselVariantB-carouselItem"})
time.sleep(30)
valid_image_links = []
highest_resolution_images = []
#get correct image links
#i = 1
for image_block in image_carousel:
try:
#print("image_block:")
#print(image_block)
#print("Image: ", i)
#i += 1
images = image_block.find('div', {"class" : "ImageComponent ImageComponent--overlay"})
#image_links = images.find('img').get_attribute('srcset').split(',')
print(images)
#driver.implicitly_wait(60)
#wait = WebDriverWait(images, 30)
#image_links = wait.until(EC.visibility_of_element_located((By.tagName, "img"))).get_attribute("srcset").split(',')
#image_links = wait.until(EC.text_to_be_present_in_element_value((By.tagName, 'img'), "https")).get_attribute("srcset").split(',')
#image_links = wait.until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, "img [srcset*='https']"), "https")).get_attribute("srcset").split(',')
#image_links = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "img[src*='https']"))).get_attribute("src").split(',')
images.implicitly_wait(30)
image_links = images.find_element_by_tag_name('img').get_attribute('srcset').split(',')
#"div[class='ajax_enabled'] [style='display:block']"
#image_links = images.find('img')['srcset'].split(',')
#print("Image links:")
#print(image_links)
#print("Number of links: ", len(image_links))
for link in image_links:
print(link)
for image_info in image_links:
image_link = image_info.split(" ")[0]
try:
if hasValidBackground(image_link) and hasValidSize(image_link):
valid_image_links.append(image_link)
else:
print("Invalid image size or background")
except:
print('ERROR when reading image: ' + image_link)
continue
if len(valid_image_links) > 0:
highest_resolution_images.append(valid_image_links[-1])
valid_image_links.clear()
except:
print("Error. Invalid image link.")
pass
#extract one link to a correctly filtered image
if len(highest_resolution_images) <= 0:
return -1
valid_image_link = highest_resolution_images[0];
info['img_url'] = valid_image_link
#get price information
standard_price_block = soup.find('div', {"class" : "StandardPriceBlock"})
base_price_block = standard_price_block.find('div', {"class" : "BasePriceBlock BasePriceBlock--highlight"})
if base_price_block is None:
base_price_block = standard_price_block.find('div', {"class" : "BasePriceBlock"})
base_price = base_price_block.find('span').text
#price_block = soup.find('span', {"class" : "notranslate"})
#base_price = standard_price_block.find('span').text
info['price'] = base_price
print(base_price)
#print(f"Image link: {image_link}\n")
#print(f"Link to product: {product_link}\n")
driver.close()
#browser.switch_to.window(browser.window_handles[0])
return info
except TimeoutException as e:
print("Page Load Timeout Occurred. Quitting...")
driver.close()
I was testing using this website:
https://www.wayfair.com/furniture/pdp/foundstone-derry-84-reversible-sectional-w001832490.html
My goal is to process each image in the image gallery/carousel and find one that has white background and has valid size of height >= 80 and width >= 80
I'm just starting to learn web scraping, so any help would be much appreciated!!
I was able to take a text file, read each line, create a dictionary per line, update(append) each line and store the json file. The issue is when reading the json file it will not read correctly. the error point to a storing file issue?
The text file looks like:
84.txt; Frankenstein, or the Modern Prometheus; Mary Wollstonecraft (Godwin) Shelley
98.txt; A Tale of Two Cities; Charles Dickens
...
import json
import re
path = "C:\\...\\data\\"
books = {}
books_json = {}
final_book_json ={}
file = open(path + 'books\\set_of_books.txt', 'r')
json_list = file.readlines()
open(path + 'books\\books_json.json', 'w').close() # used to clean each test
json_create = []
i = 0
for line in json_list:
line = line.replace('#', '')
line = line.replace('.txt','')
line = line.replace('\n','')
line = line.split(';', 4)
BookNumber = line[0]
BookTitle = line[1]
AuthorName = line[-1]
file
if BookNumber == ' 2701':
BookNumber = line[0]
BookTitle1 = line[1]
BookTitle2 = line[2]
AuthorName = line[3]
BookTitle = BookTitle1 + ';' + BookTitle2 # needed to combine title into one to fit dict format
books = json.dumps( {'AuthorName': AuthorName, 'BookNumber': BookNumber, 'BookTitle': BookTitle})
books_json = json.loads(books)
final_book_json.update(books_json)
with open(path + 'books\\books_json.json', 'a'
) as out_put:
json.dump(books_json, out_put)
with open(path + 'books\\books_json.json', 'r'
) as out_put:
'books\\books_json.json', 'r')]
print(json.load(out_put))
The reported error is: JSONDecodeError: Extra data: line 1 column 133
(char 132) - adding this is right between the first "}{". Not sure
how json should look in a flat-file format? The output file as seen on
an editor looks like: {"AuthorName": " Mary Wollstonecraft (Godwin)
Shelley", "BookNumber": " 84", "BookTitle": " Frankenstein, or the
Modern Prometheus"}{"AuthorName": " Charles Dickens", "BookNumber": "
98", "BookTitle": " A Tale of Two Cities"}...
I ended up changing the approach and used pandas to read the text and then spliting the single-cell input.
books = pd.read_csv(path + 'books\\set_of_books.txt', sep='\t', names =('r','t', 'a') )
#print(books.head(10))
# Function to clean the 'raw(r)' inoput data
def clean_line(cell):
...
return cell
books['r'] = books['r'].apply(clean_line)
books = books['r'].str.split(';', expand=True)
I am trying to build script which will download images using Python. I am having a problem in the code as the keyerror in Python v 2.7.11 is occurring. My code is
import urllib, urllib2, demjson, os
json = demjson.JSON()
def read_newbooks_file(path):
data = open(path)
isbnlist = []
for isbn in data.readlines():
isbnlist.append(isbn.replace("\n",""))
return isbnlist
isbns = read_newbooks_file("C:\\newbooks.txt")
print isbns
for isbn in isbns:
url="http://openlibrary.org/api/search?q={%22query%22:%22(isbn_10:(" + isbn + ")%20OR%20%20isbn_13:(" + isbn + "))%22}"
response=urllib.urlopen(url)
book=json.decode(response.read())
if book["result"]!=[]:
results = book["result"]
print results
url = "http://openlibrary.org/api/get?key=" + results[0]
OLResult=urllib.urlopen(url)
data=demjson.decode(OLResult.read())
print data
imgurl = 'http://covers.openlibrary.org/b/olid/' + results[0][3:] + '-M.jpg'
imgfile = urllib.urlretrieve(imgurl, "C:\\" + isbn + ".jpg")
fsize = os.path.getsize(imgfile[0])
if fsize < long(1000):
os.remove("C:\\" + isbn + ".jpg")
gparams = urllib.urlencode({'bibkeys': isbn, 'jscmd':'viewapi','callback':'gcallback'})
opener = urllib2.build_opener(urllib2.HTTPHandler())
request = urllib2.Request('http://books.google.com/books?%s' % gparams)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3')]
g = opener.open(request).read()
print g
if g != "gcallback({});":
g = g[10:-2]
gbookinfo=demjson.decode(g)
if gbookinfo[isbn].has_key("info_url"):
print "GB info url: " + gbookinfo[isbn]["info_url"]
if gbookinfo[isbn].has_key("thumbnail_url"):
print "GB thumbnail url: " + gbookinfo[isbn]["thumbnail_url"]
opener = urllib2.build_opener(urllib2.HTTPHandler())
request = urllib2.Request(gbookinfo[isbn]["thumbnail_url"])
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3')]
picfile = open("C:\\" + isbn + "-g.jpg", "w+b")
picfile.write(opener.open(request).read())
Error I am getting is
Traceback (most recent call last):
File "C:/Python25/first.py", line 16, in <module>
if book["result"]!=[]:
KeyError: 'result'
When book is empty, or if the key you are trying to check for a value isn't there, you'll get a key error.
You can use in to check if a key exists within a dict
For example, if book is empty, or missing your key, you can use something like:
if 'result' in book:
book['result'] = ..... # Do something
This way, you know that the key exists and that your book['result '] is a valid statement that won't raise a key error. Also, it will help you debug if for example your book dictionary is empty or doesn't have your desired key.
Hope that helps
There was an earlier question on this, but the asker was just overwriting their output and solved their own problem.
I'm using a subprocess.popen to read video information and write the output to a json. It works fine on MOST videos, but on others is returning an empty string on others - even though it runs fine from the command line. I tried it several times and am getting the data fine through the command line.
Here's the relevant part of the script:
out_prj.write('[')
for m, i in enumerate(files):
print i
out_prj.write('{"$type":"BatchProcessor.Job, BatchProcessor","Id":0,"Ver":1.02,"CurrentTask":0,"IsSelected":true,"TaskList":[')
f_name = os.path.basename(i[0])
f_json = out_folder + os.sep + "06_Output" + os.sep + os.path.basename(i[0]).split(".")[0] + ".json"
trans_f = out_folder + os.sep + "04_Video" + os.sep + os.path.basename(i[0]).split(".")[0] + "-tr.ts"
trans_f_out = out_folder + os.sep + "06_Output" + os.sep + os.path.basename(i[0]).split(".")[0] + "-tr-out.ts"
ffprobe = 'ffprobe.exe'
command = [ffprobe, '-v', 'quiet', '-print_format', 'json', '-show_format', '-show_streams', i[0]]
p = sp.Popen(command, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
out, err = p.communicate()
io = cStringIO.StringIO(out)
info = json.load(io)
print info
filea = open(f_json, 'w')
filea.write(json.dumps(info))
filea.close()
f = open(f_json)
b = json.load(f)
print b
#########################
###################
f_format = str(b['streams'][0]['codec_long_name'])
Your code ignores error messages (err variable). print err or don't redirect stderr to see them.
Unrelated: the json handling in your code is insane: most operations are redundant.
To save output of the subprocess to a file:
import os
from subprocess import check_call
f_json = os.path.join(out_folder, "06_Output",
os.path.splitext(f_name)[0] + ".json")
with open(f_json, 'wb', 0) as file:
check_call(command, stdout=file)
Note: shell=True is not necessary here. If subprocess can't find ffprobe.exe then specify the full path e.g. (use the path appropriate for your system):
ffprobe = r'C:\Program Files\Real\RealPlayer\RPDS\Tools\ffmpeg\ffprobe.exe'
Note: r'' -- a raw string literal is used to avoid doubling the backslashes.