Building a proxy rotator with a specific URL and script

I am struggling to build a proxy rotator from existing code that was structured for a different URL.
The URLs I want are provided in the code example below. I am trying to have the provided script call the desired URLs and get ALL the 'IP:PORT' entries (the current script limits itself to ten) when the proxy type is "HTTPS".
It can be done with XPath or bs4, though I am more familiar with bs4.
I understand the logic but I am failing at how to structure this.
To start, I've tried stripping strings and calling specific td elements, but it's not working.
#URLs I want
url_list = ['http://spys.one/free-proxy-list/US/','http://spys.one/free-proxy-list/US/1/']
#code I have
from lxml.html import fromstring
import requests
from itertools import cycle
import traceback

def get_proxies():
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    proxies = set()
    for i in parser.xpath('//tbody/tr')[:10]:
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    return proxies

proxies = get_proxies()
proxy_pool = cycle(proxies)
proxy = next(proxy_pool)
response = requests.get(url, proxies={"http": proxy, "https": proxy})
I hope to learn how to structure the provided code for the 2 desired URLs and return all IP:PORT numbers where the proxy type is HTTPS.

One way is to issue port-specific POST requests in a loop. You could amend this to collect everything into one final list. The endpoint is already HTTPS-specific.
import re
import requests
from bs4 import BeautifulSoup as bs

def get_proxies(number, port, p):
    # the loop index is posted as xf4 so each request filters to a single port
    r = requests.post('http://spys.one/en/https-ssl-proxy/', data={'xpp': 5, 'xf4': number})
    proxies = [':'.join([str(i), port]) for i in p.findall(r.text)]
    return proxies

ports = ['3128', '8080', '80']
p = re.compile(r'spy14>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})<script')
proxies = []

for number, port in enumerate(ports, 1):
    proxies += get_proxies(number, port, p)
print(proxies)
Example results: (output omitted)
For country-specific results:
import re
import requests
from bs4 import BeautifulSoup as bs

def get_proxies(number, port, p, country):
    r = requests.post('http://spys.one/en/https-ssl-proxy/', data={'xpp': 5, 'xf4': number})
    soup = bs(r.content, 'lxml')
    # keep only rows whose country cell matches, then pull the IP from the cell holding the script
    proxies = [':'.join([p.findall(i.text)[0], port]) for i in soup.select('table table tr:has(.spy14:contains("' + country + '")) td:has(script) .spy14')]
    return proxies

ports = ['3128', '8080', '80']
p = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})document')
proxies = []

for number, port in enumerate(ports, 1):
    proxies += get_proxies(number, port, p, 'United States')
print(proxies)
For the one you said is already written, I will refer to my original answer:
from bs4 import BeautifulSoup as bs
import requests

def get_proxies():
    r = requests.get('https://free-proxy-list.net/')
    soup = bs(r.content, 'lxml')
    # rows whose cell with class hx (the Https column) says "yes"; the first td is the IP, the second the port
    proxies = {tr.td.text + ':' + tr.td.next_sibling.text for tr in soup.select('tr:has(.hx:contains(yes))')}
    return proxies

get_proxies()
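To plug this back into the rotator from the question, a minimal sketch of the rotation step (the test URL, the loop count, and the error handling are my assumptions, not part of the original code):

from itertools import cycle
import requests

proxies = get_proxies()      # the full set of HTTPS-capable IP:PORT strings, no ten-row limit
proxy_pool = cycle(proxies)  # rotate through them indefinitely

for _ in range(5):
    proxy = next(proxy_pool)
    try:
        # httpbin.org/ip simply echoes the requesting IP, which makes it a handy test target
        r = requests.get('https://httpbin.org/ip',
                         proxies={'http': proxy, 'https': proxy},
                         timeout=5)
        print(proxy, r.json())
    except requests.RequestException:
        # free proxies die frequently; skip to the next one
        print(proxy, 'failed, rotating to the next proxy')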

Related

Can we retrieve a value using XPath which is not part of page.content?

These are some of the approaches I tried to extract the "Latest Value" from the URL below, but somehow I could not get its text.
from requests_html import HTMLSession
session = HTMLSession()
url = "http://orewallets.com/#/contract"
response = session.get(url)
print(response.text)
meta_desc = response.html.xpath('//div[@class="num fn-20 increace"]', first=True).text
print(meta_desc)
I also tried with Selenium:
from selenium import webdriver
from lxml import html
url = "http://orewallets.com/#/contract"
xpath = '/html/body/div/main/div/div[1]/div[2]/div[2]/div/div[3]/div[1]/div[2]/text()'
browser = webdriver.Chrome()
browser.get(url)
html_source = browser.page_source
tree = html.fromstring(html_source)
text = tree.xpath(xpath)
print(text)
Can we get a value that is updated dynamically by JavaScript and is not part of the page source?
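For reference, a sketch of one possible approach (not part of the original post): use an explicit wait for the element to become visible before reading its text, reusing the class name from the question. Whether the value actually appears in that element once the JavaScript runs is an untested assumption.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('http://orewallets.com/#/contract')
# wait up to 10 seconds for the JavaScript-rendered value to appear
element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(
        (By.XPATH, '//div[@class="num fn-20 increace"]')))
print(element.text)
driver.quit()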

beautiful soup findall not returning results

I am trying to web scrape and get the insurance dollar amount listed in the page's HTML (the relevant element is labelled "Insurance").
I used the code below, but it is not fetching anything. Can someone help? I am fairly new to Python...
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.kbb.com/ford/escape/2017/s/?vehicleid=415933&intent=buy-new')
html_soup = BeautifulSoup(r.content, 'lxml')
test2 = html_soup.find_all('div',attrs={"class":"col-base-6"})
print(test2)
Not all the data you see on the page comes from the response to the GET request to this URL. There are a lot of other requests the browser makes in the background, initiated by JavaScript code.
Specifically, the request for the insurance data is made to this URL:
https://www.kbb.com/vehicles/hub/_costtoown/?vehicleid=415933
Here is working code for what you need:
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.kbb.com/vehicles/hub/_costtoown/?vehicleid=415933')
html_soup = BeautifulSoup(r.text, 'html.parser')
# the dollar amount sits in the element immediately following the "Insurance" label
Insurance = html_soup.find('div', string="Insurance").find_next().text
print(Insurance)

How to isolate a part of HTML page in Python 3

I made a simple script to retrieve the source code of a page, but I'd like to "isolate" the IP addresses so that I can save them to a proxy.txt file. Any suggestions?
import urllib.request
sourcecode = urllib.request.urlopen("https://www.inforge.net/xi/threads/dichvusocks-us-15h10-pm-update-24-24-good-socks.455588/")
sourcecode = str(sourcecode.read())
out_file = open("proxy.txt","w")
out_file.write(sourcecode)
out_file.close()
I've added a couple of lines to your code; the only problem is that the UI version number (check the page source) is also picked up as an IP address.
import urllib.request
import re

sourcecode = urllib.request.urlopen("https://www.inforge.net/xi/threads/dichvusocks-us-15h10-pm-update-24-24-good-socks.455588/")
sourcecode = str(sourcecode.read())

out_file = open("proxy.txt", "w")
out_file.write(sourcecode)
out_file.close()

with open('proxy.txt') as fp:
    for line in fp:
        ip = re.findall(r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', line)
        for addr in ip:
            print(addr)
UPDATE:
This is what you are looking for. BeautifulSoup can extract only the data we need from the page using CSS classes; however, it needs to be installed with pip. You don't need to save the page to a file.
from bs4 import BeautifulSoup
import urllib.request
import re

url = urllib.request.urlopen('https://www.inforge.net/xi/threads/dichvusocks-us-15h10-pm-update-24-24-good-socks.455588/').read()
soup = BeautifulSoup(url, "html.parser")
# Searching by the CSS class name
msg_content = soup.find_all("div", class_="messageContent")
ips = re.findall(r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', str(msg_content))
for addr in ips:
    print(addr)
Why don't you just use re?
I'd need to see the page source to say exactly how.

Entering Value into Search Bar and Downloading Output from Webpage

I'm trying to search a webpage (http://www.phillyhistory.org/historicstreets/). I think the relevant source HTML is this:
<input name="txtStreetName" type="text" id="txtStreetName">
You can see the rest of the source HTML at the website. I want to go into that text box, enter a street name, and download the output (i.e. enter 'Jefferson' in the search box of the page and see the historic street names containing Jefferson). I have tried using requests.post, and tried typing ?get=Jefferson in the URL to test if that works, with no luck. Does anyone have any ideas how to get this page? Thanks,
Cameron
Code that I currently tried (some imports are unused, as I plan to parse etc.):
import requests
from bs4 import BeautifulSoup
import csv
from string import ascii_lowercase
import codecs
import os.path
import time

arrayofstreets = []
arrayofstreets = ['Jefferson']
for each in arrayofstreets:
    url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
    payload = {'txtStreetName': each}
    r = requests.post(url, data=payload).content
    outfile = "raw/" + each + ".html"
    with open(outfile, "w") as code:
        code.write(r)
    time.sleep(2)
This did not work and only gave me the default webpage (i.e. Jefferson was not entered in the search bar and no results were retrieved).
I'm guessing your reference to 'requests.post' relates to the requests module for python.
As you have not specified what you want to scrape from the search results, I will simply give you a snippet to get the HTML for a given search query:
import requests

query = 'Jefferson'
url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
post_data = {'txtStreetName': query}
html_result = requests.post(url, data=post_data).content
print(html_result)
If you need to further process the HTML to extract some data, I suggest you use the Beautiful Soup module to do so, for example as sketched below.
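A rough sketch of that post-processing (the assumption that the results come back as an HTML table is mine and would need checking against the real response):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_result, 'lxml')
# print the text of every table row in the result page;
# the exact markup of the results table is an assumption here
for row in soup.find_all('tr'):
    cells = [td.get_text(strip=True) for td in row.find_all('td')]
    if cells:
        print(cells)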
UPDATED VERSION:
#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
import csv
from string import ascii_lowercase
import codecs
import os.path
import time

def get_post_data(html_soup, query):
    # ASP.NET pages require the hidden __VIEWSTATE and __EVENTVALIDATION
    # fields to be echoed back along with the visible form fields
    view_state = html_soup.find('input', {'name': '__VIEWSTATE'})['value']
    event_validation = html_soup.find('input', {'name': '__EVENTVALIDATION'})['value']
    btn_search = 'Find'
    return {'__VIEWSTATE': view_state,
            '__EVENTVALIDATION': event_validation,
            'Textbox1': '',
            'txtStreetName': query,
            'btnSearch': btn_search
            }

arrayofstreets = ['Jefferson']
url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
html = requests.get(url).content

for each in arrayofstreets:
    payload = get_post_data(BeautifulSoup(html, 'lxml'), each)
    r = requests.post(url, data=payload).content
    outfile = "raw/" + each + ".html"
    with open(outfile, "wb") as code:  # "wb" because .content is bytes
        code.write(r)
    time.sleep(2)
The problem in my/your first version was that we weren't posting all the required parameters. To find out what you need to send, open the network monitor in your browser (Ctrl+Shift+Q in Firefox) and perform the search as you normally would. If you select the POST request in the network log, on the right you should see a 'Parameters' tab showing the POST parameters your browser sent. A quick way to cross-check this from Python is sketched below.
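As a cross-check, one could also dump the hidden form fields with BeautifulSoup (a sketch, assuming the page serves them as plain input elements of type "hidden", which ASP.NET WebForms pages normally do):

import requests
from bs4 import BeautifulSoup

url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
soup = BeautifulSoup(requests.get(url).content, 'lxml')
# list every hidden input on the form so you can confirm which
# fields have to be echoed back in the POST data
for hidden in soup.find_all('input', type='hidden'):
    print(hidden.get('name'), '=', (hidden.get('value') or '')[:40])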

Web scraper software for dynamic pages [duplicate]

I am trying to use Python 3 to return the BibTeX citation generated by http://www.doi2bib.org/. The URLs are predictable, so the script can work out the URL without having to interact with the web page. I have tried using selenium, bs4, etc. but can't get the text inside the box.
url = "http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9"
import urllib.request
from bs4 import BeautifulSoup
text = BeautifulSoup(urllib.request.urlopen(url).read())
print(text)
Can anyone suggest a way of returning the BibTeX citation as a string (or whatever) in Python?
You don't need BeautifulSoup here. There is an additional XHR request sent to the server to fill in the BibTeX citation; simulate it, for example, with requests:
import requests

bibtex_id = '10.1007/s00425-007-0544-9'
url = "http://www.doi2bib.org/#/doi/{id}".format(id=bibtex_id)
xhr_url = 'http://www.doi2bib.org/doi2bib'

with requests.Session() as session:
    session.get(url)
    response = session.get(xhr_url, params={'id': bibtex_id})
    print(response.content)
Prints:
#article{Burgert_2007,
doi = {10.1007/s00425-007-0544-9},
url = {http://dx.doi.org/10.1007/s00425-007-0544-9},
year = 2007,
month = {jun},
publisher = {Springer Science $\mathplus$ Business Media},
volume = {226},
number = {4},
pages = {981--987},
author = {Ingo Burgert and Michaela Eder and Notburga Gierlinger and Peter Fratzl},
title = {Tensile and compressive stresses in tracheids are induced by swelling based on geometrical constraints of the wood cell},
journal = {Planta}
}
You can also solve it with selenium. The key trick here is to use an Explicit Wait to wait for the citation to become visible:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.doi2bib.org/#/doi/10.1007/s00425-007-0544-9')
element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//pre[@ng-show="bib"]')))
print(element.text)
driver.close()
Prints the same as the above solution.