Scrapy form request for scraping data - HTML

I am using this code with the URL http://money.moneygram.com.au/forex-tools/currency-converter-widget-part:
from __future__ import absolute_import
#import __init__
#from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.http import Response
from scrapy.selector import HtmlXPathSelector
import MySQLdb

class DmozSpider(CrawlSpider):
    name = "moneygram"
    allowed_domains = ["moneygram.com"]
    start_urls = ["http://money.moneygram.com.au/forex-tools/currency-converter-widget-part"]

    def parse(self, response):
        # yield FormRequest.from_response(response, formname='firstSelector', formdata="FromCurrency=USD&ToCurrency=INR&FromCurrency_dropDown=USD&ToCurrency_dropDown=INR", callback=self.parse1)
        # request_with_cookies = Request(url="http://money.moneygram.com.au",
        #     cookies={'FromCurrency': 'USD', 'ToCurrency': 'INR'}, callback=self.parse1)
        yield FormRequest.from_response(response, formname=None, formnumber=0, formpath=None, formdata="FromCurrency=AED&ToCurrency=VND&FromCurrency_dropDown=AED&ToCurrency_dropDown=VND&FromAmount=2561&ToAmount=&X-Requested-With=XMLHttpRequest", callback=self.parse1)
to send the form data as required, but it is giving this error:
raise ValueError("No <form> element found in %s" % response)
exceptions.ValueError: No <form> element found in <200 http://money.moneygram.com.au/forex-tools/currency-converter-widget-part>
How can I convert from USD to INR?

The page you mentioned is not valid HTML. It looks like an SSI block which should be part of another page.
You get the error because this page uses JavaScript heavily and does not contain a <form> element. FormRequest.from_response tries to build the request from an existing form, but the form is not there.
You should either work with the whole page, or build the FormRequest manually, filling its attributes yourself and submitting it to the URL taken from the whole page.
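For example, here is a minimal sketch of a parse method that builds the request by hand (a drop-in replacement inside the spider above; the exact parameter set the endpoint expects is an assumption you should confirm in your browser's network monitor):

    def parse(self, response):
        # No <form> on the page, so construct the POST request manually.
        # Field names are taken from the question; note that formdata must
        # be a dict, not a query string.
        yield FormRequest(
            url="http://money.moneygram.com.au/forex-tools/currency-converter-widget-part",
            formdata={
                "FromCurrency": "USD",
                "ToCurrency": "INR",
                "FromCurrency_dropDown": "USD",
                "ToCurrency_dropDown": "INR",
                "FromAmount": "100",  # example amount, not from the question
            },
            headers={"X-Requested-With": "XMLHttpRequest"},
            callback=self.parse1,
        )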

Related

Using flask_excel within Blueprints

I have an application that is using Blueprints and I need to be able to generate csv files for download. I found the flask_excel package which works great, but none of the examples I've found use Blueprints. Here is an example application that works:
# app.py
from flask import Flask
import flask_excel as excel

app = Flask(__name__)
excel.init_excel(app)

@app.route("/example", methods=['GET'])
def example():
    return excel.make_response_from_array([[1, 2], [3, 4]], "csv", file_name="example_data")

if __name__ == "__main__":
    app.run()
However, I need something structured like this:
# __init__.py
from flask import Flask
import flask_excel as excel
from download_csv import download_csv_bp

def create_app():
    app = Flask(__name__)
    app.register_blueprint(download_csv_bp)
    excel.init_excel(app)
    return app

app = create_app()
and
# download_csv.py
from flask import Flask, Blueprint

download_csv_bp = Blueprint('download_csv', __name__)

@download_csv_bp.route("/example", methods=['GET'])
def example():
    return excel.make_response_from_array([[1, 2], [3, 4]], "csv", file_name="example_data")
How do I import the flask_excel functions that are initialized in __init__.py? If I put import flask_excel as excel in my download_csv.py file, the response generated just prints the resulting text to the screen.
The solution was simpler than I thought. If I add the line from app import excel to download_csv.py, then it works. My new issue is with AJAX calling this route, but as that is unrelated to my original question, I won't ask it here.
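A minimal sketch of the corrected download_csv.py, assuming the package is importable as app (since __init__.py binds excel before it imports the blueprint module, the circular import resolves):

# download_csv.py -- sketch of the fix described above
from flask import Blueprint
from app import excel  # the flask_excel module bound in __init__.py

download_csv_bp = Blueprint('download_csv', __name__)

@download_csv_bp.route("/example", methods=['GET'])
def example():
    return excel.make_response_from_array([[1, 2], [3, 4]], "csv", file_name="example_data")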

Display SVG image from qrcode in Django

As per the qrcode docs, I generate an SVG QR code:
import qrcode
import qrcode.image.svg

def get_qrcode_svg(uri):
    img = qrcode.make(uri, image_factory=qrcode.image.svg.SvgImage)
    return img
And that's all fine. I get a <qrcode.image.svg.SvgImage object at 0x7f94d84aada0> object.
I'd like to display this in a Django HTML template. I pass this object as context (as qrcode_svg) and try to display it with <img src="{{qrcode_svg}}"/>, but I don't really get anywhere with this. The error shows it's trying to fetch the image URL, but isn't there a way I can do this without saving the image, etc.? Terminal output:
>>> UNKNOWN ?????? 2020-06-16 07:38:28.295038 10.0.2.2 GET
/user/<qrcode.image.svg.SvgImage object at 0x7f94d84aada0>
Not Found: /user/<qrcode.image.svg.SvgImage object at 0x7f94d84aada0>
"GET /user/%3Cqrcode.image.svg.SvgImage%20object%20at%200x7f94d84aada0%3E HTTP/1.1" 404 32447
You can write it to the response stream:
import qrcode
from qrcode.image.svg import SvgImage
from django.http import HttpResponse
from io import BytesIO  # fixed: BytesIO lives in the io module

def get_qrcode_svg(uri):
    stream = BytesIO()
    img = qrcode.make(uri, image_factory=SvgImage)
    img.save(stream)
    return stream.getvalue().decode()
This will pass the SVG source itself, not a URL pointing to it. In the template, you thus render this with:
{{ qrcode_svg|safe }}
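For completeness, a minimal sketch of a view that puts the SVG source into the context (the view and template names here are illustrative, not from the question):

from django.shortcuts import render

def profile(request):
    # get_qrcode_svg() above returns the raw SVG markup as a string.
    context = {'qrcode_svg': get_qrcode_svg('https://example.com/some-uri')}
    return render(request, 'profile.html', context)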
To solve this I transformed the qrcode.image.svg.SvgImage into a base64 string, which can then be used as an img src="{{variable}}" in HTML.
import io
import base64

from qrcode import make as qr_code_make
from qrcode.image.svg import SvgPathFillImage

def get_qr_image_for_user(qr_url: str) -> str:
    svg_image_obj = qr_code_make(qr_url, image_factory=SvgPathFillImage)
    image = io.BytesIO()
    svg_image_obj.save(stream=image)
    base64_image = base64.b64encode(image.getvalue()).decode()
    return 'data:image/svg+xml;base64,' + base64_image
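For example, assuming the view passes the returned string to the template as qr_image (a name chosen here for illustration), the tag would be:

<img src="{{ qr_image }}" alt="QR code"/>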
If this is not a good solution I would love some feedback. Cheers

When I use the following script with Selenium and BeautifulSoup, the text is correctly extracted but the JSON file is always the same

I have created the script below to extract the text from the posts of an Instagram user profile.
It works perfectly to extract the posts, but there is a problem once I start using the scroll function of Selenium: the JSON file does not seem to be updating.
I have created a loop with 2 repetitions for test purposes,
but there seems to be a problem with the line pageSource = driver.page_source.
I am expecting the script to load the new JSON data linked to the new page, but when I test it the pageSource is always the same, even though Selenium is correctly scrolling through the page.
import requests
import urllib.request
import urllib.parse
import urllib.error
import ssl
import json
import time  # was missing in the original snippet

from bs4 import BeautifulSoup
from selenium import webdriver  # was missing in the original snippet

url = ''  # Instagram profile URL

driver = webdriver.Firefox()
driver.get(url)

for n in range(2):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    pageSource = driver.page_source
    soup = BeautifulSoup(pageSource, 'html.parser')
    body = soup.find('body')
    script = body.find('script')
    raw = script.text.strip().replace('window._sharedData =', '').replace(';', '')
    json_data = json.loads(raw)
    for post in json_data['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges']:
        text_src = post['node']['edge_media_to_caption']['edges'][0]['node']['text']
        print(text_src)
    time.sleep(5)
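As an aside, one common Selenium pattern is to wait after each scroll until the document height actually changes before re-reading driver.page_source; a sketch under the same setup as the question:

# Sketch: wait for the scroll to actually load new content before parsing.
last_height = driver.execute_script("return document.body.scrollHeight")
for n in range(2):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)  # give the lazy-loaded content time to arrive
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # nothing new was loaded; stop scrolling
    last_height = new_height
    pageSource = driver.page_source  # now reflects the newly loaded DOM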

HTML File link - force download always

I have a Django view that renders a list of uploaded files, and the user can click on them to begin the download.
When the project was deployed, we found there's one file that browsers open instead of downloading. It seems related to the .dxf extension.
This is how the link is created:
...
As a result:
http://localhost:8003/media/folder/whatever.dxf
So why does the same browser behave differently? If I run it on localhost, it downloads the file, but when accessing the real server, it opens it. Can I prevent browsers from opening these files?
You can try adding a new Django view that handles the download.
urls.py
from django.conf.urls import url

import views

urlpatterns = [
    url(r'^download/$', views.DownloadView.as_view(), name='download')
]
views.py
import urllib

from django.http import HttpResponse
from django.views.generic.base import View

class DownloadView(View):
    def get(self, request):
        location = request.GET.get('location')
        # Fetch the remote file to a local temp file (Python 2 urllib).
        file = urllib.urlretrieve(location)
        contents = open(file[0], 'rb')  # binary mode so any file type survives
        content_type = '%s' % file[1].type
        response = HttpResponse(contents, content_type=content_type)
        # Content-Disposition: attachment forces the browser to download.
        response['Content-Disposition'] = 'attachment; filename="%s"' % location.split('/')[-1]
        return response
template.html
...
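The template body is elided above; as a purely illustrative example (file.url and file.name are hypothetical context variables), the link could point at the new view like this:

<a href="{% url 'download' %}?location={{ file.url }}">{{ file.name }}</a>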

Entering Value into Search Bar and Downloading Output from Webpage

I'm trying to search a webpage (http://www.phillyhistory.org/historicstreets/). I think the relevant source HTML is this:
<input name="txtStreetName" type="text" id="txtStreetName">
You can see the rest of the source HTML at the website. I want to go into that text box, enter a street name, and download the output (i.e. enter 'Jefferson' in the search box of the page and see historic street names with Jefferson). I have tried using requests.post, and tried typing ?get=Jefferson in the URL to test if that works, with no luck. Anyone have any ideas how to get this page? Thanks,
Cameron
Code that I currently tried (some imports are unused, as I plan to parse etc.):
import requests
from bs4 import BeautifulSoup
import csv
from string import ascii_lowercase
import codecs
import os.path
import time

arrayofstreets = ['Jefferson']

for each in arrayofstreets:
    url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
    payload = {'txtStreetName': each}
    r = requests.post(url, data=payload).content
    outfile = "raw/" + each + ".html"
    with open(outfile, "w") as code:
        code.write(r)
    time.sleep(2)
This did not work and only gave me the default webpage (i.e. 'Jefferson' was not entered into the search bar before retrieval).
I'm guessing your reference to 'requests.post' relates to the requests module for Python.
As you have not specified what you want to scrape from the search results, I will simply give you a snippet to get the HTML for a given search query:
import requests

query = 'Jefferson'
url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
post_data = {'txtStreetName': query}
html_result = requests.post(url, data=post_data).content
print(html_result)
If you need to further process the HTML to extract some data, I suggest you use the Beautiful Soup module to do so, as sketched below.
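A minimal, generic parsing sketch (the page's real markup is not shown in the question, so the tags here are placeholders):

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_result, 'lxml')
# Example: print the text of every table row in the result page.
for row in soup.find_all('tr'):
    print(row.get_text(strip=True))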
UPDATED VERSION:
#!/usr/bin/python
import requests
from bs4 import BeautifulSoup
import csv
from string import ascii_lowercase
import codecs
import os.path
import time

def get_post_data(html_soup, query):
    # Hidden ASP.NET state fields that must accompany every POST.
    view_state = html_soup.find('input', {'name': '__VIEWSTATE'})['value']
    event_validation = html_soup.find('input', {'name': '__EVENTVALIDATION'})['value']
    return {'__VIEWSTATE': view_state,
            '__EVENTVALIDATION': event_validation,
            'Textbox1': '',
            'txtStreetName': query,
            'btnSearch': 'Find'}

arrayofstreets = ['Jefferson']
url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
html = requests.get(url).content

for each in arrayofstreets:
    payload = get_post_data(BeautifulSoup(html, 'lxml'), each)
    r = requests.post(url, data=payload).content
    outfile = "raw/" + each + ".html"
    with open(outfile, "w") as code:
        code.write(r)
    time.sleep(2)
The problem in my/your first version was that we weren't posting all the required parameters. To find out what you need to send, open the network monitor in your browser (Ctrl+Shift+Q in Firefox) and perform the search as you normally would. If you select the POST request in the network log, you should see a parameters tab on the right showing the POST parameters your browser sent.
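As an alternative to hard-coding the hidden fields, you can collect every hidden input from the page automatically; a sketch under the same assumptions as the code above:

from bs4 import BeautifulSoup
import requests

url = 'http://www.phillyhistory.org/historicstreets/default.aspx'
soup = BeautifulSoup(requests.get(url).content, 'lxml')

# Start from whatever hidden fields the page declares (__VIEWSTATE etc.),
# then add the visible form fields for the actual search.
payload = {inp['name']: inp.get('value', '')
           for inp in soup.find_all('input', {'type': 'hidden'})
           if inp.get('name')}
payload.update({'txtStreetName': 'Jefferson', 'btnSearch': 'Find'})
result = requests.post(url, data=payload).content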