500 Internal Server Error from third-party API - json

Python 3.6 - Scrapy 1.5
I'm scraping the John Deere warranty webpage to watch all new PMPs and their expiration dates. Looking at the network communication between the browser and the webpage, I found a REST API that feeds the data shown in the page.
Now I'm trying to get JSON data from that API rather than scraping the JavaScript-rendered page content. However, I'm getting an Internal Server Error and I don't know why.
I'm using Scrapy to log in and fetch the data.
import scrapy


class PmpSpider(scrapy.Spider):
    name = 'pmp'
    start_urls = ['https://jdwarrantysystem.deere.com/portal/']

    def parse(self, response):
        self.log('***Form Request***')
        login = {
            'USERNAME': *******,
            'PASSWORD': *******
        }
        yield scrapy.FormRequest.from_response(
            response,
            url='https://registration.deere.com/servlet/com.deere.u90950.registrationlogin.view.servlets.SignInServlet',
            method='POST', formdata=login, callback=self.parse_pmp
        )
        self.log('***PARSE LOGIN***')

    def parse_pmp(self, response):
        self.log('***PARSE PMP***')
        cookies = response.headers.getlist('Set-Cookie')
        for cookie in cookies:
            cookie = cookie.decode('utf-8')
            self.log(cookie)
            cook = cookie.split(';')[0].split('=')[1]
            path = cookie.split(';')[1].split('=')[1]
            domain = cookie.split(';')[2].split('=')[1]
            yield scrapy.Request(
                url='https://jdwarrantysystem.deere.com/api/pip-products/collection',
                method='POST',
                cookies={
                    'SESSION': cook,
                    'path': path,
                    'domain': domain
                },
                headers={
                    "Accept": "application/json",
                    "accounts": ["201445", "201264", "201167", "201342", "201341", "201221"],
                    "excludedPin": "",
                    "export": "",
                    "language": "",
                    "metric": "Y",
                    "pipFilter": "OPEN",
                    "pipType": ["MALF", "SAFT"]
                },
                meta={'dont_redirect': True},
                callback=self.parse_pmp_list
            )

    def parse_pmp_list(self, response):
        self.log('***LISTA PMP***')
        self.log(response.body)
Why am I getting this error? How can I get data from this API?
2018-07-05 17:26:19 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <POST https://jdwarrantysystem.deere.com/api/pip-products/collection> (failed 1 times): 500 Internal Server Error
2018-07-05 17:26:20 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <POST https://jdwarrantysystem.deere.com/api/pip-products/collection> (failed 2 times): 500 Internal Server Error
2018-07-05 17:26:21 [scrapy.downloadermiddlewares.retry] DEBUG: Gave up retrying <POST https://jdwarrantysystem.deere.com/api/pip-products/collection> (failed 3 times): 500 Internal Server Error
2018-07-05 17:26:21 [scrapy.core.engine] DEBUG: Crawled (500) <POST https://jdwarrantysystem.deere.com/api/pip-products/collection> (referer: https://jdwarrantysystem.deere.com/portal/)
2018-07-05 17:26:21 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <500 https://jdwarrantysystem.deere.com/api/pip-products/collection>: HTTP status code is not handled or not allowed

I found the problem: this is a POST request whose parameters must be sent as a JSON body, because unlike a GET request, the parameters don't go in the URI. The request also needs a "content-type": "application/json" header. See: How parameters are sent in POST request and Rest POST in Python. So, editing the function parse_pmp:
import json  # add this import at the top of the spider module

def parse_pmp(self, response):
    self.log('***PARSE PMP***')
    cookies = response.headers.getlist('Set-Cookie')
    for cookie in cookies:
        cookie = cookie.decode('utf-8')
        self.log(cookie)
        cook = cookie.split(';')[0].split('=')[1]
        path = cookie.split(';')[1].split('=')[1]
        domain = cookie.split(';')[2].split('=')[1]
        # The API parameters belong in the request body as JSON, not in the headers
        data = json.dumps({
            "accounts": ["201445", "201264", "201167", "201342", "201341", "201221"],
            "excludedPin": "",
            "export": "",
            "language": "",
            "metric": "Y",
            "pipFilter": "OPEN",
            "pipType": ["MALF", "SAFT"]
        })  # <----
        yield scrapy.Request(
            url='https://jdwarrantysystem.deere.com/api/pip-products/collection',
            method='POST',
            cookies={
                'SESSION': cook,
                'path': path,
                'domain': domain
            },
            headers={
                "Accept": "application/json",
                "content-type": "application/json"  # <----
            },
            body=data,  # <----
            meta={'dont_redirect': True},
            callback=self.parse_pmp_list
        )
Everything works fine!
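For anyone who wants to sanity-check the endpoint outside Scrapy, here is a minimal sketch using the requests library. It assumes you already have a valid SESSION cookie value captured from the login response; the cookie name and payload fields are taken from the code above, and the placeholder value is hypothetical:

import json
import requests

session_cookie = 'PASTE_A_VALID_SESSION_VALUE_HERE'  # hypothetical placeholder

payload = {
    "accounts": ["201445", "201264", "201167", "201342", "201341", "201221"],
    "excludedPin": "", "export": "", "language": "",
    "metric": "Y", "pipFilter": "OPEN", "pipType": ["MALF", "SAFT"]
}

resp = requests.post(
    'https://jdwarrantysystem.deere.com/api/pip-products/collection',
    headers={'Accept': 'application/json', 'Content-Type': 'application/json'},
    cookies={'SESSION': session_cookie},
    data=json.dumps(payload),  # parameters go in the body, not the headers
)
print(resp.status_code)
print(resp.text)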

Related

Trying to make a POST request: works with cURL, gets a 403 when using Python requests

I'm trying to get some JSON data from this API - https://ped.uspto.gov/api/queries
This cURL request works fine and returns what is expected:
curl -X POST "https://ped.uspto.gov/api/queries" -H "accept: application/json" -H "Content-Type: application/json" -d "{ \"searchText\":\"*:*\", \"fq\":[ \"totalPtoDays:[1 TO 99999]\", \"appFilingDate:[2005-01-01T00:00:00Z TO 2005-12-31T23:59:59Z]\" ], \"fl\":\"*\", \"mm\":\"100%\", \"df\":\"patentTitle\", \"facet\":\"true\", \"sort\":\"applId asc\", \"start\":\"0\"}"
I have this Python script to do the same thing:
import json

import requests
from requests.structures import CaseInsensitiveDict

url = "https://ped.uspto.gov/api/queries"

headers = CaseInsensitiveDict()
headers["accept"] = "application/json"
headers["Content-Type"] = "application/json"

data = json.dumps({
    "searchText": "*:*",
    "fq": [
        "totalPtoDays:[1 TO 99999]",
        "appFilingDate:[2005-01-01T00:00:00Z TO 2005-12-31T23:59:59Z]"
    ],
    "fl": "*",
    "mm": "100%",
    "df": "patentTitle",
    "facet": "true",
    "sort": "applId asc",
    "start": "0"
})

resp = requests.post(url, headers=headers, data=data)
print(resp.status_code)
but it returns a 403 error code and the following response headers:
"Date":"Mon, 24 Oct 2022 16:13:58 GMT",
"Content-Type":"text/html",
"Content-Length":"919",
"Connection":"keep-alive",
"X-Cache":"Error from cloudfront",
"Via":"1.1 d387fec28536c5aa92926c56363afe9a.cloudfront.net (CloudFront)",
"X-Amz-Cf-Pop":"LHR50-P8",
"X-Amz-Cf-Id":"RMd69prehvXNAl97mo0qyFtuBIiY8r9liIxcQEmbdoBV1zwXLhirXA=="
I'm at quite a loss as to what to do, because I really don't understand what my Python script is missing to replicate the cURL request.
Thanks very much.
I was interested in this. I got an account with uspto.gov and acquired an access key. Their other APIs work well. But the PEDS API? I kept getting gateway timeout errors from CloudFront. While I was on their website I looked into the PEDS API, and I could not load any link to a https://ped.uspto.gov page.
I called them and they gave me an email address. I got this reply:
The PEDS API was taken down, because repeated data mining was bringing the entire PEDS System down.
The PEDS Team is working on a solution to fix the PEDS API, so that it can be re-enabled.
I tried it using PHP. CloudFront has been causing a lot of problems for curl. I got a timeout. I may have gotten past the 403 Forbidden, but I did not have credentials, so the server dropped the connection.
An HTTP 504 status code (Gateway Timeout) indicates that when CloudFront forwarded a request to the origin (because the requested object wasn't in the edge cache), one of the following happened: the origin returned an HTTP 504 status code to CloudFront, or the origin didn't respond before the request expired.
AWS Cloudflare Curl Issues
bypassing CloudFlare 403
How to Fix Error 403 Forbidden on Cloudflare
403 Forbidden cloudflare
This is a conversion of your curl command. The Content-Type: application/json header is added by default when you send JSON data with the json= argument. I am not sure about your json.dumps call or why you wrapped the JSON in parentheses.
import requests

headers = {
    'accept': 'application/json',
}

json_data = {
    'searchText': '*:*',
    'fq': [
        'totalPtoDays:[1 TO 99999]',
        'appFilingDate:[2005-01-01T00:00:00Z TO 2005-12-31T23:59:59Z]',
    ],
    'fl': '*',
    'mm': '100%',
    'df': 'patentTitle',
    'facet': 'true',
    'sort': 'applId asc',
    'start': '0',
}

response = requests.post('https://ped.uspto.gov/api/queries', headers=headers, json=json_data)
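If the converted call still returns 403, one plausible difference between curl and Python is the default User-Agent header: the python-requests/x.y.z agent is a frequent target of CloudFront/WAF blocking rules, while curl's agent may be allowed. This is an assumption, not something the API documents; a quick sketch to test it (the browser UA string is just an example, and the payload is trimmed for brevity):

import requests

json_data = {'searchText': '*:*', 'df': 'patentTitle', 'start': '0'}  # trimmed example payload

headers = {
    'accept': 'application/json',
    # Some CloudFront/WAF rules block the default python-requests agent;
    # sending a browser-like User-Agent is a common diagnostic step.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0 Safari/537.36',
}

response = requests.post('https://ped.uspto.gov/api/queries', headers=headers, json=json_data)
print(response.status_code)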

I am trying to trigger a DAG (written in Python) from a Cloud Function (Python 3.7) and getting the error "405 Method Not Allowed". Could someone help here?

I have used the below code, available in the Google Cloud docs:
import requests
from google.auth.transport.requests import Request
from google.oauth2 import id_token

IAM_SCOPE = 'https://www.googleapis.com/auth/iam'
OAUTH_TOKEN_URI = 'https://www.googleapis.com/oauth2/v4/token'

def trigger_dag(data, context=None):
    """Makes a POST request to the Composer DAG Trigger API

    When called via Google Cloud Functions (GCF),
    data and context are Background function parameters.
    For more info, refer to
    https://cloud.google.com/functions/docs/writing/background#functions_background_parameters-python

    To call this function from a Python script, omit the ``context`` argument
    and pass in a non-null value for the ``data`` argument.
    """
    # Fill in with your Composer info here
    # Navigate to your webserver's login page and get this from the URL
    # Or use the script found at
    # https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/rest/get_client_id.py
    client_id = '87431184677-jitlhi9o0u9sin3uvdebqrvqokl538aj.apps.googleusercontent.com'
    # This should be part of your webserver's URL:
    # {tenant-project-id}.appspot.com
    webserver_id = 'b368a47a354ddf2f6p-tp'
    # The name of the DAG you wish to trigger
    dag_name = 'composer_sample_trigger_response_dag'
    webserver_url = (
        #'https://'
        webserver_id
        + '.appspot.com/admin/airflow/tree?dag_id='
        + dag_name
        #+ '/dag_runs'
    )
    # Make a POST request to IAP which then triggers the DAG
    make_iap_request(
        webserver_url, client_id, method='POST',
        json={"conf": data, "replace_microseconds": 'false'})

# This code is copied from
# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/iap/make_iap_request.py
# START COPIED IAP CODE
def make_iap_request(url, client_id, method='GET', **kwargs):
    """Makes a request to an application protected by Identity-Aware Proxy.

    Args:
        url: The Identity-Aware Proxy-protected URL to fetch.
        client_id: The client ID used by Identity-Aware Proxy.
        method: The request method to use
            ('GET', 'OPTIONS', 'HEAD', 'POST', 'PUT', 'PATCH', 'DELETE')
        **kwargs: Any of the parameters defined for the request function:
            https://github.com/requests/requests/blob/master/requests/api.py
            If no timeout is provided, it is set to 90 by default.

    Returns:
        The page body, or raises an exception if the page couldn't be retrieved.
    """
    # Set the default timeout, if missing
    if 'timeout' not in kwargs:
        kwargs['timeout'] = 90
    # Obtain an OpenID Connect (OIDC) token from metadata server or using service
    # account.
    google_open_id_connect_token = id_token.fetch_id_token(Request(), client_id)
    # Fetch the Identity-Aware Proxy-protected URL, including an
    # Authorization header containing "Bearer " followed by a
    # Google-issued OpenID Connect token for the service account.
    resp = requests.request(
        method, url,
        headers={'Authorization': 'Bearer {}'.format(
            google_open_id_connect_token)}, **kwargs)
    if resp.status_code == 403:
        raise Exception('Service account does not have permission to '
                        'access the IAP-protected application.')
    elif resp.status_code != 200:
        raise Exception(
            'Bad response from application: {!r} / {!r} / {!r}'.format(
                resp.status_code, resp.headers, resp.text))
    else:
        return resp.text
# END COPIED IAP CODE
You encounter the "405 Method Not Allowed" error because you are sending the request to your_webserver_id.appspot.com/admin/airflow/tree?dag_id=composer_sample_trigger_response_dag, which is the URL of the "Tree View" page in the Airflow webserver UI, not an API endpoint.
To properly send requests to the Airflow API, you need to construct the webserver_url just as in the documentation in Trigger DAGs in Cloud Functions, where it is built around the "trigger a new DAG run" endpoint. So if you'd like to trigger the DAG, use the code below.
Airflow run DAG endpoint:
POST https://airflow.apache.org/api/v1/dags/{dag_id}/dagRuns
webserver_url = (
    'https://'
    + webserver_id
    + '.appspot.com/api/experimental/dags/'
    + dag_name
    + '/dag_runs'
)
Moving forward, if you would like to perform different operations using the Airflow API, you can check the Airflow REST API reference.
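As a quick way to see what the corrected URL produces, here is a rough sketch of the same trigger call made directly with requests. It assumes a webserver reachable without IAP, which is not the case for Cloud Composer (there you still route the call through make_iap_request as above); it only illustrates the endpoint shape, with the IDs taken from the question:

import requests

webserver_id = 'b368a47a354ddf2f6p-tp'  # from the question
dag_name = 'composer_sample_trigger_response_dag'

webserver_url = ('https://' + webserver_id + '.appspot.com'
                 + '/api/experimental/dags/' + dag_name + '/dag_runs')

# The experimental endpoint accepts a JSON body with an optional "conf"
resp = requests.post(webserver_url, json={'conf': {}, 'replace_microseconds': 'false'})
print(resp.status_code, resp.text)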

What is wrong in the following Lambda code that throws up a module import error?

I'm using the following code to make an API that connects to Amazon AWS. This is the AWS Lambda code that I use:
import json

import boto3
import requests
from requests_aws4auth import AWS4Auth

region = 'us-east-1'
service = 'es'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region,
                   service, session_token=credentials.token)

host = 'XXX.com'
index = 'items'
url = 'https://' + host + '/' + index + '/_search'

# Lambda execution starts here
def handler(event, context):
    # Put the user query into the query DSL for more accurate search results.
    # Note that certain fields are boosted (^).
    query = {
        "query": {
            "multi_match": {
                "query": event['queryStringParameters']['q'],
            }
        }
    }
    # ES 6.x requires an explicit Content-Type header
    headers = {"Content-Type": "application/json"}
    # Make the signed HTTP request
    r = requests.get(url, auth=awsauth, headers=headers, data=json.dumps(query))
    # Create the response and add some extra content to support CORS
    response = {
        "statusCode": 200,
        "headers": {
            "Access-Control-Allow-Origin": '*'
        },
        "isBase64Encoded": False
    }
    # Add the search results to the response
    response['body'] = r.text
    return response
This should connect to an AWS ES cluster with endpoint XXX.com.
I get this output when trying to test:
START RequestId: f640016e-e4d6-469f-b74d-838b9402968b Version: $LATEST
Unable to import module 'index': Error
at Function.Module._resolveFilename (module.js:547:15)
at Function.Module._load (module.js:474:25)
at Module.require (module.js:596:17)
at require (internal/module.js:11:18)
END RequestId: f640016e-e4d6-469f-b74d-838b9402968b
REPORT RequestId: f640016e-e4d6-469f-b74d-838b9402968b Duration: 44.49 ms Billed Duration: 100 ms Memory Size: 128 MB Max Memory Used: 58 MB
When creating a Lambda function, you need to specify a handler: the function in your code that the AWS Lambda service invokes when the Lambda function is executed.
By default, a Python Lambda function is created with the handler lambda_function.lambda_handler, which signifies that the service must invoke the lambda_handler function contained inside the lambda_function module.
From the error you're receiving, it seems that the handler is configured as something like index.<something>, and since there's no module called index in your deployment package, Lambda is unable to import it in order to start execution. (The module.js frames in the stack trace also suggest the function may be configured with the Node.js runtime rather than Python, which would produce exactly this failure for a Python deployment package.)
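For example, a hypothetical minimal setup that matches the default configuration: if the handler setting is lambda_function.lambda_handler, the deployment package must contain a file named lambda_function.py defining that function.

# lambda_function.py
# The Lambda "Handler" setting must be "lambda_function.lambda_handler".
# If the setting pointed at e.g. "index.handler" instead, the runtime would
# fail at startup with "Unable to import module 'index'", as in the question.
def lambda_handler(event, context):
    return {"statusCode": 200, "body": "ok"}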
If I am getting things correctly, to connect to an AWS ES cluster you need something of this sort:
import logging

import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

LOGGER = logging.getLogger()
ES_HOST = {'host': 'search-testelasticsearch-xxxxxxxxxx.eu-west-2.es.amazonaws.com', 'port': 443}

def lambda_handler(event, context):
    LOGGER.info('started')
    dump2 = {
        'number': 9
    }
    service = 'es'
    credentials = boto3.Session().get_credentials()
    # Debug output only; avoid logging credentials in production
    print(credentials.access_key)
    print(credentials.secret_key)
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, "eu-west-2",
                       service, session_token=credentials.token)
    es = Elasticsearch(hosts=[ES_HOST], http_auth=awsauth, use_ssl=True,
                       verify_certs=True, connection_class=RequestsHttpConnection)
    DAVID_INDEX = 'test_index'
    response = es.index(index=DAVID_INDEX, doc_type='is_this_important?', body=dump2, id='4')
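Since the original question was about search rather than indexing, the same signed client should also cover that case. A small sketch, continuing inside lambda_handler (the index name and query text are illustrative):

    # Sketch: run the original search use case with the same client
    result = es.search(index='test_index',
                       body={'query': {'multi_match': {'query': 'some text'}}})
    print(result['hits']['hits'])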

Python 3.x - Web Server - extract json body from POST request

I am sending a POST request with a JSON body to a server but cannot extract the JSON when it arrives. I have done exhaustive searches, but to no avail. I have provided both the client and server scripts to illustrate what is happening.
All I need is to extract the JSON portion at the end of the received string so I can analyze the request and return the appropriate data.
I'm sure it's simple, but I can't seem to find the answer. Any direction would be appreciated.
***
CLIENT: script to test Server
import json

import requests

def info_send():
    url = 'http://1234abcd.ngrok.io'
    payload = {
        'command': '["command", "status", "off", None]',
        'userID': 'userID string',
        'status': 'current status',
    }
    requests.post(url, data=json.dumps(payload))

info_send()
***
SERVER: receives json POST request
import socket

HOST, PORT = '', 5000

listen_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
listen_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
listen_socket.bind((HOST, PORT))
listen_socket.listen(1)
print('Listening on port %s' % PORT)

while True:
    client_connection, client_address = listen_socket.accept()
    request = client_connection.recv(1024).decode('utf-8')
    print(request)
***
This is what is printed at the server:
POST / HTTP/1.1
Host: 1234abcd.ngrok.io
User-Agent: python-requests/2.18.4
Accept-Encoding: gzip, deflate
Accept: */*
Content-Length: 112
X-Forwarded-For: 112.162.214.265
{"command": "[\"command\", \"status\", \"off\", None]", "userID": "userID string", "deviceID": "current status"}

POST JSON Parameter to REST HTTPS URL using HTTP Builder in Groovy script

I am trying to POST a JSON parameter to a REST HTTPS URL using HTTPBuilder in my Jenkins job, using a Groovy script.
Below is my script:
import groovyx.net.http.HTTPBuilder
import static groovyx.net.http.ContentType.URLENC

def http = new HTTPBuilder('http://restservice.appshop.com/')
def postBody = [name: 'bob', title: 'customer'] // will be url-encoded

http.post(path: '/', body: postBody, requestContentType: URLENC) { resp ->
    println "POST Success: ${resp.statusLine}"
    assert resp.statusLine.statusCode == 201
}
I got the below error:
unable to resolve class groovyx.net.http.HTTPBuilder
I tried using @Grab also, but still got the same error.
Kindly help me out with using HTTPBuilder.