Processing JSON Response using scrapy - json

I have the following code in my scrapy spider:
def parse(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    htmldata = jsonresponse["html"]
    for sel in htmldata.xpath('//li/li'):
        # -- more xpath code --
        yield item
But I am getting this error:
raise ValueError("No JSON object could be decoded")
exceptions.ValueError: No JSON object could be decoded
After checking the JSON reply, I found the **<!--WPJM-->** and **<!--WPJM_END-->** markers, which are causing this error.
<!--WPJM-->{"found_jobs":true,"html":"<html code>","max_num_pages":3}<!--WPJM_END-->
How do I get my Scrapy spider to parse the JSON without tripping over the <!--WPJM--> and <!--WPJM_END--> markers?
EDIT: This is the error I get now:
File "/home/muhammad/Projects/project/project/spiders/crawler.py", line 150, in parse
for sel in htmldata.xpath('//li'):
exceptions.AttributeError: 'unicode' object has no attribute 'xpath'
def parse(self, response):
    rawdata = response.body_as_unicode()
    jsondata = rawdata.replace('<!--WPJM-->', '').replace('<!--WPJM_END-->', '')
    # print jsondata  # For debugging
    data = json.loads(jsondata)
    htmldata = data["html"]
    # print htmldata  # For debugging
    for sel in htmldata.xpath('//li'):
        item = ProjectjomkerjaItem()
        item['title'] = sel.xpath('a/div[@class="position"]/div[@id="job-title-job-listing"]/strong/text()').extract()
        item['company'] = sel.xpath('a/div[@class="position"]/div[@class="company"]/strong/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()

The easiest approach would be to get rid of the comment tags manually using replace():
data = response.body_as_unicode()
data = data.replace('<!--WPJM-->', '').replace('<!--WPJM_END-->', '')
jsonresponse = json.loads(data)
Though it is not particularly pythonic or reliable.
Or, a better option would be to get the text() via XPath:
$ scrapy shell index.html
>>> response.xpath('//text()').extract()[0]
u'{"found_jobs":true,"html":"<html code"}'

Related

Beautifulsoup find method returns not subscriptable object

I was trying to create a Twitter scraper using BeautifulSoup, requests, and json. However, when I tried to run the code, it raised the error "object is not subscriptable". I checked the lines where the error is located, but I couldn't find what raises it. Can someone please help? I couldn't fix it.
File "tweetscraper.py", line 131, in <module>
start()
File "tweetscraper.py", line 125, in start
tweets = get_tweets_data(username, soup)
File "tweetscraper.py", line 54, in get_tweets_data
next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
TypeError: 'NoneType' object is not subscriptable
Here is my code:
import sys
import json
import requests
from bs4 import BeautifulSoup

def get_tweet_text(tweet):
    tweet_text_box = tweet.find("p", {"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"})
    images_in_tweet_tag = tweet_text_box.find_all("a", {"class": "twitter-timeline-link u-hidden"})
    tweet_text = tweet_text_box.text
    for image_in_tweet_tag in images_in_tweet_tag:
        tweet_text = tweet_text.replace(image_in_tweet_tag.text, '')
    return tweet_text

def get_this_page_tweets(soup):
    tweets_list = list()
    tweets = soup.find_all("li", {"data-item-type": "tweet"})
    for tweet in tweets:
        tweet_data = None
        try:
            tweet_data = get_tweet_text(tweet)
        except Exception as e:
            # ignore if there is any loading or tweet error
            continue
        if tweet_data:
            tweets_list.append(tweet_data)
            print(".", end="")
            sys.stdout.flush()
    return tweets_list

def get_tweets_data(username, soup):
    tweets_list = list()
    tweets_list.extend(get_this_page_tweets(soup))
    next_pointer = soup.find("div", {"class": "stream-container"})["data-min-position"]
    while True:
        next_url = "https://twitter.com/i/profiles/show/" + username + \
                   "/timeline/tweets?include_available_features=1&" \
                   "include_entities=1&max_position=" + next_pointer + "&reset_error_state=false"
        next_response = None
        try:
            next_response = requests.get(next_url)
        except Exception as e:
            # in case there is some issue with the request. None encountered so far.
            print(e)
            return tweets_list
        tweets_data = next_response.text
        tweets_obj = json.loads(tweets_data)
        if not tweets_obj["has_more_items"] and not tweets_obj["min_position"]:
            # using two checks here because in one case has_more_items was false but there were more items
            print("\nNo more tweets returned")
            break
        next_pointer = tweets_obj["min_position"]
        html = tweets_obj["items_html"]
        soup = BeautifulSoup(html, 'lxml')
        tweets_list.extend(get_this_page_tweets(soup))
    return tweets_list

# dump final result in a json file
def dump_data(username, tweets):
    filename = username + "_twitter.json"
    print("\nDumping data in file " + filename)
    data = dict()
    data["tweets"] = tweets
    with open(filename, 'w') as fh:
        fh.write(json.dumps(data))
    return filename

def get_username():
    # if username is not passed
    if len(sys.argv) < 2:
        usage()
    username = sys.argv[1].strip().lower()
    if not username:
        usage()
    return username

def start(username=None):
    username = get_username()
    url = "http://www.twitter.com/" + username
    print("\n\nDownloading tweets for " + username)
    response = None
    try:
        response = requests.get(url)
    except Exception as e:
        print(repr(e))
        sys.exit(1)
    if response.status_code != 200:
        print("Non success status code returned " + str(response.status_code))
        sys.exit(1)
    soup = BeautifulSoup(response.text, 'lxml')
    if soup.find("div", {"class": "errorpage-topbar"}):
        print("\n\n Error: Invalid username.")
        sys.exit(1)
    tweets = get_tweets_data(username, soup)
    # dump data in a text file
    dump_data(username, tweets)
    print(str(len(tweets)) + " tweets dumped.")

start()
The method find() returns only the first element that matches, or None if nothing on the page matches. Here no <div class="stream-container"> was found in the fetched page, so find() returned None, and subscripting None with ["data-min-position"] raises TypeError: 'NoneType' object is not subscriptable. By contrast, find_all() returns a (possibly empty) list of all matching elements. Either way, check the result before indexing into it.
Find out more about this in the Beautiful Soup Documentation.
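A minimal guard along these lines avoids the TypeError (the class name and attribute are taken from the traceback; whether the div is actually present depends on the page Twitter returns):

def get_min_position(soup):
    # find() returns None when nothing matches, so check before subscripting
    container = soup.find("div", {"class": "stream-container"})
    if container is None or not container.has_attr("data-min-position"):
        return None
    return container["data-min-position"]

get_tweets_data() can then skip the pagination loop when this returns None instead of crashing.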

Python read json from site print object

I get this json string from a page:
{"lprice":"8330.1","curr1":"BTC","curr2":"EUR"}
I tried to access the lprice with this code:
import requests

def get_latest_price(api, currencie, real):
    CEXIO_API_URL = "https://cex.io/api/last_price/%s/%s" % (currencie.upper(), real.upper())
    response = requests.get(CEXIO_API_URL)
    response_json = response.json()
    return float(response_json['lprice'])
But if I do it like this, I get this error:
File "/home/malte/Entwicklung/cryptoprice/build/all/app/install/qml/cryptoprice.py", line 16, in get_latest_price
    return float(response_json['lprice'])
KeyError: 'lprice'
I assume that your response_json is the JSON string {"lprice":"8330.1","curr1":"BTC","curr2":"EUR"}. Then it should work if you convert that JSON string into a dictionary with the loads function:
import requests
import json

def get_latest_price(api, currencie, real):
    CEXIO_API_URL = "https://cex.io/api/last_price/%s/%s" % (currencie.upper(), real.upper())
    response = requests.get(CEXIO_API_URL)
    response_json = response.json()
    response_json = json.loads(response_json)
    return float(response_json['lprice'])
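Note that requests' response.json() usually returns a dict already, in which case the extra json.loads() call is unnecessary (and would raise a TypeError). A KeyError on 'lprice' can also mean the API returned a different payload, for example an error object for an unsupported currency pair. A defensive sketch (the shape of the failure payload is an assumption, not confirmed against the CEX.io docs):

import requests

def get_latest_price(api, currencie, real):
    url = "https://cex.io/api/last_price/%s/%s" % (currencie.upper(), real.upper())
    payload = requests.get(url).json()
    # payload should already be a dict; fail loudly if 'lprice' is missing
    if "lprice" not in payload:
        raise ValueError("Unexpected response from CEX.io: %r" % payload)
    return float(payload["lprice"])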

How to convert simple JSON to DynamoDB JSON?

I have a simple JSON and want to convert it to DynamoDB JSON. Is there any easy way to do that?
If you mean converting a JSON string to a DynamoDB Map, you can use boto3.
Here is an example.
import boto3
import json
json_string = '{"key1": 1, "key2": "value"}'
json_obj = json.loads(json_string)
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('test-table')
table.put_item(Item={'pk': 'pk-value', 'map': json_obj})
If you just want to replace the whole Map attribute, you can use the same plain JSON format as with put_item.
json_string = '{"key1": 2, "key2": "value2"}'
json_obj = json.loads(json_string2)
rsp = table.update_item(
Key={'pk': 'pk-value'},
AttributeUpdates={'map': {'Value': json_obj2, 'Action': 'PUT'}}
)
However, if you want to update only a specific nested attribute, you need to use UpdateExpression. For example, the code below updates only the key1 attribute inside the map:
nested_json_string = '{"nested": "key3"}'
nested_json_obj = json.loads(nested_json_string)
rsp = table.update_item(
    Key={'pk': 'pk-value'},
    UpdateExpression='SET #map.#key1 = :val3',
    ExpressionAttributeNames={'#map': 'map', '#key1': 'key1'},
    ExpressionAttributeValues={':val3': nested_json_obj}
)
I know this is an old question, but I came across it and the accepted answer didn't help me (it seems to suggest that you can feed boto3 with plain JSON, but it didn't work for me) and the library mentioned in the comments didn't help me either.
What did work for me was using the serializer/deserializer from boto3.dynamodb.types, basically as suggested by this answer on a very similar topic.
from boto3.dynamodb.types import TypeSerializer, TypeDeserializer
import json

serializer = TypeSerializer()
deserializer = TypeDeserializer()

# for building a DynamoDB JSON from a Python object
def serialize(item):
    serialized_item = serializer.serialize(vars(item) if hasattr(item, '__dict__') else item)
    return item if 'M' not in serialized_item else serialized_item['M']

# for building a plain JSON from a DynamoDB JSON
def deserialize(dynamodb_json_string):
    return deserializer.deserialize({'M': dynamodb_json_string})

class MyItem:
    def __init__(self, some_string_value=None, some_numeric_value=None):
        self.my_key = some_string_value
        self.my_other_key = some_numeric_value

    def __str__(self):
        return json.dumps(self, default=lambda x: x.__dict__)

if __name__ == '__main__':
    my_classy_item = MyItem("my_string_value", 5)
    my_string_item = json.loads('{"my_key": "my_string_value", "my_other_key" : 5}')
    print(serialize(my_classy_item))
    print(serialize(my_string_item))
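For reference, TypeSerializer produces the DynamoDB-typed form, so both print calls above should output something like {'my_key': {'S': 'my_string_value'}, 'my_other_key': {'N': '5'}}. A small round-trip sketch using the serialize/deserialize helpers above (the commented output shapes are illustrative, and numbers come back as Decimal):

plain = {"my_key": "my_string_value", "my_other_key": 5}
dynamodb_json = serialize(plain)
# dynamodb_json: {'my_key': {'S': 'my_string_value'}, 'my_other_key': {'N': '5'}}
roundtrip = deserialize(dynamodb_json)
# roundtrip: {'my_key': 'my_string_value', 'my_other_key': Decimal('5')}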

keep getting error message when trying to read json from s3

I keep getting this error in my lambda function:
{"errorMessage": "module initialization error"}
This happens when I try to turn the following string containing JSON data into a JSON dictionary object within Python.
"{\n\"main\": {\n \"PART_NAME\": \"Genuine Cardboard Honda Wing\",\n \"BRAND\": \"Honda\",\n \"MJR_CAT\": \"Aero\",\n \"CAT\": \"Rear Wing\",\n \"SUB_CAT\": \"NA\",\n \"Power_Increase\": \"0\"\n},\n\"forza\":\n{\n \"power\": \"[0, True]\",\n \"Torque\": \"[0, True]\",\n \"Traction\": \"[50, True]\",\n \"Handling\": \"[100, True]\",\n \"Breaking\": \"[40, True]\"\n},\n\"custom\": {\n\"length\": 120,\n\"car max height[m]\": 2,\n\"RICER RANK\": -10\n\n}\n"
Here is my code to replicate this error:
client = boto3.client('s3')
result = client.get_object(Bucket=BUCKET, Key=FILE_TO_READ)
text = result['Body'].read().decode('utf-8')
text = json.load(text)
print(text)
Without the json.load(text) line, print(text) just produces the string above.
Thanks :)
Here is the full lambda function (though not commented) if you are interested.
import json
import boto3
print('got this far')
BUCKET = '******'
FILE_TO_READ = 'example_honda_wing.json'
client = boto3.client('s3')
result = client.get_object(Bucket=BUCKET, Key=FILE_TO_READ)
text = result['Body'].read().decode('utf-8')
#text = str(text).replace("\n","")
#text = text.replace('\"',' ')
#text = json.load(text)
print(text) # Use your desired JSON Key for your value
def lambda_handler(event, context):
    # TODO implement
    return text
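Two things stand out, offered as a hedged reading rather than a confirmed diagnosis: json.load() expects a file-like object, so passing it a string raises an exception, and because that call runs at module level (outside the handler), the exception surfaces as Lambda's generic "module initialization error". Also, the string shown above appears to be missing its final closing brace, so even json.loads() would reject it until the file is fixed. A minimal sketch of the usual shape (the bucket placeholder and key are taken from the question):

import json
import boto3

BUCKET = '******'
FILE_TO_READ = 'example_honda_wing.json'

client = boto3.client('s3')

def lambda_handler(event, context):
    # doing the read inside the handler means a failure shows up as a
    # handler error with a traceback instead of an init error
    result = client.get_object(Bucket=BUCKET, Key=FILE_TO_READ)
    text = result['Body'].read().decode('utf-8')
    data = json.loads(text)  # loads() parses a string; load() reads from a file object
    return data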

Using the results of multiple for loops to post a single json response

Okay, so this is a loaded question, and I'm sure there's an easy method to use here, but I'm stuck.
Long story short, I am tasked with creating a function in Python (to be run as an AWS Lambda) which can perform acceptance tests on a series of URLs using python-requests. These requests will be used to assert the HTTP response codes and a custom HTTP header identifying whether the correct HAProxy backend served the request.
The URLs themselves are maintained in a YAML document, which is converted to a dict in Python and passed to a for loop that uses requests to GET each URL and record its response code and header.
The issue I am having is getting a single body object to return the results of multiple for loops.
I have tried to find similar use cases but cannot find any.
import requests
import json
import yaml

def acc_tests():
    with open("test.yaml", 'r') as stream:
        testurls = yaml.safe_load(stream)
    results = {}

    # endpoint/path 1
    for url in testurls["health endpoints"]:
        r = requests.get(url, params="none")
        stat = r.status_code
        result = json.dumps(print(url, stat))
        results = json.dumps(result)

    # endpoint path with headers
    for url in testurls["xtvapi"]:
        headers = {'H': 'xtvapi.cloudtv.comcast.net'}
        r = requests.get(url, headers=headers, params="none")
        stat = r.status_code
        head = r.headers["X-FINITY-TANGO-BACKEND"]
        result = json.dumps((url, stat, head))
        results = json.dumps(result)

    return {
        'statusCode': 200,
        'body': json.dumps(results)
    }

acc_tests()
YAML file:
health endpoints:
- https://xfinityapi-tango-production-aws-us-east-1-active.r53.aae.comcast.net/tango-health/
- https://xfinityapi-tango-production-aws-us-east-1-active.r53.aae.comcast.net/
- https://xfinityapi-tango-production-aws-us-east-2-active.r53.aae.comcast.net/tango-health/
- https://xfinityapi-tango-production-aws-us-east-2-active.r53.aae.comcast.net/
- https://xfinityapi-tango-production-aws-us-west-2-active.r53.aae.comcast.net/tango-health/
- https://xfinityapi-tango-production-aws-us-west-2-active.r53.aae.comcast.net/
xtvapi:
- https://xfinityapi-tango-production-aws-us-east-1-active.r53.aae.comcast.net/
- https://xfinityapi-tango-production-aws-us-east-2-active.r53.aae.comcast.net/
- https://xfinityapi-tango-production-aws-us-west-2-active.r53.aae.comcast.net/
What I think is happening is that both for loops run one after the other, but the value of results ends up empty, and I'm not sure how to update/append the results dict with the output of each loop.
Thanks folks. I ended up solving this by creating a dict with a fixed key for each test type and then using append to add each result to the nested list under that key.
Here is the "working" code as it is in the AWS Lambda function:
from botocore.vendored import requests
import json
import yaml

def acc_tests(event, context):
    with open("test.yaml", 'r') as stream:
        testurls = yaml.safe_load(stream)
    results = {'tango-health': [], 'xtvapi': []}

    # Tango Health
    for url in testurls["health endpoints"]:
        r = requests.get(url, params="none")
        result = url, r.status_code
        assert r.status_code == 200
        results["tango-health"].append(result)

    # xtvapi default/cloudtv
    for url in testurls["xtvapi"]:
        headers = {'H': 'xtvapi.cloudtv.comcast.net'}
        r = requests.get(url, headers=headers, params="none")
        result = url, r.status_code, r.headers["X-FINITY-TANGO-BACKEND"]
        assert r.status_code == 200
        assert r.headers["X-FINITY-TANGO-BACKEND"] == "tango-big"
        results["xtvapi"].append(result)

    resbody = json.dumps(results)
    return {
        'statusCode': 200,
        'body': resbody
    }
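For what it's worth, json.dumps serializes the tuples as JSON arrays, so the response body comes out roughly like this (hosts and values are illustrative placeholders):

{
  "tango-health": [["https://example-host/tango-health/", 200]],
  "xtvapi": [["https://example-host/", 200, "tango-big"]]
}

Note that a failed assert raises AssertionError, so the Lambda errors out instead of returning a 200 with failing results; that may or may not be the desired behaviour for acceptance tests.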