Scenario:
* def Test_assignment_type = 'ALL'
* def TEST_NODE_ID = '123455667'
* def list = [ '123', '1234', '12345' ]
* def gId_list = karate.mapWithKey(list, 'HOOK_TEST_LOCATION_GID')
* callonce read('classpath:hook/delete-assignments-hook.feature') {HOOK_TEST_LOCATION_ID: "#(TEST_NODE_ID)", HOOK_TEST_ASSIGNMENT_TYPE: "#(Test_assignment_type)"}
delete-assignments-hook.feature:
Scenario:
Given path '/nodes/'+HOOK_TEST_LOCATION_ID+'/locations/'+HOOK_TEST_LOCATION_GID+'/assignments'
And param assignmentType = HOOK_TEST_ASSIGNMENT_TYPE
When method delete
Then assert responseStatus == 204 || responseStatus == 404
How can I pass gId_list to my delete-assignments-hook.feature, so that the delete API runs for each value in the list?
Just start with a simple example and study it. The example below will make 3 calls with a parameter called id. Also read the documentation: https://github.com/karatelabs/karate#data-driven-features
Feature:
Scenario:
* def array = [1, 2, 3]
* def data = karate.mapWithKey(array, 'id')
* call read('@called') data

@ignore @called
Scenario:
* url 'https://httpbin.org/anything'
* param id = id
* method get
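Applied to the original question, a minimal sketch (untested): build one row per GID, merge in the values that stay the same for every call, and pass the resulting list to the call, so that delete-assignments-hook.feature runs once per entry:

Scenario:
* def Test_assignment_type = 'ALL'
* def TEST_NODE_ID = '123455667'
* def list = [ '123', '1234', '12345' ]
* def rows = karate.map(list, function(gid){ return { HOOK_TEST_LOCATION_GID: gid, HOOK_TEST_LOCATION_ID: TEST_NODE_ID, HOOK_TEST_ASSIGNMENT_TYPE: Test_assignment_type } })
* callonce read('classpath:hook/delete-assignments-hook.feature') rows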
Related
* def myvariable = 1
* def schema =
"""
{
myvariable : '#number',
2: '#number',
3: '#number',
4: '#number',
5: '#number',
6: '#number',
}
"""
I need to use 'myvariable' as a key. How can I do this?
Here you go:
* def schema = {}
* schema.myvariable = 1
* match schema == { myvariable: 1 }
# dynamic key name
* def name = 'myvariable'
* def schema = {}
* schema[name] = 1
* match schema == { myvariable: 1 }
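And applied to schema-style matching as in the question above, a small sketch (response_example is just a stand-in name for whatever JSON you are validating):

* def name = 'myvariable'
* def schema = { other: '#number' }
* schema[name] = '#number'
* def response_example = { myvariable: 5, other: 10 }
* match response_example == schema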
I'm retrieving data from the Open Weather Map API. I have the following code, where I'm extracting the current weather from more than 500 cities, and I want the log output to separate the data into sets of 50 each.
I did it in a non-efficient way that I would really like to improve!
Many many thanks!
x = 1
for index, row in df.iterrows():
    base_url = "http://api.openweathermap.org/data/2.5/weather?"
    units = "imperial"
    query_url = f"{base_url}appid={api_key}&units={units}&q="
    city = row['Name']  # this comes from a df
    response = requests.get(query_url + city).json()
    try:
        df.loc[index, "Max Temp"] = response["main"]["temp_max"]
        if index < 50:
            print(f"Processing Record {index} of Set {x} | {city}")
        elif index < 100:
            x = 2
            print(f"Processing Record {index} of Set {x} | {city}")
        elif index < 150:
            x = 3
            print(f"Processing Record {index} of Set {x} | {city}")
    except (KeyError, IndexError):
        pass
        print("City not found. Skipping...")
I am trying to get a search function working. I have a database in Excel and would like to be able to search it while on the go. I am stuck trying to get the search form working. Below is the Python code for the search:
from forms import ComicSearchForm


@app.route('/make_search', methods=['GET', 'POST'])
def search():
    search = ComicSearchForm(request.form)
    if request.method == 'GET, POST':
        return search_results(result)
    return render_template('search.html', form=search)


@app.route('/search')
# @login_required
def search_results(result):
    bob = create_bob('*', '*', '*', '*', '*', '*')
    bobby = []
    current_page = request.args.get('page', 1, type=int)
    per_page = 10
    end = (current_page * per_page) + 1
    if end > len(bob):
        end = len(bob)
    start = ((current_page - 1) * per_page) + 1
    sort_bob = sorted(bob, key=lambda v: (v.issue_type, v.publisher, v.sort, v.character, v.volume, v.issues,
                                          v.publication_date))
    if datetime.strptime(sort_bob[0][7], '%B, %Y') >= datetime.now():
        sort_bob = sorted(sort_bob, key=lambda v: (v.publication_date, '%B, %Y'))
    for result in bob[start:end]:
        if result.bob.series_title == str.find(''):
            bobby.append(result)
    next = str(current_page + 1) if end < len(bob) else '0'
    prev = str(current_page - 1)
    if not result:
        flash('No results found!')
        return redirect('make_search')
    else:
        # display results
        return render_template('search.html', bobby=bobby, header=original_header, next=next, prev=prev)
Here is the form:
from wtforms import Form, StringField, SelectField


class ComicSearchForm(Form):
    choices = [('Series Title', 'Series Title'),
               ('Author', 'Author'),
               ('Artist', 'Artist'),
               ('Publisher', 'Publisher'),
               ('Publication Date', 'Publication Date')]
    select = SelectField('Search for comics:', choices=choices)
    search = StringField('')
I am stuck trying to figure out:
for result in bob[start:end]:
    if result.bob.series_title == str.find(''):
        bobby.append(result)
This is where I currently think the code needs work. I would love some ideas on where I need to go to make this work.
Thanks,
Zach
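The comparison result.bob.series_title == str.find('') does not actually search for anything. A sketch of one way to write that loop, assuming the submitted ComicSearchForm (called search in the first view) is made available to search_results, and noting that the attribute names in field_map are guesses based on the sort key used above, so adjust them to whatever create_bob() actually returns:

# Sketch only: field_map values are hypothetical attribute names.
term = (search.search.data or '').strip().lower()
field_map = {
    'Series Title': 'series_title',
    'Author': 'author',
    'Artist': 'artist',
    'Publisher': 'publisher',
    'Publication Date': 'publication_date',
}
attr = field_map.get(search.select.data, 'series_title')

bobby = []
for result in bob[start:end]:
    value = str(getattr(result, attr, '')).lower()
    if term and term in value:
        bobby.append(result)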
I use Scrapy to crawl a page which contains a list of items, and I save each item in a MySQL database.
But the problem is that the spider closes before all the items are stored in MySQL. Each time I run the spider the result count is different.
Could you please help me figure out how to solve this?
Below is my sample code:
Spider
import re

import requests
import scrapy
from scrapy.selector import Selector

# XiaoquItem is defined in the project's items module (shown below under Item)


class FutianSpider(scrapy.Spider):
    name = 'futian_soufang'
    allowed_domain = ["fang.com"]
    start_urls = []

    def __init__(self, category=None, *args, **kwargs):
        self.count = 0
        pass

    def closed(self, reason):
        print "*" * 20 + str(self.count)

    def start_requests(self):
        url = "http://fangjia.fang.com/pghouse-c0sz/a085-h321-i3{}/"
        response = requests.get(url.format(1))
        response.encoding = 'gb2312'
        strpages = Selector(text=response.text).xpath('//p[contains(@class, "pages")]/span[last()]/a/text()').extract()
        # print response.text
        pages = int(strpages[0])
        for num in range(1, pages + 1):
            yield scrapy.Request(url.format(num), callback=self.parse_page)

    def parse_page(self, response):
        houses = response.xpath("//div[@class='list']//div[@class='house']")
        for house in houses:
            # house = Selector(house.decode("UTF-8", 'ignore'))
            self.count += 1
            housespan_hyperlink = house.xpath(".//span[@class='housetitle']/a")
            house_title = housespan_hyperlink.xpath("text()").extract()[0].strip()
            house_link_rel = housespan_hyperlink.xpath("@href").extract()[0].strip()
            house_link = response.urljoin(house_link_rel)
            # if isinstance(house_link_rel, list) and len(house_link_rel) > 0:
            #     house_link = response.urljoin(house_link_rel)
            address = house.xpath(".//span[@class='pl5']/text()").extract()[0].strip()
            esf_keyword = u'二手房'
            esf_span = house.xpath(".//span[contains(text(),'%s')]" % (esf_keyword))
            esf_number = esf_span.xpath("./a/text()").extract()[0].strip()
            esf_number = int(re.findall(r"\d+", esf_number)[0])
            esf_link = esf_span.xpath("./a/@href").extract()[0].strip()
            zf_hyperlink = house.xpath(".//span[@class='p110']/a")
            zf_number = zf_hyperlink.xpath("text()").extract()[0].strip()
            zf_number = int(re.findall(r"\d+", zf_number)[0])
            zf_link = zf_hyperlink.xpath("@href").extract()[0].strip()
            price = 0
            try:
                price = int(house.xpath(".//span[@class='price']/text()").extract()[0].strip())
            except:
                pass
            change = 0.0
            try:
                increase_span = house.xpath(".//span[contains(@class, 'nor')]")
                changetext = increase_span.xpath("text()").extract()[0].strip()
                change = float(changetext[:changetext.index('%')])
                if len(increase_span.css(".green-down")) > 0:
                    change *= -1
            except:
                pass
            print house_title, house_link, address, esf_number, esf_link, zf_number, zf_link, price, change
            item = XiaoquItem(
                title=house_title,
                url=house_link,
                address=address,
                esf_number=esf_number,
                esf_link=esf_link,
                zf_number=zf_number,
                zf_link=zf_link,
                price=price,
                change=change
            )
            yield item
Item:
from scrapy.item import Item, Field


class XiaoquItem(Item):
    # define the fields for your item here like:
    title = Field()
    url = Field()
    address = Field()
    esf_number = Field()
    esf_link = Field()
    zf_number = Field()
    zf_link = Field()
    price = Field()
    change = Field()
Pipeline:
import datetime

from scrapy import log
from scrapy.utils.project import get_project_settings
from twisted.enterprise import adbapi

# XiaoquItem and PlotMonthlyPriceItem come from the project's items module


class MySQLPipeLine(object):
    def __init__(self):
        settings = get_project_settings()
        dbargs = settings.get('DB_CONNECT')
        db_server = settings.get('DB_SERVER')
        dbpool = adbapi.ConnectionPool(db_server, **dbargs)
        self.dbpool = dbpool

    def close_spider(self, spider):
        self.dbpool.close()

    def process_item(self, item, spider):
        if isinstance(item, XiaoquItem):
            self._process_plot(item)
        elif isinstance(item, PlotMonthlyPriceItem):
            self._process_plot_price(item)
        return item

    def _process_plot(self, item):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item)
        # query.addBoth(lambda _: item)

    def _conditional_insert(self, conn, item):
        # create the record if it doesn't exist.
        # this whole block runs in its own thread
        conn.execute("select * from houseplot where title = %s", (item["title"],))
        result = conn.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            conn.execute(
                "insert into houseplot(title, url, address, esf_number, esf_link, zf_number, zf_link, price, price_change, upsert_time) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (item["title"], item["url"], item["address"], int(item["esf_number"]), item["esf_link"], item["zf_number"], item["zf_link"], item["price"], item["change"], datetime.datetime.now())
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def _handle_error(self, failure, item):
        # the errback is registered with the item as an extra argument
        log.err(failure)

    def _process_plot_price(self, item):
        query = self.dbpool.runInteraction(self._conditional_insert_price, item)
        query.addErrback(self._handle_error, item)

    def _conditional_insert_price(self, conn, item):
        # create the record if it doesn't exist.
        # this whole block runs in its own thread
        conn.execute("select * from houseplot_monthly_price where title = %s and price_date= %s", (item["title"], item["price_date"]))
        result = conn.fetchone()
        if result:
            log.msg("Price Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            conn.execute(
                "insert into houseplot_monthly_price(title, price_date, price) values (%s, %s, %s)", (item["title"], item["price_date"], item["price"])
            )
            log.msg("Price Item stored in db: %s" % item, level=log.DEBUG)
I'm trying to run the spider found in this crawler, and for simplicity's sake I'm using this start_url because it is just a list of 320 movies (so the crawler won't run for 5 hours as stated on the GitHub page).
I crawl using scrapy crawl imdb -o output.json, but the output.json file contains nothing. It has just a [ in it.
import scrapy
from texteval.items import ImdbMovie, ImdbReview
import urlparse
import math
import re


class ImdbSpider(scrapy.Spider):
    name = "imdb"
    allowed_domains = ["imdb.com"]
    start_urls = [
        # "http://www.imdb.com/chart/top",
        # "http://www.imdb.com/chart/bottom"
        "http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
    ]
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.contrib.downloadermiddleware.robotstxt.ROBOTSTXT_OBEY': True,
    }
    base_url = "http://www.imdb.com"

    def parse(self, response):
        movies = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href")
        for i in xrange(len(movies)):
            l = self.base_url + movies[i].extract()
            print l
            request = scrapy.Request(l, callback=self.parse_movie)
            yield request
        next = response.xpath("//*[@id='right']/span/a")[-1]
        next_url = self.base_url + next.xpath(".//@href")[0].extract()
        next_text = next.xpath(".//text()").extract()[0][:4]
        if next_text == "Next":
            request = scrapy.Request(next_url, callback=self.parse)
            yield request
        '''
        for sel in response.xpath("//table[@class='chart']/tbody/tr"):
            url = urlparse.urljoin(response.url, sel.xpath("td[2]/a/@href").extract()[0].strip())
            request = scrapy.Request(url, callback=self.parse_movie)
            yield request
        '''

    def parse_movie(self, response):
        movie = ImdbMovie()
        i1 = response.url.find('/tt') + 1
        i2 = response.url.find('?')
        i2 = i2 - 1 if i2 > -1 else i2
        movie['id'] = response.url[i1:i2]
        movie['url'] = "http://www.imdb.com/title/" + movie['id']
        r_tmp = response.xpath("//div[@class='titlePageSprite star-box-giga-star']/text()")
        if r_tmp is None or r_tmp == "" or len(r_tmp) < 1:
            return
        movie['rating'] = int(float(r_tmp.extract()[0].strip()) * 10)
        movie['title'] = response.xpath("//span[@itemprop='name']/text()").extract()[0]
        movie['reviews_url'] = movie['url'] + "/reviews"
        # Number of reviews associated with this movie
        n = response.xpath("//*[@id='titleUserReviewsTeaser']/div/div[3]/a[2]/text()")
        if n is None or n == "" or len(n) < 1:
            return
        n = n[0].extract().replace("See all ", "").replace(" user reviews", "")\
            .replace(" user review", "").replace(",", "").replace(".", "").replace("See ", "")
        if n == "one":
            n = 1
        else:
            n = int(n)
        movie['number_of_reviews'] = n
        r = int(math.ceil(n / 10))
        for x in xrange(1, r):
            start = x * 10 - 10
            url = movie['reviews_url'] + "?start=" + str(start)
            request = scrapy.Request(url, callback=self.parse_review)
            request.meta['movieObj'] = movie
            yield request

    def parse_review(self, response):
        ranks = response.xpath("//*[@id='tn15content']/div")[0::2]
        texts = response.xpath("//*[@id='tn15content']/p")
        del texts[-1]
        if len(ranks) != len(texts):
            return
        for i in xrange(0, len(ranks) - 1):
            review = ImdbReview()
            review['movieObj'] = response.meta['movieObj']
            review['text'] = texts[i].xpath("text()").extract()
            rating = ranks[i].xpath(".//img[2]/@src").re("-?\\d+")
            if rating is None or rating == "" or len(rating) < 1:
                return
            review['rating'] = int(rating[0])
            yield review
Can someone tell me where I am going wrong?
In my opinion, this web site loads the list of movies using JavaScript. First, I suggest you check the output of: movies = response.xpath("//*[@id='main']/table/tr/td[3]/a/@href"). If you want to get JavaScript-rendered content, you can use WebKit in Scrapy as a downloader middleware.
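A quick way to do that check, as a sketch (it assumes the requests and parsel packages are installed, and it only shows what the server returns without JavaScript, which is what Scrapy's downloader sees):

# Fetch the page the way Scrapy's downloader would (no JavaScript executed)
# and test whether the spider's selector matches anything in the raw HTML.
import requests
from parsel import Selector

url = "http://www.imdb.com/search/title?countries=csxx&sort=moviemeter,asc"
html = requests.get(url).text
sel = Selector(text=html)

links = sel.xpath("//*[@id='main']/table/tr/td[3]/a/@href").extract()
print(len(links))   # 0 means parse() never yields a movie request
print(links[:5])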