Dynamically create REST test steps in Groovy - JSON

I have some JSON files that I want to use as input. Let's say I have these folders:
mainFolder --> Folder 1 : 10 JSON files (requests)
           --> Folder 2 : 10 JSON files (requests)
From these folders I want to create the following: each directory becomes a testCase, and each file becomes a testStep.
Here's my code:
import com.eviware.soapui.impl.wsdl.teststeps.registry.GroovyScriptStepFactory
import com.eviware.soapui.support.UISupport;
import com.eviware.soapui.impl.wsdl.teststeps.registry.RestRequestStepFactory
import com.eviware.soapui.config.TestStepConfig
import com.eviware.soapui.impl.rest.*;

def myTestCase = context.testCase
log.info myTestCase

def projectPath = Path
def endPoint = "anEndPoint";

def addTestStep(operation, requestFile, testCase, projectPath, endpoint) {
    def usageId = requestFile.name.replace("_request.json", "")
    def projectPathTest = projectPath + "SPecificPath";
    def testStepName = usageId;
    def iface = testCase.testSuite.project.getInterfaceList()[0];
    def operationName = operation;
    def op = iface.operations[operationName];
    def config = com.eviware.soapui.impl.wsdl.teststeps.registry.RestRequestStepFactory.createConfig( op, testStepName );
    def newTestStep = testCase.addTestStep( config );
    newTestStep.getTestRequest().setRequestContent(requestFile.text)
    newTestStep.httpRequest.endpoint = endpoint
}

if ( com.eviware.soapui.support.UISupport.confirm("Reconstruct ?", "Confirm") ) {
    testSuite.getTestCaseList().each { testCase -> testSuite.removeTestCase(testCase) }
    new File(projectPathTest).eachDir { dir ->
        operation = dir.name
        def RestTestCase = testSuite.addNewTestCase(operation)
        RestTestCase.setFailOnError(false)
        dir.eachFileMatch(~/.*_request\.json/) { file ->
            addTestStep(operation, file, RestTestCase, projectPath, endPoint)
        }
    }
}
I have checked this many times, across many pages and forums, and the form and algorithm seem correct for what I want. Yet, while I do manage to create the testCases with the folder names and to read the request JSON files, I fail to create the test steps, and I'm fairly sure it's either the config or the interface/operation lookup that makes it fail:
def config = com.eviware.soapui.impl.wsdl.teststeps.registry.RestRequestStepFactory.createConfig( op, testStepName );
Any help please?

Related

How to load models into Spark-nlp in Foundry code authoring

I'm trying to load a model from the Spark-nlp Model Hub into a Universal Sentence Encoder model, as shown in the snippet below:
stages = []

documentAssembler = [
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
]
stages += documentAssembler

# Where file_path is the location of extracted files of the model
use = [
    UniversalSentenceEncoder.load(f"{file_path}")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
]
stages += use
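For context, the stages list is later assembled into a standard Spark ML pipeline, roughly like the sketch below (simplified on my side; df stands for my input DataFrame with a "description" column):

from pyspark.ml import Pipeline

# Assumed downstream usage (not the failing part): build and run the pipeline
pipeline = Pipeline(stages=stages)
model = pipeline.fit(df)        # df is a DataFrame with a "description" column
embedded = model.transform(df)  # adds the "sentence_embeddings" column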
The model is downloadable from [Spark-nlp Model Hub - Universal Sentence Encoder](https://nlp.johnsnowlabs.com/2020/04/17/tfhub_use.html/). The function I use to extract the downloaded model zip file is below:
import os
import shutil
import tempfile
import zipfile


def get_unzipped_path(dataset_with_model_files):
    '''
    Return a folder location for a temporary folder after reading file locations
    encapsulated within a dataframe
    Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    '''
    input_filesystem = dataset_with_model_files.filesystem()
    full_folder_path = input_filesystem.hadoop_path
    files = [file_status.path for file_status in input_filesystem.ls('*.zip')]
    file_name = files[0]
    with dataset_with_model_files.filesystem().open(file_name, mode='rb') as f:
        with tempfile.NamedTemporaryFile() as tmp:
            shutil.copyfileobj(f, tmp)
            tmp.flush()
            # file_stem = Path(os.path.basename(file_name)).stem # Remove .zip
            unpacked_path = os.path.join(full_folder_path, file_name)
            with zipfile.ZipFile(tmp) as zip_ref:
                zip_ref.extractall(unpacked_path)
    return unpacked_path
The general problem is that, although the files exist in the temporary folder, they cannot be found; a sample error is provided below:
java.io.FileNotFoundException: File file:/data/ssd/01/palantir/foundry~node-manager/data/local-dir/usercache/palantir/appcache/application_1675144414001_39656/container_e103_1675144414001_39656_01_000002/tfhub_use_en_2.4.0_2.4_1587136330099/metadata/part-00000 does not exist
I have tried the solution in the code I've provided, i.e. extracting to a temporary folder and then feeding that path to the load function, and also using spark.sql(f"ADD ARCHIVE {zipped_file}") to add the archive, which I believe is then unpacked and made available via SparkFiles.get(f"{file_name}"), as shown below:
def get_archive_file_loc(dataset_with_model_files, spark, glob):
    '''
    Return a file name after submitting archive to the Spark Context
    Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    '''
    input_filesystem = dataset_with_model_files.filesystem()
    full_folder_path = os.getcwd()
    all_file_names = [
        f"{f.path}" for f in input_filesystem.ls(glob=glob)
    ]
    file_name = all_file_names[0]
    with dataset_with_model_files.filesystem().open(file_name, mode='rb') as f:
        zipped_file = os.path.join(full_folder_path, file_name)
        with open(zipped_file, 'wb') as zf:
            shutil.copyfileobj(f, zf)
    # Check if file has been added to spark context as duplicate
    # attempts to add can crash the executors
    # if file_name not in os.listdir(SparkFiles.getRootDirectory()):
    try:
        spark.sql(f"ADD ARCHIVE {zipped_file}")  # noqa
    except Exception as e:
        pass
    return file_name
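For reference, this is roughly how I expect the added archive to be picked up afterwards; the path handling here is an assumption on my part and not something I have confirmed in Foundry:

from pyspark import SparkFiles
from sparknlp.annotator import UniversalSentenceEncoder

# file_name is the value returned by get_archive_file_loc above.
# Assumption: after ADD ARCHIVE, the archive is unpacked under the SparkFiles root.
unpacked_dir = SparkFiles.get(file_name)
use = (
    UniversalSentenceEncoder.load(unpacked_dir)
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)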
Has anyone been successful or has an idea of how to load the model in Spark-nlp?

This XML file does not appear to have any style information associated with it when trying to download a file from Google Cloud Storage

I have a Cloud Function that copies a file from a standard bucket to a Nearline bucket. I also tried saving the file by opening it as a DataFrame and writing it out as a Dask DataFrame. Both approaches work, but every time I try to download the file through the GUI I get the XML error message shown below. Does anyone know why this is happening? How can I prevent it?
This XML file does not appear to have any style information associated with it
import base64
import json
from google.cloud import storage
import dask.dataframe as dd
import pandas as pd


def hello_pubsub(event, context):
    """Triggered from a message on a Cloud Pub/Sub topic.
    Args:
        event (dict): Event payload.
        context (google.cloud.functions.Context): Metadata for the event.
    """
    print('here')
    print(event)
    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    payload = json.loads(pubsub_message)
    bucket_name = payload['data']['bucket_name']
    print(bucket_name)
    blob_name = payload['data']['file_name']
    print(blob_name)
    destination_bucket_name = 'infobip-email-uploaded'
    # destination_blob_name = blob_name[0:10]+'.csv'
    destination_blob_name = 'ddf-*.csv'
    df = pd.read_excel('gs://'+bucket_name+'/'+blob_name, sheet_name='Data', engine='xlrd')
    print('excel has been read')
    ddf = dd.from_pandas(df, npartitions=1, sort=True)
    print('dataframe has been transformed into dask')
    path = 'gs://' + destination_bucket_name + '/' + destination_blob_name
    print('path is')
    print(path)
    ddf.to_csv(path, index=False, sep=',', header=False)
    destination_blob_name = blob_name[0:10]+'.xlsx'
    copy_blob(bucket_name, blob_name, destination_bucket_name, destination_blob_name)
    print('File has been successfully copied')
    delete_blob(bucket_name, blob_name)
    print('File has been successfully deleted')
    return '200'
def copy_blob(bucket_name, blob_name, destination_bucket_name, destination_blob_name):
    """Copies a blob from one bucket to another with a new name."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    # destination_bucket_name = "destination-bucket-name"
    # destination_blob_name = "destination-object-name"
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.bucket(destination_bucket_name)
    blob_copy = source_bucket.copy_blob(
        source_blob, destination_bucket, destination_blob_name
    )
    print(
        "Blob {} in bucket {} copied to blob {} in bucket {}.".format(
            source_blob.name,
            source_bucket.name,
            blob_copy.name,
            destination_bucket.name,
        )
    )
def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.delete()
    print("Blob {} deleted.".format(blob_name))

Is it possible to send a request to a Django REST API to run a script?

I installed Django and Django REST Framework. I want to send some data to the REST API; the API should take the data, run a script with it, get a result, and then send that result back to me.
There won't be any database usage.
Like this, request: http://testerapi.com:8000/search?q=title:xfaster564CertVal9body:A%22&fl=id
Response: {validation : true}
Is it possible?
Yes, it is possible! I will answer with an API function-based view.
Let's suppose that the worker function to call when the API is hit (GET or POST) lives in the utilities.py file, alongside models.py, serializers.py and views.py.
utilities.py
def my_worker(a, b=0, c=0):
    # do something with a, b, c
    return a + b + c > 10
models.py
from datetime import datetime


class User(object):
    def __init__(self, email, name, created=None):
        self.email = email
        self.name = name
        self.created = created or datetime.now()
serializers.py
I use a plain Serializer here, but I think a ModelSerializer is better.
from rest_framework import serializers


class UserSerializer(serializers.Serializer):
    # initialize fields
    email = serializers.EmailField()
    name = serializers.CharField(max_length=200)
    created = serializers.DateTimeField()
views.py
from django.http import JsonResponse
from django.views.decorators.csrf import csrf_exempt  # Allow request without csrf_token set
from rest_framework.decorators import api_view

from .models import User
from .serializers import UserSerializer
# Import my_worker from .utilities
from .utilities import my_worker


@csrf_exempt
@api_view(['GET'])  # Only GET requests are allowed
def user_worker(request, a, b, c):
    """
    Do something with a, b and c
    """
    if request.method == 'GET':
        # Illustrative only (see the note below): User is a plain class here,
        # not a Django model, so it has no .objects manager.
        # users = User.objects.all()
        # serializer = UserSerializer(users, many=True)

        # Call the utilities script here
        result = my_worker(a, b, c)
        if result:  # a + b + c > 10
            return JsonResponse({"validation": "true"}, safe=False)
        else:
            return JsonResponse({"validation": "false"}, safe=False)
Note that I don't actually use the UserSerializer; it is only shown as an example.
You can then execute a more complex function in place of my_worker.
Adapt it according to your needs.
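To wire the view up, a minimal urls.py could look like the sketch below. The route and the integer converters are just an example of mine; if you prefer query-string parameters as in your sample URL, read them from request.GET inside the view instead.

# urls.py (example wiring, adjust to your project)
from django.urls import path
from . import views

urlpatterns = [
    # e.g. GET /worker/5/4/3/ calls user_worker(request, a=5, b=4, c=3)
    path('worker/<int:a>/<int:b>/<int:c>/', views.user_worker, name='user-worker'),
]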

Scrapinghub puts my results in the log and not in items

I have a functioning spider project that extracts URL content (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up to run on Scrapinghub for a long scraping run.
So far, I am able to get the spider uploaded and working on Scrapinghub. My problem is that the results appear in the 'log' and not under 'items'. The amount of data exceeds the log capacity and thus gives me an error.
How can I set up my pipelines/extractor to return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    name = "Data_2018"

    def url_values(self):
        time = list(range(1538140980, 1538140820, -60))
        return time

    def start_requests(self):
        allowed_domains = ["https://website.net"]
        list_urls = []
        for n in self.url_values():
            list_urls.append("https://website.net/.../.../.../all/{}".format(n))
        for url in list_urls:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        data = response.body
        items = positionsItem()
        items['file'] = data
        yield items
The pipeline:
class positionsPipeline(object):
    def process_item(self, item, spider):
        return item
The settings:
BOT_NAME = 'Positions'
SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'
USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10
SPIDER_MIDDLEWARES = {
    'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'Positions.pipelines.positionsPipeline': 300,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The item:
class positionsItem(scrapy.Item):
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for the feed export to work. If you just want to write the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a JSON file to an S3 bucket:
# Remote S3 .json version
AWS_ACCESS_KEY_ID = YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = YOUR_AWS_SECRET_ACCESS_KEY
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'
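On newer Scrapy versions (2.1 and later), the same configuration can be expressed with the single FEEDS setting instead of FEED_URI/FEED_FORMAT; the paths below are the same placeholders as above:

# FEEDS-style equivalent (Scrapy >= 2.1); paths are placeholders
FEEDS = {
    'file://NAME_OF_FILE_PATH.csv': {'format': 'csv'},
    's3://BUCKET_NAME/NAME_OF_FILE_PATH.json': {'format': 'json'},
}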

Failing to store data in a CSV file when scraping

I am trying to scrape a webpage, extract data, and then store all of it in a CSV file. Before adding the ScrapeCallback class and calling it, everything works fine. However, after adding the new class, nothing except the headers is stored in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html


class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())