How to load models into Spark-nlp in Foundry code authoring - palantir-foundry

I'm trying to load a model from Spark-nlp model hub into a Universal Sentence Encoder Model as shown in the snippet below:
# Pipeline stages are collected in order: document assembly first, then the
# sentence encoder that consumes the assembler's "document" column.
stages = []

documentAssembler = [
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
]
stages += documentAssembler

# Where file_path is the location of the extracted files of the model
use = [
    UniversalSentenceEncoder.load(f"{file_path}")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
]
stages += use
The model is downloadable from [Spark-nlp Model Hub - Universal Sentence Encoder](https://nlp.johnsnowlabs.com/2020/04/17/tfhub_use.html/). The function I use to extract the downloaded model zip file is below:
def get_unzipped_path(dataset_with_model_files):
    '''
    Unpack the first ``*.zip`` file of the dataset and return the target folder.

    The archive is streamed into a local ``NamedTemporaryFile`` and extracted
    into ``os.path.join(<filesystem hadoop_path>, <zip file name>)``; that
    joined path is what gets returned.
    Raises IndexError when the dataset lists no ``*.zip`` file.
    Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    '''
    input_filesystem = dataset_with_model_files.filesystem()
    zip_names = [status.path for status in input_filesystem.ls('*.zip')]
    first_zip = zip_names[0]
    # NOTE(review): the extraction target is built from hadoop_path, not from
    # a local temp directory - confirm this is the intended location.
    unpacked_path = os.path.join(input_filesystem.hadoop_path, first_zip)

    with dataset_with_model_files.filesystem().open(first_zip, mode='rb') as src, \
            tempfile.NamedTemporaryFile() as tmp:
        shutil.copyfileobj(src, tmp)
        # Push buffered bytes to disk before the archive is read back.
        tmp.flush()
        with zipfile.ZipFile(tmp) as archive:
            archive.extractall(unpacked_path)

    return unpacked_path
The general problem is that although the extracted files exist in the temporary folder, they cannot be found. A sample error is provided below:
java.io.FileNotFoundException: File file:/data/ssd/01/palantir/foundry~node-manager/data/local-dir/usercache/palantir/appcache/application_1675144414001_39656/container_e103_1675144414001_39656_01_000002/tfhub_use_en_2.4.0_2.4_1587136330099/metadata/part-00000 does not exist
I have tried: (1) the solution in the code I've provided, i.e. extracting to a temporary folder and then feeding that path to the load function; and
(2) using spark.sql(f"ADD ARCHIVE {zipped_file}") to add the archive, which I believe is unpacked and made available by SparkFiles.get(f"{file_name}"), as shown below:
def get_archive_file_loc(dataset_with_model_files, spark, glob):
    '''
    Copy the first matching dataset file locally and register it with Spark.

    The first file returned by ``filesystem().ls(glob=glob)`` is copied into
    the current working directory and submitted with ``ADD ARCHIVE``.

    Returns the dataset-relative file name (not the local path).
    Raises IndexError when no file matches ``glob``.
    Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    '''
    # Local import keeps this snippet self-contained.
    import logging

    input_filesystem = dataset_with_model_files.filesystem()
    full_folder_path = os.getcwd()
    all_file_names = [f"{f.path}" for f in input_filesystem.ls(glob=glob)]
    file_name = all_file_names[0]
    zipped_file = os.path.join(full_folder_path, file_name)

    # Finish (and close) the local copy before Spark is asked to read it.
    with input_filesystem.open(file_name, mode='rb') as f, \
            open(zipped_file, 'wb') as zf:
        shutil.copyfileobj(f, zf)

    # Adding the same archive twice can fail, so this stays best-effort,
    # but the failure is logged instead of being silently discarded.
    try:
        spark.sql(f"ADD ARCHIVE {zipped_file}")  # noqa
    except Exception:
        logging.getLogger(__name__).warning(
            "ADD ARCHIVE failed for %s", zipped_file, exc_info=True
        )
    return file_name
Has anyone been successful or has an idea of how to load the model in Spark-nlp?

Related

How to parse 2 json files in Apache beam

I have 2 JSON configuration files to read and want to assign their values to variables. I am creating a Dataflow job using Apache Beam but am unable to parse those files and assign their values to variables.
config1.json - { "bucket_name": "mybucket"}
config2.json - { "dataset_name": "mydataset"}
These are the pipeline statements. I tried with one JSON file first, but even that is not working:
# Leaving the `with` block runs the pipeline and waits for it to finish, so
# no explicit pipeline.run() / wait_until_finish() follows (calling run()
# as well would execute the pipeline a second time).
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (
        pipeline
        | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
        | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser(custom_options.configfile))
        | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
    )
I also tried creating a function to parse at least one file. This is a sample class I created, but it did not work.
class custom_json_parser(beam.DoFn):
    """DoFn that re-reads the configured JSON file from GCS for each element.

    NOTE(review): ``process`` neither returns nor yields anything, so this
    step emits no output elements; ``bucket`` and ``dataset`` are only
    assigned locally.
    """

    def __init__(self, configfile):
        # Path of the JSON config file that ``process`` opens.
        self.configfile = configfile

    def process(self, configfile):
        # Imports live inside the method: names imported in a class body are
        # class attributes and are not visible as bare names in methods.
        import json
        import logging
        from apache_beam.io.gcp import gcsio

        logging.info("JSON PARSING STARTED")
        with gcsio.GcsIO().open(self.configfile, 'r') as f:
            for line in f:
                data = json.loads(line)
                bucket = data.get('bucket_name')
                dataset = data.get('dataset_name')
Can someone please suggest the best method to resolve this issue in apache beam?
Thanks in Advance
If you only need to read your files once, don't read them inside the pipeline; read them before running it:
Read the files from GCS
Parse the file and put the useful content in the pipeline options map
Run your pipeline and use the data from the options
EDIT 1
You can use this piece of code to load the file and read it, before your pipeline. Simple Python, standard GCS libraries.
from google.cloud import storage
import json
# Fetch the JSON blob from the bucket and decode it to text.
client = storage.Client()
bucket = client.get_bucket('your-bucket')
blob = bucket.get_blob("name.json")
raw_payload = blob.download_as_string()
json_data = raw_payload.decode('UTF-8')
print(json_data)  # print -> {"name": "works!!"}

# Parse the text and read a single value out of it.
parsed_config = json.loads(json_data)
print(parsed_config["name"])  # print -> works!!
You can try following code snippet: -
Function to Parse File
class custom_json_parser(beam.DoFn):
    """DoFn that parses one JSON text line into a bucket/dataset dict."""

    def process(self, element):
        """Log the raw element, decode it, and emit a single summary dict.

        Keys missing from the JSON object come through as ``None``.
        """
        logging.info(element)
        record = json.loads(element)
        summary = {
            "bucket": record.get('bucket_name'),
            "dataset": record.get('dataset_name'),
        }
        return [summary]
Over Pipeline you can call function
# Leaving the `with` block runs the pipeline and waits for it to finish, so
# no explicit pipeline.run() / wait_until_finish() follows (calling run()
# as well would execute the pipeline a second time).
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (
        pipeline
        | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
        | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser())
        | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
    )
It will work.

How can i extract information quickly from 130,000+ Json files located in S3?

I have an S3 bucket with over 130k JSON files, and I need to calculate numbers based on the data in those files (for example, the number of speakers of each gender). I am currently using the S3 paginator and json.loads to read each file and extract information from it, but it takes a very long time to process such a large number of files (2-3 files per second). How can I speed up the process? Please provide working code examples if possible. Thank you.
here is some of my code:
client = boto3.client('s3')
# Build the resource and bucket handle once; recreating them for every key
# adds avoidable per-object overhead.
s3 = boto3.resource('s3')
bucket = s3.Bucket('bucket-name')

paginator = client.get_paginator('list_objects_v2')
result = paginator.paginate(Bucket='bucket-name', StartAfter='')
for page in result:
    # A page without objects has no "Contents" key.
    for key in page.get("Contents", []):
        keyString = key["Key"]
        content_object = bucket.Object(str(keyString))
        file_content = content_object.get()['Body'].read().decode('utf-8')
        json_content = json.loads(file_content)
        x = (json_content['dict-name'])
In order to use the code below, I'm assuming you understand pandas (if not, you may want to get to know it). Also, it's not clear whether your 2-3 files per second covers only the read or also includes part of the number crunching; nonetheless, multiprocessing will speed this up dramatically. The gist is to read all the files in (as dataframes), concatenate them, then do your analysis.
To be useful for me, I run this on spot instances that have lots of vCPUs and memory. I've found the instances that are network optimized (like c5n - look for the n) and the inf1 (for machine learning) are much faster at reading/writing than T or M instance types, as examples.
My use case is reading 2000 'directories' with roughly 1200 files in each and analyzing them. The multiprocessing approach is orders of magnitude faster than reading the files one at a time in a single process.
File 1: your main script
# create script.py file
import os
from multiprocessing import Pool
from itertools import repeat
import pandas as pd
import json
from utils_file_handling import *
ufh = file_utilities() #instantiate the class functions - see below (second file)
bucket = 'your-bucket'
prefix = 'your-prefix/here/' # if you don't have a prefix pass '' (empty string or function will fail)
#define multiprocessing function - get to know this to use multiple processors to read files simultaneously
def get_dflist_multiprocess(keys_list, num_proc=4, chunksize=15):
    """Read every key in ``keys_list`` in parallel and return the dataframes.

    Args:
        keys_list: S3 object keys, each read from the module-level ``bucket``.
        num_proc: number of worker processes in the pool.
        chunksize: how many keys are handed to a worker at a time.

    Returns:
        A list with one ``ufh.reader_json`` result per key, in input order.
    """
    # starmap blocks until every result is in and the `with` block tears the
    # pool down on exit, so no explicit close()/join() is needed.
    with Pool(num_proc) as pool:
        return pool.starmap(
            ufh.reader_json, zip(repeat(bucket), keys_list), chunksize
        )
# Guard the driver code: with the "spawn" start method (Windows, macOS) each
# worker re-imports this module, and unguarded code would start pools again.
if __name__ == "__main__":
    # create your master keys list upfront; you can loop through all or
    # slice the list to test
    keys_list = ufh.get_keys_from_prefix(bucket, prefix)
    # keys_list = keys_list[0:2000]  # as an example

    # number of processors on this machine; the function above defaults to
    # 4 unless told otherwise
    num_proc = os.cpu_count()

    # collect one dataframe per file
    df_list = get_dflist_multiprocess(keys_list, num_proc=num_proc)
    df_new = pd.concat(df_list, sort=False)
    df_new = df_new.reset_index(drop=True)
    # do your analysis on the dataframe
File 2: class functions
#utils_file_handling.py
# create this in a separate file; name as you wish but change the import in the script.py file
import boto3
import json
import pandas as pd
#define client and resource
s3sr = boto3.resource('s3')
s3sc = boto3.client('s3')
class file_utilities:
    """Helpers for listing S3 keys and loading JSON objects as dataframes."""

    def get_keys_from_prefix(self, bucket, prefix):
        '''Return the list of object keys under ``prefix`` in ``bucket``.'''
        keys_list = []
        paginator = s3sr.meta.client.get_paginator('list_objects_v2')
        # use Delimiter to limit search to that level of hierarchy
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
            # A page with no objects has no 'Contents' entry at all.
            keys = [content['Key'] for content in page.get('Contents', [])]
            print('keys in page: ', len(keys))
            keys_list.extend(keys)
        return keys_list

    def read_json_file_from_s3(self, bucket, key):
        """Return the body of ``key`` in ``bucket`` decoded as UTF-8 text."""
        obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
        return obj['Body'].read().decode('utf-8')

    # you may need to tweak this for your ['dict-name'] example
    def reader_json(self, bucket, key):
        '''Return a dataframe built from the ``'dict-name'`` entry of the file.'''
        payload = json.loads(self.read_json_file_from_s3(bucket, key))
        return pd.DataFrame(payload['dict-name'])

Flask form does not let me upload TSV files

I have the following flask app where I want to be able to upload a TXT or TSV file to a form. The problem is, when I try to upload a TXT file, it works, but when I try to upload a TSV file, I get the following error:
File "/Users/cdastmalchi/Desktop/author_script/main.py", line 89, in process_file
if not places_exist(os.path.join(app.config['UPLOAD_FOLDER'], filename)):
File "/Users/cdastmalchi/Desktop/author_script/main.py", line 27, in places_exist
infile = open(filename, 'rU')
IOError: [Errno 2] No such file or directory: './Authors_Template.tsv'
Authors_Template.tsv is a template file that gets downloaded from the form and goes into the Downloads, and then I want users to be able to edit this template and then re-upload it. When I make the template Authors_Template.txt instead and then Download and re-upload it, it works. How can I solve this problem? I've even tried narrowing down the ALLOWED_EXTENSIONS list to just TSV and I still get the same issue.
app.py
from werkzeug.utils import secure_filename
import flask, string, random
import json
import subprocess
import os
import re
import time
UPLOAD_FOLDER = '.'
ALLOWED_EXTENSIONS = set(['txt','tsv'])
app = flask.Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.secret_key = ''.join(random.choice(string.ascii_letters) for _ in range(20)) #needed to use flask.session
def allowed_file(filename):
    """Return True when ``filename`` has an extension in ALLOWED_EXTENSIONS.

    The extension is compared case-insensitively, so ``data.TSV`` is
    accepted just like ``data.tsv``. Names without a dot are rejected.
    """
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
def places_exist(filename):
    """Read a tab-separated file and check that its header columns exist.

    The first two columns of every row are collected; the first column must
    contain the value ``'Place'`` and the second ``'Address'``.

    Returns:
        Always True in this snippet.
        NOTE(review): the collected places/addresses are never compared here;
        the actual check appears to have been omitted from the posted code.

    Raises:
        IOError/OSError: if ``filename`` cannot be opened.
        IndexError: if a row has fewer than two tab-separated columns.
        ValueError: if the 'Place' or 'Address' header value is missing.
    """
    # io.open translates newlines in text mode on both Python 2 and 3; the
    # original 'rU' mode was removed in Python 3.11.
    import io

    places_temp = []
    addresses_temp = []
    # The context manager closes the file; the original reopened it at the
    # end and never closed that second handle.
    with io.open(filename, 'r') as infile:
        for line in infile:
            item = line.rstrip("\n").split("\t")
            places_temp.append(item[0])
            addresses_temp.append(item[1])

    # Everything after the header cell is data.
    p_index = places_temp.index('Place') + 1
    a_index = addresses_temp.index('Address') + 1
    places = places_temp[p_index:]
    addresses = addresses_temp[a_index:]
    return True
# The decorator registers this view for GET "/"; written as a comment
# ("#app.route") it would never be attached to the app.
@app.route('/', methods=['GET'])
def home():
    """Serve the ``index.html`` template."""
    return flask.render_template('index.html')
# The decorator registers this view for POST "/process_file"; written as a
# comment ("#app.route") it would never be attached to the app.
@app.route('/process_file', methods=['POST'])
def process_file():
    """Validate the uploaded file and report the outcome as JSON.

    Returns a JSON object whose ``result`` is ``'True'`` on success or
    ``'False'`` together with a ``message`` when a check fails.
    """
    #here, you can run all the checks as before, but instead of flash, you can return jsonified results to read in the front-end
    if 'file' not in flask.request.files or not flask.request.files['file'].filename:
        return flask.jsonify({'result':'False', 'message':'no files selected'})
        # NOTE(review): unreachable - the line above already returned.
        return flask.redirect(url_for('home'))
    file = flask.request.files['file']
    filename = secure_filename(file.filename)
    if not allowed_file(file.filename):
        return flask.jsonify({'result':'False', 'message':'Must be TXT file!'})
        # NOTE(review): unreachable - the line above already returned.
        return flask.redirect(url_for('home'))
    # NOTE(review): places_exist() opens the path in UPLOAD_FOLDER before
    # file.save() below has written the upload there.
    if not places_exist(os.path.join(app.config['UPLOAD_FOLDER'], filename)):
        return flask.jsonify({'result':'False', 'message':'There is an affiliation missing from your Place list. Please re-try.'})
        # NOTE(review): unreachable - the line above already returned.
        return flask.redirect(url_for('home'))
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    flask.session['filename'] = filename
    return flask.jsonify({'result':'True'})
UPDATE:
def process_file():
    """Save the upload to TEMP_FOLDER, validate it, then save to UPLOAD_FOLDER.

    Returns a JSON object whose ``result`` is ``'True'`` on success or
    ``'False'`` together with a ``message`` when a check fails.
    """
    # Checks report their outcome as JSON so the front-end can read it.
    uploads = flask.request.files
    if 'file' not in uploads or not uploads['file'].filename:
        return flask.jsonify({'result':'False', 'message':'no files selected'})

    file = uploads['file']
    filename = secure_filename(file.filename)
    if not allowed_file(file.filename):
        return flask.jsonify({'result':'False', 'message':'Must be TXT file!'})

    # Write the upload to the temp folder first so it can be inspected.
    temp_path = os.path.join(app.config['TEMP_FOLDER'], filename)
    file.save(temp_path)
    if not places_exist(temp_path):
        return flask.jsonify({'result':'False', 'message':'There is an affiliation missing from your Place list. Please re-try.'})

    # NOTE(review): this is a second save() of the same upload object -
    # confirm the stream still holds the content at this point.
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    flask.session['filename'] = filename
    return flask.jsonify({'result':'True'})
You are trying to read the file before it has been written to your directory. First save the file in your application's upload directory, then read it.
def process_file():
    """Save the uploaded file, then validate the saved copy.

    Returns a JSON object whose ``result`` is ``'True'`` on success or
    ``'False'`` together with a ``message`` when a check fails.

    NOTE(review): a file that fails ``places_exist`` stays in UPLOAD_FOLDER.
    """
    # Checks report their outcome as JSON so the front-end can read it.
    if 'file' not in flask.request.files or not flask.request.files['file'].filename:
        return flask.jsonify({'result':'False', 'message':'no files selected'})

    file = flask.request.files['file']
    filename = secure_filename(file.filename)
    if not allowed_file(file.filename):
        # TSV uploads are allowed as well, so the message names both types.
        return flask.jsonify({'result':'False', 'message':'Must be a TXT or TSV file!'})

    # Save the file in the correct location before anything reads it.
    saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(saved_path)

    # Process the file that is now on disk.
    if not places_exist(saved_path):
        return flask.jsonify({'result':'False', 'message':'There is an affiliation missing from your Place list. Please re-try.'})

    flask.session['filename'] = filename
    return flask.jsonify({'result':'True'})
EDIT: Be careful if you need to check the file before saving: if you save right away, you will overwrite your old file. A good approach is to save the file in a temp location, check that file, then save it in the final directory and, of course, delete the file from the temp folder.
PS: You also have 2 return statements; if you need to respond with either HTML or JSON, you have to check the headers of the request.

Scrapinghub plugs my results in the log and not in item

I have a functioning spider project that extracts the content of URLs (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up to work on Scrapinghub in order to run a long scraping job.
So far, I am able to get the spider uploaded and working on Scrapinghub. My problem is that the result appears in the 'log' and not under the 'item'. The amount of data exceeds the log capacity and thus gives me an error.
How can I set up my pipelines/extractor to work and return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    """Spider that requests one URL per timestamp and stores each raw body."""

    name = "Data_2018"
    # Scrapy reads allowed_domains from the spider class and expects bare
    # domain names; as a local variable holding a URL it had no effect.
    allowed_domains = ["website.net"]

    def url_values(self):
        """Return the timestamps to request, newest first, in 60 s steps."""
        return list(range(1538140980, 1538140820, -60))

    def start_requests(self):
        """Yield one unfiltered request per timestamp from ``url_values``."""
        for n in self.url_values():
            url = "https://website.net/.../.../.../all/{}".format(n)
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Wrap the raw response body in a ``positionsItem`` and yield it."""
        items = positionsItem()
        items['file'] = response.body
        yield items
The pipeline
class positionsPipeline(object):
    """Pass-through item pipeline: items leave exactly as they arrive."""

    def process_item(self, item, spider):
        """Return ``item`` untouched; ``spider`` is not used."""
        return item
The settings
BOT_NAME = 'Positions'
SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'
USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10
SPIDER_MIDDLEWARES = {
'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
'Positions.pipelines.positionsPipeline': 300,
}
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
the item
class positionsItem(scrapy.Item):
    """Scraped item with a single field, ``file``."""

    # The spider stores the raw response body here.
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file, it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (in producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for Scrapy to work. If you just want to feed the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a json file to an S3 bucket
# Remote S3 .json version
AWS_ACCESS_KEY_ID = YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = YOUR_AWS_SECRET_ACCESS_KEY
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'

Return file parse status django ajax

I have a script which parses the files in a JSON directory and performs some actions on them.
I want to show the file status, including the number of the file currently being parsed.
Here is what I do:
def post(self, request, *args, **kwargs):
    """Generate, move and register a data file for every unlabeled JSON file.

    Walks the two-level tree returned by ``_.path_to_dict`` for
    ``settings.UNLABELEDJSONS``; entries whose ``type`` is not ``'file'``
    are skipped. Returns the serialized response with ``status`` set to True.
    """
    self.response = _.default_response()
    unlabeled = _.path_to_dict(settings.UNLABELEDJSONS)

    for dirs in unlabeled['children']:
        for jsonFile in dirs['children']:
            if jsonFile['type'] != 'file':
                continue
            json_name = jsonFile['name']

            # Source path of the file and the path it is generated into.
            filepath = "%s/%s/%s" % (settings.UNLABELEDJSONS,dirs['name'], json_name)
            targetpath = "%s/%s" % (settings.DATAFILESGENERATED, json_name)

            # Generate the file, then move it into the data-files folder.
            _.call_testing_script(settings.GENERATEDRDA, filepath, targetpath)
            shutil.move(targetpath, settings.DATAFILES)

            # Record the moved file under its new path.
            existpath = "%s/%s" % (settings.DATAFILES, json_name)
            _.processDataFile(existpath, json_name)

    # Every file has been handled.
    self.response['status'] = True
    return _.serialize_response(self.response)
Now I want to show the number of the file currently being parsed IN THE UI,
the total number of files IN THE UI,
and other information IN THE UI.
Q: What library or method should I use?