Return file parse status django ajax - json

I have a script that parses and performs some actions on a directory of JSON files. I want to show the file status, including the number of the file currently being parsed.
Here is what I do:
def post(self, request, *args, **kwargs):
    self.response = _.default_response()
    unlabeled = _.path_to_dict(settings.UNLABELEDJSONS)
    for dirs in unlabeled['children']:
        for jsonFile in dirs['children']:
            if jsonFile['type'] == 'file':
                # Current file path and generated file path.
                filepath = "%s/%s/%s" % (settings.UNLABELEDJSONS, dirs['name'], jsonFile['name'])
                targetpath = "%s/%s" % (settings.DATAFILESGENERATED, jsonFile['name'])
                # Generate the file.
                _.call_testing_script(settings.GENERATEDRDA, filepath, targetpath)
                # File generated, now move it.
                shutil.move(targetpath, settings.DATAFILES)
                # The file has been moved; its new path is existpath.
                existpath = "%s/%s" % (settings.DATAFILES, jsonFile['name'])
                # Let the database know what we generated.
                _.processDataFile(existpath, jsonFile['name'])
    # We are done.
    self.response['status'] = True
    return _.serialize_response(self.response)
Now I want to show, in the UI: the number of the file currently being parsed, the total number of files, and other progress information.
Q: What library or method should I use?
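One common pattern (a minimal sketch of one option, not the only one; the names report_progress and ParseStatusView are mine): since the POST handler blocks until every file is processed, write the current progress into Django's cache from inside the loop and expose a small status view that the front end polls with AJAX (for example jQuery's $.getJSON on a timer).

from django.core.cache import cache
from django.http import JsonResponse
from django.views import View

PROGRESS_KEY = "json_parse_progress"

def report_progress(current, total):
    # Call this inside the parsing loop after each file is handled.
    cache.set(PROGRESS_KEY, {"current": current, "total": total}, timeout=3600)

class ParseStatusView(View):
    def get(self, request, *args, **kwargs):
        # The UI polls this endpoint while the POST above is still running.
        return JsonResponse(cache.get(PROGRESS_KEY, {"current": 0, "total": 0}))

Note that this assumes a cache backend shared across worker processes (e.g. Memcached or Redis); for long-running jobs, moving the loop into a task queue such as Celery is the usual next step.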


How to load models into Spark-nlp in Foundry code authoring

I'm trying to load a model from the Spark NLP Model Hub into a UniversalSentenceEncoder, as shown in the snippet below:
stages = []

documentAssembler = [
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
]
stages += documentAssembler

# where file_path is the location of the extracted files of the model
use = [
    UniversalSentenceEncoder.load(f"{file_path}")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
]
stages += use
The model is downloadable from [Spark-nlp Model Hub - Universal Sentence Encoder](https://nlp.johnsnowlabs.com/2020/04/17/tfhub_use.html/). The function I use to extract the downloaded model zip file is below:
def get_unzipped_path(dataset_with_model_files):
    '''
    Return a folder location for a temporary folder after reading file locations
    encapsulated within a dataframe
    Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    '''
    input_filesystem = dataset_with_model_files.filesystem()
    full_folder_path = input_filesystem.hadoop_path
    files = [file_status.path for file_status in input_filesystem.ls('*.zip')]
    file_name = files[0]
    with dataset_with_model_files.filesystem().open(file_name, mode='rb') as f:
        with tempfile.NamedTemporaryFile() as tmp:
            shutil.copyfileobj(f, tmp)
            tmp.flush()
            # file_stem = Path(os.path.basename(file_name)).stem  # Remove .zip
            unpacked_path = os.path.join(full_folder_path, file_name)
            with zipfile.ZipFile(tmp) as zip_ref:
                zip_ref.extractall(unpacked_path)
    return unpacked_path
The general problem is that although the files exist in the temporary folder, they cannot be found at load time; a sample of the error is provided below:
java.io.FileNotFoundException: File file:/data/ssd/01/palantir/foundry~node-manager/data/local-dir/usercache/palantir/appcache/application_1675144414001_39656/container_e103_1675144414001_39656_01_000002/tfhub_use_en_2.4.0_2.4_1587136330099/metadata/part-00000 does not exist
I have tried:
- the solution in the code I've provided, i.e. extracting to a temporary folder and then feeding that path to the load function;
- using spark.sql(f"ADD ARCHIVE {zipped_file}") to add the archive, which I believe is then unpacked and made available via SparkFiles.get(f"{file_name}"), as shown below:
def get_archive_file_loc(dataset_with_model_files, spark, glob):
    '''
    Return a file name after submitting archive to the Spark Context
    Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
    '''
    input_filesystem = dataset_with_model_files.filesystem()
    full_folder_path = os.getcwd()
    all_file_names = [
        f"{f.path}" for f in input_filesystem.ls(glob=glob)
    ]
    file_name = all_file_names[0]
    with dataset_with_model_files.filesystem().open(file_name, mode='rb') as f:
        zipped_file = os.path.join(full_folder_path, file_name)
        with open(zipped_file, 'wb') as zf:
            shutil.copyfileobj(f, zf)
    # Check if the file has been added to the Spark context, as duplicate
    # attempts to add it can crash the executors
    # if file_name not in os.listdir(SparkFiles.getRootDirectory()):
    try:
        spark.sql(f"ADD ARCHIVE {zipped_file}")  # noqa
    except Exception as e:
        pass
    return file_name
Has anyone been successful, or does anyone have an idea of how to load the model in Spark NLP?
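Not an answer, but one quick check that might help narrow this down (my addition, assuming the unpacked_path returned by get_unzipped_path is a plain local path on the driver, which the os.path.join above suggests): verify that the folder handed to UniversalSentenceEncoder.load() actually contains the metadata directory the error complains about.

import os

# Debugging sketch: confirm what was really extracted before calling .load().
print("Loading from:", unpacked_path)
print("Top-level contents:", os.listdir(unpacked_path))
print("Has metadata/part-00000:",
      os.path.exists(os.path.join(unpacked_path, "metadata", "part-00000")))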

How to parse 2 json files in Apache beam

I have 2 JSON configuration files to read, and I want to assign their values to variables. I am creating a Dataflow job using Apache Beam, but I am unable to parse those files and assign their values to variables.
config1.json - { "bucket_name": "mybucket"}
config2.json - { "dataset_name": "mydataset"}
These are the pipeline statements. I tried with one JSON file first, but even that is not working:
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (pipeline
             | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
             | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser(custom_options.configfile))
             | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
             )
    result = pipeline.run()
    result.wait_until_finish()
I also tried creating a function to parse at least one file. This is a sample method I created, but it did not work:
class custom_json_parser(beam.DoFn):
    import apache_beam as beam
    from apache_beam.io.gcp import gcsio
    import logging

    def __init__(self, configfile):
        self.configfile = configfile

    def process(self, configfile):
        logging.info("JSON PARSING STARTED")
        with beam.io.gcp.gcsio.GcsIO().open(self.configfile, 'r') as f:
            for line in f:
                data = json.loads(line)
                bucket = data.get('bucket_name')
                dataset = data.get('dataset_name')
Can someone please suggest the best method to resolve this issue in Apache Beam? Thanks in advance.
If you only need to read your files once, don't read them in the pipeline; read them before running it:
1. Read the files from GCS.
2. Parse the files and put the useful content into the pipeline options map.
3. Run your pipeline and use the data from the options.
EDIT 1
You can use this piece of code to load and read the file before your pipeline runs. It is plain Python using the standard GCS client library.
from google.cloud import storage
import json
client = storage.Client()
bucket = client.get_bucket('your-bucket')
blob = bucket.get_blob("name.json")
json_data = blob.download_as_string().decode('UTF-8')
print(json_data) # print -> {"name": "works!!"}
print(json.loads(json_data)["name"]) # print -> works!!
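A minimal sketch of how this could plug into the original pipeline (my assumption, not part of the answer above; the config bucket name and the input/output paths are only illustrative): read both configs up front, then use the parsed values directly when building the transforms.

from google.cloud import storage
import json
import apache_beam as beam

client = storage.Client()
config_bucket = client.get_bucket('my-config-bucket')  # hypothetical bucket holding the config files
config1 = json.loads(config_bucket.get_blob('config1.json').download_as_string().decode('UTF-8'))
config2 = json.loads(config_bucket.get_blob('config2.json').download_as_string().decode('UTF-8'))

bucket_name = config1['bucket_name']    # "mybucket"
dataset_name = config2['dataset_name']  # "mydataset", e.g. for a later BigQuery write

with beam.Pipeline(options=pipeline_options) as pipeline:  # pipeline_options as in the question
    (pipeline
     | "Read" >> beam.io.ReadFromText(f"gs://{bucket_name}/input.txt")
     | "Write" >> beam.io.WriteToText(f"gs://{bucket_name}/outputfile.txt"))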
You can try the following code snippet.
Function to parse the file:
class custom_json_parser(beam.DoFn):
    def process(self, element):
        logging.info(element)
        data = json.loads(element)
        bucket = data.get('bucket_name')
        dataset = data.get('dataset_name')
        return [{"bucket": bucket, "dataset": dataset}]
In the pipeline, you can call the function like this:
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (pipeline
             | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
             | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser())
             | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
             )
    result = pipeline.run()
    result.wait_until_finish()
It will work.

Flask form does not let me upload TSV files

I have the following Flask app, where I want to be able to upload a TXT or TSV file via a form. The problem is that when I try to upload a TXT file it works, but when I try to upload a TSV file, I get the following error:
File "/Users/cdastmalchi/Desktop/author_script/main.py", line 89, in process_file
if not places_exist(os.path.join(app.config['UPLOAD_FOLDER'], filename)):
File "/Users/cdastmalchi/Desktop/author_script/main.py", line 27, in places_exist
infile = open(filename, 'rU')
IOError: [Errno 2] No such file or directory: './Authors_Template.tsv'
Authors_Template.tsv is a template file that gets downloaded from the form into the Downloads folder; I want users to be able to edit this template and then re-upload it. When I make the template Authors_Template.txt instead, then download and re-upload it, it works. How can I solve this problem? I've even tried narrowing the ALLOWED_EXTENSIONS list down to just TSV, and I still get the same issue.
app.py
from werkzeug.utils import secure_filename
import flask, string, random
import json
import subprocess
import os
import re
import time

UPLOAD_FOLDER = '.'
ALLOWED_EXTENSIONS = set(['txt', 'tsv'])

app = flask.Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.secret_key = ''.join(random.choice(string.ascii_letters) for _ in range(20))  # needed to use flask.session

def allowed_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS

def places_exist(filename):
    infile = open(filename, 'rU')
    placeDict = {}
    addresses_temp = []
    addresses = []
    places_temp = []
    places = []
    places_exist = True
    for i in infile:
        item = i.rstrip("\n").split("\t")
        places_temp.append(item[0])
        addresses_temp.append(item[1])
    p_index = (places_temp.index('Place')) + 1
    a_index = (addresses_temp.index('Address')) + 1
    places = places_temp[p_index:]
    addresses = addresses_temp[a_index:]
    infile.close()
    infile = open(filename, 'rU')
    return places_exist

@app.route('/', methods=['GET'])
def home():
    return flask.render_template('index.html')

@app.route('/process_file', methods=['POST'])
def process_file():
    # here, you can run all the checks as before, but instead of flash, you can return jsonified results to read in the front-end
    if 'file' not in flask.request.files or not flask.request.files['file'].filename:
        return flask.jsonify({'result': 'False', 'message': 'no files selected'})
        return flask.redirect(url_for('home'))
    file = flask.request.files['file']
    filename = secure_filename(file.filename)
    if not allowed_file(file.filename):
        return flask.jsonify({'result': 'False', 'message': 'Must be TXT file!'})
        return flask.redirect(url_for('home'))
    if not places_exist(os.path.join(app.config['UPLOAD_FOLDER'], filename)):
        return flask.jsonify({'result': 'False', 'message': 'There is an affiliation missing from your Place list. Please re-try.'})
        return flask.redirect(url_for('home'))
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    flask.session['filename'] = filename
    return flask.jsonify({'result': 'True'})
UPDATE:
def process_file():
    # here, you can run all the checks as before, but instead of flash, you can return jsonified results to read in the front-end
    if 'file' not in flask.request.files or not flask.request.files['file'].filename:
        return flask.jsonify({'result': 'False', 'message': 'no files selected'})
        return flask.redirect(url_for('home'))
    file = flask.request.files['file']
    filename = secure_filename(file.filename)
    if not allowed_file(file.filename):
        return flask.jsonify({'result': 'False', 'message': 'Must be TXT file!'})
        return flask.redirect(url_for('home'))
    # Save the file in the temp folder
    file.save(os.path.join(app.config['TEMP_FOLDER'], filename))
    # Process the file
    if not places_exist(os.path.join(app.config['TEMP_FOLDER'], filename)):
        return flask.jsonify({'result': 'False', 'message': 'There is an affiliation missing from your Place list. Please re-try.'})
        return flask.redirect(url_for('home'))
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    flask.session['filename'] = filename
    return flask.jsonify({'result': 'True'})
You are trying to read the file before it has been written to your directory. First you need to save the file to your application's upload directory, then read it.
def process_file():
    # here, you can run all the checks as before, but instead of flash, you can return jsonified results to read in the front-end
    if 'file' not in flask.request.files or not flask.request.files['file'].filename:
        return flask.jsonify({'result': 'False', 'message': 'no files selected'})
        return flask.redirect(url_for('home'))
    file = flask.request.files['file']
    filename = secure_filename(file.filename)
    if not allowed_file(file.filename):
        return flask.jsonify({'result': 'False', 'message': 'Must be TXT file!'})
        return flask.redirect(url_for('home'))
    # Save the file in the correct location
    file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    # Process the file already saved
    if not places_exist(os.path.join(app.config['UPLOAD_FOLDER'], filename)):
        return flask.jsonify({'result': 'False', 'message': 'There is an affiliation missing from your Place list. Please re-try.'})
        return flask.redirect(url_for('home'))
    flask.session['filename'] = filename
    return flask.jsonify({'result': 'True'})
EDIT: Be careful if you need to check the file before saving: if you save it right away you will overwrite your old file. A good approach is to save the file to a temporary location, check that file, then save it to the final directory, and of course delete the file in the temporary folder afterwards (a sketch of this flow is shown after the PS below).
PS: You also have two return statements in a row, so the second one is unreachable. If you need to respond with either HTML or JSON, you have to check the headers of the request.
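A minimal sketch of that temp-folder flow (my interpretation, not the answerer's exact code; the earlier checks are trimmed down to the relevant part):

import os
import shutil
import tempfile

@app.route('/process_file', methods=['POST'])
def process_file():
    file = flask.request.files['file']
    filename = secure_filename(file.filename)
    # Save to a throwaway directory first, so a failed check never overwrites a previous upload.
    tmp_dir = tempfile.mkdtemp()
    tmp_path = os.path.join(tmp_dir, filename)
    file.save(tmp_path)
    try:
        if not places_exist(tmp_path):
            return flask.jsonify({'result': 'False', 'message': 'There is an affiliation missing from your Place list. Please re-try.'})
        # Check passed: move the file into the real upload folder.
        shutil.move(tmp_path, os.path.join(app.config['UPLOAD_FOLDER'], filename))
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
    flask.session['filename'] = filename
    return flask.jsonify({'result': 'True'})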

Scrapinghub puts my results in the log and not in items

I have a functioning spider project to extract URL content (no CSS). I crawled several sets of data and stored them in a series of .csv files. Now I am trying to set it up to work on Scrapinghub in order to do a long scraping run.
So far, I am able to get the spider uploaded and working on Scrapinghub. My problem is that the results appear in the 'log' and not under 'items'. The amount of data exceeds the log capacity and thus gives me an error.
How can I set up my pipelines/extractor to work and return a JSON or CSV file? I would also be happy with a solution that sends the scraped data to a database, as I failed to achieve that too.
Any guidance is appreciated.
The spider:
class DataSpider(scrapy.Spider):
    name = "Data_2018"

    def url_values(self):
        time = list(range(1538140980, 1538140820, -60))
        return time

    def start_requests(self):
        allowed_domains = ["https://website.net"]
        list_urls = []
        for n in self.url_values():
            list_urls.append("https://website.net/.../.../.../all/{}".format(n))
        for url in list_urls:
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        data = response.body
        items = positionsItem()
        items['file'] = data
        yield items
The pipeline:
class positionsPipeline(object):
    def process_item(self, item, spider):
        return item
The settings:
BOT_NAME = 'Positions'

SPIDER_MODULES = ['Positions.spiders']
NEWSPIDER_MODULE = 'Positions.spiders'

USER_AGENT = get_random_agent()
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 32
DOWNLOAD_DELAY = 10

SPIDER_MIDDLEWARES = {
    'Positions.middlewares.positionsSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'Positions.middlewares.positionsDownloaderMiddleware': 543,
}
ITEM_PIPELINES = {
    'Positions.pipelines.positionsPipeline': 300,
}

HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
The item:
class positionsItem(scrapy.Item):
    file = scrapy.Field()
Scrapinghub log shows:
13: 2019-02-28 07:46:13 ERROR Rejected message because it was too big: ITM {"_type":"AircraftpositionsItem","file":"{\"success\":true,\"payload\":{\"aircraft\":{\"0\":{\"000001\":[null,null,\"CFFAW\",9.95729,-84.1405,9500,90,136,1538140969,null,null,\"2000\",\"2-39710687\",[9.93233,-84.1386,277]],\"000023\":[\"ULAC\",null,\"PH4P4\",
From your settings file it looks like there isn't a predefined feed output mechanism for Scrapy to use. It's odd that it worked the first time locally (in producing a .csv file).
In any case, here are the extra lines you need to add to settings.py for the feed export to work. If you just want to write the output locally to a .csv file:
# Local .csv version
FEED_URI = 'file://NAME_OF_FILE_PATH.csv'
FEED_FORMAT = 'csv'
I also use this version for uploading a JSON file to an S3 bucket:
# Remote S3 .json version
AWS_ACCESS_KEY_ID = YOUR_AWS_ACCESS_KEY_ID
AWS_SECRET_ACCESS_KEY = YOUR_AWS_SECRET_ACCESS_KEY
FEED_URI = 's3://BUCKET_NAME/NAME_OF_FILE_PATH.json'
FEED_FORMAT = 'json'
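As a side note (my addition, assuming a reasonably recent Scrapy, 2.1 or later): FEED_URI and FEED_FORMAT have since been folded into the single FEEDS setting, so the local CSV export above can equivalently be written as:

# settings.py -- equivalent of the FEED_URI / FEED_FORMAT pair on Scrapy >= 2.1
FEEDS = {
    'file://NAME_OF_FILE_PATH.csv': {'format': 'csv'},
}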

Local File from repository not saved after edit (groovy / jenkins)

In my Jenkins job I checked out a repository. In that repo there is a file that I want to edit during the job, but it seems that the file is not saved. I have a method like this:
def updateFile(id, key) {
    def inputFile = readFile("${workspace}/config/cnf.json")
    def inputJSON = new JsonSlurper().parseText(inputFile)
    inputJSON."${key}"[0].pref = "${id}"
    def result = JsonOutput.toJson(inputJSON)
    // here it is changed.
    println "result:\n${result}"
    inputFile << "${JsonOutput.prettyPrint(result)}"
    // and now it is again the old one.
    println "Hier: \n ${inputFile}"
}
The problem is that I can't use "new File" with ".write" or ".append", because Jenkins can't find the file that way:
def inputFile = new File("${workspace}/config/cnf.json") --> no file found
Is there any good way to save the existing file?
If readFile("${workspace}/config/cnf.json") works fine, then to write the file use the writeFile step like this:
writeFile file: "${workspace}/config/cnf.json", text: result