How to parse 2 json files in Apache beam - json

I have 2 json configuration files to read and want to assign there values to variables. I am creating a data flow job using apache beam but unable to parse those files and assign there values to a variable.
config1.json - { "bucket_name": "mybucket"}
config2.json - { "dataset_name": "mydataset"}
This is the pipeline statements ---- I tried with one JSON file first but even that is not working
with beam.Pipeline(options=pipeline_options) as pipeline:
steps = (pipeline
| "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
| "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser(custom_options.configfile))
| "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
)
result = pipeline.run()
result.wait_until_finish()
I also tried creating a function to parse atleast one file. This is a sample method I created but it did not work.
class custom_json_parser(beam.DoFn):
import apache_beam as beam
from apache_beam.io.gcp import gcsio
import logging
def __init__(self, configfile):
self.configfile = configfile
def process(self, configfile):
logging.info("JSON PARSING STARTED")
with beam.io.gcp.gcsio.GcsIO().open(self.configfile, 'r') as f:
for line in f:
data = json.loads(line)
bucket = data.get('bucket_name')
dataset = data.get('dataset_name') ```
Can someone please suggest the best method to resolve this issue in apache beam?
Thanks in Advance

If you need to read only once your files in the pipeline, don't read them in the pipeline, but before running it.
Read the files from GCS
Parse the file and put the useful content in the pipeline options map
Run your pipeline and use the data from the options
EDIT 1
You can use this piece of code to load the file and read it, before your pipeline. Simple Python, standard GCS libraries.
from google.cloud import storage
import json
client = storage.Client()
bucket = client.get_bucket('your-bucket')
blob = bucket.get_blob("name.json")
json_data = blob.download_as_string().decode('UTF-8')
print(json_data) # print -> {"name": "works!!"}
print(json.loads(json_data)["name"]) # print -> works!!

You can try following code snippet: -
Function to Parse File
class custom_json_parser(beam.DoFn):
def process(self, element):
logging.info(element)
data = json.loads(element)
bucket = data.get('bucket_name')
dataset = data.get('dataset_name')
return [{"bucket": bucket , "dataset": dataset }]
Over Pipeline you can call function
with beam.Pipeline(options=pipeline_options) as pipeline:
steps = (pipeline
| "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
| "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser())
| "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
)
result = pipeline.run()
result.wait_until_finish()
It will work.

Related

How to load models into Spark-nlp in Foundry code authoring

I'm trying to load a model from Spark-nlp model hub into a Universal Sentence Encoder Model as shown in the snippet below:
stages = []
documentAssembler = [
DocumentAssembler()
.setInputCol("description")
.setOutputCol("document")
]
stages += documentAssembler
// Where file_path is the location of extracted files of the model
use = [
UniversalSentenceEncoder.load(f"{file_path}")
.setInputCols(["document"])
.setOutputCol("sentence_embeddings")
]
stages += use
The model is downloadable from [Spark-nlp Model Hub - Universal Sentence Encoder] (https://nlp.johnsnowlabs.com/2020/04/17/tfhub_use.html/) The function I use to extract the downloaded model zip file is below:
def get_unzipped_path(dataset_with_model_files):
'''
Return a folder location for a temporary folder after reading file locations
encapsulated within a dataframe
Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
'''
input_filesystem = dataset_with_model_files.filesystem()
full_folder_path = input_filesystem.hadoop_path
files = [file_status.path for file_status in input_filesystem.ls('*.zip')]
file_name = files[0]
with dataset_with_model_files.filesystem().open(file_name, mode='rb') as f:
with tempfile.NamedTemporaryFile() as tmp:
shutil.copyfileobj(f, tmp)
tmp.flush()
# file_stem = Path(os.path.basename(file_name)).stem # Remove .zip
unpacked_path = os.path.join(full_folder_path, file_name)
with zipfile.ZipFile(tmp) as zip_ref:
zip_ref.extractall(unpacked_path)
return unpacked_path
The general problem is that although they exist in the temporary folder, they cannot be found, a sample of an error is provided below:
java.io.FileNotFoundException: File file:/data/ssd/01/palantir/foundry~node-manager/data/local-dir/usercache/palantir/appcache/application_1675144414001_39656/container_e103_1675144414001_39656_01_000002/tfhub_use_en_2.4.0_2.4_1587136330099/metadata/part-00000 does not exist
I have tried the solution of code I've provided, i.e. extracting to a temporary folder then feeding that path to the load function
using spark.sql(f"ADD ARCHIVE {zipped_file}") to add the archive which I believe is unpacked and available by SparkFiles.get(f"{file_name}") as shown below:
def get_archive_file_loc(dataset_with_model_files, spark, glob):
'''
Return a file name after submitting archive to the Spark Context
Reference: https://www.palantir.com/docs/foundry/transforms-python/unstructured-files/
'''
input_filesystem = dataset_with_model_files.filesystem()
full_folder_path = os.getcwd()
all_file_names = [
f"{f.path}" for f in input_filesystem.ls(glob=glob)
]
file_name = all_file_names[0]
with dataset_with_model_files.filesystem().open(file_name, mode='rb') as f:
zipped_file = os.path.join(full_folder_path, file_name)
with open(zipped_file, 'wb') as zf:
shutil.copyfileobj(f, zf)
# Check if file has been added to spark context as duplicate
# attempts to add can crash the executors
# if file_name not in os.listdir(SparkFiles.getRootDirectory()):
try:
spark.sql(f"ADD ARCHIVE {zipped_file}") # noqa
except Exception as e:
pass
return file_name
Has anyone been successful or has an idea of how to load the model in Spark-nlp?

Convert a JSON export file to BQ JSON Newline delimited JSON

I have a JSON export from a database and I'd like to upload it and create a new table on BQ. This file is 600MB and I tried to use the jq on mac terminal but I'm a noob and I couldn't do it...
Is there any way to a convert a random json file and get the result into this newline delimited JSON file? If yes, pls help me with this
You can load your JSON into cloud storage following this documentation.
Also with this code (python), you can load into BigQuery previously stored in a bucket.
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"
job_config = bigquery.LoadJobConfig(
schema=[
bigquery.SchemaField("name", "STRING"),
bigquery.SchemaField("post_abbr", "STRING"),
],
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
load_job = client.load_table_from_uri(
uri,
table_id,
location="US", # Must match the destination dataset location.
job_config=job_config,
) # Make an API request.
load_job.result() # Waits for the job to complete.
destination_table = client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

How can i extract information quickly from 130,000+ Json files located in S3?

i have an S3 was over 130k Json Files which i need to calculate numbers based on data in the json files (for example calculate the number of gender of Speakers). i am currently using s3 Paginator and JSON.load to read each file and extract information form. but it take a very long time to process such a large number of file (2-3 files per second). how can i speed up the process? please provide working code examples if possible. Thank you
here is some of my code:
client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
result = paginator.paginate(Bucket='bucket-name',StartAfter='')
for page in result:
if "Contents" in page:
for key in page[ "Contents" ]:
keyString = key[ "Key" ]
s3 = boto3.resource('s3')
content_object = s3.Bucket('bucket-name').Object(str(keyString))
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
x = (json_content['dict-name'])
In order to use the code below, I'm assuming you understand pandas (if not, you may want to get to know it). Also, it's not clear if your 2-3 seconds is on the read or includes part of the number crunching, nonetheless multiprocessing will speed this up dramatically. The gist is to read all the files in (as dataframes), concatenate them, then do your analysis.
To be useful for me, I run this on spot instances that have lots of vCPUs and memory. I've found the instances that are network optimized (like c5n - look for the n) and the inf1 (for machine learning) are much faster at reading/writing than T or M instance types, as examples.
My use case is reading 2000 'directories' with roughly 1200 files in each and analyzing them. The multithreading is orders of magnitude faster than single threading.
File 1: your main script
# create script.py file
import os
from multiprocessing import Pool
from itertools import repeat
import pandas as pd
import json
from utils_file_handling import *
ufh = file_utilities() #instantiate the class functions - see below (second file)
bucket = 'your-bucket'
prefix = 'your-prefix/here/' # if you don't have a prefix pass '' (empty string or function will fail)
#define multiprocessing function - get to know this to use multiple processors to read files simultaneously
def get_dflist_multiprocess(keys_list, num_proc=4):
with Pool(num_proc) as pool:
df_list = pool.starmap(ufh.reader_json, zip(repeat(bucket), keys_list), 15)
pool.close()
pool.join()
return df_list
#create your master keys list upfront; you can loop through all or slice the list to test
keys_list = ufh.get_keys_from_prefix(bucket, prefix)
# keys_list = keys_list[0:2000] # as an exampmle
num_proc = os.cpu_count() #tells you how many processors your machine has; function above defaults to 4 unelss given
df_list = get_dflist_multiprocess(keys_list, num_proc=num_proc) #collect dataframes for each file
df_new = pd.concat(df_list, sort=False)
df_new = df_new.reset_index(drop=True)
# do your analysis on the dataframe
File 2: class functions
#utils_file_handling.py
# create this in a separate file; name as you wish but change the import in the script.py file
import boto3
import json
import pandas as pd
#define client and resource
s3sr = boto3.resource('s3')
s3sc = boto3.client('s3')
class file_utilities:
"""file handling function"""
def get_keys_from_prefix(self, bucket, prefix):
'''gets list of keys and dates for given bucket and prefix'''
keys_list = []
paginator = s3sr.meta.client.get_paginator('list_objects_v2')
# use Delimiter to limit search to that level of hierarchy
for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
keys = [content['Key'] for content in page.get('Contents')]
print('keys in page: ', len(keys))
keys_list.extend(keys)
return keys_list
def read_json_file_from_s3(self, bucket, key):
"""read json file"""
bucket_obj = boto3.resource('s3').Bucket(bucket)
obj = boto3.client('s3').get_object(Bucket=bucket, Key=key)
data = obj['Body'].read().decode('utf-8')
return data
# you may need to tweak this for your ['dict-name'] example; I think I have it correct
def reader_json(self, bucket, key):
'''returns dataframe'''
return pd.DataFrame(json.loads(self.read_json_file_from_s3(bucket, key))['dict-name'])

How to read a csv file from S3 bucket using AWS lambda and write it as new CSV to another S3 bucket? Python boto3

Ok so I am a beginner to AWS in general. I am writing a lambda function to trigger based on file upload event in S3, remove some coulmns and write it to a new bucket. Been banging my head for the past two datas and I am getting different error each time. Can someone modify my code/fix it? outputlv will be my target bucket.. Currently I am getting '/outputlv/output.csv' path does not exist in the with open('/outputlv/output.csv', 'w') as output_file line. Thanks.
import json
import urllib.parse
import boto3
import csv
s3 = boto3.client('s3')
def lambda_handler(event, context):
bucket = event['Records'][0]['s3']['bucket']['name']
key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
file_name = s3.get_object(Bucket=bucket, Key=key)
csv_reader = csv.reader(file_name)
with open('/outputlv/output.csv', 'w') as output_file:
wtr = csv.writer(output_file)
for i in csv_reader:
wtr.writerow(i[0], i[2], i[3])
target_bucket = 'outputlv'
final_file = 'outputlv/output.csv'
s3.put_object(Bucket=target_bucket, Key=final_file)
Why don't you get the content, is it required to work with local files at all ?
response = s3.get_object(Bucket=bucket, Key=key)
# Get file content
content = response['Body'].read()
# Pass file content to csv reader
csv_reader = csv.reader(content)

Import Kaggle csv from download url to pandas DataFrame

I've been trying different methods to import the SpaceX missions csv file on Kaggle directly into a pandas DataFrame, without any success.
I'd need to send requests to login. This is what I have so far:
import requests
import pandas as pd
from io import StringIO
# Link to the Kaggle data set & name of zip file
login_url = 'http://www.kaggle.com/account/login?ReturnUrl=/spacex/spacex-missions/downloads/database.csv'
# Kaggle Username and Password
kaggle_info = {'UserName': "user", 'Password': "pwd"}
# Login to Kaggle and retrieve the data.
r = requests.post(login_url, data=kaggle_info, stream=True)
df = pd.read_csv(StringIO(r.text))
r is returning the html content of the page.
df = pd.read_csv(url) gives a CParser error:
CParserError: Error tokenizing data. C error: Expected 1 fields in line 13, saw 6
I've searched for a solution, but so far nothing I've tried worked.
You are creating a stream and passing it directly to pandas. I think you need to pass a file like object to pandas. Take a look at this answer for a possible solution (using post and not get in the request though).
Also i think the login url with redirect that you use is not working as it is. I know i suggested that here. But i ended up not using is because the post request call did not handle the redirect (i suspect).
The code i ended up using in my project was this:
def from_kaggle(data_sets, competition):
"""Fetches data from Kaggle
Parameters
----------
data_sets : (array)
list of dataset filenames on kaggle. (e.g. train.csv.zip)
competition : (string)
name of kaggle competition as it appears in url
(e.g. 'rossmann-store-sales')
"""
kaggle_dataset_url = "https://www.kaggle.com/c/{}/download/".format(competition)
KAGGLE_INFO = {'UserName': config.kaggle_username,
'Password': config.kaggle_password}
for data_set in data_sets:
data_url = path.join(kaggle_dataset_url, data_set)
data_output = path.join(config.raw_data_dir, data_set)
# Attempts to download the CSV file. Gets rejected because we are not logged in.
r = requests.get(data_url)
# Login to Kaggle and retrieve the data.
r = requests.post(r.url, data=KAGGLE_INFO, stream=True)
# Writes the data to a local file one chunk at a time.
with open(data_output, 'wb') as f:
# Reads 512KB at a time into memory
for chunk in r.iter_content(chunk_size=(512 * 1024)):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
Example use:
sets = ['train.csv.zip',
'test.csv.zip',
'store.csv.zip',
'sample_submission.csv.zip',]
from_kaggle(sets, 'rossmann-store-sales')
You might need to unzip the files.
def _unzip_folder(destination):
"""Unzip without regards to the folder structure.
Parameters
----------
destination : (str)
Local path and filename where file is should be stored.
"""
with zipfile.ZipFile(destination, "r") as z:
z.extractall(config.raw_data_dir)
So i never really directly loaded it into the DataFrame, but rather stored it to disk first. But you could modify it to use a temp directory and just delete the files after you read them.