Download from Glacier - boto

I am able to upload a file to a Glacier vault, but how do I download a file?
import boto
ACCESS_KEY_ID = "abc"
SECRET_ACCESS_KEY = "pqr"
glacier_connection = boto.connect_glacier(aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=SECRET_ACCESS_KEY)
vault = glacier_connection.create_vault("backup")
archive_id = vault.upload_archive("aws.sh")
I expected the following code to download the file from Glacier, but it does not work:
retrieve_job = vault.retrieve_archive(archive_id)
if retrieve_job.completed:
    retrieve_job.download_to_file("test.pl")
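Glacier archive retrieval is asynchronous: retrieve_archive() only initiates a retrieval job, and the job typically takes several hours to complete, so completed is False right after the call. Below is a minimal sketch of polling the job and then downloading, assuming boto 2's Glacier Layer2 API and the vault and archive_id from the question; the polling interval and output filename are illustrative.

import time

# retrieve_archive() initiates an archive-retrieval job; keep the job id
# so the job can be looked up again later (even from another process).
retrieve_job = vault.retrieve_archive(archive_id)
job_id = retrieve_job.id

# Poll until Glacier has prepared the archive (typically several hours);
# an SNS notification is a better fit than a loop for real workloads.
while True:
    job = vault.get_job(job_id)
    if job.completed:
        job.download_to_file("aws.sh.restored")  # illustrative output name
        break
    time.sleep(900)  # illustrative 15-minute polling interval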


How to parse 2 json files in Apache beam

I have 2 JSON configuration files to read and want to assign their values to variables. I am creating a Dataflow job using Apache Beam but am unable to parse those files and assign their values to variables.
config1.json - { "bucket_name": "mybucket"}
config2.json - { "dataset_name": "mydataset"}
These are the pipeline statements. I tried with one JSON file first, but even that is not working:
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (pipeline
             | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
             | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser(custom_options.configfile))
             | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
             )
    result = pipeline.run()
    result.wait_until_finish()
I also tried creating a function to parse at least one file. This is a sample method I created, but it did not work:
import apache_beam as beam
from apache_beam.io.gcp import gcsio
import json
import logging

class custom_json_parser(beam.DoFn):
    def __init__(self, configfile):
        self.configfile = configfile

    def process(self, configfile):
        logging.info("JSON PARSING STARTED")
        with beam.io.gcp.gcsio.GcsIO().open(self.configfile, 'r') as f:
            for line in f:
                data = json.loads(line)
                bucket = data.get('bucket_name')
                dataset = data.get('dataset_name')
Can someone please suggest the best way to resolve this issue in Apache Beam?
Thanks in advance.
If you only need to read your files once, don't read them in the pipeline; read them before running it:
Read the files from GCS
Parse the files and put the useful content in the pipeline options map
Run your pipeline and use the data from the options
EDIT 1
You can use this piece of code to load and read the file before your pipeline runs. It is plain Python with the standard GCS client library:
from google.cloud import storage
import json
client = storage.Client()
bucket = client.get_bucket('your-bucket')
blob = bucket.get_blob("name.json")
json_data = blob.download_as_string().decode('UTF-8')
print(json_data) # print -> {"name": "works!!"}
print(json.loads(json_data)["name"]) # print -> works!!
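To cover the last step (using the parsed values when building the pipeline), here is a minimal sketch with a hypothetical custom options class; the bucket names and option names are illustrative, not from the original question.

import json
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import storage

# Hypothetical options class holding the values read from the config files.
class ConfigOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--bucket_name', default='')
        parser.add_argument('--dataset_name', default='')

# Read and parse both config files before building the pipeline.
client = storage.Client()
config_bucket = client.get_bucket('config-bucket')  # illustrative bucket
config1 = json.loads(config_bucket.get_blob('config1.json').download_as_string())
config2 = json.loads(config_bucket.get_blob('config2.json').download_as_string())

options = PipelineOptions().view_as(ConfigOptions)
options.bucket_name = config1['bucket_name']
options.dataset_name = config2['dataset_name']

# The values are now ordinary Python strings, usable while building the graph.
with beam.Pipeline(options=options) as pipeline:
    (pipeline
     | 'Create' >> beam.Create([options.dataset_name])
     | 'Write' >> beam.io.WriteToText('gs://%s/outputfile.txt' % options.bucket_name))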
You can try the following code snippet.
Function to parse the file:
import json
import logging

class custom_json_parser(beam.DoFn):
    def process(self, element):
        logging.info(element)
        data = json.loads(element)
        bucket = data.get('bucket_name')
        dataset = data.get('dataset_name')
        return [{"bucket": bucket, "dataset": dataset}]
In the pipeline you can call the function:
with beam.Pipeline(options=pipeline_options) as pipeline:
    steps = (pipeline
             | "Getdata" >> beam.io.ReadFromText(custom_options.configfile)
             | "CUSTOM JSON PARSE" >> beam.ParDo(custom_json_parser())
             | "write to GCS" >> beam.io.WriteToText('gs://mynewbucket/outputfile.txt')
             )
    result = pipeline.run()
    result.wait_until_finish()
It will work.

How to read a CSV file from an S3 bucket using AWS Lambda and write it as a new CSV to another S3 bucket? Python boto3

OK, so I am a beginner to AWS in general. I am writing a Lambda function that triggers on a file upload event in S3, removes some columns, and writes the result to a new bucket. I have been banging my head for the past two days and I get a different error each time. Can someone modify/fix my code? outputlv will be my target bucket. Currently I am getting a "'/outputlv/output.csv' path does not exist" error on the with open('/outputlv/output.csv', 'w') as output_file line. Thanks.
import json
import urllib.parse
import boto3
import csv

s3 = boto3.client('s3')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    file_name = s3.get_object(Bucket=bucket, Key=key)
    csv_reader = csv.reader(file_name)
    with open('/outputlv/output.csv', 'w') as output_file:
        wtr = csv.writer(output_file)
        for i in csv_reader:
            wtr.writerow(i[0], i[2], i[3])
    target_bucket = 'outputlv'
    final_file = 'outputlv/output.csv'
    s3.put_object(Bucket=target_bucket, Key=final_file)
Why don't you just get the content? Is it required to work with local files at all?
response = s3.get_object(Bucket=bucket, Key=key)
# Get the file content (decoded from bytes to text)
content = response['Body'].read().decode('utf-8')
# Pass the file content to the csv reader, line by line
csv_reader = csv.reader(content.splitlines())
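Building on that idea, here is a minimal sketch of the whole handler done in memory, so no local file is needed; the kept column indices and the outputlv target bucket come from the question, while the output key name is an assumption.

import csv
import io
import urllib.parse
import boto3

s3 = boto3.client('s3')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'],
                                    encoding='utf-8')

    # Read the uploaded CSV straight from S3 into memory.
    response = s3.get_object(Bucket=bucket, Key=key)
    rows = csv.reader(response['Body'].read().decode('utf-8').splitlines())

    # Keep only the wanted columns; note that writerow() takes a single list.
    out_buffer = io.StringIO()
    writer = csv.writer(out_buffer)
    for row in rows:
        writer.writerow([row[0], row[2], row[3]])

    # Write the result to the target bucket without touching the local disk.
    s3.put_object(Bucket='outputlv', Key='output.csv',
                  Body=out_buffer.getvalue().encode('utf-8'))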

How to use a Detectron2 .pth model for prediction from storage?

I have trained a Detectron2 model on the free Google Colab server.
from detectron2.engine import DefaultTrainer
from detectron2.config import get_cfg
import os

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.DATASETS.TRAIN = ("Dataset_train",)
cfg.DATASETS.TEST = ()
cfg.DATALOADER.NUM_WORKERS = 2
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 2800
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()
The above code creates an "output" folder containing 4 files:
model_final.pth, metrics.json, last_checkpoint, and an events.out file.
I can use this model for prediction with:
cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, "model_final.pth")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5
cfg.DATASETS.TEST = ("Datset_test")
predictor = DefaultPredictor(cfg)
Everything works fine as long as the Colab session has not expired.
Problem:
When I mount this output folder somewhere else, the model is not loaded.
How do I use the saved .pth model for predictions?
How do I reuse a trained Detectron2 model for prediction?
Download your .pth file. When you open a new Colab runtime, upload that file to the runtime (you can use "!cp" to copy the .pth file quickly from Google Drive to your runtime) and then follow the steps in the official tutorial: https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5
You need to load your config file with cfg.merge_from_file("YOUR CONFIG FILE") and point to your .pth file with cfg.MODEL.WEIGHTS = "YOUR .PTH FILE, PROBABLY /CONTENT/MODEL_FINAL.PTH".
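Putting those steps together, here is a minimal sketch of rebuilding the predictor in a fresh runtime; the weights path, test image path, and class count are assumptions based on the question.

import cv2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

# Rebuild the same config that was used for training.
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2          # must match the training setup
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

# Point at the uploaded checkpoint (illustrative path).
cfg.MODEL.WEIGHTS = "/content/model_final.pth"

predictor = DefaultPredictor(cfg)

# Run inference on one image (illustrative path); the outputs hold the
# predicted instances: boxes, classes, masks, and scores.
image = cv2.imread("/content/test.jpg")
outputs = predictor(image)
print(outputs["instances"].pred_classes)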

Upload a zip file to a Google Team Drive through the Python API

I am trying to upload a bunch of zip files to a Team Drive through the Python API. The weird thing is that the program works if the file has the extension ".csv.gz" but fails if the extension is ".zip". Here is the code:
from __future__ import print_function
import uuid
from apiclient import discovery
from httplib2 import Http
from oauth2client import file, client, tools
from apiclient import errors
from apiclient.http import MediaFileUpload

args = tools.argparser.parse_args()
args.noauth_local_webserver = True
SCOPES = 'https://www.googleapis.com/auth/drive'
store = file.Storage('storage.json')
creds = store.get()
if not creds or creds.invalid:
    flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
    creds = tools.run_flow(flow, store, args)
DRIVE = discovery.build('drive', 'v3', http=creds.authorize(Http()))

def upload_file_to_td_folder(folder_id, fn, mimeType):
    body = {'name': fn, 'mimeType': mimeType, 'parents': [folder_id]}
    return DRIVE.files().create(body=body, media_body=fn,
                                supportsTeamDrives=True, fields='id').execute().get('id')

FILE_MIME = 'application/vnd.google-apps.file'
folder_id = "abcdefg"
# file = "data_2018-09-07.csv.gz"  # this one works!
file = "data_2018-09-07.zip"
upload_file_to_td_folder(folder_id, file, FILE_MIME)
Here is the error message:
Traceback (most recent call last):
File "test_td.py", line 103, in <module>
upload_file_to_td_folder(folder_id, file, FILE_MIME)
File "test_td.py", line 46, in upload_file_to_td_folder
supportsTeamDrives=True, fields='id').execute().get('id')
File "/home/bo/anaconda3/lib/python3.6/site-packages/oauth2client/_helpers.py", line 133, in positional_wrapper
return wrapped(*args, **kwargs)
File "/home/bo/anaconda3/lib/python3.6/site-packages/googleapiclient/http.py", line 840, in execute
raise HttpError(resp, content, uri=self.uri) googleapiclient.errors.HttpError:
<HttpError 400 when requesting https://www.googleapis.com/upload/drive/v3/files?supportsTeamDrives=true&fields=id&alt=json&uploadType=multipart
returned "Bad Request">
I think the problem is the file format used here.
file = "data_2018-09-07.csv.gz"
Gzip is a file format for storing a single compressed file.
file = "data_2018-09-07.zip"
ZIP is a file format for storing an arbitrary number of files and folders together with lossless compression.
This means gzip can only compress a single file, while zip compresses multiple files one by one and then archives them into a single file.
So if you need a single file from inside a big archive, you have to decompress the whole zip/gzip file to get it.
In your case, specifying the file as .csv.gz is more effective, since the data is compressed as a single file.
You can also try the ZIP Extractor extension, which enables efficient management of zip archives within Drive.
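If you want to stick with the format that did upload (.csv.gz), here is a minimal sketch of producing such a file with Python's standard library before calling upload_file_to_td_folder; the plain .csv source filename is an assumption.

import gzip
import shutil

# Compress a single CSV into a .gz file (one file in, one file out).
with open("data_2018-09-07.csv", "rb") as f_in, \
        gzip.open("data_2018-09-07.csv.gz", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# Then upload the gzipped file exactly as in the question.
upload_file_to_td_folder(folder_id, "data_2018-09-07.csv.gz", FILE_MIME)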

Open JSON files in different directory - Python3, Windows, pathlib

I am trying to open JSON files located in a directory other than the current working directory (cwd). My setup: Python 3.5 on Windows (using Anaconda).
from pathlib import *
import json

path = Path("C:/foo/bar")
filelist = []
for f in path.iterdir():
    filelist.append(f)
for file in filelist:
    with open(file.name) as data_file:
        data = json.load(data_file)
In this setting I have these values:
file >> C:\foo\bar\0001.json
file.name >> 0001.json
However, I get the following error message:
---> 13 with open(file.name) as data_file:
14 data = json.load(data_file)
FileNotFoundError: [Errno 2] No such file or directory: '0001.json'
Here is what I tried so far:
Use .joinpath() to add the directory to the file name in the open command:
with open(path.joinpath(file.name)) as data_file:
    data = json.load(data_file)
TypeError: invalid file: WindowsPath:('C:/foo/bar/0001.json')
Used .resolve(), since that works for me when loading CSV files into pandas. It did not work here.
for file in filelist:
    j = Path(path, file.name).resolve()
    with open(j) as data_file:
        data = json.load(data_file)
Since I'm on Windows, I wrote the path as (and yes, the file is in that directory):
path = Path("C:\\foo\\bar")  # resulted in the same FileNotFoundError as above
Instantiate path like this:
path = WindowsPath("C:/foo/bar")
# Same TypeError as above for both '\\' and '/'
The accepted answer has a lot of redundancy: it re-collects the generator into a list and mixes the with statement with pathlib.Path.
pathlib.Path is an awesome solution for handling paths, especially if we want to create scripts that work on both Linux and Windows.
# modules
from pathlib import Path
import json

# static values
JSON_SUFFIXES = [".json", ".js", ".other_suffix"]

folder_path = Path("C:/users/user/documents")

for file_path in folder_path.iterdir():
    if file_path.suffix in JSON_SUFFIXES:
        data = json.loads(file_path.read_bytes())
Just adding this modification for new users: pathlib.Path works with Python 3.
Complete solution; thanks @eryksun:
from pathlib import *
import json

path = Path("C:/foo/bar")
filelist = []
for f in path.iterdir():
    filelist.append(f)
for file in filelist:
    with open(str(file)) as data_file:
        data = json.load(data_file)
This line works as well:
with file.open() as data_file:
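For completeness, here is a minimal sketch of the same loop using Path.glob() and Path.open(), which avoids both the intermediate list and the str() conversion; the directory is the one from the question.

from pathlib import Path
import json

path = Path("C:/foo/bar")

# Path.open() works on the path object directly, so open(str(file)) is not needed.
for file in path.glob("*.json"):
    with file.open() as data_file:
        data = json.load(data_file)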