This XML file does not appear to have any style information associated with it when trying to download a file from Google Cloud Storage - google-cloud-functions

I have a Cloud Function that copies a file from a Standard bucket to a Nearline bucket. I also tried saving the file by opening it as a pandas DataFrame and writing it out as a Dask DataFrame. Both approaches work, but every time I try to download the file through the GUI I get the XML error message shown below. Does anyone know why this is happening? How can I prevent it?
This XML file does not appear to have any style information associated with it
import base64
import json
from google.cloud import storage
import dask.dataframe as dd
import pandas as pd


def hello_pubsub(event, context):
    """Triggered from a message on a Cloud Pub/Sub topic.
    Args:
        event (dict): Event payload.
        context (google.cloud.functions.Context): Metadata for the event.
    """
    print('here')
    print(event)
    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    payload = json.loads(pubsub_message)
    bucket_name = payload['data']['bucket_name']
    print(bucket_name)
    blob_name = payload['data']['file_name']
    print(blob_name)
    destination_bucket_name = 'infobip-email-uploaded'
    # destination_blob_name = blob_name[0:10] + '.csv'
    destination_blob_name = 'ddf-*.csv'
    df = pd.read_excel('gs://' + bucket_name + '/' + blob_name, sheet_name='Data', engine='xlrd')
    print('excel has been read')
    ddf = dd.from_pandas(df, npartitions=1, sort=True)
    print('dataframe has been transformed into dask')
    path = 'gs://' + destination_bucket_name + '/' + destination_blob_name
    print('path is')
    print(path)
    ddf.to_csv(path, index=False, sep=',', header=False)
    destination_blob_name = blob_name[0:10] + '.xlsx'
    copy_blob(bucket_name, blob_name, destination_bucket_name, destination_blob_name)
    print('File has been successfully copied')
    delete_blob(bucket_name, blob_name)
    print('File has been successfully deleted')
    return '200'


def copy_blob(bucket_name, blob_name, destination_bucket_name, destination_blob_name):
    """Copies a blob from one bucket to another with a new name."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    # destination_bucket_name = "destination-bucket-name"
    # destination_blob_name = "destination-object-name"
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.bucket(destination_bucket_name)
    blob_copy = source_bucket.copy_blob(
        source_blob, destination_bucket, destination_blob_name
    )
    print(
        "Blob {} in bucket {} copied to blob {} in bucket {}.".format(
            source_blob.name,
            source_bucket.name,
            blob_copy.name,
            destination_bucket.name,
        )
    )


def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.delete()
    print("Blob {} deleted.".format(blob_name))

Related

csv file isn't saved in different directory in python

My code reads a bunch of JSON files from a directory, extracts the "frequency" and "attenuation" data from those files, and writes them to a CSV file. Now I want to save that CSV file in a different directory. The code executes without any error but saves the file in the current directory. Can anyone help resolve this issue?
import csv
import glob
import json
import os

site = 'alpha'
frequency_to_check = '196050.000'
json_dir_name = 'V:/temp/test/'
json_pattern = os.path.join(json_dir_name, '*.json')
total_files = glob.glob(json_pattern)
atten = []
timestamp = []
save_path = 'V:/python/result/'
if not os.path.isdir(save_path):
    os.makedirs(save_path)
filename = f'{site}-{frequency_to_check}.csv'
with open(filename, 'w', newline='') as csv_file:
    for file in total_files:
        with open(file) as json_file:
            output_json = json.load(json_file)
            for key in output_json:
                if key['start-freq'] == frequency_to_check:
                    csv.writer(csv_file).writerow([key['start-freq'], key['attenuation']])
save_file = os.path.join(save_path, filename)
csv_file.close()
print(f'Total files processed {len(total_files)}')
The issue, as far as I can deduce, is here:
csv.writer(csv_file).writerow([key['start-freq'], key['attenuation']])
csv_file is your file object that is already open in memory, and every time this line is executed you are just writing rows into that already-open file. After that you are just creating a new path:
save_file = os.path.join(save_path, filename)
which is never actually used, since you close the file right after.
To fix this, I would suggest opening the CSV file at save_file (the full path) in the first place:
import csv
import glob
import json
import os

site = 'alpha'
frequency_to_check = '196050.000'
json_dir_name = 'V:/temp/test/'
json_pattern = os.path.join(json_dir_name, '*.json')
total_files = glob.glob(json_pattern)
atten = []
timestamp = []
save_path = 'V:/python/result/'
if not os.path.isdir(save_path):
    os.makedirs(save_path)
filename = f'{site}-{frequency_to_check}.csv'
save_file = os.path.join(save_path, filename)
with open(save_file, 'w', newline='') as csv_file:
    for file in total_files:
        with open(file) as json_file:
            output_json = json.load(json_file)
            for key in output_json:
                if key['start-freq'] == frequency_to_check:
                    csv.writer(csv_file).writerow([key['start-freq'], key['attenuation']])
print(f'Total files processed {len(total_files)}')
I guess this should work.
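A small, optional refinement: the csv.writer can be created once, before the loop, instead of being constructed for every row. A minimal sketch of the same loop with that change:
with open(save_file, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for file in total_files:
        with open(file) as json_file:
            output_json = json.load(json_file)
            for key in output_json:
                if key['start-freq'] == frequency_to_check:
                    writer.writerow([key['start-freq'], key['attenuation']])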

How to read a csv file from S3 bucket using AWS lambda and write it as new CSV to another S3 bucket? Python boto3

OK, so I am a beginner to AWS in general. I am writing a Lambda function that triggers on a file-upload event in S3, removes some columns, and writes the result to a new bucket. I've been banging my head for the past two days and I get a different error each time. Can someone modify or fix my code? outputlv will be my target bucket. Currently I am getting a "'/outputlv/output.csv' path does not exist" error on the with open('/outputlv/output.csv', 'w') as output_file line. Thanks.
import json
import urllib.parse
import boto3
import csv

s3 = boto3.client('s3')


def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    file_name = s3.get_object(Bucket=bucket, Key=key)
    csv_reader = csv.reader(file_name)
    with open('/outputlv/output.csv', 'w') as output_file:
        wtr = csv.writer(output_file)
        for i in csv_reader:
            wtr.writerow(i[0], i[2], i[3])
    target_bucket = 'outputlv'
    final_file = 'outputlv/output.csv'
    s3.put_object(Bucket=target_bucket, Key=final_file)
Why don't you just get the content? Is it required to work with local files at all?
response = s3.get_object(Bucket=bucket, Key=key)
# Get file content
content = response['Body'].read()
# Pass file content to csv reader
csv_reader = csv.reader(content)
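For completeness, a rough in-memory sketch of that approach (the bucket names, key, and column indices are placeholders taken from the question); note that csv.reader expects text lines, so the body is decoded and split first:
import csv
import io

import boto3

s3 = boto3.client('s3')


def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    # Read the source object and decode it so csv.reader gets text lines
    body = s3.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')

    # Keep only the wanted columns (indices are placeholders)
    output = io.StringIO()
    writer = csv.writer(output)
    for row in csv.reader(body.splitlines()):
        writer.writerow([row[0], row[2], row[3]])

    # Write the transformed CSV straight to the target bucket, no local file needed
    s3.put_object(Bucket='outputlv', Key='output.csv', Body=output.getvalue().encode('utf-8'))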

ROS service failed to save files

I want to have a service 'save_readings' that automatically saves data from a rostopic into a file. But each time the service gets called, it doesn't save any file.
I've tried running the file-saving code in Python without using a rosservice and the code works fine.
I don't understand why this is happening.
#!/usr/bin/env python
# license removed for brevity
import rospy, numpy
from std_msgs.msg import String, Int32MultiArray, Float32MultiArray, Bool
from std_srvs.srv import Empty, EmptyResponse
import geometry_msgs.msg
from geometry_msgs.msg import WrenchStamped
import json
# import settings

pos_record = []
wrench_record = []


def ftmsg2listandflip(ftmsg):
    return [ftmsg.wrench.force.x, ftmsg.wrench.force.y, ftmsg.wrench.force.z, ftmsg.wrench.torque.x, ftmsg.wrench.torque.y, ftmsg.wrench.torque.z]


def callback_pos(data):
    global pos_record
    pos_record.append(data.data)


def callback_wrench(data):
    global wrench_record
    ft = ftmsg2listandflip(data)
    wrench_record.append([data.header.stamp.to_sec()] + ft)


def exp_listener():
    stop_sign = False
    rospy.Subscriber("stage_pos", Float32MultiArray, callback_pos)
    rospy.Subscriber("netft_data", WrenchStamped, callback_wrench)
    rospy.spin()


def start_read(req):
    global pos_record
    global wrench_record
    pos_record = []
    wrench_record = []
    return EmptyResponse()


def save_readings(req):
    global pos_record
    global wrench_record
    filename = rospy.get_param('save_file_name')
    output_data = {'pos_list': pos_record, 'wrench_list': wrench_record}
    rospy.loginfo("output_data %s", output_data)
    with open(filename, 'w') as outfile:  # write data to 'data.json'
        print('dumping json file')
        json.dump(output_data, outfile)  # TODO: find out why failing to save the file.
    outfile.close()
    print("file saved")
    rospy.sleep(2)
    return EmptyResponse()


if __name__ == '__main__':
    try:
        rospy.init_node('lisener_node', log_level=rospy.INFO)
        s_1 = rospy.Service('start_read', Empty, start_read)
        s_1 = rospy.Service('save_readings', Empty, save_readings)
        exp_listener()
        print('mylistener ready!')
    except rospy.ROSInterruptException:
        pass
Got it. I need to specify a path for the file to be saved.
save_path = '/home/user/catkin_ws/src/motionstage/'
filename = save_path + filename
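For context, a minimal sketch of how that fix slots into the save_readings service from the node above (the directory is just an example; everything else comes from the original code):
import os

def save_readings(req):
    save_path = '/home/user/catkin_ws/src/motionstage/'  # example absolute directory
    filename = os.path.join(save_path, rospy.get_param('save_file_name'))
    output_data = {'pos_list': pos_record, 'wrench_list': wrench_record}
    with open(filename, 'w') as outfile:
        json.dump(output_data, outfile)
    return EmptyResponse()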

AWS: Boto3: AssumeRole example which includes role usage

I'm trying to use AssumeRole in such a way that I'm traversing multiple accounts and retrieving assets for those accounts. I've made it to this point:
import boto3

sts_client = boto3.client('sts')
assumedRoleObject = sts_client.assume_role(
    RoleArn="arn:aws:iam::account-of-role-to-assume:role/name-of-role",
    RoleSessionName="AssumeRoleSession1")
Great, I have the assumedRoleObject. But now I want to use that to list things like ELBs or something that isn't a built-in low-level resource.
How does one go about doing that? If I may ask, please code out a full example, so that everyone can benefit.
Here's a code snippet from the official AWS documentation where an S3 resource is created to list all S3 buckets. boto3 resources or clients for other services can be built in a similar fashion.
# create an STS client object that represents a live connection to the
# STS service
sts_client = boto3.client('sts')

# Call the assume_role method of the STSConnection object and pass the role
# ARN and a role session name.
assumed_role_object = sts_client.assume_role(
    RoleArn="arn:aws:iam::account-of-role-to-assume:role/name-of-role",
    RoleSessionName="AssumeRoleSession1"
)

# From the response that contains the assumed role, get the temporary
# credentials that can be used to make subsequent API calls
credentials = assumed_role_object['Credentials']

# Use the temporary credentials that AssumeRole returns to make a
# connection to Amazon S3
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)

# Use the Amazon S3 resource object that is now configured with the
# credentials to access your S3 buckets.
for bucket in s3_resource.buckets.all():
    print(bucket.name)
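Since the question asks about ELBs specifically, the same temporary credentials can be passed to a client for any other service; a hedged sketch (the region is just an example) that lists classic load balancers:
# Build an ELB client from the same temporary credentials as above
elb_client = boto3.client(
    'elb',
    region_name='us-east-1',  # example region
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)

for lb in elb_client.describe_load_balancers()['LoadBalancerDescriptions']:
    print(lb['LoadBalancerName'])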
To get a session with an assumed role:
import botocore
import boto3
import datetime
from dateutil.tz import tzlocal

assume_role_cache: dict = {}


def assumed_role_session(role_arn: str, base_session: botocore.session.Session = None):
    base_session = base_session or boto3.session.Session()._session
    fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
        client_creator=base_session.create_client,
        source_credentials=base_session.get_credentials(),
        role_arn=role_arn,
        extra_args={
            # 'RoleSessionName': None  # set this if you want something non-default
        }
    )
    creds = botocore.credentials.DeferredRefreshableCredentials(
        method='assume-role',
        refresh_using=fetcher.fetch_credentials,
        time_fetcher=lambda: datetime.datetime.now(tzlocal())
    )
    botocore_session = botocore.session.Session()
    botocore_session._credentials = creds
    return boto3.Session(botocore_session=botocore_session)


# usage:
session = assumed_role_session('arn:aws:iam::ACCOUNTID:role/ROLE_NAME')
ec2 = session.client('ec2')  # ... etc.
The resulting session's credentials will be automatically refreshed when required which is quite nice.
Note: my previous answer was outright wrong but I can't delete it, so I've replaced it with a better and working answer.
You can assume a role using an STS token, like this:
class Boto3STSService(object):
    def __init__(self, arn):
        sess = Session(aws_access_key_id=ARN_ACCESS_KEY,
                       aws_secret_access_key=ARN_SECRET_KEY)
        sts_connection = sess.client('sts')
        assume_role_object = sts_connection.assume_role(
            RoleArn=arn, RoleSessionName=ARN_ROLE_SESSION_NAME,
            DurationSeconds=3600)
        self.credentials = assume_role_object['Credentials']
This will give you a temporary access key, secret key, and session token. With these temporary credentials, you can access any service. For example, if you want to access ELB, you can use the code below:
self.tmp_credentials = Boto3STSService(arn).credentials


def get_boto3_session(self):
    tmp_access_key = self.tmp_credentials['AccessKeyId']
    tmp_secret_key = self.tmp_credentials['SecretAccessKey']
    security_token = self.tmp_credentials['SessionToken']
    boto3_session = Session(
        aws_access_key_id=tmp_access_key,
        aws_secret_access_key=tmp_secret_key, aws_session_token=security_token
    )
    return boto3_session


def get_elb_boto3_connection(self, region):
    sess = self.get_boto3_session()
    elb_conn = sess.client(service_name='elb', region_name=region)
    return elb_conn
With reference to the solution by @jarrad, which is not working as of Feb 2021, and as a solution that does not use STS explicitly, please see the following:
import boto3
import botocore.session
from botocore.credentials import AssumeRoleCredentialFetcher, DeferredRefreshableCredentials


def get_boto3_session(assume_role_arn=None):
    session = boto3.Session(aws_access_key_id="abc", aws_secret_access_key="def")
    if not assume_role_arn:
        return session
    fetcher = AssumeRoleCredentialFetcher(
        client_creator=_get_client_creator(session),
        source_credentials=session.get_credentials(),
        role_arn=assume_role_arn,
    )
    botocore_session = botocore.session.Session()
    botocore_session._credentials = DeferredRefreshableCredentials(
        method='assume-role',
        refresh_using=fetcher.fetch_credentials
    )
    return boto3.Session(botocore_session=botocore_session)


def _get_client_creator(session):
    def client_creator(service_name, **kwargs):
        return session.client(service_name, **kwargs)
    return client_creator
The function can be called as follows:
ec2_client = get_boto3_session(assume_role_arn='my_role_arn').client('ec2', region_name='us-east-1')
If you want a functional implementation, this is what I settled on:
from typing import Optional

import boto3


def filter_none_values(kwargs: dict) -> dict:
    """Returns a new dictionary excluding items where value was None"""
    return {k: v for k, v in kwargs.items() if v is not None}


def assume_session(
    role_session_name: str,
    role_arn: str,
    duration_seconds: Optional[int] = None,
    region_name: Optional[str] = None,
) -> boto3.Session:
    """
    Returns a session with the given name and role.
    If not specified, duration will be set by AWS, probably at 1 hour.
    If not specified, region will be left unset.
    Region can be overridden by each client or resource spawned from this session.
    """
    assume_role_kwargs = filter_none_values(
        {
            "RoleSessionName": role_session_name,
            "RoleArn": role_arn,
            "DurationSeconds": duration_seconds,
        }
    )
    credentials = boto3.client("sts").assume_role(**assume_role_kwargs)["Credentials"]
    create_session_kwargs = filter_none_values(
        {
            "aws_access_key_id": credentials["AccessKeyId"],
            "aws_secret_access_key": credentials["SecretAccessKey"],
            "aws_session_token": credentials["SessionToken"],
            "region_name": region_name,
        }
    )
    return boto3.Session(**create_session_kwargs)


def main() -> None:
    session = assume_session(
        "MyCustomSessionName",
        "arn:aws:iam::XXXXXXXXXXXX:role/TheRoleIWantToAssume",
        region_name="us-east-1",
    )
    client = session.client(service_name="ec2")
    print(client.describe_key_pairs())
import json
import boto3

roleARN = 'arn:aws:iam::account-of-role-to-assume:role/name-of-role'
client = boto3.client('sts')
response = client.assume_role(RoleArn=roleARN,
                              RoleSessionName='RoleSessionName',
                              DurationSeconds=900)
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1',
                               aws_access_key_id=response['Credentials']['AccessKeyId'],
                               aws_secret_access_key=response['Credentials']['SecretAccessKey'],
                               aws_session_token=response['Credentials']['SessionToken'])
response = dynamodb_client.get_item(
    Key={
        'key1': {
            'S': '1',
        },
        'key2': {
            'S': '2',
        },
    },
    TableName='TestTable')
print(response)
#!/usr/bin/env python3
import boto3

sts_client = boto3.client('sts')
assumed_role = sts_client.assume_role(RoleArn="arn:aws:iam::123456789012:role/example_role",
                                      RoleSessionName="AssumeRoleSession1",
                                      DurationSeconds=1800)
session = boto3.Session(
    aws_access_key_id=assumed_role['Credentials']['AccessKeyId'],
    aws_secret_access_key=assumed_role['Credentials']['SecretAccessKey'],
    aws_session_token=assumed_role['Credentials']['SessionToken'],
    region_name='us-west-1'
)
# now we make use of the role to retrieve a parameter from SSM
client = session.client('ssm')
response = client.get_parameter(
    Name='/this/is/a/path/parameter',
    WithDecryption=True
)
print(response)
Assuming that (1) the ~/.aws/config or ~/.aws/credentials file is populated with each of the roles that you wish to assume, and (2) the default role has AssumeRole defined in its IAM policy for each of those roles, then you can simply do the following (in pseudo-code) and not have to fuss with STS:
import boto3

# get all of the roles from the AWS config/credentials file using a config file parser
profiles = get_profiles()

for profile in profiles:
    # this is only used to fetch the available regions
    initial_session = boto3.Session(profile_name=profile)
    # get the regions
    regions = boto3.Session.get_available_regions('ec2')
    # cycle through the regions, setting up session, resource and client objects
    for region in regions:
        boto3_session = boto3.Session(profile_name=profile, region_name=region)
        boto3_resource = boto3_session.resource(service_name='s3', region_name=region)
        boto3_client = boto3_session.client(service_name='s3', region_name=region)
        # [ do something interesting with your session/resource/client here ]
Credential Setup (boto3 - Shared Credentials File)
Assume Role Setup (AWS)
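get_profiles() above is pseudo-code; hypothetically it could be as simple as reading the section names from the shared credentials/config files with configparser (purely illustrative, assuming the standard file layout):
import configparser
import os


def get_profiles():
    """Hypothetical helper: collect profile names from the shared AWS config files."""
    profiles = set()
    for path in ('~/.aws/credentials', '~/.aws/config'):
        parser = configparser.ConfigParser()
        parser.read(os.path.expanduser(path))
        for section in parser.sections():
            # in ~/.aws/config the sections are named "profile <name>"
            profiles.add(section.replace('profile ', '', 1))
    return sorted(profiles)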
After a few days of searching, this is the simplest solution I have found. It is explained here, but it does not have a usage example.
import boto3

for profile in boto3.Session().available_profiles:
    boto3.DEFAULT_SESSION = boto3.session.Session(profile_name=profile)
    s3 = boto3.resource('s3')
    for bucket in s3.buckets.all():
        print(bucket)
This will switch the default role you are using. To avoid making the profile the default, just do not assign it to boto3.DEFAULT_SESSION; instead, do the following:
testing_profile = boto3.session.Session(profile_name='mainTesting')
s3 = testing_profile.resource('s3')
for bucket in s3.buckets.all():
    print(bucket)
It is important to note that the .aws credentials need to be set up in a specific way:
[default]
aws_access_key_id = default_access_id
aws_secret_access_key = default_access_key
[main]
aws_access_key_id = main_profile_access_id
aws_secret_access_key = main_profile_access_key
[mainTesting]
source_profile = main
role_arn = Testing role arn
mfa_serial = mfa_arn_for_main_role
[mainProduction]
source_profile = main
role_arn = Production role arn
mfa_serial = mfa_arn_for_main_role
I don't know why, but the mfa_serial key has to be on the role profiles for this to work, rather than on the source account, which would make more sense.
Here's the code snippet I used:
sts_client = boto3.client('sts')
assumed_role_object = sts_client.assume_role(
    RoleArn=<arn of the role to assume>,
    RoleSessionName="<role session name>"
)
print(assumed_role_object)
credentials = assumed_role_object['Credentials']
session = Session(
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken']
)
self.s3 = session.client('s3')

Fail to store data in csv file through scraping

I am trying to scrape a webpage, extract data, and store all the data in a CSV file. Before adding the ScrapeCallback class and calling it, everything worked fine. However, after adding the new class, nothing except the headers gets stored in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html


class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)


def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url


class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html


def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())