Transfer values from JSON to a Google Sheet using Python

So I have this code below which shows the number of times emails have been sent to a particular email address (from a list of emails saved in a Google Sheet), and it counts up the values across the different sessions the code runs. The values are stored in a JSON file.
Is there a way I can push these values from the JSON file to the Google Sheet directly from Python? I want to do this without using a Pandas DataFrame or the IMPORTJSON function in Google Sheets.
As a newbie to programming and Python, I have tried researching this but have come to a dead end. Any help is appreciated.
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# login & open sheets
scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/spreadsheets',
         "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"]
credentials = ServiceAccountCredentials.from_json_keyfile_name('my-jsonfile.json', scope)
client = gspread.authorize(credentials)
sheet3 = client.open('Dashboard').worksheet('Sheet3')  # Open the spreadsheet
sheet1 = client.open('Dashboard').worksheet('Sheet1')  # Open the spreadsheet
sheet4 = client.open('Dashboard').worksheet('Sheet4')  # Open the spreadsheet

### Countup ###########
import smtplib
import ssl
from email.mime.text import MIMEText  # New line
from email.utils import formataddr  # New line

# User configuration
sender_email = 'email'
sender_name = 'username'
password = "password"

x = sheet3.row_values(4)
c = len(sheet1.col_values(8))
cell = ""

from collections import Counter
import json

counter_file_path = "counter.json"
try:
    with open(counter_file_path, "r") as f:
        email_stats = json.load(f)
except FileNotFoundError:
    email_stats = {}

successful_emails = []
for x in range(4, c):
    names = sheet3.cell(x + 1, 6).value
    emails = sheet3.cell(x + 1, 8).value
    if names == cell and emails == cell:
        print("no_data")
    else:
        receiver_names = list(names.split())
        receiver_emails = list(emails.split())
        # Email text
        email_body = '''
        This is a test email sent by Python. Isn't that cool?
        '''
        for receiver_email, receiver_name in zip(receiver_emails, receiver_names):
            print("Sending the email...")
            # Configuring user's info
            msg = MIMEText(email_body, 'plain')
            msg['To'] = formataddr((receiver_name, receiver_email))
            msg['From'] = formataddr((sender_name, sender_email))
            msg['Subject'] = 'Hello, my friend ' + receiver_name
            try:
                # Creating an SMTP session | use 587 with TLS, 465 with SSL and 25
                server = smtplib.SMTP('smtp.gmail.com', 587)
                server.ehlo()
                # Encrypt the connection
                context = ssl.create_default_context()
                server.starttls(context=context)
                # Log in to the Google account
                server.login(sender_email, password)
                # Send the email from sender to receiver with the email body
                server.sendmail(sender_email, receiver_email, msg.as_string())
                print('Email sent!')
                successful_emails.append(receiver_email)
                if receiver_email in email_stats:
                    email_stats[receiver_email] += 1
                else:
                    email_stats[receiver_email] = 1
            except Exception as e:
                print(f'Oh no! Something bad happened!\n{e}')
            finally:
                print('Closing the server...')
                server.quit()

# counter = Counter(successful_emails)
# print(counter)
print(email_stats)  # output - all occurrences for each email
with open(counter_file_path, "w") as f:
    json.dump(email_stats, f)
The resulting JSON file contains {"receiver_emails_1": 6, "receiver_emails_2": 6}.

Try this.
Please refer to this documentation (updating a range of cells using the top-left corner address):
https://gspread.readthedocs.io/en/latest/#example-usage
import json
import gspread
from google.oauth2.service_account import Credentials

# connect to your google sheet
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = Credentials.from_service_account_file('key.json', scopes=scope)
gc = gspread.authorize(credentials)
wks = gc.open("your spreadsheet name").sheet1

# Let's say you have some JSON values
x = '{ "receiver_email_1":6, "receiver_email_2":8, "receiver_email_3":10 }'
y = json.loads(x)
result = []
for key in y:
    result.append([key, y[key]])
wks.update('A1', result)
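Applied to the question's own setup, a minimal sketch might look like the following. It assumes the counter file is the counter.json written by the email script and that the counts should go into one of the worksheets already opened above (Sheet4 here is an arbitrary choice); adjust names as needed.
import json
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Same authorisation as in the question; the key file name is the question's placeholder.
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
credentials = ServiceAccountCredentials.from_json_keyfile_name('my-jsonfile.json', scope)
client = gspread.authorize(credentials)
sheet4 = client.open('Dashboard').worksheet('Sheet4')  # destination worksheet (assumption)

# Load the counters written at the end of the email script.
with open("counter.json", "r") as f:
    email_stats = json.load(f)

# Turn {"email": count, ...} into [["email", count], ...] rows.
rows = [[email, count] for email, count in email_stats.items()]

# Write the whole block starting at A1: one email per row, its count in column B.
sheet4.update('A1', rows)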

Related

This XML file does not appear to have any style information associated with it when trying to download the file from Google Cloud Storage

I have a Cloud Function that copies a file from one Standard bucket to a Nearline bucket. I also tried saving the file by opening it as a DataFrame and writing it out as a Dask DataFrame. Both approaches worked, but every time I try to download the file through the GUI I get the XML error message stated below. Does anyone know why this is happening? How can I prevent it from happening?
This XML file does not appear to have any style information associated with it
import base64
import json
from google.cloud import storage
import dask.dataframe as dd
import pandas as pd

def hello_pubsub(event, context):
    """Triggered from a message on a Cloud Pub/Sub topic.
    Args:
        event (dict): Event payload.
        context (google.cloud.functions.Context): Metadata for the event.
    """
    print('here')
    print(event)
    pubsub_message = base64.b64decode(event['data']).decode('utf-8')
    payload = json.loads(pubsub_message)
    bucket_name = payload['data']['bucket_name']
    print(bucket_name)
    blob_name = payload['data']['file_name']
    print(blob_name)
    destination_bucket_name = 'infobip-email-uploaded'
    #destination_blob_name = blob_name[0:10]+'.csv'
    destination_blob_name = 'ddf-*.csv'
    df = pd.read_excel('gs://'+bucket_name+'/'+blob_name, sheet_name='Data', engine='xlrd')
    print('excel has been read')
    ddf = dd.from_pandas(df, npartitions=1, sort=True)
    print('dataframe has been transformed into dask')
    path = 'gs://'+destination_bucket_name+'/'+destination_blob_name
    print('path is')
    print(path)
    ddf.to_csv(path, index=False, sep=',', header=False)
    destination_blob_name = blob_name[0:10]+'.xlsx'
    copy_blob(bucket_name, blob_name, destination_bucket_name, destination_blob_name)
    print('File has been successfully copied')
    delete_blob(bucket_name, blob_name)
    print('File has been successfully deleted')
    return '200'

def copy_blob(bucket_name, blob_name, destination_bucket_name, destination_blob_name):
    """Copies a blob from one bucket to another with a new name."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    # destination_bucket_name = "destination-bucket-name"
    # destination_blob_name = "destination-object-name"
    storage_client = storage.Client()
    source_bucket = storage_client.bucket(bucket_name)
    source_blob = source_bucket.blob(blob_name)
    destination_bucket = storage_client.bucket(destination_bucket_name)
    blob_copy = source_bucket.copy_blob(
        source_blob, destination_bucket, destination_blob_name
    )
    print(
        "Blob {} in bucket {} copied to blob {} in bucket {}.".format(
            source_blob.name,
            source_bucket.name,
            blob_copy.name,
            destination_bucket.name,
        )
    )

def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket."""
    # bucket_name = "your-bucket-name"
    # blob_name = "your-object-name"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.delete()
    print("Blob {} deleted.".format(blob_name))

Instaloader JSON files: Convert 200 JSON files into a Single CSV (Python 3.7)

I want to automatically download pictures (or videos) along with their captions and other data from a specific Instagram hashtag (e.g. #moodoftheday) using Instaloader. Instaloader returns JSON files that include the posts' metadata.
The following code worked for a single user profile's metadata.
I want to do the same, but for a #hashtag rather than a specific user.
The ultimate goal is to combine all of the JSON files (e.g. 200) into a single CSV file.
How can I process my downloaded data into a clean Excel/CSV file?
Here is my code:
# Install Instaloader
import instaloader

def get_instagram_posts(username, startdate, enddate):
    # Create an instaloader object with parameters
    L = instaloader.Instaloader(download_pictures=False, download_videos=False, download_comments=False, compress_json=False)
    # Log in with the instaloader object
    L.login("username", "password")
    # Search the instagram profile
    profile = instaloader.Profile.from_username(L.context, username)
    # Scrape the posts
    posts = profile.get_posts()
    for post in takewhile(lambda p: p.date > startdate, dropwhile(lambda p: p.date > enddate, posts)):
        print(post.date)
        L.download_post(post, target=profile.username)

'''
This function will now save all instagram posts and related data to a folder in your current working directory.
Let’s call this function on the instagram account of “moodoftheday”. Let the script do its magic.
This might take a while so be patient.
'''
import os
import datetime

# instagram username
username = "realdonaldtrump"
# daterange of scraping
startdate = datetime(2020, 9, 1)
enddate = datetime(2020, 10, 1)
# get your current working directory
current_wkdir = os.get_cwd()
# Call the function. This will automatically store all the scraped data in a folder in your current working directory
get_instagram_posts(username, startdate, enddate)

'''
You notice that this data is NOT yet in the right format, since each post has a separate json file.
You will need to process all these json files into a consolidated excel file in order to perform analyses on the data.
'''
def parse_instafiles(username, path):
    """
    This function loads in all the json files generated by the instaloader package and parses them into a csv file.
    """
    #print('Entering provided directory...')
    os.chdir(os.path.join(path, username))
    columns = ['filename', 'datetime', 'type', 'locations_id', 'locations_name', 'mentions', 'hashtags', 'video_duration']
    dataframe = pd.DataFrame(columns=[])
    #print('Traversing file tree...')
    glob('*UTC.json')
    for file in glob('*UTC.json'):
        with open(file, 'r') as filecontent:
            filename = filecontent.name
            #print('Found JSON file: ' + filename + '. Loading...')
            try:
                metadata = orjson.loads(filecontent.read())
            except IOError as e:
                #print("I/O Error. Couldn't load file. Trying the next one...")
                continue
            else:
                pass
            #print('Collecting relevant metadata...')
            time = datetime.fromtimestamp(int(metadata['node']['taken_at_timestamp']))
            type_ = metadata['node']['__typename']
            likes = metadata['node']['edge_media_preview_like']['count']
            comments = metadata['node']['edge_media_to_comment']['count']
            username = metadata['node']['owner']['username']
            followers = metadata['node']['owner']['edge_followed_by']['count']
            try:
                text = metadata['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                text = ""
            try:
                post_id = metadata['node']['id']
            except:
                post_id = ""
            minedata = {'filename': filename, 'time': time, 'text': text,
                        'likes': likes, 'comments': comments, 'username': username, 'followers': followers, 'post_id': post_id}
            #print('Writing to dataframe...')
            dataframe = dataframe.append(minedata, ignore_index=True)
            #print('Closing file...')
            del metadata
            filecontent.close()
    #print('Storing dataframe to CSV file...')
    #print('Done.')
    dataframe['source'] = 'Instagram'
    return dataframe

'''
You can then use this function to process the "moodoftheday" Instagram data.
'''
df_instagram = parse_instafiles(username, os.getcwd())
df_instagram.to_excel("moodoftheday.csv")
I am very new to Python and programming overall, therefore any help is very much appreciated!!
Thank you in advance! Sofia
I made some changes; it's not showing errors now, but it still needs some professional work:
import instaloader
from datetime import datetime
import datetime
from itertools import takewhile
from itertools import dropwhile
import os
import glob as glob
import json
import pandas as pd
import csv

lusername = ''
lpassword = ''

def get_instagram_posts(username, startdate, enddate):
    # Create an instaloader object with parameters
    L = instaloader.Instaloader(download_pictures=False, download_videos=False, download_comments=False, compress_json=False)
    # Log in with the instaloader object (pass the credential variables, not string literals)
    L.login(lusername, lpassword)
    # Search the instagram profile
    profile = instaloader.Profile.from_username(L.context, username)
    # Scrape the posts
    posts = profile.get_posts()
    for post in takewhile(lambda p: p.date > startdate, dropwhile(lambda p: p.date > enddate, posts)):
        print(post.date)
        L.download_post(post, target=profile.username)

# instagram username
username = "realdonaldtrump"
# daterange of scraping
startdate = datetime.datetime(2020, 9, 1, 0, 0)
enddate = datetime.datetime(2022, 2, 1, 0, 0)
# get your current working directory
current_wkdir = os.getcwd()
# Call the function. This will automatically store all the scraped data in a folder in your current working directory
get_instagram_posts(username, startdate, enddate)

def parse_instafiles(username, path):
    #print('Entering provided directory...')
    os.chdir(os.path.join(path, username))
    columns = ['filename', 'datetime', 'type', 'locations_id', 'locations_name', 'mentions', 'hashtags', 'video_duration']
    dataframe = pd.DataFrame(columns=[])
    #print('Traversing file tree...')
    # glob('*UTC.json')
    for file in glob.glob('*UTC.json'):
        with open(file, 'r') as filecontent:
            filename = filecontent.name
            #print('Found JSON file: ' + filename + '. Loading...')
            try:
                metadata = json.load(filecontent)
            except IOError as e:
                #print("I/O Error. Couldn't load file. Trying the next one...")
                continue
            else:
                pass
            #print('Collecting relevant metadata...')
            time = datetime.datetime.fromtimestamp(int(metadata['node']['taken_at_timestamp']))
            type_ = metadata['node']['__typename']
            likes = metadata['node']['edge_media_preview_like']['count']
            comments = metadata['node']['edge_media_to_comment']['count']
            username = metadata['node']['owner']['username']
            followers = metadata['node']['owner']['edge_followed_by']['count']
            try:
                text = metadata['node']['edge_media_to_caption']['edges'][0]['node']['text']
            except:
                text = ""
            try:
                post_id = metadata['node']['id']
            except:
                post_id = ""
            minedata = {'filename': filename, 'time': time, 'text': text,
                        'likes': likes, 'comments': comments, 'username': username, 'followers': followers, 'post_id': post_id}
            #print('Writing to dataframe...')
            dataframe = dataframe.append(minedata, ignore_index=True)
            #print('Closing file...')
            del metadata
            filecontent.close()
    #print('Storing dataframe to CSV file...')
    #print('Done.')
    dataframe['source'] = 'Instagram'
    return dataframe

'''
You can then use this function to process the "moodoftheday" Instagram data.
'''
df_instagram = parse_instafiles(username, os.getcwd())
df_instagram.to_csv("moodoftheday.csv")
Instaloader has an example of hashtag search in its documentation; here's the code:
from datetime import datetime
import instaloader

L = instaloader.Instaloader()
posts = instaloader.Hashtag.from_name(L.context, "urbanphotography").get_posts()

SINCE = datetime(2020, 5, 10)  # further from today, inclusive
UNTIL = datetime(2020, 5, 11)  # closer to today, not inclusive

k = 0  # initiate k
#k_list = []  # uncomment this to tune k
for post in posts:
    postdate = post.date
    if postdate > UNTIL:
        continue
    elif postdate <= SINCE:
        k += 1
        if k == 50:
            break
        else:
            continue
    else:
        L.download_post(post, "#urbanphotography")
        # if you want to tune k, uncomment below to get your k max
        #k_list.append(k)
        k = 0  # set k to 0
#max(k_list)
Here's the link for more info:
https://instaloader.github.io/codesnippets.html
I'm trying to do something similar, but I'm still very new to programming, so I'm sorry if I can't offer much help.
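Putting the two pieces together, a rough sketch of the hashtag workflow might look like this. It reuses the parse_instafiles function from the question and the metadata-only options from the question's Instaloader call; the hashtag name, post limit and file names are placeholders, not a tested solution.
import os
import instaloader

HASHTAG = "moodoftheday"  # placeholder hashtag
TARGET = "#" + HASHTAG    # download_post() writes into a folder with this name

# Metadata-only download, as in the question (no pictures/videos/comments, plain .json files).
L = instaloader.Instaloader(download_pictures=False, download_videos=False,
                            download_comments=False, compress_json=False)
posts = instaloader.Hashtag.from_name(L.context, HASHTAG).get_posts()

for i, post in enumerate(posts):
    if i >= 200:  # stop after roughly 200 posts; adjust as needed
        break
    L.download_post(post, TARGET)

# The JSON files now sit in ./#moodoftheday/, so the question's parser can be pointed at that folder.
df = parse_instafiles(TARGET, os.getcwd())
df.to_csv(HASHTAG + ".csv", index=False)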

Passing request body variables into script2 when importing

I am hoping someone can help me find a way to pass the variables I post when calling my POST service (script1) into script2 when it gets imported.
I have script1 below, a POST REST service built using Flask:
from testingdf import *
from flask import Flask, json, request

app = Flask(__name__)

@app.route("/results", methods=['POST'])
def createResults():
    entry = request.get_json().get('entry', '')
    passcode = request.get_json().get('passcode', '')
    print "**testing results**"
    print results
    response = app.response_class(
        response=json.dumps(results),
        status=200,
        mimetype='application/json'
    )
    return json.dumps(results)
I want script1 to call script2 below and return script2's output (results) as a JSON response from the script1 API:
import pandas as pd

df = pd.DataFrame()
number = [21, 44, 31, 553, 63, 35]
access = ["denied", "Try Again", "Retry", "Accepted", "Error", "Success"]

def mapping(entry, passcode):
    team = ''
    if entry in range(20, 30):
        team = 'Revolt'
    elif (entry in range(40, 50)) and (passcode == 'Try Again'):
        team = 'Strike'
    elif (entry in range(60, 100)) and (passcode == 'Error'):
        team = 'Exception'
    return team

testInput = zip(entry, passcode)
for number, access in testInput:
    Team = mapping(number, access)
    df = df.append({'Access Message': access, 'Number': number}, ignore_index=True)

results = {
    'Entry': number,
    'Outcome': access,
    'team': team
}
I used from testingdf import * to import script2 into script1. However, whenever I call my API it runs script2 first, and the variables that I pass in the request body come back as unrecognized, which is expected behavior since the imported module is executed first. Is there another way around this?
How can I import script2 and its variables without having it executed when I call my API?
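No answer is included here, but the usual way around module-level execution (a sketch of the general pattern, not the poster's confirmed fix) is to move script2's computation into a function that takes entry and passcode as parameters. Importing testingdf then only defines the function, and script1 calls it with the values from the request body:
# testingdf.py -- hypothetical restructuring of script2: nothing runs at import time
def mapping(entry, passcode):
    team = ''
    if entry in range(20, 30):
        team = 'Revolt'
    elif entry in range(40, 50) and passcode == 'Try Again':
        team = 'Strike'
    elif entry in range(60, 100) and passcode == 'Error':
        team = 'Exception'
    return team

def compute_results(entry, passcode):
    """Build the results dict from the values posted to the API."""
    return {'Entry': entry, 'Outcome': passcode, 'team': mapping(entry, passcode)}

# script1 then imports only the function and calls it inside the view, e.g.:
#
#   from testingdf import compute_results
#
#   @app.route("/results", methods=['POST'])
#   def createResults():
#       body = request.get_json()
#       results = compute_results(body.get('entry', ''), body.get('passcode', ''))
#       return json.dumps(results)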

AWS: Boto3: AssumeRole example which includes role usage

I'm trying to use AssumeRole in such a way that I'm traversing multiple accounts and retrieving assets for those accounts. I've made it to this point:
import boto3

sts_client = boto3.client('sts')
assumedRoleObject = sts_client.assume_role(
    RoleArn="arn:aws:iam::account-of-role-to-assume:role/name-of-role",
    RoleSessionName="AssumeRoleSession1")
Great, I have the assumedRoleObject. But now I want to use that to list things like ELBs or something that isn't a built-in low-level resource.
How does one go about doing that? If I may ask, please code out a full example so that everyone can benefit.
Here's a code snippet from the official AWS documentation where an S3 resource is created for listing all S3 buckets. Boto3 resources or clients for other services can be built in a similar fashion.
# create an STS client object that represents a live connection to the
# STS service
sts_client = boto3.client('sts')

# Call the assume_role method of the STSConnection object and pass the role
# ARN and a role session name.
assumed_role_object = sts_client.assume_role(
    RoleArn="arn:aws:iam::account-of-role-to-assume:role/name-of-role",
    RoleSessionName="AssumeRoleSession1"
)

# From the response that contains the assumed role, get the temporary
# credentials that can be used to make subsequent API calls
credentials = assumed_role_object['Credentials']

# Use the temporary credentials that AssumeRole returns to make a
# connection to Amazon S3
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)

# Use the Amazon S3 resource object that is now configured with the
# credentials to access your S3 buckets.
for bucket in s3_resource.buckets.all():
    print(bucket.name)
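Since the original question asked about ELBs specifically, the same temporary credentials can be handed to any other client in the same fashion; a small sketch continuing from the snippet above (the region is an assumption):
# Build an ELB client from the same temporary credentials and list the load balancers.
elb_client = boto3.client(
    'elb',  # use 'elbv2' for Application/Network Load Balancers
    region_name='us-east-1',  # assumption: pick the region you need
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken'],
)
for lb in elb_client.describe_load_balancers()['LoadBalancerDescriptions']:
    print(lb['LoadBalancerName'])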
To get a session with an assumed role:
import botocore
import boto3
import datetime
from dateutil.tz import tzlocal

assume_role_cache: dict = {}

def assumed_role_session(role_arn: str, base_session: botocore.session.Session = None):
    base_session = base_session or boto3.session.Session()._session
    fetcher = botocore.credentials.AssumeRoleCredentialFetcher(
        client_creator=base_session.create_client,
        source_credentials=base_session.get_credentials(),
        role_arn=role_arn,
        extra_args={
            # 'RoleSessionName': None # set this if you want something non-default
        }
    )
    creds = botocore.credentials.DeferredRefreshableCredentials(
        method='assume-role',
        refresh_using=fetcher.fetch_credentials,
        time_fetcher=lambda: datetime.datetime.now(tzlocal())
    )
    botocore_session = botocore.session.Session()
    botocore_session._credentials = creds
    return boto3.Session(botocore_session=botocore_session)

# usage:
session = assumed_role_session('arn:aws:iam::ACCOUNTID:role/ROLE_NAME')
ec2 = session.client('ec2')  # ... etc.
The resulting session's credentials will be automatically refreshed when required, which is quite nice.
Note: my previous answer was outright wrong but I can't delete it, so I've replaced it with a better and working answer.
You can assume a role using an STS token, like:
class Boto3STSService(object):
    def __init__(self, arn):
        sess = Session(aws_access_key_id=ARN_ACCESS_KEY,
                       aws_secret_access_key=ARN_SECRET_KEY)
        sts_connection = sess.client('sts')
        assume_role_object = sts_connection.assume_role(
            RoleArn=arn, RoleSessionName=ARN_ROLE_SESSION_NAME,
            DurationSeconds=3600)
        self.credentials = assume_role_object['Credentials']
This will give you a temporary access key and secret key, with a session token. With these temporary credentials, you can access any service. For example, if you want to access ELB, you can use the code below:
self.tmp_credentials = Boto3STSService(arn).credentials

def get_boto3_session(self):
    tmp_access_key = self.tmp_credentials['AccessKeyId']
    tmp_secret_key = self.tmp_credentials['SecretAccessKey']
    security_token = self.tmp_credentials['SessionToken']
    boto3_session = Session(
        aws_access_key_id=tmp_access_key,
        aws_secret_access_key=tmp_secret_key, aws_session_token=security_token
    )
    return boto3_session

def get_elb_boto3_connection(self, region):
    sess = self.get_boto3_session()
    elb_conn = sess.client(service_name='elb', region_name=region)
    return elb_conn
With reference to the solution by @jarrad, which is not working as of Feb 2021, and as a solution that does not use STS explicitly, please see the following:
import boto3
import botocore.session
from botocore.credentials import AssumeRoleCredentialFetcher, DeferredRefreshableCredentials

def get_boto3_session(assume_role_arn=None):
    session = boto3.Session(aws_access_key_id="abc", aws_secret_access_key="def")
    if not assume_role_arn:
        return session
    fetcher = AssumeRoleCredentialFetcher(
        client_creator=_get_client_creator(session),
        source_credentials=session.get_credentials(),
        role_arn=assume_role_arn,
    )
    botocore_session = botocore.session.Session()
    botocore_session._credentials = DeferredRefreshableCredentials(
        method='assume-role',
        refresh_using=fetcher.fetch_credentials
    )
    return boto3.Session(botocore_session=botocore_session)

def _get_client_creator(session):
    def client_creator(service_name, **kwargs):
        return session.client(service_name, **kwargs)
    return client_creator

The function can be called as follows (the keyword argument matches the definition above):
ec2_client = get_boto3_session(assume_role_arn='my_role_arn').client('ec2', region_name='us-east-1')
If you want a functional implementation, this is what I settled on:
from typing import Optional

import boto3

def filter_none_values(kwargs: dict) -> dict:
    """Returns a new dictionary excluding items where value was None"""
    return {k: v for k, v in kwargs.items() if v is not None}

def assume_session(
    role_session_name: str,
    role_arn: str,
    duration_seconds: Optional[int] = None,
    region_name: Optional[str] = None,
) -> boto3.Session:
    """
    Returns a session with the given name and role.
    If not specified, duration will be set by AWS, probably at 1 hour.
    If not specified, region will be left unset.
    Region can be overridden by each client or resource spawned from this session.
    """
    assume_role_kwargs = filter_none_values(
        {
            "RoleSessionName": role_session_name,
            "RoleArn": role_arn,
            "DurationSeconds": duration_seconds,
        }
    )
    credentials = boto3.client("sts").assume_role(**assume_role_kwargs)["Credentials"]
    create_session_kwargs = filter_none_values(
        {
            "aws_access_key_id": credentials["AccessKeyId"],
            "aws_secret_access_key": credentials["SecretAccessKey"],
            "aws_session_token": credentials["SessionToken"],
            "region_name": region_name,
        }
    )
    return boto3.Session(**create_session_kwargs)

def main() -> None:
    session = assume_session(
        "MyCustomSessionName",
        "arn:aws:iam::XXXXXXXXXXXX:role/TheRoleIWantToAssume",
        region_name="us-east-1",
    )
    client = session.client(service_name="ec2")
    print(client.describe_key_pairs())
import json
import boto3

roleARN = 'arn:aws:iam::account-of-role-to-assume:role/name-of-role'
client = boto3.client('sts')
response = client.assume_role(RoleArn=roleARN,
                              RoleSessionName='RoleSessionName',
                              DurationSeconds=900)
dynamodb_client = boto3.client('dynamodb', region_name='us-east-1',
                               aws_access_key_id=response['Credentials']['AccessKeyId'],
                               aws_secret_access_key=response['Credentials']['SecretAccessKey'],
                               aws_session_token=response['Credentials']['SessionToken'])
response = dynamodb_client.get_item(
    Key={
        'key1': {
            'S': '1',
        },
        'key2': {
            'S': '2',
        },
    },
    TableName='TestTable')
print(response)
#!/usr/bin/env python3
import boto3

sts_client = boto3.client('sts')
assumed_role = sts_client.assume_role(RoleArn="arn:aws:iam::123456789012:role/example_role",
                                      RoleSessionName="AssumeRoleSession1",
                                      DurationSeconds=1800)
session = boto3.Session(
    aws_access_key_id=assumed_role['Credentials']['AccessKeyId'],
    aws_secret_access_key=assumed_role['Credentials']['SecretAccessKey'],
    aws_session_token=assumed_role['Credentials']['SessionToken'],
    region_name='us-west-1'
)

# now we make use of the role to retrieve a parameter from SSM
client = session.client('ssm')
response = client.get_parameter(
    Name='/this/is/a/path/parameter',
    WithDecryption=True
)
print(response)
Assuming that 1) the ~/.aws/config or ~/.aws/credentials file is populated with each of the roles that you wish to assume and that 2) the default role has AssumeRole defined in its IAM policy for each of those roles, then you can simply (in pseudo-code) do the following and not have to fuss with STS:
import boto3

# get all of the roles from the AWS config/credentials file using a config file parser
profiles = get_profiles()

for profile in profiles:
    # this is only used to fetch the available regions
    initial_session = boto3.Session(profile_name=profile)
    # get the regions
    regions = boto3.Session.get_available_regions('ec2')
    # cycle through the regions, setting up session, resource and client objects
    for region in regions:
        boto3_session = boto3.Session(profile_name=profile, region_name=region)
        boto3_resource = boto3_session.resource(service_name='s3', region_name=region)
        boto3_client = boto3_session.client(service_name='s3', region_name=region)
        [ do something interesting with your session/resource/client here ]
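get_profiles() is left undefined in the pseudo-code above; one possible sketch (an assumption about how the profiles are read, not part of the original answer) simply asks boto3 for the profiles it already knows about from the shared config/credentials files:
import boto3

def get_profiles():
    """Return the profile names boto3 finds in ~/.aws/config and ~/.aws/credentials."""
    return boto3.session.Session().available_profiles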
Credential Setup (boto3 - Shared Credentials File)
Assume Role Setup (AWS)
After a few days of searching, this is the simplest solution I have found. It is explained here but does not have a usage example.
import boto3

for profile in boto3.Session().available_profiles:
    boto3.DEFAULT_SESSION = boto3.session.Session(profile_name=profile)
    s3 = boto3.resource('s3')
    for bucket in s3.buckets.all():
        print(bucket)
This will switch the default profile you will be using. To not make the profile the default, just do not assign it to boto3.DEFAULT_SESSION, but instead do the following:
testing_profile = boto3.session.Session(profile_name='mainTesting')
s3 = testing_profile.resource('s3')
for bucket in s3.buckets.all():
    print(bucket)
It is important to note that the .aws credentials need to be set in a specific way:
[default]
aws_access_key_id = default_access_id
aws_secret_access_key = default_access_key
[main]
aws_access_key_id = main_profile_access_id
aws_secret_access_key = main_profile_access_key
[mainTesting]
source_profile = main
role_arn = Testing role arn
mfa_serial = mfa_arn_for_main_role
[mainProduction]
source_profile = main
role_arn = Production role arn
mfa_serial = mfa_arn_for_main_role
I don't know why, but the mfa_serial key has to be on the roles for this to work, instead of on the source account, which would make more sense.
Here's the code snippet I used
sts_client = boto3.client('sts')
assumed_role_object = sts_client.assume_role(
    RoleArn=<arn of the role to assume>,
    RoleSessionName="<role session name>"
)
print(assumed_role_object)
credentials = assumed_role_object['Credentials']
session = Session(
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken']
)
self.s3 = session.client('s3')

Failing to store data in a CSV file when scraping

I am trying to scrape a webpage, extract data, and then store all the data in a CSV file. Before adding the ScrapeCallback class and calling it, everything worked fine. However, after adding the new class, nothing except the headers gets stored in the CSV file. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html
class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()

def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html

def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())