Pandas parallel URL downloads with pd.read_html - html

I know I can download a csv file from a web page by doing:
import pandas as pd
import numpy as np
from io import StringIO
URL = "http://www.something.com"
data = pd.read_html(URL)[0].to_csv(index=False, header=True)
file = pd.read_csv(StringIO(data), sep=',')
Now I would like to do the above for several URLs at the same time, the way a browser loads multiple tabs at once. In other words, I want a way to parallelize this across different URLs instead of looping through them one at a time. So I thought of putting the URLs in a DataFrame and then creating a new column that holds the 'data' string for each URL.
list_URL = ["http://www.something.com", "http://www.something2.com",
"http://www.something3.com"]
df = pd.DataFrame(list_URL, columns =['URL'])
df['data'] = pd.read_html(df['URL'])[0].to_csv(index=False, header=True)
But it gives me an error: cannot parse from 'Series'.
Is there a better syntax, or does this mean I cannot do this in parallel for more than one URL?

You could try it like this:
import pandas as pd
URLS = [
"https://en.wikipedia.org/wiki/Periodic_table#Presentation_forms",
"https://en.wikipedia.org/wiki/Planet#Planetary_attributes",
]
df = pd.DataFrame(URLS, columns=["URL"])
df["data"] = df["URL"].map(
    lambda x: pd.read_html(x)[0].to_csv(index=False, header=True)
)
print(df)
# Output
URL data
0 https://en.wikipedia.org/wiki/Periodic_t... 0\r\nPart of a series on the\r\nPeriodic...
1 https://en.wikipedia.org/wiki/Planet#Pla... 0\r\n"The eight known planets of the Sol...
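Note that map() above still fetches the pages one after another; it only cleans up the syntax. If you want the downloads to actually overlap, one option (my own suggestion, pandas itself won't do this for you) is to hand the per-URL work to a thread pool, since the time is dominated by network I/O:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

URLS = [
    "https://en.wikipedia.org/wiki/Periodic_table#Presentation_forms",
    "https://en.wikipedia.org/wiki/Planet#Planetary_attributes",
]

def first_table_as_csv(url):
    # same per-URL work as the lambda above
    return pd.read_html(url)[0].to_csv(index=False, header=True)

df = pd.DataFrame(URLS, columns=["URL"])
with ThreadPoolExecutor(max_workers=4) as pool:
    # pool.map preserves the input order, so the results line up with df["URL"]
    df["data"] = list(pool.map(first_table_as_csv, df["URL"]))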

Related

How can I save some json files generated in a for loop as csv?

Sorry, I am new to coding in Python. I need to save the JSON generated in each iteration of a for loop as its own CSV file.
I wrote code that generates the first CSV file fine, but it is then overwritten on every iteration and I have not found a solution yet. Can anyone help me? Many thanks.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop, and is incremented in each iteration. This will produce a list of files like output_file_1.csv, output_file_2.csv, output_file_3.csv and so on.
counter = 0
for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file_' + str(counter) + '.csv')
    counter += 1
We convert the integer to a string and paste it in between the name of your file and its extension. Alternatively, you can let enumerate keep the counter for you; the full script then looks like this:
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np

# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")

# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt", dtype="str")

# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for idx, user in enumerate(user_objects):
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv(f'output_file{str(idx)}.csv')
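If you would rather go the GUID route mentioned above instead of a counter (just a sketch on my part; the filenames will no longer be sequential), the uuid module from the standard library does the job:
import uuid

for user in user_objects:
    following = get_data(t.following(user['id']))
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    # uuid4().hex gives a practically unique 32-character name per iteration
    df.to_csv(f'output_file_{uuid.uuid4().hex}.csv')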

How to merge multiple JSON files reading from S3, convert to single .csv and store in S3?

Input:
There are 5 part JSON files named test_part1.json, test_part2.json, test_part3.json, test_part4.json, test_part5.json in s3://test/json_files/data/.
Expected output:
A single CSV file.
Explanation: All of the JSON files have the same columns and the same structure; they are basically part files of the same source.
I want to merge/repartition all of them, convert them into a CSV file, and store it in S3.
import pandas as pd
import os
import boto3
import numpy
# Boto3 clients
resource = boto3.resource('s3')
client = boto3.client('s3')
session = boto3.session.Session()
bucket = 'test'
path = 'json_files/data/'
delimiter = '/'
suffix = '.json'
json_files = client.list_objects(Bucket=bucket, Prefix=path, Delimiter=delimiter)
#print(json_files)
for obj in json_files['Contents']:
    #print(obj)
    obj = client.get_object(Bucket=bucket, Key=obj['Key'])
    #print(obj)
    df = pd.read_json(obj["Body"], lines=True)
    print(df)
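To actually produce the single CSV and write it back to S3, one approach (a sketch on my part, continuing from the variables defined above; the output key is a placeholder) is to collect each part into a list, concatenate once, and upload the CSV text with put_object:
from io import StringIO

frames = []
for obj in json_files['Contents']:
    if not obj['Key'].endswith(suffix):
        continue  # skip anything that is not one of the part files
    resp = client.get_object(Bucket=bucket, Key=obj['Key'])
    frames.append(pd.read_json(resp["Body"], lines=True))

# one DataFrame holding all of the parts
merged = pd.concat(frames, ignore_index=True)

# write the CSV into an in-memory buffer and upload it back to S3
csv_buffer = StringIO()
merged.to_csv(csv_buffer, index=False)
client.put_object(Bucket=bucket, Key='json_files/output/merged.csv',
                  Body=csv_buffer.getvalue())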

Export JSON to CSV using Python

I wrote some code to extract information from a website. The output is in JSON, and I want to export it to CSV, so I tried to convert it to a pandas DataFrame and then export that to CSV with pandas. I can print the results, but it still doesn't convert the output into a pandas DataFrame. Do you know what the problem with my code is?
# -*- coding: utf-8 -*-
# To create http request/session
import requests
import re, urllib
import pandas as pd
from BeautifulSoup import BeautifulSoup
url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"
# create session
s = requests.session()
html = s.get(url).text
# exctract job IDs
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.quote(job_ids)
# do Ajax request and convert the response to json
ajax_content = s.get(ajax_url).json()
print(ajax_content)
#Convert to pandas dataframe
df = pd.read_json(ajax_content)
#Export to CSV
df.to_csv("c:\\users\\Name\\desktop\\newcsv.csv")
The error message is:
Traceback (most recent call last):
File "C:\Users\Mehrdad\Desktop\Indeed 06.py", line 21, in
df = pd.read_json(ajax_content)
File "c:\python27\lib\site-packages\pandas\io\json\json.py", line 408, in read_json
path_or_buf, encoding=encoding, compression=compression,
File "c:\python27\lib\site-packages\pandas\io\common.py", line 218, in get_filepath_or_buffer
raise ValueError(msg.format(_type=type(filepath_or_buffer)))
ValueError: Invalid file path or buffer object type:
The problem was that nothing was going into the dataframe when you called read_json() because it was a nested JSON dict:
import requests
import re, urllib
import pandas as pd
from pandas.io.json import json_normalize
url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"
s = requests.session()
html = s.get(url).text
job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.quote(job_ids)
ajax_content= s.get(ajax_url).json()
df = json_normalize(ajax_content).transpose()
df.to_csv('your_output_file.csv')
Note that I called json_normalize() to collapse the nested columns from the JSON. I also called transpose() so that the rows were labelled with the job ID rather than columns. This will give you a dataframe that looks like this:
0079ccae458b4dcf <p><b>Company Environment: </b></p><p>Planet F...
0c1ab61fe31a5c62 <p><b>Commercial Construction Project Manager<...
0feac44386ddcf99 <div><div>Trendmaker Homes is currently seekin...
...
It's not really clear what your expected output is, though: what are you expecting the DataFrame/CSV file to look like? If you were actually looking for just a single row/Series with the job IDs as column labels, just remove the call to transpose().
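As a side note for anyone on Python 3 with a recent pandas (this is my adaptation, not part of the original answer): urllib.quote now lives at urllib.parse.quote and json_normalize is exposed as pd.json_normalize, so the same approach becomes:
import re
import urllib.parse

import pandas as pd
import requests

url = "https://www.indeed.com/jobs?q=construction%20manager&l=Houston&start=10"
s = requests.session()
html = s.get(url).text

job_ids = ','.join(re.findall(r"jobKeysWithInfo\['(.+?)'\]", html))
# urllib.quote was moved to urllib.parse.quote in Python 3
ajax_url = 'https://www.indeed.com/rpc/jobdescs?jks=' + urllib.parse.quote(job_ids)
ajax_content = s.get(ajax_url).json()

# json_normalize moved to the top-level pandas namespace in pandas 1.0
df = pd.json_normalize(ajax_content).transpose()
df.to_csv('your_output_file.csv')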

Reading a big JSON file with multiple objects in Python

I have a big GZ compressed JSON file where each line is a JSON object (i.e. a python dictionary).
Here is an example of the first two lines:
{"ID_CLIENTE":"o+AKj6GUgHxcFuaRk6/GSvzEWRYPXDLjtJDI79c7ccE=","ORIGEN":"oaDdZDrQCwqvi1YhNkjIJulA8C0a4mMZ7ESVlEWGwAs=","DESTINO":"OOcb8QTlctDfYOwjBI02hUJ1o3Bro/ir6IsmZRigja0=","PRECIO":0.0023907284768211919,"RESERVA":"2015-05-20","SALIDA":"2015-07-26","LLEGADA":"2015-07-27","DISTANCIA":0.48962542317352847,"EDAD":"19","sexo":"F"}{"ID_CLIENTE":"WHDhaR12zCTCVnNC/sLYmN3PPR3+f3ViaqkCt6NC3mI=","ORIGEN":"gwhY9rjoMzkD3wObU5Ito98WDN/9AN5Xd5DZDFeTgZw=","DESTINO":"OOcb8QTlctDfYOwjBI02hUJ1o3Bro/ir6IsmZRigja0=","PRECIO":0.001103046357615894,"RESERVA":"2015-04-08","SALIDA":"2015-07-24","LLEGADA":"2015-07-24","DISTANCIA":0.21382548869717155,"EDAD":"13","sexo":"M"}
So, I'm using the following code to read each line into a Pandas DataFrame:
import json
import gzip
import pandas as pd
import random
with gzip.GzipFile('data/000000000000.json.gz', 'r',) as fin:
    data_lan = pd.DataFrame()
    for line in fin:
        data_lan = pd.DataFrame([json.loads(line.decode('utf-8'))]).append(data_lan)
But it's taking years.
Any suggestion to read the data quicker?
EDIT:
Finally what solved the problem:
import json
import gzip
import pandas as pd
with gzip.GzipFile('data/000000000000.json.gz', 'r',) as fin:
    data_lan = []
    for line in fin:
        data_lan.append(json.loads(line.decode('utf-8')))
    data = pd.DataFrame(data_lan)
I've worked on a similar problem myself. append() is quite slow, so I generally use a list of dicts to load the JSON file and then create the DataFrame in one go. That way you keep the flexibility that lists give you, and you only convert to a DataFrame once you're sure about the data in the list. Below is an implementation of the concept:
import pandas as pd
import gzip
import json


def get_contents_from_json(file_path) -> dict:
    """
    Reads the contents of the json file into a dict
    :param file_path:
    :return: A dictionary of all contents in the file.
    """
    try:
        with gzip.open(file_path) as file:
            contents = file.read()
        return json.loads(contents.decode('UTF-8'))
    except json.JSONDecodeError:
        print('Error while reading json file')
    except FileNotFoundError:
        print(f'The JSON file was not found at the given path: \n{file_path}')


def main(file_path: str):
    file_contents = get_contents_from_json(file_path)
    if not isinstance(file_contents, list):
        # I've assumed you have a JSON Array in your file
        # if not, let me know in the comments
        raise TypeError("The file doesn't have a JSON Array!!!")
    all_columns = file_contents[0].keys()
    data_frame = pd.DataFrame(columns=all_columns, data=file_contents)
    print(f'Loaded {int(data_frame.size / len(all_columns))} Rows', 'Done!', sep='\n')


if __name__ == '__main__':
    main(r'C:\Users\carrot\Desktop\dummyData.json.gz')
A pandas DataFrame fits into a contiguous block of memory which means that pandas needs to know the size of the data set when the frame is created. Since append changes the size, new memory must be allocated and the original plus new data sets are copied in. As your set grows, the copy gets bigger and bigger.
You can use from_records to avoid this problem. First, you need to know the row count, which means scanning the file once. You could cache that number if you do this often, but it's a relatively fast operation. Now that you have the size, pandas can allocate the memory efficiently.
# count rows
with gzip.GzipFile(file_to_test, 'r',) as fin:
    row_count = sum(1 for _ in fin)

# build dataframe from records, parsing each line into a dict so that
# from_records sees real records rather than raw byte strings
with gzip.GzipFile(file_to_test, 'r',) as fin:
    data_lan = pd.DataFrame.from_records(
        (json.loads(line) for line in fin), nrows=row_count)
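As a further option (my own addition, not part of either answer above), recent pandas versions can read gzipped JSON Lines files directly, which skips the manual loop entirely and can also stream the file in chunks:
import pandas as pd

# read_json understands JSON Lines and gzip compression natively
data = pd.read_json('data/000000000000.json.gz', lines=True, compression='gzip')

# for files that don't fit comfortably in memory, read in chunks and concatenate
reader = pd.read_json('data/000000000000.json.gz', lines=True,
                      compression='gzip', chunksize=100_000)
data = pd.concat(reader, ignore_index=True)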

Embed matplotlib figure into iPython HTML

I want to dynamically write and display HTML from a code cell in a Jupyter Notebook. The objective is to generate HTML that displays table, div, and img tags in whatever arrangement I choose. I want to capture img data and place it where I want in this auto-generated HTML.
So far I've figured out that I can do the following:
from IPython.core.display import HTML
HTML("<h1>Hello</h1>")
and get:
Hello
That's great. However, I want to be able to do this:
HTML("<h1>Hello</h1><hr/><img src='somestring'/>")
and get something similar to a Hello with a horizontal line and an image below it, where the image is the same one as below.
import pandas as pd
import numpy as np
np.random.seed(314)
df = pd.DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
df.plot.scatter(0, 1)
The result should look like this:
Question
What do I replace 'somestring' with in order to implement this? And more to the point, how do I get it via Python?
I would have imagined there was an attribute on a figure object that would hold an serialized version of the image but I can't find it.
After some digging around, I found a solution. Credit to Dmitry B. for pointing me in the right direction.
Solution
from IPython.core.display import HTML
import binascii
from StringIO import StringIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# open IO object
sio = StringIO()

# generate random DataFrame
np.random.seed(314)
df = pd.DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])

# initialize figure and axis
fig, ax = plt.subplots(1, 1)

# plot DataFrame
ax.scatter(df.iloc[:, 0], df.iloc[:, 1]);

# print raw canvas data to IO object
fig.canvas.print_png(sio)

# convert raw binary data to base64
# I use this to embed in an img tag
img_data = binascii.b2a_base64(sio.getvalue())

# keep img tag outer html in its own variable
img_html = '<img src="data:image/png;base64,{}">'.format(img_data)

HTML("<h1>Hello</h1><hr/>"+img_html)
I end up with:
from IPython.core.display import Image
import io
import matplotlib.pyplot as plt

s = io.BytesIO()
# make your figure here
plt.savefig(s, format='png', bbox_inches="tight")
plt.close()
Image(s.getvalue())
Let's say you have base64-encoded image data:
img_data = "iVBORw0KGgoAAAANSUhEUgAAAIAAAACACAYAAADDPmHLAAAb2ElEQVR42u1dB3wU5bY..."  # long base64 PNG string, truncated here; any base64-encoded PNG works
Then, to have it rendered inside an iPython cell, you simply do:
from IPython.core.display import Image
Image(data=img_data)
I'm going to build on what was answered by others (piRSquared) because it didn't work for me with Jupyter and Python 3. I wrote the following function, which takes any plot function I define, calls it, and captures the output without displaying it in Jupyter. I personally use this to build custom HTML machine learning reports based on the many model iterations I execute using Livy and Spark.
from IPython.core.display import HTML
from IPython.display import display_html
import binascii
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import base64

# pandas_input_pdf is assumed to be an existing pandas DataFrame
# with the 'predicted', 'actual' and 'DAYDATECOLUMN' columns used below

def capturePlotHTML(plotFunction):
    # open IO object
    sio3 = BytesIO()
    plotFunction()
    plt.savefig(sio3)
    sio3.seek(0)
    data_uri = base64.b64encode(sio3.read()).decode('ascii')
    html_out = '<html><head></head><body>'
    html_out += '<img src="data:image/png;base64,{0}" align="left">'.format(data_uri)
    html_out += '</body></html>'
    # prevents plot from showing in output
    plt.close()
    return HTML(html_out)

# Plot Wrappers
# Advanced wrapper for more complex visualizations (seaborn, etc)
class plotRegline:
    def __init__(self):
        # could also pass in a name as an arg: def __init__(self, name)
        reg_line_prepped_pdf = pandas_input_pdf
        sns.lmplot(x='predicted', y='actual', data=reg_line_prepped_pdf,
                   fit_reg=True, height=3, aspect=2).fig.suptitle("Regression Line")

# Basic wrapper for simple matplotlib visualizations
def plotTsPred():
    ts_plot_prepped_pdf = pandas_input_pdf
    ts_plot_prepped_pdf.index = pd.to_datetime(ts_plot_prepped_pdf.DAYDATECOLUMN)
    ts_plot_prepped_pdf = ts_plot_prepped_pdf.drop(columns=["DAYDATECOLUMN"])
    ts_plot_prepped_pdf.plot(title="Predicted Vs Actual -- Timeseries Plot -- Days", figsize=(25, 6))

# building the plots and capturing the outputs
regline_html = capturePlotHTML(plotRegline)
ts_plot_day_html = capturePlotHTML(plotTsPred)

# could be any number of html objects
html_plots = [regline_html, ts_plot_day_html]
combined_html_plots = display_html(*html_plots)
# the following can be run in this code block or in another one to display the results
combined_html_plots
The answer by piRSquared no longer works with Python 3. I had to change it to:
from IPython.core.display import HTML
import binascii
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# open IO object
bio = BytesIO()
# generate random DataFrame
np.random.seed(314)
df = pd.DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
# initialize figure and axis
fig, ax = plt.subplots(1, 1);
# plot DataFrame
ax.scatter(df.iloc[:, 0], df.iloc[:, 1]);
# print raw canvas data to IO object
fig.canvas.print_png(bio)
plt.close(fig)
# convert raw binary data to base64
# I use this to embed in an img tag
img_data = binascii.b2a_base64(bio.getvalue()).decode()
# keep img tag outer html in its own variable
img_html = '<img src="data:image/png;base64,{}">'.format(img_data)
HTML("<h1>Hello</h1><hr/>"+img_html)
Specifically, I import from io, not StringIO, and I use BytesIO rather than StringIO. I needed to decode the bytes into a string for inserting into the HTML. I also added the required imports of numpy and pandas for the example plot to work, and added plt.close(fig) so that you don't end up with two figures in the output.
If you want to show the results of DataFrame.plot in an iPython cell, try this:
import pandas as pd
import numpy as np
%matplotlib inline
np.random.seed(314)
df = pd.DataFrame(np.random.randn(1000, 2), columns=['x', 'y'])
df.plot.scatter(0, 1)