Sorry, I am new in coding in Python, I would need to save a json file generated in a for loop as csv for each iteration of the loop.
I wrote a code that works fine to generate the first csv file but then it is overwritten and I did not find a solution yet. Can anyone help me? many thanks
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop, and is incremented in each iteration. This will produce a list of files like output_file_1.csv, output_file_2.csv, output_file_3.csv and so on.
counter = 0
for user in user_objects:
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv('output_file_' + str(counter) + '.csv')
counter += 1
We convert the integer to a string, and paste it inbetween the name of your file and its extension.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for idx, user in enumerate(user_objects):
following = get_data(t.following(user['id']))
# Do something with the lists
print(f"User: {user['username']} Follows {len(following)} -2")
json_string = json.dumps(following)
df = pd.read_json(json_string)
df.to_csv(f'output_file{str(idx)}.csv')
I'm trying to split the lines of a column in the CSV file, then I want to speed up the process I'm going to do with the help of thread, but I don't know how to do the splitting.
import csv
import pandas as pd
import threading
df = pd.read_csv("clearData.csv")
a = df["Product"].head(50)
def halfSplit():
for i in a:
for k in a:
result = simTest(i,k)
print(result,i,k)
def otherHalfSplit():
for i in a:
for k in a:
result = simTest(i,k)
print(result,i,k)
if __name__ =="__main__":
t1 = threading.Thread(target=halfSplit, args=(10,))
t2 = threading.Thread(target=otherHalfSplit, args=(10,))
t1.start()
t2.start()
t1.join()
t2.join()
I am trying to write data into pymongo and this the TypeError that I am getting. The Type for mydict1 is List. Do I have to convert my data into json or bson before I write it to pymongo? Kindly help.
Thanks.
from numpy.polynomial import Polynomial as poly
import numpy as np
import matplotlib.pyplot as plt
import pymongo
import json
import pandas as pd
df = pd.read_csv(r'D:\polynomial\points.csv')
print(df)
x= np.array(df['Wavelength(A)'].tolist())
x= np.divide([299792.458], x)
y= np.array(df['Level(A)'].tolist())
x_trimmed = np.delete(x, np.where(y < 1e-4))
y_trimmed = np.delete(y, np.where(y < 1e-4))
test= poly.fit(x_trimmed, y_trimmed, 10)
print (test)
list1= test.convert().coef
print (list1)
print (len(list1))
#print (type(list1))
to_list= list1.tolist()
#print(to_list)
#data_format= json.dumps(to_list)
l = len(to_list)
#print (l)
mydict1= []
for i in range(l):
mydict = { "a"+str(i) : to_list[i] }
mydict1.append(mydict)
print (mydict1)
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["mydatabase"]
mycol = mydb["coefficients"]
x = mycol.insert_one(mydict1)
This is mydict1=
[{'a0': -2.3373800910827825e+34}, {'a1': 1.2084654060419298e+33}, {'a2': -2.811587585787653e+31}, {'a3': 3.876370042231405e+29}, {'a4': -3.507261557232249e+27}, {'a5': 2.1759768836934694e+25}, {'a6': -9.37514311649608e+22}, {'a7': 2.7697765301392782e+20}, {'a8': -5.370081422614614e+17}, {'a9': 616983041924503.2}, {'a10': -318990754999.1472}]
The problem is that MongoDB's insert_one method inserts a single document that is represented by a dictionary, not a list.
The possible solutions are:
use insert_many instead. In this case, you will have every list item as a separate mongodb document
make a dict with your list values. You can use something like {"items": mydict1}, or reduce(lambda x, y: x | y, mydict1) depending on the document structure that will be better for your needs
I know I can download a csv file from a web page by doing:
import pandas as pd
import numpy as np
from io import StringIO
URL = "http://www.something.com"
data = pd.read_html(URL)[0].to_csv(index=False, header=True)
file = pd.read_csv(StringIO(data), sep=',')
Now I would like to do the above for more URLs at the same time, like when you open different tabs in your browser. In other words, a way to parallelize this when you have different URLs, instead of looping through or doing it one at a time. So, I thought of having a series of URLs inside a dataframe, and then create a new column which contains the strings 'data', one for each URL.
list_URL = ["http://www.something.com", "http://www.something2.com",
"http://www.something3.com"]
df = pd.DataFrame(list_URL, columns =['URL'])
df['data'] = pd.read_html(df['URL'])[0].to_csv(index=False, header=True)
But it gives me error: cannot parse from 'Series'
Is there a better syntax, or does this mean I cannot do this in parallel for more than one URL?
You could try like this:
import pandas as pd
URLS = [
"https://en.wikipedia.org/wiki/Periodic_table#Presentation_forms",
"https://en.wikipedia.org/wiki/Planet#Planetary_attributes",
]
df = pd.DataFrame(URLS, columns=["URL"])
df["data"] = df["URL"].map(
lambda x: pd.read_html(x)[0].to_csv(index=False, header=True)
)
print(df)
# Output
URL data
0 https://en.wikipedia.org/wiki/Periodic_t... 0\r\nPart of a series on the\r\nPeriodic...
1 https://en.wikipedia.org/wiki/Planet#Pla... 0\r\n"The eight known planets of the Sol...
I would like create a daily candlestick plot from data i downloaded from yahoo using pandas. I'm having trouble figuring out how to use the candlestick matplotlib function in this context.
Here is the code:
#The following example, downloads stock data from Yahoo and plots it.
from pandas.io.data import get_data_yahoo
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots, draw
from matplotlib.finance import candlestick
symbol = "GOOG"
data = get_data_yahoo(symbol, start = '2013-9-01', end = '2013-10-23')[['Open','Close','High','Low','Volume']]
ax = subplots()
candlestick(ax,data['Open'],data['High'],data['Low'],data['Close'])
Thanks
Andrew.
Using bokeh:
import io
from math import pi
import pandas as pd
from bokeh.plotting import figure, show, output_file
df = pd.read_csv(
io.BytesIO(
b'''Date,Open,High,Low,Close
2016-06-01,69.6,70.2,69.44,69.76
2016-06-02,70.0,70.15,69.45,69.54
2016-06-03,69.51,70.48,68.62,68.91
2016-06-04,69.51,70.48,68.62,68.91
2016-06-05,69.51,70.48,68.62,68.91
2016-06-06,70.49,71.44,69.84,70.11
2016-06-07,70.11,70.11,68.0,68.35'''
)
)
df["Date"] = pd.to_datetime(df["Date"])
inc = df.Close > df.Open
dec = df.Open > df.Close
w = 12*60*60*1000
TOOLS = "pan,wheel_zoom,box_zoom,reset,save"
p = figure(x_axis_type="datetime", tools=TOOLS, plot_width=1000, title
= "Candlestick")
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3
p.segment(df.Date, df.High, df.Date, df.Low, color="black")
p.vbar(df.Date[inc], w, df.Open[inc], df.Close[inc], fill_color="#D5E1DD", line_color="black")
p.vbar(df.Date[dec], w, df.Open[dec], df.Close[dec], fill_color="#F2583E", line_color="black")
output_file("candlestick.html", title="candlestick.py example")
show(p)
Code above forked from here:
http://docs.bokeh.org/en/latest/docs/gallery/candlestick.html
I have no reputation to comment #randall-goodwin answer, but for pandas 0.16.2 line:
# convert the datetime64 column in the dataframe to 'float days'
data.Date = mdates.date2num(data.Date)
must be:
data.Date = mdates.date2num(data.Date.dt.to_pydatetime())
because matplotlib does not support the numpy datetime64 dtype
I stumbled across a great pastebin entry: http://pastebin.com/ne7Fjdiq that does this well. I too was having trouble getting the calling syntax right. It usually revolves around transforming your data in simple ways to get the function to work right. My issue was with the datetime. There must be something in my format data. Once I replaced the Date series with range(maxdata) then it worked.
data = pandas.read_csv('data.csv', parse_dates={'Timestamp': ['Date', 'Time']}, index_col='Timestamp')
ticks = data.ix[:, ['Price', 'Volume']]
bars = ticks.Price.resample('1min', how='ohlc')
barsa = bars.fillna(method='ffill')
fig = plt.figure()
fig.subplots_adjust(bottom=0.1)
ax = fig.add_subplot(111)
plt.title("Candlestick chart")
volume = ticks.Volume.resample('1min', how='sum')
value = ticks.prod(axis=1).resample('1min', how='sum')
vwap = value / volume
Date = range(len(barsa))
#Date = matplotlib.dates.date2num(barsa.index)#
DOCHLV = zip(Date , barsa.open, barsa.close, barsa.high, barsa.low, volume)
matplotlib.finance.candlestick(ax, DOCHLV, width=0.6, colorup='g', colordown='r', alpha=1.0)
plt.show()
Here is the solution:
from pandas.io.data import get_data_yahoo
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
from matplotlib import ticker as mticker
from matplotlib.finance import candlestick_ohlc
import datetime as dt
symbol = "GOOG"
data = get_data_yahoo(symbol, start = '2014-9-01', end = '2015-10-23')
data.reset_index(inplace=True)
data['Date']=mdates.date2num(data['Date'].astype(dt.date))
fig = plt.figure()
ax1 = plt.subplot2grid((1,1),(0,0))
plt.ylabel('Price')
ax1.xaxis.set_major_locator(mticker.MaxNLocator(6))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
candlestick_ohlc(ax1,data.values,width=0.2)
Found this question when I too was looking how to use candlestick with a pandas dataframe returned from one of the DataReader services like get_data_yahoo. I eventually figured it out. One of the keys was this other question, answered by Wes McKinney and RJRyV. Here is that link:
Pandas convert dataframe to array of tuples
The key was to read the candlestick.py function definition to determine how it expected to receive the data. The date needed to be converted first, then the entire dataframe needed to be converted to an array of tuples.
Here is the final code that worked for me. Maybe there is some other Candlestick chart out there somewhere that works directly on a pandas dataframe returned from one of the stock quote services. That would be very nice.
# Imports
from pandas.io.data import get_data_yahoo
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from matplotlib.pyplot import subplots, draw
from matplotlib.finance import candlestick
import matplotlib.pyplot as plt
# get the data on a symbol (gets last 1 year)
symbol = "TSLA"
data = get_data_yahoo(symbol, datetime.now() - timedelta(days=365))
# drop the date index from the dateframe
data.reset_index(inplace = True)
# convert the datetime64 column in the dataframe to 'float days'
data.Date = mdates.date2num(data.Date)
# make an array of tuples in the specific order needed
dataAr = [tuple(x) for x in data[['Date', 'Open', 'Close', 'High', 'Low']].to_records(index=False)]
# construct and show the plot
fig = plt.figure()
ax1 = plt.subplot(1,1,1)
candlestick(ax1, dataAr)
plt.show()