I have the code below, which reads data from a JSON file into a pandas DataFrame. Some of the columns, like "attributes", still wind up with dicts in them. I'd like them to become columns like "attributes.GoodForMeal.Dessert", similar to what R's flatten function does.
Can anyone suggest a way to do this in Python?
Code:
df_business = pd.read_json('dataset/business.json', lines=True)
print(df_business[1:3])
Data:
address attributes \
1 2824 Milton Rd {u'GoodForMeal': {u'dessert': False, u'latenig...
2 337 Danforth Avenue {u'BusinessParking': {u'garage': False, u'stre...
business_id categories \
1 mLwM-h2YhXl2NCgdS84_Bw [Food, Soul Food, Convenience Stores, Restaura...
2 v2WhjAB3PIBA8J8VxG3wEg [Food, Coffee & Tea]
city hours is_open \
1 Charlotte {u'Monday': u'10:00-22:00', u'Tuesday': u'10:0... 0
2 Toronto {u'Monday': u'10:00-19:00', u'Tuesday': u'10:0... 0
latitude longitude name neighborhood \
1 35.236870 -80.741976 South Florida Style Chicken & Ribs Eastland
2 43.677126 -79.353285 The Tea Emporium Riverdale
postal_code review_count stars state
1 28215 4 4.5 NC
2 M4K 1N7 7 4.5 ON
Update:
from pandas.io.json import json_normalize
print json_normalize('dataset/business.json')
Error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-12-bb0ce59acb26> in <module>()
1 from pandas.io.json import json_normalize
----> 2 print json_normalize('dataset/business.json')
/Users/anaconda/lib/python2.7/site-packages/pandas/io/json.pyc in json_normalize(data, record_path, meta, meta_prefix, record_prefix)
791
792 if record_path is None:
--> 793 if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
794 # naive normalization, this is idempotent for flat records
795 # and potentially will inflate the data considerably for
/Users/anaconda/lib/python2.7/site-packages/pandas/compat/__init__.pyc in itervalues(obj, **kw)
169
170 def itervalues(obj, **kw):
--> 171 return obj.itervalues(**kw)
172
173 next = lambda it : it.next()
AttributeError: 'str' object has no attribute 'itervalues'
Update2:
Code:
import json;
json_normalize(json.load('dataset/business.json'))
Error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-20-4fb4bf64efc6> in <module>()
1 import json;
----> 2 json_normalize(json.load('dataset/business.json'))
/Users/anaconda/lib/python2.7/json/__init__.pyc in load(fp, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
285
286 """
--> 287 return loads(fp.read(),
288 encoding=encoding, cls=cls, object_hook=object_hook,
289 parse_float=parse_float, parse_int=parse_int,
AttributeError: 'str' object has no attribute 'read'
Update3:
Code:
with open('dataset/business.json') as f:
df = json_normalize(json.load(f))
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-e3449614f320> in <module>()
1 with open('dataset/business.json') as f:
----> 2 df = json_normalize(json.load(f))
/Users/anaconda/lib/python2.7/json/__init__.pyc in load(fp, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
289 parse_float=parse_float, parse_int=parse_int,
290 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook,
--> 291 **kw)
292
293
/Users/anaconda/lib/python2.7/json/__init__.pyc in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
337 parse_int is None and parse_float is None and
338 parse_constant is None and object_pairs_hook is None and not kw):
--> 339 return _default_decoder.decode(s)
340 if cls is None:
341 cls = JSONDecoder
/Users/anaconda/lib/python2.7/json/decoder.pyc in decode(self, s, _w)
365 end = _w(s, end).end()
366 if end != len(s):
--> 367 raise ValueError(errmsg("Extra data", s, end, len(s)))
368 return obj
369
ValueError: Extra data: line 2 column 1 - line 156640 column 1 (char 731 - 132272455)
Update4:
Code:
with open('dataset/business.json') as f:
reviews = f.read().strip().split("\n")
reviews = [json.loads(review) for review in reviews]
reviews[1:5]
Sample Data:
[{u'address': u'2824 Milton Rd',
u'attributes': {u'Ambience': {u'casual': False,
u'classy': False,
u'divey': False,
u'hipster': False,
u'intimate': False,
u'romantic': False,
u'touristy': False,
u'trendy': False,
u'upscale': False},
u'BusinessAcceptsCreditCards': False,
u'GoodForKids': True,
u'GoodForMeal': {u'breakfast': False,
u'brunch': False,
u'dessert': False,
u'dinner': False,
u'latenight': False,
u'lunch': False},
u'HasTV': False,
u'NoiseLevel': u'average',
u'OutdoorSeating': False,
u'RestaurantsAttire': u'casual',
u'RestaurantsDelivery': True,
u'RestaurantsGoodForGroups': True,
u'RestaurantsPriceRange2': 2,
u'RestaurantsReservations': False,
u'RestaurantsTakeOut': True},
u'business_id': u'mLwM-h2YhXl2NCgdS84_Bw',
u'categories': [u'Food',
u'Soul Food',
u'Convenience Stores',
u'Restaurants'],
u'city': u'Charlotte',
u'hours': {u'Friday': u'10:00-22:00',
u'Monday': u'10:00-22:00',
u'Saturday': u'10:00-22:00',
u'Sunday': u'10:00-22:00',
u'Thursday': u'10:00-22:00',
u'Tuesday': u'10:00-22:00',
u'Wednesday': u'10:00-22:00'},
u'is_open': 0,
u'latitude': 35.23687,
u'longitude': -80.7419759,
u'name': u'South Florida Style Chicken & Ribs',
u'neighborhood': u'Eastland',
u'postal_code': u'28215',
u'review_count': 4,
u'stars': 4.5,
u'state': u'NC'},
{u'address': u'337 Danforth Avenue',
u'attributes': {u'BikeParking': True,
u'BusinessAcceptsCreditCards': True,
u'BusinessParking': {u'garage': False,
u'lot': False,
u'street': True,
u'valet': False,
u'validated': False},
u'OutdoorSeating': False,
u'RestaurantsPriceRange2': 2,
u'WheelchairAccessible': True,
u'WiFi': u'no'},
u'business_id': u'v2WhjAB3PIBA8J8VxG3wEg',
u'categories': [u'Food', u'Coffee & Tea'],
u'city': u'Toronto',
u'hours': {u'Friday': u'10:00-19:00',
u'Monday': u'10:00-19:00',
u'Saturday': u'10:00-18:00',
u'Sunday': u'12:00-17:00',
u'Thursday': u'10:00-19:00',
u'Tuesday': u'10:00-19:00',
u'Wednesday': u'10:00-19:00'},
u'is_open': 0,
u'latitude': 43.6771258,
u'longitude': -79.3532848,
u'name': u'The Tea Emporium',
u'neighborhood': u'Riverdale',
u'postal_code': u'M4K 1N7',
u'review_count': 7,
u'stars': 4.5,
u'state': u'ON'},
{u'address': u'7702 E Doubletree Ranch Rd, Ste 300',
u'attributes': {},
u'business_id': u'CVtCbSB1zUcUWg-9TNGTuQ',
u'categories': [u'Professional Services', u'Matchmakers'],
u'city': u'Scottsdale',
u'hours': {u'Friday': u'9:00-17:00',
u'Monday': u'9:00-17:00',
u'Thursday': u'9:00-17:00',
u'Tuesday': u'9:00-17:00',
u'Wednesday': u'9:00-17:00'},
u'is_open': 1,
u'latitude': 33.5650816,
u'longitude': -111.9164003,
u'name': u'TRUmatch',
u'neighborhood': u'',
u'postal_code': u'85258',
u'review_count': 3,
u'stars': 3.0,
u'state': u'AZ'},
{u'address': u'4719 N 20Th St',
u'attributes': {u'Alcohol': u'none',
u'Ambience': {u'casual': False,
u'classy': False,
u'divey': False,
u'hipster': False,
u'intimate': False,
u'romantic': False,
u'touristy': False,
u'trendy': False,
u'upscale': False},
u'BikeParking': True,
u'BusinessAcceptsCreditCards': True,
u'BusinessParking': {u'garage': False,
u'lot': False,
u'street': False,
u'valet': False,
u'validated': False},
u'Caters': True,
u'GoodForKids': True,
u'GoodForMeal': {u'breakfast': False,
u'brunch': False,
u'dessert': False,
u'dinner': False,
u'latenight': False,
u'lunch': False},
u'HasTV': False,
u'NoiseLevel': u'quiet',
u'OutdoorSeating': False,
u'RestaurantsAttire': u'casual',
u'RestaurantsDelivery': False,
u'RestaurantsGoodForGroups': True,
u'RestaurantsPriceRange2': 1,
u'RestaurantsReservations': False,
u'RestaurantsTableService': False,
u'RestaurantsTakeOut': True,
u'WiFi': u'no'},
u'business_id': u'duHFBe87uNSXImQmvBh87Q',
u'categories': [u'Sandwiches', u'Restaurants'],
u'city': u'Phoenix',
u'hours': {},
u'is_open': 0,
u'latitude': 33.5059283,
u'longitude': -112.0388474,
u'name': u'Blimpie',
u'neighborhood': u'',
u'postal_code': u'85016',
u'review_count': 10,
u'stars': 4.5,
u'state': u'AZ'}]
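With the records parsed as in Update4, json_normalize can produce the dotted column names asked for at the top. A minimal sketch, assuming a pandas version recent enough to expose pd.json_normalize at the top level:
import json
import pandas as pd

# the file is newline-delimited JSON: parse one record per line
with open('dataset/business.json') as f:
    records = [json.loads(line) for line in f]

# nested dicts are flattened into dotted columns, e.g. 'attributes.GoodForMeal.dessert'
df = pd.json_normalize(records)
print(df.filter(like='attributes.GoodForMeal').columns.tolist())
On older pandas the same function is importable as from pandas.io.json import json_normalize, but either way it must be given parsed objects (a dict or a list of dicts), not a file path, which is what the tracebacks above were pointing at.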
Related
I'd like to convert an API response into a pandas DataFrame to make it easier to manipulate.
Below is what I've tried so far:
import requests
import pandas as pd
URL = 'https://api.gleif.org/api/v1/lei-records?page[size]=10&page[number]=1&filter[entity.names]=*'
r = requests.get(URL, proxies=proxyDict)
x = r.json()
x
out:
{'meta': {'goldenCopy': {'publishDate': '2020-07-14T00:00:00Z'},
'pagination': {'currentPage': 1,
'perPage': 10,
'from': 1,
'to': 10,
'total': 1675786,
'lastPage': 167579}},
'links': {'first': 'https://api.gleif.org/api/v1/lei-records?filter%5Bentity.names%5D=%2A&page%5Bnumber%5D=1&page%5Bsize%5D=10',
'next': 'https://api.gleif.org/api/v1/lei-records?filter%5Bentity.names%5D=%2A&page%5Bnumber%5D=2&page%5Bsize%5D=10',
'last': 'https://api.gleif.org/api/v1/lei-records?filter%5Bentity.names%5D=%2A&page%5Bnumber%5D=167579&page%5Bsize%5D=10'},
'data': [{'type': 'lei-records',
'id': '254900RR9EUYHB7PI211',
'attributes': {'lei': '254900RR9EUYHB7PI211',
'entity': {'legalName': {'name': 'MedicLights Research Inc.',
'language': None},
'otherNames': [],
'transliteratedOtherNames': [],
'legalAddress': {'language': None,
'addressLines': ['300 Ranee Avenue'],
'addressNumber': None,
'addressNumberWithinBuilding': None,
'mailRouting': None,
'city': 'Toronto',
'region': 'CA-ON',
'country': 'CA',
'postalCode': 'M6A 1N8'},
'headquartersAddress': {'language': None,
'addressLines': ['76 Marble Arch Crescent'],
'addressNumber': None,
'addressNumberWithinBuilding': None,
'mailRouting': None,
'city': 'Toronto',
'region': 'CA-ON',
'country': 'CA',
'postalCode': 'M1R 1W9'},
'registeredAt': {'id': 'RA000079', 'other': None},
'registeredAs': '002185472',
'jurisdiction': 'CA-ON',
'category': None,
'legalForm': {'id': 'O90R', 'other': None},
'associatedEntity': {'lei': None, 'name': None},
'status': 'ACTIVE',
'expiration': {'date': None, 'reason': None},
'successorEntity': {'lei': None, 'name': None},
'otherAddresses': []},
'registration': {'initialRegistrationDate': '2020-07-13T21:09:50Z',
'lastUpdateDate': '2020-07-13T21:09:50Z',
'status': 'ISSUED',
'nextRenewalDate': '2021-07-13T21:09:50Z',
'managingLou': '5493001KJTIIGC8Y1R12',
'corroborationLevel': 'PARTIALLY_CORROBORATED',
'validatedAt': {'id': 'RA000079', 'other': None},
'validatedAs': '002185472'},
'bic': None},
'relationships': {'managing-lou': {'links': {'related': 'https://api.gleif.org/api/v1/lei-records/254900RR9EUYHB7PI211/managing-lou'}},
'lei-issuer': {'links': {'related': 'https://api.gleif.org/api/v1/lei-records/254900RR9EUYHB7PI211/lei-issuer'}},
'direct-parent': {'links': {'reporting-exception': 'https://api.gleif.org/api/v1/lei-records/254900RR9EUYHB7PI211/direct-parent-reporting-exception'}},
'ultimate-parent': {'links': {'reporting-exception': 'https://api.gleif.org/api/v1/lei-records/254900RR9EUYHB7PI211/ultimate-parent-reporting-exception'}}},
'links': {'self': 'https://api.gleif.org/api/v1/lei-records/254900RR9EUYHB7PI211'}},
{'type': 'lei-records',
'id': '254900F9XV2K6IR5TO93',
Then I tried to put it into pandas, which gives me the following results:
f = pd.DataFrame(x['data'])
f
type id attributes relationships links
0 lei-records 254900RR9EUYHB7PI211 {'lei': '254900RR9EUYHB7PI211', 'entity': {'le... {'managing-lou': {'links': {'related': 'https:... {'self': 'https://api.gleif.org/api/v1/lei-rec...
1 lei-records 254900F9XV2K6IR5TO93 {'lei': '254900F9XV2K6IR5TO93', 'entity': {'le... {'managing-lou': {'links': {'related': 'https:... {'self': 'https://api.gleif.org/api/v1/lei-rec...
2 lei-records 254900DIC0729LEXNL12 {'lei': '254900DIC0729LEXNL12', 'entity': {'le... {'managing-lou': {'links': {'related': 'https:... {'self': 'https://api.gleif.org/api/v1/lei-rec...
That isn't the expected result. I even tried read_json with the code below:
g = pd.read_json(x.text)
g
which gives me the error
AttributeError: 'dict' object has no attribute 'text'
The expected output should look like this:
lei entity.legalName.name entity.legalAddress.addressLines entity.legalAddress.city entity.legalAddress.postalcode status registration.status
254900RR9EUYHB7PI211 MedicLights Research Inc. 300 Ranee Avenue Toronto M6A 1N8 ACTIVE ISSUED
Thanks to anyone helping.
Use json_normalize like:
pd.json_normalize(x['data'])
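As a rough sketch of what that produces on the payload above (column names inferred from the response structure shown, not verified against the live API):
import pandas as pd

# the default sep='.' turns the nested dicts into dotted column names
flat = pd.json_normalize(x['data'])

# pick out the columns matching the expected output
cols = ['attributes.lei',
        'attributes.entity.legalName.name',
        'attributes.entity.legalAddress.addressLines',
        'attributes.entity.legalAddress.city',
        'attributes.entity.legalAddress.postalCode',
        'attributes.entity.status',
        'attributes.registration.status']
print(flat[cols])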
Here is another way to normalize the JSON with pandas, using json_normalize imported from pandas.io.json (the older location of the same function):
How to normalize json correctly by Python Pandas
Suppose we have the following DataFrame, pulled from SQL, called df:
ProdHouse Date_Year Date_Month
Software6 2001 Jan
Software6 2020 Feb
Software1 2004 Mar
Software4 2004 Apr
Software5 2004 May
Software3 2009 Dec
Software5 1995 Dec
Software3 1995 Oct
The objective is to display the total number of products per month. The year is selected using the dropdown. It appears that when the x-axis is categorical (i.e. month), the data points are not displayed. However, if I substitute it with an integer, points are displayed.
def serve_layout():
session_id = str(uuid.uuid4())
return html.Div([ html.Div(session_id, id='session-id', style={'display': 'none'}),
html.Label('Year'),
dcc.Dropdown( id='year-dropdown',
options=[
{'label': year ,'value': year} for year in df['Date_Year'].unique()
],
value=[2020],#[df['Date_Year'].unique()],
multi=True ),
dcc.Graph(id='graph-with-dropdown')
] , style={'width':'33%','display':'inline-block'} )
app.layout = serve_layout
@app.callback(
dash.dependencies.Output('graph-with-dropdown', 'figure'),
[dash.dependencies.Input('year-dropdown', 'value')]) # Add the marks as a State
def update_figure(selected_year):
print('selected_year: ', selected_year)
filtered_df = df[df.Date_Year.isin(selected_year)]
#filtered_df = df[df.Date_Year == selected_year]
df_grouped = filtered_df.groupby(['ProdHouse','Date_Month']).size().rename('Total_Active_Products').reset_index()
traces=[]
for i in filtered_df.ProdHouse.unique():
df_by_ProdHouse = df_grouped[df_grouped['ProdHouse'] == i]
traces.append(go.Scatter(
x=df_by_ProdHouse['Date_Month'], #df_by_ProdHouse['Total_Active_Products'],
y=df_by_ProdHouse['Total_Active_Products'],
##text=df_by_ProdHouse['brand'],
mode='markers',
opacity=0.7,
marker={
'size': 15,
'line': {'width': 0.5, 'color': 'white'}
},
name=i
) )
return {
'data': traces,
'layout': dict(
xaxis={'type': 'linear', 'title': 'Active Products Per Month'},
yaxis={'title': 'Total Active Products'},
margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
legend={'x': 0, 'y': 1},
hovermode='closest',
transition = {'duration': 500},
)
}
How would one modify the above code so that the data can be displayed on the plot?
This answers the first part of the question, which relates to the points not being displayed. I managed to get the categorical data to display by changing the scatter plot to a bar chart. Since the graph type changed, I removed the mode and type parameters.
@app.callback(
dash.dependencies.Output('graph-with-dropdown', 'figure'),
[dash.dependencies.Input('year-dropdown', 'value')]) # Add the marks as a State
def update_figure(selected_year):
print('selected_year: ', selected_year)
filtered_df = df[df.Date_Year.isin(selected_year)]
df_grouped = filtered_df.groupby(['ProdHouse','Date_Month']).size().rename('Total_Active_Products').reset_index()
traces=[]
for i in filtered_df.ProdHouse.unique():
df_by_ProdHouse = df_grouped[df_grouped['ProdHouse'] == i]
traces.append(go.Bar(
x=df_by_ProdHouse['Date_Month'],
y=df_by_ProdHouse['Total_Active_Products'],
name=i
) )
return {
'data': traces,
'layout': dict(
xaxis={ 'title': 'Active Products Per Month'},
yaxis={'title': 'Total Active Products'},
margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
legend={'x': 0, 'y': 1},
hovermode='closest',
transition = {'duration': 500},
)
}
Alternatively, if you still want to use a scatter plot, convert df['Date_Month'] and df['Date_Year'] from categories to actual dates, e.g. May 2020 becomes 2020-05-01.
This can be achieved using the following example:
import pandas as pd

df = pd.DataFrame({'ProdHouse': ['software 1', 'software 2', 'software 3', 'software 4', 'software 3'],
                   'Date_Year': [2018, 2018, 2018, 2018, 2018],
                   'Date_Month': ['January', 'February', 'March', 'April', 'May'],
                   'Total_Active_Products': [1, 2, 7, 8, 6]})
# build the range endpoints from the first month/year and one month past the last
date_1 = '{}-{}'.format(df['Date_Month'].iloc[0], df['Date_Year'].iloc[0])
date_2 = '{}-{}'.format('June', df['Date_Year'].iloc[4])
df['dates'] = pd.date_range(date_1, date_2, freq='M')  # month-end dates, one per row
print(df)
Since you are now using actual dates, replace the isin filter with the following:
filtered_df = df[(pd.to_datetime(df.dates).dt.year>=selected_year_min)& (pd.to_datetime(df.dates).dt.year<=selected_year_max)]
Please adjust the above code accordingly. It is designed to get the min and max year from the dropdown.
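For example, with the multi-select dropdown above, the bounds could be derived inside the callback from the list Dash passes in (a hypothetical sketch; selected_year is the callback argument):
# derive the filter bounds from the multi-select dropdown value
selected_year_min = min(selected_year)
selected_year_max = max(selected_year)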
Lastly, change the x input value in the scatter plot as shown below:
traces.append(go.Scatter(
x=df_by_ProdHouse['dates'],
y=df_by_ProdHouse['Total_Active_Products'],
mode='lines+markers',
line={
'color': '#CD5C5C',
'width': 2},
marker={
'color': '#CD5C5C',
'size': 10,
'symbol': "diamond-open"
},
# marker_line_width=1.5, opacity=0.6,
) )
return {
'data': traces,
'layout': dict(
xaxis={ 'title': 'Date',
'showticklabels':True,
'linecolor':'rgb(204, 204, 204)',
'linewidth':2,
'ticks':'outside'
},
yaxis={'title': 'Total Active Products'},
margin={'l': 40, 'b': 40, 't': 10, 'r': 10},
legend={'x': 0, 'y': 1},
#marker=dict(color='#CD5C5C', size=1,symbol="diamond-open"),
hovermode='closest',
transition = {'duration': 500},
title={
'text': "Softwares",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
font=dict(
color="#7f7f7f"
)
)
}
I cannot extract components of data parsed from JSON into a Python dictionary.
I attempted to print the value corresponding to a dictionary entry, but I get an error.
import urllib, json, requests
url = "https://storage.googleapis.com/osbuddy-exchange/summary.json"
response = urllib.urlopen(url)
data = json.loads(response.read())
print type(data)
for key, value in data.iteritems():
print value
print ''
print "data['entry']: ", data['99']
print "name: ", data['name']```
I was hoping I could get attributes of an entry, say the 'buy_average', given a specific key. Instead I get an error when referencing specific components.
<type 'dict'>
22467 {u'sell_average': 3001, u'buy_average': 0, u'name': u'Bastion potion(2)', u'overall_average': 3001, u'sp': 180, u'overall_quantity': 2, u'members': True, u'sell_quantity': 2, u'buy_quantity': 0, u'id': 22467}
22464 {u'sell_average': 4014, u'buy_average': 0, u'name': u'Bastion potion(3)', u'overall_average': 4014, u'sp': 270, u'overall_quantity': 612, u'members': True, u'sell_quantity': 612, u'buy_quantity': 0, u'id': 22464}
5745 {u'sell_average': 0, u'buy_average': 0, u'name': u'Dragon bitter(m)', u'overall_average': 0, u'sp': 2, u'overall_quantity': 0, u'members': True, u'sell_quantity': 0, u'buy_quantity': 0, u'id': 5745}
...
data['entry']: {u'sell_average': 7843, u'buy_average': 7845, u'name': u'Ranarr potion (unf)', u'overall_average': 7844, u'sp': 25, u'overall_quantity': 23838, u'members': True, u'sell_quantity': 15090, u'buy_quantity': 8748, u'id': 99}
name:
Traceback (most recent call last):
File "C:/Users/Michael/PycharmProjects/osrsGE/osrsGE.py", line 16, in <module>
print "name: ", data['name']
KeyError: 'name'
Process finished with exit code 1
There is no key named 'name' in the dict named 'data'.
The first-level keys are numbers like "6", "2", "8", etc.
The second-level objects have a key named 'name', so code like:
print(data['2']['name']) # Cannonball
should work
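For the original goal of reading 'buy_average' for a specific entry, the same two-level lookup applies; a small sketch using keys taken from the sample output above:
item = data['22467']           # top-level keys are item ids as strings
print(item['name'])            # Bastion potion(2)
print(item['buy_average'])     # 0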
I have a JSON URL with the following structure from THIS url, and I am trying to get the name, price, and volume from the structure below:
{'data': {'1': {'id': 1,
'name': 'Bitcoin',
'symbol': 'BTC',
'website_slug': 'bitcoin',
'rank': 1,
'circulating_supply': 17115025.0,
'total_supply': 17115025.0,
'max_supply': 21000000.0,
'quotes': {'USD': {'price': 6317.68,
'volume_24h': 5034440000.0,
'market_cap': 108127251142.0,
'percent_change_1h': 0.22,
'percent_change_24h': 5.26,
'percent_change_7d': -4.37}},
'last_updated': 1529943576},
'2': {'id': 2,
'name': 'Litecoin',
'symbol': 'LTC',
'website_slug': 'litecoin',
'rank': 6,
'circulating_supply': 57133246.0,
'total_supply': 57133246.0,
'max_supply': 84000000.0,
'quotes': {'USD': {'price': 84.4893,
'volume_24h': 512241000.0,
'market_cap': 4827147957.0,
'percent_change_1h': 1.97,
'percent_change_24h': 8.96,
'percent_change_7d': -12.54}},
'last_updated': 1529943541}},
'metadata': {'timestamp': 1529943282,
'num_cryptocurrencies': 1586,
'error': None}}
I tried several variations to get each coin in a row but have failed so far
Attempt 1
df = pd.read_json('https://api.coinmarketcap.com/v2/ticker')
Attempt 2
data = requests.get('https://api.coinmarketcap.com/v2/ticker',params).json()
df = pd.DataFrame(data['data'])
df
Attempt 3
I found a function on Stack Overflow called json_normalize and tried to use it, but no luck so far:
df = pd.io.json.json_normalize(data['data'])
df
Any suggestions on how to turn each coin into a row are super appreciated
UPDATE 1
params = {'start': 0, 'sort': 'id', 'limit': 100}
data = requests.get('https://api.coinmarketcap.com/v2/ticker', params).json()
df = pd.DataFrame(data['data'])
df = df.transpose()
df.set_index('id')
This is pretty close to what I want, but how do I get the volume and price out of quotes?
assuming "quotes" only have 1 row and the key is "USD", I did this
df.drop('quotes', axis=1).join(
    pd.DataFrame(
        df.quotes.apply(
            # flatten each nested USD dict into prefixed keys like 'USD_price'
            lambda x: {'USD_' + key: val for key, val in x['USD'].items()}
        ).tolist(),
        index=df.index,  # keep the coin ids as the index so the join lines up
    )
)
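An alternative sketch (not part of the answer above) is to let json_normalize do the flattening, since the nested quotes dict becomes dotted columns:
import pandas as pd

# one row per coin; nested dicts like quotes.USD.* become columns
flat = pd.json_normalize(list(data['data'].values()))
print(flat[['name', 'quotes.USD.price', 'quotes.USD.volume_24h']])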
I have a MySQL database and a table with the schema
tweet_id BIGINT
tweet_metadata LONGBLOB
I am trying to insert a row into my database as follows:
import MySQLdb as mysql
host = 'localhost'
user = 'root'
passwd = '************'
db = 'twitter'
insert_tweet_query = ''' INSERT INTO tweets(tweet_id, tweet_metadata) VALUES(%s, %s)'''
''' Creates a MySQL connection and returns the cursor '''
def create_connection():
connection = mysql.connect(host, user, passwd, db,use_unicode=True)
connection.set_character_set('utf8')
cursor = connection.cursor()
cursor.execute('SET NAMES utf8;')
cursor.execute('SET CHARACTER SET utf8;')
cursor.execute('SET character_set_connection=utf8;')
return connection, cursor
''' Close the connection '''
def close_connection(cursor, connection):
cursor.close()
connection.commit()
connection.close()
connection, cursor = create_connection()
tweet = dict({u'contributors': None, u'truncated': False, u'text': u'RT #HMV_Anime: \u7530\u6751\u3086\u304b\u308a\u59eb\u30d9\u30b9\u30c8\u30a2\u30eb\u30d0\u30e0\u300cEverlasting Gift\u300d\u98db\u3076\u3088\u3046\u306b\u58f2\u308c\u3066\u3044\u307e\u3059\uff01\u6728\u66dc\u306f\u6a2a\u30a2\u30ea\u516c\u6f14\uff01\u300c\u30d1\u30fc\u30c6\u30a3\u30fc\u306f\u7d42\u308f\u3089\u306a\u3044\u300d\u306e\u30e9\u30c3\u30d7\u30d1\u30fc\u30c8\u306e\u4e88\u7fd2\u5fa9\u7fd2\u306b\u3082\u5fc5\u9808\u3067\u3059\uff01 http://t.co/SVWm2E1r http://t.co/rSP ...', u'in_reply_to_status_id': None, u'id': 258550064480387072L, u'source': u'ShootingStar', u'retweeted': False, u'coordinates': None, u'entities': {u'user_mentions': [{u'indices': [3, 13], u'id': 147791077, u'id_str': u'147791077', u'screen_name': u'HMV_Anime', u'name': u'HMV\u30a2\u30cb\u30e1\uff01'}], u'hashtags': [], u'urls': [{u'indices': [100, 120], u'url': u'http://t.co/SVWm2E1r', u'expanded_url': u'http://ow.ly/evEvT', u'display_url': u'ow.ly/evEvT'}, {u'indices': [121, 136], u'url': u'http://t.co/rSP', u'expanded_url': u'http://t.co/rSP', u'display_url': u't.co/rSP'}]}, u'in_reply_to_screen_name': None, u'in_reply_to_user_id': None, u'retweet_count': 40, u'id_str': u'258550064480387072', u'favorited': False, u'retweeted_status': {u'contributors': None, u'truncated': False, u'text': u'\u7530\u6751\u3086\u304b\u308a\u59eb\u30d9\u30b9\u30c8\u30a2\u30eb\u30d0\u30e0\u300cEverlasting Gift\u300d\u98db\u3076\u3088\u3046\u306b\u58f2\u308c\u3066\u3044\u307e\u3059\uff01\u6728\u66dc\u306f\u6a2a\u30a2\u30ea\u516c\u6f14\uff01\u300c\u30d1\u30fc\u30c6\u30a3\u30fc\u306f\u7d42\u308f\u3089\u306a\u3044\u300d\u306e\u30e9\u30c3\u30d7\u30d1\u30fc\u30c8\u306e\u4e88\u7fd2\u5fa9\u7fd2\u306b\u3082\u5fc5\u9808\u3067\u3059\uff01 http://t.co/SVWm2E1r http://t.co/rSPYm0bE #yukarin', u'in_reply_to_status_id': None, u'id': 258160273171574784L, u'source': u'HootSuite', u'retweeted': False, u'coordinates': None, u'entities': {u'user_mentions': [], u'hashtags': [{u'indices': [127, 135], u'text': u'yukarin'}], u'urls': [{u'indices': [85, 105], u'url': u'http://t.co/SVWm2E1r', u'expanded_url': u'http://ow.ly/evEvT', u'display_url': u'ow.ly/evEvT'}, {u'indices': [106, 126], u'url': u'http://t.co/rSPYm0bE', u'expanded_url': u'http://twitpic.com/awuzz0', u'display_url': u'twitpic.com/awuzz0'}]}, u'in_reply_to_screen_name': None, u'in_reply_to_user_id': None, u'retweet_count': 40, u'id_str': u'258160273171574784', u'favorited': False, u'user': {u'follow_request_sent': None, u'profile_use_background_image': True, u'id': 147791077, u'verified': False, u'profile_image_url_https': u'https://si0.twimg.com/profile_images/2573283223/mn4nu924bnxh643sgu1p_normal.jpeg', u'profile_sidebar_fill_color': u'DDEEF6', u'geo_enabled': False, u'profile_text_color': u'333333', u'followers_count': 17108, u'profile_sidebar_border_color': u'C0DEED', u'location': u'\u4e03\u68ee\u4e2d\u5b66\u6821', u'default_profile_image': False, u'listed_count': 1012, u'utc_offset': 32400, u'statuses_count': 33277, u'description': u'\u79c1\u3001\u8d64\u5ea7\u3042\u304b\u308a\u3002\u3069\u3053\u306b\u3067\u3082\u3044\u308b\u3054\u304f\u666e\u901a\u306e\u4e2d\u5b66\u751f\u3002\u305d\u3093\u306a\u79c1\u3060\u3051\u3069\u3001\u6bce\u65e5\u3068\u3063\u3066\u3082\u5145\u5b9f\u3057\u3066\u308b\u306e\u3002\u3060\u3063\u3066\u3042\u304b\u308a\u306f\u2026\u2026 
\u3060\u3063\u3066\u3042\u304b\u308a\u306f\u2026\u2026\u3000\uff08\u203b\u3053\u3061\u3089\u306f#HMV_Japan\u306e\u59c9\u59b9\u30a2\u30ab\u30a6\u30f3\u30c8\u3067\u3059\u3002\u3054\u8cea\u554f\u30fb\u304a\u554f\u3044\u5408\u308f\u305b\u306f\u3001HMV\u30b5\u30a4\u30c8\u4e0a\u306e\u5c02\u7528\u30d5\u30a9\u30fc\u30e0\u3088\u308a\u304a\u9858\u3044\u81f4\u3057\u307e\u3059\u3002\uff09', u'friends_count': 17046, u'profile_link_color': u'0084B4', u'profile_image_url': u'http://a0.twimg.com/profile_images/2573283223/mn4nu924bnxh643sgu1p_normal.jpeg', u'following': None, u'profile_background_image_url_https': u'https://si0.twimg.com/profile_background_images/104844943/bg_hmv2.gif', u'profile_background_color': u'202020', u'id_str': u'147791077', u'profile_background_image_url': u'http://a0.twimg.com/profile_background_images/104844943/bg_hmv2.gif', u'name': u'HMV\u30a2\u30cb\u30e1\uff01', u'lang': u'ja', u'profile_background_tile': False, u'favourites_count': 0, u'screen_name': u'HMV_Anime', u'notifications': None, u'url': u'http://www.hmv.co.jp/anime/', u'created_at': u'Tue May 25 02:07:35 +0000 2010', u'contributors_enabled': False, u'time_zone': u'Tokyo', u'protected': False, u'default_profile': False, u'is_translator': False}, u'geo': None, u'in_reply_to_user_id_str': None, u'possibly_sensitive': False, u'created_at': u'Tue Oct 16 10:59:40 +0000 2012', u'possibly_sensitive_editable': True, u'in_reply_to_status_id_str': None, u'place': None}, u'user': {u'follow_request_sent': None, u'profile_use_background_image': True, u'id': 500471418, u'verified': False, u'profile_image_url_https': u'https://si0.twimg.com/profile_images/2722246932/b71d269b9e1e16f59698b4f7fa23a0fe_normal.jpeg', u'profile_sidebar_fill_color': u'DDEEF6', u'geo_enabled': False, u'profile_text_color': u'333333', u'followers_count': 2241, u'profile_sidebar_border_color': u'C0DEED', u'location': u'\u3072\u3060\u307e\u308a\u8358204\u53f7\u5ba4', u'default_profile_image': False, u'listed_count': 41, u'utc_offset': 32400, u'statuses_count': 18879, u'description': u'\u611f\u3058\u308d\u2026\u2026\u3002 \u2514(\u2510L \u309c\u03c9\u3002)\u2518\u305d\u3057\u3066\uff71\uff8d\u9854\uff80\uff9e\uff8c\uff9e\uff99\uff8b\uff9f\uff70\uff7d\u3060 \u270c( \u055e\u0a0a \u055e)\u270c \u2026\u2026\uff01 \u3051\u3044\u304a\u3093\u3001\u307e\u3069\u30de\u30ae\u3001AB\u3001\u3089\u304d\u2606\u3059\u305f\u3001\u3086\u308b\u3086\u308a\u3001\u30df\u30eb\u30ad\u30a3\u3068\u304b\u306e\u30a2\u30cb\u30e1\u3001\u6771\u65b9\u3001\u30dc\u30ab\u30ed\u597d\u304d\u3060\u3088\u2517(^\u03c9^ )\u251b\u30c7\u30c7\u30f3\uff01 \u30d5\u30a9\u30ed\u30d0\u306f\u3059\u308b\u304b\u3089\u5f85\u3063\u3068\u3044\u3066 \u53ef\u6190\u3061\u3083\u3093\u540c\u76dfNo.9 \u308c\u3044\u3080\u540c\u76dfNo.4 \u898f\u5236\u57a2\u2192#SpeedPer_2', u'friends_count': 2038, u'profile_link_color': u'0084B4', u'profile_image_url': u'http://a0.twimg.com/profile_images/2722246932/b71d269b9e1e16f59698b4f7fa23a0fe_normal.jpeg', u'following': None, u'profile_background_image_url_https': u'https://si0.twimg.com/profile_background_images/600710368/ff2z5gv4s83u313432hj.jpeg', u'profile_background_color': u'C0DEED', u'id_str': u'500471418', u'profile_background_image_url': u'http://a0.twimg.com/profile_background_images/600710368/ff2z5gv4s83u313432hj.jpeg', u'name': u'\u3055\u30fc\u3057\u3083\u3059#\u30cf\u30cb\u30ab\u30e0\u30ac\u30c1\u52e2', u'lang': u'ja', u'profile_background_tile': True, u'favourites_count': 3066, u'screen_name': u'SpeedPer', u'notifications': None, u'url': 
u'https://mobile.twitter.com/account', u'created_at': u'Thu Feb 23 05:10:57 +0000 2012', u'contributors_enabled': False, u'time_zone': u'Irkutsk', u'protected': False, u'default_profile': False, u'is_translator': False}, u'geo': None, u'in_reply_to_user_id_str': None, u'possibly_sensitive': False, u'created_at': u'Wed Oct 17 12:48:33 +0000 2012', u'possibly_sensitive_editable': True, u'in_reply_to_status_id_str': None, u'place': None})
cursor.execute(insert_tweet_query, (tweet['id_str'], tweet))
close_connection(cursor, connection)
However, despite setting the appropriate UTF-8 encodings, I get the following exception:
_mysql_exceptions.ProgrammingError: (1064, 'You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near \': \'NULL\', u\'truncated\': \'0\', u\'text\': "\'RT #HMV_Anime: \\xe7\\x94\\xb0\\xe6\\x9d\\x91\\\' at line 1')
What am I doing wrong?
You could try with repr:
cursor.execute(insert_tweet_query, (tweet['id_str'], repr(tweet)))
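If the metadata needs to be read back out of the database later, an alternative to repr (my suggestion, not part of the answer above) is to store it as JSON, which can be parsed back with json.loads:
import json

# serialize the dict to a JSON string before binding it as a parameter
cursor.execute(insert_tweet_query, (tweet['id_str'], json.dumps(tweet)))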