The data I am scraping with BeautifulSoup contains a device-name field, and the device names have colors embedded in them, e.g. "Lumia 800 Black". I want to create a new column which contains this color.
I want to search each device name against a list of colors; if a color is present in the device name, I want to remove it from the name and put it in a new column named Color.
I am using the code referred to below to accomplish this: I create a function named color and try to search the device-name string for a color; if one is present, I try to feed that color into a new variable named color_column. But my output CSV is not returning any values at all. It is empty.
Please check the referred code below:
# -*- coding: cp1252 -*-
import csv
import urllib2
import sys
import urllib
import time
import mechanize
import cookielib
from bs4 import BeautifulSoup
from itertools import islice
colors = ["Black","Gray"]
def color(arg):
    for colors_1 in colors:
        if arg.find(colors_1) == -1:
            return color_column == ""
        return color_column == colors_1
url = 'http://www.t-mobile.com/shop/phones/default.aspx?shape=smartphones'
user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1;Trident/5.0)'
values = {
'Phones':'MBBDevice',
'__ASYNCPOST':'true',
'__EVENTARGUMENT':'',
'__EVENTTARGET':'pgrTop$lnkPageShowAll',
'__LASTFOCUS':'',
'__VIEWSTATE':'/wEPDwULLTE1NTE5NDk1ODIPFgIeEEN1cnJlbnRQYWdlSW5kZXgCARYCAgEPZBYCAgEPZBYCAgEPZBYCZg9kFgICAQ9kFhgCCg9kFgJmD2QWAmYPZBYCZg8UKwACZDKJBAABAAAA/////wEAAAAAAAAADAIAAABfVE1vYmlsZS5XZWIuVE1vYmlsZURvdENvbS5VSS5XZWJDb250cm9scywgVmVyc2lvbj0xLjAuMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAAEFUTW9iaWxlLldlYi5UTW9iaWxlRG90Q29tLlVJLldlYkNvbnRyb2xzLkJyZWFkQ3J1bWJJdGVtQ29sbGVjdGlvbgEAAAATQ29sbGVjdGlvbkJhc2UrbGlzdAMcU3lzdGVtLkNvbGxlY3Rpb25zLkFycmF5TGlzdAIAAAAJAwAAAAQDAAAAHFN5c3RlbS5Db2xsZWN0aW9ucy5BcnJheUxpc3QDAAAABl9pdGVtcwVfc2l6ZQhfdmVyc2lvbgUAAAgICQQAAAACAAAABQAAABAEAAAABAAAAAkFAAAACQYAAAANAgUFAAAAN1RNb2JpbGUuV2ViLlRNb2JpbGVEb3RDb20uVUkuV2ViQ29udHJvbHMuQnJlYWRDcnVtYkl0ZW0DAAAABV90ZXh0BF91cmwJX3Nob3dMaW5rAQEAAQIAAAAGBwAAAARIb21lBggAAAAAAQEGAAAABQAAAAYJAAAAGVNtYXJ0cGhvbmVzICYgQ2VsbCBQaG9uZXMGCgAAAAtzaG9wL3Bob25lcwELZAIMD2QWAgIDDxYCHgxIdG1sT3ZlcnJpZGUFkwI8aW1nIHN0eWxlPSJGTE9BVDogcmlnaHQ7IENVUlNPUjogcG9pbnRlciEgaW1wb3J0YW50IiBvbmNsaWNrPSJqYXZhc2NyaXB0OnBvcFVwKCAnL3RlbXBsYXRlcy9wb3B1cC5hc3B4P1BBc3NldD1TaHBfUGhuX3NoaXBwaW5nRGV0YWlscycsICczNDAnLCAnNTY4JywgJzQ1JywgJzMwJywgJzAnLCAnMCcsICcxJyApIiBhbHQ9IkZyZWUgU2hpcHBpbmcgb24gYWxsIGNlbGwgcGhvbmVzIGFuZCBkZXZpY2VzLiIgc3JjPSIuLi9pbWFnZXMvZnJlZV9zaGlwcGluZy1iYW5uZXIuZ2lmIiAvPmQCDg8PFgIeB1Zpc2libGVoZGQCGA9kFgJmD2QWAmYPZBYCZg9kFggCAQ9kFgICAQ8QDxYEHgdDaGVja2VkaB4HRW5hYmxlZGgWAh4LbWFrZWVuYWJsZWQFBWZhbHNlZGRkAgUPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIHD2QWAgIBDxAPZBYCHwUFBHRydWVkZGQCCQ9kFgICAQ8QD2QWAh8FBQR0cnVlZGRkAhoPZBYCZg9kFgJmD2QWAmYPZBYEAgMPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIFD2QWAgIBDxAPFgIeBFRleHQF2AU8dGFibGUgaGVpZ2h0PSIxNSIgY2VsbHNwYWNpbmc9IjAiIGNlbGxwYWRkaW5nPSIwIiB3aWR0aD0iNzciIGJvcmRlcj0iMCI+CiAgICAgIDx0Ym9keT4KICAgICAgICA8dHI+CiAgICAgICAgICA8dGQgY2xhc3M9InJlZnVyYmlzaGVkIj5SZWZ1cmJpc2hlZDwvdGQ+CgogICAgICAgICAgPHRkIGNsYXNzPSJyZWZ1cmJpc2hlZCI+CiAgICAgICAgICAgIDxkaXYgb25tb3VzZW92ZXI9ImphdmFzY3JpcHQ6ZGlzcENPQkRlc2MoKTsiIHN0eWxlPSJGTE9BVDogbGVmdCIgb25tb3VzZW91dD0iamF2YXNjcmlwdDpoaWRlQ09CRGVzYygpOyIgcnVuYXQ9InNlcnZlciI+CiAgICAgICAgICAgICAgPGltZyBzcmM9Ii9pbWFnZXMvaWNvbl9oZWxwLmdpZiIgLz4gPGRpdiBjbGFzcz0idG9vbHRpcCIgaWQ9ImRpdkNPQkRlc2NyaXB0aW9uIiBzdHlsZT0iRElTUExBWTogbm9uZSI+CiAgICAgIDxkaXYgY2xhc3M9InRvb2x0aXAtYnRtLWJrZyI+CiAgICAgICAgPGRpdiBjbGFzcz0idG9vbHRpcC1jb250YWluZXIiPgogICAgICAgICAgR2V0IGEgZ3JlYXQgdmFsdWUgb24gYSBsaWtlLW5ldyBwaG9uZQogICAgICAgICAgPGJyIC8+CiAgICAgICAgICAgd2l0aCBhIDkwLWRheSB3YXJyYW50eS4KICAgICAgICA8L2Rpdj4KICAgICAgPC9kaXY+CiAgICA8L2Rpdj4KICAgICAgICAgICAgPC9kaXY+CiAgICAgICAgICA8L3RkPgogICAgICAgIDwvdHI+CiAgICAgIDwvdGJvZHk+CiAgICA8L3RhYmxlPhYCHwUFBHRydWVkZGQCIA8WAh4Fc3R5bGUFDmRpc3BsYXk6YmxvY2s7FgJmD2QWAmYPZBYCZg9kFgYCAw9kFgICAQ8QD2QWAh8FBQR0cnVlZGRkAgUPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIHD2QWAgIBDxAPZBYCHwUFBHRydWVkZGQCKg9kFgJmD2QWAmYPZBYEZg8PFgIfAmcWAh4HT25DbGljawUKQ2xlYXJJRFMoKWQCAQ8PZBYCHwgFCkNsZWFySURTKClkAi4PZBYCZg9kFgJmD2QWAgIKD2QWCAIBDw8WAh8CaGRkAgMPFgIeCl9QYWdlQ291bnQCBBYGAgIPFgIfAmhkAgcPD2QWAh8HBQxkaXNwbGF5Om5vbmVkAggPDxYCHwJnZGQCBw8WAh8JAgQWBgICDxYCHwJoZAIIDw9kFgIfBwUMZGlzcGxheTpub25lZAIJDw8WAh8CZ2RkAgsPFgIfAmhkAjAPFgIeE0Ntc0NvbGxlY3Rpb25TdHJpbmdlZAI0D2QWAmYPZBYCZg9kFgQCAQ8WAh4MQ21zQXNzZXROYW1lBRVUb3V0X0ZBUV9EZXZBbGxQaG9uZXNkAgQPFgIfCgUPdG91dF9odG1sX2xvZ2luZAI2D2QWBGYPZBYCZg9kFgJmDxYCHwJoZAIBD2QWAmYPZBYCZg8WAh8LBRJzaHBfcGhuX2xlZ2FsTm90ZXNkAjgPDxYCHhxUaXRsZXBvcHVwUGxhbkNoYW5nZVJlcXVpcmVkZWQWBAIPDxYCHwJoZAITDxYCHwJoZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WNAUJTUJCRGV2aWNlBQ1QcmVQYWlkUGhvbmVzBQ1QcmVQYWlkUGhvbmVzBSFyZXBQcmljZVJhbmdlJGN0bDAwJGNoa1ByaWNlUmFuZ2UFDmNoa05ld0Fycml2YWxzBQ9jaGtXZWJPbmx5RGVhbHMFEmNoa1dlYk9ubHlQcm9kdWN0cwUP
Y2hrTmV3Q29uZGl0aW9uBQZjaGtDT0IFFnJlcFR5cGVzJGN0bDAwJGNoa1R5cGUFFnJlcFR5cGVzJGN0bDAyJGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA0JGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA1JGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA2JGNoa1R5cGUFDGNoa0FuZHJvaWRPUwUPY2hrQmxhY2tCZXJyeU9TBQhjaGtXaW5PUwUgcmVwRmVhdHVyZUZpbHRlciRjdGwwMCRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDEkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDAyJGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwMyRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDQkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDA1JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwNiRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDckY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDA4JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwOSRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTAkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDExJGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwxMiRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTMkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDE0JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwxNSRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTYkY2hrRmlsdGVyBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDAwJGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDEkY2hrTWFudWZhY3R1cmVycwUncmVwTWFudWZhY3R1cmVycyRjdGwwMiRjaGtNYW51ZmFjdHVyZXJzBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDA0JGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDUkY2hrTWFudWZhY3R1cmVycwUncmVwTWFudWZhY3R1cmVycyRjdGwwNiRjaGtNYW51ZmFjdHVyZXJzBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDA3JGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDgkY2hrTWFudWZhY3R1cmVycwUabXJwUGhvbmVzJGN0bDAwJGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwwMiRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMDQkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDA2JGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwwOCRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMTAkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDEyJGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwxNCRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMTYkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDE4JGNoa0NvbXBhcmVnDy0KUN8keEvS5/wEmJXssTUSNw==',
'ctl09':'ctl13|pgrTop$lnkPageShowAll',
'ddlSort':'0',
'hdnBlackBerryID':'3c2c3562-aa1c-4fe4-a0ca-da5dd8e4bd84',
'hdnCapCode':'',
'hdnDeviceId':'',
'hdnFeature':'',
'hdnFeatureNames':'',
'hdnFilter':'',
'hdnIsPricingOptionLockedB':'false',
'hdnLocationParameter':'',
'hdnManufacturer':'',
'hdnManufacturerID':'',
'hdnManufacturerNames':'',
'hdnOtherFilters':'',
'hdnPageIndex':'',
'hdnPriceRange':'',
'hdnPriceRangeText':'',
'hdnProductType':'GSM',
'hdnSelectedDeviceId':'',
'hdnSelections':'',
'hdnSortFilter':'0',
'hdnTitle':'',
'hdnType':'smp,',
'hdnTypeNames':'Smartphone|',
'popupPlanChangeRequired$hdnDeviceID':'',
'popupPlanChangeRequired$hdnFamilyID':'',
'popupPlanChangeRequired$hiddenImagePath':'',
'repTypes$ctl05$chkType':'on',
'txtSelectedDevices':'0',
'txtSelectedFeatures':'0'}
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
page = response.read()
soup = BeautifulSoup(page)
with open('tmob_colortest.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    items = soup.findAll('div', {"class": "phonename"}, text = colors)
    prices = soup.findAll('p', {"class": "totalitemprice"})
    for item, price in zip(items, prices):
        textcontent = u' '.join(islice(item.stripped_strings, 0, 2, 1))
        textcontent2 = u' '.join(price.stripped_strings)
        name_1 = unicode(textcontent).encode('utf8').replace('Nexus 4','LG Nexus 4').replace(' T-Mobile Refurbished Device','').replace('™','').replace('®','').replace(' ›','').replace("NEW! ","").replace(" Web-only offer -- now thru Thu 1/3/13","").replace(" Web-only offer!","").strip()
        oem = list(name_1)
        pos = oem.index(' ')
        if name_1.find('Refurbished')== -1:
            name= name_1
            refur = "N"
        else:
            name = name_1.replace("Refurbished","").replace(" -","")
            refur = "Y"
        spamwriter.writerow(["US", "T-Mobile",
                             name[0:pos],name,refur,color_column,
                             "24 Months","$",unicode(textcontent2).encode('utf8').replace("FREE","0").replace('$','')])
Please help me to solve this issue and pardon my ignorance as I am new to coding.
You never actually use your function, so color_column is never filled.
What you want to do is make your function return the changed product name, and the color detected, as two separate values:
def handle_color(arg):
    for col in colors:
        if col.lower() not in arg.lower():
            continue
        # color found, remove it from arg (case insensitively)
        start = arg.lower().index(col.lower())
        arg = arg[:start] + arg[start + len(col):]
        return arg, col
    # No matching color found, return arg unchanged and an empty value for the color
    return arg, ''
Now all you have to do is call this function and unpack its return value into two variables for your CSV:
name, color_column = handle_color(name)
and color_column will either be an empty value or the matched color (now removed from name).
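For example, with the colors list from the question (a quick sketch; the trailing space left behind can be removed with .strip()):
colors = ["Black", "Gray"]

name, color_column = handle_color("Lumia 800 Black")
print(name)          # "Lumia 800 " -> use name.strip() to drop the trailing space
print(color_column)  # "Black"
Calling handle_color(name) inside your CSV loop, before spamwriter.writerow, gives you both the cleaned name and the Color value for each row.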
Sorry, I am new to coding in Python. I need to save the JSON generated in each iteration of a for loop as a separate CSV file.
I wrote code that works fine for generating the first CSV file, but it is then overwritten on every following iteration and I have not found a solution yet. Can anyone help me? Many thanks.
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))
user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))
for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file.csv')
You need to add a sequence number or some other unique identifier to the filename. The clearest example would be to keep track of a counter, or use a GUID. Below I've used a counter that is initialized before your loop, and is incremented in each iteration. This will produce a list of files like output_file_1.csv, output_file_2.csv, output_file_3.csv and so on.
counter = 0
for user in user_objects:
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv('output_file_' + str(counter) + '.csv')
    counter += 1
We convert the integer to a string and paste it in between the name of your file and its extension. Alternatively, you can let enumerate supply the counter for you, which gives the full script below:
from twarc.client2 import Twarc2
import itertools
import pandas as pd
import csv
import json
import numpy as np
# Your bearer token here
t = Twarc2(bearer_token="AAAAAAAAAAAAAAAAAAAAA....WTW")
# Get a bunch of user handles you want to check:
list_of_names = np.loadtxt("usernames.txt",dtype="str")
# Get the `data` part of every request only, as one list
def get_data(results):
    return list(itertools.chain(*[result['data'] for result in results]))

user_objects = get_data(t.user_lookup(users=list_of_names, usernames=True))

for idx, user in enumerate(user_objects):
    following = get_data(t.following(user['id']))
    # Do something with the lists
    print(f"User: {user['username']} Follows {len(following)} -2")
    json_string = json.dumps(following)
    df = pd.read_json(json_string)
    df.to_csv(f'output_file{str(idx)}.csv')
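If you prefer the globally unique identifier mentioned above instead of a sequential counter, a uuid-based filename is a minimal alternative (a sketch; the filename pattern is just an example):
import uuid

df.to_csv(f'output_file_{uuid.uuid4()}.csv')  # e.g. output_file_<random hex>.csv
This guarantees a fresh name on every iteration and across separate runs of the script.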
I have some code which collects the description, price, and old price (if on sale) from online retailers over multiple pages. I'm looking to export this into a DataFrame and have had a go, but run into the following error:
ValueError: Shape of passed values is (1, 3210), indices imply (3, 3210).
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
# Start Timer
then = time.time()
# Headers
headers = {"User-Agent": "Mozilla/5.0"}
# Set HTTPCode = 200 and Counter = 1
Code = 200
i = 1
scraped_data = []
while Code == 200:
    # Put url together
    url = "https://www.asos.com/women/jumpers-cardigans/cat/?cid=2637&page="
    url = url + str(i)
    # Request URL
    r = requests.get(url, allow_redirects=False, headers=headers) # No redirects to allow infinite page count
    data = r.text
    Code = r.status_code
    # Soup
    soup = BeautifulSoup(data, 'lxml')
    # For loop each product then scroll through title price, old price and description
    divs = soup.find_all('article', attrs={'class': '_2qG85dG'}) # want to cycle through each of these
    for div in divs:
        # Get Description
        Description = div.find('div', attrs={'class': '_3J74XsK'})
        Description = Description.text.strip()
        scraped_data.append(Description)
        # Fetch TitlePrice
        NewPrice = div.find('span', attrs={'data-auto-id':'productTilePrice'})
        NewPrice = NewPrice.text.strip("£")
        scraped_data.append(NewPrice)
        # Fetch OldPrice
        try:
            OldPrice = div.find('span', attrs={'data-auto-id': 'productTileSaleAmount'})
            OldPrice = OldPrice.text.strip("£")
            scraped_data.append(OldPrice)
        except AttributeError:
            OldPrice = ""
            scraped_data.append(OldPrice)
    print('page', i, 'scraped')
    # Print Array
    #array = {"Description": str(Description), "CurrentPrice": str(NewPrice), "Old Price": str(OldPrice)}
    #print(array)
    i = i + 1
else:
    i = i - 2
now = time.time()
pd.DataFrame(scraped_data, columns=["A", "B", "C"])
print('Parse complete with', i, 'pages' + ' in', now-then, 'seconds')
Right now your data is appended to a single flat list, based on an algorithm that I can describe like this:
Load the web page
Append to list value A
Append to list value B
Append to list value C
What this creates for each run through the dataset is:
[A1, B1, C1, A2, B2, C2]
There exists only one column with data, which is what pandas is telling you. To construct the dataframe properly, either you need to swap it into a format where you have, on each row entry, a tuple of three values (heh) like:
[
(A1, B1, C1),
(A2, B2, C2)
]
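With that shape, pandas can build the three-column frame directly (a small sketch with made-up values):
import pandas as pd

rows = [("Jumper one", "10.00", "15.00"),
        ("Jumper two", "12.50", "")]
df = pd.DataFrame(rows, columns=["Description", "CurrentPrice", "Old Price"])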
Or, in my preferred way because it's far more robust to coding errors and inconsistent lengths to your data: creating each row as a dictionary of columns. Thus,
rowdict_list = []
for row in data_source:
    a = extract_a()
    b = extract_b()
    c = extract_c()
    rowdict_list.append({'column_a': a, 'column_b': b, 'column_c': c})
And the data frame is constructed easily without having to explicitly specify columns in the constructor with df = pd.DataFrame(rowdict_list).
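Applied to the scraper in the question, the inner loop might then look like this (a sketch that reuses the question's selectors, which may of course change on the site):
rowdict_list = []
for div in divs:
    description = div.find('div', attrs={'class': '_3J74XsK'}).text.strip()
    new_price = div.find('span', attrs={'data-auto-id': 'productTilePrice'}).text.strip("£")
    old_tag = div.find('span', attrs={'data-auto-id': 'productTileSaleAmount'})
    old_price = old_tag.text.strip("£") if old_tag is not None else ""
    rowdict_list.append({'Description': description,
                         'CurrentPrice': new_price,
                         'Old Price': old_price})

df = pd.DataFrame(rowdict_list)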
You can create a DataFrame using the array dictionary.
You would want to set the values of the array dict to empty lists so that you can append the values from the webpage into the correct list. Also, move the array variable outside of the while loop.
array = {"Description": [], "CurrentPrice": [], "Old Price": []}
scraped_data = []
while Code == 200:
...
On the line where you were previously defining the array variable, you would then want to append the description, price and old price values like so.
array['Description'].append(str(Description))
array['CurrentPrice'].append(str(NewPrice))
array['Old Price'].append(str(OldPrice))
Then you can create a DataFrame using the array variable:
pd.DataFrame(array)
So the final solution would look something like this:
array = {"Description": [], "CurrentPrice": [], "Old Price": []}
scraped_data = []
while Code == 200:
...
# For loop
for div in divs:
# Get Description
Description = div.find('h3', attrs={'class': 'product__title'})
Description = Description.text.strip()
# Fetch TitlePrice
try:
NewPrice = div.find('div', attrs={'class': 'price product__price--current'})
NewPrice = NewPrice.text.strip()
except AttributeError:
NewPrice = div.find('p', attrs={'class': 'price price--reduced'})
NewPrice = NewPrice.text.strip()
# Fetch OldPrice
try:
OldPrice = div.find('p', attrs={'class': 'price price--previous'})
OldPrice = OldPrice.text.strip()
except AttributeError:
OldPrice = ""
array['Description'].append(str(Description))
array['CurrentPrice'].append(str(NewPrice))
array['Old Price'].append(str(OldPrice))
# Print Array
print(array)
df = pd.DataFrame(array)
i = i + 1
else:
i = i - 2
now = time.time()
print('Parse complete with', i, 'pages' + ' in', now - then, 'seconds')
Finally make sure you've imported pandas at the top of the module
import pandas as pd
When I am using the Object Detection API, I followed the instructions and everything was fine. However, when I began to test my picture, I met a problem: it seems that the function named
"visualize_boxes_and_labels_on_image_array" (on line 57) didn't work. Here is my source code:
import cv2
import numpy as np
import tensorflow as tf
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
class TOD(object):
    def __init__(self):
        self.PATH_TO_CKPT = '/home/xiyou/Desktop/ssd_training/result/frozen_inference_graph.pb'
        self.PATH_TO_LABELS = '/home/xiyou/Desktop/ssd_training/detection_for_smoke.pbtxt'
        self.NUM_CLASSES = 1
        self.detection_graph = self._load_model()
        self.category_index = self._load_label_map()

    def _load_model(self):
        detection_graph = tf.Graph()
        with detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(self.PATH_TO_CKPT, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')
        return detection_graph

    def _load_label_map(self):
        label_map = label_map_util.load_labelmap(self.PATH_TO_LABELS)
        categories = label_map_util.convert_label_map_to_categories(label_map,
                                                                     max_num_classes=self.NUM_CLASSES,
                                                                     use_display_name=True)
        category_index = label_map_util.create_category_index(categories)
        return category_index

    def detect(self, image):
        with self.detection_graph.as_default():
            with tf.Session(graph=self.detection_graph) as sess:
                # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                image_np_expanded = np.expand_dims(image, axis=0)
                image_tensor = self.detection_graph.get_tensor_by_name('image_tensor:0')
                boxes = self.detection_graph.get_tensor_by_name('detection_boxes:0')
                scores = self.detection_graph.get_tensor_by_name('detection_scores:0')
                classes = self.detection_graph.get_tensor_by_name('detection_classes:0')
                num_detections = self.detection_graph.get_tensor_by_name('num_detections:0')
                # Actual detection.
                (boxes, scores, classes, num_detections) = sess.run(
                    [boxes, scores, classes, num_detections],
                    feed_dict={image_tensor: image_np_expanded})
                print(boxes, scores, classes, num_detections)
                #print(np.squeeze(boxes))
                # Visualization of the results of a detection.
                ####### Here is the problem
                image1 = vis_util.visualize_boxes_and_labels_on_image_array(
                    image,  ####### Here is the problem
                    np.squeeze(boxes),
                    np.squeeze(classes).astype(np.int32),
                    np.squeeze(scores),
                    self.category_index,
                    use_normalized_coordinates=True,
                    line_thickness=50,
                )
                #print(np.squeeze(boxes),np.squeeze(classes))
                cv2.namedWindow("detection")
                cv2.imshow("detection", image1)
                cv2.waitKey(0)

if __name__ == '__main__':
    image = cv2.imread('/home/xiyou/Pictures/timg1.jpg')
    detecotr = TOD()
    detecotr.detect(image)
When I run this code, the image does show, but nothing changes: there is no detected area in the picture and no other information. The output image is the same as the input image. But when I was debugging, I found that variables such as scores, classes, and boxes do have values.
Can anyone help me? Thanks!!!
My TensorFlow version is 1.4.0, with CUDA 8.0 on Ubuntu 16.04.
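One thing that may be worth checking (an assumption on my part, not a confirmed diagnosis): visualize_boxes_and_labels_on_image_array only draws boxes whose score is above min_score_thresh, which defaults to 0.5, so if the model's confidences are low nothing is drawn even though boxes, classes and scores have values. A quick debugging sketch:
# Hypothetical check: print the best score and temporarily lower the drawing threshold.
print("max score:", np.squeeze(scores).max())

image1 = vis_util.visualize_boxes_and_labels_on_image_array(
    image,
    np.squeeze(boxes),
    np.squeeze(classes).astype(np.int32),
    np.squeeze(scores),
    self.category_index,
    use_normalized_coordinates=True,
    min_score_thresh=0.1,  # draw low-confidence boxes too, for debugging only
    line_thickness=8,
)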
I have written a Python script to stream tweets, using the tweepy module to do so. After streaming for around 3 minutes, I dump these tweets into a .json file. I then try to populate the location and text fields of the tweets into a pandas DataFrame. The text field gets populated, but not for every tweet (problem 1) in the .json file, and as far as the location field is concerned a KeyError (problem 2) is thrown. May I know what exactly is going wrong?
twitter_stream_dump.py
import time
import json
import pandas as pd
import re
#tweepy based modules
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
#initializing authentication credentials
consumer_key = ''
consumer_secret = ''
access_key = ''
access_secret = ''
#This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):
    def __init__(self, time_limit):
        self.start_time = time.time()
        self.limit = time_limit
        self.saveFile = open('requests.json', 'a')
        super(StdOutListener, self).__init__()

    def on_data(self, data):
        if ((time.time() - self.start_time) < self.limit):
            self.saveFile.write(data)
            self.saveFile.write('\n')
            return True
        else:
            self.saveFile.close()
            return False

    def on_error(self, status):
        print(status)

def getwords(string):
    return re.findall(r"[\w'#]+|[.,!?;]", string)
if __name__ == '__main__':
    #This handles Twitter authentication and the connection to the Twitter Streaming API
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    time_limit = input("Enter the time limit in minutes : ")
    time_limit *= 60
    stream = Stream(auth, listener=StdOutListener(time_limit))
    string = raw_input("Enter the list of keywords/hashtags to be compared : ")
    keyword_list = getwords(string)
    #This line filters Twitter Streams to capture data by the keywords: 'python', 'javascript', 'ruby'
    stream.filter(track=keyword_list)
    tweets_data_path = 'requests.json'
    tweets_data = []
    tweet_list = []
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)
            tweet_list.append(tweet)
        except:
            continue
    num_tweets_collected = len(tweet_list)
    #Creates a data frame structure
    tweet_dataframe = pd.DataFrame()
    text_dump = open('text_dump.txt', 'w')
    #Populating the location field of the data frame
    #tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
    tweet_dataframe['text'] = map(lambda tweet : tweet['text'], tweet_list)
    print(tweet_dataframe['text'])
Errors :
abhijeet-mohanty-2:Desktop SubrataMohanty$ python twitter_stream_dump.py
Enter the time limit in minutes : 3
Enter the list of keywords/hashtags to be compared : python ruby scala
Traceback (most recent call last):
File "twitter_stream_dump.py", line 81, in <module>
tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
File "twitter_stream_dump.py", line 81, in <lambda>
tweet_dataframe['location'] = map(lambda tweet : tweet['location'], tweet_list)
KeyError: 'location'
requests.json (My .json file)
https://drive.google.com/file/d/0B1p05OszaBkXLWFsQ2VmeWVjbDQ/view?usp=sharing
The location field is a user-defined value and will sometimes not be present.
That's why you're getting the KeyError.
Note that location is part of the "user profile" metadata that comes with a tweet. It's intended to describe a user's location (like their hometown), and not the geotagged location of a given tweet.
In case you're interested in geotags, first check a tweet to see if the geo_enabled field is true. If so, the geo, coordinates, and place fields may contain geotagged information.
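For example, a defensive way to pull coordinates out of each parsed tweet dict might look like this (a sketch; these fields are often null, so it falls back to None):
def get_coordinates(tweet):
    # "coordinates" is a GeoJSON point: {"type": "Point", "coordinates": [lon, lat]} or None
    coords = tweet.get('coordinates')
    if coords:
        return coords.get('coordinates')
    return None

coords_list = [get_coordinates(tweet) for tweet in tweet_list]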
As for missing text entries, I don't see the same issue when using the data you provided. It's possible the issue was caused by your try/except clause when reading in lines of data. Consider this approach:
for i, line in enumerate(tweets_file):
    if line.rstrip():
        tweet = json.loads(line)
        tweet_list.append(tweet)
num_tweets_collected = len(tweet_list)
texts = [tweet['text'] for tweet in tweet_list]
tweet_dataframe = pd.DataFrame(texts, columns=['text'])
Sample output:
print(tweet_dataframe.head())
# text
# 0 Tweets and python BFF <3 15121629.976126991
# 1 RT #zeroSteiner: Can now write more post modul...
# 2 •ruby• #MtvInstagLSelena #MtvColabTaylors
# 3 Ruby Necklace July Birthstone Jewelry Rosary...
# 4 #ossia I didn't see any such thing as Python. ...
A few quick summary stats show that no lines are missing, and no entries are null:
print("N tweets: {}".format(num_tweets_collected))
# N tweets: 286
print("N rows in dataframe: {}".format(tweet_dataframe.shape[0]))
# N rows in dataframe: 286
null_count = tweet_dataframe.text.isnull().sum()
print("Tweets with no text field extracted: {}".format(null_count))
# Tweets with no text field extracted: 0
I am trying to scrape a webpage, extract data, and store all the data in a CSV file. Before adding the ScrapeCallback class and calling it, everything works fine. However, after adding the new class it does not store any data in the CSV file except the headers. Can anyone help me figure out the problem?
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
import csv
import lxml.html
class ScrapeCallback:
    # extract and store all data in a csv file
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content())
            print row
            self.writer.writerow(row)
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1, scrape_callback=None):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = [seed_url]
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)
            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if have accessed this domain recently
        """
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                html = download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)

def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)

if __name__ == '__main__':
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    # link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2, scrape_callback=ScrapeCallback())
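Two things I would check here (assumptions on my part, not a confirmed diagnosis): whether re.search('/view/', url) actually matches the URLs being crawled, and whether the countries.csv file handle is ever flushed, since ScrapeCallback opens it without ever closing it. A minimal sketch of a callback that keeps the file handle and flushes after every row:
class ScrapeCallback:
    def __init__(self):
        self.csv_file = open('countries.csv', 'w')
        self.writer = csv.writer(self.csv_file)
        self.fields = ('area', 'population', 'iso', 'country', 'capital', 'continent', 'tld', 'currency_code', 'currency_name', 'phone', 'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('/view/', url):
            print 'scraping row for:', url  # confirm the callback is actually reached
            tree = lxml.html.fromstring(html)
            row = [tree.cssselect('table > tr#places_{}__row > td.w2p_fw'.format(field))[0].text_content()
                   for field in self.fields]
            self.writer.writerow(row)
            self.csv_file.flush()  # make sure rows reach disk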