Web Scraping: How to get info from dynamic pages? - html

I'm a newbie in web scraping. I know how to get data from HTML or from JSON, but there is one case where I can't work out how to do it. I would like to get the positions of the points and X's that you can see in the shot chart on this page.
http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart
How can I do that?

I'm fairly new as well, but learning as I go. It looks like this page is dynamic, so you'd need to use Selenium to load the page first, before grabbing the HTML with BeautifulSoup to get the x and y coordinates of the made and missed shots. So I gave it a shot and was able to get a dataframe with the x, y coords along with whether the shot was 'made' or 'missed'.
I plotted it afterwards just to check that it matched, and it appears to be flipped about the x-axis. I believe this is because when you plot a chart like this graphically, the top-left corner is your (0, 0), so your y coordinates are going to be opposite when you want to plot it. I could be wrong though.
Nonetheless, here's the code I used.
import pandas as pd
import bs4
from selenium import webdriver

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart')
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')

made_shots = soup.findAll("svg", {"class": "shot-hit icon icon-point clickable"})
missed_shots = soup.findAll("svg", {"class": "shot-miss icon icon-miss clickable"})

def get_coordiantes(element, label):
    results = pd.DataFrame()
    for point in element:
        x_point = float(point.get('x'))
        y_point = float(point.get('y'))
        marker = label
        temp_df = pd.DataFrame([[x_point, y_point, marker]], columns=['x', 'y', 'marker'])
        results = results.append(temp_df)
    return results

made_results = get_coordiantes(made_shots, 'made')
missed_results = get_coordiantes(missed_shots, 'missed')

results = made_results.append(missed_results)
results = results.reset_index(drop=True)
results['y'] = results['y'] * -1

driver.close()
gives this output:
In [6]: results.head(5)
Out[6]:
       x      y marker
0   33.0 -107.0   made
1  159.0 -160.0   made
2  143.0 -197.0   made
3   38.0 -113.0   made
4   65.0 -130.0   made
and when I plot it:
import seaborn as sns
import numpy as np
# Add a color column: green for made shots, red for missed ones
value = (results['marker'] == 'made')
results['color'] = np.where(value, "green", "red")
# plot
sns.regplot(data=results, x="x", y="y", fit_reg=False, scatter_kws={'facecolors':results['color']})
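As an aside, instead of negating y in the DataFrame you could keep the raw SVG coordinates and just flip the axis on the plotting side; a small alternative sketch, assuming you skip the results['y'] * -1 step above:
import matplotlib.pyplot as plt
import seaborn as sns

ax = sns.regplot(data=results, x="x", y="y", fit_reg=False,
                 scatter_kws={'facecolors': results['color']})
ax.invert_yaxis()   # the SVG origin is the top-left corner, so flipping the axis restores the court orientation
plt.show()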
ADDITIONAL: I'm sure there's a better, more efficient, cleaner way to code this up, but just doing it on the fly I came up with the following (one cleaner pattern is sketched after this answer). It should get you going. Feel free to dive into it and look at the HTML source to see how it's grabbing the different pieces of data. Have fun.
import pandas as pd
import bs4
from selenium import webdriver

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart')
html = driver.page_source
soup = bs4.BeautifulSoup(html, 'html.parser')

###############################################################################
shots = soup.findAll("g", {"class": "shot-item"})
results = pd.DataFrame()
for point in shots:
    hit = point.get('data-play-by-play-action-hit')
    action_id = point.get('data-play-by-play-action-id')
    period = point.get('data-play-by-play-action-period')
    player_id = point.get('data-play-by-play-action-player-id')
    team_id = point.get('data-play-by-play-action-team-id')
    x_point = float(point.find('svg').get('x'))
    y_point = float(point.find('svg').get('y'))
    temp_df = pd.DataFrame([[hit, action_id, period, player_id, team_id, x_point, y_point]],
                           columns=['hit', 'action_id', 'period', 'player_id', 'team_id', 'x', 'y'])
    results = results.append(temp_df)
results['y'] = results['y'] * -1
results = results.reset_index(drop=True)

###############################################################################
player_ids = soup.findAll('label', {"class": "item-label"})
players = pd.DataFrame()
for player in player_ids:
    player_id = player.find('input').get('data-play-by-play-action-player-id')
    if player_id is None:
        continue
    player_name = player.find('span').text
    temp_df = pd.DataFrame([[player_id, player_name]],
                           columns=['player_id', 'player_name'])
    players = players.append(temp_df)
players = players.reset_index(drop=True)

###############################################################################
team_ids = soup.findAll('div', {"class": "header-scores_desktop"})
teams_A = team_ids[0].find('div', {"class": "team-A"})
team_id_A = teams_A.find('img').get('src').rsplit('/')[-1]
team_name_A = teams_A.find('span').text
teams_B = team_ids[0].find('div', {"class": "team-B"})
team_id_B = teams_B.find('img').get('src').rsplit('/')[-1]
team_name_B = teams_B.find('span').text
teams = pd.DataFrame([[team_id_A, team_name_A], [team_id_B, team_name_B]],
                     columns=['team_id', 'team_name'])
teams = teams.reset_index(drop=True)

###############################################################################
actions = pd.DataFrame()
action_ids = soup.findAll('div', {"class": "overlay-wrapper"})
for action in action_ids:
    action_id = action.get('data-play-by-play-action-id')
    time_remaining = action.find('div').find('span', {'class': 'time'}).text
    full_name = action.find('div').find('span', {'class': 'athlete-name'}).text
    if not action.find('div').find('span', {'class': 'action-code'}):
        result_of_action = '+0'
    else:
        result_of_action = action.find('div').find('span', {'class': 'action-code'}).text
    action_description = action.find('div').find('span', {'class': 'action-description'}).text
    team_A_score = action.find('div').find('span', {'class': 'team-A'}).text
    team_B_score = action.find('div').find('span', {'class': 'team-B'}).text
    temp_df = pd.DataFrame([[action_id, time_remaining, full_name, result_of_action, team_A_score, team_B_score, action_description]],
                           columns=['action_id', 'time_remaining', 'full_name', 'result_of_action', team_name_A + '_score', team_name_B + '_score', 'action_description'])
    actions = actions.append(temp_df)
actions = actions.reset_index(drop=True)

###############################################################################
results = pd.merge(results, players, how='left', on='player_id')
results = pd.merge(results, teams, how='left', on='team_id')
results = pd.merge(results, actions, how='left', on='action_id')

driver.close()
And to clean it a bit, you can sort the rows so that they are in order, play-by-play from start to finish
results.sort_values(['period', 'time_remaining'], ascending=[True, False], inplace=True)
results = results.reset_index(drop=True)
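For what it's worth, here is a sketch of the cleaner pattern hinted at in the answer: wait explicitly for the shot chart to render instead of reading page_source straight away, and collect plain dicts into a list so the DataFrame is built once (repeated DataFrame.append is slow and was removed in pandas 2.x). The class names are the same ones used above and are an assumption about the page's markup at the time of writing.
import pandas as pd
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('http://www.fiba.basketball/euroleaguewomen/18-19/game/2410/Nadezhda-ZVVZ-USK-Praha#|tab=shot_chart')

# wait until at least one shot marker is present before parsing
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CLASS_NAME, "shot-item")))
soup = bs4.BeautifulSoup(driver.page_source, 'html.parser')
driver.close()

# collect plain dicts and build the DataFrame in one go
rows = []
for point in soup.find_all("g", {"class": "shot-item"}):
    svg = point.find('svg')
    rows.append({
        'hit': point.get('data-play-by-play-action-hit'),
        'player_id': point.get('data-play-by-play-action-player-id'),
        'x': float(svg.get('x')),
        'y': -float(svg.get('y')),   # flip y so the plot is not upside down
    })
results = pd.DataFrame(rows)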

Related

How do I apply NLP to the search engine I’m building using MySQL as data storage

I'm working on a search engine project for my country. I have the country's list of domains to crawl, and I have built a bot (written in Python) that crawls some of the sites at the moment. When crawling is successful, the crawler commits the crawled content to a MySQL database, so I already have data that people can search for on the remote MySQL server.
Now I want to add NLP to the search so that when a user enters a keyword in the search box, relevant results from the MySQL database are shown based on that keyword. I'm using Python 3.8 and NLTK for this project. I haven't done anything with NLP before; this is my first time, though I have read about it. I also want to ask whether a MySQL database is the right option for a search engine. If not, why not, and what should I use instead? I'm currently using MySQL because I'm much more familiar with it and I enjoy using it for data storage. I've been struggling with this since last December. What I really need is the right NLP approach for selecting relevant results from the MySQL database. I know NLP is difficult to implement, but I would appreciate it if you could at least try to help.
Here's the code. What I have done so far: I copied some of the code from Kaggle.com (here is the link: https://www.kaggle.com/amitkumarjaiswal/nlp-search-engine/notebook), but I still haven't been able to make it work for my own project.
import pandas as pd
import numpy as np
import string
import random
import nltk
import os
import re
#import nltk.corpus
import csv
#nltk.download('all')
#print(os.listdir(nltk.data.find("corpora")))
#pip install --upgrade nltk
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
#load 10k reuters news documents
len(reuters.fileids())
#view text from one document
reuters.raw(fileids=['test/14826'])[0:201]
exclude = set(string.punctuation)
alldocslist = []
for index, i in enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    text = ''.join(ch for ch in text if ch not in exclude)
    alldocslist.append(text)
print(alldocslist[1])
#tokenize words in all DOCS
plot_data = [[]] * len(alldocslist)
for doc in alldocslist:
    text = doc
    tokentext = word_tokenize(text)
    plot_data[index].append(tokentext)
print(plot_data[0][1])
# Navigation: first index gives all documents, second index gives specific document, third index gives words of that doc
plot_data[0][1][0:10]
#make all words lower case for all docs
for x in range(len(reuters.fileids())):
    lowers = [word.lower() for word in plot_data[0][x]]
    plot_data[0][x] = lowers
plot_data[0][1][0:10]
# remove stop words from all docs
stop_words = set(stopwords.words('english'))
for x in range(len(reuters.fileids())):
    filtered_sentence = [w for w in plot_data[0][x] if w not in stop_words]
    plot_data[0][x] = filtered_sentence
plot_data[0][1][0:10]
#stem words EXAMPLE (could try other stemmers / lemmatizers)
snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [snowball_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [ porter_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]
# Create inverse index which gives document number for each document and where word appears
#first we need to create a list of all words
l = plot_data[0]
flatten = [item for sublist in l for item in sublist]
words = flatten
wordsunique = set(words)
wordsunique = list(wordsunique)
import math
from textblob import TextBlob as tb
def tf(word, doc):
    return doc.count(word) / len(doc)

def n_containing(word, doclist):
    return sum(1 for doc in doclist if word in doc)

def idf(word, doclist):
    return math.log(len(doclist) / (0.01 + n_containing(word, doclist)))

def tfidf(word, doc, doclist):
    return tf(word, doc) * idf(word, doclist)
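# quick sanity check of the helpers above on two tiny token lists (illustrative values only):
toy_docs = [['crude', 'oil', 'price'], ['palm', 'oil']]
print(tfidf('palm', toy_docs[1], toy_docs))   # tf = 1/2, idf = log(2/1.01) ≈ 0.68, so ≈ 0.34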
# THIS ONE-TIME INDEXING IS THE MOST PROCESSOR-INTENSIVE STEP AND WILL TAKE TIME TO RUN (BUT ONLY NEEDS TO BE RUN ONCE)
plottest = plot_data[0][0:1000]
worddic = {}
for doc in plottest:
    for word in wordsunique:
        if word in doc:
            word = str(word)
            index = plottest.index(doc)
            positions = list(np.where(np.array(plottest[index]) == word)[0])
            idfs = tfidf(word, doc, plottest)
            try:
                worddic[word].append([index, positions, idfs])
            except:
                worddic[word] = []
                worddic[word].append([index, positions, idfs])
# the index creates a dict with each word as a KEY and a list of doc indexes, word positions, and tf-idf scores as VALUES
worddic['china']
# pickle (save) the dictionary to avoid re-calculating
np.save('worddic_1000.npy', worddic)
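# the saved index can be reloaded later without re-computing it; np.save pickles the dict
# into a 0-d object array, hence allow_pickle=True and .item() on the way back:
# worddic = np.load('worddic_1000.npy', allow_pickle=True).item()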
# create a word search which takes multiple words and finds documents that contain all of them, along with metrics for ranking:
## (1) Number of occurrences of search words
## (2) TF-IDF score for search words
## (3) Percentage of search terms
## (4) Word ordering score
## (5) Exact match bonus
from collections import Counter
def search(searchsentence):
    try:
        # split sentence into individual words
        searchsentence = searchsentence.lower()
        try:
            words = searchsentence.split(' ')
        except:
            words = list(words)
        enddic = {}
        idfdic = {}
        closedic = {}

        # remove words if not in worddic
        realwords = []
        for word in words:
            if word in list(worddic.keys()):
                realwords.append(word)
        words = realwords
        numwords = len(words)

        # make metric of number of occurrences of all words in each doc & largest total IDF
        for word in words:
            for indpos in worddic[word]:
                index = indpos[0]
                amount = len(indpos[1])
                idfscore = indpos[2]
                enddic[index] = amount
                idfdic[index] = idfscore
        fullcount_order = sorted(enddic.items(), key=lambda x: x[1], reverse=True)
        fullidf_order = sorted(idfdic.items(), key=lambda x: x[1], reverse=True)

        # make metric of what percentage of words appear in each doc
        combo = []
        alloptions = {k: worddic.get(k, None) for k in (words)}
        for worddex in list(alloptions.values()):
            for indexpos in worddex:
                for indexz in indexpos:
                    combo.append(indexz)
        comboindex = combo[::3]
        combocount = Counter(comboindex)
        for key in combocount:
            combocount[key] = combocount[key] / numwords
        combocount_order = sorted(combocount.items(), key=lambda x: x[1], reverse=True)

        # make metric for whether words appear in the same order as in the search
        if len(words) > 1:
            x = []
            y = []
            for record in [worddic[z] for z in words]:
                for index in record:
                    x.append(index[0])
            for i in x:
                if x.count(i) > 1:
                    y.append(i)
            y = list(set(y))

            closedic = {}
            for wordbig in [worddic[x] for x in words]:
                for record in wordbig:
                    if record[0] in y:
                        index = record[0]
                        positions = record[1]
                        try:
                            closedic[index].append(positions)
                        except:
                            closedic[index] = []
                            closedic[index].append(positions)

            x = 0
            fdic = {}
            for index in y:
                csum = []
                for seqlist in closedic[index]:
                    while x > 0:
                        secondlist = seqlist
                        x = 0
                        sol = [1 for i in firstlist if i + 1 in secondlist]
                        csum.append(sol)
                        fsum = [item for sublist in csum for item in sublist]
                        fsum = sum(fsum)
                        fdic[index] = fsum
                        fdic_order = sorted(fdic.items(), key=lambda x: x[1], reverse=True)
                    while x == 0:
                        firstlist = seqlist
                        x = x + 1
        else:
            fdic_order = 0

        # the metric above should be given a big boost if ALL terms are found together
        # could make another metric for words that are not next to each other but still close
        return (searchsentence, words, fullcount_order, combocount_order, fullidf_order, fdic_order)
    except:
        return ("")
search('indonesia crude palm oil')[1]
# 0 return will give back the search term, the rest will give back metrics (see above)
search('indonesia crude palm oil')[1][1:10]
# save metrics to dataframe for use in ranking and machine learning
result1 = search('china daily says what')
result2 = search('indonesia crude palm oil')
result3 = search('price of nickel')
result4 = search('north yemen sugar')
result5 = search('nippon steel')
result6 = search('China')
result7 = search('Gold')
result8 = search('trade')
df = pd.DataFrame([result1,result2,result3,result4,result5,result6,result7,result8])
df.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order']
df
# look to see if the top documents seem to make sense
alldocslist[1]
# create a simple (non-machine learning) rank and return function
def rank(term):
    results = search(term)

    # get metrics
    num_score = results[2]
    per_score = results[3]
    tfscore = results[4]
    order_score = results[5]

    final_candidates = []

    # rule 1: if high word-order score & 100% of terms present, put at top position
    try:
        first_candidates = []
        for candidates in order_score:
            if candidates[1] > 1:
                first_candidates.append(candidates[0])

        second_candidates = []
        for match_candidates in per_score:
            if match_candidates[1] == 1:
                second_candidates.append(match_candidates[0])
            if match_candidates[1] == 1 and match_candidates[0] in first_candidates:
                final_candidates.append(match_candidates[0])

        # rule 2: next add other word-order scores which are greater than 1
        t3_order = first_candidates[0:3]
        for each in t3_order:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)

        # rule 3: next add top tf-idf results
        final_candidates.insert(len(final_candidates), tfscore[0][0])
        final_candidates.insert(len(final_candidates), tfscore[1][0])

        # rule 4: next add other high percentage scores
        t3_per = second_candidates[0:3]
        for each in t3_per:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)

        # rule 5: next add any other top results for the metrics
        othertops = [num_score[0][0], per_score[0][0], tfscore[0][0], order_score[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)

    # unless a single term was searched, in which case just return the top results
    except:
        othertops = [num_score[0][0], num_score[1][0], num_score[2][0], per_score[0][0], tfscore[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)

    for index, results in enumerate(final_candidates):
        if index < 5:
            print("RESULT", index + 1, ":", alldocslist[results][0:100], "...")
# example of output
rank('indonesia palm oil')
# example of output
rank('china')
# Create pseudo-truth set using first 5 words
# Because I don't have a truth set I will generate a pseudo one by pulling terms from the documents - this is far from perfect
# as it may not approximate people's actual queries well, but it will serve to build the ML architecture
df_truth = pd.DataFrame()
for doc in plottest:
    first_five = doc[0:5]
    test_sentence = ' '.join(first_five)
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth = pd.concat([df_truth, df_temp])
df_truth['truth'] = range(0,len(plottest))
df_truth1 = pd.DataFrame()
seqlen = 3
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth1 = pd.concat([df_truth1, df_temp])
df_truth1['truth'] = range(0,len(plottest))
# create another pseudo-truth set using a different random 4-word sequence from docs
df_truth2 = pd.DataFrame()
seqlen = 4
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth2 = pd.concat([df_truth2, df_temp])
df_truth2['truth'] = range(0,len(plottest))
# create another pseudo-truth set using a different random 2-word sequence from docs
df_truth3 = pd.DataFrame()
seqlen = 2
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth3 = pd.concat([df_truth3, df_temp])
df_truth3['truth'] = range(0,len(plottest))
# combine the truth sets and save to disk
truth_set = pd.concat([df_truth,df_truth1,df_truth2,df_truth3])
truth_set.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order','truth']
truth_set.to_csv("truth_set_final.csv")
truth_set[0:10]
truth_set
test_set = truth_set[0:3]
test_set
# convert to long format for ML
# WARNING AGAIN THIS IS A SLOW PROCESS DUE TO RAM ILOC - COULD BE OPTIMISED FOR FASTER PERFORMANCE
# BUG When min(maxnum, len(truth_set) <- is a int not a list because of very short variable length)
# row is row
# column is variable
# i is the result
final_set = pd.DataFrame()
test_set = truth_set[1:100]
maxnum = 5
for row in range(0, len(test_set.index)):
    test_set = truth_set[1:100]
    for col in range(2, 6):
        for i in range(0, min(maxnum, len(truth_set.iloc[row][col]))):
            x = pd.DataFrame([truth_set.iloc[row][col][i]])
            x['truth'] = truth_set.iloc[row]['truth']
            x.columns = [(str(truth_set.columns[col]), "index", i), (str(truth_set.columns[col]), "score", i), 'truth']
            test_set = test_set.merge(x, on='truth')
    final_set = pd.concat([final_set, test_set])
final_set.head()
final_set.to_csv("ML_set_100.csv")
final_set2 = final_set.drop(['actual_words_searched','num_occur','percentage_of_terms','search term','td-idf','word_order'], 1)
final_set2.to_csv("ML_set_100_3.csv")
final_set2.head()
final_set3 = final_set2
final_set3[0:10]
Obviously, the code above isn't returning searched keywords from the MySQL database. I hope you understand what I mean. Thank you very much!
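One way to see the missing piece: the Kaggle notebook indexes the Reuters corpus, while your documents live in MySQL, so the corpus-loading step just has to be replaced by a query. Below is a rough, hedged sketch of that substitution using scikit-learn's TfidfVectorizer instead of the hand-rolled index; the pages table and its url/content columns are hypothetical placeholders for whatever schema your crawler writes to.
import mysql.connector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# hypothetical connection and schema -- replace with whatever your crawler actually writes to
conn = mysql.connector.connect(host="localhost", user="user", password="pass", database="search")
cur = conn.cursor()
cur.execute("SELECT url, content FROM pages")
urls, docs = zip(*cur.fetchall())

# build a TF-IDF index over the crawled documents once, up front
vectorizer = TfidfVectorizer(stop_words="english")
doc_matrix = vectorizer.fit_transform(docs)

def search(query, top_n=10):
    # rank the stored pages by cosine similarity between the query and each document
    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, doc_matrix).ravel()
    best = scores.argsort()[::-1][:top_n]
    return [(urls[i], float(scores[i])) for i in best if scores[i] > 0]

print(search("keyword the user typed"))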

Why do I have to declare the list inside the loop?

I just wrote the code below, and at first it wasn't working the way it's supposed to because I declared the list (ltag) outside the loop. I just want to know what the difference is and why it didn't work in the first case.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter the URL mboy :')
pos = input("Enter position: ")
n = int(pos) - 1
count = input("Enter count: ")
c = int(count) + 1
times = 0

while times < c:
    ltag = list()
    print('Retrieving: ', url)
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    tags = soup('a')
    for i in range(c):
        for tag in tags:
            ltag.append(tag)
    url = ltag[n].get('href', None)
    times = times + 1
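A stripped-down sketch of the difference, using made-up tag lists rather than the real pages: when ltag is re-created inside the loop, index n always points into the current page's tags; when it is created once outside, the earlier tags pile up and the same index keeps returning a link from the first page.
n = 1
pages = [['a1', 'b1', 'c1'], ['a2', 'b2', 'c2']]   # pretend these are the <a> tags of two pages

# list re-created inside the loop: index n refers to the current page each time
for tags in pages:
    ltag = list()
    ltag.extend(tags)
    print(ltag[n])      # -> b1, then b2

# list created once outside: old tags accumulate, so index n stays stuck on page 1
ltag = list()
for tags in pages:
    ltag.extend(tags)
    print(ltag[n])      # -> b1, then b1 again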

getting the wrong text from web scrape with beautifulsoup

I'm getting the wrong text when I scrape this url:
http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018
this is what I have
from requests import get
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import pandas as pd

# Define year
year_number = 2018

# Define the URL
i = range(0, 1)
names = []
metascores = []
userscores = []
userscoresNew = []
release_dates = []
release_datesNew = []
publishers = []
ratings = []
genres = []

for element in i:
    url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=" + format(year_number)
    print(url)
    year_number -= 1

    # not sure about this but it works (I was getting blocked by something and this is the way I found around it)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    web_byte = urlopen(req).read()
    webpage = web_byte.decode('utf-8')

    # this grabs all the text from the page
    html_soup = BeautifulSoup(webpage, 'html5lib')

    # this is for selecting all the games from 1 to 100 (the list of them)
    game_names = html_soup.find_all("div", class_="main_stats")
    game_metas = html_soup.find_all("a", class_="basic_stat product_score")
    game_users = html_soup.find_all("li", class_='stat product_avguserscore')
    game_releases = html_soup.find_all("ul", class_='more_stats')
    # game_publishers = html_soup.find_all("ul", class_='more_stats')
    # game_ratings = html_soup.find_all("ul", class_='more_stats')
    # game_genres = html_soup.find_all("ul", class_='more_stats')

    # Extract data from each game
    for games in game_names:
        name = games.find()
        names.append(name.text.strip())

    for games2 in game_metas:
        metascore = games2.find()
        metascores.append(metascore.text.strip())

    for games3 in game_releases:
        release_date = games3.find()
        release_dates.append(release_date.text.strip())

    for games4 in game_users:
        game_user = games4.find()
        userscores.append(game_user.text.strip())

    # print(name)
    # print(metascore)
    # print(userscore)

    # for i in userscores:
    #     temp = str(i)
    #     temp2 = temp.replace("User:\n ", "")
    #     userscoresNew.append(temp2)

    for x in release_dates:
        temp = str(x)
        temp2 = temp.replace("Release Date:\n ", "")
        release_datesNew.append(temp2)

# df = pd.DataFrame({'Games:': names,
#                    'Metascore:': metascores,
#                    'Userscore:': userscoresNew})
# df.to_csv("metacritic scrape.csv")
The above is looking for the user score, but I get the text "User Score:" repeated 100 times, when what I want is the data in the next set of tags. However, when I try to change the variable above to:
game_users = html_soup.find_all("span", class_='data textscore textscore_favorable')
I get an error when I run the code:
AttributeError: 'NoneType' object has no attribute 'text'
Also, I don't think the second option is a good approach, because when the user score falls below a certain level the class changes in the HTML (from "data textscore textscore_favorable" to "data textscore textscore_mixed").
Any help would be appreciated.
FYI, I'm modifying code I have already written, but grabbing more details from a more detailed view.
This should help.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"
html = requests.get(url, headers=headers)
html_soup = BeautifulSoup(html.text, "html.parser")
game_users = html_soup.find_all("li", class_='stat product_avguserscore')
for i in game_users:
    userScore = i.find('span', class_="data textscore textscore_favorable")
    if userScore:
        print(userScore.text)
Output:
7.6
7.8
8.2
7.8
8.1
8.5
7.5
7.5
....
Use html_soup.find_all("li", class_='stat product_avguserscore') to get score
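To cope with the class changing as the score drops (textscore_favorable vs textscore_mixed, as noted in the question), one option is to match on the stable "textscore" part of the class and tolerate rows where the span is missing; a sketch building on the answer above:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}
url = "http://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?view=detailed&sort=desc&year_selected=2018"
html_soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

userscores = []
for li in html_soup.find_all("li", class_="stat product_avguserscore"):
    # the callable is tested against each individual class, so any textscore_* variant matches
    span = li.find("span", class_=lambda c: c and c.startswith("textscore"))
    userscores.append(span.text.strip() if span else None)   # None keeps the list aligned for 'tbd' rows

print(userscores[:10])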

The Tensorflow Object_detection API 's visualize don't work

When I am using the Object Detection API, I followed the instructions and everything was fine. However, when I began to test my picture, I ran into a problem: it seems that the function named visualize_boxes_and_labels_on_image_array (at line 57) didn't work. Here is my source code:
import cv2
import numpy as np
import tensorflow as tf
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util

class TOD(object):
    def __init__(self):
        self.PATH_TO_CKPT = '/home/xiyou/Desktop/ssd_training/result/frozen_inference_graph.pb'
        self.PATH_TO_LABELS = '/home/xiyou/Desktop/ssd_training/detection_for_smoke.pbtxt'
        self.NUM_CLASSES = 1
        self.detection_graph = self._load_model()
        self.category_index = self._load_label_map()

    def _load_model(self):
        detection_graph = tf.Graph()
        with detection_graph.as_default():
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(self.PATH_TO_CKPT, 'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
                tf.import_graph_def(od_graph_def, name='')
        return detection_graph

    def _load_label_map(self):
        label_map = label_map_util.load_labelmap(self.PATH_TO_LABELS)
        categories = label_map_util.convert_label_map_to_categories(label_map,
                                                                     max_num_classes=self.NUM_CLASSES,
                                                                     use_display_name=True)
        category_index = label_map_util.create_category_index(categories)
        return category_index

    def detect(self, image):
        with self.detection_graph.as_default():
            with tf.Session(graph=self.detection_graph) as sess:
                # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                image_np_expanded = np.expand_dims(image, axis=0)
                image_tensor = self.detection_graph.get_tensor_by_name('image_tensor:0')
                boxes = self.detection_graph.get_tensor_by_name('detection_boxes:0')
                scores = self.detection_graph.get_tensor_by_name('detection_scores:0')
                classes = self.detection_graph.get_tensor_by_name('detection_classes:0')
                num_detections = self.detection_graph.get_tensor_by_name('num_detections:0')

                # Actual detection.
                (boxes, scores, classes, num_detections) = sess.run(
                    [boxes, scores, classes, num_detections],
                    feed_dict={image_tensor: image_np_expanded})
                print(boxes, scores, classes, num_detections)
                # print(np.squeeze(boxes))

                # Visualization of the results of a detection.
                ####### Here is the problem
                image1 = vis_util.visualize_boxes_and_labels_on_image_array(
                    image,  ####### Here is the problem
                    np.squeeze(boxes),
                    np.squeeze(classes).astype(np.int32),
                    np.squeeze(scores),
                    self.category_index,
                    use_normalized_coordinates=True,
                    line_thickness=50,
                )
                # print(np.squeeze(boxes), np.squeeze(classes))

                cv2.namedWindow("detection")
                cv2.imshow("detection", image1)
                cv2.waitKey(0)

if __name__ == '__main__':
    image = cv2.imread('/home/xiyou/Pictures/timg1.jpg')
    detecotr = TOD()
    detecotr.detect(image)
When I run this code the image does show, but nothing changes: there is no detected area in the picture and no other information. The input picture is the same as the output image. But when I debugged, I found that variables such as scores, classes and boxes do have values.
Can anyone help me? Thanks!!!
My TensorFlow version is 1.4.0, with CUDA 8.0 on Ubuntu 16.04.
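Not a definitive fix, but one hedged guess worth checking: visualize_boxes_and_labels_on_image_array only draws detections whose score exceeds its min_score_thresh argument (0.5 by default), so if the model's scores are low the returned image looks unchanged even though boxes, scores and classes contain values. Inside detect(), the call could be tried with a lowered threshold to confirm this:
# sketch: same call as in detect() above, but with a low threshold so weak detections are drawn too
image1 = vis_util.visualize_boxes_and_labels_on_image_array(
    image,
    np.squeeze(boxes),
    np.squeeze(classes).astype(np.int32),
    np.squeeze(scores),
    self.category_index,
    use_normalized_coordinates=True,
    min_score_thresh=0.1,   # default is 0.5; raise it again once detections look reasonable
    line_thickness=4)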

Can I export a tensorflow summary to CSV?

Is there a way to extract scalar summaries to CSV (preferably from within tensorboard) from tfevents files?
Example code
The following code generates tfevent files in a summary_dir within the same directory. Suppose you let it run and you find something interesting. You want to get the raw data for further investigation. How would you do that?
#!/usr/bin/env python
"""A very simple MNIST classifier."""
import argparse
import sys

from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

ce_with_logits = tf.nn.softmax_cross_entropy_with_logits

FLAGS = None


def inference(x):
    """
    Build the inference graph.

    Parameters
    ----------
    x : placeholder

    Returns
    -------
    Output tensor with the computed logits.
    """
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b
    return y


def loss(logits, labels):
    """
    Calculate the loss from the logits and the labels.

    Parameters
    ----------
    logits : Logits tensor, float - [batch_size, NUM_CLASSES].
    labels : Labels tensor, int32 - [batch_size]
    """
    cross_entropy = tf.reduce_mean(ce_with_logits(labels=labels,
                                                  logits=logits))
    return cross_entropy


def training(loss, learning_rate=0.5):
    """
    Set up the training Ops.

    Parameters
    ----------
    loss : Loss tensor, from loss().
    learning_rate : The learning rate to use for gradient descent.

    Returns
    -------
    train_op: The Op for training.
    """
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_step = optimizer.minimize(loss)
    return train_step


def main(_):
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y = inference(x)

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])
    loss_ = loss(logits=y, labels=y_)
    train_step = training(loss_)

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    with tf.name_scope('accuracy'):
        tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()

    sess = tf.InteractiveSession()
    train_writer = tf.summary.FileWriter('summary_dir/train', sess.graph)
    test_writer = tf.summary.FileWriter('summary_dir/test', sess.graph)
    tf.global_variables_initializer().run()

    for train_step_i in range(100000):
        if train_step_i % 100 == 0:
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.test.images,
                                               y_: mnist.test.labels})
            test_writer.add_summary(summary, train_step_i)
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict={x: mnist.train.images,
                                               y_: mnist.train.labels})
            train_writer.add_summary(summary, train_step_i)
        batch_xs, batch_ys = mnist.train.next_batch(100)
        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                        y_: mnist.test.labels}))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='/tmp/tensorflow/mnist/input_data',
                        help='Directory for storing input data')
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
While the answer here is, as requested, within TensorBoard, it only lets you download a CSV for a single run of a single tag.
If you have, for example, 10 tags and 20 runs (which is not much at all), you would need to do the above step 200 times (that alone will probably take you more than an hour).
If you then, for some reason, wanted to actually do something with the data for all runs of a single tag, you would need to write some awkward CSV accumulation script or copy everything by hand (which will probably cost you more than a day).
Therefore I would like to add a solution that extracts a CSV file for every tag, with all runs contained. Column headers are the run path names and row indices are the run step numbers.
import os

import numpy as np
import pandas as pd

from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):
    summary_iterators = [EventAccumulator(os.path.join(dpath, dname)).Reload() for dname in os.listdir(dpath)]

    tags = summary_iterators[0].Tags()['scalars']

    for it in summary_iterators:
        assert it.Tags()['scalars'] == tags

    out = defaultdict(list)
    steps = []

    for tag in tags:
        steps = [e.step for e in summary_iterators[0].Scalars(tag)]

        for events in zip(*[acc.Scalars(tag) for acc in summary_iterators]):
            assert len(set(e.step for e in events)) == 1

            out[tag].append([e.value for e in events])

    return out, steps


def to_csv(dpath):
    dirs = os.listdir(dpath)

    d, steps = tabulate_events(dpath)
    tags, values = zip(*d.items())
    np_values = np.array(values)

    for index, tag in enumerate(tags):
        df = pd.DataFrame(np_values[index], index=steps, columns=dirs)
        df.to_csv(get_file_path(dpath, tag))


def get_file_path(dpath, tag):
    file_name = tag.replace("/", "_") + '.csv'
    folder_path = os.path.join(dpath, 'csv')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    return os.path.join(folder_path, file_name)


if __name__ == '__main__':
    path = "path_to_your_summaries"
    to_csv(path)
My solution builds upon: https://stackoverflow.com/a/48774926/2230045
EDIT:
I created a more sophisticated version and released it on GitHub: https://github.com/Spenhouet/tensorboard-aggregator
This version aggregates multiple tensorboard runs and is able to save the aggregates to a new tensorboard summary or as a .csv file.
Just check the "Data download links" option on the upper-left in TensorBoard, and then click on the "CSV" button that will appear under your scalar summary.
Here is my solution, which builds on the previous solutions but can scale up.
import os

import numpy as np
import pandas as pd

from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def tabulate_events(dpath):
    final_out = {}
    for dname in os.listdir(dpath):
        print(f"Converting run {dname}", end="")
        ea = EventAccumulator(os.path.join(dpath, dname)).Reload()
        tags = ea.Tags()['scalars']

        out = {}

        for tag in tags:
            tag_values = []
            wall_time = []
            steps = []

            for event in ea.Scalars(tag):
                tag_values.append(event.value)
                wall_time.append(event.wall_time)
                steps.append(event.step)

            out[tag] = pd.DataFrame(data=dict(zip(steps, np.array([tag_values, wall_time]).transpose())),
                                    columns=steps, index=['value', 'wall_time'])

        if len(tags) > 0:
            df = pd.concat(out.values(), keys=out.keys())
            df.to_csv(f'{dname}.csv')
            print("- Done")
        else:
            print('- No scalars to write')

        final_out[dname] = df

    return final_out


if __name__ == '__main__':
    path = "your/path/here"
    steps = tabulate_events(path)
    pd.concat(steps.values(), keys=steps.keys()).to_csv('all_result.csv')
Very minimal example:
import pandas as pd
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
events = event_accumulator.Scalars("train_loss")
x = [x.step for x in events]
y = [x.value for x in events]
df = pd.DataFrame({"step": x, "train_loss": y})
df.to_csv("train_loss.csv")
print(df)
step train_loss
0 0 700.491516
1 1 163.593246
2 2 146.365448
3 3 153.830215
...
Plotting loss vs epochs example:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
log_dir = "lightning_logs/version_1"
y_key = "val_loss"
event_accumulator = EventAccumulator(log_dir)
event_accumulator.Reload()
steps = {x.step for x in event_accumulator.Scalars("epoch")}
x = list(range(len(steps)))
y = [x.value for x in event_accumulator.Scalars(y_key) if x.step in steps]
df = pd.DataFrame({"epoch": x, y_key: y})
df.to_csv(f"{y_key}.csv")
fig, ax = plt.subplots()
sns.lineplot(data=df, x="epoch", y=y_key)
fig.savefig("plot.png", dpi=300)
Just to add to @Spen's answer, in case you want to export the data when you have varying numbers of steps. This will make one large CSV file. You might need to change the keys around for it to work for you.
import os
import glob

import numpy as np
import pandas as pd

from collections import defaultdict
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

listOutput = glob.glob("*/")

listDF = []

for tb_output_folder in listOutput:
    print(tb_output_folder)
    x = EventAccumulator(path=tb_output_folder)
    x.Reload()
    x.FirstEventTimestamp()
    keys = ['loss', 'mean_absolute_error', 'val_loss', 'val_mean_absolute_error']

    listValues = {}

    steps = [e.step for e in x.Scalars(keys[0])]
    wall_time = [e.wall_time for e in x.Scalars(keys[0])]
    index = [e.index for e in x.Scalars(keys[0])]
    count = [e.count for e in x.Scalars(keys[0])]
    n_steps = len(steps)
    listRun = [tb_output_folder] * n_steps
    printOutDict = {}

    data = np.zeros((n_steps, len(keys)))
    for i in range(len(keys)):
        data[:, i] = [e.value for e in x.Scalars(keys[i])]

    printOutDict = {keys[0]: data[:, 0], keys[1]: data[:, 1], keys[2]: data[:, 2], keys[3]: data[:, 3]}
    printOutDict['Name'] = listRun

    DF = pd.DataFrame(data=printOutDict)

    listDF.append(DF)

df = pd.concat(listDF)
df.to_csv('Output.csv')
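If you are unsure which scalar tags a run actually contains (the hard-coded keys list above assumes Keras-style metric names), the EventAccumulator can list them for you; a small sketch:
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

acc = EventAccumulator("path/to/one/run")   # point this at a single run directory
acc.Reload()
print(acc.Tags()['scalars'])   # e.g. ['loss', 'val_loss', ...]; use these as the keys list above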