How does one get widget values with a button in IPython widgets

I have a function createWidgets whose purpose is to take a list of strings and create a list of containers, one per string -> 1 container = a textbox and a checkbox. Each container is then put into a large container.
What I am trying to do is append a button to the container whose on_click callback takes all the checkboxes set to True and puts the corresponding (modified) strings into a dataframe.
widgelist = e.options
txtBox_type = 'text_widget' # Define if Area box or regular txtbox
bigContainer = createWidgets(widgelist, txtBox_type)
Function
def createWidgets(widgelist, txtBox_type):
    containerList = []
    i = 0
    for k in widgelist:
        ## Build Container widgets
        chBox_Widget = widgets.CheckboxWidget(description=str(i), value=False)
        if txtBox_type == 'textA_widget':  # Check whether txtBox should be an area txt box or not.
            txt_Widget = widgets.TextareaWidget(description=str(i), value=k)
        else:
            txt_Widget = widgets.TextWidget(description=str(i), value=k)
        container = widgets.ContainerWidget()
        container.children = [chBox_Widget, txt_Widget]
        containerList.append(container)
        i += 1
    button = widgets.ButtonWidget(description='Add')
    bigContainer = widgets.ContainerWidget()
    bigContainer.children = containerList
    return bigContainer
I have gone to many websites and spent many days on this; help is very much appreciated.

As near as I can interpret the question, the code below should provide an answer:
import IPython.html.widgets as widgets
from IPython.display import display, clear_output
import pandas as pd

df = pd.DataFrame(columns=['Thing'])

def createWidgets(widgelist):
    ## Each CheckboxWidget and TextWidget pair is enclosed in a subwidget. We use a
    ## list comprehension to construct a list of these subwidgets.
    containerList = [
        widgets.ContainerWidget(children=(widgets.CheckboxWidget(description=k),
                                          widgets.TextWidget(value=k)))
        for k in widgelist]
    bigContainer = widgets.ContainerWidget(children=containerList)

    ## To arrange the CheckboxWidget in a row with the TextWidget, we have to
    ## first display them, then remove_class('vbox') and add_class('hbox'). This
    ## bit of awkwardness in the IPython version 2.x notebook will hopefully
    ## be fixed in version 3.x. Displaying bigContainer also displays its children.
    display(bigContainer)
    for c in containerList:
        c.remove_class('vbox')
        c.add_class('hbox')
    return bigContainer

widgelist = ['ThingA', 'ThingB', 'ThingC', 'ThingD']
bigContainer = createWidgets(widgelist)
## Callback for button.on_click.
def add_to_dataframe(a):
    # The children of bigContainer are also containers,
    # each with first child a CheckboxWidget and second
    # child a TextWidget. We iterate through them and
    # if checked, add the text to the dataframe df as
    # an additional row.
    for c in bigContainer.children:
        if c.children[0].value:
            df.loc[len(df)+1] = (c.children[1].value,)
            display(df)
    clear_output()
    display(df)

button = widgets.ButtonWidget(description='Add')
button.on_click(add_to_dataframe)
display(button)
Here is a screen clip of the widget area and output after adding a few rows to the dataframe.
I would have designed the code to do this somewhat differently, but I tried to stay
close to your code organization.

This is the updated version for IPython 3 on Jupyter Notebook 4.
Just rename:
widgets.ContainerWidget -> widgets.Box
widgets.CheckboxWidget -> widgets.Checkbox
widgets.TextWidget -> widgets.Text
Reference: https://ipython.org/ipython-doc/3/whatsnew/version3_widget_migration.html
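For reference, here is a minimal sketch of what the createWidgets helper from the answer above might look like after these renames. The ipywidgets import name and the use of HBox for the row layout (replacing the remove_class('vbox')/add_class('hbox') calls, which no longer exist) are assumptions for illustration, not part of the original answer:
import ipywidgets as widgets
from IPython.display import display

def createWidgets(widgelist):
    # One row per string: a Checkbox next to a Text box, laid out horizontally.
    containerList = [
        widgets.HBox(children=(widgets.Checkbox(description=k),
                               widgets.Text(value=k)))
        for k in widgelist]
    bigContainer = widgets.Box(children=containerList)
    display(bigContainer)
    return bigContainer

bigContainer = createWidgets(['ThingA', 'ThingB', 'ThingC', 'ThingD'])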

Related

Loading multiple csvs with mixed dtypes in tensorflow for training

I have hundreds of CSVs in a directory, with headers. I am trying to create a feedforward NN for regression using TensorFlow.
What's the best way to import these CSVs and train a model with tf?
Could you also check whether my preprocessing is done right?
Note: my features have mixed datatypes (int, float, string); my target is a float.
I cannot concatenate the CSVs and import them with pandas: my data size is >50 GB, so I cannot load it in memory and have to read it iteratively from disk.
Directory Path:
./data/train/ -> 100s of csvs
./data/test -> 100s of csvs
./data/valid -> 100s of csvs
Code:
Methodology:
Create a generator
Use the Dataset API to load the data
Preprocess the data (embedding, one-hot, etc.)
Train/fit the model
But in the generator I was only able to give output formats where the inputs/outputs are homogeneous dtypes.
Code:
import glob
import json
import math
import numpy as np
import pandas as pd
import tensorflow as tf

def data_generator(file_list, batch_size=2):
    i = 0
    while True:
        if i * batch_size >= len(file_list):  # This check is used to run the generator indefinitely.
            i = 0
            np.random.shuffle(file_list)
        else:
            file_chunk = file_list[i * batch_size:(i + 1) * batch_size]
            data = []
            labels = []
            for file in file_chunk:
                temp = pd.read_csv(open(file, 'r'))  # Change this line to read any other type of file
                label = temp.pop('ACTUAL_BOXES')     # pop the target column (renamed so it no longer shadows the labels list)
                data.append(temp.values)             # Convert column data to matrix like data with one channel
                labels.append(label)
            data = np.asarray(data)
            labels = np.asarray(labels)
            yield data, labels  # Here data will be mixed-dtype arrays & labels will be a float dtype array
            i = i + 1
#getting list of files inside the directory
train_file_list = np.sort(glob.glob('././data/train/*.csv'))
test_file_list = np.sort(glob.glob('././data/test/*.csv'))
val_file_list = np.sort(glob.glob('././data/val/*.csv'))
train_dataset = tf.data.Dataset.from_generator(
    data_generator, args=[train_file_list, 2],
    output_types=(tf.float32, tf.float32))  # This is where I am stuck
# My sample data and labels will look like this:
# data = ['a', 'b', 1, 2, 3.14, 2]  # Mixed dtypes
# labels = [1.0]                    # float

val_dataset = tf.data.Dataset.from_generator(
    data_generator, args=[val_file_list, 2],
    output_types=(tf.float32, tf.float32))  # This is where I am stuck
# Pre processing Part:
def encode_inputs(EMBEDDING_FEATURES, INDICATOR_FEATURES):
    '''Function for encoding the features'''
    encoded_features = []
    for feature_name in EMBEDDING_FEATURES:
        # Getting unique vocab list
        vocabulary = np.array(list(flatten(vocab_list[feature_name])))
        # categorical columns using the lists created above:
        cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # create an embedding from the categorical column:
        cat_emb = tf.feature_column.embedding_column(cat_col, 8)  # ,dimension=embedding_dims
        # add the embedding to the list of feature columns
        encoded_features.append(cat_emb)
    for feature_name in INDICATOR_FEATURES:
        # Getting unique vocab list
        vocabulary = list(flatten(vocab_list[feature_name]))
        # indicator columns using the lists created above:
        ind_col = tf.feature_column.categorical_column_with_vocabulary_list(
            feature_name, vocabulary)
        # create a one-hot indicator column from the categorical column:
        cat_one_hot = tf.feature_column.indicator_column(ind_col)
        # add the indicator column to the list of feature columns
        encoded_features.append(cat_one_hot)
    # create the input layer for the model
    feature_layer = tf.keras.layers.DenseFeatures(encoded_features)
    return feature_layer
# Opening JSON file that contains vocab list for str cols
f = open('./vocab_list.json') # File that contains the unique values of each feature
vocab_list = json.load(f)
features_layer = encode_inputs(EMBEDDING_FEATURES,INDICATOR_FEATURES)
# Model Part
model = tf.keras.models.Sequential([
    features_layer,
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1)
])
m_loss = tf.keras.losses.mean_squared_error
m_optimizer = tf.keras.optimizers.SGD(lr=1e-3)
batch_size = 32
model.compile(loss=m_loss, optimizer=m_optimizer, metrics=['accuracy'])
model.fit(train_dataset, epochs=10, validation_data=val_dataset)
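For context on the sticking point above, here is a minimal sketch (not the asker's code; the column names are made up) of how tf.data.Dataset.from_generator can be told about mixed dtypes via output_signature instead of a single pair of output_types (available in TF 2.4+). String columns are yielded as tf.string tensors and converted later by feature columns or preprocessing layers, while numeric columns and the target stay float32:
import tensorflow as tf

def gen():
    # One hypothetical batch: two string features, one numeric feature, two float labels.
    yield (
        {
            "make":  ["a", "b"],       # string feature
            "model": ["x", "y"],       # string feature
            "price": [1.0, 3.14],      # numeric feature
        },
        [1.0, 2.0],                    # float labels
    )

dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        {
            "make":  tf.TensorSpec(shape=(None,), dtype=tf.string),
            "model": tf.TensorSpec(shape=(None,), dtype=tf.string),
            "price": tf.TensorSpec(shape=(None,), dtype=tf.float32),
        },
        tf.TensorSpec(shape=(None,), dtype=tf.float32),
    ),
)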

How do I apply NLP to the search engine I’m building using MySQL as data storage

I’m working on a search engine project for my country. I have the country’s list of domains for the sites to crawl, and I have built a bot (written in Python) that crawls some of the sites at the moment. When crawling succeeds, the crawler commits the crawled content to a MySQL database. So I already have data on a remote MySQL server that people can search.
Now, I want to implement NLP in the search so that when a user enters a keyword in the search box, relevant results from the MySQL database are shown to the user based on that keyword. I’m using Python 3.8 and NLTK for this project. I haven’t done anything with NLP before; this is my first time, though I have read about it. I also want to ask whether using a MySQL database is the right option for the search engine. If not, why not, and what should I use instead? I’m currently using MySQL because I’m much more familiar with it and I enjoy using it for data storage. I’ve been struggling with this since last December. What I really need is the right NLP approach for selecting relevant results from the MySQL database. I know that NLP is difficult to implement, but I will appreciate it if you can at least try to help out.
Here’s the code:
What I have done so far: I copied some of the code from Kaggle.com (here is the link: https://www.kaggle.com/amitkumarjaiswal/nlp-search-engine/notebook), but I still haven’t been able to make it work for my own project.
import pandas as pd
import numpy as np
import string
import random
import nltk
import os
import re
#import nltk.corpus
import csv
#nltk.download('all')
#print(os.listdir(nltk.data.find("corpora")))
#pip install --upgrade nltk
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
#load 10k reuters news documents
len(reuters.fileids())
#view text from one document
reuters.raw(fileids=['test/14826'])[0:201]
exclude = set(string.punctuation)
alldocslist = []
for index, i in enumerate(reuters.fileids()):
    text = reuters.raw(fileids=[i])
    text = ''.join(ch for ch in text if ch not in exclude)
    alldocslist.append(text)
print(alldocslist[1])
#tokenize words in all DOCS
plot_data = [[]] * len(alldocslist)
for doc in alldocslist:
    text = doc
    tokentext = word_tokenize(text)
    plot_data[index].append(tokentext)
print(plot_data[0][1])
# Navigation: first index gives all documents, second index gives specific document, third index gives words of that doc
plot_data[0][1][0:10]
#make all words lower case for all docs
for x in range(len(reuters.fileids())):
    lowers = [word.lower() for word in plot_data[0][x]]
    plot_data[0][x] = lowers
plot_data[0][1][0:10]
# remove stop words from all docs
stop_words = set(stopwords.words('english'))
for x in range(len(reuters.fileids())):
    filtered_sentence = [w for w in plot_data[0][x] if not w in stop_words]
    plot_data[0][x] = filtered_sentence
plot_data[0][1][0:10]
#stem words EXAMPLE (could try others/lemmers )
snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [snowball_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")
stemmed_sentence = [ porter_stemmer.stem(w) for w in filtered_sentence]
stemmed_sentence[0:10]
# Create inverse index which gives document number for each document and where word appears
#first we need to create a list of all words
l = plot_data[0]
flatten = [item for sublist in l for item in sublist]
words = flatten
wordsunique = set(words)
wordsunique = list(wordsunique)
import math
from textblob import TextBlob as tb
def tf(word, doc):
    return doc.count(word) / len(doc)

def n_containing(word, doclist):
    return sum(1 for doc in doclist if word in doc)

def idf(word, doclist):
    return math.log(len(doclist) / (0.01 + n_containing(word, doclist)))

def tfidf(word, doc, doclist):
    return (tf(word, doc) * idf(word, doclist))
# THIS ONE-TIME INDEXING IS THE MOST PROCESSOR-INTENSIVE STEP AND WILL TAKE TIME TO RUN (BUT ONLY NEEDS TO BE RUN ONCE)
plottest = plot_data[0][0:1000]
worddic = {}
for doc in plottest:
    for word in wordsunique:
        if word in doc:
            word = str(word)
            index = plottest.index(doc)
            positions = list(np.where(np.array(plottest[index]) == word)[0])
            idfs = tfidf(word, doc, plottest)
            try:
                worddic[word].append([index, positions, idfs])
            except:
                worddic[word] = []
                worddic[word].append([index, positions, idfs])
# the index creates a dict with each word as a KEY and a list of doc indexes, word positions, and td-idf scores as VALUES
worddic['china']
# pickle (save) the dictionary to avoid re-calculating
np.save('worddic_1000.npy', worddic)
# create word search which takes multiple words and finds documents that contain both along with metrics for ranking:
## (1) Number of occurrences of search words
## (2) TD-IDF score for search words
## (3) Percentage of search terms
## (4) Word ordering score
## (5) Exact match bonus
from collections import Counter

def search(searchsentence):
    try:
        # split sentence into individual words
        searchsentence = searchsentence.lower()
        try:
            words = searchsentence.split(' ')
        except:
            words = list(words)
        enddic = {}
        idfdic = {}
        closedic = {}

        # remove words if not in worddic
        realwords = []
        for word in words:
            if word in list(worddic.keys()):
                realwords.append(word)
        words = realwords
        numwords = len(words)

        # make metric of number of occurrences of all words in each doc & largest total IDF
        for word in words:
            for indpos in worddic[word]:
                index = indpos[0]
                amount = len(indpos[1])
                idfscore = indpos[2]
                enddic[index] = amount
                idfdic[index] = idfscore
                fullcount_order = sorted(enddic.items(), key=lambda x: x[1], reverse=True)
                fullidf_order = sorted(idfdic.items(), key=lambda x: x[1], reverse=True)

        # make metric of what percentage of words appear in each doc
        combo = []
        alloptions = {k: worddic.get(k, None) for k in (words)}
        for worddex in list(alloptions.values()):
            for indexpos in worddex:
                for indexz in indexpos:
                    combo.append(indexz)
        comboindex = combo[::3]
        combocount = Counter(comboindex)
        for key in combocount:
            combocount[key] = combocount[key] / numwords
        combocount_order = sorted(combocount.items(), key=lambda x: x[1], reverse=True)

        # make metric for if words appear in same order as in search
        if len(words) > 1:
            x = []
            y = []
            for record in [worddic[z] for z in words]:
                for index in record:
                    x.append(index[0])
            for i in x:
                if x.count(i) > 1:
                    y.append(i)
            y = list(set(y))

            closedic = {}
            for wordbig in [worddic[x] for x in words]:
                for record in wordbig:
                    if record[0] in y:
                        index = record[0]
                        positions = record[1]
                        try:
                            closedic[index].append(positions)
                        except:
                            closedic[index] = []
                            closedic[index].append(positions)

            x = 0
            fdic = {}
            for index in y:
                csum = []
                for seqlist in closedic[index]:
                    while x > 0:
                        secondlist = seqlist
                        x = 0
                        sol = [1 for i in firstlist if i + 1 in secondlist]
                        csum.append(sol)
                        fsum = [item for sublist in csum for item in sublist]
                        fsum = sum(fsum)
                        fdic[index] = fsum
                        fdic_order = sorted(fdic.items(), key=lambda x: x[1], reverse=True)
                    while x == 0:
                        firstlist = seqlist
                        x = x + 1
        else:
            fdic_order = 0

        # also the one above should be given a big boost if ALL found together
        # could make another metric for if they are not next to each other but still close

        return (searchsentence, words, fullcount_order, combocount_order, fullidf_order, fdic_order)

    except:
        return ("")
search('indonesia crude palm oil')[1]
# 0 return will give back the search term, the rest will give back metrics (see above)
search('indonesia crude palm oil')[1][1:10]
# save metrics to dataframe for use in ranking and machine learning
result1 = search('china daily says what')
result2 = search('indonesia crude palm oil')
result3 = search('price of nickel')
result4 = search('north yemen sugar')
result5 = search('nippon steel')
result6 = search('China')
result7 = search('Gold')
result8 = search('trade')
df = pd.DataFrame([result1,result2,result3,result4,result5,result6,result7,result8])
df.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order']
df
# look to see if the top documents seem to make sense
alldocslist[1]
# create a simple (non-machine learning) rank and return function
def rank(term):
    results = search(term)

    # get metrics
    num_score = results[2]
    per_score = results[3]
    tfscore = results[4]
    order_score = results[5]

    final_candidates = []

    # rule1: if high word order score & 100% percentage terms then put at top position
    try:
        first_candidates = []
        for candidates in order_score:
            if candidates[1] > 1:
                first_candidates.append(candidates[0])
        second_candidates = []
        for match_candidates in per_score:
            if match_candidates[1] == 1:
                second_candidates.append(match_candidates[0])
            if match_candidates[1] == 1 and match_candidates[0] in first_candidates:
                final_candidates.append(match_candidates[0])

        # rule2: next add other word order score which are greater than 1
        t3_order = first_candidates[0:3]
        for each in t3_order:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)

        # rule3: next add top td-idf results
        final_candidates.insert(len(final_candidates), tfscore[0][0])
        final_candidates.insert(len(final_candidates), tfscore[1][0])

        # rule4: next add other high percentage score
        t3_per = second_candidates[0:3]
        for each in t3_per:
            if each not in final_candidates:
                final_candidates.insert(len(final_candidates), each)

        # rule5: next add any other top results for metrics
        othertops = [num_score[0][0], per_score[0][0], tfscore[0][0], order_score[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)

    # unless single term searched, in which case just return
    except:
        othertops = [num_score[0][0], num_score[1][0], num_score[2][0], per_score[0][0], tfscore[0][0]]
        for top in othertops:
            if top not in final_candidates:
                final_candidates.insert(len(final_candidates), top)

    for index, results in enumerate(final_candidates):
        if index < 5:
            print("RESULT", index + 1, ":", alldocslist[results][0:100], "...")
# example of output
rank('indonesia palm oil')
# example of output
rank('china')
# Create pseudo-truth set using first 5 words
# Because I don't have a truth set I will generate a pseudo one by pulling terms from the documents - this is far from perfect
# as it may not approximate peoples' actual queries well, but it will serve to build the ML architecture
df_truth = pd.DataFrame()
for doc in plottest:
    first_five = doc[0:5]
    test_sentence = ' '.join(first_five)
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth = pd.concat([df_truth, df_temp])
df_truth['truth'] = range(0, len(plottest))
df_truth1 = pd.DataFrame()
seqlen = 3
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth1 = pd.concat([df_truth1, df_temp])
df_truth1['truth'] = range(0, len(plottest))
# create another pseudo-truth set using a different random 4 word sequence from docs
df_truth2 = pd.DataFrame()
seqlen = 4
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth2 = pd.concat([df_truth2, df_temp])
df_truth2['truth'] = range(0, len(plottest))
# create another pseudo-truth set using a different random 2 word sequence from docs
df_truth3 = pd.DataFrame()
seqlen = 2
for doc in plottest:
    try:
        start = random.randint(0, (len(doc) - seqlen))
        random_seq = doc[start:start + seqlen]
        test_sentence = ' '.join(random_seq)
    except:
        test_sentence = doc[0]
    result = search(test_sentence)
    df_temp = pd.DataFrame([result])
    df_truth3 = pd.concat([df_truth3, df_temp])
df_truth3['truth'] = range(0, len(plottest))
# combine the truth sets and save to disk
truth_set = pd.concat([df_truth,df_truth1,df_truth2,df_truth3])
truth_set.columns = ['search term', 'actual_words_searched','num_occur','percentage_of_terms','td-idf','word_order','truth']
truth_set.to_csv("truth_set_final.csv")
truth_set[0:10]
truth_set
test_set = truth_set[0:3]
test_set
# convert to long format for ML
# WARNING AGAIN THIS IS A SLOW PROCESS DUE TO RAM ILOC - COULD BE OPTIMISED FOR FASTER PERFORMANCE
# BUG When min(maxnum, len(truth_set) <- is a int not a list because of very short variable length)
# row is row
# column is variable
# i is the result
final_set = pd.DataFrame()
test_set = truth_set[1:100]
maxnum = 5
for row in range(0, len(test_set.index)):
    test_set = truth_set[1:100]
    for col in range(2, 6):
        for i in range(0, min(maxnum, len(truth_set.iloc[row][col]))):
            x = pd.DataFrame([truth_set.iloc[row][col][i]])
            x['truth'] = truth_set.iloc[row]['truth']
            x.columns = [(str(truth_set.columns[col]), "index", i), (str(truth_set.columns[col]), "score", i), 'truth']
            test_set = test_set.merge(x, on='truth')
    final_set = pd.concat([final_set, test_set])
final_set.head()
final_set.to_csv("ML_set_100.csv")
final_set2 = final_set.drop(['actual_words_searched','num_occur','percentage_of_terms','search term','td-idf','word_order'], 1)
final_set2.to_csv("ML_set_100_3.csv")
final_set2.head()
final_set3 = final_set2
final_set3[0:10]
Obviously, the code above isn't returning searched keywords from the MySQL database yet. I hope you understand what I mean. Thank you very much!
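As a complement to the pipeline above, here is a minimal sketch (the table name, column names, and credentials are all assumptions for illustration) of pulling candidate documents out of MySQL so that ranking code like the above can score them. It uses the mysql-connector-python driver; a FULLTEXT index on the content column lets MySQL do a cheap first-pass relevance filter before the Python-side ranking runs:
import mysql.connector

def fetch_candidates(keyword, limit=100):
    # Hypothetical schema: a `pages` table with `url` and `content` columns
    # and a FULLTEXT index on `content`.
    conn = mysql.connector.connect(host="localhost", user="user",
                                   password="secret", database="crawler")
    cur = conn.cursor()
    cur.execute(
        "SELECT url, content FROM pages "
        "WHERE MATCH(content) AGAINST (%s IN NATURAL LANGUAGE MODE) "
        "LIMIT %s",
        (keyword, limit),
    )
    rows = cur.fetchall()
    cur.close()
    conn.close()
    return rows  # list of (url, content) tuples to tokenize, index and rank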

How can a recursive function operate if it returns to the beginning?

I am a novice programmer.
I am looking into a problem that uses a recursive function. Though I understand the main point, there is an unclear issue that I could not decipher while going through the debugging process. I would appreciate your help with my question.
The problem's concept (merge sorting) is pretty straightforward, but I am confused by the way a recursive function works in general. Below is the program I am dealing with (from a Georgia Tech course on Python):
def mergesort(lst):
    if len(lst) <= 1:
        return lst
    else:
        midpoint = len(lst) // 2
        left = mergesort(lst[:midpoint])
        right = mergesort(lst[midpoint:])
        newlist = []
        while len(left) and len(right) > 0:
            if left[0] < right[0]:
                newlist.append(left[0])
                del left[0]
            else:
                newlist.append(right[0])
                del right[0]
        newlist.extend(left)
        newlist.extend(right)
        return newlist

print(mergesort([2, 5, 3, 8, 6, 9, 1, 4, 7]))
QUESTION: What happens when the program reaches this line: left = mergesort(lst[:midpoint])?
Based on my understanding, it returns to the first line of the program and comes down again to reach the same line (just like a for loop does).
So it keeps returning!!! This, however, makes the program unreadable to me. In general, how the program deals with the recursive function is my main question: I could not understand the way it works.
What would happen When the program reaches this line left = mergesort(lst[:midpoint])? Based on my understanding, it returns to the first line of the program and comes down again to reach the same line...
Each time the program recurs, it calls mergesort with a smaller list. We call this a "sub-problem" -
def mergesort(lst):
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # find midpoint
        left = mergesort(lst[:midpoint])   # solve sub-problem one
        right = mergesort(lst[midpoint:])  # solve sub-problem two
        # ...
For example, if we first call mergesort with a 4-element list -
mergesort([5,2,4,7])
The input list, lst, does not meet the base case, so we move onto the else branch -
def mergesort(lst):                        # lst = [5,2,4,7]
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # midpoint = 2
        left = mergesort(lst[:midpoint])   # left = mergesort([5,2])
        right = mergesort(lst[midpoint:])  # right = mergesort([4,7])
        # ...
Notice mergesort is called with [5,2] and [4,7] sub-problems. Let's repeat these steps for the first sub-problem -
left = mergesort([5,2])

def mergesort(lst):                        # lst = [5,2]
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # midpoint = 1
        left = mergesort(lst[:midpoint])   # left = mergesort([5])
        right = mergesort(lst[midpoint:])  # right = mergesort([2])
        # ...
So it keeps returning!!!
Not exactly. When we solve the sub-problems in this step, things look different. When the input is one element or less, the base case is satisfied and the function exits -
left = mergesort([5])

def mergesort(lst):          # lst = [5]
    if len(lst) <= 1:        # base case condition satisfied
        return lst           # return [5]
    else:
        ...                  # no more recursion
Recursion stops for the left sub-problem and the answer of [5] is returned. The same applies for the right sub-problem -
right = mergesort([2])

def mergesort(lst):          # lst = [2]
    if len(lst) <= 1:        # base case condition satisfied
        return lst           # return [2]
    else:
        ...                  # no more recursion
Next we return our first left sub-problem -
left = mergesort([5,2])

def mergesort(lst):                        # lst = [5,2]
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # midpoint = 1
        left = mergesort(lst[:midpoint])   # left = [5] <-
        right = mergesort(lst[midpoint:])  # right = [2] <-
        # ...
        return newlist                     # newlist = [2,5]
You would now repeat these steps for the first right sub-problem -
right = mergesort([4,7])

def mergesort(lst):                        # lst = [4,7]
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # midpoint = 1
        left = mergesort(lst[:midpoint])   # left = mergesort([4])
        right = mergesort(lst[midpoint:])  # right = mergesort([7])
        # ...
Again, recursion stops as the new left and right sub-problems are single-element lists, which satisfies the base case -
right = mergesort([4,7])

def mergesort(lst):                        # lst = [4,7]
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # midpoint = 1
        left = mergesort(lst[:midpoint])   # left = [4] <-
        right = mergesort(lst[midpoint:])  # right = [7] <-
        # ...
        return newlist                     # newlist = [4,7]
And finally the outermost mergesort call can return -
mergesort([5,2,4,7])

def mergesort(lst):                        # lst = [5,2,4,7]
    if len(lst) <= 1:
        # ...
    else:
        midpoint = len(lst) // 2           # midpoint = 2
        left = mergesort(lst[:midpoint])   # left = [2,5]
        right = mergesort(lst[midpoint:])  # right = [4,7]
        # ...
        return newlist                     # newlist = [2,4,5,7]
# => [2,4,5,7]
All of that said, recursion is a functional heritage, and so using it in a functional style yields the best results. This means avoiding things like mutations, variable reassignments, and other side effects. Consider this alternative, which lowers the conceptual overhead by clearly separating the program's concerns -
def mergesort(lst):
    def split(lst):
        m = len(lst) // 2
        return (lst[:m], lst[m:])

    def merge(l, r):
        if not l:
            return r
        elif not r:
            return l
        elif l[0] < r[0]:
            return [l[0]] + merge(l[1:], r)
        else:
            return [r[0]] + merge(l, r[1:])

    if len(lst) <= 1:
        return lst
    else:
        (left, right) = split(lst)
        return merge(mergesort(left), mergesort(right))

mergesort([5,2,4,7])
# => [2,4,5,7]
The answer to your question is: copies.
Each function is a recipe for a calculation.
When a function is called, a copy of the recipe is created; each invocation involves the creation of a separate copy. That is how each can operate on its own, and they are not all jumbled up together.
In general, there is nothing special about a recursive function call. A function call is a function call, no matter what function is called. A function is called, does what it does, and its result is returned to the caller. As for recursion, you're not supposed to trace it; it does its work on its own. You're supposed to prove to yourself that the base case is correct and that the recursive case is correct. That is all.
So, specifically in your case, assuming mergesort indeed works correctly (wait, what? Never mind, suspend your disbelief for a moment),
left = mergesort(lst[:midpoint])
calls the function mergesort with the first half of lst, from its start to its midpoint, and stores the result - which, by assumption, is the sorted first half - in the variable left; then
right = mergesort(lst[midpoint:])
calls the function mergesort with the second half of lst, from its midpoint to its end, and stores the result - which, by assumption, is the sorted second half - in the variable right;
and then you need to convince yourself that the rest of the code creates newlist from those two sorted halves such that newlist is also sorted in the correct order.
And then by the principle of mathematical induction this proves the correctness of mergesort.
By assuming it works, we prove that it indeed works! Where's the catch? There's no catch, because the two cases that worked by assumption are for two smaller inputs (and that is our recursive case).
And when we divide a thing into two parts, over and over, eventually we're left with either a singleton, or an empty thing. And those two are naturally sorted (and that is our base case).
Recursion is a leap of faith. Assume the thing is working, then you get to use it. And if you use it correctly, you will have thus built the very thing you were using in the first place!
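To make those per-call copies visible, here is a small illustration (not part of the original answer; the sorted(left + right) call merely stands in for the merge step): an extra depth argument prints each invocation's own lst at its own indentation level.
def mergesort_traced(lst, depth=0):
    print('  ' * depth, 'sorting', lst)
    if len(lst) <= 1:
        return lst
    mid = len(lst) // 2
    left = mergesort_traced(lst[:mid], depth + 1)   # this call gets its own lst, mid, left, right
    right = mergesort_traced(lst[mid:], depth + 1)  # and so does this one
    merged = sorted(left + right)                   # stand-in for the merge loop in the question
    print('  ' * depth, 'merged ', merged)
    return merged

mergesort_traced([5, 2, 4, 7])
# => [2, 4, 5, 7]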

Interactive wizard in wxPython (Phoenix)

I am trying to develop an interactive wizard in wxPython 4.0 (Phoenix) with Python 3.7. Basically the wizard has 4 pages (stages). Each time I click Next, it should run a different Python script, taking the arguments from the page. I need to display the progress of the running script on the wizard page.
I have already developed a basic interface for the wizard with 4 pages. Now I need help with:
1. How do I pass arguments from the page to the Python script?
2. How do I call a different Python script every time I click the Next button on the 4 pages of the wizard? (I think I need to write code around the on_page_changing event handler, but I am not clear how to call a different Python script each time the Next button is clicked.)
3. How do I display a progress bar for each script on the wizard?
I am attaching the code for my basic wizard interface. I am new to wxPython; any help on the above 3 points is greatly appreciated.
#!/usr/bin/env python
import wx
import wx.adv
from wx.adv import Wizard as wizmod
#import images
from wx.adv import WizardPage, WizardPageSimple
import os.path
padding = 5
class wizard_page(wx.adv.WizardPage):
    '''An extended panel obj with a few methods to keep track of its siblings.
       This should be modified and added to the wizard. Season to taste.'''
    def __init__(self, parent, title):
        WizardPage.__init__(self, parent)
        self.next = self.prev = None
        self.sizer = wx.BoxSizer(wx.VERTICAL)
        title = wx.StaticText(self, -1, title)
        title.SetFont(wx.Font(18, wx.SWISS, wx.NORMAL, wx.BOLD))
        self.sizer.Add(title, 0, wx.ALIGN_LEFT | wx.ALL, padding)
        self.sizer.Add(wx.StaticLine(self, -1), 0, wx.EXPAND | wx.ALL, padding)
        self.SetSizer(self.sizer)

    def add_stuff(self, stuff):
        '''Add additional widgets to the bottom of the page'''
        self.sizer.Add(stuff, 0, wx.EXPAND | wx.ALL, padding)

    def SetNext(self, next):
        '''Set the next page'''
        self.next = next

    def SetPrev(self, prev):
        '''Set the previous page'''
        self.prev = prev

    def GetNext(self):
        '''Return the next page'''
        return self.next

    def GetPrev(self):
        '''Return the previous page'''
        return self.prev


class wizard(wx.adv.Wizard):
    '''Add pages to this wizard object to make it useful.'''
    def __init__(self, title, img_filename=""):
        # img could be replaced by a py string of bytes
        if img_filename and os.path.exists(img_filename):
            img = wx.Bitmap(img_filename)
        else:
            img = wx.NullBitmap
        wx.adv.Wizard.__init__(self, None, -1, title, img)
        self.pages = []
        # Lets catch the events
        self.Bind(wx.adv.EVT_WIZARD_PAGE_CHANGED, self.on_page_changed)
        self.Bind(wx.adv.EVT_WIZARD_PAGE_CHANGING, self.on_page_changing)
        self.Bind(wx.adv.EVT_WIZARD_CANCEL, self.on_cancel)
        self.Bind(wx.adv.EVT_WIZARD_FINISHED, self.on_finished)

    def add_page(self, page):
        '''Add a wizard page to the list.'''
        if self.pages:
            previous_page = self.pages[-1]
            page.SetPrev(previous_page)
            previous_page.SetNext(page)
        self.pages.append(page)

    def run(self):
        self.RunWizard(self.pages[0])

    def on_page_changed(self, evt):
        '''Executed after the page has changed.'''
        if evt.GetDirection():
            dir = "forward"
        else:
            dir = "backward"
        page = evt.GetPage()
        print("page_changed: %s, %s\n" % (dir, page.__class__))

    def on_page_changing(self, evt):
        '''Executed before the page changes, so we might veto it.'''
        if evt.GetDirection():
            dir = "forward"
        else:
            dir = "backward"
        page = evt.GetPage()
        print("page_changing: %s, %s\n" % (dir, page.__class__))

    def on_cancel(self, evt):
        '''Cancel button has been pressed. Clean up and exit without continuing.'''
        page = evt.GetPage()
        print("on_cancel: %s\n" % page.__class__)
        # Prevent cancelling of the wizard.
        if page is self.pages[0]:
            wx.MessageBox("Cancelling on the first page has been prevented.", "Sorry")
            evt.Veto()

    def on_finished(self, evt):
        '''Finish button has been pressed. Clean up and exit.'''
        print("OnWizFinished\n")


if __name__ == '__main__':
    app = wx.App()  # Start the application

    # Create wizard and add any kind of pages you'd like
    mywiz = wizard('Simple Wizard', img_filename='wiz.png')

    page1 = wizard_page(mywiz, 'Stage 1')  # Create a first page
    #page1.add_stuff(wx.StaticText(page1, -1, 'Hola'))
    page1.add_stuff(wx.CheckBox(page1, -1, 'Argument1', (35, 40), (150, 20)))
    page1.add_stuff(wx.CheckBox(page1, -1, 'Argument2', (35, 60), (150, 20)))
    page1.add_stuff(wx.CheckBox(page1, -1, 'Argument3', (35, 80), (150, 20)))
    mywiz.add_page(page1)

    # Add some more pages
    mywiz.add_page(wizard_page(mywiz, 'Stage 2'))
    mywiz.add_page(wizard_page(mywiz, 'Stage 3'))
    mywiz.add_page(wizard_page(mywiz, 'Stage 4'))

    mywiz.run()  # Show the main window

    # Cleanup
    mywiz.Destroy()
    #del app
    app.MainLoop()
    del app
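For illustration of the three points above, here is a minimal sketch (not from the question) of one way to run a per-page script when Next is clicked and pulse a gauge while it runs. The page-to-script mapping, the script names, and the argument handling are assumptions for illustration only:
import subprocess

PAGE_SCRIPTS = ['stage1.py', 'stage2.py', 'stage3.py', 'stage4.py']  # hypothetical scripts

class script_wizard(wizard):
    def on_page_changing(self, evt):
        '''Before leaving a page via Next, run that page's script.'''
        if not evt.GetDirection():           # going backward: nothing to run
            return
        page = evt.GetPage()
        index = self.pages.index(page)
        gauge = wx.Gauge(page, range=100)    # simple progress bar on the current page
        page.add_stuff(gauge)
        page.Layout()
        args = []                            # e.g. collect values from the page's wx.CheckBox widgets here
        proc = subprocess.Popen(['python', PAGE_SCRIPTS[index]] + args)
        while proc.poll() is None:           # script still running
            gauge.Pulse()                    # indeterminate progress
            wx.MilliSleep(100)
            wx.Yield()                       # keep the GUI responsive
        if proc.returncode != 0:
            wx.MessageBox("Script failed, staying on this page.")
            evt.Veto()                       # block the page change on failure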

Issue with defining and executing the function in Python

The data I am scraping using Beautiful Soup contains one category of device name, and device names contain colors mentioned in them, e.g. Lumia 800 Black. I want to create a new column which contains this color.
I want to search each device name for any color against a list of colors, and if a color is present in that device name I want to remove the color from the device name and put it in a new column named Color.
I am using the code referred to below to accomplish this: I am creating a function named color and trying to search the device name string for the presence of a color and, if present, feed that color into a new variable named color_column. But my output CSV is not returning any values at all. It is empty.
Please check the referred code below:
# -*- coding: cp1252 -*-
import csv
import urllib2
import sys
import urllib
import time
import mechanize
import cookielib
from bs4 import BeautifulSoup
from itertools import islice
colors = ["Black","Gray"]
def color(arg):
    for colors_1 in colors:
        if arg.find(colors_1) == -1:
            return color_column == ""
        return color_column == colors_1
url = 'http://www.t-mobile.com/shop/phones/default.aspx?shape=smartphones'
user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1;Trident/5.0)'
values = {
'Phones':'MBBDevice',
'__ASYNCPOST':'true',
'__EVENTARGUMENT':'',
'__EVENTTARGET':'pgrTop$lnkPageShowAll',
'__LASTFOCUS':'',
'__VIEWSTATE':'/wEPDwULLTE1NTE5NDk1ODIPFgIeEEN1cnJlbnRQYWdlSW5kZXgCARYCAgEPZBYCAgEPZBYCAgEPZBYCZg9kFgICAQ9kFhgCCg9kFgJmD2QWAmYPZBYCZg8UKwACZDKJBAABAAAA/////wEAAAAAAAAADAIAAABfVE1vYmlsZS5XZWIuVE1vYmlsZURvdENvbS5VSS5XZWJDb250cm9scywgVmVyc2lvbj0xLjAuMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPW51bGwFAQAAAEFUTW9iaWxlLldlYi5UTW9iaWxlRG90Q29tLlVJLldlYkNvbnRyb2xzLkJyZWFkQ3J1bWJJdGVtQ29sbGVjdGlvbgEAAAATQ29sbGVjdGlvbkJhc2UrbGlzdAMcU3lzdGVtLkNvbGxlY3Rpb25zLkFycmF5TGlzdAIAAAAJAwAAAAQDAAAAHFN5c3RlbS5Db2xsZWN0aW9ucy5BcnJheUxpc3QDAAAABl9pdGVtcwVfc2l6ZQhfdmVyc2lvbgUAAAgICQQAAAACAAAABQAAABAEAAAABAAAAAkFAAAACQYAAAANAgUFAAAAN1RNb2JpbGUuV2ViLlRNb2JpbGVEb3RDb20uVUkuV2ViQ29udHJvbHMuQnJlYWRDcnVtYkl0ZW0DAAAABV90ZXh0BF91cmwJX3Nob3dMaW5rAQEAAQIAAAAGBwAAAARIb21lBggAAAAAAQEGAAAABQAAAAYJAAAAGVNtYXJ0cGhvbmVzICYgQ2VsbCBQaG9uZXMGCgAAAAtzaG9wL3Bob25lcwELZAIMD2QWAgIDDxYCHgxIdG1sT3ZlcnJpZGUFkwI8aW1nIHN0eWxlPSJGTE9BVDogcmlnaHQ7IENVUlNPUjogcG9pbnRlciEgaW1wb3J0YW50IiBvbmNsaWNrPSJqYXZhc2NyaXB0OnBvcFVwKCAnL3RlbXBsYXRlcy9wb3B1cC5hc3B4P1BBc3NldD1TaHBfUGhuX3NoaXBwaW5nRGV0YWlscycsICczNDAnLCAnNTY4JywgJzQ1JywgJzMwJywgJzAnLCAnMCcsICcxJyApIiBhbHQ9IkZyZWUgU2hpcHBpbmcgb24gYWxsIGNlbGwgcGhvbmVzIGFuZCBkZXZpY2VzLiIgc3JjPSIuLi9pbWFnZXMvZnJlZV9zaGlwcGluZy1iYW5uZXIuZ2lmIiAvPmQCDg8PFgIeB1Zpc2libGVoZGQCGA9kFgJmD2QWAmYPZBYCZg9kFggCAQ9kFgICAQ8QDxYEHgdDaGVja2VkaB4HRW5hYmxlZGgWAh4LbWFrZWVuYWJsZWQFBWZhbHNlZGRkAgUPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIHD2QWAgIBDxAPZBYCHwUFBHRydWVkZGQCCQ9kFgICAQ8QD2QWAh8FBQR0cnVlZGRkAhoPZBYCZg9kFgJmD2QWAmYPZBYEAgMPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIFD2QWAgIBDxAPFgIeBFRleHQF2AU8dGFibGUgaGVpZ2h0PSIxNSIgY2VsbHNwYWNpbmc9IjAiIGNlbGxwYWRkaW5nPSIwIiB3aWR0aD0iNzciIGJvcmRlcj0iMCI+CiAgICAgIDx0Ym9keT4KICAgICAgICA8dHI+CiAgICAgICAgICA8dGQgY2xhc3M9InJlZnVyYmlzaGVkIj5SZWZ1cmJpc2hlZDwvdGQ+CgogICAgICAgICAgPHRkIGNsYXNzPSJyZWZ1cmJpc2hlZCI+CiAgICAgICAgICAgIDxkaXYgb25tb3VzZW92ZXI9ImphdmFzY3JpcHQ6ZGlzcENPQkRlc2MoKTsiIHN0eWxlPSJGTE9BVDogbGVmdCIgb25tb3VzZW91dD0iamF2YXNjcmlwdDpoaWRlQ09CRGVzYygpOyIgcnVuYXQ9InNlcnZlciI+CiAgICAgICAgICAgICAgPGltZyBzcmM9Ii9pbWFnZXMvaWNvbl9oZWxwLmdpZiIgLz4gPGRpdiBjbGFzcz0idG9vbHRpcCIgaWQ9ImRpdkNPQkRlc2NyaXB0aW9uIiBzdHlsZT0iRElTUExBWTogbm9uZSI+CiAgICAgIDxkaXYgY2xhc3M9InRvb2x0aXAtYnRtLWJrZyI+CiAgICAgICAgPGRpdiBjbGFzcz0idG9vbHRpcC1jb250YWluZXIiPgogICAgICAgICAgR2V0IGEgZ3JlYXQgdmFsdWUgb24gYSBsaWtlLW5ldyBwaG9uZQogICAgICAgICAgPGJyIC8+CiAgICAgICAgICAgd2l0aCBhIDkwLWRheSB3YXJyYW50eS4KICAgICAgICA8L2Rpdj4KICAgICAgPC9kaXY+CiAgICA8L2Rpdj4KICAgICAgICAgICAgPC9kaXY+CiAgICAgICAgICA8L3RkPgogICAgICAgIDwvdHI+CiAgICAgIDwvdGJvZHk+CiAgICA8L3RhYmxlPhYCHwUFBHRydWVkZGQCIA8WAh4Fc3R5bGUFDmRpc3BsYXk6YmxvY2s7FgJmD2QWAmYPZBYCZg9kFgYCAw9kFgICAQ8QD2QWAh8FBQR0cnVlZGRkAgUPZBYCAgEPEA9kFgIfBQUEdHJ1ZWRkZAIHD2QWAgIBDxAPZBYCHwUFBHRydWVkZGQCKg9kFgJmD2QWAmYPZBYEZg8PFgIfAmcWAh4HT25DbGljawUKQ2xlYXJJRFMoKWQCAQ8PZBYCHwgFCkNsZWFySURTKClkAi4PZBYCZg9kFgJmD2QWAgIKD2QWCAIBDw8WAh8CaGRkAgMPFgIeCl9QYWdlQ291bnQCBBYGAgIPFgIfAmhkAgcPD2QWAh8HBQxkaXNwbGF5Om5vbmVkAggPDxYCHwJnZGQCBw8WAh8JAgQWBgICDxYCHwJoZAIIDw9kFgIfBwUMZGlzcGxheTpub25lZAIJDw8WAh8CZ2RkAgsPFgIfAmhkAjAPFgIeE0Ntc0NvbGxlY3Rpb25TdHJpbmdlZAI0D2QWAmYPZBYCZg9kFgQCAQ8WAh4MQ21zQXNzZXROYW1lBRVUb3V0X0ZBUV9EZXZBbGxQaG9uZXNkAgQPFgIfCgUPdG91dF9odG1sX2xvZ2luZAI2D2QWBGYPZBYCZg9kFgJmDxYCHwJoZAIBD2QWAmYPZBYCZg8WAh8LBRJzaHBfcGhuX2xlZ2FsTm90ZXNkAjgPDxYCHhxUaXRsZXBvcHVwUGxhbkNoYW5nZVJlcXVpcmVkZWQWBAIPDxYCHwJoZAITDxYCHwJoZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WNAUJTUJCRGV2aWNlBQ1QcmVQYWlkUGhvbmVzBQ1QcmVQYWlkUGhvbmVzBSFyZXBQcmljZVJhbmdlJGN0bDAwJGNoa1ByaWNlUmFuZ2UFDmNoa05ld0Fycml2YWxzBQ9jaGtXZWJPbmx5RGVhbHMFEmNoa1dlYk9ubHlQcm9kdWN0cwUP
Y2hrTmV3Q29uZGl0aW9uBQZjaGtDT0IFFnJlcFR5cGVzJGN0bDAwJGNoa1R5cGUFFnJlcFR5cGVzJGN0bDAyJGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA0JGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA1JGNoa1R5cGUFFnJlcFR5cGVzJGN0bDA2JGNoa1R5cGUFDGNoa0FuZHJvaWRPUwUPY2hrQmxhY2tCZXJyeU9TBQhjaGtXaW5PUwUgcmVwRmVhdHVyZUZpbHRlciRjdGwwMCRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDEkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDAyJGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwMyRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDQkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDA1JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwNiRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMDckY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDA4JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwwOSRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTAkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDExJGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwxMiRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTMkY2hrRmlsdGVyBSByZXBGZWF0dXJlRmlsdGVyJGN0bDE0JGNoa0ZpbHRlcgUgcmVwRmVhdHVyZUZpbHRlciRjdGwxNSRjaGtGaWx0ZXIFIHJlcEZlYXR1cmVGaWx0ZXIkY3RsMTYkY2hrRmlsdGVyBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDAwJGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDEkY2hrTWFudWZhY3R1cmVycwUncmVwTWFudWZhY3R1cmVycyRjdGwwMiRjaGtNYW51ZmFjdHVyZXJzBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDA0JGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDUkY2hrTWFudWZhY3R1cmVycwUncmVwTWFudWZhY3R1cmVycyRjdGwwNiRjaGtNYW51ZmFjdHVyZXJzBSdyZXBNYW51ZmFjdHVyZXJzJGN0bDA3JGNoa01hbnVmYWN0dXJlcnMFJ3JlcE1hbnVmYWN0dXJlcnMkY3RsMDgkY2hrTWFudWZhY3R1cmVycwUabXJwUGhvbmVzJGN0bDAwJGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwwMiRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMDQkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDA2JGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwwOCRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMTAkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDEyJGNoa0NvbXBhcmUFGm1ycFBob25lcyRjdGwxNCRjaGtDb21wYXJlBRptcnBQaG9uZXMkY3RsMTYkY2hrQ29tcGFyZQUabXJwUGhvbmVzJGN0bDE4JGNoa0NvbXBhcmVnDy0KUN8keEvS5/wEmJXssTUSNw==',
'ctl09':'ctl13|pgrTop$lnkPageShowAll',
'ddlSort':'0',
'hdnBlackBerryID':'3c2c3562-aa1c-4fe4-a0ca-da5dd8e4bd84',
'hdnCapCode':'',
'hdnDeviceId':'',
'hdnFeature':'',
'hdnFeatureNames':'',
'hdnFilter':'',
'hdnIsPricingOptionLockedB':'false',
'hdnLocationParameter':'',
'hdnManufacturer':'',
'hdnManufacturerID':'',
'hdnManufacturerNames':'',
'hdnOtherFilters':'',
'hdnPageIndex':'',
'hdnPriceRange':'',
'hdnPriceRangeText':'',
'hdnProductType':'GSM',
'hdnSelectedDeviceId':'',
'hdnSelections':'',
'hdnSortFilter':'0',
'hdnTitle':'',
'hdnType':'smp,',
'hdnTypeNames':'Smartphone|',
'popupPlanChangeRequired$hdnDeviceID':'',
'popupPlanChangeRequired$hdnFamilyID':'',
'popupPlanChangeRequired$hiddenImagePath':'',
'repTypes$ctl05$chkType':'on',
'txtSelectedDevices':'0',
'txtSelectedFeatures':'0'}
headers = { 'User-Agent' : user_agent }
data = urllib.urlencode(values)
req = urllib2.Request(url, data, headers)
response = urllib2.urlopen(req)
page = response.read()
soup = BeautifulSoup(page)
with open('tmob_colortest.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    items = soup.findAll('div', {"class": "phonename"}, text = colors)
    prices = soup.findAll('p', {"class": "totalitemprice"})
    for item, price in zip(items, prices):
        textcontent = u' '.join(islice(item.stripped_strings, 0, 2, 1))
        textcontent2 = u' '.join(price.stripped_strings)
        name_1 = unicode(textcontent).encode('utf8').replace('Nexus 4','LG Nexus 4').replace(' T-Mobile Refurbished Device','').replace('™','').replace('®','').replace(' ›','').replace("NEW! ","").replace(" Web-only offer -- now thru Thu 1/3/13","").replace(" Web-only offer!","").strip()
        oem = list(name_1)
        pos = oem.index(' ')
        if name_1.find('Refurbished') == -1:
            name = name_1
            refur = "N"
        else:
            name = name_1.replace("Refurbished","").replace(" -","")
            refur = "Y"
        spamwriter.writerow(["US", "T-Mobile",
                             name[0:pos], name, refur, color_column,
                             "24 Months", "$", unicode(textcontent2).encode('utf8').replace("FREE","0").replace('$','')])
Please help me to solve this issue and pardon my ignorance as I am new to coding.
You never actually use your function, so color_column is never filled.
What you want to do is make your function return the changed product name, and the color detected, as two separate values:
def handle_color(arg):
    for col in colors:
        if col.lower() not in arg.lower():
            continue
        # color found, remove it from arg (case insensitively)
        start = arg.lower().index(col.lower())
        arg = arg[:start] + arg[start + len(col):]
        return arg, col
    # No matching color found, return arg unchanged and an empty value for the color
    return arg, ''
Now all you have to do is call this function and unpack its return value into two variables for your CSV:
name, color_column = handle_color(name)
and color_column will either be an empty value or the matched color (now removed from name).
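For example, with the colors list from the question (["Black", "Gray"]) and a few illustrative device names (not scraped data), the helper behaves like this:
for device in ["Lumia 800 Black", "Galaxy S II Gray", "iPhone 4S"]:
    cleaned, color_column = handle_color(device)
    print(repr(cleaned), '->', repr(color_column))
# 'Lumia 800 ' -> 'Black'
# 'Galaxy S II ' -> 'Gray'
# 'iPhone 4S' -> ''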