Replicating a row while changing a field in Python - csv

I have a large csv file with two columns like this:
Id and vehicle
and I'd like to replicate the rows, but where the vehicle is "truck", put "car" instead.
I have this code, but there is an error
which says
_csv.Error: iterator should return strings, not bytes (did you open the file in text mode?)
What does it mean? Where am I wrong?
import csv

infilename = r'external carriers.csv'
outfilename = r'outputCSV.csv'
with open(infilename, 'rb') as fp_in, open(outfilename, 'wb') as fp_out:
    reader = csv.reader(fp_in, delimiter=",")
    writer = csv.writer(fp_out, delimiter=",")
    for row in reader:
        if len(row) == 2:
            if row == "truck":
                writer.writerow = "car"

You have opened the files in binary mode ('rb'/'wb'), but the csv module needs text mode, so use 'rt' (and 'wt' for the output):
with open(infilename, 'rt') as fp_in, open(outfilename, 'wt') as fp_out:
Also, if you want to check the vehicle type you need to look at row[1], which holds the vehicle name, then reassign it and write the row to your output file. Note that you don't need to check the length of your rows at all: unpacking each row directly into two names does that check for you, and raises a ValueError on any malformed row.
import csv

infilename = r'external carriers.csv'
outfilename = r'outputCSV.csv'
with open(infilename, 'rt') as fp_in, open(outfilename, 'wt') as fp_out:
    reader = csv.reader(fp_in, delimiter=",")
    writer = csv.writer(fp_out, delimiter=",")
    for vehicle_id, vehicle in reader:
        if vehicle == "truck":
            writer.writerow([vehicle_id, 'car'])
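One more note: in Python 3 the csv docs recommend opening files with newline='' so the csv module handles line endings itself (otherwise you can get blank rows on Windows). A minimal sketch of the same loop with that change:

import csv

infilename = r'external carriers.csv'
outfilename = r'outputCSV.csv'
# newline='' lets the csv module manage line endings itself
with open(infilename, newline='') as fp_in, open(outfilename, 'w', newline='') as fp_out:
    reader = csv.reader(fp_in, delimiter=",")
    writer = csv.writer(fp_out, delimiter=",")
    for vehicle_id, vehicle in reader:
        if vehicle == "truck":
            writer.writerow([vehicle_id, 'car'])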


How to extract data from one CSV file to another one using index value

I have to filter the data, so I need to create a new CSV file based on the filters.
I am having trouble doing it, because the new file does not change after I run the code.
Below is my code. I have two csv files; Stage_3_try.csv is the one I am trying to add new data to. I used enumerate to get the index of the specific value I searched for in the other csv file.
# Projec
import csv
from csv import writer

import numpy as np
import matplotlib.pyplot as plt

# east_3, north_3, stage_3 and point_on_line are defined earlier in the project
A = np.array([316143.8829, 6188926.04])
B = np.array([314288.7418, 6190277.519])
for i in range(0, len(east_3)):
    P = []
    P.append(east_3[i])
    P.append(north_3[i])
    P = np.asarray(P)
    projected = point_on_line(P)  # a code to do the projection
    x_values = [A[0], B[0]]
    y_values = [A[1], B[1]]
    plt.plot(x_values, y_values, 'b-')
    if projected[0] > 315745.75 and projected[1] > 6188289:
        with open('Stage_3_try.csv', 'a') as f_out:
            writer = csv.writer(f_out)
            for num, row in enumerate(stage_3['UTM North NAD83']):
                if row == P[1]:
                    writer.writerow(stage_3.loc[[num][0]])
                    print(type(stage_3.loc[[num][0]]))
        plt.plot(projected[0], projected[1], 'rx')
        f_out.close()  # redundant: the with block already closed the file
    else:
        pass
PS: I updated the code, since the previous one worked, but when I added it to the loop, it stopped working
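Without the rest of the code it is hard to say for sure, but one likely culprit is the exact float comparison row == P[1]: projected coordinates rarely match stored values bit-for-bit, so no row is ever written. A minimal sketch of the inner check, assuming stage_3 is a pandas DataFrame, using numpy.isclose and opening the file once:

import csv
import numpy as np

# hypothetical rewrite: compare the northing with a tolerance
# instead of exact float equality
with open('Stage_3_try.csv', 'a', newline='') as f_out:
    writer = csv.writer(f_out)
    for num, row in enumerate(stage_3['UTM North NAD83']):
        if np.isclose(row, P[1]):
            writer.writerow(stage_3.loc[num])  # same row as stage_3.loc[[num][0]]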

When appending to a List of JSON Objects in python, why is it duplicating only the "2nd layer" objects?

Here is my code.
import random

# reference_device, simulated_devices, URL and node_red_send
# are defined elsewhere in the script
def generateNewDevices(numofdevices):
    global simulated_devices
    for x in range(numofdevices):
        new_device = reference_device.copy()
        new_device["accel"]["accx"] = random.randint(-32768, 32767)
        new_device["accel"]["accy"] = random.randint(-32768, 32767)
        new_device["accel"]["accz"] = random.randint(-32768, 32767)
        new_device["location"]["gpsla"] = random.uniform(MINLAT, MAXLAT)
        new_device["location"]["gpslo"] = random.uniform(MINLON, MAXLON)
        new_device["temp"] = random.randint(-127, 127)
        new_device["status"] = random.randint(0, 1)
        str1 = new_device["deviceName"]
        str1 = str1[:-3]
        str2 = str(x).zfill(3)
        str1 += str2
        new_device["deviceName"] = str1
        node_red_send(URL, new_device)
        print(new_device)
        simulated_devices.append(new_device)

generateNewDevices(3)
for x in range(len(simulated_devices)):
    print(simulated_devices[x])
Why, when printing through the list of values at the end, does the list show the "new device" data for the appended JSON objects "1 layer deep" (temp, status and name), but duplicate the data "2 layers deep" (accx, accy, gpsla)?
The .copy()s are in there because I was having issues with python append duplicating all the values at first. Is this some variation of the same issue? I even tried .copy()ing right before appending to the list. (I come from a c/c++ background so I do not fully understand why python does some of its things)
Any help appreciated.
Kr, apuri123.
I doubt anyone will end up here when searching for an answer, but in case you do, you are looking for "deepcopy":
import copy

original = {}  # object with as many objects within objects as you want
myCopy = copy.deepcopy(original)
Google "python deepcopy" and you should be able to find what you are looking for.

Not getting values from MySQL when more than one data frame is inserted in Python

I am trying to read an xlsx file from unread mail, convert it to a data frame, and finally insert it into a MySQL DB. To avoid duplication, before inserting each row of the data frame I check whether the data is already present in the DB; for this duplication check I go through the mails one by one.
My issue is that when two or more unread mails are present in the inbox, this duplication check fails.
import os
import email
import imaplib
import pandas as pd

# con and engine are the DB connection and SQLAlchemy engine, created elsewhere
detach_dir = os.path.dirname(os.path.abspath(__file__)) + '/attachments'
user = "abc#outlook.in"
pwd = "xyz#123*"
m = imaplib.IMAP4_SSL("outlook.office365.com")
m.login(user, pwd)
# Select the mailbox
m.select("folder name in mail")
n = 0
resp, items = m.search(None, '(UNSEEN)')
items = items[0].split()
for emailid in items:
    resp, data = m.fetch(emailid, "(RFC822)")
    email_body = data[0][1]
    mail = email.message_from_bytes(email_body)
    if mail.get_content_maintype() != 'multipart':
        continue
    # (the loop over message parts that sets `part` and `filename` is omitted here)
    att_path = os.path.join(detach_dir, filename)
    if not os.path.isfile(att_path):
        fp = open(att_path, 'wb')
        fp.write(part.get_payload(decode=True))
        fp.close()
    df_mail = pd.read_excel(att_path, skiprows=[0, 2, 3, 4, 5, 6, 7, 8, 9],
                            skip_blank_lines=True, skipfooter=1, index=False)
    df_mail = df_mail.fillna(0)
    df_mail.dropna(how="all", inplace=True)
    for i, row in df_mail.iterrows():
        sql = 'SELECT * FROM `tablename` WHERE condition for duplicate'
        existing = con.execute(sql)
        duplicate = existing.fetchall()
        if len(duplicate) == 0:
            df_mail.iloc[i:i+1].to_sql('table', con=engine, if_exists='append',
                                       chunksize=1000, index=False)
        else:
            print("duplicate data")
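Without the full attachment-handling code it is hard to be sure, but one likely culprit is the if not os.path.isfile(att_path) guard: if a second unread mail carries an attachment with the same filename, it is never written to disk, so the first file is read and checked again. A minimal sketch, assuming part and filename come from the omitted loop over message parts, that makes each saved attachment unique per mail:

# hypothetical fix: prefix the attachment name with the mail id so
# attachments from different mails never collide on disk
att_path = os.path.join(detach_dir, '{0}_{1}'.format(emailid.decode(), filename))
with open(att_path, 'wb') as fp:
    fp.write(part.get_payload(decode=True))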

Horizontal append in for loop?

I have a for loop iterating over a folder of one-column csv's using glob. It makes some adjustments, appends the results to a list, and saves to a new csv. It resembles:
import glob
import pandas as pd

data = []
infiles = glob.glob("*.csv")
for file in infiles:
    df = pd.io.parsers.read_csv(file)
    # (assorted adjustments)
    data.append(df)
fullpanel = pd.concat(panel)
fullpanel.to_csv('data.csv')
The problem is that this makes one long column; I need the columns (of differing lengths) placed next to each other.
I think you can add the parameter axis=1 to concat so the columns are placed next to each other. Also, you can change pd.io.parsers.read_csv to pd.read_csv, and panel to data in concat.
import glob
import pandas as pd

data = []
infiles = glob.glob("*.csv")
for file in infiles:
    df = pd.read_csv(file)
    # (assorted adjustments)
    data.append(df)
fullpanel = pd.concat(data, axis=1)
fullpanel.to_csv('data.csv')
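To see what axis=1 does with columns of differing lengths, a small sketch: concat aligns on the index and pads the shorter column with NaN.

import pandas as pd

a = pd.DataFrame({'a': [1, 2, 3]})
b = pd.DataFrame({'b': [10, 20]})

print(pd.concat([a, b], axis=1))
#    a     b
# 0  1  10.0
# 1  2  20.0
# 2  3   NaN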

Writing items from function to separate text files?

I'm running some web scraping, and now have a list of 911 links saved as follows (I included 5 to demonstrate how they're stored):
every_link = ['http://www.millercenter.org/president/obama/speeches/speech-4427', 'http://www.millercenter.org/president/obama/speeches/speech-4425', 'http://www.millercenter.org/president/obama/speeches/speech-4424', 'http://www.millercenter.org/president/obama/speeches/speech-4423', 'http://www.millercenter.org/president/obama/speeches/speech-4453']
These URLs link to presidential speeches over time. I want to store each individual speech (so, 911 unique speeches) in different text files, or be able to group by president. I'm trying to pass the following function on to these links:
import urllib2

from bs4 import BeautifulSoup  # assuming BeautifulSoup 4

# `punctuation` is a compiled regex defined elsewhere
def processURL(l):
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)
    item_div = item_soup.find('div', {'id': 'transcript'}, {'class': 'displaytext'})
    item_str = item_div.text.lower()
    item_str_processed = punctuation.sub('', item_str)
    item_str_processed_final = item_str_processed.replace('—', ' ')

for l in every_link:
    processURL(l)
So, I want to save the words from all the processed speeches to unique text files. This might look like the following, with obama_44xx representing individual text files:
obama_4427 = "blah blah blah"
obama_4425 = "blah blah blah"
obama_4424 = "blah blah blah"
...
I'm trying the following:
for l in every_link:
    processURL(l)
    obama.write(processURL(l))
But that's not working...
Is there another way I should go about this?
Okay, so you have a couple of issues. First of all, your processURL function doesn't actually return anything, so when you try to write the return value of the function, it's going to be None. Maybe try something like this:
def processURL(link):
    open_url = urllib2.urlopen(link).read()
    item_soup = BeautifulSoup(open_url)
    item_div = item_soup.find('div', {'id': 'transcript'}, {'class': 'displaytext'})
    item_str = item_div.text.lower()
    item_str_processed = punctuation.sub('', item_str)
    item_str_processed_final = item_str_processed.replace('—', ' ')
    splitlink = link.split("/")
    president = splitlink[4]
    speech_num = splitlink[-1].split("-")[1]
    filename = "{0}_{1}".format(president, speech_num)
    return filename, item_str_processed_final  # returning a tuple

for link in every_link:
    filename, content = processURL(link)  # yay tuple unpacking
    with open(filename, 'w') as f:
        f.write(content)
This will write each file to a filename that looks like president_number. So for example, it will write Obama's speech with id number 4427 to a file called obama_4427. Lemme know if that works!
You have to call the processURL function and have it return the text you want written. After that, you simply have to add the writing to disk code within the loop. Something like this:
def processURL(l):
    open_url = urllib2.urlopen(l).read()
    item_soup = BeautifulSoup(open_url)
    item_div = item_soup.find('div', {'id': 'transcript'}, {'class': 'displaytext'})
    item_str = item_div.text.lower()
    #item_str_processed = punctuation.sub('',item_str)
    #item_str_processed_final = item_str_processed.replace('—',' ')
    return item_str

for l in every_link:
    speech_text = processURL(l).encode('utf-8').decode('ascii', 'ignore')
    speech_num = l.split("-")[1]
    with open("obama_" + speech_num + ".txt", 'w') as f:
        f.write(speech_text)
The .encode('utf-8').decode('ascii', 'ignore') is purely for dealing with non-ascii characters in the text. Ideally you would handle them in a different way, but that depends on your needs (see Python: Convert Unicode to ASCII without errors).
Btw, the 2nd link in your list is 404. You should make sure your script can handle that.
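For example, a minimal sketch wrapping the loop above (urllib2, to match the rest of the code) that skips dead links instead of crashing:

import urllib2

for l in every_link:
    try:
        speech_text = processURL(l).encode('utf-8').decode('ascii', 'ignore')
    except urllib2.HTTPError as e:
        # e.g. the 404 mentioned above
        print('skipping {0}: HTTP {1}'.format(l, e.code))
        continue
    speech_num = l.split("-")[1]
    with open("obama_" + speech_num + ".txt", 'w') as f:
        f.write(speech_text)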