I have to update millions of rows in MySQL. I am currently using a for loop to execute queries. To make the updates faster, I want to use executemany() of Python MySQL Connector, so that I can update in batches using a single query for each batch.
I don't think mysqldb has a way of handling multiple UPDATE queries at one time.
But you can use an INSERT query with an ON DUPLICATE KEY UPDATE clause at the end.
I've written the following example for ease of use and readability.
import MySQLdb

def update_many(data_list=None, mysql_table=None):
    """
    Updates a MySQL table with the data provided. If the key is not unique, the
    data will be inserted into the table.

    The dictionaries must all have the same keys due to how the query is built.

    Param:
        data_list (List):
            A list of dictionaries where the keys are the MySQL table
            column names, and the values are the update values.
        mysql_table (String):
            The MySQL table to be updated.
    """
    # Connection and cursor
    conn = MySQLdb.connect('localhost', 'jeff', 'atwood', 'stackoverflow')
    cur = conn.cursor()

    # Build the query once, from the keys of the first dictionary
    query = ""
    values = []
    for data_dict in data_list:
        if not query:
            columns = ', '.join('`{0}`'.format(k) for k in data_dict)
            duplicates = ', '.join('{0}=VALUES({0})'.format(k) for k in data_dict)
            place_holders = ', '.join('%s' for k in data_dict)
            query = "INSERT INTO {0} ({1}) VALUES ({2})".format(mysql_table, columns, place_holders)
            query = "{0} ON DUPLICATE KEY UPDATE {1}".format(query, duplicates)
        v = list(data_dict.values())
        values.append(v)
    try:
        cur.executemany(query, values)
    except MySQLdb.Error as e:
        try:
            print("MySQL Error [%d]: %s" % (e.args[0], e.args[1]))
        except IndexError:
            print("MySQL Error: %s" % str(e))
        conn.rollback()
        return False
    conn.commit()
    cur.close()
    conn.close()
Explanation of the one-liners
columns = ', '.join('`{0}`'.format(k) for k in data_dict)

is the same as

column_list = []
for k in data_dict:
    column_list.append('`{0}`'.format(k))
columns = ', '.join(column_list)
Here's an example of usage
test_data_list = []
test_data_list.append({'id': 1, 'name': 'Marco', 'articles': 1})
test_data_list.append({'id': 2, 'name': 'Keshaw', 'articles': 8})
test_data_list.append({'id': 3, 'name': 'Wes', 'articles': 0})

update_many(data_list=test_data_list, mysql_table='writers')
Query output
INSERT INTO writers (`id`, `name`, `articles`) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE id=VALUES(id), name=VALUES(name), articles=VALUES(articles)
Values output
[[1, 'Marco', 1], [2, 'Keshaw', 8], [3, 'Wes', 0]]
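Since the original question involves millions of rows, one refinement (a sketch, not part of the original answer; the batch size is arbitrary) is to replace the single executemany() call inside update_many() with chunked batches, so each round trip and commit stays a manageable size:

CHUNK_SIZE = 10000  # arbitrary batch size; tune for your data and server

# inside update_many(), instead of one big executemany():
for start in range(0, len(values), CHUNK_SIZE):
    cur.executemany(query, values[start:start + CHUNK_SIZE])
    conn.commit()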
Maybe this can help
How to update multiple rows with single MySQL query in python?
cur.executemany("UPDATE Writers SET Name = %s WHERE Id = %s ",
[("new_value" , "3"),("new_value" , "6")])
conn.commit()
I want to insert values into a MySQL table in Python. Below is the code that extracts a MongoDB collection's data and inserts it into a MySQL table.
from pymongo import MongoClient
import mysql.connector

def insert():
    client = MongoClient('mongodb://localhost:27017')
    db = client['mydb']                      # database
    coll = db['data']                        # collection
    mongo_docs = coll.find({}, {'_id': 0})   # mongo cursor, '_id' removed in projection
    fieldnames = list(mongo_docs[0].keys())  # all the column names in the dataset
    for record in mongo_docs:
        values = list(record.values())       # all the values in the dataset
        #print(values)
    connection = mysql.connector.connect(host="localhost", user="root", database="mydb", password="passwd")
    cursor1 = connection.cursor()
    connection.commit()  # mysql connection
    count = 0
    for i in fieldnames:
        count = count + 1
    qmark = []
    a = 0
    while a < count:
        qmark.append('%s')
        a = a + 1
    q = ','.join(tuple(qmark))
    query = "INSERT INTO ndata VALUES ('%s')" % (q)
    cursor1.executemany("INSERT INTO ndata VALUES (%s)" % (q), (values))
This code throws an error:
ProgrammingError: Could not process parameters: int(82506), it must be of type list, tuple or dict
The values in the dataset are like this:
[82506, '1945-12-31', 0, '', 29.44444444, 17.22222222, 23.33333333, 0, '', 45, 12, 31, 0, '', '', 85, 63, 74, 0, '', '', '', '', '', '', '', '', '', '', '', '']
which has empty strings inside it.
q in the code generates as many %s placeholders as there are columns in the dataset. Here the dataset has 31 columns, so q contains 31 placeholders (%s,%s,%s,...).
The same code, when executed with

cursor.execute("INSERT INTO ndata VALUES (%s)" % (q), (values))

in place of cursor.executemany(), runs without any errors, but it does not insert any values into the MySQL table.
What changes should I make to insert multiple rows of values at once? Or how could I insert them row by row?
I can't test it, but I think you create values the wrong way.
If it works with execute(), then values holds only one row of data, but executemany() expects a list of many rows.
You should create the list values = [] before the for-loop, and use values.append(...) instead of values = ..., so each new row is added to the list instead of overwriting the previous one.
# --- before loop ---
values = []

# --- loop ---
for record in mongo_docs:
    row = list(record.values())
    values.append(row)

# --- after loop ---
print(values)
BTW: two of those steps can be written more briefly. Instead of counting the fields in a loop:

count = len(fieldnames)

and instead of building the placeholder list in a while loop:

qmark = ['%s'] * count
q = ','.join(qmark)
def insert():
    client = MongoClient('mongodb://localhost:27017')  # PEP8: spaces around `=`
    db = client['mydb']
    collection = db['data']
    mongo_docs = collection.find({}, {'_id': 0})
    fieldnames = list(mongo_docs[0].keys())

    values = []
    for record in mongo_docs:
        row = list(record.values())
        values.append(row)

    connection = mysql.connector.connect(host="localhost", user="root", database="mydb", password="passwd")  # PEP8: spaces after `,`
    cursor = connection.cursor()

    count = len(fieldnames)
    qmark = ['%s'] * count
    q = ','.join(qmark)  # no need for `tuple()`

    query = f"INSERT INTO ndata VALUES ({q})"  # modern f-string instead of the very old `%` formatting
    cursor.executemany(query, values)
    connection.commit()  # after `executemany('INSERT ...')`
PEP 8 -- Style Guide for Python Code
I have a data frame in pyspark like below
df = spark.createDataFrame(
    [
        ('2021-10-01', 'A', 25),
        ('2021-10-02', 'B', 24),
        ('2021-10-03', 'C', 20),
        ('2021-10-04', 'D', 21),
        ('2021-10-05', 'E', 20),
        ('2021-10-06', 'F', 22),
        ('2021-10-07', 'G', 23),
        ('2021-10-08', 'H', 24),
    ],
    ("RUN_DATE", "NAME", "VALUE"),
)
Now, using this data frame, I want to update a table in MySQL.
# query to run should be similar to this
update_query = "UPDATE DB.TABLE SET DATE = '2021-10-01', VALUE = 25 WHERE NAME = 'A'"
# mysql_conn is a function which I use to connect to `MySql` from `pyspark` and run queries
# Invoking the function
mysql_conn(host, user_name, password, update_query)
Now when I invoke the mysql_conn function with these parameters, the query runs successfully and the record gets updated in the MySQL table.
Now I want to run the update statement for all the records in the data frame.
For each NAME, it has to pick the RUN_DATE and VALUE, substitute them into update_query, and trigger mysql_conn.
I think we need a for loop, but I am not sure how to proceed.
Instead of iterating through the dataframe with a for loop, it would be better to distribute the workload across the partitions using foreachPartition. Moreover, since you are building a custom query anyway, it is more efficient to execute one batch operation per partition rather than one query per record, to reduce round trips, latency, and concurrent connections. E.g.:
def update_db(rows):
    temp_table_query = ""
    for row in rows:
        if len(temp_table_query) > 0:
            temp_table_query = temp_table_query + " UNION ALL "
        temp_table_query = temp_table_query + " SELECT '%s' as RUNDATE, '%s' as NAME, %d as VALUE " % (row.RUN_DATE, row.NAME, row.VALUE)
    update_query = """
        UPDATE DBTABLE
        INNER JOIN (
            %s
        ) new_records ON DBTABLE.NAME = new_records.NAME
        SET
            DBTABLE.DATE = new_records.RUNDATE,
            DBTABLE.VALUE = new_records.VALUE
    """ % (temp_table_query)
    mysql_conn(host, user_name, password, update_query)

df.foreachPartition(update_db)
View Demo on how the UPDATE query works
Let me know if this works for you.
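If you can open a connection directly inside the partition function instead of going through mysql_conn, a parameterized executemany() avoids interpolating values into the SQL string. A minimal sketch, assuming mysql.connector is available on the workers and the credentials are in scope:

import mysql.connector

def update_db_parameterized(rows):
    # one connection and one batched statement per partition
    conn = mysql.connector.connect(host=host, user=user_name,
                                   password=password, database="DB")
    cur = conn.cursor()
    cur.executemany(
        "UPDATE DBTABLE SET DATE = %s, VALUE = %s WHERE NAME = %s",
        [(row.RUN_DATE, row.VALUE, row.NAME) for row in rows]
    )
    conn.commit()
    cur.close()
    conn.close()

df.foreachPartition(update_db_parameterized)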
When I update data in a column called 'mine', where all rows are NULL, in MySQL with Python, I want to exit the while loop once there are no more NULL values. What condition should I add to the query statement below? The data currently updates normally, but once all the data has been updated, nothing indicates that the process is finished.
import pymysql

conn = pymysql.connect(
    user='root',
    passwd='*',
    host='',
    db='practice',
    charset='utf8')
curs = conn.cursor()

num = 0
while num >= 0:
    num += 1
    sql = "update zipcode set mine = %s where mine is null limit 1"
    data = (num,)
    curs.execute(sql, data)
    conn.commit()
conn.close()
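One condition that works here is pymysql's cursor.rowcount: when the UPDATE matches no row, there are no NULL values left and the loop can stop. A minimal sketch of the loop rewritten that way (the printed message is illustrative):

num = 0
while True:
    num += 1
    curs.execute("update zipcode set mine = %s where mine is null limit 1", (num,))
    if curs.rowcount == 0:  # no row matched: every `mine` value is filled in
        print("finished: no more null values")
        break
    conn.commit()
conn.close()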
I am trying to insert data from my array into MySQL.
To my big surprise, there were not many examples of how to do this when you build the array with a for loop; every example I found started from an already existing list.
Thanks to Adrian below, we noticed that I need tuples for my list.
Updated code
import re
import mysql.connector

connection = mysql.connector.connect(
    host='localhost',
    database='test',
    user='root',
    password='pass'
)

query = "INSERT INTO blue (created, published, publisher) VALUES (%s, %s, %s)"
array = []

# The idea here is to get all table rows in the page so you can group
# the values into rows that are going to be added to MySQL
tr = soup.find_all('tr')  # `soup` is an existing BeautifulSoup page object
for table_row in tr:
    row_data = table_row.find_all('td')
    insert_row = []
    for data in row_data:
        data = re.sub('<[^>]*>', '', str(data))
        insert_row.append(data)
    array.append(tuple(insert_row))
print(array)

cursor = connection.cursor()
cursor.executemany(query, array)
connection.commit()
Getting close, but at the moment I receive the following errors:
IndexError: Tuple index out of range
mysql.connector.errors.ProgrammingError: Not enough parameters for the SQL statement
Thanks in advance!
I think you are mixing two ways of solving the problem...
One way is using the executemany method as described in the documentation
query = "INSERT INTO blues (created, published, publisher) VALUES (%s, %s, %s)"
array = []
# The idea here is to get all table rows in the page so you
# can group the values into rows that are going to be added to MySQL
tr = soup.find_all('tr')
for table_row in tr:
row_data = table_row.find_all('td')
insert_row = [None, None, None]
for idx in range(len(row_data)):
if row_data[idx] and idx < 3:
data = re.sub('<[^>]*>', '', str(row_data[idx]))
if data:
insert_row[idx] = data
array.append(tuple(insert_row))
cursor = connection.cursor()
cursor.executemany(query, array)
cursor.commit()
Another way is to build the query yourself...
query = "INSERT INTO blues (created, published, publisher) VALUES "
array = []
# The idea here is to get all table rows in the page so you can group the values into rows that are going to be added to MySQL
tr = soup.find_all('tr')
for table_row in tr:
row_data = table_row.find_all('td')
insert_row = []
for data in row_data:
data = re.sub('<[^>]*>', '', str(data))
insert_row.append(data)
array.append(tuple(insert_row))
values = []
for item in array:
row = [None, None, None]
for idx in range(len(item)):
row[idx] = item[idx]
values.append(str(tuple(row)))
query += ",".join(values)
cursor = connection.cursor()
cursor.execute(query)
cursor.commit()
Hope this helps...
I have written a function that uses MySQL Connector to insert data into a table in MySQL.
def insert_row(data, table, conn):
    """Insert new row of data received."""
    cursor = conn.cursor()
    query = ("INSERT INTO " + table + " "
             "(temp, humidity)"
             " VALUES (%(temp)s, %(humidity)s)")
    print(query)
    cursor.execute(query, data)
    conn.commit()
    cursor.close()
However, now I want to modify it so that it builds the query dynamically, based on the table and the columns present in the data.
The data argument is a dict object.
Currently the statement that gets constructed is INSERT INTO particle_photon (temp, humidity) VALUES (%(temp)s, %(humidity)s), but a different table may have different columns coming in through the data dict.
I figured it out myself:
def insert_row(data, table, conn):
    """Insert new row of data received."""
    cursor = conn.cursor()
    placeholder = ", ".join(["%s"] * len(data))
    stmt = "insert into `{table}` ({columns}) values ({values});".format(
        table=table,
        columns=",".join(data.keys()),
        values=placeholder
    )
    cursor.execute(stmt, list(data.values()))
    conn.commit()
    cursor.close()
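For example (the table and column names here are illustrative, not from the original post):

row = {'temp': 21.5, 'humidity': 40.2, 'pressure': 1013.0}
insert_row(row, 'sensor_readings', conn)
# builds: insert into `sensor_readings` (temp,humidity,pressure) values (%s, %s, %s);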
Alternatively, you can remove the column names and the parentheses around them from your prepared query, and include them in your table parameter instead. The table argument would then be equal to particle_photon (temp, humidity) instead of just particle_photon.
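A sketch of that suggestion (note the named placeholders are still tied to specific keys, so this relocates the column list rather than making the query fully dynamic):

def insert_row(data, table, conn):
    cursor = conn.cursor()
    # `table` now carries the column list as well
    query = "INSERT INTO " + table + " VALUES (%(temp)s, %(humidity)s)"
    cursor.execute(query, data)
    conn.commit()
    cursor.close()

insert_row({'temp': 22.1, 'humidity': 38.5},
           'particle_photon (temp, humidity)', conn)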