generate queries for each key in pyspark data frame - mysql

I have a data frame in pyspark like below
df = spark.createDataFrame(
[
('2021-10-01','A',25),
('2021-10-02','B',24),
('2021-10-03','C',20),
('2021-10-04','D',21),
('2021-10-05','E',20),
('2021-10-06','F',22),
('2021-10-07','G',23),
('2021-10-08','H',24)],("RUN_DATE", "NAME", "VALUE"))
Now using this data frame I want to update a table in MySql
# query to run should be similar to this
update_query = "UPDATE DB.TABLE SET DATE = '2021-10-01', VALUE = 25 WHERE NAME = 'A'"
# mysql_conn is a function which I use to connect to `MySql` from `pyspark` and run queries
# Invoking the function
mysql_conn(host, user_name, password, update_query)
Now when I invoke the mysql_conn function by passing parameters the query runs successfully and the record gets updated in the MySql table.
Now I want to run the update statement for all the records in the data frame.
For each NAME it has to pick the RUN_DATE and VALUE and replace in update_query and trigger the mysql_conn.
I think we need to a for loop but not sure how to proceed.

Instead of iterating through the dataframe with a for loop, it would be better to distribute the workload across each partitions using foreachPartition. Moreover, since you are writing a custom query instead of executing one query for each query, it would be more efficient to execute a batch operation to reduce the round trips, latency and concurrent connections. Eg
def update_db(rows):
temp_table_query=""
for row in rows:
if len(temp_table_query) > 0:
temp_table_query = temp_table_query + " UNION ALL "
temp_table_query = temp_table_query + " SELECT '%s' as RUNDATE, '%s' as NAME, %d as VALUE " % (row.RUN_DATE,row.NAME,row.VALUE)
update_query="""
UPDATE DBTABLE
INNER JOIN (
%s
) new_records ON DBTABLE.NAME = new_records.NAME
SET
DBTABLE.DATE = new_records.RUNDATE,
DBTABLE.VALUE = new_records.VALUE
""" % (temp_table_query)
mysql_conn(host, user_name, password, update_query)
df.foreachPartition(update_db)
View Demo on how the UPDATE query works
Let me know if this works for you.

Related

Update and insert from qtableWidget into MYSQL database more efficiently

I'm building a desktop app with PyQt5 to connect with, load data from, insert data into and update a MySQL database. What I came up with to update the database and insert data into the database works. But I feel there should be a much faster way to do it in terms of computation speed. If anyone could help that would be really helpful. What I have as of now for updating the database is this -
def log_change(self, item):
self.changed_items.append([item.row(),item.column()])
# I connect this function to the item changed signal to log any cells which have been changed
def update_db(self):
# Creating an empty list to remove the duplicated cells from the initial list
self.changed_items_load= []
[self.changed_items_load.append(x) for x in self.changed_items if x not in self.changed_items_load]
# loop through the changed_items list and remove cells with no values in them
for db_wa in self.changed_items_load:
if self.tableWidget.item(db_wa[0],db_wa[1]).text() == "":
self.changed_items_load.remove(db_wa)
try:
mycursor = mydb.cursor()
# loop through the list and update the database cell by cell
for ecr in self.changed_items_load:
command = ("update table1 set `{col_name}` = %s where id=%s;")
# table widget column name matches db table column name
data = (str(self.tableWidget.item(ecr[0],ecr[1]).text()),int(self.tableWidget.item(ecr[0],0).text()))
mycursor.execute(command.format(col_name = self.col_names[ecr[1]]),data)
# self.col_names is a list of the tableWidget columns
mydb.commit()
mycursor.close()
except OperationalError:
Msgbox = QMessageBox()
Msgbox.setText("Error! Connection to database lost!")
Msgbox.exec()
except NameError:
Msgbox = QMessageBox()
Msgbox.setText("Error! Connect to database!")
Msgbox.exec()
For inserting data and new rows into the db I was able to find some info online about that. But I have been unable to insert multiple lines at once as well as insert varying column length for each row. Like if I want to insert only 2 columns at row 1, and then 3 columns at row 2... something like that.
def insert_db(self):
# creating a list of each column
self.a = [self.tableWidget.item(row,1).text() for row in range (self.tableWidget.rowCount()) if self.tableWidget.item(row,1) != None]
self.b = [self.tableWidget.item(row,2).text() for row in range (self.tableWidget.rowCount()) if self.tableWidget.item(row,2) != None]
self.c = [self.tableWidget.item(row,3).text() for row in range (self.tableWidget.rowCount()) if self.tableWidget.item(row,3) != None]
self.d = [self.tableWidget.item(row,4).text() for row in range (self.tableWidget.rowCount()) if self.tableWidget.item(row,4) != None]
try:
mycursor = mydb.cursor()
mycursor.execute("INSERT INTO table1(Name, Date, Quantity, Comments) VALUES ('%s', '%s', '%s', '%s')" %(''.join(self.a),
''.join(self.b),
''.join(self.c),
''.join(self.d)))
mydb.commit()
mycursor.close()
except OperationalError:
Msgbox = QMessageBox()
Msgbox.setText("Error! Connection to database lost!")
Msgbox.exec()
except NameError:
Msgbox = QMessageBox()
Msgbox.setText("Error! Connect to database!")
Msgbox.exec()
Help would be appreciated. Thanks.
Like if I want to insert only 2 columns at row 1, and then 3 columns at row 2
No. A given Database table has a specific number of columns. That is an integral part of the definition of a "table".
INSERT adds new rows to a table. It is possible to construct a single SQL statement that inserts multiple rows "all at once".
UPDATE modifies one or more rows of a table. The rows are indicated by some condition specified in the Update statement.
Constructing SQL with %s is risky -- it gets in trouble if there are quotes in the string being inserted.
(I hope these comments help you get to the next stage of understanding databases.)

How to reduce the number of database hits while querying in Django

I have three tables
User
Device
Log
I want to filter the logs based on devices and logs. I'm using the following querying which iterates over the users and devices in order to get the logs. I feel this will become a performance hit. How to reduce the number of database hits?
for user_obj in User.objects.all():
device_qs = Device.objects.filter(user=user_obj)
if device_qs.exists():
for device_obj in device_qs:
log_count = Log.objects.filter(user=user_obj, device=device_obj, created_at__range(from_date, to_date)).count()
If you only need the log count per user and device (which is what you get from the code you posted), you can get that in just one query:
from django.db.models import Count
logs = (Log.objects
.filter(created_at__range = (from_date, to_date))
.values('user', 'device')
.annotate(log_count=Count('device'))
)
You can modify the query to include any attributes of the user and device models that you need:
.values('user__last_name', 'device__name') # etc.
You can also order the dataset by appending order_by() at the end to be able to iterate over it in the desired order:
.order_by('user__last_name', '-log_count')
What I would do is create a "proxy model" that references a view in your MySQL instance
The view would look like this:
SELECT
t1.*,
t2.*,
t3.*
FROM users t1
RIGHT JOIN device t2 (ON t1.id=t2.user_id)
RIGHT JOIN log t3 (ON t3.device_id=t2.id);
Now to create a proxy model, do this:
class SomeModel(models.Model):
# all fields from the 3 tables here
class Meta:
db_table = 'yourViewNameHere'
managed = False # this keeps django from creating the table
then python manage.py makemigrations + python manage.py migrate as usual
Now, to access the the data you need, you would do something like this:
from django.db import connection
sql = "SELECT * FROM your_view WHERE some_date_column > 'foo' AND some_date_column < 'bar' "
with connection.cursor() as cur:
cur.execute(sql)
data = cur.fetchall()
print(data)
Note that if you are passing parameters to the raw sql query, you should always pass them like this to avoid sql injection:
sql = "SELECT * FROM your_view WHERE some_date_column > %s AND some_date_column < %s"
params = ('foo', 'bar')
with connection.cursor() as cur:
cur.execute(sql, params)
data = cur.fetchall()

Making thousands of SELECT queries faster

Situation
Working with Python 3.7.2
I have read previlege of a MariaDB table with 5M rows on a server.
I have a local text file with 7K integers, one per line.
The integers represent IDXs of the table.
The IDX column of the table is the primary key. (so I suppose it is automatically indexed?)
Problem
I need to select all the rows whose IDX is in the text file.
My effort
Version 1
Make 7K queries, one for each line in the text file. This makes approximately 130 queries per second, costing about 1 minute to complete.
import pymysql
connection = pymysql.connect(....)
with connection.cursor() as cursor:
query = (
"SELECT *"
" FROM TABLE1"
" WHERE IDX = %(idx)s;"
)
all_selected = {}
with open("idx_list.txt", "r") as f:
for idx in f:
idx = idx.strip()
if idx:
idx = int(idx)
parameters = {"idx": idx}
cursor.execute(query, parameters)
result = cursor.fetchall()[0]
all_selected[idx] = result
Version 2
Select the whole table, iterate over the cursor and cherry-pick rows. The for-loop over .fetchall_unbuffered() covers 30-40K rows per second, and the whole script costs about 3 minutes to complete.
import pymysql
connection = pymysql.connect(....)
with connection.cursor() as cursor:
query = "SELECT * FROM TABLE1"
set_of_idx = set()
with open("idx_list.txt", "r") as f:
for line in f:
if line.strip():
line = int(line.strip())
set_of_idx.add(line)
all_selected = {}
cursor.execute(query)
for row in cursor.fetchall_unbuffered():
if row[0] in set_of_idx:
all_selected[row[0]] = row[1:]
Expected behavior
I need to select faster, because the number of IDXs in the text file will grow as big as 10K-100K in the future.
I consulted other answers including this, but I can't make use of it since I only have read previlege, thus impossible to create another table to join with.
So how can I make the selection faster?
A temporary table implementation would look like:
connection = pymysql.connect(....,local_infile=True)
with connection.cursor() as cursor:
cursor.execute("CREATE TEMPORARY TABLE R (IDX INT PRIMARY KEY)")
cursor.execute("LOAD DATA LOCAL INFILE 'idx_list.txt' INTO R")
cursor.execute("SELECT TABLE1.* FROM TABLE1 JOIN R USING ( IDX )")
..
cursor.execute("DROP TEMPORARY TABLE R")
Thanks to the hint (or more than a hint) from #danblack, I was able to achieve the desired result with the following query.
query = (
"SELECT *"
" FROM TABLE1"
" INNER JOIN R"
" ON R.IDX = TABLE1.IDX;"
)
cursor.execute(query)
danblack's SELECT statement didn't work for me, raising an error:
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MariaDB server version for the right syntax to use near 'IDX' at line 1")
This is probably because of MariaDB's join syntax, so I consulted the MariaDB documentation on joining tables.
And now it selects 7K rows in 0.9 seconds.
Leaving here as an answer just for the sake of completeness, and for future readers.

Iterating over records in one table and adding them to another with calculations

I'm currently using PHP and MySQL to retrieve a set of 100,000 records in a table, then iterate over each of those records to do some calculations and then insert the result into another table. I'm wondering if I'd be able to do this in pure SQL and make the query run faster.
Here's what I"m currently using:
$stmt= $pdo->query("
SELECT Well_Permit_Num
, Gas_Quantity
, Gas_Production_Days
FROM DEP_OG_Production_Import
ORDER
BY id ASC
");
foreach ($stmt as $row) {
$data = array('well_id' => $row['Well_Permit_Num'],
'gas_quantity' => $row['Gas_Quantity'],
'gas_days' => $row['Gas_Production_Days'],
'gas_average' => ($row['Gas_Production_Days']);
$updateTot = $pdo->prepare("INSERT INTO DEP_OG_TOTALS
(Well_Permit_Num,
Total_Gas,
Total_Gas_Days,
Total_Gas_Avg)
VALUES (:well_id,
:gas_quantity,
:gas_days,
:gas_average)
ON DUPLICATE KEY UPDATE
Total_Gas = Total_Gas + VALUES(Total_Gas),
Total_Gas_Days = Total_Gas_Days + VALUES(Total_Gas_Days),
Total_Gas_Avg =(Total_Gas + VALUES(Total_Gas)) / (Total_Gas_Days + VALUES(Total_Gas_Days))");
}
I'd like to see if this can be done in pure MySQL instead of having to use PHP just for the fact of using it to hold the variables.
My Result should be 1 record that is a running total for each Well. The source table may house 60-70 records for the same well, but over a few thousand different Wells.
It's a constant import process that has to be run, so it's not like there is a final table which you can just do SUM(Gas_Quantity)... etc.. on
As commented by Uueerdo, you seem to need an INSERT ... SELECT query. The role of such query is to INSERT insert the resultset returned by an inner SELECT. The inner select is an aggregate query that computes the total sum of gas and days for each well.
INSERT INTO DEP_OG_TOTALS (Well_Permit_Num, Total_Gas, Total_Gas_Days, Total_Gas_Avg)
SELECT
t.Well_Permit_Num,
SUM(t.Gas_Quantity) Total_Gas,
SUM(t.Gas_Production_Days) Total_Gas_Days
FROM DEP_OG_Production_Import t
GROUP BY t.Well_Permit_Num
ON DUPLICATE KEY UPDATE
Total_Gas = Total_Gas + t.Total_Gas,
Total_Gas_Days = Total_Gas_Days + t.Total_Gas_Days,
Total_Gas_Avg =(Total_Gas + t.Total_Gas) / (Total_Gas_Days + t.Total_Gas_Days)

SQLAlchemy MySQL IF Statement

I'm in the middle of converting an old legacy PHP system to Flask + SQLAlchemy and was wondering how I would construct the following:
I have a model:
class Invoice(db.Model):
paidtodate = db.Column(DECIMAL(10,2))
fullinvoiceamount = db.Column(DECIMAL(10,2))
invoiceamount = db.Column(DECIMAL(10,2))
invoicetype = db.Column(db.String(10))
acis_cost = db.Column(DECIMAL(10,2))
The query I need to run is:
SELECT COUNT(*) AS the_count, sum(if(paidtodate>0,paidtodate,if(invoicetype='CPCN' or invoicetype='CPON' or invoicetype='CBCN' or invoicetype='CBON' or invoicetype='CPUB' or invoicetype='CPGU' or invoicetype='CPSO',invoiceamount,
fullinvoiceamount))) AS amount,
SUM(acis_cost) AS cost, (SUM(if(paidtodate>0,paidtodate,invoiceamount))-SUM(acis_cost)) AS profit FROM tblclientinvoices
Is there an SQLAlchemyish way to construct this query? - I've tried googling for Mysql IF statments with SQlAlchemy but drew blanks.
Many thanks!
Use func(documentation) to generate SQL function expression:
qry = select([
func.count().label("the_count"),
func.sum(func.IF(
Invoice.paidtodate>0,
Invoice.paidtodate,
# #note: I prefer using IN instead of multiple OR statements
func.IF(Invoice.invoicetype.in_(
("CPCN", "CPON", "CBCN", "CBON", "CPUB", "CPGU", "CPSO",)
),
Invoice.invoiceamount,
Invoice.fullinvoiceamount)
)
).label("amount"),
func.sum(Invoice.acis_cost).label("Cost"),
(func.sum(func.IF(
Invoice.paidtodate>0,
Invoice.paidtodate,
Invoice.invoiceamount
))
- func.sum(Invoice.acis_cost)
).label("Profit"),
],
)
rows = session.query(qry).all()
for row in rows:
print row