I have a json file that's about 50 gigs I have done some processing on it and had to stop it mid way, I was on the 16000000th entry I can't afford to start the processing from the start is there any way I can get the above mentioned row and start from there? I was trying to get the last entry on my DB say comment_id search for that in the and start the processing from there but have been unable to do so efficiently.
UPDATE So I got a while loop in there but the problem is even with a while loop the insertion process is still taking place. how do i stop it from executing until the said while loop condition is met.
import sqlite3
import json
from datetime import datetime
import time
timeframe = '2017-10'
sql_transaction = []
start_row = 0
cleanup = 1000000
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()
def create_table():
c.execute("""CREATE TABLE IF NOT EXISTS
parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT,
score INT)""")
def format_data(data):
data = data.replace('\n',' newlinechar ').replace('\r',' newlinechar ').replace('"',"'")
return data
def transaction_bldr(sql):
global sql_transaction
sql_transaction.append(sql)
if len(sql_transaction) > 1000:
c.execute('BEGIN TRANSACTION')
for s in sql_transaction:
try:
c.execute(s)
except:
pass
connection.commit()
sql_transaction = []
def sql_insert_replace_comment(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """UPDATE parent_reply SET parent_id = ?, comment_id = ?, parent = ?, comment = ?, subreddit = ?, unix = ?, score = ?
WHERE parent_id =?;""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',str(e))
def sql_insert_has_parent(commentid,parentid,parent,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score)
VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',str(e))
def sql_insert_no_parent(commentid,parentid,comment,subreddit,time,score):
try:
sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score)
VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
transaction_bldr(sql)
except Exception as e:
print('s0 insertion',str(e))
def acceptable(data):
if len(data.split(' ')) > 1000 or len(data) < 1:
return False
elif len(data) > 32000:
return False
elif data == '[deleted]':
return False
elif data == '[removed]':
return False
else:
return True
def find_parent(pid):
try:
sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
def find_existing_score(pid):
try:
sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
c.execute(sql)
result = c.fetchone()
if result != None:
return result[0]
else: return False
except Exception as e:
#print(str(e))
return False
if __name__ == '__main__':
create_table()
row_counter = 0
paired_rows = 0
with open('F:\pydata\RC_2017-10', buffering=1000) as f:
for row in f:
#print(row)
#time.sleep(555)
row_counter += 1
if row_counter > start_row:
try:
row = json.loads(row)
#print(row)
parent_id = row['parent_id'].split('_')[1]
body = format_data(row['body'])
created_utc = row['created_utc']
score = row['score']
comment_id = row['id']
subreddit = row['subreddit']
parent_data = find_parent(parent_id)
existing_comment_score = find_existing_score(parent_id)
while True:
#print('dasfdjasf;ldjasfdasflj')
if (comment_id == 'dnqil6j'):
print('true')
#print('asdfadsfasdf')
break
if existing_comment_score:
print('score')
#print('going for score bro')
if score > existing_comment_score:
if acceptable(body):
#print('inserting')
sql_insert_replace_comment(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
else:
if acceptable(body):
if parent_data:
if score >= 2:
sql_insert_has_parent(comment_id,parent_id,parent_data,body,subreddit,created_utc,score)
paired_rows += 1
else:
sql_insert_no_parent(comment_id,parent_id,body,subreddit,created_utc,score)
except Exception as e:
print(str(e))
if row_counter % 100000 == 0:
print('checking from here')
print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
if row_counter > start_row:
if row_counter % cleanup == 0:
print("Cleanin up!")
sql = "DELETE FROM parent_reply WHERE parent IS NULL"
c.execute(sql)
connection.commit()
c.execute("VACUUM")
connection.commit()
Related
its a pgm for inserting values to table, but it shows an error while running "mysql.connector.errors.DatabaseError: 1366 (HY000): Incorrect integer value: 'sstrength' for column 'strength' at row 1".if i change the "%d" to '%s' it will generate other error "not all parameters were used in the sql statement"
import mysql.connector
mydb = mysql.connector.connect(
host="localhost",
user="user",
password="password",
database="db",
)
mycursor = mydb.cursor()
def create_table(sql: str,mydb):
mycursor = mydb.cursor()
mycursor.execute(sql)
def insert_to_students(new_student,mydb):
sql = ("INSERT INTO students (name, roll_no,reg_no,class_id) VALUES (%s,%d,%d,%d)")
mydb.commit()
mycursor.execute(sql,new_student)
def insert_to_classes(new_classes,mydb):
sql = ("INSERT INTO classes (name,strength) VALUES (%s,%s)")
mydb.commit()
mycursor.execute(sql,new_classes)
def create_classes_students_table(mydb):
# create class table
sql ="CREATE TABLE classes (id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(20),strength INT)"
create_table(sql,mydb)
# create student table
sql ="CREATE TABLE students (id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(100),roll_no INT,reg_no VARCHAR(10),class_id INT,FOREIGN KEY (id) REFERENCES classes(id))"
create_table(sql,mydb)
#create_classes_students_table(mydb)
#Heading of menu-driven approach
print("\nWELCOME TO CLASSES AND STUDENTS TABLE")
#using the while loop to print menu list
while True:
print("\nMENU")
print("1. Insert data to students table")
print("2. Insert data to classes table")
print("3. List both classes and students table")
print("4. Update students table using name")
print("5. Exit")
table_choice = int(input("\nEnter your choice : "))
if table_choice == 1:
while True:
print("\n1. Insert data to students table")
print("2. Exit")
choice1 = int(input("\nInsert data to students table: "))
#Calling the relevant method based on users choice using if-else loop
if choice1 == 1:
sname = input("Enter Name of Student: ")
sroll_no = int(input("Enter Roll num of Student: "))
sreg_no = int(input("Enter Reg num of Student: "))
sclass_id= int(input("Enter class id of Student: "))
new_student_dict = ("sname","sroll_no","sreg_no","sclass_id")
insert_to_students(new_student_dict,mydb)
elif choice1 == 2:
break
else:
print("Incorrect Choice!")
elif table_choice == 2:
while True:
print("\n1. Insert data to classes table")
print("2. Exit")
choice1 = int(input("\nInsert data to students table: "))
if choice1 == 1:
scname = input("Enter Name of Student: ")
sstrength = int(input("Enter strength of the classes: "))
new_classes_dict = ("scname","sstrength")
insert_to_classes(new_classes_dict,mydb)
elif choice1 == 2:
break
else:
print("Incorrect Choice!")
else:
print("Incorrect Choice. Please, try again.")
break
The variables logged_in_before and already_set_date are Null in the database but the if statement still executes and prints "date already set" when it shouldn't be. I've tried so many combinations using is None, == None, == "None", == "Null" etc but it's not working.
The values are stored as VARCHAR(255) in the database and are returning a tuple of (None,) when I print them out.
What am I missing, I assume it's something to do with the values being strings rather than objects set to None?
def login(email, password):
mydb = mysql.connector.connect(host="localhost", user="root", passwd="", database="")
cur = mydb.cursor(buffered=True)
cur.execute("SELECT * FROM users WHERE email = %s and password = %s", (email, password))
result = cur.fetchone()
mydb.commit()
if result:
cur.execute("SELECT logged_in_before FROM users WHERE email = %s", (email,))
logged_in_before = cur.fetchone()
mydb.commit()
if logged_in_before:
cur.execute("SELECT date_of_visit FROM users WHERE email = %s", (email,))
already_set_date = cur.fetchone()
mydb.commit()
if already_set_date:
print("date already set")
else:
print("date not set")
else:
cur.execute("UPDATE users SET logged_in_before = 'yes' WHERE email = %s",
(email,))
mydb.commit()
print("Now logged in")
else:
invalid_login()
cur.close()
mydb.close()
When dealing with NULL values, you would use IS NULL instead of = NULL. You can also use IS NOT NULL.
I'm writing a code to generate n-grams for every record in the table by reading a specific column.
def extract_from_db(inp_cust_id):
sql_db = TatDBHelper()
t_sql = "select notes from raw_data where customer_id = {0}"
db_data = sql_db.execute_read(t_sql.format(inp_cust_id))
for row in db_data:
text = row.values()
bi_grams = generate_ngrams(text[0].encode("utf-8"), 2)
print bi_grams
def generate_ngrams(sentence, n):
sentence = sentence.lower()
sentence = re.sub(r'[^a-zA-Z0-9\s]', ' ', sentence)
tokens = [token for token in sentence.split(" ") if token != ""]
ngrams = zip(*[tokens[i:] for i in range(n)])
return [" ".join(ngram) for ngram in ngrams]
I'm getting the output like:
['i highly', 'highly recommend', 'recommend it']
['the penguin', 'penguin encounter', 'encounter was', 'was awesome']
I want the output to look like below, can anybody help me to get this.
['i highly',
'highly recommend',
'recommend it',
...
]
creat another list all_ngrams, and keep appending the values to it , using .extend(), and finally you will have all the ngrams in one list.
Try this :
def extract_from_db(inp_cust_id):
sql_db = TatDBHelper()
t_sql = "select notes from raw_data where customer_id = {0}"
db_data = sql_db.execute_read(t_sql.format(inp_cust_id))
all_ngrams = []
for row in db_data:
text = row.values()
bi_grams = generate_ngrams(text[0].encode("utf-8"), 2)
all_ngrams.extend(bi_grams)
print all_ngrams
I've a django app which fetches data from mysql db whenever a request is received.This works fine when request is processed by one user but, when more than user send request, I get a error message saying "InterfaceError at url (0, '')".
I'm using Django version 1.9.
As per my research , I included CONN_MAX_AGE in my settings.py but still I got the same error.
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
**'CONN_MAX_AGE': None**
}
}
my models.py
connection = pymysql.connect(host='localhost',user='user',password='password',db='db_name',port=3306,charset='utf8',cursorclass=pymysql.cursors.DictCursor)
def execute(query):
cur = connection.cursor()
cur.execute(query)
data = cur.fetchall()
connection.commit()
cur.close()
return data
def trending_assets():
sql = "select * from media_recommendation_engine.content_table t1 inner join (SELECT movieId,rank from" \
" media_recommendation_engine.asset_ranks limit 10) t2 on t1.movieId = " \
"t2.movieId order by t2.rank asc ;;"
data = execute(sql)
return data
views.py
#permission_classes((permissions.IsAuthenticated,))
class Trending(viewsets.GenericViewSet):
def list(self,request):
if request.query_params['type']=='trending':
result_data = models.trending_assets()
return Response(models.formatter(result_data))
#
else:
return JsonResponse({'message': 'Wrong Argument pass'},status= 400)
You should connect the db each time you need to when a request need to be processed. Earlier I used a connection globally. And do not use db.close().
def execute(query):
connection = pymysql.connect(host='localhost', user='user', password='passsword', db='db_name', port=3306, charset='utf8', cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute(query)
data = cur.fetchall()
connection.commit()
cur.close()
return data
def trending_assets():
sql = "select * from media_recommendation_engine.content_table t1 inner join (SELECT movieId,rank from" \
" media_recommendation_engine.asset_ranks limit 10) t2 on t1.movieId = " \
"t2.movieId order by t2.rank asc ;;"
data = execute(sql)
return data
I'm struggling with the formatting on a mysql query and I was hoping you could point me in the right direction. Here are the queries
sql = "SELECT price FROM inventory WHERE card_name = %s AND card_set = %s"
sql_rare = "SELECT rarity FROM inventory WHERE card_name = %s AND card_set = %s"
sql_count = "SELECT count(*) FROM inventory WHERE card_name = %s AND card_set = %s
When I run the following code, utilizing the sql_count query, i get an error saying:
File "C:\Users\Spencer\Desktop\Python Programs\PythonMTG\Revision3AutoAndManual\51515\magicassistantcsv.py", line 264, in output_card
for row in getmtgprice.query(sql_count, ([card_name, set_name])):
TypeError: query() takes exactly 4 arguments (3 given)
Here is the code producing this error:
getmtgprice = PriceCheck()
for row in getmtgprice.query(sql_count, ([card_name, set_name])):
if row[0] ==0:
priced_card = '0.00'
And here is the PriceCheck function:
class PriceCheck(object):
def __init__(self):
self.conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='mscan')
self.c = self.conn.cursor()
def query(self, arg, cardname, setname):
self.c.execute(arg, cardname, setname)
return self.c
def __del__(self):
self.conn.close()
Do you see where I went wrong?
Your query method takes separate arguments for cardname and setname, not a list containing both. So, instead of:
for row in getmtgprice.query(sql_count, ([card_name, set_name])):
You should have:
for row in getmtgprice.query(sql_count, card_name, set_name):