I'm testing the execution of a crawler inside the Airflow structure. If I run the following script, everything works fine and the payload is printed.
from airflow import DAG
from airflow.models import BaseOperator, TaskInstance
from hooks.crawler_hook import CrawlerHook
from datetime import datetime
import time

class CrawlerOperator(BaseOperator):

    def __init__(self, conn_id=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.conn_id = conn_id

    def execute(self):
        hook = CrawlerHook(conn_id=self.conn_id)
        print(hook.run())

if __name__ == "__main__":
    CrawlerOperator(task_id='test_run').execute()
But when I try to run a TaskInstance inside a DAG, I get an error and cannot understand why:
if __name__ == "__main__":
    with DAG(dag_id="DAG1", start_date=datetime.now(), catchup=False) as dag:
        to = CrawlerOperator(task_id="test_run")
        ti = TaskInstance(task=to)
        ti.run()
The error:
Traceback (most recent call last):
File "/home/../.env/lib/python3.8/site-packages/airflow/utils/session.py", line 67, in wrapper
return func(*args, **kwargs)
File "/home/../.env/lib/python3.8/site-packages/airflow/models/taskinstance.py", line 1123, in get_dagrun
dr = session.query(DagRun).filter(DagRun.dag_id == self.dag_id, DagRun.run_id == self.run_id).one()
File "/home/../.env/lib/python3.8/site-packages/sqlalchemy/orm/query.py", line 3500, in one
raise orm_exc.NoResultFound("No row was found for one()")
sqlalchemy.orm.exc.NoResultFound: No row was found for one()
Any suggestions?
I assume you are using it in some kind of unit test. What you are missing (as the error indicates) is a DagRun:

from airflow.models import DagRun
from airflow.utils import timezone

dag_run = DagRun(dag_id=self.dag.dag_id, execution_date=timezone.utcnow(), run_id="test")
ti.dag_run = dag_run

This is needed because tasks are associated with a DagRun, not with a DAG; a DAG can have many DagRuns.
You can see an example in one of the unit tests in the Airflow codebase.
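Putting the pieces together against the question's script, a minimal sketch could look like this (a sketch under assumptions, not a drop-in fix: it assumes Airflow 2.x, where a TaskInstance carries a dag_run attribute, and ti.run() still needs a working metadata database behind it; ignore_ti_state=True simply re-runs the task regardless of any previous state):

if __name__ == "__main__":
    from airflow.models import DagRun
    from airflow.utils import timezone

    with DAG(dag_id="DAG1", start_date=datetime.now(), catchup=False) as dag:
        to = CrawlerOperator(task_id="test_run")

    # Build the DagRun that TaskInstance.get_dagrun() failed to find.
    dag_run = DagRun(dag_id=dag.dag_id, execution_date=timezone.utcnow(), run_id="test")
    ti = TaskInstance(task=to)
    ti.dag_run = dag_run
    ti.run(ignore_ti_state=True)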
I am trying to fetch the list of SQL queries running for more than 3600 seconds and kill those IDs using Python. Below is the code:
import json
import mysql.connector
import pymysql

def main():
    # TODO implement
    connection = pymysql.connect(user='', password='',
                                 host='',
                                 port=3306,
                                 database='')
    cursor = connection.cursor()  # get the cursor
    # cursor.execute('SHOW PROCESSLIST;')
    # extracted_data = cursor.fetchall()
    # for i in extracted_data:
    #     print(i)
    with connection.cursor() as cursor:
        print(cursor.execute('SHOW PROCESSLIST'))
        for item in cursor.fetchall():
            if item.get('Time') > 3600 and item.get('command') == 'query':
                _id = item.get('Id')
                print('kill %s' % item)
                cursor.execute('kill %s', _id)
    connection.close()

main()
Below is the error I am getting:
"C:\drive c\pyfile\venv\Scripts\python.exe" "C:/drive c/pyfile/sqlnew2.py"
Traceback (most recent call last):
File "C:\drive c\pyfile\sqlnew2.py", line 23, in <module>
main()
File "C:\drive c\pyfile\sqlnew2.py", line 18, in main
if item.get('Time') > 3600 and item.get('command') == 'query':
AttributeError: 'tuple' object has no attribute 'get'
The .fetchall() method returns the rows as tuples, not dictionaries, so you should access the elements using numerical indexes, for example item[0], item[1], etc.
As an alternative, if you want to fetch the results as dictionaries, you can use a DictCursor.
First import it:
import pymysql.cursors
Then modify the cursor line like this:
with connection.cursor(pymysql.cursors.DictCursor) as cursor:
...
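For illustration, here is a minimal sketch of the whole loop with a DictCursor (connection details are placeholders; note that the SHOW PROCESSLIST column names are capitalized, so the key is 'Command' and its value is 'Query', not 'command'/'query' as in the question's code):

import pymysql
import pymysql.cursors

connection = pymysql.connect(user='user', password='secret',
                             host='localhost', port=3306, database='mydb')
try:
    with connection.cursor(pymysql.cursors.DictCursor) as cursor:
        cursor.execute('SHOW PROCESSLIST')
        for row in cursor.fetchall():
            # Each row is now a dict keyed by the processlist column names.
            if row['Time'] > 3600 and row['Command'] == 'Query':
                print('killing connection %s' % row['Id'])
                cursor.execute('KILL %s', (row['Id'],))
finally:
    connection.close()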
I'm following ChristopherGS's tutorial for FastAPI, but I'm stuck on part 6 because I believe his syntax may already be deprecated.
I get AttributeError: module 'jinja2' has no attribute 'contextfunction' when the program stops. How do I solve this? I've been stuck here for 3 days.
This is my code:
from fastapi import FastAPI, APIRouter, Request, HTTPException
from fastapi.templating import Jinja2Templates
from typing import Optional, Any
from pathlib import Path

from app.schemas import RecipeSearchResults, Recipe, RecipeCreate
from app.recipe_data import RECIPES

BASE_PATH = Path(__file__).resolve().parent
TEMPLATES = Jinja2Templates(directory=str(BASE_PATH / "templates"))

app = FastAPI(title="Recipe API", openapi_url="/openapi.json")
api_router = APIRouter()

# Updated to serve a Jinja2 template
# https://www.starlette.io/templates/
# https://jinja.palletsprojects.com/en/3.0.x/templates/#synopsis
@api_router.get("/", status_code=200)
def root(request: Request) -> dict:
    """
    Root GET
    """
    return TEMPLATES.TemplateResponse(
        "index.html",
        {"request": request, "recipes": RECIPES},
    )

@api_router.get("/recipe/{recipe_id}", status_code=200, response_model=Recipe)
def fetch_recipe(*, recipe_id: int) -> Any:
    """
    Fetch a single recipe by ID
    """
    result = [recipe for recipe in RECIPES if recipe["id"] == recipe_id]
    if not result:
        # the exception is raised, not returned - you will get a validation
        # error otherwise.
        raise HTTPException(
            status_code=404, detail=f"Recipe with ID {recipe_id} not found"
        )
    return result[0]

if __name__ == "__main__":
    # Use this for debugging purposes only
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8001, log_level="debug")
It could be due to a version mismatch between Jinja2 and Starlette (FastAPI).
I faced a similar issue with the latest FastAPI Docker image (python3.9). It was resolved by installing an older version of Jinja2.
Try downgrading jinja2 if you are using jinja2 > 3.0.3:

pip install jinja2==3.0.3

The other option would be to upgrade fastapi/starlette.
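If you take the upgrade route instead, something along these lines should pull in a Starlette release compatible with Jinja2 3.1 (a sketch; pin the versions your project actually needs):

pip install --upgrade fastapi jinja2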
Ref:
FastAPI Jinja2Templates - Error while running initialising templates?
https://github.com/pallets/jinja/blob/1b714c7e82c73575d1dba48f560db07fe9a5cb74/CHANGES.rst#version-310
I'm getting an error when running a nested function in my Python interpreter.
import MySQLdb
import serial
import time
import smtplib

ser = serial.Serial('/dev/ttyACM1', 9600)
db = MySQLdb.connect("localhost", "root", "pass", "db")
cursor = db.cursor()

while 1:
    print("Waiting ;;...")
    print("")
    print("collecting")
    print("")
    time.sleep(3)
    x = ser.readline()
    time.sleep(3)
    if x > 700:
        send()
        print("send mail")
    print("inserting into Database")
    sql = "INSERT INTO vidit2(temp) VALUES(%s);" % (x)
    cursor.execute(sql)
    db.commit()
    time.sleep(3)

def send():
    content = "send"
    mail = smtplib.SMTP("smtp.gmail.com", 587)
    mail.ehlo()
    mail.starttls()
    mail.login("emailid", "pass")
    mail.sendmail("sender", "reciever", content)
    mail.close()
Error:
python temp.py
Waiting ;;...
collecting
Traceback (most recent call last):
File "temp.py", line 24, in
send()
NameError: name 'send' is not defined
Please help.
Thanks in advance.
Unlike, say, JavaScript, which "hoists" function definitions during compilation so that they can be called before they are defined in your code (I just learned about this the other day, so forgive me if that's an oversimplification), in Python you need to define functions before you call them (interesting discussion here). This means you need to do:
def send():
...
before:
send()
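Applied to the script in the question, that means moving the def send(): block above the while loop (a sketch using the question's placeholder credentials; note also that ser.readline() returns a string, so you probably want to convert it before the numeric comparison):

import MySQLdb
import serial
import time
import smtplib

def send():
    content = "send"
    mail = smtplib.SMTP("smtp.gmail.com", 587)
    mail.ehlo()
    mail.starttls()
    mail.login("emailid", "pass")
    mail.sendmail("sender", "reciever", content)
    mail.close()

ser = serial.Serial('/dev/ttyACM1', 9600)
db = MySQLdb.connect("localhost", "root", "pass", "db")
cursor = db.cursor()

while 1:
    time.sleep(3)
    x = ser.readline()
    if int(x) > 700:  # convert the serial reading before comparing to a number
        send()
        print("send mail")
    cursor.execute("INSERT INTO vidit2(temp) VALUES(%s);" % (x,))
    db.commit()
    time.sleep(3)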
I've faced the following issue and I'm not entirely sure whether it's a Django or a MySQL issue.
It happens when I retry Django's save method in case of a database error such as a deadlock. Let's not focus on how ugly it is; it was just a temporary hack, which revealed a potentially different problem.
To reproduce it I've prepared a script that runs 3 concurrent processes and simulates a database failure by raising AssertionError.
models.py
from django.db import models, transaction
from time import sleep
from django.db.utils import OperationalError

class ModelA(models.Model):
    name = models.CharField(max_length=255)

    def __unicode__(self):
        return self.name

    def save(self, *args, **kwargs):
        def save_record(attempt):
            print attempt
            try:
                with transaction.atomic():
                    super(ModelA, self).save(*args, **kwargs)
                    if attempt > 1:
                        assert False
            except (AssertionError, OperationalError):
                # dirty hack to retry
                sleep(1)
                if attempt > 0:
                    save_record(attempt - 1)
                else:
                    raise
        save_record(5)
test script
import time
from multiprocessing import Process

from django.core.management.base import BaseCommand
from django.db import transaction

from atomic import models

@transaction.atomic
def create_record():
    a = models.ModelA()
    a.name = "test {}".format(time.time())
    a.save()

class Command(BaseCommand):

    def handle(self, *args, **options):
        procs = []
        for i in range(3):
            p = Process(target=create_record)
            procs.append(p)
        for p in procs:
            p.start()
        for p in procs:
            p.join()
If I run only one process everything works, but with three concurrent processes one works (saves data) and the other two fail with the following traceback:
Traceback (most recent call last):
File "/usr/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
self._target(*self._args, **self._kwargs)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/transaction.py", line 371, in inner
return func(*args, **kwargs)
File "/media/django/atomictest/atomic/management/commands/test_atomic.py", line 14, in create_record
a.save()
File "/media/django/atomictest/atomic/models.py", line 29, in save
save_record(5)
File "/media/django/atomictest/atomic/models.py", line 25, in save_record
save_record(attempt-1)
File "/media/django/atomictest/atomic/models.py", line 25, in save_record
save_record(attempt-1)
File "/media/django/atomictest/atomic/models.py", line 25, in save_record
save_record(attempt-1)
File "/media/django/atomictest/atomic/models.py", line 18, in save_record
super(ModelA, self).save(*args, **kwargs)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/base.py", line 545, in save
force_update=force_update, update_fields=update_fields)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/base.py", line 573, in save_base
updated = self._save_table(raw, cls, force_insert, force_update, using, update_fields)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/base.py", line 635, in _save_table
forced_update)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/base.py", line 679, in _do_update
return filtered._update(values) > 0
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/query.py", line 510, in _update
return query.get_compiler(self.db).execute_sql(None)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/sql/compiler.py", line 980, in execute_sql
cursor = super(SQLUpdateCompiler, self).execute_sql(result_type)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/models/sql/compiler.py", line 786, in execute_sql
cursor.execute(sql, params)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/backends/util.py", line 69, in execute
return super(CursorDebugWrapper, self).execute(sql, params)
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/backends/util.py", line 47, in execute
self.db.validate_no_broken_transaction()
File "/home/blue/.virtualenvs/atomictest/local/lib/python2.7/site-packages/django/db/backends/__init__.py", line 372, in validate_no_broken_transaction
"An error occurred in the current transaction. You can't "
TransactionManagementError: An error occurred in the current transaction. You can't execute queries until the end of the 'atomic' block.
I use Django 1.6.5 and MySQL 5.5.37. I also tested with SQLite and PostgreSQL, and there's no such issue with those backends.
I've also noticed that this happens only with nested atomic blocks. If I remove the @transaction.atomic decorator from the create_record() function, it works again.
You should avoid catching exceptions inside atomic; from the Django docs:
When exiting an atomic block, Django looks at whether it’s exited
normally or with an exception to determine whether to commit or roll
back. If you catch and handle exceptions inside an atomic block, you
may hide from Django the fact that a problem has happened. This can
result in unexpected behavior.
This is mostly a concern for DatabaseError and its subclasses such as
IntegrityError. After such an error, the transaction is broken and
Django will perform a rollback at the end of the atomic block. If you
attempt to run database queries before the rollback happens, Django
will raise a TransactionManagementError. You may also encounter this
behavior when an ORM-related signal handler raises an exception.
The correct way to catch database errors is around an atomic block as
shown above. If necessary, add an extra atomic block for this purpose.
This pattern has another advantage: it delimits explicitly which
operations will be rolled back if an exception occurs.
If you catch exceptions raised by raw SQL queries, Django’s behavior
is unspecified and database-dependent.
In this case you're catching DatabaseErrors inside create_record's atomic block.
You should move your save retry logic into the create_record function:
import time

from django.db import transaction, DatabaseError

def create_record(retry=5):
    instance = models.ModelA(name="test {}".format(time.time()))
    try:
        with transaction.atomic():
            instance.save()
    except DatabaseError:
        if retry:
            time.sleep(1)
            create_record(retry - 1)
        else:
            raise
And avoid overriding models.Model.save.
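If you do want to keep an outer @transaction.atomic around the whole operation, the docs' advice to "add an extra atomic block for this purpose" translates to wrapping each save attempt in an inner atomic block, which rolls back only to a savepoint on failure. A sketch of that pattern (names taken from the question):

import time

from django.db import transaction, DatabaseError

@transaction.atomic
def create_record(retry=5):
    instance = models.ModelA(name="test {}".format(time.time()))
    for attempt in range(retry + 1):
        try:
            # The inner atomic block creates a savepoint, so a failed
            # attempt rolls back to the savepoint instead of marking
            # the outer transaction as broken.
            with transaction.atomic():
                instance.save()
            return
        except DatabaseError:
            if attempt == retry:
                raise
            time.sleep(1)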
I have scheduled a few recurring tasks with Celery beat for our web app.
The app itself is built with the Pyramid web framework, using the ZopeTransactionExtension to manage sessions.
In Celery, I am using the app as a library, and I am redefining the session in models with a function.
It works well, but once in a while it raises InvalidRequestError: This session is in 'prepared' state; no further SQL can be emitted within this transaction.
I am not sure what is wrong and why it raises these errors.
Sample code:
In tasks.py:
def initialize_async_session():
    import sqlalchemy
    from webapp.models import Base, set_dbsession, engine

    Session = sqlalchemy.orm.scoped_session(
        sqlalchemy.orm.sessionmaker(autocommit=True, autoflush=True)
    )
    Session.configure(bind=engine)
    session = Session()
    set_dbsession(session)
    Base.metadata.bind = engine
    return session

@celery.task
def rerun_scheduler():
    log.info("Starting pipeline scheduler")
    session = initialize_async_session()
    webapp.sheduledtask.service.check_for_updates(session)
    log.info("Ending pipeline scheduler")
In models.py in webapp
DBSession = scoped_session(sessionmaker(bind=engine, expire_on_commit=False,
                                        extension=ZopeTransactionExtension()))

def set_dbsession(db_session=None):
    """
    This function sets the db session
    """
    global DBSession
    if db_session:
        DBSession = db_session
        log.info("session changed to {0}".format(db_session))
UPDATE:
traceback:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/edgem_common-0.0-py2.7.egg/common/utils.py", line 54, in new_function
result = f(*args, **kwargs)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/edgem_common-0.0-py2.7.egg/common/utils.py", line 100, in new_function
result = f(*args, **kwargs)
File "/home/ubuntu/modwsgi/env/mvc-service/webapp/webapp/data/mongo_service.py", line 1274, in run
self.table_params.set_task_status_as_finished()
File "/home/ubuntu/modwsgi/env/mvc-service/webapp/webapp/mem_objects.py", line 33, in set_task_status_as_finished
task = Task.get_by_id(self.task_id)
File "/home/ubuntu/modwsgi/env/mvc-service/webapp/webapp/models.py", line 162, in get_by_id
return DBSession.query(cls).filter(cls.id == obj_id).first()
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2156, in first
ret = list(self[0:1])
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2023, in __getitem__
return list(res)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2227, in __iter__
return self._execute_and_instances(context)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2240, in _execute_and_instances
close_with_result=True)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2231, in _connection_from_session
**kw)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 777, in connection
close_with_result=close_with_result)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 781, in _connection_for_bind
return self.transaction._connection_for_bind(engine)
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 289, in _connection_for_bind
self._assert_is_active()
File "/home/ubuntu/modwsgi/env/local/lib/python2.7/site-packages/SQLAlchemy-0.7.9-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 217, in _assert_is_active
"This Session's transaction has been rolled back "
InvalidRequestError: This Session's transaction has been rolled back by a nested rollback() call. To begin a new transaction, issue Session.rollback() first.
#########################################################################
[2013-05-30 14:32:57,782: WARNING/PoolWorker-3] Exception in thread Thread-4:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 552, in __bootstrap_inner
self.run()
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/edgem_common-0.0-py2.7.egg/common/utils.py", line 54, in new_function
result = f(*args, **kwargs)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/edgem_common-0.0-py2.7.egg/common/utils.py", line 100, in new_function
result = f(*args, **kwargs)
File "/home/ranjith/wksp/mvc-service/webapp/webapp/data/mongo_service.py", line 1274, in run
self.table_params.set_task_status_as_finished()
File "/home/ranjith/wksp/mvc-service/webapp/webapp/mem_objects.py", line 33, in set_task_status_as_finished
task = Task.get_by_id(self.task_id)
File "/home/ranjith/wksp/mvc-service/webapp/webapp/models.py", line 166, in get_by_id
return DBSession.query(cls).filter(cls.id == obj_id).first()
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2145, in first
ret = list(self[0:1])
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2012, in __getitem__
return list(res)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2216, in __iter__
return self._execute_and_instances(context)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2229, in _execute_and_instances
close_with_result=True)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/query.py", line 2220, in _connection_from_session
**kw)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 798, in connection
close_with_result=close_with_result)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 802, in _connection_for_bind
return self.transaction._connection_for_bind(engine)
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 281, in _connection_for_bind
self._assert_active()
File "/home/ranjith/wksp/env/local/lib/python2.7/site-packages/SQLAlchemy-0.8.1-py2.7-linux-x86_64.egg/sqlalchemy/orm/session.py", line 181, in _assert_active
"This session is in 'prepared' state; no further "
InvalidRequestError: This session is in 'prepared' state; no further SQL can be emitted within this transaction.
I believe the problem is that you are attempting to reuse your web application's SQLAlchemy session in your Celery task.
The first thing I recommend doing is creating two separate scoped sessions, one for your Celery application and another for your web application. Next, make sure your Celery database session is only configured once, during Celery initialization. You can use the Celery worker_init.connect signal to make sure it creates the database session during Celery startup (http://hynek.me/articles/using-celery-with-pyramid/).
It is very important that your web application does not use the same database session as your Celery application.
Something like this for your tasks.py file:
from celery import Celery
from celery.signals import worker_init
from sqlalchemy import create_engine
from sqlalchemy.orm import scoped_session, sessionmaker

Session = scoped_session(
    sessionmaker(autocommit=True, autoflush=True))

@worker_init.connect
def initialize_session(**kwargs):  # signal handlers receive keyword arguments
    some_engine = create_engine('database_url')
    Session.configure(bind=some_engine)

@celery.task
def rerun_scheduler():
    log.info("Starting pipeline scheduler")
    webapp.sheduledtask.service.check_for_updates(Session)
    log.info("Ending pipeline scheduler")
Cross-posting my answer to a very similar Stack Overflow question:
What's the proper way to use SQLAlchemy Sessions with Celery?
This solved the issue for me:
SQLAlchemy pools connections by default, and pooled connections cannot safely be shared across forked processes; Celery forks worker processes by default. One or the other needs to be changed.
Turn off SQLAlchemy pooling (see the SQLAlchemy docs):
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool

engine = create_engine(
    SQLALCHEMY_DATABASE_URL, poolclass=NullPool
)
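The other direction is to stop Celery from forking instead, e.g. by running the worker with a non-forking execution pool (a sketch: -A tasks assumes your Celery app lives in tasks.py, and the available pools depend on your Celery version):

celery -A tasks worker --pool=solo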