
Commit ca6200d

Support deleting timer task results automatically (#253)
* Support deleting timer task results automatically
* Add tests for delete_outdated_task_results
* Update tests for delete_outdated_task_results
* Fix tests for delete_outdated_task_results
1 parent 45a440b commit ca6200d

9 files changed: 126 additions, 10 deletions

scrapydweb/default_settings.py

Lines changed: 20 additions & 3 deletions
@@ -117,7 +117,7 @@
 ############################## LogParser ######################################
 # Whether to backup the stats json files locally after you visit the Stats page of a job
 # so that it is still accessible even if the original logfile has been deleted.
-# The default is True, set it to False to disable this behaviour.
+# The default is True, set it to False to disable this behavior.
 BACKUP_STATS_JSON_FILE = True


@@ -127,11 +127,28 @@

 # The default is 300, which means ScrapydWeb would automatically create a snapshot of the Jobs page
 # and save the jobs info in the database in the background every 300 seconds.
-# Note that this behaviour would be paused if the scheduler for timer tasks is disabled.
-# Set it to 0 to disable this behaviour.
+# Note that this behavior would be paused if the scheduler for timer tasks is disabled.
+# Set it to 0 to disable this behavior.
 JOBS_SNAPSHOT_INTERVAL = 300


+# The default is 300, which means ScrapydWeb would automatically check the amount of task results of all timer tasks
+# in the background every 300 seconds to delete some outdated records in the database.
+# This option works only when either KEEP_TASK_RESULT_LIMIT or KEEP_TASK_RESULT_WITHIN_DAYS is not 0.
+# Note that this behavior would be paused if the scheduler for timer tasks is disabled.
+# Set it to 0 to disable this behavior.
+CHECK_TASK_RESULT_INTERVAL = 300
+
+# The default is 1000, which means only the latest 1000 timer task results would not be deleted from the database.
+# See also CHECK_TASK_RESULT_INTERVAL. Set it to 0 to disable this behavior.
+KEEP_TASK_RESULT_LIMIT = 1000
+
+# The default is 31, which means only the timer task results executed within recent 31 days
+# would not be deleted from the database.
+# See also CHECK_TASK_RESULT_INTERVAL. Set it to 0 to disable this behavior.
+KEEP_TASK_RESULT_WITHIN_DAYS = 31
+
+
 ############################## Run Spider #####################################
 # The default is False, set it to True to automatically
 # expand the 'settings & arguments' section in the Run Spider page.
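
Taken together, the three new options define a simple retention rule: a task result that has actually been executed is kept only while it is both among the newest KEEP_TASK_RESULT_LIMIT results and no older than KEEP_TASK_RESULT_WITHIN_DAYS days; setting either option to 0 drops that criterion. A minimal sketch of the rule, using a hypothetical should_keep() helper rather than ScrapydWeb's actual code:

    from datetime import datetime, timedelta

    # Hypothetical helper illustrating the retention rule described by the settings above.
    # rank_newest_first: 0 for the most recent task result, 1 for the next, and so on.
    def should_keep(rank_newest_first, execute_time, limit=1000, within_days=31, now=None):
        now = now or datetime.now()
        if limit and rank_newest_first >= limit:
            return False  # beyond the newest `limit` executed results
        if within_days and execute_time <= now - timedelta(days=within_days):
            return False  # executed too long ago
        return True

    # With the defaults, a result executed 40 days ago is deleted even if it is
    # among the latest 1000, because KEEP_TASK_RESULT_WITHIN_DAYS = 31.
    print(should_keep(5, datetime.now() - timedelta(days=40)))  # False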

scrapydweb/run.py

Lines changed: 10 additions & 3 deletions
@@ -8,6 +8,7 @@
 from flask import request

 # from . import create_app  # --debug: ImportError: cannot import name 'create_app'
+# python -m scrapydweb.run
 from scrapydweb import create_app
 from scrapydweb.__version__ import __description__, __version__
 from scrapydweb.common import authenticate, find_scrapydweb_settings_py, handle_metadata, handle_slash
@@ -137,9 +138,15 @@ def load_custom_settings(config):
             file=SCRAPYDWEB_SETTINGS_PY))
     else:
         sys.exit("\nATTENTION:\nYou may encounter ERROR if there are any running timer tasks added in v1.2.0,\n"
-                 "and you have to restart scrapydweb and manually edit the tasks to resume them.\n"
-                 "\nThe config file '{file}' has been copied to current working directory.\n"
-                 "Please add your SCRAPYD_SERVERS in the config file and restart scrapydweb.\n".format(
+                 "and you have to restart scrapydweb and manually edit the tasks to resume them.\n\n"
+                 "The config file '{file}' has been copied to current working directory.\n"
+                 "Please add your SCRAPYD_SERVERS in the config file and restart scrapydweb.\n\n"
+                 "New options to control the amount of task results of all timer tasks:\n"
+                 "##########\n"
+                 "CHECK_TASK_RESULT_INTERVAL = 300\n"
+                 "KEEP_TASK_RESULT_LIMIT = 1000\n"
+                 "KEEP_TASK_RESULT_WITHIN_DAYS = 31\n"
+                 "##########\n".format(
             file=SCRAPYDWEB_SETTINGS_PY))


scrapydweb/templates/scrapydweb/settings.html

Lines changed: 3 additions & 0 deletions
@@ -95,6 +95,9 @@ <h3>Timer tasks</h3>
 <ul class="collapse">
 <li><div class="title"><h4>scheduler.state: {{ scheduler_state }}</h4></div></li>
 <li><div class="title"><h4>JOBS_SNAPSHOT_INTERVAL = {{ JOBS_SNAPSHOT_INTERVAL }}</h4></div></li>
+<li><div class="title"><h4>CHECK_TASK_RESULT_INTERVAL = {{ CHECK_TASK_RESULT_INTERVAL }}</h4></div></li>
+<li><div class="title"><h4>KEEP_TASK_RESULT_LIMIT = {{ KEEP_TASK_RESULT_LIMIT }}</h4></div></li>
+<li><div class="title"><h4>KEEP_TASK_RESULT_WITHIN_DAYS = {{ KEEP_TASK_RESULT_WITHIN_DAYS }}</h4></div></li>
 </ul>
 </div>

scrapydweb/utils/check_app_config.py

Lines changed: 35 additions & 1 deletion
@@ -307,6 +307,28 @@ def check_assert(key, default, is_instance, allow_zero=True, non_empty=False, co
                                   trigger='interval', seconds=JOBS_SNAPSHOT_INTERVAL,
                                   misfire_grace_time=60, coalesce=True, max_instances=1, jobstore='memory'))

+    check_assert('CHECK_TASK_RESULT_INTERVAL', 300, int)
+    check_assert('KEEP_TASK_RESULT_LIMIT', 1000, int)
+    check_assert('KEEP_TASK_RESULT_WITHIN_DAYS', 31, int)
+    CHECK_TASK_RESULT_INTERVAL = config.get('CHECK_TASK_RESULT_INTERVAL', 300)
+    KEEP_TASK_RESULT_LIMIT = config.get('KEEP_TASK_RESULT_LIMIT', 1000)
+    KEEP_TASK_RESULT_WITHIN_DAYS = config.get('KEEP_TASK_RESULT_WITHIN_DAYS', 31)
+
+    logger.info('CHECK_TASK_RESULT_INTERVAL: %s' % CHECK_TASK_RESULT_INTERVAL)
+    logger.info('KEEP_TASK_RESULT_LIMIT: %s' % KEEP_TASK_RESULT_LIMIT)
+    logger.info('KEEP_TASK_RESULT_WITHIN_DAYS: %s' % KEEP_TASK_RESULT_WITHIN_DAYS)
+    if CHECK_TASK_RESULT_INTERVAL and (KEEP_TASK_RESULT_LIMIT or KEEP_TASK_RESULT_WITHIN_DAYS):
+        username = config.get('USERNAME', '')
+        password = config.get('PASSWORD', '')
+        kwargs = dict(
+            url=config['URL_SCRAPYDWEB'] + handle_metadata().get('url_delete_task_result',
+                                                                 '/1/tasks/xhr/delete/1/2/'),
+            auth=(username, password) if username and password else None,
+        )
+        logger.info(scheduler.add_job(id='delete_task_result', replace_existing=True,
+                                      func=delete_task_result, args=None, kwargs=kwargs,
+                                      trigger='interval', seconds=CHECK_TASK_RESULT_INTERVAL,
+                                      misfire_grace_time=60, coalesce=True, max_instances=1, jobstore='memory'))
     # Subprocess
     init_subprocess(config)

@@ -323,6 +345,17 @@ def create_jobs_snapshot(url_jobs, auth, nodes):
     # print(url_jobs, r.status_code)


+def delete_task_result(url, auth):
+    url = re.sub(r'(\d+/)+$', '', url)
+    try:
+        r = session.post(url, auth=auth, timeout=60)
+        assert r.status_code == 200, "Request got status_code: %s" % r.status_code
+    except Exception as err:
+        print("Fail to delete task result: %s\n%s" % (url, err))
+    # else:
+    # print('delete_task_result', url, r.status_code, r.json())
+
+
 def check_scrapyd_servers(config):
     SCRAPYD_SERVERS = config.get('SCRAPYD_SERVERS', []) or ['127.0.0.1:6800']
     SCRAPYD_SERVERS_PUBLIC_URLS = config.get('SCRAPYD_SERVERS_PUBLIC_URLS', None) or [''] * len(SCRAPYD_SERVERS)
@@ -368,11 +401,12 @@ def check_connectivity(server):
         try:
             url = 'http://%s:%s' % (_ip, _port)
             r = session.get(url, auth=_auth, timeout=10)
-            assert r.status_code == 200, "%s got status_code %s" % (url, r.status_code)
+            assert r.status_code == 200, "%s with auth %s got status_code %s" % (url, _auth, r.status_code)
         except Exception as err:
            logger.error(err)
            return False
         else:
+            logger.debug("%s with auth %s got status_code %s" % (url, _auth, r.status_code))
            return True

     # with ThreadPool(min(len(servers), 100)) as pool:  # Works in python 3.3 and up
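
The cleanup itself is just another interval job on ScrapydWeb's APScheduler instance: every CHECK_TASK_RESULT_INTERVAL seconds it POSTs to the internal tasks endpoint (with the trailing task/result ids stripped by the re.sub above), and the view then performs the actual deletions. The same pattern can be reproduced in isolation; the snippet below is a sketch only, assuming a plain BackgroundScheduler, the requests library, and a placeholder ScrapydWeb URL with auth disabled:

    # Sketch of the scheduling pattern, not ScrapydWeb's own code: the real job is
    # registered on ScrapydWeb's scheduler with jobstore='memory' and its session object.
    from apscheduler.schedulers.background import BackgroundScheduler
    import requests

    def delete_task_result(url, auth=None):
        try:
            r = requests.post(url, auth=auth, timeout=60)
            r.raise_for_status()
        except Exception as err:
            print("Fail to delete task result: %s\n%s" % (url, err))

    scheduler = BackgroundScheduler()
    scheduler.add_job(delete_task_result, trigger='interval', seconds=300,
                      id='delete_task_result', replace_existing=True,
                      coalesce=True, max_instances=1, misfire_grace_time=60,
                      kwargs=dict(url='http://127.0.0.1:5000/1/tasks/xhr/delete/'))  # assumed URL
    scheduler.start()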

scrapydweb/vars.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@

 PYTHON_VERSION = '.'.join([str(n) for n in sys.version_info[:3]])
 PY2 = sys.version_info.major < 3
-SCRAPYDWEB_SETTINGS_PY = 'scrapydweb_settings_v10.py'
+SCRAPYDWEB_SETTINGS_PY = 'scrapydweb_settings_v11.py'
 sys.path.append(os.getcwd())
 try:
     custom_settings_module = importlib.import_module(os.path.splitext(SCRAPYDWEB_SETTINGS_PY)[0])

scrapydweb/views/baseview.py

Lines changed: 3 additions & 0 deletions
@@ -111,6 +111,9 @@ def __init__(self, *args, **kwargs):
         # Timer Tasks
         self.scheduler = scheduler
         self.JOBS_SNAPSHOT_INTERVAL = app.config.get('JOBS_SNAPSHOT_INTERVAL', 300)
+        self.CHECK_TASK_RESULT_INTERVAL = app.config.get('CHECK_TASK_RESULT_INTERVAL', 300)
+        self.KEEP_TASK_RESULT_LIMIT = app.config.get('KEEP_TASK_RESULT_LIMIT', 1000)
+        self.KEEP_TASK_RESULT_WITHIN_DAYS = app.config.get('KEEP_TASK_RESULT_WITHIN_DAYS', 31)

         # Run Spider
         self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = app.config.get('SCHEDULE_EXPAND_SETTINGS_ARGUMENTS', False)

scrapydweb/views/overview/tasks.py

Lines changed: 41 additions & 2 deletions
@@ -1,10 +1,11 @@
 # coding: utf-8
-from datetime import datetime
+from datetime import datetime, timedelta
 import json
 import logging
 import traceback

 from flask import Blueprint, flash, render_template, request, send_file, url_for
+from sqlalchemy import and_

 from ...common import handle_metadata
 from ...models import Task, TaskResult, TaskJobResult, db
@@ -272,6 +273,7 @@ def __init__(self):
         self.task = Task.query.get(self.task_id) if self.task_id else None
         self.apscheduler_job = self.scheduler.get_job(str(self.task_id)) if self.task_id else None  # Return type: Job|None
         self.js = dict(action=self.action, task_id=self.task_id, task_result_id=self.task_result_id, url=request.url)
+        # self.logger.warning(self.js)

     def dispatch_request(self, **kwargs):
         try:
@@ -293,8 +295,10 @@ def generate_response(self):
         elif self.action == 'delete':  # delete a task_result|task
             if self.task_result_id:
                 self.delete_task_result()
-            else:
+            elif self.task_id:
                 self.delete_task()
+            else:
+                self.delete_outdated_task_results()
         elif self.action == 'dump':  # For test only
             self.dump_task_data()
         elif self.action == 'fire':  # update next_run_time
@@ -425,3 +429,38 @@ def list_tasks_or_results(self):
         else:
             records = Task.query.all()
         self.js['ids'] = [i.id for i in records]
+
+    def delete_outdated_task_results(self):
+        # The condition equals to: pass_count != 0 or fail_count != 0
+        condition = ~and_(TaskResult.pass_count == 0, TaskResult.fail_count == 0)
+
+        if self.KEEP_TASK_RESULT_LIMIT:
+            count_before = TaskResult.query.count()
+            task_results = TaskResult.query.filter(condition).order_by(
+                TaskResult.execute_time.desc()).offset(self.KEEP_TASK_RESULT_LIMIT).all()
+            for task_result in task_results:
+                self.logger.debug("delete TaskResult: %s" % task_result)
+                db.session.delete(task_result)
+            db.session.commit()
+            count_after = TaskResult.query.count()
+            self.logger.info("KEEP_TASK_RESULT_LIMIT: %s, total TaskResult: from %s to %s" % (
+                self.KEEP_TASK_RESULT_LIMIT, count_before, count_after))
+            self.js.update(amount_limit=dict(KEEP_TASK_RESULT_LIMIT=self.KEEP_TASK_RESULT_LIMIT,
+                                             count_before=count_before, count_after=count_after))
+
+        if self.KEEP_TASK_RESULT_WITHIN_DAYS:
+            count_before = TaskResult.query.count()
+            # timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)
+            n_days_ago = datetime.now() - timedelta(days=self.KEEP_TASK_RESULT_WITHIN_DAYS)
+
+            task_results = TaskResult.query.filter(TaskResult.execute_time <= n_days_ago, condition).all()
+            for task_result in task_results:
+                self.logger.debug("delete TaskResult: %s" % task_result)
+                db.session.delete(task_result)
+            db.session.commit()
+
+            count_after = TaskResult.query.count()
+            self.logger.info("KEEP_TASK_RESULT_WITHIN_DAYS: %s, total TaskResult: from %s to %s" % (
+                self.KEEP_TASK_RESULT_WITHIN_DAYS, count_before, count_after))
+            self.js.update(day_limit=dict(KEEP_TASK_RESULT_WITHIN_DAYS=self.KEEP_TASK_RESULT_WITHIN_DAYS,
+                                          count_before=count_before, count_after=count_after))
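
In short, delete_outdated_task_results never touches results whose pass_count and fail_count are both 0, first trims everything beyond the newest KEEP_TASK_RESULT_LIMIT executed results, and then removes executed results older than KEEP_TASK_RESULT_WITHIN_DAYS days. The offset/cutoff pattern can be tried in isolation; below is a self-contained sketch against an in-memory SQLite database with a throwaway TaskResult model (not ScrapydWeb's real model, and assuming SQLAlchemy 1.4+), using a limit of 20 and a window of 31 days so both passes are visible:

    from datetime import datetime, timedelta

    from sqlalchemy import Column, DateTime, Integer, and_, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class TaskResult(Base):  # throwaway stand-in for ScrapydWeb's model
        __tablename__ = 'task_result'
        id = Column(Integer, primary_key=True)
        execute_time = Column(DateTime)
        pass_count = Column(Integer, default=0)
        fail_count = Column(Integer, default=0)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        now = datetime.now()
        # 60 executed results, one per day going back 59 days
        session.add_all([TaskResult(execute_time=now - timedelta(days=d), pass_count=1)
                         for d in range(60)])
        session.commit()

        # Only results that have actually run; pending ones are never deleted.
        executed = ~and_(TaskResult.pass_count == 0, TaskResult.fail_count == 0)

        # Pass 1: everything beyond the newest 20 executed results (KEEP_TASK_RESULT_LIMIT).
        beyond_limit = session.query(TaskResult).filter(executed).order_by(
            TaskResult.execute_time.desc()).offset(20).all()
        # Pass 2: executed results older than 31 days (KEEP_TASK_RESULT_WITHIN_DAYS).
        too_old = session.query(TaskResult).filter(
            TaskResult.execute_time <= now - timedelta(days=31), executed).all()

        for task_result in set(beyond_limit) | set(too_old):
            session.delete(task_result)
        session.commit()
        print(session.query(TaskResult).count())  # 20 results survive both passes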

scrapydweb/views/system/settings.py

Lines changed: 3 additions & 0 deletions
@@ -90,6 +90,9 @@ def update_kwargs(self):
         # Timer Tasks
         self.kwargs['scheduler_state'] = SCHEDULER_STATE_DICT[self.scheduler.state]
         self.kwargs['JOBS_SNAPSHOT_INTERVAL'] = self.JOBS_SNAPSHOT_INTERVAL
+        self.kwargs['CHECK_TASK_RESULT_INTERVAL'] = self.CHECK_TASK_RESULT_INTERVAL
+        self.kwargs['KEEP_TASK_RESULT_LIMIT'] = self.KEEP_TASK_RESULT_LIMIT
+        self.kwargs['KEEP_TASK_RESULT_WITHIN_DAYS'] = self.KEEP_TASK_RESULT_WITHIN_DAYS

         # Run Spider
         self.kwargs['run_spider_details'] = self.json_dumps(dict(

tests/test_tasks_single_scrapyd.py

Lines changed: 10 additions & 0 deletions
@@ -8,6 +8,7 @@
 from six.moves.urllib.parse import unquote_plus
 from tzlocal import get_localzone

+from scrapydweb.utils.check_app_config import check_app_config
 from tests.utils import cst, req_single_scrapyd, sleep, upload_file_deploy


@@ -867,3 +868,12 @@ def test_history(app, client):
         "assert js['status_code'] == 200 and js['status'] == 'ok'",
         "Task #%s deleted" % task_id,
     ])
+
+def test_check_task_result_interval(app, client):
+    app.config['ENABLE_MONITOR'] = False
+    app.config['CHECK_TASK_RESULT_INTERVAL'] = 5
+    app.config['SCRAPYD_SERVERS'] = app.config['_SCRAPYD_SERVERS']
+    check_app_config(app.config)
+    sleep(8)
+    __, js = req_single_scrapyd(app, client, view='tasks.xhr', kws=dict(node=NODE, action='delete'))
+    print("test_check_task_result_interval: %s" % js)
