@@ -307,6 +307,28 @@ def check_assert(key, default, is_instance, allow_zero=True, non_empty=False, co
307307 trigger = 'interval' , seconds = JOBS_SNAPSHOT_INTERVAL ,
308308 misfire_grace_time = 60 , coalesce = True , max_instances = 1 , jobstore = 'memory' ))
309309
310+ check_assert ('CHECK_TASK_RESULT_INTERVAL' , 300 , int )
311+ check_assert ('KEEP_TASK_RESULT_LIMIT' , 1000 , int )
312+ check_assert ('KEEP_TASK_RESULT_WITHIN_DAYS' , 31 , int )
313+ CHECK_TASK_RESULT_INTERVAL = config .get ('CHECK_TASK_RESULT_INTERVAL' , 300 )
314+ KEEP_TASK_RESULT_LIMIT = config .get ('KEEP_TASK_RESULT_LIMIT' , 1000 )
315+ KEEP_TASK_RESULT_WITHIN_DAYS = config .get ('KEEP_TASK_RESULT_WITHIN_DAYS' , 31 )
316+
317+ logger .info ('CHECK_TASK_RESULT_INTERVAL: %s' % CHECK_TASK_RESULT_INTERVAL )
318+ logger .info ('KEEP_TASK_RESULT_LIMIT: %s' % KEEP_TASK_RESULT_LIMIT )
319+ logger .info ('KEEP_TASK_RESULT_WITHIN_DAYS: %s' % KEEP_TASK_RESULT_WITHIN_DAYS )
320+ if CHECK_TASK_RESULT_INTERVAL and (KEEP_TASK_RESULT_LIMIT or KEEP_TASK_RESULT_WITHIN_DAYS ):
321+ username = config .get ('USERNAME' , '' )
322+ password = config .get ('PASSWORD' , '' )
323+ kwargs = dict (
324+ url = config ['URL_SCRAPYDWEB' ] + handle_metadata ().get ('url_delete_task_result' ,
325+ '/1/tasks/xhr/delete/1/2/' ),
326+ auth = (username , password ) if username and password else None ,
327+ )
328+ logger .info (scheduler .add_job (id = 'delete_task_result' , replace_existing = True ,
329+ func = delete_task_result , args = None , kwargs = kwargs ,
330+ trigger = 'interval' , seconds = CHECK_TASK_RESULT_INTERVAL ,
331+ misfire_grace_time = 60 , coalesce = True , max_instances = 1 , jobstore = 'memory' ))
310332 # Subprocess
311333 init_subprocess (config )
312334
@@ -323,6 +345,17 @@ def create_jobs_snapshot(url_jobs, auth, nodes):
323345 # print(url_jobs, r.status_code)
324346
325347
348+ def delete_task_result (url , auth ):
349+ url = re .sub (r'(\d+/)+$' , '' , url )
350+ try :
351+ r = session .post (url , auth = auth , timeout = 60 )
352+ assert r .status_code == 200 , "Request got status_code: %s" % r .status_code
353+ except Exception as err :
354+ print ("Fail to delete task result: %s\n %s" % (url , err ))
355+ # else:
356+ # print('delete_task_result', url, r.status_code, r.json())
357+
358+
326359def check_scrapyd_servers (config ):
327360 SCRAPYD_SERVERS = config .get ('SCRAPYD_SERVERS' , []) or ['127.0.0.1:6800' ]
328361 SCRAPYD_SERVERS_PUBLIC_URLS = config .get ('SCRAPYD_SERVERS_PUBLIC_URLS' , None ) or ['' ] * len (SCRAPYD_SERVERS )
@@ -368,11 +401,12 @@ def check_connectivity(server):
368401 try :
369402 url = 'http://%s:%s' % (_ip , _port )
370403 r = session .get (url , auth = _auth , timeout = 10 )
371- assert r .status_code == 200 , "%s got status_code %s" % (url , r .status_code )
404+ assert r .status_code == 200 , "%s with auth %s got status_code %s" % (url , _auth , r .status_code )
372405 except Exception as err :
373406 logger .error (err )
374407 return False
375408 else :
409+ logger .debug ("%s with auth %s got status_code %s" % (url , _auth , r .status_code ))
376410 return True
377411
378412 # with ThreadPool(min(len(servers), 100)) as pool: # Works in python 3.3 and up
0 commit comments