Megatest

Diff
Login

Differences From Artifact [b67c39dcab]:

To Artifact [e111ea3adb]:


851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876

877
878
879
880
881
882
883
884
885
886
887
888
889
890
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (current-seconds))
	(last-time-some-running (current-seconds)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
		      (st (db:test-get-state     trec)))
		  (if (not (equal? st "DELETED"))
		      (hash-table-set! test-registry (runs:make-full-test-name tn ip) (string->symbol st)))))
	      tests-info)
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names))
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))

      (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      ;; (if (> (current-seconds)(+ last-time-incomplete 900))
      ;;     (begin
      ;;       (set! last-time-incomplete (current-seconds))
      ;;       (cdb:remote-run db:find-and-mark-incomplete #f)))

      ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
      (let* ((test-record (hash-table-ref test-records hed))
	     (test-name   (tests:testqueue-get-testname test-record))
	     (tconfig     (tests:testqueue-get-testconfig test-record))
	     (jobgroup    (config-lookup tconfig "test_meta" "jobgroup"))
	     (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))







|


















>



|
|
|
|







851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
	(last-time-some-running (current-seconds)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
		      (st (db:test-get-state     trec)))
		  (if (not (equal? st "DELETED"))
		      (hash-table-set! test-registry (runs:make-full-test-name tn ip) (string->symbol st)))))
	      tests-info)
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names))
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))

      (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      (if (> (current-seconds)(+ last-time-incomplete 900))
          (begin
            (set! last-time-incomplete (current-seconds))
            (cdb:remote-run db:find-and-mark-incomplete #f)))

      ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
      (let* ((test-record (hash-table-ref test-records hed))
	     (test-name   (tests:testqueue-get-testname test-record))
	     (tconfig     (tests:testqueue-get-testconfig test-record))
	     (jobgroup    (config-lookup tconfig "test_meta" "jobgroup"))
	     (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
;; fields are passing in through 
;; action:
;;    'remove-runs
;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))







|







1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
;; fields are passing in through 
;; action:
;;    'remove-runs
;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384



1385
1386




1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
			     (if (not (null? tal))
				 (loop (car tal)(cdr tal))))
			   (let* ((item-path     (db:test-get-item-path new-test-dat))
				  (test-name     (db:test-get-testname new-test-dat))
				  (run-dir       ;;(filedb:get-path *fdb*
				   ;; (rmt:sdb-qry 'getid 
				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree
				  (real-dir      (if (file-exists? run-dir)
						     (resolve-pathname run-dir)
						     #f))
				  (test-state    (db:test-get-state new-test-dat))
				  (test-fulln    (db:test-get-fullname new-test-dat)))



			     (case action
			       ((remove-runs)




				(debug:print-info 0 "test: " test-name " itest-state: " test-state)
				(if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
				    (begin
				      (if (not (hash-table-ref/default test-retry-time test-fulln #f))
					  (begin
					    ;; want to set to REMOVING BUT CANNOT do it here?
					    (hash-table-set! test-retry-time test-fulln (current-seconds))))
				      (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
					  ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
					  ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give
					  ;; up and blow it away.
					  (begin
					    (debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing")
					    (mt:test-set-state-status-by-id run-id (db:test-get-id test) "FAILEDKILL" "n/a" #f)
					    (thread-sleep! 1))
					  (begin
					    (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f)
					    (thread-sleep! 1)))
				      ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ...
				      (if (null? tal)
					  (loop new-test-dat tal)
					  (loop (car tal)(append tal (list new-test-dat)))))
				    (begin
				      (mt:test-set-state-status-by-id run-id (db:test-get-id test) "REMOVING" "LOCKED" #f)
				      (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
				      (if (and real-dir 
					       (> (string-length real-dir) 5)
					       (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
					  (begin ;; let* ((realpath (resolve-pathname run-dir)))
					    (debug:print-info 1 "Recursively removing " real-dir)
					    (if (file-exists? real-dir)
						(runs:safe-delete-test-dir real-dir)
						(debug:print 0 "WARNING: test dir " real-dir " appears to not exist or is not readable")))
					  (if real-dir 
					      (debug:print 0 "WARNING: directory " real-dir " does not exist")
					      (debug:print 0 "WARNING: no real directory corrosponding to link " run-dir ", nothing done")))
				      (if (symbolic-link? run-dir)
					  (begin
					    (debug:print-info 1 "Removing symlink " run-dir)
					    (handle-exceptions
					     exn
					     (debug:print 0 "ERROR:  Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
					     (delete-file run-dir)))
					  (if (directory? run-dir)
					      (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0)
						  (debug:print 0 "WARNING: refusing to remove " run-dir " as it is not empty")
						  (handle-exceptions
						   exn
						   (debug:print 0 "ERROR:  Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
						   (delete-directory run-dir)))
					      (if run-dir
						  (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
						  (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
					      ))
				      ;; Only delete the records *after* removing the directory. If things fail we have a record 
				      (rmt:delete-test-records (db:test-get-run_id test)(db:test-get-id test))
				      (if (not (null? tal))
					  (loop (car tal)(cdr tal))))))
			       ((set-state-status)
				(debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status))
				(mt:test-set-state-status-by-id run-id (db:test-get-id test) (car state-status)(cadr state-status) #f)
				(if (not (null? tal))
				    (loop (car tal)(cdr tal))))
			       ((run-wait)
				(debug:print-info 2 "still waiting, " (length tests) " tests still running")







<
<
<

|
>
>
>


>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|

|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
|
|







1374
1375
1376
1377
1378
1379
1380



1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414


























1415






1416
1417
1418
1419
1420
1421
1422
1423
1424
			     (if (not (null? tal))
				 (loop (car tal)(cdr tal))))
			   (let* ((item-path     (db:test-get-item-path new-test-dat))
				  (test-name     (db:test-get-testname new-test-dat))
				  (run-dir       ;;(filedb:get-path *fdb*
				   ;; (rmt:sdb-qry 'getid 
				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree



				  (test-state    (db:test-get-state new-test-dat))
				  (test-fulln    (db:test-get-fullname new-test-dat))
				  (uname         (db:test-get-uname    new-test-dat))
				  (toplevel-with-children (and (db:test-get-is-toplevel test)
							       (> (cdb:remote-run db:test-toplevel-num-items db run-id test-name) 0))))
			     (case action
			       ((remove-runs)
				;; if the test is a toplevel-with-children issue an error and do not remove
				(if toplevel-with-children
				    (debug:print 0 "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests")
				    (begin
				      (debug:print-info 0 "test: " test-name " itest-state: " test-state)
				      (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
					  (begin
					    (if (not (hash-table-ref/default test-retry-time test-fulln #f))
						(begin
						  ;; want to set to REMOVING BUT CANNOT do it here?
						  (hash-table-set! test-retry-time test-fulln (current-seconds))))
					    (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
						;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
						;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give
						;; up and blow it away.
						(begin
						  (debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing")
					    (mt:test-set-state-status-by-id run-id (db:test-get-id test) "FAILEDKILL" "n/a" #f)
						  (thread-sleep! 1))
						(begin
					    (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f)
						  (thread-sleep! 1)))
					    ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ...
					    (if (null? tal)
						(loop new-test-dat tal)
						(loop (car tal)(append tal (list new-test-dat)))))
					  (begin


























					    (runs:remove-test-directory db new-test-dat remove-data-only)






					    (if (not (null? tal))
						(loop (car tal)(cdr tal))))))))
			       ((set-state-status)
				(debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status))
				(mt:test-set-state-status-by-id run-id (db:test-get-id test) (car state-status)(cadr state-status) #f)
				(if (not (null? tal))
				    (loop (car tal)(cdr tal))))
			       ((run-wait)
				(debug:print-info 2 "still waiting, " (length tests) " tests still running")
1473
1474
1475
1476
1477
1478
1479









































1480
1481
1482
1483
1484
1485
1486
		       ;; 	 (debug:print 1 "Removing run dir " runpath)
		       ;; 	 (system (conc "rmdir -p " runpath))))
		       )))))
	 ))
     runs))
  #t)










































;;======================================================================
;; Routines for manipulating runs
;;======================================================================

;; Since many calls to a run require pretty much the same setup 
;; this wrapper is used to reduce the replication of code
(define (general-run-call switchname action-desc proc)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
		       ;; 	 (debug:print 1 "Removing run dir " runpath)
		       ;; 	 (system (conc "rmdir -p " runpath))))
		       )))))
	 ))
     runs))
  #t)

(define (runs:remove-test-directory db test remove-data-only)
  (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
	 (real-dir      (if (file-exists? run-dir)
			    (resolve-pathname run-dir)
			    #f)))
    (if (not remove-data-only)
	(mt:test-set-state-status-by-id (db:test-get-id test) "REMOVING" "LOCKED" #f))
    (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
    (if (and real-dir 
	     (> (string-length real-dir) 5)
	     (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
	(begin ;; let* ((realpath (resolve-pathname run-dir)))
	  (debug:print-info 1 "Recursively removing " real-dir)
	  (if (file-exists? real-dir)
	      (runs:safe-delete-test-dir real-dir)
	      (debug:print 0 "WARNING: test dir " real-dir " appears to not exist or is not readable")))
	(if real-dir 
	    (debug:print 0 "WARNING: directory " real-dir " does not exist")
	    (debug:print 0 "WARNING: no real directory corrosponding to link " run-dir ", nothing done")))
    (if (symbolic-link? run-dir)
	(begin
	  (debug:print-info 1 "Removing symlink " run-dir)
	  (handle-exceptions
	   exn
	   (debug:print 0 "ERROR:  Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
	   (delete-file run-dir)))
	(if (directory? run-dir)
	    (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0)
		(debug:print 0 "WARNING: refusing to remove " run-dir " as it is not empty")
		(handle-exceptions
		 exn
		 (debug:print 0 "ERROR:  Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
		 (delete-directory run-dir)))
	    (if run-dir
		(debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
		(debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
	    ))
    ;; Only delete the records *after* removing the directory. If things fail we have a record 
    (if (not remove-data-only)
	(cdb:remote-run db:delete-test-records db #f (db:test-get-id test)))))

;;======================================================================
;; Routines for manipulating runs
;;======================================================================

;; Since many calls to a run require pretty much the same setup 
;; this wrapper is used to reduce the replication of code
(define (general-run-call switchname action-desc proc)