@@ -1005,92 +1005,97 @@ (test-retry-time (make-hash-table)) (allow-run-time 10)) ;; seconds to allow for killing tests before just brutally killing 'em (let loop ((test (car sorted-tests)) (tal (cdr sorted-tests))) (let* ((test-id (db:test-get-id test)) - (new-test-dat (cdb:remote-run db:get-test-info-by-id #f test-id)) - (item-path (db:test-get-item-path new-test-dat)) - (test-name (db:test-get-testname new-test-dat)) - (run-dir (db:test-get-rundir new-test-dat)) ;; run dir is from the link tree - (real-dir (if (file-exists? run-dir) - (resolve-pathname run-dir) - #f)) - (test-state (db:test-get-state new-test-dat)) - (test-fulln (db:test-get-fullname new-test-dat))) - (case action - ((remove-runs) - (debug:print-info 0 "test: " test-name " itest-state: " test-state) - (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) - (begin - (if (not (hash-table-ref/default test-retry-time test-fulln #f)) - (begin - ;; want to set to REMOVING BUT CANNOT do it here? - (hash-table-set! test-retry-time test-fulln (current-seconds)))) - (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time) - ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first - ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give - ;; up and blow it away. - (begin - (debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing") - (mt:test-set-state-status-by-id (db:test-get-id test) "FAILEDKILL" "n/a" #f) - (thread-sleep! 1)) - (begin - (mt:test-set-state-status-by-id (db:test-get-id test) "KILLREQ" "n/a" #f) - (thread-sleep! 1))) - ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ... - (if (null? tal) - (loop new-test-dat tal) - (loop (car tal)(append tal (list new-test-dat))))) - (begin - (mt:test-set-state-status-by-id (db:test-get-id test) "REMOVING" "LOCKED" #f) - (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) - (if (and real-dir - (> (string-length real-dir) 5) - (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. - (begin ;; let* ((realpath (resolve-pathname run-dir))) - (debug:print-info 1 "Recursively removing " real-dir) - (if (file-exists? real-dir) - (runs:safe-delete-test-dir real-dir) - (debug:print 0 "WARNING: test dir " real-dir " appears to not exist or is not readable"))) - (if real-dir - (debug:print 0 "WARNING: directory " real-dir " does not exist") - (debug:print 0 "WARNING: no real directory corrosponding to link " run-dir ", nothing done"))) - (if (symbolic-link? run-dir) - (begin - (debug:print-info 1 "Removing symlink " run-dir) - (handle-exceptions - exn - (debug:print 0 "ERROR: Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") - (delete-file run-dir))) - (if (directory? run-dir) - (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0) - (debug:print 0 "WARNING: refusing to remove " run-dir " as it is not empty") - (handle-exceptions - exn - (debug:print 0 "ERROR: Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") - (delete-directory run-dir))) - (if run-dir - (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") - (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) - )) - ;; Only delete the records *after* removing the directory. If things fail we have a record - (cdb:remote-run db:delete-test-records db #f (db:test-get-id test)) - (if (not (null? tal)) - (loop (car tal)(cdr tal)))))) - ((set-state-status) - (debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status)) - (mt:test-set-state-status-by-id (db:test-get-id test) (car state-status)(cadr state-status) #f) - (if (not (null? tal)) - (loop (car tal)(cdr tal)))) - ((run-wait) - (debug:print-info 2 "still waiting, " (length tests) " tests still running") - (thread-sleep! 10) - (let ((new-tests (proc-get-tests run-id))) - (if (null? new-tests) - (debug:print-info 1 "Run completed according to zero tests matching provided criteria.") - (loop (car new-tests)(cdr new-tests)))))))) - ))) + (new-test-dat (cdb:get-test-info-by-id *runremote* test-id))) + (if (not new-test-dat) + (begin + (debug:print 0 "ERROR: We have a test-id of " test-id " but no record was found. NOTE: No locking of records is done between processes, do not simultaneously remove the same run from two processes!") + (if (not (null? tal)) + (loop (car tal)(cdr tal)))) + (let* ((item-path (db:test-get-item-path new-test-dat)) + (test-name (db:test-get-testname new-test-dat)) + (run-dir (db:test-get-rundir new-test-dat)) ;; run dir is from the link tree + (real-dir (if (file-exists? run-dir) + (resolve-pathname run-dir) + #f)) + (test-state (db:test-get-state new-test-dat)) + (test-fulln (db:test-get-fullname new-test-dat))) + (case action + ((remove-runs) + (debug:print-info 0 "test: " test-name " itest-state: " test-state) + (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) + (begin + (if (not (hash-table-ref/default test-retry-time test-fulln #f)) + (begin + ;; want to set to REMOVING BUT CANNOT do it here? + (hash-table-set! test-retry-time test-fulln (current-seconds)))) + (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time) + ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first + ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give + ;; up and blow it away. + (begin + (debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing") + (mt:test-set-state-status-by-id (db:test-get-id test) "FAILEDKILL" "n/a" #f) + (thread-sleep! 1)) + (begin + (mt:test-set-state-status-by-id (db:test-get-id test) "KILLREQ" "n/a" #f) + (thread-sleep! 1))) + ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ... + (if (null? tal) + (loop new-test-dat tal) + (loop (car tal)(append tal (list new-test-dat))))) + (begin + (mt:test-set-state-status-by-id (db:test-get-id test) "REMOVING" "LOCKED" #f) + (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) + (if (and real-dir + (> (string-length real-dir) 5) + (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. + (begin ;; let* ((realpath (resolve-pathname run-dir))) + (debug:print-info 1 "Recursively removing " real-dir) + (if (file-exists? real-dir) + (runs:safe-delete-test-dir real-dir) + (debug:print 0 "WARNING: test dir " real-dir " appears to not exist or is not readable"))) + (if real-dir + (debug:print 0 "WARNING: directory " real-dir " does not exist") + (debug:print 0 "WARNING: no real directory corrosponding to link " run-dir ", nothing done"))) + (if (symbolic-link? run-dir) + (begin + (debug:print-info 1 "Removing symlink " run-dir) + (handle-exceptions + exn + (debug:print 0 "ERROR: Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") + (delete-file run-dir))) + (if (directory? run-dir) + (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0) + (debug:print 0 "WARNING: refusing to remove " run-dir " as it is not empty") + (handle-exceptions + exn + (debug:print 0 "ERROR: Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") + (delete-directory run-dir))) + (if run-dir + (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") + (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) + )) + ;; Only delete the records *after* removing the directory. If things fail we have a record + (cdb:remote-run db:delete-test-records db #f (db:test-get-id test)) + (if (not (null? tal)) + (loop (car tal)(cdr tal)))))) + ((set-state-status) + (debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status)) + (mt:test-set-state-status-by-id (db:test-get-id test) (car state-status)(cadr state-status) #f) + (if (not (null? tal)) + (loop (car tal)(cdr tal)))) + ((run-wait) + (debug:print-info 2 "still waiting, " (length tests) " tests still running") + (thread-sleep! 10) + (let ((new-tests (proc-get-tests run-id))) + (if (null? new-tests) + (debug:print-info 1 "Run completed according to zero tests matching provided criteria.") + (loop (car new-tests)(cdr new-tests)))))))) + ))))) ;; remove the run if zero tests remain (if (eq? action 'remove-runs) (let ((remtests (mt:get-tests-for-run (db:get-value-by-header run header "id") #f '("DELETED") '("n/a") not-in: #t))) (if (null? remtests) ;; no more tests remaining (let* ((dparts (string-split lasttpath "/"))