Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1992,35 +1992,35 @@ (hash-table-set! tests-hash full-testname testdat)))) results) (if (null? tal) (map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests (loop (car tal)(cdr tal)))))))))) - (let* ((remtries 10) - (proc #f)) - (set! proc (lambda (remtries) - (if (> remtries 0) - (handle-exceptions - exn - (let ((sleep-time (random 30)) - (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) - (case err-status - ((busy) - (thread-sleep! sleep-time) - (proc 10)) ;; we never give up on busy - (else - (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain) - (debug:print 0 "Sleeping for " sleep-time) - (thread-sleep! sleep-time) - (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up") - (proc (- remtries 1))))) - (apply sqlite3:execute db query params)) - (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: " - query ", params: " params)))) - (proc remtries)) +;; (let* ((remtries 10) +;; (proc #f)) +;; (set! proc (lambda (remtries) +;; (if (> remtries 0) +;; (handle-exceptions +;; exn +;; (let ((sleep-time (random 30)) +;; (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) +;; (case err-status +;; ((busy) +;; (thread-sleep! sleep-time) +;; (proc 10)) ;; we never give up on busy +;; (else +;; (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") +;; (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) +;; (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) +;; (print-call-chain) +;; (debug:print 0 "Sleeping for " sleep-time) +;; (thread-sleep! sleep-time) +;; (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up") +;; (proc (- remtries 1))))) +;; (apply sqlite3:execute db query params)) +;; (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: " +;; query ", params: " params)))) +;; (proc remtries)) (define (db:test-get-records-for-index-file dbstruct run-id test-name) (let ((res '())) (sqlite3:for-each-row (lambda (id itempath state status run_duration logf-id comment-id) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -330,30 +330,35 @@ ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid (vector-ref exit-info 0))) (if (number? pid) - (process-signal pid signal/kill) - ;; (begin - ;; (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")") - ;; (let ((processes (cmd-run->list (conc "pgrep -l -P " pid)))) - ;; (for-each - ;; (lambda (p) - ;; (let* ((parts (string-split p)) - ;; (p-id (if (> (length parts) 0) - ;; (string->number (car parts)) - ;; #f))) - ;; (if p-id - ;; (begin - ;; (debug:print 0 "Killing " (cadr parts) "; kill -9 " p-id) - ;; (system (conc "kill -9 " p-id)))))) - ;; (car processes)) - ;; (system (conc "kill -9 -" pid)))) + (handle-exceptions + exn + (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") + ;;(process-signal pid signal/kill)) + (begin + (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")") + (let ((processes (cmd-run->list (conc "pgrep -l -P " pid)))) + (for-each + (lambda (p) + (let* ((parts (string-split p)) + (p-id (if (> (length parts) 0) + (string->number (car parts)) + #f))) + (if p-id + (begin + (debug:print 0 "Killing " (cadr parts) "; kill -9 " p-id) + ;; (process-signal pid signal/kill))))) ;; + (system (conc "kill -9 " p-id)))))) + (car processes))) + (system (conc "kill -9 -" pid)) + (tests:test-set-status! test-id "KILLED" "FAIL" (args:get-arg "-m") #f))) (begin (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process") - (tests:test-set-status! run-id test-id "KILLED" "FAIL" - (args:get-arg "-m") #f) +;; (tests:test-set-status! run-id test-id "KILLED" "FAIL" + (tests:test-set-status! run-id test-id "KILLED" "FAIL" (args:get-arg "-m") #f) (exit 1) ;; IS THIS NECESSARY OR WISE??? ))) (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) (if keep-going