Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -145,11 +145,11 @@ (alist->env-vars env-ovrd) (set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info - (tests:set-full-meta-info #f test-id run-id 0 work-area) + (tests:set-full-meta-info #f test-id run-id 0 work-area 10) ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) (tests:test-force-state-status! test-id "REMOTEHOSTSTART" "n/a") (thread-sleep! 0.3) ;; NFS slowness has caused grief here @@ -305,11 +305,11 @@ (round (- (current-seconds) start-seconds))))) (kill-tries 0)) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes))) (begin (set! kill-job? (or (test-get-kill-request test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) @@ -317,11 +317,11 @@ (begin (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) ;; open-run-close not needed for test-set-meta-info - (tests:set-partial-meta-info #f test-id run-id minutes work-area) + (tests:set-partial-meta-info #f test-id run-id minutes work-area 10) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -994,11 +994,14 @@ (logprofile (args:get-arg "-logpro")) (logfile (conc stepname ".log")) (cmd (if (null? remargs) #f (car remargs))) (params (if cmd (cdr remargs) '())) (exitstat #f) - (shell (last (string-split (get-environment-variable "SHELL") "/"))) + (shell (let ((sh (get-environment-variable "SHELL") )) + (if sh + (last (string-split sh "/")) + "bash"))) (redir (case (string->symbol shell) ((tcsh csh ksh) ">&") ((zsh bash sh ash) "2>&1 >") (else ">&"))) (fullcmd (conc "(" (string-intersperse Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -704,29 +704,59 @@ (if minutes (cdb:tests-update-run-duration *runremote* test-id minutes)) (if (and uname hostname) (cdb:tests-update-uname-host *runremote* test-id uname hostname))) -(define (tests:set-full-meta-info db test-id run-id minutes work-area) +(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) ;; DOES cdb:remote-run under the hood! - (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb)) - (cpuload (get-cpu-load)) - (diskfree (get-df (current-directory))) - (uname (get-uname "-srvpio")) - (hostname (get-host-name))) - (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) - (tests:update-central-meta-info test-id cpuload diskfree minutes uname hostname))) - -(define (tests:set-partial-meta-info db test-id run-id minutes work-area) + (let ((remtries 10)) + (handle-exceptions + exn + (if (> remtries 0) + (begin + (set! remtries (- remtries 1)) + (thread-sleep! 10) + (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) + (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") + (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) + (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain))) + (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb)) + (cpuload (get-cpu-load)) + (diskfree (get-df (current-directory))) + (uname (get-uname "-srvpio")) + (hostname (get-host-name))) + (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) + (tests:update-central-meta-info test-id cpuload diskfree minutes uname hostname))))) + +(define (tests:set-partial-meta-info db test-id run-id minutes work-area remtries) ;; DOES cdb:remote-run under the hood! (let* ((cpuload (get-cpu-load)) - (diskfree (get-df (current-directory)))) - (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) + (diskfree (get-df (current-directory))) + (remtries 10)) + (handle-exceptions + exn + (if (> remtries 0) + (begin + (set! remtries (- remtries 1)) + (thread-sleep! 10) + (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) + (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") + (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) + (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain))) + (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) ;; Update central with uname and hostname = #f ;; Is this one of the performance problems? This info should come from testdat-meta anyway ;; (tests:update-central-meta-info test-id cpuload diskfree minutes #f #f) - )) + ))) (define (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) (let ((tdb (db:open-test-db-by-test-id db test-id work-area: work-area))) (if (sqlite3:database? tdb) (begin