Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -499,11 +499,11 @@ (apply open-run-close-no-exception-handling proc idb params))) ;; (define open-run-close (define open-run-close open-run-close-exception-handling) ;; open-run-close-no-exception-handling - open-run-close-exception-handling) +;; open-run-close-exception-handling) ;;) (define (db:initialize-main-db db) (let* ((configdat (car *configinfo*)) ;; tut tut, global warning... (keys (keys:config-get-fields configdat)) @@ -1816,17 +1816,17 @@ #f) #f)) (define (db:tests-register-test dbstruct run-id test-name item-path) (sqlite3:execute (db:get-db dbstruct run-id) 'register-test run-id test-name item-path)) - (let ((sleep-time (random 20)) - (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) - (case err-status - ((busy)(thread-sleep! 4)) - (else - (debug:print 0 "WARNING: possible problem with call to cdb:remote-run, database may be read-only and locked, waiting and trying again ...") - (thread-sleep! sleep-time))) +;; (let ((sleep-time (random 20)) +;; (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) +;; (case err-status +;; ((busy)(thread-sleep! 4)) +;; (else +;; (debug:print 0 "WARNING: possible problem with call to cdb:remote-run, database may be read-only and locked, waiting and trying again ...") +;; (thread-sleep! 
sleep-time))) (define (db:test-get-logfile-info dbstruct run-id test-name) (let ((res #f)) (sqlite3:for-each-row (lambda (path final_logf) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -144,11 +144,13 @@ (alist->env-vars env-ovrd) (set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info - (tests:set-full-meta-info test-id run-id 0 work-area) + ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) + ;; (tests:set-full-meta-info test-id run-id 0 work-area) + (tests:set-full-meta-info #f test-id run-id 0 work-area 10) ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (thread-sleep! 0.3) ;; NFS slowness has caused grief here @@ -302,11 +304,13 @@ (round (- (current-seconds) start-seconds))))) (kill-tries 0)) - (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) + ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) + ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes))) (begin (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) @@ -314,11 +318,13 @@ (begin (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! 
Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) ;; open-run-close not needed for test-set-meta-info + ;; (tests:set-partial-meta-info #f test-id run-id minutes work-area) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f) + ;; (tests:set-partial-meta-info #f test-id run-id minutes work-area 10) ;; (tests:set-partial-meta-info test-id run-id minutes work-area) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -991,11 +991,14 @@ (logprofile (args:get-arg "-logpro")) (logfile (conc stepname ".log")) (cmd (if (null? remargs) #f (car remargs))) (params (if cmd (cdr remargs) '())) (exitstat #f) - (shell (last (string-split (get-environment-variable "SHELL") "/"))) + (shell (let ((sh (get-environment-variable "SHELL") )) + (if sh + (last (string-split sh "/")) + "bash"))) (redir (case (string->symbol shell) ((tcsh csh ksh) ">&") ((zsh bash sh ash) "2>&1 >") (else ">&"))) (fullcmd (conc "(" (string-intersperse Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -636,23 +636,56 @@ (rmt:general-call 'update-run-duration run-id minutes test-id)) (if (and uname hostname) (rmt:general-call 'update-uname-host run-id uname hostname test-id))) ;; This one is for running with no db access (i.e. 
via rmt: internally)
-(define (tests:set-full-meta-info test-id run-id minutes work-area)
-  (let* ((num-records 0)
-         (cpuload  (get-cpu-load))
-         (diskfree (get-df (current-directory)))
-         (uname    (get-uname "-srvpio"))
-         (hostname (get-host-name)))
-    (tdb:remote-update-testdat-meta-info run-id test-id work-area cpuload diskfree minutes)
-    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
-
-(define (tests:set-partial-meta-info test-id run-id minutes work-area)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+  ;; Gather cpu load, free disk, uname and hostname, then pass them to tests:update-testdat-meta-info
+  ;; and tests:update-central-meta-info. On an exception: sleep 10s and retry while remtries > 0, else log and give up.
+  (handle-exceptions
+   exn
+   (if (> remtries 0)
+       (begin
+         ;; the retry is handed exactly one fewer remaining try, so remtries=10 really means ten 10s waits
+         (thread-sleep! 10)
+         (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
+       (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; #f when exn is not a sqlite3 condition
+         (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")
+         (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
+         (debug:print 0 " message: " ((condition-property-accessor 'exn 'message #f) exn))
+         (print "exn=" (condition->list exn))
+         (debug:print 0 " status: " err-status)
+         (print-call-chain)))
+   (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb))
+          (cpuload  (get-cpu-load))
+          (diskfree (get-df (current-directory)))
+          (uname    (get-uname "-srvpio"))
+          (hostname (get-host-name)))
+     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
+     (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))))
+
+;; Like tests:set-full-meta-info but only calls tests:update-testdat-meta-info (cpu load, free disk, minutes).
+(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries #!optional (db #f))
   (let* ((cpuload (get-cpu-load))
+         (diskfree (get-df (current-directory))))
+    ;; db is optional and defaults to #f; NOTE(review): confirm tests:update-testdat-meta-info accepts #f here.
+    ;; remtries is the caller's value (it is not reset locally), so the retry chain is bounded by the caller.
+    (handle-exceptions
+     exn
+     (if (> remtries 0)
+         (begin
+           (thread-sleep! 10)
+           (tests:set-partial-meta-info test-id run-id minutes work-area (- remtries 1) db))
+         (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; #f when exn is not a sqlite3 condition
+           (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")
+           (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
+           (debug:print 0 " message: " ((condition-property-accessor 'exn 'message #f) exn))
+           (print "exn=" (condition->list exn))
+           (debug:print 0 " status: " err-status)
+           (print-call-chain)))
+     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
+     )))
-         (diskfree (get-df (current-directory))))
-    (tdb:remote-update-testdat-meta-info run-id test-id work-area cpuload diskfree minutes)))
 
 ;;======================================================================
 ;; A R C H I V I N G
 ;;======================================================================
 