Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -165,10 +165,12 @@ (define-inline (dboard:uidat-get-keycol vec)(vector-ref vec 0)) (define-inline (dboard:uidat-get-lftcol vec)(vector-ref vec 1)) (define-inline (dboard:uidat-get-header vec)(vector-ref vec 2)) (define-inline (dboard:uidat-get-runsvec vec)(vector-ref vec 3)) + +(if (get-environment-variable "MT_RUN_AREA_HOME")(change-directory (get-environment-variable "MT_RUN_AREA_HOME"))) (define (message-window msg) (iup:show (iup:dialog (iup:vbox Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -480,21 +480,28 @@ #f)) (define (open-run-close-exception-handling proc idb . params) (handle-exceptions exn - (begin - (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") - (debug:print 0 " " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain) - (thread-sleep! (random 120)) - (debug:print-info 0 "trying db call one more time....") - (apply open-run-close-no-exception-handling proc idb params)) + (let ((sleep-time (random 30)) + (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (case err-status + ((busy) + (thread-sleep! sleep-time)) + (else + (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) + (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain) + (thread-sleep! sleep-time) + (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up"))) + (apply open-run-close-exception-handling proc idb params)) (apply open-run-close-no-exception-handling proc idb params))) ;; (define open-run-close -(define open-run-close ;; (if (debug:debug-mode 2) +(define open-run-close open-run-close-exception-handling) ;; open-run-close-no-exception-handling open-run-close-exception-handling) ;;) (define (db:initialize-main-db db) @@ -611,10 +618,12 @@ comment TEXT DEFAULT '', status TEXT DEFAULT 'n/a', type TEXT DEFAULT '', CONSTRAINT test_data_constraint UNIQUE (test_id,category,variable));") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_rundat ( + ;; Why use FULL here? This data is not that critical + ;; (sqlite3:execute db "PRAGMA synchronous = FULL;") id INTEGER PRIMARY KEY, test_id INTEGER, update_time TIMESTAMP, cpuload INTEGER DEFAULT -1, diskfree INTEGER DEFAULT -1, @@ -1807,10 +1816,17 @@ #f) #f)) (define (db:tests-register-test dbstruct run-id test-name item-path) (sqlite3:execute (db:get-db dbstruct run-id) 'register-test run-id test-name item-path)) + (let ((sleep-time (random 20)) + (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (case err-status + ((busy)(thread-sleep! 4)) + (else + (debug:print 0 "WARNING: possible problem with call to cdb:remote-run, database may be read-only and locked, waiting and trying again ...") + (thread-sleep! sleep-time))) (define (db:test-get-logfile-info dbstruct run-id test-name) (let ((res #f)) (sqlite3:for-each-row (lambda (path final_logf) @@ -1976,10 +1992,35 @@ (hash-table-set! tests-hash full-testname testdat)))) results) (if (null? tal) (map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests (loop (car tal)(cdr tal)))))))))) + (let* ((remtries 10) + (proc #f)) + (set! proc (lambda (remtries) + (if (> remtries 0) + (handle-exceptions + exn + (let ((sleep-time (random 30)) + (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (case err-status + ((busy) + (thread-sleep! sleep-time) + (proc 10)) ;; we never give up on busy + (else + (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain) + (debug:print 0 "Sleeping for " sleep-time) + (thread-sleep! sleep-time) + (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up") + (proc (- remtries 1))))) + (apply sqlite3:execute db query params)) + (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: " + query ", params: " params)))) + (proc remtries)) (define (db:test-get-records-for-index-file dbstruct run-id test-name) (let ((res '())) (sqlite3:for-each-row (lambda (id itempath state status run_duration logf-id comment-id) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -1216,12 +1216,10 @@ (db:replace-test-records dbstruct run-id testrecs))) run-ids) (set! *didsomething* #t) (db:close-all dbstruct))) - - ;;====================================================================== ;; Exit and clean up ;;====================================================================== (if *runremote* (close-all-connections!)) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -453,22 +453,25 @@ ;; prereqstrs)) (give-up #f)) ;; We can get here when a prereq has not been run due to *it* having a prereq that failed. ;; We need to use this to dequeue this item as CANNOTRUN - (for-each (lambda (prereq) - (if (eq? (hash-table-ref/default test-registry prereq 'justfine) 'CANNOTRUN) - (set! give-up #t))) - prereqstrs) + ;; + (if (member testmode '(toplevel)) + (for-each (lambda (prereq) + (if (eq? (hash-table-ref/default test-registry prereq 'justfine) 'CANNOTRUN) + (set! give-up #t))) + prereqstrs)) + (if (and give-up (not (and (null? tal)(null? reg)))) (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records)) (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records))) (debug:print 1 "WARNING: test " hed " has discarded prerequisites, removing it from the queue") (let ((test-id (rmt:get-test-id run-id hed ""))) - (mt:test-set-state-status-by-id test-id "DEQUED" "PREQ_FAIL" "Failed to run due to failed prerequisites")) + (mt:test-set-state-status-by-id test-id "DEQUEUED" "PREQ_FAIL" "Failed to run due to failed prerequisites")) (if (and (null? trimmed-tal) (null? trimmed-reg)) #f (list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull)