Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -2082,20 +2082,22 @@ (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where ;; numcpus (or could be ;; maxload) is zero, ;; crude fallback is to ;; at least use 1 - (loadjmp (- first next)) + (loadjmp (- first (if (> next (* numcpus 0.7)) + 0 + next))) ;; we will force a conservative calculation any time next is large. ;; add some randomness to the time to break any alignment ;; where netbatch dumps many jobs to machines simultaneously (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) (/ (- 1000 count) 10) waitdelay) (- first adjmaxload) )) ))) ;; let's let the user know once in a long while that load checking ;; is happening but not constantly report it - (if (> (random 100) 75) ;; about 25% of the time + (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) (cond ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable (> num-tries 0)) @@ -2104,25 +2106,30 @@ (common:wait-for-cpuload maxload-in numcpus-in waitdelay count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) ((and (> first adjmaxload) (> count 0)) (debug:print-info 0 *default-log-port* - "server start delayed " adjwait + "Delaying " adjwait " seconds due to load " first " exceeding max of " adjmaxload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) - ((and (> loadjmp numcpus) + ((and (> loadjmp (cond + ((> numcpus 8)(/ numcpus 4)) + ((> numcpus 4)(/ numcpus 2)) + (else 0.5))) (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus + (if msg (conc " " msg) "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) (else (if (> num-tries 0) - (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.") + (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) + (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -171,41 +171,65 @@ ;; (dbr:dbstruct-mtime-set! dbstruct (current-milliseconds)) ;; (dbr:dbstruct-rtime-set! dbstruct (current-milliseconds))) ;; (dbr:dbstruct-inuse-set! dbstruct #f) ;; (mutex-unlock! *rundb-mutex*)))) +(define-inline (db:generic-error-printout exn . message) + (print-call-chain (current-error-port)) + (apply debug:print-error 0 *default-log-port* message) + (debug:print-error 0 *default-log-port* " params: " params + ", error: " ((condition-property-accessor 'exn 'message) exn) + ", arguments: " ((condition-property-accessor 'exn 'arguments) exn) + ", location: " ((condition-property-accessor 'exn 'location) exn) + )) + +(print-call-chain (current-error-port)) ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;") ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no ;; (define (db:with-db dbstruct run-id r/w proc . params) (let* ((have-struct (dbr:dbstruct? dbstruct)) - (dbdat (if have-struct - (db:get-db dbstruct) - #f)) - (db (if have-struct - (db:dbdat-get-db dbdat) - dbstruct)) - (use-mutex (> *api-process-request-count* 25))) + (dbdat (if have-struct + (db:get-db dbstruct) + #f)) + (db (if have-struct + (db:dbdat-get-db dbdat) + dbstruct)) + (fname (db:dbdat-get-path dbdat)) + (use-mutex (> *api-process-request-count* 25))) ;; was 25 (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) - (handle-exceptions - exn + (condition-case (begin - (print-call-chain (current-error-port)) - (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) - ;; there is no recovering at this time. exit - (exit 50)) - (if use-mutex (mutex-lock! *db-with-db-mutex*)) - (let ((res (apply proc db params))) - (if use-mutex (mutex-unlock! *db-with-db-mutex*)) - ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) - (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)) - res)))) - + ;;;;;;;;; (handle-exceptions + ;;;;;;;;; exn + ;;;;;;;;; (begin + ;;;;;;;;; (print-call-chain (current-error-port)) + ;;;;;;;;; (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) + ;;;;;;;;; ;; there is no recovering at this time. exit + ;;;;;;;;; (exit 50)) + (if use-mutex (mutex-lock! *db-with-db-mutex*)) + (let ((res (apply proc db params))) + (if use-mutex (mutex-unlock! *db-with-db-mutex*)) + ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) + (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)) + res)) + (exn (io-error) + (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) + (exn (corrupt) + (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed.")) + (exn (busy) + (db:generic-error-printout exn "ERROR: database " fname + " is locked. Try copying to another location, remove original and copy back.")) + (exn (permission)(db:generic-error-printout exn "ERROR: database " fname " has some permissions problem.")) + (exn () + (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: " + ((condition-property-accessor 'exn 'message) exn)))))) + ;;====================================================================== ;; K E E P F I L E D B I N dbstruct ;;====================================================================== ;; (define (db:get-filedb dbstruct run-id) @@ -1799,84 +1823,95 @@ (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) (db:with-db dbstruct #f #f (lambda (db) - ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes - ;; - ;; HOWEVER: this code in run:test seems to work fine - ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) - ;; (db:test-get-run_duration testdat))) - ;; 600) - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) - (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) - db - - "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');" - run-id running-deadtime) ;; default time 720 seconds - - - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) - db - "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" - run-id remotehoststart-deadtime) ;; default time 230 seconds - - ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config - ;; - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked") + (let* ((stmth1 (db:get-cache-stmth + dbstruct db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) + AND state IN ('RUNNING');")) + (stmth2 (db:get-cache-stmth + dbstruct db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) + AND state IN ('REMOTEHOSTSTART');")) + (stmth3 (db:get-cache-stmth + dbstruct db + "SELECT id,rundir,uname,testname,item_path FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 + AND state IN ('LAUNCHED');"))) + ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes + ;; + ;; HOWEVER: this code in run:test seems to work fine + ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) + ;; (db:test-get-run_duration testdat))) + ;; 600) + ;; (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" + test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) + " event-time="event-time" run-duration="run-duration)))) + stmth1 + run-id running-deadtime) ;; default time 720 seconds + + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id + " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time + " run-duration="run-duration) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) + stmth2 + run-id remotehoststart-deadtime) ;; default time 230 seconds + + ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config + ;; + ;; (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id + " 1 day since event_time marked") (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) - db - "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');" - run-id) + stmth3 + run-id) - (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") + (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " + (length toplevels) " old LAUNCHED toplevel tests and " + (length incompleted) " tests marked RUNNING but apparently dead.")) - ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. + ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. ;; - ;; (db:delay-if-busy dbdat) - (let* (;; (min-incompleted (filter (lambda (x) - ;; (let* ((testpath (cadr x)) - ;; (tdatpath (conc testpath "/testdat.db")) - ;; (dbexists (common:file-exists? tdatpath))) - ;; (or (not dbexists) ;; if no file then something wrong - mark as incomplete - ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim - ;; incompleted)) - (min-incompleted-ids (map car incompleted)) ;; do 'em all - (all-ids (append min-incompleted-ids (map car oldlaunched)))) - (if (> (length all-ids) 0) - (begin - ;; (launch:is-test-alive "localhost" 435) - (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") - (for-each + ;; (db:delay-if-busy dbdat) + (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all + (all-ids (append min-incompleted-ids (map car oldlaunched)))) + (if (> (length all-ids) 0) + (begin + ;; (launch:is-test-alive "localhost" 435) + (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") + " as DEAD") + (for-each (lambda (test-id) (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) (tinfo (db:get-test-info-by-id dbstruct run-id test-id)) (run-dir (db:test-get-rundir tinfo)) (host (db:test-get-host tinfo)) @@ -1883,45 +1918,30 @@ (pid (db:test-get-process_id tinfo)) (result (db:get-status-from-final-status-file run-dir))) (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) (begin (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" - "Test stopped responding but it has PASSED; marking it PASS in the DB.")) - (let ((is-alive (launch:is-test-alive host pid))) + (db:set-state-status-and-roll-up-items + dbstruct run-id test-id 'foo "COMPLETED" "PASS" + "Test stopped responding but it has PASSED; marking it PASS in the DB.")) + (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. + (launch:is-test-alive host pid)))) (if is-alive - (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.") + (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host + " has a process on pid " pid ", NOT setting to DEAD.") (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" - "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) - ;; call end of eud of run detection for posthook - from merge, is it needed? - ;; (launch:end-of-run-check run-id) + (debug:print 0 *default-log-port* "INFO: test " test-id + " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items + dbstruct run-id test-id 'foo "COMPLETED" "DEAD" + "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) + ;; call end of eud of run detection for posthook - from merge, is it needed? + ;; (launch:end-of-run-check run-id) all-ids) - ;;call end of eud of run detection for posthook - (launch:end-of-run-check run-id) - ))))))) - - -;; ALL REPLACED BY THE BLOCK ABOVE -;; -;; (sqlite3:execute -;; db -;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" -;; (string-intersperse (map conc all-ids) ",") -;; ");") -;; run-id)))) -;; -;; ;; Now do rollups for the toplevel tests -;; ;; -;; ;; (db:delay-if-busy dbdat) -;; (for-each -;; (lambda (toptest) -;; (let ((test-name (list-ref toptest 3))) -;; ;; (run-id (list-ref toptest 5))) -;; (db:top-test-set-per-pf-counts dbstruct run-id test-name))) -;; toplevels))) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id) + ))))))) ;; BUG: Probably broken - does not explicitly use run-id in the query ;; (define (db:top-test-set-per-pf-counts dbstruct run-id test-name) (db:general-call dbstruct 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name))) @@ -3255,11 +3275,11 @@ (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth run-id)))))) ;; For a given testname how many items are running? Used to determine ;; probability for regenerating html -;; +;; (define (db:get-count-tests-running-for-testname dbstruct run-id testname) (db:with-db dbstruct run-id #f Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1369,11 +1369,11 @@ ;; (thread-sleep! 1) ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) (item-path (item-list->path itemdat)) (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) - (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) + (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -315,11 +315,13 @@ (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move ;; to cond clauses below where we determine we ;; have too many jobs running rather than each ;; time the and condition above is true (which - ;; seems like always)? + ;; seems like always)? + ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way + 0) ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) 10) ;; obviously haven't had any work to do for a while (else 0))) ;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? @@ -1192,11 +1194,11 @@ ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. - (thread-sleep! 1) + (thread-sleep! 0.25) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;;