Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1673,11 +1673,11 @@ ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 5)) - (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log"))) + (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions exn #f @@ -1684,18 +1684,20 @@ (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (with-input-from-file fullpath read) (begin - (debug:print 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds)to trust, skipping reading it") + (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f)))) (define (common:write-cached-info key dtype dat) - (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log"))) + (let* ((fulldir (conc *toppath* "/.sysdata")) + (fullpath (conc fulldir "/" key "-" dtype ".log"))) + (if (not (file-exists? fulldir))(create-directory fulldir #t)) (handle-exceptions exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) @@ -1744,11 +1746,11 @@ (define (common:get-normalized-cpu-load-raw remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost (or (common:get-cached-info actual-host "normalized-load") (let ((data (if remote-host (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end") + (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"") read-lines) (append (with-input-from-file "/proc/loadavg" read-lines) (with-input-from-file "/proc/cpuinfo" @@ -1766,11 +1768,11 @@ (tal (cdr data)) (loads #f) (proc-num 0) ;; processor includes threads (phys-num 0) ;; physical chip on motherboard (core-num 0)) ;; core - ;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num) + ;;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num) (if (null? tal) ;; have all our data, calculate normalized load and return result (let* ((act-proc (+ proc-num 1)) (act-phys (+ phys-num 1)) (act-core (+ core-num 1)) (adj-proc-load (/ (car loads) act-proc)) @@ -1919,54 +1921,26 @@ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) - (let* ((loadavg (common:get-cpu-load remote-host)) - (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (common:get-num-cpus remote-host) - numcpus-in)) - (maxload (if force-maxload - maxload-in - (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? - (first (car loadavg)) - (next (cadr loadavg)) - (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 - (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) - (cond - ((and (> first adjload) - (> count 0)) - (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) - ((and (> loadjmp numcpus) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) - (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) - (hh (if hh-dat (car hh-dat) #f)) - (numcpus (common:get-num-cpus hh))) + (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) - (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! + (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often! (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) (if (eof-object? inl) (begin - (common:write-cached-info remote-host "num-cpus" numcpu) + (common:write-cached-info actual-host "num-cpus" numcpu) numcpu) (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) (+ numcpu 1) numcpu) (read-line)))))) @@ -1982,10 +1956,39 @@ ;; (define (common:wait-for-normalized-load maxload msg remote-host) (let ((num-cpus (common:get-num-cpus remote-host))) (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) + (let* ((loadavg (common:get-cpu-load remote-host)) + (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (common:get-num-cpus remote-host) + numcpus-in)) + (maxload (if force-maxload + maxload-in + (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? + (first (car loadavg)) + (next (cadr loadavg)) + (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 + (loadjmp (- first next)) + (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously + (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + (cond + ((and (> first adjload) + (> count 0)) + (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) + ((and (> loadjmp numcpus) + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) + (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6538) +(define megatest-version 1.6543) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -22,13 +22,13 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") -(declare (uses rmtmod)) +;; (declare (uses rmtmod)) -(import rmtmod) +;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; @@ -58,14 +58,24 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected - (common:telemetry-log (conc "rmt:"(->string cmd)) + #;(common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) + (if (> attemptnum 2) + (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) + + (cond + ((> attemptnum 2) (thread-sleep! 0.05)) + ((> attemptnum 10) (thread-sleep! 0.5)) + ((> attemptnum 20) (thread-sleep! 1))) + (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) + (begin (rmt:start-server rid) (thread-sleep! 3))) + ;;DOT digraph megatest_state_status { ;;DOT ranksep=0; ;;DOT // rankdir=LR; ;;DOT node [shape="box"]; @@ -80,10 +90,11 @@ ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas (runremote (or area-dat *runremote*)) + (attemptnum (+ 1 attemptnum)) (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))) ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; @@ -103,13 +114,13 @@ ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond ;;DOT EXIT; ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } - ;; give up if more than 15 attempts - ((> attemptnum 15) - (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") + ;; give up if more than 150 attempts + ((> attemptnum 150) + (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") (exit 1)) ;;DOT CASE2 [label="local\nreadonly\nquery"]; ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} ;;DOT CASE2 -> "rmt:open-qry-close-locally"; @@ -931,10 +942,69 @@ (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) (define (rmt:test-get-archive-block-info archive-block-id) (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id))) -(set-functions rmt:send-receive remote-server-url-set! + +(define (rmtmod:calc-ro-mode runremote *toppath*) + (if (and runremote + (remote-ro-mode-checked runremote)) + (remote-ro-mode runremote) + (let* ((dbfile (conc *toppath* "/megatest.db")) + (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future + (if runremote + (begin + (remote-ro-mode-set! runremote ro-mode) + (remote-ro-mode-checked-set! runremote #t) + ro-mode) + ro-mode)))) + +(define (extras-readonly-mode rmt-mutex log-port cmd params) + (mutex-unlock! rmt-mutex) + (debug:print-info 12 log-port "rmt:send-receive, case 3") + (debug:print 0 log-port "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) + #f) + +(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) + (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) + (mutex-lock! *rmt-mutex*) + (remote-conndat-set! runremote #f) + (http-transport:close-connections area-dat: runremote) + (remote-server-url-set! runremote #f) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) + +(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) + (if (and (vector? res) + (eq? (vector-length res) 2) + (eq? (vector-ref res 1) 'overloaded)) ;; since we are + ;; looking at the + ;; data to carry the + ;; error we'll use a + ;; fairly obtuse + ;; combo to minimise + ;; the chances of + ;; some sort of + ;; collision. this + ;; is the case where + ;; the returned data + ;; is bad or the + ;; server is + ;; overloaded and we + ;; want to ease off + ;; the queries + (let ((wait-delay (+ attemptnum (* attemptnum 10)))) + (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") + (mutex-lock! *rmt-mutex*) + (http-transport:close-connections area-dat: runremote) + (set! *runremote* #f) ;; force starting over + (mutex-unlock! *rmt-mutex*) + (thread-sleep! wait-delay) + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) + res)) ;; All good, return res + +#;(set-functions rmt:send-receive remote-server-url-set! http-transport:close-connections remote-conndat-set! debug:print debug:print-info remote-ro-mode remote-ro-mode-set! remote-ro-mode-checked-set! remote-ro-mode-checked) Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -56,64 +56,6 @@ (set! remote-ro-mode ro-mode) (set! remote-ro-mode-set! ro-mode-set) (set! remote-ro-mode-checked-set! ro-mode-checked-set) (set! remote-ro-mode-checked ro-mode-checked)) -(define (rmtmod:calc-ro-mode runremote *toppath*) - (if (and runremote - (remote-ro-mode-checked runremote)) - (remote-ro-mode runremote) - (let* ((dbfile (conc *toppath* "/megatest.db")) - (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future - (if runremote - (begin - (remote-ro-mode-set! runremote ro-mode) - (remote-ro-mode-checked-set! runremote #t) - ro-mode) - ro-mode)))) - -(define (extras-readonly-mode rmt-mutex log-port cmd params) - (mutex-unlock! rmt-mutex) - (debug:print-info 12 log-port "rmt:send-receive, case 3") - (debug:print 0 log-port "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) - #f) - -(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) - (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (mutex-lock! *rmt-mutex*) - (remote-conndat-set! runremote #f) - (http-transport:close-connections area-dat: runremote) - (remote-server-url-set! runremote #f) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) - -(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) - (if (and (vector? res) - (eq? (vector-length res) 2) - (eq? (vector-ref res 1) 'overloaded)) ;; since we are - ;; looking at the - ;; data to carry the - ;; error we'll use a - ;; fairly obtuse - ;; combo to minimise - ;; the chances of - ;; some sort of - ;; collision. this - ;; is the case where - ;; the returned data - ;; is bad or the - ;; server is - ;; overloaded and we - ;; want to ease off - ;; the queries - (let ((wait-delay (+ attemptnum (* attemptnum 10)))) - (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") - (mutex-lock! *rmt-mutex*) - (http-transport:close-connections area-dat: runremote) - (set! *runremote* #f) ;; force starting over - (mutex-unlock! *rmt-mutex*) - (thread-sleep! wait-delay) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) - res)) ;; All good, return res - ) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1017,11 +1017,10 @@ (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! (not (equal? x hed))) (runs:calc-not-completed prereqs-not-met))) (loop-list (list hed tal reg reruns)) ;; configure the load runner - (numcpus (common:get-num-cpus #f)) (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "2.0"))) ;; use a non-number string to disable (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" (string-intersperse @@ -1119,14 +1118,19 @@ (not (member 'exclusive testmode))))) ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing - (if maxload ;; only gate if maxload is specified - (common:wait-for-cpuload maxload numcpus waitdelay)) + + ;; jobtools maxload is useful for where the full Megatest run is done on one machine + (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized + (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) + + ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) @@ -1743,37 +1747,48 @@ ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; (define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry) ;; All these vars might be referenced by the testconfig file reader + ;; + ;; NEED to reprocess testconfig here, ensuring that item variables are available. + ;; This is for Tal's issue with item-specific env vars not being set for use in skip. + ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 + ;; (let* ((test-name (tests:testqueue-get-testname test-record)) (test-waitons (tests:testqueue-get-waitons test-record)) - (test-conf (tests:testqueue-get-testconfig test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) - (test-path (hash-table-ref all-tests-registry test-name)) ;; (conc *toppath* "/tests/" test-name)) ;; could use tests:get-testconfig here ... - (force (hash-table-ref/default flags "-force" #f)) - (rerun (hash-table-ref/default flags "-rerun" #f)) - (keepgoing (hash-table-ref/default flags "-keepgoing" #f)) - (incomplete-timeout (string->number (or (configf:lookup *configdat* "setup" "incomplete-timeout") "x"))) (item-path "") (db #f) (full-test-name #f)) ;; setting itemdat to a list if it is #f (if (not itemdat)(set! itemdat '())) (set! item-path (item-list->path itemdat)) (set! full-test-name (db:test-make-full-name test-name item-path)) + (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) ;; these may be needed by the launching process + + (let* ((test-conf ;; re-instate the tests:get-testconfig once the kinks are worked out. FIXME!!! + ;; (tests:get-testconfig test-name item-path all-tests-registry #t force-create: #t)) + (tests:testqueue-get-testconfig test-record )) + (test-path (hash-table-ref all-tests-registry test-name)) ;; (conc *toppath* "/tests/" test-name)) ;; could use tests:get-testconfig here ... + + (force (hash-table-ref/default flags "-force" #f)) + (rerun (hash-table-ref/default flags "-rerun" #f)) + (keepgoing (hash-table-ref/default flags "-keepgoing" #f)) + (incomplete-timeout (string->number (or (configf:lookup *configdat* "setup" "incomplete-timeout") "x"))) + ) + (debug:print-info 4 *default-log-port* "\nTESTNAME: " full-test-name "\n test-config: " (hash-table->alist test-conf) "\n itemdat: " itemdat ) (debug:print 2 *default-log-port* "Attempting to launch test " full-test-name) ;; (setenv "MT_TEST_NAME" test-name) ;; ;; (setenv "MT_ITEMPATH" item-path) ;; (setenv "MT_RUNNAME" runname) - (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) ;; these may be needed by the launching process (change-directory *toppath*) ;; Here is where the test_meta table is best updated ;; Yes, another use of a global for caching. Need a better way? ;; @@ -1881,21 +1896,26 @@ ;; run-ids = #f means *all* runs (let ((running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f))) (if (not (null? running-tests)) ;; have to skip (set! skip-test "Skipping due to previous tests running")))) - ((and skip-check - (configf:lookup test-conf "skip" "fileexists")) - (if (common:file-exists? (configf:lookup test-conf "skip" "fileexists")) - (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists"))))) - - ((and skip-check - (configf:lookup test-conf "skip" "filenotexists")) - (if (not (common:file-exists? (configf:lookup test-conf "skip" "filenotexists"))) - (set! skip-test (conc "Skipping due to non existance of file " (configf:lookup test-conf "skip" "filenotexists"))))) - - ((and skip-check + ;; split the string and OR of file-exists? + ((and skip-check + (configf:lookup test-conf "skip" "fileexists")) + (let* ((files (string-split (configf:lookup test-conf "skip" "fileexists"))) + (existing (filter common:file-exists? files))) + (if (not (null? existing)) ;; (common:file-exists? (configf:lookup test-conf "skip" "fileexists")) + (set! skip-test (conc "Skipping due to existance of file(s) " (string-intersperse existing ", ")))))) ;; (configf:lookup test-conf "skip" "fileexists"))))) + + ((and skip-check + (configf:lookup test-conf "skip" "filenotexists")) + (let* ((files (string-split (configf:lookup test-conf "skip" "filenotexists"))) + (existing (filter common:file-exists? files))) + (if (null? existing) ;; (common:file-exists? (configf:lookup test-conf "skip" "filenotexists"))) + (set! skip-test (conc "Skipping due to non existance of files " (string-intersperse files ", ")))))) ;; (configf:lookup test-conf "skip" "filenotexists"))))) + + ((and skip-check (configf:lookup test-conf "skip" "script")) (if (= (system (configf:lookup test-conf "skip" "script")) 0) (set! skip-test (conc "Skipping due to zero return value of script " (configf:lookup test-conf "skip" "script"))))) ((and skip-check @@ -1940,11 +1960,11 @@ (debug:print-error 0 *default-log-port* "Failed to launch test " full-test-name ". Unrecognised state " (test:get-state testdat)) (case (string->symbol (test:get-state testdat)) ((COMPLETED INCOMPLETE) (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) (else - (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)))))))) + (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN))))))))) ;;====================================================================== ;; END OF NEW STUFF ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -1,6 +1,5 @@ - ;; Copyright 2006-2017, Matthew Welland. ;; ;; This file is part of Megatest. ;; ;; Megatest is free software: you can redistribute it and/or modify @@ -151,11 +150,12 @@ (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + (common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory)))