Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -327,13 +327,13 @@ ;; 0) ;; ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time ;; (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) ;; 10) ;; obviously haven't had any work to do for a while ;; (else 0))) -;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? -;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01) -;; ))) +;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? +;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01) +;; ))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) @@ -994,13 +994,13 @@ (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) - (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; + (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing - ;; + ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! ;; No runsdat, can't do this yet ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -323,16 +323,26 @@ ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4))) + (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) + (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) - (if all-go + (if (and all-go + (begin + (with-output-to-file start-flag + (lambda () + (print server-key))) + (thread-sleep! 0.25) + (let ((res (with-input-from-file start-flag + (lambda () + (read-line))))) + (equal? server-key res)))) #t ;; (system (conc "touch " start-flag)) ;; lazy but safe (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) (thread-sleep! reftime)