Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -16,11 +16,12 @@ ;; along with Megatest. If not, see . ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) - posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications) + posix-extras directory-utils pathname-expand typed-records format sxml-serializer + sxml-modifications matchable) (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) @@ -1593,19 +1594,10 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - ;; -- removed BB 17ww28 - no longer needed. - ;; every 15 minutes verify the server is there for this run - ;; (if (and (common:low-noise-print 240 "try start server" run-id) - ;; (not (or (and *runremote* - ;; (remote-server-url *runremote*) - ;; (server:ping (remote-server-url *runremote*))) - ;; (server:check-if-running *toppath*)))) - ;; (server:kind-run *toppath*)) - (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) @@ -1648,11 +1640,11 @@ "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) - (runs:parallel-runners-mgmt runsdat) + ;; (runs:parallel-runners-mgmt runsdat) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin @@ -1688,14 +1680,35 @@ (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! 
testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; This would be a good place to block on homehost load - - - - (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + ;; Trying a strategy change here. + ;; + ;; Check whether more tests can run. If yes, continue; if no, rest for 5 seconds and check again. + ;; Repeat until more tests can run or the 60 retries (about 5 minutes) are used up. + ;; + ;; look at the test jobgroup and tot jobs running + (let loop-can-run-more + ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + (remtries 60)) + (match res + ((run-more num-running . rem) + (if (or run-more + (< remtries 1)) + (begin + (if (runs:lownoise "num-running" 30) + (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) + (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through + (begin + (if (runs:lownoise "num-running" 10) + (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " + max-concurrent-jobs " allowed.")) + (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable + (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + (- remtries 1))))))) + (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;;