Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -184,10 +184,11 @@ make-and-init-bigdata call-with-environment-variables common:simple-file-lock common:simple-file-lock-and-wait common:simple-file-release-lock +common:with-simple-file-lock common:fail-safe get-file-descriptor-count common:get-this-exe-fullpath common:get-sync-lock-filepath common:find-local-megatest @@ -1242,10 +1243,17 @@ (define (common:simple-file-release-lock fname) (handle-exceptions exn #f ;; I don't really care why this failed (at least for now) (delete-file* fname))) + +(define (common:with-simple-file-lock fname proc) + (let* ((lkfname (conc fname ".lock"))) + (common:simple-file-lock-and-wait lkfname) + (let ((res (proc))) + (common:simple-file-release-lock lkfname) + res))) ;;====================================================================== ;; PUlled below from common.scm ;;====================================================================== Index: configfmod.scm ================================================================== --- configfmod.scm +++ configfmod.scm @@ -49,10 +49,11 @@ configf:write-alist configf:write-config find-config getenv mytarget + my-with-lock nice-path process:cmd-run->list runconfig:read runconfigs-get safe-setenv @@ -114,14 +115,19 @@ ;;====================================================================== ;; while targets are Megatest specific they are a useful concept (define mytarget (make-parameter #f)) +;; fake locker +(define (fake-locker fname proc)(proc)) + ;; locking is optional, many environments don't care (e.g. running on one machine) ;; NOTE: the locker must follow the same syntax as with-dot-lock* +;; with-dot-lock* has problems if /tmp and the file being +;; locked are not on the same filesystem ;; -(define my-with-lock (make-parameter with-dot-lock*)) +(define my-with-lock (make-parameter fake-locker)) ;; with-dot-lock*)) ;;====================================================================== ;; move debug stuff to separate module then put these back where they belong ;;====================================================================== ;;====================================================================== @@ -1190,11 +1196,11 @@ ;;====================================================================== ;; DO THE LOCKING AROUND THE CALL ;;====================================================================== ;; -(define (configf:write-alist cdat fname) +(define (configf:write-alist cdat fname #!optional (check-written #f)) ;; (if (not (common:faux-lock fname)) ;; (debug:print 0 *default-log-port* "INFO: NEED LOCKING ADDED HERE " fname) ((my-with-lock) fname (lambda () @@ -1202,26 +1208,27 @@ (res (begin (with-output-to-file fname ;; first write out the file (lambda () (pp dat))) - ;; I don't like this. It makes write-alist opaque and complicated. -mrw- - (if (file-exists? fname) ;; now verify it is readable - (if (configf:read-alist fname) - #t ;; data is good. - (begin - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn) - #f) - (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") - (delete-file fname)) - #f)) - #f)))) + ;; I don't like this. It makes write-alist complicated + ;; move to something like write-and-verify-alist. -mrw- + (if check-written + (if (file-exists? fname) ;; now verify it is readable + (if (configf:read-alist fname) + 'data-good ;; data is good. + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn) + 'data-bad) + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + (delete-file fname))) + 'data-not-there) + 'data-not-checked)))) res)))) (define (common:get-fields cfgdat) (let ((fields (hash-table-ref/default cfgdat "fields" '()))) (map car fields))) ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -3454,12 +3454,14 @@ (db:with-db dbstruct run-id #f (lambda (db) - (let* ((stmth (db:get-cache-stmth dbstruct db qry))) - (sqlite3:first-result stmth run-id)))))) + (let* (#;(stmth (db:get-cache-stmth dbstruct db qry))) + #;(sqlite3:first-result stmth run-id) + (sqlite3:first-result db qry run-id) + ))))) ;; For a given testname how many items are running? Used to determine ;; probability for regenerating html ;; (define (db:get-count-tests-running-for-testname dbstruct run-id testname) Index: launchmod.scm ================================================================== --- launchmod.scm +++ launchmod.scm @@ -663,11 +663,12 @@ (let* ((tconfig-fname (conc work-area "/.testconfig")) (tconfig-tmpfile (conc tconfig-fname ".tmp")) (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) (scripts (configf:get-section tconfig "scripts"))) ;; create .testconfig file - (configf:write-alist tconfig tconfig-tmpfile) + (configf:write-alist tconfig tconfig-tmpfile #t) ;; the #t forces a check of the written data + (assert (file-exists? tconfig-tmpfile) "FATAL: We just wrote the dang file, how can it not exist?") (move-file tconfig-tmpfile tconfig-fname #t) (delete-file* ".final-status") ;; extract scripts from testconfig and write them to files in test run dir (for-each Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -169,10 +169,12 @@ ;; (return-method 'direct) ;; ulex parameters ;; (work-method 'mailbox) ;; (return-method 'mailbox) + +(my-with-lock common:with-simple-file-lock) ;; fake out readline usage of toplevel-command (define (toplevel-command . a) #f) (define *didsomething* #f) (define *db* #f) ;; this is only for the repl, do not use in general!!!! Index: runsmod.scm ================================================================== --- runsmod.scm +++ runsmod.scm @@ -244,10 +244,23 @@ (begin (hash-table-set! *runs:denoise* key currtime) #t) #f))) +(define *too-soon-delays* (make-hash-table)) + +;; to-soon delay, when matching event happened in less than dseconds delay wseconds +;; +(define (runs:too-soon-delay key dseconds wseconds) + (let* ((last-time (hash-table-ref/default *too-soon-delays* key #f))) + (if (and last-time + (< (- (current-seconds) last-time) dseconds)) + (begin + (debug:print-info 0 *default-log-port* "Whoa, slow down there ... "key" has been too recently seen.") + (thread-sleep! wseconds))) + (hash-table-set! *too-soon-delays* key (current-seconds)))) + (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) @@ -1467,11 +1480,13 @@ newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - + + (runs:too-soon-delay (conc "loop delay " hed) 1 1) + (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) @@ -1494,10 +1509,11 @@ (if (or (not (null? tal))(not (null? reg))) (loop (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)))) + ;; (loop (car tal)(cdr tal) reg reruns)))) (runs:incremental-print-results run-id) (debug:print 4 *default-log-port* "TOP OF LOOP => " "test-name: " test-name @@ -1725,10 +1741,11 @@ (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) + (thread-sleep! 5) ;; let's always sleep, prevents abutting calls to rum:get-count-tests-running-for-run-id - didn't help (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! @@ -1735,11 +1752,10 @@ (runs:find-and-mark-incomplete-and-check-end-of-run run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed.