Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -56,10 +56,14 @@ (include "megatest-fossil-hash.scm") (include "vg_records.scm") (dbfile:db-init-proc db:initialize-main-db) +;; globals to dashboard module +(define *updaters-running* #f) +(define *updaters-thread* #f) + (define help (conc "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version " license GPL, Copyright (C) Matt Welland 2012-2017 Usage: dashboard [options] @@ -3773,19 +3777,19 @@ dbkeys) res)))) fres)))) (define (dashboard:runs-tab-updater commondat tab-num) - (debug:catch-and-dump - (lambda () +;; (debug:catch-and-dump +;; (lambda () (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num)) (dbkeys (dboard:tabdat-dbkeys tabdat))) (dashboard:do-update-rundat tabdat) (let ((uidat (dboard:commondat-uidat commondat))) (update-buttons tabdat uidat (dboard:tabdat-numruns tabdat) (dboard:tabdat-num-tests tabdat))) )) - "dashboard:runs-tab-updater")) +;; "dashboard:runs-tab-updater")) ;;====================================================================== ;; The heavy lifting starts here ;;====================================================================== @@ -3852,40 +3856,37 @@ ;; (dboard:commondat-add-updater ;; commondat ;; (lambda () ;; (dashboard:runs-tab-updater commondat 1)) ;; tab-num: 2) + (iup:callback-set! *tim* "ACTION_CB" (lambda (time-obj) - (let ((update-is-running #f)) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (set! update-is-running (dboard:commondat-updating commondat)) - (if (not update-is-running) - (dboard:commondat-updating-set! commondat #t)) - (mutex-unlock! (dboard:commondat-update-mutex commondat)) - (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update + (if (not *updaters-thread*) (begin - (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (dboard:commondat-updating-set! commondat #f) - (mutex-unlock! (dboard:commondat-update-mutex commondat))) + ;; (debug:print-info 0 *default-log-port* "Updater started...") + (set! *updaters-thread* + (make-thread + (lambda () + (dboard:common-run-curr-updaters commondat)))) + (thread-start! *updaters-thread*)) + (begin + (debug:print-info 0 *default-log-port* "Updater restarted...") + (thread-resume! *updaters-thread*))) + (thread-sleep! 0.25) + (if (eq? (thread-state *updaters-thread*) 'running) + (begin + (debug:print-info 0 *default-log-port* "Updater suspended...") + (thread-suspend! *updaters-thread*)) + (begin + (set! *updaters-thread* #f) + ;; (debug:print-info 0 *default-log-port* "Updater done...") )) 1)))) - ;; (debug:print 0 *default-log-port* "Starting updaters") - (let ((th1 (make-thread (lambda () - (thread-sleep! 1) - (dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab - ) "update buttons once")) - (th2 (make-thread iup:main-loop "Main loop"))) - ;; (print "Starting main loop") - (thread-start! th2) - (thread-join! th2) - ) - ) - ) -) + (iup:main-loop) + ))) (define last-copy-time 0) ;; Sync to tmp only if in read-only mode. Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -336,11 +336,11 @@ (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) - (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) + (job-group-limit (let ((jobg-count (configf:lookup-number *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -237,11 +237,24 @@ ;; Get the list of server logs. (let* ( ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) - (server-logs (glob (conc areapath "/logs/server-*-*.log"))) + (server-logs + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "server:get-list: glob failed , exn=" exn) + (thread-sleep! 60) + (system "lsof -c mtest > /tmp/$USER/glob-failed.$$.lsof") + (debug:print 0 *default-log-port* "lsof output saved in /tmp/$USER/glob-failed.$$.lsof") + (thread-sleep! 60) + (glob (conc areapath "/logs/server-*-*.log")) + ) + (glob (conc areapath "/logs/server-*-*.log")) + ) + ) (num-serv-logs (length server-logs))) (if (or (null? server-logs) (= num-serv-logs 0)) (let () (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) '()