Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -413,20 +413,29 @@ (tmp-area (common:get-db-tmp-area)) (dbfiles (glob (conc tmp-area"/.megatest/*.db"))) (sync-durations (make-hash-table)) (no-sync-db (db:open-no-sync-db))) (for-each - (lambda (file) + (lambda (file) ;; tmp db file (debug:print-info 3 *default-log-port* "file: " file) - (let* ((fname (conc (pathname-file file) ".db")) - (fulln (conc *toppath*"/.megatest/"fname)) - (time1 (if (file-exists? file) - (file-modification-time file) + (let* ((fname (conc (pathname-file file) ".db")) ;; fname is tmp db file + (wal-file (conc fname "-wal")) + (shm-file (conc fname "-shm")) + (fulln (conc *toppath*"/.megatest/"fname)) ;; fulln is nfs db name + (wal-time (if (file-exists? wal-file) + (file-modification-time wal-file) + 0)) + (shm-time (if (file-exists? shm-file) + (file-modification-time shm-file) + 0)) + + (time1 (if (file-exists? file) ;; time1 is the max itime of the tmp db, -wal and -shm files. + (max (file-modification-time file) wal-time shm-time) (begin (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) 1))) - (time2 (if (file-exists? fulln) + (time2 (if (file-exists? fulln) ;; time2 is nfs file time (file-modification-time fulln) (begin (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) 0))) (changed (> (- time1 time2) (+ (random 5) 1))) ;; it has been at some few seconds since last synced @@ -481,12 +490,11 @@ (dest-area (if old2new tmp-area *toppath*)) (dbfiles (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db")))) (keys (db:get-keys dbstruct)) (sync-durations (make-hash-table))) - - (if killservers + (if (and killservers servers) (begin (for-each (lambda (server) (handle-exceptions exn @@ -498,10 +506,13 @@ (tasks:kill-server host pid))))) servers) (delete-file* (common:get-sync-lock-filepath)) ) ) + + (if (not dbfiles) + (debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.megatest")) (for-each (lambda (srcfile) (debug:print-info 3 *default-log-port* "file: " srcfile) (let* ((fname (conc (pathname-file srcfile) ".db")) (basename (pathname-file srcfile)) @@ -522,10 +533,11 @@ (changed ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds (do-cp (cond ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile) + ;; TODO: Need to fix this for WAL mod. Can't just copy. (system (conc "/bin/mkdir -p " dest-directory)) (system (conc "/bin/cp " srcfile " " destfile)) #t) (changed ;; (and changed ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed. @@ -559,10 +571,11 @@ (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date") ) ) ) dbfiles + ) ) data-synced ) ) @@ -2305,12 +2318,12 @@ (sqlite3:for-each-row (lambda (run-id testname item-path state status) ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) db - "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=?;" - test-id))) + "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" + test-id run-id))) res)) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -336,11 +336,11 @@ (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc - sync-mode: sync-mode journal-mode: journal-mode + sync-mode: sync-mode journal-mode (- tries-left 1)))))) (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) (if (and (file-write-access? fname) (file-exists? busy-file)) @@ -351,11 +351,11 @@ (thread-sleep! 1) (if (eq? tries-left 2) (begin (dbfile:print-err "INFO: forcing journal rollup "busy-file) (dbfile:brute-force-salvage-db fname))) - (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1))) + (dbfile:cautious-open-database fname init-proc sync-mode journal-mode (- tries-left 1))) (let* ((result (condition-case (if dir-access (dbfile:with-simple-file-lock (conc fname ".lock") Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -153,11 +153,11 @@ (determine-proxy (constantly #f))) (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions exn (begin - (print-error-message exn) + ;; (print-error-message exn) (if (< portnum 64000) (begin (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* "exn=" (condition->list exn)) @@ -448,34 +448,23 @@ (lambda () (delete-file* servinf)) *on-exit-procs*)) ;; put data about this server into a simple flat file host.port (debug:print-info 0 *default-log-port* "Received server alive signature") - #;(common:save-pkt `((action . alive) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat))) - *configdat* #t) sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (let* ((ipaddr (car sdat)) + (if sdat + (let* ((ipaddr (car sdat)) (port (cadr sdat)) (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port))) - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - ;; (delete-file* servinf) ;; handled by on-exit, can be removed - #;(common:save-pkt `((action . died) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat)) - (msg . "Transport died?")) - *configdat* #t) + (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") (exit)) + (exit) + ) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) @@ -576,18 +565,18 @@ (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter (not *server-overloaded*) (file-exists? servinfofile)) (change-file-times servinfofile curr-time curr-time))) - (if (or (common:low-noise-print 120 "start new server") + (if (and (common:low-noise-print 120 "start new server") (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another (begin - (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...") + (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...") (server:kind-run *toppath*) (if (> *api-process-request-count* 100) (begin - (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) + (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) (delete-file* servinfofile))))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.8003) +(define megatest-version 1.8004) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -654,23 +654,10 @@ ;; for some switches always print the command to stderr ;; (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) -;; some switches imply homehost. Exit here if not on homehost -;; -(let ((homehost-required (list "-cleanup-db"))) - (if (apply args:any? homehost-required) - (if (not (server:choose-server *toppath* 'home?)) - (for-each - (lambda (switch) - (if (args:get-arg switch) - (begin - (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch - ", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.") - (exit 1)))) - homehost-required)))) ;;====================================================================== ;; Misc setup stuff ;;====================================================================== @@ -2321,10 +2308,14 @@ (begin (if (not (launch:setup)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) + (if (not (server:choose-server *toppath* 'home?)) + (begin + (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db") + (exit 1))) (let ((dbstructs (db:setup #f))) (common:cleanup-db dbstructs)) (set! *didsomething* #t))) (if (args:get-arg "-mark-incompletes") Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -64,11 +64,11 @@ exn (begin ;; (release-dot-lock fname) (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) + ;; (debug:print 5 *default-log-port* "exn=" (condition->list exn)) (if (common:file-exists? fname)(delete-file fname)) ;; brutally get rid of it (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -403,11 +403,11 @@ ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) - (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") + ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) (if (not (file-exists? servinfodir)) (create-directory servinfodir)) (let* ((allfiles (glob (conc servinfodir"/*"))) (res (make-hash-table))) @@ -449,10 +449,14 @@ ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running (else (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) #t)))) + +(define server-last-start 0) + + ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; ;; mode: ;; best - get best server (random of newest five) @@ -466,18 +470,31 @@ ;; 1. sort by age descending ;; 2. take five ;; 3. check alive, discard if not and repeat ;; first we clean up old server files (server:clean-up-old areapath) + (let* ((since-last (- (current-seconds) server-last-start)) + (server-start-delay 10)) + (if ( < (- (current-seconds) server-last-start) 10 ) + (begin + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") + (thread-sleep! server-start-delay) + ) + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + ) + ) (let* ((serversdat (server:get-servers-info areapath)) (servkeys (hash-table-keys serversdat)) (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last (sort servkeys ;; list of "host:port" (lambda (a b) (>= (list-ref (hash-table-ref serversdat a) 2) (list-ref (hash-table-ref serversdat b) 2)))) '()))) + (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) + (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) (if (not (null? by-time-asc)) (let* ((oldest (last by-time-asc)) (oldest-dat (hash-table-ref serversdat oldest)) (host (list-ref oldest-dat 0)) (all-valid (filter (lambda (x) @@ -514,11 +531,12 @@ (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) - (thread-sleep! 3) + (set! server-last-start (current-seconds)) + ;; (thread-sleep! 3) (case mode ((homehost) (cons #f #f)) (else #f)))))) (define (server:get-servinfo-dir areapath) @@ -587,11 +605,12 @@ (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) - (let ((num-ok (length (server:choose-server areapath 'all-valid)))) + (let* ( (servers (server:choose-server areapath 'all-valid)) + (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath)