Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -243,33 +243,36 @@ (lambda () (let* ((indat (deserialize)) (newcount (+ *api-process-request-count* 1)) (delay-wait (if (> newcount 10) (- newcount 10) - 0))) + 0)) + (normal-proc (lambda (cmd run-id params) + (case cmd + ((ping) *server-signature*) + (else + (api:dispatch-request dbstruct cmd run-id params)))))) (set! *api-process-request-count* newcount) (set! *db-last-access* (current-seconds)) (match indat ((cmd run-id params meta) (let* ((ttdat *server-info*) (server-state (tt-state ttdat)) (status (cond - ((> newcount 30) 'busy) - ((> newcount 15) 'loaded) + ;; ((> newcount 600) 'busy) + ;; ((> newcount 300) 'loaded) (else 'ok))) (errmsg (case status ((busy) (conc "Server overloaded, "newcount" threads in flight")) ((loaded) (conc "Server loaded, "newcount" threads in flight")) (else #f))) (result (case status - ((busy) (- newcount 29)) - ((loaded) #f) + ((busy) (- newcount 29)) ;; call back in as many seconds + ((loaded) + (normal-proc cmd run-id params)) (else - (case cmd - ((ping) *server-signature*) - (else - (api:dispatch-request dbstruct cmd run-id params)))))) + (normal-proc cmd run-id params)))) (meta (case cmd ((ping) `((sstate . ,server-state))) (else `((wait . ,delay-wait))))) (payload (list status errmsg result meta))) (set! *api-process-request-count* (- *api-process-request-count* 1)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2854,10 +2854,24 @@ db (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id in (" (string-intersperse (map conc test-ids) ",") ");")) res)))) +;; try every second until tries times proc +;; +(define (db:keep-trying-until-true proc params tries) + (let* ((res (apply proc params))) + (if res + res + (if (> tries 0) + (begin + (thread-sleep! 1) + (db:keep-trying-until-true proc params (- tries 1))) + (begin + (debug:print-info 0 *default-log-port* "proc never returned true, params="params) + #f))))) + (define (db:get-test-info dbstruct run-id test-name item-path) (db:with-db dbstruct run-id #f @@ -3316,11 +3330,14 @@ (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met (let* ((testdat (if (number? test-name) (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id - (db:get-test-info dbstruct run-id test-name item-path))) + (db:keep-trying-until-true ;; in our threaded stuff this call could happen before the test is registered (maybe?) + db:get-test-info + (list dbstruct run-id test-name item-path) + 10))) (test-id (db:test-get-id testdat)) (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -17,11 +17,11 @@ ;; along with Megatest. If not, see . ;;====================================================================== (declare (unit dbfile)) -;; (declare (uses debugprint)) +(declare (uses debugprint)) (declare (uses commonmod)) (module dbfile * @@ -37,11 +37,11 @@ stack files ports commonmod - ;; debugprint + debugprint ) (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic (define num-run-dbs (make-parameter 10)) ;; number of db's in .megatest (define dbfile:testsuite-name (make-parameter #f)) @@ -407,10 +407,11 @@ ;; opens and returns handle and nothing else ;; (define (dbfile:raw-open-no-sync-db dbpath) (if (not (file-exists? dbpath)) (create-directory dbpath #t)) + (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db") (let* ((dbname (conc dbpath "/no-sync.db")) (db-exists (file-exists? dbname)) (init-proc (lambda (db) (if (not db-exists) (begin @@ -468,11 +469,13 @@ ;; either: ;; fails returns (#f . lock-creation-time) ;; succeeds (returns (#t . lock-creation-time) ;; use (db:no-sync-del! db keyname) to release the lock ;; -(define (db:no-sync-get-lock db keyname) +;; +;; +(define (db:no-sync-get-lock db keyname . identification) (sqlite3:with-transaction db (lambda () (condition-case (let* ((curr-val (db:no-sync-get/default db keyname #f))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1459,10 +1459,11 @@ ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) + (assert runname "FATAL: launch-test called with no runname") (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ( ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) @@ -1565,11 +1566,11 @@ (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (debug:print 2 *default-log-port* "best disk path = " diskpath) (if diskpath - (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) + (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 *default-log-port* "Using work area " work-area)) (begin (set! work-area (conc test-path "/tmp_run")) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -95,10 +95,11 @@ ;; set some parameters here - these need to be put in something that can be loaded from other ;; executables such as dashboard and mtutil ;; (include "transport-mode.scm") (dbfile:db-init-proc db:initialize-main-db) +(debug:enable-timestamp #t) ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file ;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -208,15 +208,15 @@ (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay - (debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in "result" seconds.") + (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.") (thread-sleep! (if (number? result) result 2)) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) ((loaded) - (debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a 1/4 second.") + (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.") (thread-sleep! 0.25) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else @@ -267,11 +267,17 @@ (define (tt:get-server-info-sorted ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) (sfiles (tt:find-server areapath dbfname)) (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read (sorted (sort sdats (lambda (a b) - (< (list-ref a 2)(list-ref b 2)))))) + (< (list-ref a 2)(list-ref b 2))))) + (count 0)) + (for-each + (lambda (rec) + (debug:print 0 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")) + (set! count (+ count 1))) + sorted) sorted)) (define (tt:get-current-server-info ttdat dbfname) (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.") ;; @@ -387,43 +393,52 @@ (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) - (debug:print-info 0 *default-log-port* "Keep running, I'm the top server on "(tt-host ttdat)":"(tt-port ttdat)) - (if db-locked-in - #t - (let* ((lockinfo (dbfile:with-no-sync-db nosyncdbpath - (lambda (db) - (db:no-sync-get-lock db dbfname)))) - (success (car lockinfo))) - (if success - (begin - (tt-state-set! ttdat 'running) - (debug:print 0 *default-log-port* "Got server lock for "dbfname) - (set! db-locked-in #t) - #t) - (begin - (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) - #f))))) + (let* ((res (if db-locked-in + #t + (let* ((lockinfo (dbfile:with-no-sync-db + nosyncdbpath + (lambda (db) + (db:no-sync-get-lock db dbfname)))) + (success (car lockinfo))) + (if success + (begin + (tt-state-set! ttdat 'running) + (debug:print 0 *default-log-port* "Got server lock for " + dbfname) + (set! db-locked-in #t) + #t) + (begin + (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) + #f)))))) + (if res + (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " + dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) + res)) (else (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) - (if (tt:ping host port server-id) - #f ;; not the server, but all good, want to exit - (if (and (file-exists? servinfofile) - (> (- (current-seconds)(file-modification-time servinfofile)) 15)) + (let* ((res (tt:ping host port server-id))) + (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id + ", and file "servinfofile" returned "res) + (if res + #f ;; not the server, but all good, want to exit + (if (and (file-exists? servinfofile) + (> (- (current-seconds)(file-modification-time servinfofile)) 30)) (begin - ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it + ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) (delete-file* servinfofile) #t) ;; not the server but the server is not reachable (begin - (debug:print 0 *default-log-port* "I'm not the server but will try again since "servinfofile" is fresh") - #t)))) + (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", trying again.") + (thread-sleep! 1) ;; just because + #t))))) (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) (if ok ;; (if (> *api-process-request-count* 0) ;; have requests in flight @@ -433,12 +448,14 @@ (cleanup) (exit))) (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) - (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem? + (if (and (eq? (tt-state ttdat) 'running) + (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem? (begin + (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) 60) (begin