Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -28,10 +28,17 @@ (declare (uses common)) (declare (uses db)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. +(module client +* + +) + +(import client) + (include "common_records.scm") (include "db_records.scm") ;; client:get-signature (define (client:get-signature) @@ -44,13 +51,10 @@ #;(define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) -(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) - (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) - ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline @@ -60,17 +64,27 @@ ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) +;;(define (http-transport:server-dat-make-url runremote) +(define (client:get-url runremote) + (if (and (remote-iface runremote) + (remote-port runremote)) + (conc "http://" + (remote-iface runremote) + ":" + (remote-port runremote)) + #f)) + +(define (client:setup areapath runremote #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) (mutex-lock! *rmt-mutex*) - (let ((res (client:setup-http-baby areapath remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat))) + (let ((res (client:setup-http areapath runremote remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat))) (mutex-unlock! *rmt-mutex*) res)) -(define (client:setup-http-baby areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) +(define (client:setup-http areapath runremote #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (server:start-and-wait areapath) (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") @@ -77,52 +91,72 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:choose-server areapath 'best)) - (runremote (or area-dat *runremote*))) + (let* ((server-dat (server:choose-server areapath 'best))) ;; list host port start-time server-id pid +;; (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) + (begin + (if (< remaining-tries 99)(thread-sleep! 1)) ;; obviously it needs time + (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))) (match server-dat ((host port start-time server-id pid) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (and (not area-dat) - (not *runremote*)) - (begin - (set! *runremote* (make-remote)) - (let* ((server-info (remote-server-info *runremote*))) + (if (not runremote) + (begin + ;; Here we are creating a runremote where there was none or it was clobbered with #f + ;; + (set! runremote (make-remote)) + (let* ((server-info (server:check-if-running areapath))) + (remote-server-info-set! runremote server-info) (if server-info (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (remote-server-url-set! runremote (server:record->url server-info)) + (remote-server-id-set! runremote (server:record->id server-info))))))) + ;; at this point we have a runremote (if (and host port server-id) - (let* ((start-res (http-transport:client-connect host port server-id)) - (ping-res (rmt:login-no-auto-client-setup start-res))) - (if (and start-res - ping-res) - (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago - (if runremote - (begin - (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) + (let* ((nada (client:connect host port server-id runremote)) + (ping-res (rmt:login-no-auto-client-setup runremote))) + (if ping-res + (if runremote + (begin + (debug:print-info 2 *default-log-port* "connected to " (client:get-url runremote)) + runremote) + (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))) (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (case *transport-type* - ((http)(http-transport:close-connections))) - (if *runremote* - (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) - ) + (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... ping-res=" ping-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 + (http-transport:close-connections runremote) (thread-sleep! 1) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) + (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered ;; (server:kind-run areapath) (server:start-and-wait areapath) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))) + (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))))) (else (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat))))))) +;; +;; connect - stored in remote-condat +;; +;; (define (http-transport:client-connect iface port server-id runremote) +(define (client:connect iface port server-id runremote-in) + (let* ((runremote (or runremote-in + (make-runremote)))) + (debug:print-info 2 *default-log-port* "Connecting to server at "iface":"port", id "server-id) + (let* ((api-url (conc "http://" iface ":" port "/api")) + (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) + (api-req (make-request method: 'POST uri: api-uri))) + ;; (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) + (remote-iface-set! runremote iface) + (remote-port-set! runremote port) + (remote-server-id-set! runremote server-id) + (remote-connect-time-set! runremote (current-seconds)) + (remote-last-access-set! runremote (current-seconds)) + (remote-api-url-set! runremote api-url) + (remote-api-uri-set! runremote api-uri) + (remote-api-req-set! runremote api-req) + runremote))) + Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -315,26 +315,34 @@ (else "FAIL"))) (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) +;; (defstruct remote (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) (cons #f #f)))) (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res)) res)) (server-url #f) ;; (server:check-if-running *toppath*) #f)) (server-id #f) - (server-info (if *toppath* (server:check-if-running *toppath*) #f)) + (server-info #f) ;; (if *toppath* (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive - (connect-time (current-seconds)) - (conndat #f) - (transport *transport-type*) + (connect-time (current-seconds)) ;; when we first connected + (last-access (current-seconds)) ;; last time we talked to server + (conndat #f) ;; iface port api-uri api-url api-req seconds server-id (server-timeout (server:expiration-timeout)) (force-server #f) (ro-mode #f) - (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode + (ro-mode-checked #f) ;; flag that indicates we have checked for ro-mode + + ;; conndat stuff + (iface #f) ;; TODO: Consolidate this data with server-url and server-info above + (port #f) + (api-url #f) + (api-uri #f) + (api-req #f)) ;; launching and hosts (defstruct host (reachable #f) (last-update 0) @@ -408,13 +416,22 @@ (define (common:version-changed?) (not (equal? (common:get-last-run-version) (common:version-signature)))) + +;; From 1.70 to 1.80, db's are compatible. + (define (common:api-changed?) - (not (equal? (substring (->string megatest-version) 0 4) - (substring (conc (common:get-last-run-version)) 0 4)))) + (let* ( + (megatest-major-version (substring (->string megatest-version) 0 4)) + (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) + ) + (and (not (equal? megatest-major-version "1.80")) + (not (equal? megatest-major-version megatest-run-version))) + ) +) ;;====================================================================== ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; @@ -520,11 +537,11 @@ (define (common:rotate-logs) (let* ((all-files (make-hash-table)) (stats (make-hash-table)) (inc-stat (lambda (key) (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) - (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age + (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn @@ -599,14 +616,15 @@ ;;====================================================================== ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified ;; Do NOT check if not on homehost! ;; (define (common:exit-on-version-changed) - (if (common:on-homehost?) + (if (and *toppath* ;; do nothing if *toppath* not yet provided + (common:on-homehost?)) (if (common:api-changed?) (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".megatest/main.db")) (read-only (not (file-write-access? dbfile))) (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) (debug:print 0 *default-log-port* "WARNING: Version mismatch!\n" " expected: " (common:version-signature) "\n" @@ -626,14 +644,14 @@ (common:cleanup-db dbstruct))) ((not (common:file-exists? mtconf)) (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") + (debug:print 0 *default-log-port* " .megatest/main.db does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") + (debug:print 0 *default-log-port* " You do not own .megatest/main.db in this area. Cannot proceed with megatest version migration.") (exit 1)) (read-only (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") (exit 1)) (else @@ -710,19 +728,21 @@ "")))) (define (common:alist-ref/default key alist default) (or (alist-ref key alist) default)) -(define (common:low-noise-print waitval . keys) - (let* ((key (string-intersperse (map conc keys) "-" )) - (lasttime (hash-table-ref/default *common:denoise* key 0)) - (currtime (current-seconds))) - (if (> (- currtime lasttime) waitval) - (begin - (hash-table-set! *common:denoise* key currtime) - #t) - #f))) +;; moved into commonmod +;; +;; (define (common:low-noise-print waitval . keys) +;; (let* ((key (string-intersperse (map conc keys) "-" )) +;; (lasttime (hash-table-ref/default *common:denoise* key 0)) +;; (currtime (current-seconds))) +;; (if (> (- currtime lasttime) waitval) +;; (begin +;; (hash-table-set! *common:denoise* key currtime) +;; #t) +;; #f))) (define (common:get-megatest-exe) (or (getenv "MT_MEGATEST") "megatest")) (define (common:read-encoded-string instr) @@ -946,20 +966,21 @@ (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) (exit 1)) - (let* ((tsname (common:get-testsuite-name)) + (let* ((toppath (common:real-path *toppath*)) + (tsname (common:get-testsuite-name)) (dbpath (common:get-create-writeable-dir (list (conc "/tmp/" (current-user-name) "/megatest_localdb/" tsname "/" - (string-translate *toppath* "/" ".")) + (string-translate toppath "/" ".")) (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name "/megatest_localdb/" tsname - (string-translate *toppath* "/" ".")) + (string-translate toppath "/" ".")) )))) (set! *db-cache-path* dbpath) ;; ensure megatest area has .megatest (let ((dbarea (conc *toppath* "/.megatest"))) (if (not (file-exists? dbarea)) @@ -980,12 +1001,13 @@ ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== (define (common:run-sync?) - (and (common:on-homehost?) - (args:get-arg "-server"))) + (and *toppath* ;; gate if called before *toppath* is set + (common:on-homehost?) + (args:get-arg "-server"))) (define (common:human-time) (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S")) @@ -1598,10 +1620,30 @@ path) ;; just give up (with-input-from-pipe (conc "/bin/readlink -f " path) (lambda () (read-line))))) + +;; for reasons I don't understand multiple calls to real-path in parallel threads +;; must be protected by mutexes +;; +(define (common:real-path inpath) + ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params) + ;; (let-values + ;; (((inp oup pid) (process "readlink" (list "-f" inpath)))) + ;; (with-input-from-port inp + ;; (let loop ((inl (read-line)) + ;; (res #f)) + ;; (print "inl=" inl) + ;; (if (eof-object? inl) + ;; (begin + ;; (close-input-port inp) + ;; (close-output-port oup) + ;; ;; (process-wait pid) + ;; res) + ;; (loop (read-line) inl)))))) + (with-input-from-pipe (conc "readlink -f " inpath) read-line)) ;;====================================================================== ;; returns *effective load* (not normalized) ;; (define (common:get-intercept onemin fivemin) @@ -1989,10 +2031,17 @@ (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) + (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... + (if (not *toppath*) + (begin + (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") + (thread-sleep! 30) + (if (< (- (current-seconds) start-time) 300) + (loop start-time))))) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (server:choose-server *toppath* 'homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) @@ -2218,30 +2267,10 @@ (uname #f)) (if (null? (car uname-res)) "unknown" (caar uname-res)))) -;; for reasons I don't understand multiple calls to real-path in parallel threads -;; must be protected by mutexes -;; -(define (common:real-path inpath) - ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params) - ;; (let-values - ;; (((inp oup pid) (process "readlink" (list "-f" inpath)))) - ;; (with-input-from-port inp - ;; (let loop ((inl (read-line)) - ;; (res #f)) - ;; (print "inl=" inl) - ;; (if (eof-object? inl) - ;; (begin - ;; (close-input-port inp) - ;; (close-output-port oup) - ;; ;; (process-wait pid) - ;; res) - ;; (loop (read-line) inl)))))) - (with-input-from-pipe (conc "readlink -f " inpath) read-line)) - ;;====================================================================== ;; D I S K S P A C E ;;====================================================================== (define (common:get-disk-space-used fpath) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -359,13 +359,15 @@ (define (db:cache-for-read-only source target #!key (use-last-update #f)) (if (and (hash-table-ref/default *global-db-store* target #f) (>= (file-modification-time target)(file-modification-time source))) (hash-table-ref *global-db-store* target) (let* ((toppath (launch:setup)) - (targ-db-last-mod (if (common:file-exists? target) - (file-modification-time target) - 0)) + (targ-db-last-mod (db:get-sqlite3-mod-time target)) +;; (if (common:file-exists? target) +;; BUG: This needs to include wal mode stuff .shm etc. +;; (file-modification-time target) +;; 0)) (cache-db (or (hash-table-ref/default *global-db-store* target #f) (db:open-megatest-db path: target))) (source-db (db:open-megatest-db path: source)) (curr-time (current-seconds)) (res '()) @@ -405,35 +407,54 @@ ;; use-last-update: #t))) ;; (thread-start! th1) ;; (apply proc cache-db params) ;; )))) - - - +(define (db:get-sqlite3-mod-time fname) + (let* ((wal-file (conc fname "-wal")) + (shm-file (conc fname "-shm")) + (get-mtime (lambda (f) + (if (and (file-exists? f) + (file-read-access? f)) + (file-modification-time f) + 0)))) + (max (get-mtime fname) + (get-mtime wal-file) + (get-mtime shm-file)))) + (define (db:all-db-sync dbstruct) (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db)) (data-synced 0) ;; count of changed records (tmp-area (common:get-db-tmp-area)) (dbfiles (glob (conc tmp-area"/.megatest/*.db"))) (sync-durations (make-hash-table)) (no-sync-db (db:open-no-sync-db))) (for-each - (lambda (file) + (lambda (file) ;; tmp db file (debug:print-info 3 *default-log-port* "file: " file) - (let* ((fname (conc (pathname-file file) ".db")) - (fulln (conc *toppath*"/.megatest/"fname)) - (time1 (if (file-exists? file) - (file-modification-time file) - (begin - (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) - 1))) - (time2 (if (file-exists? fulln) - (file-modification-time fulln) - (begin - (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) - 0))) + (let* ((fname (conc (pathname-file file) ".db")) ;; fname is tmp db file + (wal-file (conc fname "-wal")) + (shm-file (conc fname "-shm")) + (fulln (conc *toppath*"/.megatest/"fname)) ;; fulln is nfs db name + (wal-time (if (file-exists? wal-file) + (file-modification-time wal-file) + 0)) + (shm-time (if (file-exists? shm-file) + (file-modification-time shm-file) + 0)) + (time1 (db:get-sqlite3-mod-time file)) +;; (if (file-exists? file) ;; time1 is the max itime of the tmp db, -wal and -shm files. +;; (max (file-modification-time file) wal-time shm-time) +;; (begin +;; (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) +;; 1))) + (time2 (db:get-sqlite3-mod-time fulln)) +;; (if (file-exists? fulln) ;; time2 is nfs file time +;; (file-modification-time fulln) +;; (begin +;; (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) +;; 0))) (changed (> (- time1 time2) (+ (random 5) 1))) ;; it has been at some few seconds since last synced (changed10 (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy? (do-cp (cond ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover @@ -484,12 +505,11 @@ (dest-area (if old2new tmp-area *toppath*)) (dbfiles (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db")))) (keys (db:get-keys dbstruct)) (sync-durations (make-hash-table))) - - (if killservers + (if (and killservers servers) (begin (for-each (lambda (server) (handle-exceptions exn @@ -501,10 +521,13 @@ (tasks:kill-server host pid))))) servers) (delete-file* (common:get-sync-lock-filepath)) ) ) + + (if (not dbfiles) + (debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.megatest")) (for-each (lambda (srcfile) (debug:print-info 3 *default-log-port* "file: " srcfile) (let* ((fname (conc (pathname-file srcfile) ".db")) (basename (pathname-file srcfile)) @@ -525,10 +548,11 @@ (changed ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds (do-cp (cond ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile) + ;; TODO: Need to fix this for WAL mod. Can't just copy. (system (conc "/bin/mkdir -p " dest-directory)) (system (conc "/bin/cp " srcfile " " destfile)) #t) (changed ;; (and changed ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed. @@ -562,14 +586,17 @@ (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date") ) ) ) dbfiles + ) ) data-synced ) ) + + ;; Sync all changed db's ;; (define (db:tmp->megatest.db-sync dbstruct run-id last-update) (let* ((subdbs (hash-table-values (dbr:dbstruct-subdbs dbstruct))) @@ -2029,18 +2056,19 @@ (lambda (dbdat db) (if msg (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id) (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id))))) +(define (db:set-run-state-status-db db run-id state status ) + (sqlite3:execute db "UPDATE runs SET status=?,state=? WHERE id=?;" status state run-id)) + (define (db:set-run-state-status dbstruct run-id state status ) (db:with-db dbstruct #f #f (lambda (dbdat db) - (sqlite3:execute db "UPDATE runs SET status=?,state=? WHERE id=?;" status state run-id)))) - - - + (db:set-run-state-status-db db run-id state status)))) + (define (db:get-run-status dbstruct run-id) (let ((res "n/a")) (db:with-db dbstruct #f #f (lambda (dbdat db) @@ -2306,12 +2334,12 @@ (sqlite3:for-each-row (lambda (run-id testname item-path state status) ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) db - "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=?;" - test-id))) + "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" + test-id run-id))) res)) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; @@ -2402,25 +2430,28 @@ ;; NOTE: run-id is not used ;; ;; (define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment) (db:with-db dbstruct - run-id - #t + run-id #f (lambda (dbdat db) - (cond - ((and newstate newstatus newcomment) - (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment) - test-id)) - ((and newstate newstatus) - (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id)) - (else - (if newstate (sqlite3:execute db "UPDATE tests SET state=? WHERE id=?;" newstate test-id)) - (if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) - (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) - test-id)))))) - (mt:process-triggers dbstruct run-id test-id newstate newstatus)) + (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)))) + +(define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment) + (cond + ((and newstate newstatus newcomment) + (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment) + test-id)) + ((and newstate newstatus) + (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id)) + (else + (if newstate (sqlite3:execute db "UPDATE tests SET state=? WHERE id=?;" newstate test-id)) + (if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) + (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) + test-id)))) + ;; (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NOTE: Moved into calling function + ) ;; NEW BEHAVIOR: Count tests running in all runs! ;; (define (db:get-count-tests-running dbstruct run-id) ;; fastmode) (let* ((qry ;; (if fastmode @@ -2715,18 +2746,21 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (a . b) - (set! res (apply vector a b))) - db - (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE testname=? AND item_path=? AND run_id=?;") - test-name item-path run-id) - res)))) + (db:get-test-info-db db run-id test-name item-path)))) + +(define (db:get-test-info-db db run-id test-name item-path) + (let ((res #f)) + (sqlite3:for-each-row + (lambda (a . b) + (set! res (apply vector a b))) + db + (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE testname=? AND item_path=? AND run_id=?;") + test-name item-path run-id) + res)) (define (db:test-get-rundir-from-test-id dbstruct run-id test-id) (db:with-db dbstruct run-id @@ -3177,11 +3211,13 @@ test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (if tl-testdat (db:test-get-id tl-testdat) - #f))) + #f)) + (new-state-eh #f) + (new-status-eh #f)) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbstruct run-id 'set-test-start-time (list test-id))) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct run-id #f @@ -3189,29 +3225,32 @@ (let ((tr-res (sqlite3:with-transaction db (lambda () ;; NB// Pass the db so it is part fo the transaction - (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status + (db:test-set-state-status-db db run-id test-id state status comment) ;; this call sets the item state/status (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item - (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test - (state-statuses (db:roll-up-rules state-status-counts state status)) - (newstate (car state-statuses)) - (newstatus (cadr state-statuses))) + (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test + (state-statuses (db:roll-up-rules state-status-counts state status)) + (newstate (car state-statuses)) + (newstatus (cadr state-statuses))) + (set! new-state-eh newstate) + (set! new-status-eh newstatus) (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " - (apply conc - (map (lambda (x) - (conc - (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) - state-status-counts))); end debug:print - - (if tl-test-id - (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct + (apply conc + (map (lambda (x) + (conc + (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) + state-status-counts))); end debug:print + (if tl-test-id + (db:test-set-state-status-db db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct )))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) + (if new-state-eh ;; moved from db:test-set-state-status + (mt:process-triggers dbstruct run-id test-id new-state-eh new-status-eh)) tr-res))))) (define (db:roll-up-rules state-status-counts state status) (let* ((running (length (filter (lambda (x) (member (dbr:counts-state x) *common:running-states*)) @@ -3275,56 +3314,53 @@ (lambda (dbdat db) (let ((tr-res (sqlite3:with-transaction db (lambda () - (let* ((state-status-counts (db:get-all-state-status-counts-for-run db run-id)) + (let* ((state-status-counts (db:get-all-state-status-counts-for-run-db db run-id)) (state-statuses (db:roll-up-rules state-status-counts #f #f )) (newstate (car state-statuses)) (newstatus (cadr state-statuses))) (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) - (db:set-run-state-status db run-id newstate newstatus ))))))) + (db:set-run-state-status-db db run-id newstate newstatus ))))))) (mutex-unlock! *db-transaction-mutex*) tr-res)))) + +(define (db:get-all-state-status-counts-for-run-db db run-id) + (sqlite3:map-row + (lambda (state status count) + (make-dbr:counts state: state status: status count: count)) + db + "SELECT state,status,count(id) FROM tests WHERE run_id=? GROUP BY state,status;" + run-id )) (define (db:get-all-state-status-counts-for-run dbstruct run-id) - (let* ((test-count-recs (db:with-db - dbstruct #f #f - (lambda (dbdat db) - (sqlite3:map-row - (lambda (state status count) - (make-dbr:counts state: state status: status count: count)) - db - "SELECT state,status,count(id) FROM tests WHERE run_id=? GROUP BY state,status;" - run-id ))))) - test-count-recs)) - + (db:with-db + dbstruct #f #f + (lambda (dbdat db) + (db:get-all-state-status-counts-for-run-db dbstruct run-id)))) ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status* ;; ;; NOTE: This is called within a transaction ;; -(define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path item-state-in item-status-in) - (let* ((test-info (db:get-test-info dbstruct run-id test-name item-path)) +(define (db:get-all-state-status-counts-for-test db run-id test-name item-path item-state-in item-status-in) + (let* ((test-info (db:get-test-info-db db run-id test-name item-path)) (item-state (or item-state-in (db:test-get-state test-info))) (item-status (or item-status-in (db:test-get-status test-info))) - (other-items-count-recs (db:with-db - dbstruct run-id #f - (lambda (dbdat db) - (sqlite3:map-row - (lambda (state status count) - (make-dbr:counts state: state status: status count: count)) - db - ;; ignore current item because we have changed its value in the current transation so this select will see the old value. - "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND item_path !=? GROUP BY state,status;" - run-id test-name item-path)))) - + (other-items-count-recs (sqlite3:map-row + (lambda (state status count) + (make-dbr:counts state: state status: status count: count)) + db + ;; ignore current item because we have changed its value in the current transation so this select will see the old value. + "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND item_path !=? GROUP BY state,status;" + run-id test-name item-path)) ;; add current item to tally outside of sql query - (match-countrec-lambda (lambda (countrec) - (and (equal? (dbr:counts-state countrec) item-state) + (match-countrec-lambda (lambda (countrec) + (and (equal? (dbr:counts-state countrec) item-state) (equal? (dbr:counts-status countrec) item-status)))) - + (already-have-count-rec-list (filter match-countrec-lambda other-items-count-recs)) ;; will have either 0 or 1 count recs depending if another item shares this item's state/status (updated-count-rec (if (null? already-have-count-rec-list) (make-dbr:counts state: item-state status: item-status count: 1) @@ -3334,11 +3370,10 @@ (nonmatch-countrec-lambda (lambda (countrec) (not (match-countrec-lambda countrec)))) (unrelated-rec-list (filter nonmatch-countrec-lambda other-items-count-recs))) - (cons updated-count-rec unrelated-rec-list))) ;; (define (db:get-all-item-states db run-id test-name) ;; (sqlite3:map-row ;; (lambda (a) a) @@ -4616,11 +4651,12 @@ (set! *task-db* #f))))) (if (and (not (args:get-arg "-server")) *runremote*) (begin (debug:print-info 0 *default-log-port* "Closing all client connections...") - (http-client#close-all-connections!))) + (http-transport:close-connections *runremote*) + #;(http-client#close-all-connections!))) ;; (if (and *runremote* ;; (remote-conndat *runremote*)) ;; (begin ;; (http-client#close-all-connections!))) ;; for http-client (if (not (eq? *default-log-port* (current-error-port))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -328,19 +328,19 @@ (current-error-port) (lambda () (apply print params)))) (define (dbfile:cautious-open-database fname init-proc sync-mode journal-mode #!optional (tries-left 500)) - (let* ((busy-file (conc fname"-journal")) + (let* ((busy-file (conc fname "-journal")) (delay-time (* (- 51 tries-left) 1.1)) (write-access (file-write-access? fname)) (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc - sync-mode: sync-mode journal-mode: journal-mode + sync-mode: sync-mode journal-mode (- tries-left 1)))))) (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) (if (and (file-write-access? fname) (file-exists? busy-file)) @@ -351,11 +351,11 @@ (thread-sleep! 1) (if (eq? tries-left 2) (begin (dbfile:print-err "INFO: forcing journal rollup "busy-file) (dbfile:brute-force-salvage-db fname))) - (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1))) + (dbfile:cautious-open-database fname init-proc sync-mode journal-mode (- tries-left 1))) (let* ((result (condition-case (if dir-access (dbfile:with-simple-file-lock (conc fname ".lock") @@ -513,11 +513,12 @@ (db:sync-touched dbstruct runid keys dbinit) (set! *db-sync-in-progress* #f) (delete-file* lock-file) #t) (begin - (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.") + (if (common:low-noise-print 120 (conc "no lock "from-db-file)) + (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.")) #f )))) ;; ;; Get a lock from the no-sync-db for the from-db, then delta sync the from-db to the to-db, otherwise return #f ;; ;; @@ -658,11 +659,26 @@ '("reviewed" #f) '("iterated" #f) '("avg_runtime" #f) '("avg_disk" #f) '("tags" #f) - '("jobgroup" #f))))) + '("jobgroup" #f)) + + + (list "tasks_queue" + '("id" #f) + '("action" #f) + '("owner" #f) + '("state" #f) + '("target" #f) + '("name" #f) + '("testpatt" #f) + '("keylock" #f) + '("params" #f) + '("creation_time" #f) + '("execution_time" #f)) + ))) (define (db:sync-all-tables-list dbstruct keys) (append (db:sync-main-list dbstruct keys) db:sync-tests-only)) @@ -992,10 +1008,12 @@ ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;") ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no ;; (define (db:with-db dbstruct run-id r/w proc . params) + (assert dbstruct "FATAL: db:with-db called with dbstruct "#f) + (assert (dbr:dbstruct? dbstruct) "FATAL: dbstruct is "dbstruct) (let* ((use-mutex (> *api-process-request-count* 25)) ;; risk of db corruption (have-struct (dbr:dbstruct? dbstruct)) (dbdat (if have-struct ;; this stuff just allows us to call with a db handle directly (db:open-db dbstruct run-id (dbfile:db-init-proc)) ;; (dbfile:get-subdb dbstruct run-id) #f)) @@ -1008,10 +1026,11 @@ (jfile (conc fname"-journal")) #;(subdb (if have-struct (dbfile:get-subdb dbstruct run-id) #f)) ) ;; was 25 + (assert (sqlite3:database? db) "FATAL: db:with-db, db is not a database, db="db", fname="fname) (if (file-exists? jfile) (begin (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load") (thread-sleep! 0.2))) (if (and use-mutex Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -27,10 +27,13 @@ (declare (unit dcommon)) (declare (uses gutils)) (declare (uses db)) +(declare (uses commonmod)) + +(import commonmod) ;; (declare (uses synchash)) (include "megatest-version.scm") (include "common_records.scm") (include "db_records.scm") @@ -635,11 +638,12 @@ (common:max (map cadr col-indices)))) (max-visible (max (- (dboard:tabdat-num-tests tabdat) 15) 3)) (max-col-vis (if (> max-col 10) 10 max-col)) (numrows 1) (numcols 1)) - (iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS") + (if (common:low-noise-print 60 "runs-stats-update-clear") + (iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS")) (iup:attribute-set! stats-matrix "NUMCOL" max-col ) (iup:attribute-set! stats-matrix "NUMLIN" (if (< max-row max-visible) max-visible max-row)) ;; min of 20 (iup:attribute-set! stats-matrix "NUMCOL_VISIBLE" max-col-vis) (iup:attribute-set! stats-matrix "NUMLIN_VISIBLE" (if (> max-row max-visible) max-visible max-row)) ;;(print "row-indices: " row-indices " col-indices: " col-indices) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -153,11 +153,11 @@ (determine-proxy (constantly #f))) (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions exn (begin - (print-error-message exn) + ;; (print-error-message exn) (if (< portnum 64000) (begin (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* "exn=" (condition->list exn)) @@ -241,25 +241,17 @@ (mutex-lock! *http-mutex*) (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*))) ;; Send "cmd" with json payload "params" to serverdat and receive result ;; -(define (http-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)(area-dat #f)) - (let* ((fullurl (if (vector? serverdat) - (http-transport:server-dat-get-api-req serverdat) - (begin - (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") - (exit 1)))) +(define (http-transport:client-api-send-receive run-id runremote cmd params #!key (numretries 3)) + (assert (remote? runremote) "FATAL: http-transport:client-api-send-receive called with serverdat="serverdat) + (let* ((fullurl (remote-api-req runremote)) (res (vector #f "uninitialized")) (success #t) (sparams (db:obj->string params transport: 'http)) - (runremote (or area-dat *runremote*)) - (server-id (if (vector? serverdat) - (http-transport:server-dat-get-server-id serverdat) - (begin - (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") - (exit 1))))) + (server-id (remote-server-id runremote))) (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) ;; set up the http-client here (max-retry-attempts 1) ;; consider all requests indempotent @@ -286,21 +278,13 @@ (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) (debug:print 0 *default-log-port* " call-chain: " call-chain))) ;; what if another thread is communicating ok? Can't happen due to mutex - (set! *runremote* #f) - (set! runremote #f) - ;; (if runremote - ;; (remote-conndat-set! runremote #f)) - ;; Killing associated server to allow clean retry.") - ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? + (http-transport:close-connections runremote) (mutex-unlock! *http-mutex*) - ;; (signal (make-composite-condition - ;; (make-property-condition 'commfail 'message "failed to connect to server"))) - ;; "communications failed" - (close-all-connections!) + ;; (close-connection! fullurl) (db:obj->string #f)) (with-input-from-request ;; was dat fullurl (list (cons 'key (or server-id "thekey")) (cons 'cmd cmd) @@ -345,67 +329,24 @@ 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) ;; careful closing of connections stored in *runremote* ;; -(define (http-transport:close-connections #!key (area-dat #f)) - (let* ((runremote (or area-dat *runremote*)) - (server-dat (if runremote - (remote-conndat runremote) - #f))) ;; (hash-table-ref/default *runremote* run-id #f))) - (if (vector? server-dat) - (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) - (handle-exceptions +(define (http-transport:close-connections runremote) + (if (remote? runremote) + (let ((api-dat (remote-api-uri runremote))) + (handle-exceptions exn - (begin - (print-call-chain *default-log-port*) - (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) - (close-connection! api-dat) - (close-idle-connections!) - #t)) - #f))) - - -(define (make-http-transport:server-dat)(make-vector 6)) -(define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) -(define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) -(define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) -(define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) -(define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) -(define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) -;(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) -(define (http-transport:server-dat-get-server-id vec) (vector-ref vec 6)) - -(define (http-transport:server-dat-make-url vec) - (if (and (http-transport:server-dat-get-iface vec) - (http-transport:server-dat-get-port vec)) - (conc "http://" - (http-transport:server-dat-get-iface vec) - ":" - (http-transport:server-dat-get-port vec)) + (begin + (print-call-chain *default-log-port*) + (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) + (if (args:any-defined? "-server" "-execute" "-run") + (debug:print-info 0 *default-log-port* "Closing connections to "api-dat)) + (if api-dat (close-connection! api-dat)) + (remote-conndat-set! runremote #f) + #t)) #f)) - -(define (http-transport:server-dat-update-last-access vec) - (if (vector? vec) - (vector-set! vec 5 (current-seconds)) - (begin - (print-call-chain (current-error-port)) - (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) - -;; -;; connect -;; -(define (http-transport:client-connect iface port server-id) - (debug:print-info 0 *default-log-port* "Connecting to client at "iface":"port", with server-id "server-id) - (let* ((api-url (conc "http://" iface ":" port "/api")) - (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) - (api-req (make-request method: 'POST uri: api-uri)) - (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) - server-dat)) - - - ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running) @@ -429,11 +370,11 @@ (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) - (let* ((servinfodir (conc *toppath*"/.servinfo")) + (let* ((servinfodir (server:get-servinfo-dir *toppath*)) ;; (conc *toppath*"/.servinfo")) (ipaddr (car sdat)) (port (cadr sdat)) (servinf (conc servinfodir"/"ipaddr":"port))) (set! servinfofile servinf) (if (not (file-exists? servinfodir)) @@ -448,34 +389,23 @@ (lambda () (delete-file* servinf)) *on-exit-procs*)) ;; put data about this server into a simple flat file host.port (debug:print-info 0 *default-log-port* "Received server alive signature") - #;(common:save-pkt `((action . alive) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat))) - *configdat* #t) sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (let* ((ipaddr (car sdat)) + (if sdat + (let* ((ipaddr (car sdat)) (port (cadr sdat)) - (servinf (conc *toppath*"/.servinfo/"ipaddr":"port))) - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - ;; (delete-file* servinf) ;; handled by on-exit, can be removed - #;(common:save-pkt `((action . died) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat)) - (msg . "Transport died?")) - *configdat* #t) + (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port))) + (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") (exit)) + (exit) + ) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) @@ -504,11 +434,12 @@ (if (and no-sync-db (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin (if (common:low-noise-print 120 "sync-all-print") (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))) - (db:all-db-sync *dbstruct-dbs*)))) + (db:all-db-sync *dbstruct-dbs*) + ))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) @@ -560,22 +491,22 @@ (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) - (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter + (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter (not *server-overloaded*) (file-exists? servinfofile)) (change-file-times servinfofile curr-time curr-time))) - (if (or (common:low-noise-print 120 "start new server") + (if (and (common:low-noise-print 120 "start new server") (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another (begin - (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...") + (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...") (server:kind-run *toppath*) (if (> *api-process-request-count* 100) (begin - (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) + (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) (delete-file* servinfofile))))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) @@ -621,59 +552,36 @@ ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch) - ;; check that a server start is in progress, pause or exit if so - (let* ((tmp-area (common:get-db-tmp-area)) - (server-start (conc tmp-area "/.server-start")) - (server-started (conc tmp-area "/.server-started")) - (start-time (common:lazy-modification-time server-start)) - (started-time (common:lazy-modification-time server-started)) - (server-starting (< start-time started-time)) ;; if start-time is less than started-time then a server is still starting - (start-time-old (> (- (current-seconds) start-time) 5)) - (cleanup-proc (lambda (msg) - (let* ((serv-fname (conc "server-" (current-process-id) "-" (get-host-name) ".log")) - (full-serv-fname (conc *toppath* "/logs/" serv-fname)) - (new-serv-fname (conc *toppath* "/logs/" "defunct-" serv-fname))) - (debug:print 0 *default-log-port* msg) - (if (common:file-exists? full-serv-fname) - (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname)) - (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname)) - (exit))))) - #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago - (not server-starting)) - (begin - (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting") - (exit))) - ;; lets not even bother to start if there are already three or more server files ready to go - #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) - (if (> num-alive 3) - (begin - (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) - (exit)))) - #;(common:save-pkt `((action . start) - (T . server) - (pid . ,(current-process-id))) - *configdat* #t) - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - )) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server monitor thread started") - (http-transport:keep-running) - "Keep running")))) - (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit)))) + ;; check the .servinfo directory, are there other servers running on this + ;; or another host? + (let* ((server-start-is-ok (server:minimal-check *toppath*))) + (if (not server-start-is-ok) + (begin + (debug:print 0 *default-log-port* "ERROR: server start not ok, exiting now.") + (exit 1)))) + + ;; check that a server start is in progress, pause or exit if so + (let* ((th2 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server run thread started") + (http-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + )) "Server run")) + (th3 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server monitor thread started") + (http-transport:keep-running) + "Keep running")))) + (thread-start! th2) + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. + (thread-start! th3) + (set! *didsomething* #t) + (thread-join! th2) + (exit))) ;; (define (http-transport:server-signal-handler signum) ;; (signal-mask! signum) ;; (handle-exceptions ;; exn Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.7009) +(define megatest-version 1.8006) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -654,23 +654,10 @@ ;; for some switches always print the command to stderr ;; (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) -;; some switches imply homehost. Exit here if not on homehost -;; -(let ((homehost-required (list "-cleanup-db"))) - (if (apply args:any? homehost-required) - (if (not (server:choose-server *toppath* 'home?)) - (for-each - (lambda (switch) - (if (args:get-arg switch) - (begin - (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch - ", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.") - (exit 1)))) - homehost-required)))) ;;====================================================================== ;; Misc setup stuff ;;====================================================================== @@ -934,13 +921,13 @@ ;;====================================================================== ;; Server? Start up here. ;; (if (args:get-arg "-server") - (let ((tl (launch:setup)) - (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) - (server:launch 0 transport-type) + (let ((tl (launch:setup))) + ;; (server:launch 0 'http) + (http-transport:launch) (set! *didsomething* #t))) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; @@ -2321,10 +2308,14 @@ (begin (if (not (launch:setup)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) + (if (not (server:choose-server *toppath* 'home?)) + (begin + (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db") + (exit 1))) (let ((dbstructs (db:setup #f))) (common:cleanup-db dbstructs)) (set! *didsomething* #t))) (if (args:get-arg "-mark-incompletes") ADDED mtargs/mtargs.egg Index: mtargs/mtargs.egg ================================================================== --- /dev/null +++ mtargs/mtargs.egg @@ -0,0 +1,7 @@ +((license "LGPL") + (version 0.1) + (category misc) + (dependencies srfi-69 srfi-1) + (author "Matt Welland") + (synopsis "Primitive argument processor.") + (components (extension mtargs))) Index: mtargs/mtargs.scm ================================================================== --- mtargs/mtargs.scm +++ mtargs/mtargs.scm @@ -19,22 +19,28 @@ (module mtargs ( arg-hash get-arg get-arg-from - usage get-args + usage print-args any-defined? - help ) -(import scheme chicken data-structures extras posix ports files) -(use srfi-69 srfi-1) +(import scheme) ;; gives us cond-expand in chicken-4 + +(cond-expand + (chicken-5 + (import scheme (chicken base) (chicken port) (chicken file) (chicken process-context)) + (import srfi-69 srfi-1)) + (chicken-4 + (import chicken posix srfi-69 srfi-1)) + (else)) +(define usage (make-parameter print)) (define arg-hash (make-hash-table)) -(define help "") (define (get-arg arg . default) (if (null? default) (hash-table-ref/default arg-hash arg #f) (hash-table-ref/default arg-hash arg (car default)))) @@ -48,28 +54,10 @@ (define (get-arg-from ht arg . default) (if (null? default) (hash-table-ref/default ht arg #f) (hash-table-ref/default ht arg (car default)))) -(define (usage . args) - (if (> (length args) 0) - (apply print "ERROR: " args)) - (if (string? help) - (print help) - (print "Usage: " (car (argv)) " ... ")) - (exit 0)) - - ;; one-of args defined -(define (any-defined? . param) - (let ((res #f)) - (for-each - (lambda (arg) - (if (get-arg arg)(set! res #t))) - param) - res)) - -;; args: (define (get-args args params switches arg-hash num-needed) (let* ((numtargs (length args)) (adj-num-needed (if num-needed (+ num-needed 2) #f))) (if (< numtargs (if adj-num-needed adj-num-needed 2)) (if (>= num-needed 1) @@ -94,13 +82,12 @@ (else (if (null? tail)(append remtargs (list arg)) ;; return the non-used args (loop (car tail)(cdr tail)(append remtargs (list arg)))))))) )) -(define (print-args remtargs arg-hash) - (print "ARGS: " remtargs) +(define (print-args arg-hash) (for-each (lambda (arg) (print " " arg " " (hash-table-ref/default arg-hash arg #f))) (hash-table-keys arg-hash))) ) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -64,11 +64,11 @@ exn (begin ;; (release-dot-lock fname) (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) + ;; (debug:print 5 *default-log-port* "exn=" (condition->list exn)) (if (common:file-exists? fname)(delete-file fname)) ;; brutally get rid of it (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -42,19 +42,18 @@ ;;====================================================================== ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; -(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down. - (let* ((runremote (or area-dat *runremote*)) - (cinfo (if (remote? runremote) +(define (rmt:get-connection-info areapath runremote) ;; TODO: push areapath down. + (let* ((cinfo (if (remote? runremote) (remote-conndat runremote) #f))) (if cinfo cinfo (if (server:check-if-running areapath) - (client:setup areapath) + (client:setup areapath runremote) #f)))) (define (rmt:on-homehost? runremote) (let* ((hh-dat (remote-hh-dat runremote))) (if (pair? hh-dat) @@ -170,17 +169,18 @@ ;;DOT CASE4 [label="reset\nconnection"]; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} ;;DOT CASE4 -> "rmt:send-receive"; ;; reset the connection if it has been unused too long ((and runremote - (remote-conndat runremote) + ;; (remote-conndat runremote) (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on - (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) + (+ (remote-last-access runremote) (remote-server-timeout runremote)))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - (http-transport:close-connections area-dat: runremote) + (http-transport:close-connections runremote) + ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections + ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; @@ -198,27 +198,27 @@ ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; ;;DOT CASE6 -> "rmt:send-receive"; ;; on homehost and this is a write, we already have a server, but server has died ;; reinstate this keep-alive section but inject a time condition into the (add ... - - #;((and (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote) ;; have a server - (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. - (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") - (http-transport:close-connections area-dat: runremote) ;; make sure to clean up - (set! *runremote* (make-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))) - (remote-force-server-set! runremote (common:force-server?)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") - (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;; + ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ;; (not (member cmd api:read-only-queries)) ;; this is a write + ;; (remote-server-url runremote) ;; have a server + ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. + ;; (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") + ;; (http-transport:close-connections area-dat: runremote) ;; make sure to clean up + ;; (set! *runremote* (make-remote)) + ;; (let* ((server-info (remote-server-info *runremote*))) + ;; (if server-info + ;; (begin + ;; (remote-server-url-set! *runremote* (server:record->url server-info)) + ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) + ;; (remote-force-server-set! runremote (common:force-server?)) + ;; (mutex-unlock! *rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") + ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE7 [label="homehost\nwrite"]; ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; ;;DOT CASE7 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a write, we already have a server @@ -261,11 +261,11 @@ (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? (server:start-and-wait *toppath*)) - (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http + (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;;DOT CASE10 [label="on homehost"]; ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; ;;DOT CASE10 -> "rmt:open-qry-close-locally"; @@ -282,69 +282,43 @@ ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; ;; not on homehost, do server query (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) ;;DOT } -;; No Title -;; Error: (vector-ref) out of range -;; #(# (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299))) -;; 6 -;; -;; Call history: -;; -;; http-transport.scm:306: thread-terminate! -;; http-transport.scm:307: debug:print-info -;; common_records.scm:235: debug:debug-mode -;; rmt.scm:259: k587 -;; rmt.scm:259: g591 -;; rmt.scm:276: http-transport:server-dat-update-last-access -;; http-transport.scm:364: current-seconds -;; rmt.scm:282: debug:print-info -;; common_records.scm:235: debug:debug-mode -;; rmt.scm:283: mutex-unlock! -;; rmt.scm:287: extras-transport-succeded <-- -;; +-----------------------------------------------------------------------------+ -;; | Exit Status : 70 -;; - ;; bunch of small functions factored out of send-receive to make debug easier ;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") ;; (mutex-lock! *rmt-mutex*) (let* ((conninfo (remote-conndat runremote)) - (dat-in (case (remote-transport runremote) - ((http) (condition-case ;; handling here has - ;; caused a lot of - ;; problems. However it - ;; is needed to deal with - ;; attemtped - ;; communication to - ;; servers that have gone - ;; away - (http-transport:client-api-send-receive 0 conninfo cmd params) - ((servermismatch) (vector #f "Server id mismatch" )) - ((commfail)(vector #f "communications fail")) - ((exn)(vector #f "other fail" (print-call-chain))))) - (else - (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") - (exit)))) + (dat-in (condition-case ;; handling here has + ;; caused a lot of + ;; problems. However it + ;; is needed to deal with + ;; attemtped + ;; communication to + ;; servers that have gone + ;; away + (http-transport:client-api-send-receive 0 runremote cmd params) + ;; (http-transport:client-api-send-receive 0 conninfo cmd params runremote) + ((servermismatch) (vector #f "Server id mismatch" )) + ((commfail)(vector #f "communications fail")) + ((exn)(vector #f "other fail" (print-call-chain))))) (dat (if (and (vector? dat-in) ;; ... check it is a correct size (> (vector-length dat-in) 1)) dat-in (vector #f (conc "communications fail (type 2), dat-in=" dat-in)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (and (vector? conninfo) (< 5 (vector-length conninfo))) - (http-transport:server-dat-update-last-access conninfo) ;; refresh access time + (remote-last-access-set! runremote (current-seconds)) ;; refresh access time (begin (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) (set! conninfo #f) - (remote-conndat-set! *runremote* #f) ;; NOTE: *runremote* is global copy of runremote. Purpose: factor out global. - (http-transport:close-connections area-dat: runremote))) + (http-transport:close-connections runremote))) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was ;; successful, have to examine the data to see if ;; there was a detected issue at the other end @@ -432,18 +406,13 @@ (mutex-lock! *db-multi-sync-mutex*) / (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) -(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) +(define (rmt:send-receive-no-auto-client-setup runremote cmd run-id params) (let* ((run-id (if run-id run-id 0)) - (res ;; (handle-exceptions - ;; exn - ;; (begin - ;; (print "transport failed. exn=" exn) - ;; #f) - (http-transport:client-api-send-receive run-id connection-info cmd params))) ;; ) + (res (http-transport:client-api-send-receive run-id runremote cmd params))) (if (and res (vector-ref res 0)) (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! #f))) ;;====================================================================== @@ -470,15 +439,12 @@ (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature)))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; -(define (rmt:login-no-auto-client-setup connection-info) - (case *transport-type* ;; run-id of 0 is just a placeholder - ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version (client:get-signature)))) - ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*))) - )) +(define (rmt:login-no-auto-client-setup runremote) + (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature)))) ;; hand off a call to one of the db:queries statements ;; added run-id to make looking up the correct db possible ;; (define (rmt:general-call stmtname run-id . params) @@ -1067,12 +1033,11 @@ #f) (define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (mutex-lock! *rmt-mutex*) - (remote-conndat-set! runremote #f) - (http-transport:close-connections area-dat: runremote) + (http-transport:close-connections runremote) (remote-server-url-set! runremote #f) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) @@ -1096,11 +1061,11 @@ ;; want to ease off ;; the queries (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) - (http-transport:close-connections area-dat: runremote) + (http-transport:close-connections runremote) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res)) ;; All good, return res Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1279,15 +1279,23 @@ (list hed tal reg reruns)) ;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second - (if (runs:lownoise "no resources" 60) + (if (runs:lownoise "no resources" 600) (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) + ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. - (thread-sleep! 0.25) + ;; (thread-sleep! 0.25) + + ;; new logic. + ;; If it has been more than 10 seconds since we were last here don't wait at all + ;; otherwise sleep 2 seconds to give db a rest and let dashboard read data + (if (runs:lownoise "frequent-no-resources" 10) + (thread-sleep! 0.25) ;; no significant delay + (thread-sleep! 2)) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; @@ -1775,11 +1783,11 @@ (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) (should-check-jobs (match can-run-more-tests ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) (if (< (- max-concurrent-jobs num-running) 25) (begin - (debug:print-info 0 *default-log-port* + (debug:print-info 2 *default-log-port* "less than 20 jobs headroom, ("max-concurrent-jobs "-"num-running")>20. Forcing prelaunch check.") #t) #f)) (else #f)))) ;; no record yet Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -65,21 +65,10 @@ ;;====================================================================== ;; Call this to start the actual server ;; -;; all routes though here end in exit ... -;; -;; start_server -;; -(define (server:launch run-id transport-type) - (case transport-type - ((http)(http-transport:launch)) - ;;((nmsg)(nmsg-transport:launch run-id)) - ;;((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) - ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;; Get the transport @@ -133,18 +122,11 @@ ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* (;; (curr-host (get-host-name)) - ;; (attempt-in-progress (server:start-attempted? areapath)) - ;; (dot-server-url (server:check-if-running areapath)) - ;; (curr-ip (server:get-best-guess-address curr-host)) - ;; (curr-pid (current-process-id)) - ;; (homehost (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - ;; (target-host (car homehost)) - (testsuite (common:get-testsuite-name)) + (let* ((testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) " -server - ";; (or target-host "-") @@ -190,46 +172,48 @@ (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) (bad-dat (list #f #f #f #f #f))) (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) - bad-dat) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (begin + exn + (begin + ;; WARNING: this is potentially dangerous to blanket ignore the errors + (if (file-exists? logf) + (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) + bad-dat) ;; no idea what went wrong, call it a bad server + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match server-rx inl)) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) bad-dat)) - (match mlst - ((_ host port start server-id pid) - (list host - (string->number port) - (string->number start) - server-id - (string->number pid))) - (else - (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) - bad-dat)))) - (begin - (if dbprep-found - (begin - (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) - bad-dat)))))))) + (match mlst + ((_ host port start server-id pid) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid))) + (else + (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat)))) + (begin + (if dbprep-found + (begin + (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) + (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) + bad-dat)))))))) ;; ;; get a list of servers from the log files, with all relevant data ;; ;; ( mod-time host port start-time pid ) ;; ;; ;; (define (server:get-list areapath #!key (limit #f)) @@ -419,11 +403,12 @@ ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) - (let* ((servinfodir (conc *toppath*"/.servinfo"))) + ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") + (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) (if (not (file-exists? servinfodir)) (create-directory servinfodir)) (let* ((allfiles (glob (conc servinfodir"/*"))) (res (make-hash-table))) (for-each @@ -432,15 +417,45 @@ (serverdat (server:logf-get-start-info f))) (match serverdat ((host port start server-id pid) (if (and host port start server-id pid) (hash-table-set! res hostport serverdat) - (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat))) + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) (else - (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat))))) + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) allfiles) res))) + +;; check the .servinfo directory, are there other servers running on this +;; or another host? +;; +;; returns #t => ok to start another server +;; #f => not ok to start another server +;; +(define (server:minimal-check areapath) + (server:clean-up-old areapath) + (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) + (servrs (glob (conc srvdir"/*"))) + (thishostip (server:get-best-guess-address (get-host-name))) + (thisservrs (glob (conc srvdir"/"thishostip":*"))) + (homehostinf (server:choose-server areapath 'homehost)) + (havehome (car homehostinf)) + (wearehome (cdr homehostinf))) + (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome + ", numservers: "(length thisservrs)) + (cond + ((not havehome) #t) ;; no homehost yet, go for it + ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another + ((and havehome (not wearehome)) #f) ;; we are not the home host + ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running + (else + (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) + #t)))) + + +(define server-last-start 0) + ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; ;; mode: @@ -453,29 +468,46 @@ ;; 1. sort by age ascending and ping until good ;; find alive rand from youngest ;; 1. sort by age descending ;; 2. take five ;; 3. check alive, discard if not and repeat + ;; first we clean up old server files + (server:clean-up-old areapath) + (let* ((since-last (- (current-seconds) server-last-start)) + (server-start-delay 10)) + (if ( < (- (current-seconds) server-last-start) 10 ) + (begin + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") + (thread-sleep! server-start-delay) + ) + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + ) + ) (let* ((serversdat (server:get-servers-info areapath)) (servkeys (hash-table-keys serversdat)) - (by-time-asc (if (not (null? servkeys)) + (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last (sort servkeys ;; list of "host:port" (lambda (a b) (>= (list-ref (hash-table-ref serversdat a) 2) (list-ref (hash-table-ref serversdat b) 2)))) '()))) + (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) + (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) (if (not (null? by-time-asc)) (let* ((oldest (last by-time-asc)) (oldest-dat (hash-table-ref serversdat oldest)) (host (list-ref oldest-dat 0)) (all-valid (filter (lambda (x) (equal? host (list-ref (hash-table-ref serversdat x) 0))) by-time-asc)) - (best-five (lambda () - (if (> (length all-valid) 5) - (take all-valid 5) - all-valid))) + (best-ten (lambda () + (if (> (length all-valid) 11) + (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out + (if (> (length all-valid) 8) + (drop-right all-valid 1) + all-valid)))) (names->dats (lambda (names) (map (lambda (x) (hash-table-ref serversdat x)) names))) (am-home? (lambda () @@ -488,25 +520,54 @@ (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) (print "youngest: "(hash-table-ref serversdat (car all-valid)))) ((home) host) ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) - ((best-five)(names->dats (best-five))) + ((best-ten)(names->dats (best-ten))) ((all-valid)(names->dats all-valid)) - ((best) (let* ((best-five (best-five)) - (len (length best-five))) - (hash-table-ref serversdat (list-ref best-five (random len))))) + ((best) (let* ((best-ten (best-ten)) + (len (length best-ten))) + (hash-table-ref serversdat (list-ref best-ten (random len))))) ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) - (thread-sleep! 3) + (set! server-last-start (current-seconds)) + ;; (thread-sleep! 3) (case mode ((homehost) (cons #f #f)) (else #f)))))) + +(define (server:get-servinfo-dir areapath) + (let* ((spath (conc areapath"/.servinfo"))) + (if (not (file-exists? spath)) + (create-directory spath #t)) + spath)) + +(define (server:clean-up-old areapath) + ;; any server file that has not been touched in ten minutes is effectively dead + (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) + (for-each + (lambda (sfile) + (let* ((modtime (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) + (current-seconds)) + (file-modification-time sfile)))) + (if (and (number? modtime) + (> (- (current-seconds) modtime) + 600)) + (begin + (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) + (delete-file sfile)))))) + sfiles))) ;; would like to eventually get rid of this ;; (define (common:on-homehost?) (server:choose-server *toppath* 'home?)) @@ -516,11 +577,17 @@ ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least seconds old ;; (server:wait-for-server-start-last-flag areapath) - (if (< (server:choose-server areapath 'count) 10) + (let loop () + (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) + (begin + (if (common:low-noise-print 30 "our-host-load") + (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) + (loop)))) + (if (< (server:choose-server areapath 'count) 20) (server:run areapath)) #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) @@ -538,11 +605,12 @@ (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) - (let ((num-ok (length (server:choose-server areapath 'all-valid)))) + (let* ( (servers (server:choose-server areapath 'all-valid)) + (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath) @@ -555,11 +623,11 @@ ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath)))) + (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) (not servers)) ;; (and (list? servers) ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers @@ -596,48 +664,43 @@ ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; -(define (server:ping host-port-in server-id #!key (do-exit #f)) - (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - #f ;; (server:check-if-running *toppath*) - ;; (if (number? host-port-in) ;; we were handed a server-id - ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; ;; (print "srec: " srec " host-port-in: " host-port-in) - ;; (if srec - ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - ;; (conc "no such server-id " host-port-in))) - host-port-in))) ;; ) - (let* ((host-port (if host:port - (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f)) - #f))) -;; (toppath (launch:setup))) - ;; (print "host-port=" host-port) - (if (not host-port) - (begin - (if host-port-in - (debug:print 0 *default-log-port* "ERROR: bad host:port")) - (if do-exit (exit 1)) - #f) - (let* ((iface (car host-port)) - (port (cadr host-port)) - (server-dat (http-transport:client-connect iface port server-id)) - (login-res (rmt:login-no-auto-client-setup server-dat))) - (if (and (list? login-res) - (car login-res)) - (begin - ;; (print "LOGIN_OK") - (if do-exit (exit 0)) - #t) - (begin - ;; (print "LOGIN_FAILED") - (if do-exit (exit 1)) - #f))))))) +(define (server:ping host:port server-id #!key (do-exit #f)) + (let* ((host-port (cond + ((string? host:port) + (let ((slst (string-split host:port ":"))) + (if (eq? (length slst) 2) + (list (car slst)(string->number (cadr slst))) + #f))) + (else + #f)))) + (cond + ((and (list? host-port) + (eq? (length host-port) 2)) + (let* ((myrunremote (make-remote)) + (iface (car host-port)) + (port (cadr host-port)) + (server-dat (client:connect iface port server-id myrunremote)) + (login-res (rmt:login-no-auto-client-setup myrunremote))) + (if (and (list? login-res) + (car login-res)) + (begin + ;; (print "LOGIN_OK") + (if do-exit (exit 0)) + #t) + (begin + ;; (print "LOGIN_FAILED") + (if do-exit (exit 1)) + #f)))) + (else + (if host:port + (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) + (if do-exit + (exit 1) + #f))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server ifaceport) (with-input-from-pipe @@ -668,11 +731,11 @@ (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) - 60))) + 600))) (define (server:get-best-guess-address hostname) (let ((res #f)) (for-each (lambda (adr)