Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -671,12 +671,15 @@ (pp dat))) (if (file-exists? fname) ;; now verify it is readable (if (configf:read-alist fname) #t ;; data is good. (begin - (delete-file fname) - (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + (handle-exceptions + exn + #f + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + (delete-file fname)) #f)) #f))) ;; convert hierarchial list to ini format ;; Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -472,19 +472,20 @@ (port (cadr host-port)) (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) - (let ((url (http-transport:server-dat-make-url start-res))) + ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) + (begin (remote-conndat-set! *runremote* start-res) - (remote-server-url-set! *runremote* url) - (if (server:ping url) - (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.") - (begin - (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) - (remote-conndat-set! *runremote* #f) - (remote-server-url-set! *runremote* #f)))) + ;; (remote-server-url-set! *runremote* url) + ;; (if (server:ping url) + (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) + ;; (remote-conndat-set! *runremote* #f) + ;; (remote-server-url-set! *runremote* #f)))) (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") )) (begin (debug:print-info 0 *default-log-port* (if host-port (conc "received invalid host-port information " host-port) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -157,12 +157,11 @@ -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname -transport http|rpc : use http or rpc for transport (default is http) -log logfile : send stdout and stderr to logfile -list-servers : list the servers - -stop-server id : stop server specified by id (see output of -list-servers), use - 0 to kill all + -kill-servers : kill all servers -repl : start a repl (useful for extending megatest) -load file.scm : load and run file.scm -mark-incompletes : find and mark incomplete tests -ping run-id|host:port : ping server, exit with 0 if found -debug N|N,M,O... : enable debug 0-N or N and M and O ... @@ -253,13 +252,11 @@ ":units" ;; misc "-start-dir" "-contour" "-server" - "-stop-server" "-transport" - "-kill-server" "-port" "-extract-ods" "-pathmod" "-env2file" "-envcap" @@ -319,10 +316,11 @@ ;; misc "-repl" "-lock" "-unlock" "-list-servers" + "-kill-servers" "-run-wait" ;; wait on a run to complete (i.e. no RUNNING) "-local" ;; run some commands using local db access "-generate-html" ;; misc queries @@ -491,13 +489,13 @@ (printf "Sending signal/term to ~A\n" pid) (process-signal pid signal/term)))))) (process:children #f)) (original-exit exit-code))))) -;; for some switches alway print the command to stderr +;; for some switches always print the command to stderr ;; -(if (args:any? "-run" "-runall" "-list-runs" "-remove-runs" "-set-state-status") +(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) ;;====================================================================== ;; Misc setup stuff ;;====================================================================== @@ -767,59 +765,46 @@ (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) (server:launch 0 transport-type) (set! *didsomething* #t))) (if (or (args:get-arg "-list-servers") - (args:get-arg "-stop-server") - (args:get-arg "-kill-server")) + (args:get-arg "-kill-servers")) (let ((tl (launch:setup))) - (if tl + (if tl ;; all roads from here exit (let* ((servers (server:get-list *toppath*)) - (fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n") - (servers-to-kill '()) - (kill-switch (if (args:get-arg "-kill-server") "-9" "")) - (killinfo (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") )) - (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) - (sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))) - (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport") - (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========") - (for-each + (fmtstr "~8a~22a~20a~20a~8a\n")) + (format #t fmtstr "pid" "Interface:port" "age (hms)" "Last mod" "State") + (format #t fmtstr "===" "==============" "=========" "========" "=====") + (for-each ;; ( mod-time host port start-time pid ) (lambda (server) - (let* ((id (vector-ref server 0)) - (pid (vector-ref server 1)) - (hostname (vector-ref server 2)) - (interface (vector-ref server 3)) - (pullport (vector-ref server 4)) - (pubport (vector-ref server 5)) - (start-time (vector-ref server 6)) - (priority (vector-ref server 7)) - (state (vector-ref server 8)) - (mt-ver (vector-ref server 9)) - (last-update (vector-ref server 10)) - (transport (vector-ref server 11)) - (killed #f) - (status (< last-update 20))) - ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) - ;; no need to login as status of #t indicates we are connecting to correct - ;; server - ;; (if (equal? state "dead") - ;; (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. - ;; (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) - ;; (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds - ;; (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) - (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update - (if status "alive" "dead") transport) - (if (or (equal? id sid) - (equal? sid 0)) ;; kill all/any + (let* ((mtm (any->number (car server))) + (mod (if mtm (- (current-seconds) mtm) "unk")) + (age (- (current-seconds)(or (any->number (list-ref server 3)) (current-seconds)))) + (url (conc (cadr server) ":" (caddr server))) + (pid (list-ref server 4)) + (alv (if (number? mod)(< mod 10) #f))) + (format #t + fmtstr + pid + url + (seconds->hr-min-sec age) + (seconds->hr-min-sec mod) + (if alv "alive" "dead")) + (if (and alv + (args:get-arg "-kill-servers")) (begin - (debug:print-info 0 *default-log-port* "Attempting to kill "kill-switch" server with pid " pid) - (tasks:kill-server hostname pid kill-switch: kill-switch))))) - servers) + (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid) + (server:kill server))))) + (sort servers (lambda (a b) + (let ((ma (or (any->number (car a)) 9e9)) + (mb (or (any->number (car b)) 9e9))) + (> ma mb))))) (debug:print-info 1 *default-log-port* "Done with listservers") (set! *didsomething* #t) - (exit)) ;; must do, would have to add checks to many/all calls below + (exit)) (exit)))) + ;; must do, would have to add checks to many/all calls below ;;====================================================================== ;; Weird special calls that need to run *after* the server has started? ;;====================================================================== Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -50,29 +50,25 @@ db)) (define (portlogger:open-run-close proc . params) (let* ((fname (conc "/tmp/." (current-user-name) "-portlogger.db")) (avail (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away - ;;(handle-exceptions - ;; exn - ;; (begin - ;; ;; (release-dot-lock fname) - ;; (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) - ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - ;; (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - ;; (if (file-exists? fname) - ;; (begin - ;; (debug:print 0 *default-log-port* "Removing portlogger database file " fname) - ;; (delete-file fname))) ;; just get rid of the portlogger file - ;; (print-call-chain (current-error-port))) + (handle-exceptions + exn + (begin + ;; (release-dot-lock fname) + (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it + (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) (sqlite3:finalize! db) ;; (release-dot-lock fname) - res))) -;; ) + res)))) ;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) (define (portlogger:take-port db portnum) (let* ((qry1 (sqlite3:prepare db "INSERT INTO ports (port,state) VALUES (?,?);")) (qry2 (sqlite3:prepare db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;")) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -97,14 +97,14 @@ ) ;; reset the connection if it has been unused too long ((and runremote (remote-conndat runremote) - (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts + (let ((expire-time (+ (- start-time (remote-server-timeout runremote))))) ;; NOTE: REMOVED the 30 second noise. If adding it back be sure to offset!! add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (remote-conndat-set! runremote #f) + (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not runremote) ;; can remove this one. should never get here. (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -419,8 +419,8 @@ (if (and (string? tmo) (string->number tmo)) (* 60 60 (string->number tmo)) ;; (* 3 24 60 60) ;; default to three days ;;(* 60 60 1) ;; default to one hour - (* 60 60 0.25) ;; default to 0.25 hours + (* 60 5) ;; default to five minutes )))