Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -17,11 +17,11 @@ (use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) ;; (use zmq) (import (prefix sqlite3 sqlite3:)) -(use spiffy uri-common intarweb http-client spiffy-request-vars) +(use spiffy uri-common intarweb http-client spiffy-request-vars uri-common intarweb) (declare (unit client)) (declare (uses common)) (declare (uses db)) @@ -75,11 +75,20 @@ ;; ;; DEBUG STUFF ;; (if (eq? *transport-type* 'fs)(begin (print "ERROR!!!!!!! refusing to run with transport " *transport-type*)(exit 99))) (debug:print-info 11 "Using transport type of " *transport-type* (if hostinfo (conc " to connect to " hostinfo) "")) (case *transport-type* - ((fs)(if (not *megatest-db*)(set! *megatest-db* (open-db)))) + ((fs) ;; (if (not *megatest-db*)(set! *megatest-db* (open-db)))) + ;; we are not doing fs any longer. let's cheat and start up a server + ;; if we are falling back on fs (not 100% supported) do an about face and start a server + (if (not (equal? (args:get-arg "-transport") "fs")) + (begin + (set! *transport-type* #f) + (system (conc "megatest -list-servers | grep " megatest-version " | grep alive || megatest -server - -daemonize && sleep 3")) + (thread-sleep! 1) + (if (> numtries 0) + (client:setup numtries: (- numtries 1)))))) ((http) (http-transport:client-connect (tasks:hostinfo-get-interface hostinfo) (tasks:hostinfo-get-port hostinfo))) ((zmq) (zmq-transport:client-connect (tasks:hostinfo-get-interface hostinfo) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -54,10 +54,11 @@ (define *server-info* #f) (define *time-to-exit* #f) (define *received-response* #f) (define *default-numtries* 10) (define *server-run* #t) +(define *db-write-access* #t) (define *target* (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN (define *keys* (make-hash-table)) ;; cache the keys here (define *keyvals* (make-hash-table)) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -353,26 +353,33 @@ (store-button store-label) (command-text-box (iup:textbox #:expand "HORIZONTAL" #:font "Courier New, -10")) (command-launch-button (iup:button "Execute!" #:action (lambda (x) (let ((cmd (iup:attribute command-text-box "VALUE"))) (system (conc cmd " &")))))) + (kill-jobs (lambda (x) + (iup:attribute-set! + command-text-box "VALUE" + (conc "xterm -geometry 180x20 -e \"megatest -target " keystring " :runname " runname + " -set-state-status KILLREQ,n/a -testpatt %/% " + ;; (conc testname "/" (if (equal? item-path "") "%" item-path)) + " :state RUNNING ;echo Press any key to continue;bash -c 'read -n 1 -s'\"")))) (run-test (lambda (x) (iup:attribute-set! command-text-box "VALUE" (conc "xterm -geometry 180x20 -e \"megatest -target " keystring " :runname " runname " -runtests " (conc testname "/" (if (equal? item-path "") "%" item-path)) - ";echo Press any key to continue;bash -c 'read -n 1 -s'\"")))) + " ;echo Press any key to continue;bash -c 'read -n 1 -s'\"")))) (remove-test (lambda (x) (iup:attribute-set! command-text-box "VALUE" (conc "xterm -geometry 180x20 -e \"megatest -remove-runs -target " keystring " :runname " runname " -testpatt " (conc testname "/" (if (equal? item-path "") "%" item-path)) - " -v;echo Press any key to continue;bash -c 'read -n 1 -s'\""))))) + " -v ;echo Press any key to continue;bash -c 'read -n 1 -s'\""))))) (cond ((not testdat)(begin (print "ERROR: bad test info for " test-id)(exit 1))) ((not rundat)(begin (print "ERROR: found test info but there is a problem with the run info for " run-id)(exit 1))) (else ;; (test-set-status! db run-id test-name state status itemdat) @@ -388,15 +395,16 @@ (host-info-panel testdat store-label) ;; The controls (iup:frame #:title "Actions" (iup:vbox (iup:hbox - (iup:button "View Log" #:action viewlog #:size "80x") - (iup:button "Start Xterm" #:action xterm #:size "80x") - (iup:button "Run Test" #:action run-test #:size "80x") - (iup:button "Clean Test" #:action remove-test #:size "80x") - (iup:button "Close" #:action (lambda (x)(exit)) #:size "80x")) + (iup:button "View Log" #:action viewlog #:size "80x") + (iup:button "Start Xterm" #:action xterm #:size "80x") + (iup:button "Run Test" #:action run-test #:size "80x") + (iup:button "Clean Test" #:action remove-test #:size "80x") + (iup:button "Kill All Jobs" #:action kill-jobs #:size "80x") + (iup:button "Close" #:action (lambda (x)(exit)) #:size "80x")) (apply iup:hbox (list command-text-box command-launch-button)))) (set-fields-panel test-id testdat) (let ((tabs @@ -441,11 +449,12 @@ (let ((val (vector-ref hed (- colnum 1)))) (iup:attribute-set! steps-matrix (conc rownum ":" colnum)(if val (conc val) "")) (if (< colnum 6) (loop hed tal rownum (+ colnum 1)) (if (not (null? tal)) - (loop (car tal)(cdr tal)(+ rownum 1) 1))))))))) + (loop (car tal)(cdr tal)(+ rownum 1) 1)))) + (iup:attribute-set! steps-matrix "REDRAW" "ALL")))))) (hash-table-set! widgets "StepsMatrix" proc) (proc testdat)) steps-matrix) ;; populate the Test Data panel (iup:frame Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -68,14 +68,18 @@ (begin (debug:print 0 "ERROR: Attempted to open db when not in megatest area. Exiting.") (exit)))) (let* ((dbpath (conc *toppath* "/megatest.db")) ;; fname) (dbexists (file-exists? dbpath)) + (write-access (file-write-access? dbpath)) (db (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) 136000)))) ;; 136000))) ;; 136000 = 2.2 minutes + (if (and dbexists + (not write-access)) + (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control (debug:print-info 11 "open-db, dbpath=" dbpath " argv=" (argv)) (sqlite3:set-busy-handler! db handler) (if (not dbexists) (db:initialize db)) (db:set-sync db) @@ -1263,12 +1267,17 @@ (debug:print-info 11 "zdat=" zdat) (let* ((res #f) (rawdat (http-transport:client-send-receive serverdat zdat)) (tmp #f)) (debug:print-info 11 "Sent " zdat ", received " rawdat) - (set! tmp (db:string->obj rawdat)) - (vector-ref tmp 2)))) + (if rawdat + (begin + (set! tmp (db:string->obj rawdat)) + (vector-ref tmp 2)) + (begin + (debug:print 0 "ERROR: Communication with the server failed. Exiting if possible") + (exit 1)))))) ((zmq) (handle-exceptions exn (begin (debug:print-info 0 "cdb:client-call timeout or error. Trying again in 5 seconds") Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -11,11 +11,11 @@ (require-extension (srfi 18) extras tcp s11n) (use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) (import (prefix sqlite3 sqlite3:)) -(use spiffy uri-common intarweb http-client spiffy-request-vars) +(use spiffy uri-common intarweb http-client spiffy-request-vars uri-common intarweb) ;; Configurations for server (tcp-buffer-size 2048) (max-connections 2048) @@ -146,76 +146,106 @@ (set! *runremote* (list ipaddrstr portnum)) ;; (open-run-close tasks:remove-server-records tasks:open-db) (open-run-close tasks:server-register tasks:open-db (current-process-id) - ipaddrstr portnum 0 'live 'http) - (print "INFO: Trying to start server on " ipaddrstr ":" portnum) + ipaddrstr portnum 0 'startup 'http) + (debug:print 1 "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL (start-server bind-address: ipaddrstr port: portnum) (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum) - (print "INFO: server has been stopped"))) + (debug:print 1 "INFO: server has been stopped"))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;;====================================================================== ;; C L I E N T S ;;====================================================================== +(define *http-mutex* (make-mutex)) + +;; (system "megatest -list-servers | grep alive || megatest -server - -daemonize && sleep 4") + ;; ;; ;; 1 Hello, world! Goodbye Dolly ;; Send msg to serverdat and receive result -(define (http-transport:client-send-receive serverdat msg) - (let* ((url (http-transport:make-server-url serverdat)) - (fullurl (conc url "/ctrl")) ;; (conc url "/?dat=" msg))) - (numretries 0)) +(define (http-transport:client-send-receive serverdat msg #!key (numretries 30)) + (let* (;; (url (http-transport:make-server-url serverdat)) + (fullurl (caddr serverdat)) ;; (conc url "/ctrl")) ;; (conc url "/?dat=" msg))) + (res #f)) (handle-exceptions exn - (if (< numretries 200) - (http-transport:client-send-receive serverdat msg)) + (begin + (print "ERROR IN http-transport:client-send-receive " ((condition-property-accessor 'exn 'message) exn)) + (thread-sleep! 2) + (if (> numretries 0) + (http-transport:client-send-receive serverdat msg numretries: (- numretries 1)))) (begin (debug:print-info 11 "fullurl=" fullurl "\n") ;; set up the http-client here - (max-retry-attempts 100) + (max-retry-attempts 5) + ;; consider all requests indempotent (retry-request? (lambda (request) - (thread-sleep! (/ (if (> numretries 100) 100 numretries) 10)) - (set! numretries (+ numretries 1)) - #t)) + #t)) ;; (thread-sleep! (/ (if (> numretries 100) 100 numretries) 10)) + ;; (set! numretries (- numretries 1)) + ;; #t)) ;; send the data and get the response ;; extract the needed info from the http data and ;; process and return it. - (let* ((res (with-input-from-request fullurl - ;; #f - ;; msg - (list (cons 'dat msg)) - read-string))) + (let* ((send-recieve (lambda () + (mutex-lock! *http-mutex*) + (set! res (with-input-from-request + fullurl + (list (cons 'dat msg)) + read-string)) + (close-all-connections!) + (mutex-unlock! *http-mutex*))) + (time-out (lambda () + (thread-sleep! 5) + (if (not res) + (begin + (debug:print 0 "WARNING: communication with the server timed out.") + (mutex-unlock! *http-mutex*) + (http-transport:client-send-receive serverdat msg numretries: (- numretries 1)) + (if (< numretries 3) ;; on last try just exit + (begin + (debug:print 0 "ERROR: communication with the server timed out. Giving up.") + (exit 1))))))) + (th1 (make-thread send-recieve "with-input-from-request")) + (th2 (make-thread time-out "time out"))) + (thread-start! th1) + (thread-start! th2) + (thread-join! th1) + (thread-terminate! th2) (debug:print-info 11 "got res=" res) (let ((match (string-search (regexp "(.*)<.body>") res))) (debug:print-info 11 "match=" match) (let ((final (cadr match))) (debug:print-info 11 "final=" final) final))))))) (define (http-transport:client-connect iface port) (let* ((login-res #f) - (serverdat (list iface port))) + (uri-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/ctrl")))) + (serverdat (list iface port uri-dat))) (set! login-res (client:login serverdat)) (if (and (not (null? login-res)) (car login-res)) (begin (debug:print-info 2 "Logged in and connected to " iface ":" port) (set! *runremote* serverdat) serverdat) (begin - (debug:print-info 0 "Failed to login or connect to " iface ":" port) - (set! *runremote* #f) - (set! *transport-type* 'fs) - #f)))) + (debug:print-info 0 "ERROR: Failed to login or connect to " iface ":" port) + (exit 1))))) +;; (set! *runremote* #f) +;; (set! *transport-type* 'fs) +;; #f)))) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; @@ -226,11 +256,12 @@ (let* ((server-info (let loop () (let ((sdat #f)) (mutex-lock! *heartbeat-mutex*) (set! sdat *runremote*) (mutex-unlock! *heartbeat-mutex*) - (if sdat sdat + (if sdat + sdat (begin (sleep 4) (loop)))))) (iface (car server-info)) (port (cadr server-info)) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.5422) +(define megatest-version 1.5427) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -24,10 +24,11 @@ (declare (uses server)) (declare (uses client)) (declare (uses tests)) (declare (uses genexample)) (declare (uses daemon)) +(declare (uses db)) (define *db* #f) ;; this is only for the repl, do not use in general!!!! (include "common_records.scm") (include "key_records.scm") @@ -245,10 +246,15 @@ (print megatest-version) (exit))) (define *didsomething* #f) +(if (and (or (args:get-arg "-list-targets") + (args:get-arg "-list-db-targets")) + (not (args:get-arg "-transport"))) + (hash-table-set! args:arg-hash "-transport" "fs")) + ;;====================================================================== ;; Misc setup stuff ;;====================================================================== (debug:setup) @@ -304,22 +310,31 @@ (let loop ((servers (open-run-close tasks:get-best-server tasks:open-db)) (trycount 0)) (if (or (not servers) (null? servers)) (begin - (if (eq? trycount 0) ;; just do the server start once + (if (even? trycount) ;; just do the server start every other time through this loop (every 8 seconds) (begin (debug:print 0 "INFO: Starting server as none running ...") ;; (server:launch (string->symbol (args:get-arg "-transport" "http")))) + ;; no need to use fork, no need to do the list-servers trick. Just start the damn server, it will exit on it's own + ;; if there is an existing server + (system "megatest -server - -daemonize") + (thread-sleep! 3) ;; (process-run (car (argv)) (list "-server" "-" "-daemonize" "-transport" (args:get-arg "-transport" "http"))) - (process-fork (lambda () - (daemon:ize) - (server:launch (string->symbol (args:get-arg "-transport" "http"))))) - (thread-sleep! 3)) - (debug:print-info 0 "Waiting for server to start")) - (loop (open-run-close tasks:get-best-server tasks:open-db) - (+ trycount 1))) + ;; (system (conc "megatest -list-servers | egrep '" megatest-version ".*alive' || megatest -server - -daemonize && sleep 3")) + ;; (process-fork (lambda () + ;; (daemon:ize) + ;; (server:launch (string->symbol (args:get-arg "-transport" "http"))))) + ) + (begin + (debug:print-info 0 "Waiting for server to start") + (thread-sleep! 4))) + (if (< trycount 10) + (loop (open-run-close tasks:get-best-server tasks:open-db) + (+ trycount 1)) + (debug:print 0 "WARNING: Couldn't start or find a server."))) (debug:print 0 "INFO: Server(s) running " servers) ))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server")) @@ -827,12 +842,13 @@ (args:get-arg "-setlog") (args:get-arg "-m")) ;; (if db (sqlite3:finalize! db)) (set! *didsomething* #t))) -(if (or (and (args:get-arg "-setlog") ;; since setting up is so costly lets piggyback on -test-status - (not (args:get-arg "-step"))) ;; -setlog may have been processed already in the "-step" previous +(if (or (args:get-arg "-setlog") ;; since setting up is so costly lets piggyback on -test-status + ;; (not (args:get-arg "-step"))) ;; -setlog may have been processed already in the "-step" previous + ;; NEW POLICY - -setlog sets test overall log on every call. (args:get-arg "-set-toplog") (args:get-arg "-test-status") (args:get-arg "-set-values") (args:get-arg "-load-test-data") (args:get-arg "-runstep") Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -21,16 +21,20 @@ ;;====================================================================== ;; Tasks db ;;====================================================================== (define (tasks:open-db) - (let* ((dbpath (conc *toppath* "/monitor.db")) - (exists (file-exists? dbpath)) - (mdb (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) - (handler (make-busy-timeout 36000))) + (let* ((dbpath (conc *toppath* "/monitor.db")) + (exists (file-exists? dbpath)) + (write-access (file-write-access? dbpath)) + (mdb (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) + (handler (make-busy-timeout 36000))) + (if (and exists + (not write-access)) + (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control (sqlite3:set-busy-handler! mdb handler) - (sqlite3:execute mdb (conc "PRAGMA synchronous = 1;")) + (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) (if (not exists) (begin (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, action TEXT DEFAULT '', owner TEXT, @@ -105,19 +109,20 @@ )) ;; NB// two servers with same pid on different hosts will be removed from the list if pid: is used! (define (tasks:server-deregister mdb hostname #!key (port #f)(pid #f)(action 'delete)) (debug:print-info 11 "server-deregister " hostname ", port " port ", pid " pid) - (if pid - (case action - ((delete)(sqlite3:execute mdb "DELETE FROM servers WHERE pid=?;" pid)) - (else (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE pid=?;" pid))) - (if port + (if *db-write-access* + (if pid (case action + ((delete)(sqlite3:execute mdb "DELETE FROM servers WHERE pid=?;" pid)) + (else (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE pid=?;" pid))) + (if port + (case action ((delete)(sqlite3:execute mdb "DELETE FROM servers WHERE (interface=? or hostname=?) AND port=?;" hostname hostname port)) (else (sqlite3:execute mdb "UPDATE servers SET state='dead' WHERE (interface=? or hostname=?) AND port=?;" hostname hostname port))) - (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified")))) + (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified"))))) (define (tasks:server-deregister-self mdb hostname) (tasks:server-deregister mdb hostname pid: (current-process-id))) ;; need a simple call for robustly removing records given host and port @@ -142,11 +147,17 @@ (if hostname hostname iface)(if pid pid port)) res)) (define (tasks:server-update-heartbeat mdb server-id) (debug:print-info 1 "Heart beat update of server id=" server-id) - (sqlite3:execute mdb "UPDATE servers SET heartbeat=strftime('%s','now') WHERE id=?;" server-id)) + (handle-exceptions + exn + (begin + (debug:print 0 "WARNING: probable timeout on monitor.db access") + (thread-sleep! 1) + (tasks:server-update-heartbeat mdb server-id)) + (sqlite3:execute mdb "UPDATE servers SET heartbeat=strftime('%s','now') WHERE id=?;" server-id))) ;; alive servers keep the heartbeat field upto date with seconds every 6 or so seconds (define (tasks:server-alive? mdb server-id #!key (iface #f)(hostname #f)(port #f)(pid #f)) (let* ((server-id (if server-id server-id