Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -55,73 +55,75 @@ ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 "client:setup remaining-tries=" remaining-tries) - (if (<= remaining-tries 0) - (begin - (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) - (exit 1)) - (let ((host-info (hash-table-ref/default *runremote* run-id #f))) - (if host-info - (let* ((iface (http-transport:server-dat-get-iface host-info)) - (port (http-transport:server-dat-get-port host-info)) - (start-res (http-transport:client-connect iface port)) - (ping-res (rmt:login-no-auto-client-setup start-res run-id))) - (if ping-res ;; sucessful login? - (begin - (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries) - ;; Why add the close-connections here? - ;; (http-transport:close-connections run-id) - (hash-table-set! *runremote* run-id start-res) - start-res) ;; return the server info - ;; have host info but no ping. shutdown the current connection and try again - (begin ;; login failed - (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info) - (http-transport:close-connections run-id) - (hash-table-delete! *runremote* run-id) - (if (< remaining-tries 8) - (thread-sleep! 5) - (thread-sleep! 1)) - (client:setup run-id remaining-tries: (- remaining-tries 1))))) - ;; YUK: rename server-dat here - (let* ((server-dat (tasks:get-server (tasks:get-db) run-id))) - (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-dat - (let* ((iface (tasks:hostinfo-get-interface server-dat)) - (port (tasks:hostinfo-get-port server-dat)) - (start-res (http-transport:client-connect iface port)) - (ping-res (rmt:login-no-auto-client-setup start-res run-id))) - (if (and start-res - ping-res) - (begin - (hash-table-set! *runremote* run-id start-res) - (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) - (http-transport:close-connections run-id) - (hash-table-delete! *runremote* run-id) - (tasks:server-force-clean-run-record (tasks:get-db) - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat) - " client:setup (server-dat = #t)") - (thread-sleep! 2) - (server:try-running run-id) - (thread-sleep! 10) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1))))) - (begin ;; no server registered - (let ((num-available (tasks:num-in-available-state (tasks:get-db) run-id))) - (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) - (thread-sleep! 2) - (if (< num-available 2) - (begin - ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") - (server:try-running run-id))) - (thread-sleep! 10) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))) + (let* ((tdbdat (tasks:open-db)) + (tdb (db:dbdat-get-db tdbdat))) + (if (<= remaining-tries 0) + (begin + (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) + (exit 1)) + (let ((host-info (hash-table-ref/default *runremote* run-id #f))) + (if host-info + (let* ((iface (http-transport:server-dat-get-iface host-info)) + (port (http-transport:server-dat-get-port host-info)) + (start-res (http-transport:client-connect iface port)) + (ping-res (rmt:login-no-auto-client-setup start-res run-id))) + (if ping-res ;; sucessful login? + (begin + (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries) + ;; Why add the close-connections here? + ;; (http-transport:close-connections run-id) + (hash-table-set! *runremote* run-id start-res) + start-res) ;; return the server info + ;; have host info but no ping. shutdown the current connection and try again + (begin ;; login failed + (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info) + (http-transport:close-connections run-id) + (hash-table-delete! *runremote* run-id) + (if (< remaining-tries 8) + (thread-sleep! 5) + (thread-sleep! 1)) + (client:setup run-id remaining-tries: (- remaining-tries 1))))) + ;; YUK: rename server-dat here + (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) + (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if server-dat + (let* ((iface (tasks:hostinfo-get-interface server-dat)) + (port (tasks:hostinfo-get-port server-dat)) + (start-res (http-transport:client-connect iface port)) + (ping-res (rmt:login-no-auto-client-setup start-res run-id))) + (if (and start-res + ping-res) + (begin + (hash-table-set! *runremote* run-id start-res) + (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) + (http-transport:close-connections run-id) + (hash-table-delete! *runremote* run-id) + (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) + run-id + (tasks:hostinfo-get-interface server-dat) + (tasks:hostinfo-get-port server-dat) + " client:setup (server-dat = #t)") + (thread-sleep! 2) + (server:try-running run-id) + (thread-sleep! 10) ;; give server a little time to start up + (client:setup run-id remaining-tries: (- remaining-tries 1))))) + (begin ;; no server registered + (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) + (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) + (thread-sleep! 2) + (if (< num-available 2) + (begin + ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") + (server:try-running run-id))) + (thread-sleep! 10) ;; give server a little time to start up + (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1453,12 +1453,11 @@ (define *monitor-db-path* (conc *dbdir* "/monitor.db")) (define *last-monitor-update-time* 0) ;; Force creation of the db in case it isn't already there. -(let ((db (tasks:open-db))) - (sqlite3:finalize! db)) +(tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time) (handle-exceptions exn (begin Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -475,14 +475,14 @@ ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) (let* ((db (db:dbdat-get-db targdb)) - (stmth (sqlite3:prepare targdb full-ins))) - (db:delay-if-busy targdb) + (stmth (sqlite3:prepare db full-ins))) + ;; (db:delay-if-busy targdb) ;; NO WAITING (sqlite3:with-transaction - targdb + db (lambda () (for-each ;; (lambda (fromrow) (let* ((a (vector-ref fromrow 0)) (curr (hash-table-ref/default todat a #f)) @@ -534,18 +534,18 @@ (run-ids (if run-ids run-ids (if toppath (begin (db:delay-if-busy mtdb) (db:get-all-run-ids mtdb))))) - (mdb (tasks:open-db)) - (servers (tasks:get-all-servers mdb))) + (tdbdat (tasks:open-db)) + (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) ;; kill servers (if (member 'killservers options) (for-each (lambda (server) - (tasks:server-delete-record mdb (vector-ref server 0) "dbmigration") + (tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration") (tasks:kill-server (vector-ref server 2)(vector-ref server 1))) servers)) ;; clear out junk records ;; @@ -586,12 +586,13 @@ (db:delay-if-busy mtdb) (if (eq? run-id 0) (db:sync-tables (db:sync-main-list dbstruct) fromdb mtdb) (db:sync-tables db:sync-tests-only fromdb mtdb)))) run-ids)) - (db:close-all dbstruct) - (sqlite3:finalize! mdb))) + ;; (db:close-all dbstruct) + ;; (sqlite3:finalize! mdb) + )) ;; keeping it around for debugging purposes only (define (open-run-close-no-exception-handling proc idb . params) (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params) (if (or *db-write-access* @@ -1830,12 +1831,13 @@ (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f - (sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;" - test-id))) + (lambda (db) + (sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;" + test-id)))) (define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time" "host" "cpuload" "diskfree" "uname" "rundir" "item_path" "run_duration" "final_logf" "comment" "shortdir" "attemptnum")) @@ -2434,10 +2436,11 @@ (loop (car tal)(cdr tal)))))))))) (define (db:delay-if-busy dbdat #!key (count 6)) (if dbdat (let* ((dbpath (db:dbdat-get-path dbdat)) + (db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline (dbfj (conc dbpath "-journal"))) (if (file-exists? dbfj) (case count ((6) (thread-sleep! 0.2) @@ -2457,11 +2460,13 @@ ((1) (thread-sleep! 6.4) (db:delay-if-busy count: 0)) (else (debug:print-info 0 "delaying db access due to high database load.") - (thread-sleep! 12.8))))))) + (thread-sleep! 12.8)))) + db) + "bogus result from db:delay-if-busy")) (define (db:test-get-records-for-index-file dbstruct run-id test-name) (let ((res '())) (db:with-db dbstruct Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -446,20 +446,22 @@ (iup:vbox ;; (iup:label "Run statistics" #:expand "HORIZONTAL") stats-matrix))) (define (dcommon:servers-table) - (let* ((colnum 0) + (let* ((tdbdat (tasks:open-db)) + (tdb (db:dbdat-get-db tdbdat)) + (colnum 0) (rownum 0) (servers-matrix (iup:matrix #:expand "YES" #:numcol 7 #:numcol-visible 7 #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () - (let ((servers (tasks:get-all-servers (tasks:get-db)))) + (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -134,11 +134,12 @@ (http-transport:try-start-server run-id ipaddrstr start-port server-id))) ;; This is recursively run by http-transport:run until sucessful ;; (define (http-transport:try-start-server run-id ipaddrstr portnum server-id) - (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))) + (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) + (tdbdat (tasks:open-db))) (debug:print-info 0 "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname) (handle-exceptions exn (begin (print-error-message exn) @@ -155,18 +156,16 @@ (http-transport:try-start-server run-id ipaddrstr (portlogger:open-run-close portlogger:find-port) server-id)) (begin - (tasks:wait-on-busy-monitor.db) - (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server") + (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) - (tasks:wait-on-busy-monitor.db) (tasks:server-set-interface-port - (tasks:get-db) + (db:delay-if-busy tdbdat) server-id ipaddrstr portnum) (debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL @@ -175,12 +174,11 @@ (start-server port: portnum bind-address: (if (equal? config-hostname "-") ipaddrstr config-hostname)) (start-server port: portnum)) ;; (portlogger:open-run-close portlogger:set-port portnum "released") - (tasks:wait-on-busy-monitor.db) - (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server") + (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") (debug:print 1 "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -259,11 +257,10 @@ (close-all-connections!)) (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) (begin (mutex-unlock! *http-mutex*) - (tasks:wait-on-busy-monitor.db) (tasks:kill-server-run-id run-id) #f)) (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here @@ -279,11 +276,11 @@ ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) (set! res (handle-exceptions exn (begin - (debug:print 0 "WARNING: failure in with-input-from-request. Killing associated server to allow clean retry.") + (debug:print 0 "WARNING: failure in with-input-from-request to " fullrul ". Killing associated server to allow clean retry.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (hash-table-delete! *runremote* run-id) (tasks:kill-server-run-id run-id) #f) (with-input-from-request ;; was dat @@ -354,11 +351,12 @@ ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive - (let* ((server-info (let loop ((start-time (current-seconds)) + (let* ((tdbdat (tasks:open-db)) + (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (let ((sdat #f)) (thread-sleep! 0.01) (debug:print-info 0 "Waiting for server alive signature") @@ -371,23 +369,20 @@ sdat (begin (debug:print-info 0 "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (let ((tdb (tasks:open-db))) + (begin (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id) - (tasks:wait-on-busy-monitor.db) - (tasks:server-delete-record tdb server-id "failed to start, never received server alive signature") - (sqlite3:finalize! tdb) + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) - (tdb (tasks:open-db)) (server-timeout (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (string->number tmo)) (* 60 60 (string->number tmo)) ;; (* 3 24 60 60) ;; default to three days @@ -398,11 +393,11 @@ (server-state 'available)) ;; Use this opportunity to sync the inmemdb to db (let ((start-time (current-milliseconds)) (sync-time #f) (rem-time #f)) - + ;; inmemdb is a dbstruct (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) (set! sync-time (- (current-milliseconds) start-time)) (set! rem-time (quotient (- 4000 sync-time) 1000)) (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time) @@ -409,15 +404,14 @@ ;; ;; set_running after our first pass through and start the db ;; (if (eq? server-state 'available) (begin - (tasks:wait-on-busy-monitor.db) - (tasks:server-set-state! tdb server-id "dbprep") + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *inmemdb* (db:setup run-id)) - (tasks:server-set-state! tdb server-id "running"))) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))) (if (and (<= rem-time 4) (> rem-time 0)) (thread-sleep! rem-time) (thread-sleep! 4))) ;; fallback for if the math is changed ... @@ -469,16 +463,15 @@ (loop 0 server-state)) (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) - (tasks:wait-on-busy-monitor.db) ;; wait here in addition to just before the shutting-down (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) ;; ;; start_shutdown ;; - (tasks:server-set-state! tdb server-id "shutting-down") + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (portlogger:open-run-close portlogger:set-port port "released") (thread-sleep! 5) (debug:print-info 0 "Max cached queries was " *max-cache-size*) (debug:print-info 0 "Number of cached writes " *number-of-writes*) (debug:print-info 0 "Average cached write time " @@ -493,72 +486,69 @@ "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") - (tasks:wait-on-busy-monitor.db) - (tasks:server-delete-record tdb server-id " http-transport:keep-running") + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") (exit)))))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) - (set! *run-id* run-id) - (if (args:get-arg "-daemonize") - (begin - (daemon:ize) - (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it - (begin - (current-error-port *alt-log-file*) - (current-output-port *alt-log-file*))))) - (if (server:check-if-running run-id) - (begin - (debug:print 0 "INFO: Server for run-id " run-id " already running") - (exit 0))) - (tasks:wait-on-busy-monitor.db) - (let loop ((server-id (tasks:server-lock-slot (tasks:get-db) run-id)) - (remtries 4)) - (if (not server-id) - (if (> remtries 0) - (begin - (thread-sleep! 2) - (tasks:wait-on-busy-monitor.db) - (loop (tasks:server-lock-slot (tasks:get-db) run-id) - (- remtries 1))) - (begin - ;; since we didn't get the server lock we are going to clean up and bail out - (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") - (tasks:wait-on-busy-monitor.db) - (tasks:server-delete-records-for-this-pid (tasks:get-db) " http-transport:launch") - )) - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - run-id - server-id)) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 "Server monitor thread started") - (http-transport:keep-running server-id run-id)) - "Keep running"))) - ;; Database connection - - - ;; don't start the db here - - ;; (set! *inmemdb* (db:setup run-id)) - - - (thread-start! th2) - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit))))) + (let* ((tdbdat (tasks:open-db))) + (set! *run-id* run-id) + (if (args:get-arg "-daemonize") + (begin + (daemon:ize) + (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it + (begin + (current-error-port *alt-log-file*) + (current-output-port *alt-log-file*))))) + (if (server:check-if-running run-id) + (begin + (debug:print 0 "INFO: Server for run-id " run-id " already running") + (exit 0))) + (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) + (remtries 4)) + (if (not server-id) + (if (> remtries 0) + (begin + (thread-sleep! 2) + (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) + (- remtries 1))) + (begin + ;; since we didn't get the server lock we are going to clean up and bail out + (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") + (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") + )) + (let* ((th2 (make-thread (lambda () + (debug:print-info 0 "Server run thread started") + (http-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + run-id + server-id)) "Server run")) + (th3 (make-thread (lambda () + (debug:print-info 0 "Server monitor thread started") + (http-transport:keep-running server-id run-id)) + "Keep running"))) + ;; Database connection + + + ;; don't start the db here + + ;; (set! *inmemdb* (db:setup run-id)) + + + (thread-start! th2) + (thread-start! th3) + (set! *didsomething* #t) + (thread-join! th2) + (exit)))))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -547,11 +547,12 @@ (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server")) (let ((tl (launch:setup-for-run))) (if tl - (let* ((servers (open-run-close tasks:get-all-servers tasks:open-db)) + (let* ((tdbdat (tasks:open-db)) + (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) (fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n") (servers-to-kill '()) (killinfo (args:get-arg "-stop-server")) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) (sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))) @@ -576,13 +577,13 @@ ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server (if (equal? state "dead") (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. - (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid action: 'delete)) + (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds - (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid))) + (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update (if status "alive" "dead") transport) (if (or (equal? id sid) (equal? sid 0)) ;; kill all/any (begin Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -76,11 +76,13 @@ (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) - (if (tasks:server-running-or-starting? (tasks:get-db) run-id) + (if (tasks:server-running-or-starting? (db:delay-if-busy + (tasks:open-db)) + run-id) (let ((res (client:setup run-id))) (if res (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully) #f)) #f)))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -214,25 +214,24 @@ (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) - (tasks-db (tasks:open-db))) + (tdbdat (tasks:open-db))) (set-signal-handler! signal/int (lambda (signum) (signal-mask! signum) - (let ((tdb (tasks:open-db))) - (tasks:set-state-given-param-key tdb task-key "killed") - ;; (sqlite3:interrupt! tdb) ;; seems silly? - (sqlite3:finalize! tdb)) + (print "Received signal " signum ", cleaning up before exit. Please wait...") + (let ((tdbdat (tasks:open-db))) + (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed")) (print "Killed by signal " signum ". Exiting") (exit))) ;; register this run in monitor.db - (tasks:add tasks-db "run-tests" user target runname test-patts task-key) ;; params) - (tasks:set-state-given-param-key tasks-db task-key "running") + (tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params) + (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running") (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (debug:print 0 "WARNING: You do not have a run config file: " runconfigf)) @@ -393,12 +392,13 @@ (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))))) (debug:print-info 0 "No tests to run"))) (debug:print-info 4 "All done by here") - (tasks:set-state-given-param-key tasks-db task-key "done") - (sqlite3:finalize! tasks-db))) + (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done") + ;; (sqlite3:finalize! tasks-db) + )) ;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable. ;; ;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns @@ -1397,11 +1397,11 @@ ;; NB// should pass in keys? ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f)) (common:clear-caches) ;; clear all caches (let* ((db #f) - (tasks-db (tasks:open-db)) + (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) @@ -1436,11 +1436,11 @@ (begin (case action ((remove-runs) ;; seek and kill in flight -runtests with % as testpatt here (if (equal? testpatt "%") - (tasks:kill-runner tasks-db target run-name) + (tasks:kill-runner (db:delay-if-busy tdbdat) target run-name) (debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) @@ -1549,11 +1549,12 @@ ;; (debug:print 1 "Removing run dir " runpath) ;; (system (conc "rmdir -p " runpath)))) ))))) )) runs) - (sqlite3:finalize! tasks-db)) + ;; (sqlite3:finalize! (db:delay-if-busy tdbdat)) + ) #t) (define (runs:remove-test-directory db test remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -127,12 +127,13 @@ (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) - (let loop ((server (tasks:get-server (tasks:get-db) run-id)) - (trycount 0)) + (let ((tdbdat (tasks:open-db))) + (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id)) + (trycount 0)) (if server ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. @@ -143,25 +144,26 @@ ;; if the server didn't respond we must remove the record (if res #t (begin (debug:print-info 0 "server at " server " not responding, removing record") - (tasks:server-force-clean-running-records-for-run-id (tasks:get-db) run-id + (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id " server:check-if-running") res))) - #f))) + #f)))) ;; called in megatest.scm, host-port is string hostname:port ;; (define (server:ping run-id host:port) - (let* ((host-port (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f))) - (toppath (launch:setup-for-run)) - (server-db-dat (if (not host-port)(tasks:get-server (tasks:get-db) run-id) #f))) - (if (not run-id) + (let ((tdbdat (tasks:open-db))) + (let* ((host-port (let ((slst (string-split host:port ":"))) + (if (eq? (length slst) 2) + (list (car slst)(string->number (cadr slst))) + #f))) + (toppath (launch:setup-for-run)) + (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f))) + (if (not run-id) (begin (debug:print 0 "ERROR: must specify run-id when doing ping, -run-id n") (print "ERROR: No run-id") (exit 1)) (if (and (not host-port) @@ -178,11 +180,11 @@ (begin (print "LOGIN_OK") (exit 0)) (begin (print "LOGIN_FAILED") - (exit 1)))))))) + (exit 1))))))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server run-id iface port) (with-input-from-pipe Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -51,13 +51,10 @@ (vector-ref *task-db* 1) (let* ((linktree (configf:lookup *configdat* "setup" "linktree")) (dbpath (conc linktree "/.db/monitor.db"))) dbpath))) -(define (tasks:wait-on-busy-monitor.db) - (tasks:wait-on-journal (tasks:get-task-db-path) 30)) - ;; If file exists AND ;; file readable ;; ==> open it ;; If file exists AND ;; file NOT readable @@ -64,29 +61,31 @@ ;; ==> open in-mem version ;; If file NOT exists ;; ==> open in-mem version ;; (define (tasks:open-db) - (let* ((dbpath (tasks:get-task-db-path)) - (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away - (exists (file-exists? dbpath)) - (write-access (file-write-access? dbpath)) - (mdb (cond - ((file-write-access? *toppath*)(sqlite3:open-database dbpath)) - ((file-read-access? dbpath) (sqlite3:open-database dbpath)) - (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath)) - (handler (make-busy-timeout 36000))) - (if (and exists - (not write-access)) - (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control - (sqlite3:set-busy-handler! mdb handler) - (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) - (if (or (and (not exists) - (file-write-access? *toppath*)) - (not (file-read-access? dbpath))) - (begin - (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, + (if *task-db* + *task-db* + (let* ((dbpath (tasks:get-task-db-path)) + (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away + (exists (file-exists? dbpath)) + (write-access (file-write-access? dbpath)) + (mdb (cond + ((file-write-access? *toppath*)(sqlite3:open-database dbpath)) + ((file-read-access? dbpath) (sqlite3:open-database dbpath)) + (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath)) + (handler (make-busy-timeout 36000))) + (if (and exists + (not write-access)) + (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control + (sqlite3:set-busy-handler! mdb handler) + (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) + (if (or (and (not exists) + (file-write-access? *toppath*)) + (not (file-read-access? dbpath))) + (begin + (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, action TEXT DEFAULT '', owner TEXT, state TEXT DEFAULT 'new', target TEXT DEFAULT '', name TEXT DEFAULT '', @@ -93,18 +92,18 @@ testpatt TEXT DEFAULT '', keylock TEXT, params TEXT, creation_time TIMESTAMP, execution_time TIMESTAMP);") - (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS monitors (id INTEGER PRIMARY KEY, + (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS monitors (id INTEGER PRIMARY KEY, pid INTEGER, start_time TIMESTAMP, last_update TIMESTAMP, hostname TEXT, username TEXT, CONSTRAINT monitors_constraint UNIQUE (pid,hostname));") - (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS servers (id INTEGER PRIMARY KEY, + (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS servers (id INTEGER PRIMARY KEY, pid INTEGER, interface TEXT, hostname TEXT, port INTEGER, pubport INTEGER, @@ -113,31 +112,24 @@ state TEXT, mt_version TEXT, heartbeat TIMESTAMP, transport TEXT, run_id INTEGER);") -;; CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));") - (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY, + ;; CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));") + (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY, server_id INTEGER, pid INTEGER, hostname TEXT, cmdline TEXT, login_time TIMESTAMP, logout_time TIMESTAMP DEFAULT -1, CONSTRAINT clients_constraint UNIQUE (pid,hostname));") - - )) - mdb)) - -(define (tasks:get-db) - (if *task-db* - (vector-ref *task-db* 0) - (let ((db (tasks:open-db)) - (pth (tasks:get-task-db-path))) - (set! *task-db* (vector db pth)) - db))) - + + )) + (set! *task-db* (cons mdb dbpath)) + *task-db*))) + ;;====================================================================== ;; Server and client management ;;====================================================================== ;; make-vector-record tasks hostinfo id interface port pubport transport pid hostname @@ -352,21 +344,22 @@ (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) - (let* ((tdb (tasks:open-db)) - (sdat (tasks:get-server tdb run-id))) + (let* ((tdbdat (tasks:open-db)) + (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) - (tasks:server-delete-record tdb server-id tag) ) + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) - (sqlite3:finalize! tdb))) + ;; (sqlite3:finalize! tdb) + )) ;; (if status ;; #t means alive ;; (begin ;; (if (equal? hostname (get-host-name)) ;; (handle-exceptions