Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -39,10 +39,11 @@ get-var get-keys get-key-vals test-toplevel-num-items get-test-info-by-id + get-test-state-status-by-id get-steps-info-by-id get-data-info-by-id test-get-rundir-from-test-id get-count-tests-running-for-testname get-count-tests-running @@ -328,10 +329,11 @@ ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) ;; TESTS ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) + ((get-test-state-status-by-id) (apply db:get-test-state-status-by-id dbstruct params)) ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) @@ -351,10 +353,11 @@ ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) ((get-run-state) (apply db:get-run-state dbstruct params)) + ((get-run-state-status) (apply db:get-run-state-status dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -317,10 +317,15 @@ (define 
(common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) ;; (defstruct remote + + ;; transport to be used + ;; http - use http-transport + ;; http-read-cached - use http-transport for writes but in-mem cached for reads + (rmode 'http) (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) (cons #f #f)))) (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res)) res)) (server-url #f) ;; (server:check-if-running *toppath*) #f)) @@ -1367,11 +1372,11 @@ (else (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") #t)))) ;; default to requiring server (if force-result (begin - (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".") + (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".") #t) #f))) ;;====================================================================== ;; M I S C L I S T S Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -468,11 +468,13 @@ (hash-table-set! sync-durations (conc fname".db") (- (current-milliseconds) start-time))) (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date") ))) dbfiles) - (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))) + ;; WHY does the dbdat need to be added back? + (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)) + ) #t) ;; options: ;; ;; 'killservers - kills all servers @@ -598,10 +600,11 @@ (mtdb (dbr:subdb-mtdb subdb)) (tmpdb (db:get-subdb dbstruct run-id)) (refndb (dbr:subdb-refndb subdb)) (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb))) ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb) + ;; BUG: verify this is really needed (dbfile:add-dbdat dbstruct run-id tmpdb) (set! 
res (cons newres res)))) subdbs) res)) @@ -904,10 +907,11 @@ "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND last_df > ?;") dneeded)) + ;; BUG: Verify this is really needed (dbfile:add-dbdat dbstruct #f dbdat) blocks)) ;; returns id of the record, register a disk allocated to archiving and record it's last known ;; available space @@ -2066,12 +2070,13 @@ dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) - db - "SELECT status FROM runs WHERE id=?;" + (db:get-cache-stmth + dbdat db + "SELECT status FROM runs WHERE id=?;" ) run-id) res)))) (define (db:get-run-state dbstruct run-id) (let ((res "n/a")) @@ -2079,12 +2084,27 @@ dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) - db - "SELECT state FROM runs WHERE id=?;" + (db:get-cache-stmth + dbdat db + "SELECT state FROM runs WHERE id=?;" ) + run-id) + res)))) + +(define (db:get-run-state-status dbstruct run-id) + (let ((res (cons "n/a" "n/a"))) + (db:with-db + dbstruct #f #f + (lambda (dbdat db) + (sqlite3:for-each-row + (lambda (state status) + (set! res (cons state status))) + (db:get-cache-stmth + dbdat db + "SELECT state,status FROM runs WHERE id=?;" ) run-id) res)))) ;;====================================================================== @@ -2696,11 +2716,11 @@ (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs))) run-ids))) -;; Get test data using test_id, run-id is not used +;; Get test data using test_id ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct run-id @@ -2713,10 +2733,26 @@ (set! 
res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update))) (db:get-cache-stmth dbdat db (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;")) test-id) res)))) + +;; Get test state and status as a (state . status) pair using test_id; (#f . #f) if no row matches +;; +(define (db:get-test-state-status-by-id dbstruct run-id test-id) + (db:with-db + dbstruct + run-id + #f + (lambda (dbdat db) + (let ((res (cons #f #f))) + (sqlite3:for-each-row ;; the row callback must store the pair in res, as db:get-run-state-status does + (lambda (state status) + (set! res (cons state status))) + (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;") + test-id) + res)))) ;; Use db:test-get* to access ;; Get test data using test_ids. NB// Only works within a single run!! ;; (define (db:get-test-info-by-ids dbstruct run-id test-ids) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -37,13 +37,15 @@ stack files ports commonmod + ;; debugprint ) -;; (import debugprint) +(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic +(define num-run-dbs (make-parameter 10)) ;; number of db's in .megatest ;;====================================================================== ;; R E C O R D S ;;====================================================================== @@ -191,17 +193,21 @@ (define (dbfile:run-id->path apath run-id) (conc apath"/"(dbfile:run-id->dbname run-id))) (define (db:dbname->path apath dbname) (conc apath"/"dbname)) + +(define (dbfile:run-id->dbnum run-id) + (cond + ((number? run-id) + (modulo run-id (num-run-dbs))) + ((not run-id) "main") ;; 0 or main? + (else run-id))) ;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number (define (dbfile:run-id->dbname run-id) - (cond - ((number? 
run-id) (conc ".megatest/" (modulo run-id 100) ".db")) - ((not run-id) (conc ".megatest/main.db")) - (else run-id))) + (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db")) ;; Make the dbstruct, setup up auxillary db's and call for main db at least once ;; ;; called in http-transport and replicated in rmt.scm for *local* access. ;; @@ -241,12 +247,16 @@ (begin (stack-pop! (dbr:subdb-dbstack subdb)))))) ;; return a previously opened db handle to the stack of available handles (define (dbfile:add-dbdat dbstruct run-id dbdat) - (let* ((subdb (dbfile:get-subdb dbstruct run-id))) - (stack-push! (dbr:subdb-dbstack subdb) dbdat) + (let* ((subdb (dbfile:get-subdb dbstruct run-id)) + (dbstk (dbr:subdb-dbstack subdb)) + (count (stack-count dbstk))) + (if (> count 15) + (dbfile:print-err "WARNING: stack for "run-id".db is "count".")) + (stack-push! dbstk dbdat) dbdat)) ;; set up a subdb ;; (define (dbfile:init-subdb dbstruct run-id init-proc) @@ -886,11 +896,12 @@ ) ) tbls) (let* ((runtime (- (current-milliseconds) start-time)) (should-print (or ;; (debug:debug-mode 12) - (common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate. + (common:low-noise-print 120 "db sync") + (> runtime 500)))) ;; low and high sync times treated as separate. (for-each (lambda (dat) (let ((tblname (car dat)) (count (cdr dat))) (set! tot-count (+ tot-count count)) @@ -1004,53 +1015,53 @@ ;; (mutex-unlock! 
*db-open-mutex*) dbdat)) (define dbfile:db-init-proc (make-parameter #f)) -(define keep-age-param (make-parameter 10)) -(define qif-slope (make-parameter 100)) +;; in xmaxima this gives a curve close to what I want: +;; plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$ +;; plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$ +;; plot2d ((exp(x/5)-1)/40, [x, 0, 20])$ +(define (dbfile:droop x) + (/ (- (exp (/ x 5)) 1) 40)) + ;; (* numqrys (/ 1 (qif-slope)))) ;; create a dropping near the db file in a qif dir ;; use count of such files to gate queries (queries in flight) ;; (define (dbfile:wait-for-qif fname run-id params) (let* ((thedir (pathname-directory fname)) - (destdir (conc thedir"/qif-"run-id)) - (uniqn (get-area-path-signature (conc (or run-id "main") params))) + (dbnum (dbfile:run-id->dbnum run-id)) + (destdir (conc thedir"/qif-"dbnum)) + (uniqn (get-area-path-signature (conc dbnum params))) (crumbn (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id)))) (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t)) (let loop ((count 0)) (let* ((currlks (glob (conc destdir"/*"))) (numqrys (length currlks)) (delayval (cond ;; do a droopish curve - ((> numqrys 50) - (if (> numqrys 50) - (for-each - (lambda (f) - (if (> (- (current-seconds) - (handle-exceptions - exn - (current-seconds) ;; file is likely gone, just fake out - (file-modification-time f))) - (keep-age-param)) - (let* ((basedir (pathname-directory f)) - (filen (pathname-file f)) - (destf (conc basedir"/attic/"filen))) - (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf) - ;; (delete-file* f) - (handle-exceptions - exn - #t - (file-move f destf #t))))) - currlks)) - 1) ;; 50 and above => 1 - ((> numqrys 10) (* numqrys (/ 1 (qif-slope)))) ;; slope of 1/100 - ;; ((> numqrys 30) 0.50) - ;; ((> numqrys 25) 0.20) - ;; ((> numqrys 20) 0.10) - ;; ((> numqrys 15) 0.05) - ;; ((> numqrys 10) 0.01) + ((> numqrys 25) + (for-each + (lambda (f) + (if (> (- (current-seconds) + 
(handle-exceptions + exn + (current-seconds) ;; file is likely gone, just fake out + (file-modification-time f))) + (keep-age-param)) + (let* ((basedir (pathname-directory f)) + (filen (pathname-file f)) + (destf (conc basedir"/attic/"filen))) + (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf) + ;; (delete-file* f) + (handle-exceptions + exn + #t + (file-move f destf #t))))) + currlks) + 4) + ((> numqrys 0) (dbfile:droop numqrys)) ;; exponential droop curve, see dbfile:droop (else #f)))) (if (and delayval (< count 5)) (begin (thread-sleep! delayval) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -29,18 +29,21 @@ (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) (declare (uses subrun)) (declare (uses common)) +(declare (uses commonmod)) (declare (uses configf)) (declare (uses db)) (declare (uses ezsteps)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") + +(import commonmod) ;;====================================================================== ;; ezsteps ;;====================================================================== @@ -205,11 +208,11 @@ ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) - (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) + (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- @@ -239,13 +242,13 @@ (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) - (test-info (rmt:get-test-info-by-id run-id test-id)) - (state (db:test-get-state test-info)) - (status (db:test-get-status test-info)) + (test-info 
(rmt:get-test-state-status-by-id run-id test-id)) + (state (car test-info));; (db:test-get-state test-info)) + (status (cdr test-info));; (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) @@ -259,11 +262,12 @@ (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) - (launch:handle-zombie-tests run-id) + (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty + (launch:handle-zombie-tests run-id)) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) @@ -765,20 +769,28 @@ ;; new ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) 
==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here - +(define *last-rollup* 0) (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) - (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) + (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) - (current-state (rmt:get-run-state run-id)) - (current-status (rmt:get-run-status run-id))) - ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing - (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) - (rmt:set-state-status-and-roll-up-run run-id current-state current-status) + (current-state-status (rmt:get-run-state-status run-id)) + (current-state (car current-state-status)) ;; (rmt:get-run-state run-id)) + (current-status (cdr current-state-status))) ;; (rmt:get-run-status run-id))) + ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing + (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) + ;; + ;; TODO: add a final rollup when run is done (if there isn't one already) + ;; + (if (or (< running-cnt 3) ;; have only few running + (> (- (current-seconds) *last-rollup*) 10)) ;; or haven't rolled up in past ten seconds + (begin + (rmt:set-state-status-and-roll-up-run run-id current-state current-status) + (set! *last-rollup* (current-seconds)))) (runs:update-junit-test-reporter-xml run-id) (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? 
all-test-launched "yes" )) (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id))) (begin Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -118,176 +118,179 @@ (if server-info (begin (remote-server-url-set! *runremote* (server:record->url server-info)) (remote-server-id-set! *runremote* (server:record->id server-info))))) (set! runremote *runremote*))) ;; new runremote will come from this on next iteration - - ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; - ;; DOT SET_HOMEHOST -> MUTEXLOCK; - ;; ensure we have a homehost record - (if (or (not (pair? (remote-hh-dat runremote))) ;; not on homehost - (not (cdr (remote-hh-dat runremote)))) ;; not on homehost - (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (let ((hh-data (server:choose-server areapath 'homehost))) - (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) - - ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) - (cond - #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds - (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") - (set! *runremote* #f) - ;; BUG: close-connections should go here? - (mutex-unlock! *rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat)) - - ;;DOT EXIT; - ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } - ;; give up if more than 150 attempts - ((> attemptnum 150) - (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. 
Giving up.") - (exit 1)) - - ;;DOT CASE2 [label="local\nreadonly\nquery"]; - ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} - ;;DOT CASE2 -> "rmt:open-qry-close-locally"; - ;; readonly mode, read request- handle it - case 2 - ((and readonly-mode - (member cmd api:read-only-queries)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") - (rmt:open-qry-close-locally cmd 0 params) - ) - - ;;DOT CASE3 [label="write in\nread-only mode"]; - ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} - ;;DOT CASE3 -> "#f"; - ;; readonly mode, write request. Do nothing, return #f - (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) - - ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. - ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. - ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) - ;; - ;;DOT CASE4 [label="reset\nconnection"]; - ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;;DOT CASE4 -> "rmt:send-receive"; - ;; reset the connection if it has been unused too long - ((and runremote - (remote-api-url runremote) - (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on - (+ (remote-last-access runremote) - (remote-server-timeout runremote)))) - (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses in " (remote-server-timeout runremote) " seconds, forcing new connection.") - (http-transport:close-connections runremote) - ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections - ;; (remote-conndat-set! 
runremote #f) ;; invalidate the connection, thus forcing a new connection. - (mutex-unlock! *rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE5 [label="local\nread"]; - ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; - ;;DOT CASE5 -> "rmt:open-qry-close-locally"; - - ;; on homehost and this is a read - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (rmt:on-homehost? runremote) - (member cmd api:read-only-queries)) ;; this is a read - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE6 [label="init\nremote"]; - ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; - ;;DOT CASE6 -> "rmt:send-receive"; - ;; on homehost and this is a write, we already have a server, but server has died - - ;; reinstate this keep-alive section but inject a time condition into the (add ... - ;; - ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost - ;; (not (member cmd api:read-only-queries)) ;; this is a write - ;; (remote-server-url runremote) ;; have a server - ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. - ;; (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") - ;; (http-transport:close-connections area-dat: runremote) ;; make sure to clean up - ;; (set! *runremote* (make-remote)) - ;; (let* ((server-info (remote-server-info *runremote*))) - ;; (if server-info - ;; (begin - ;; (remote-server-url-set! *runremote* (server:record->url server-info)) - ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) - ;; (remote-force-server-set! 
runremote (common:force-server?)) - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE7 [label="homehost\nwrite"]; - ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; - ;;DOT CASE7 -> "rmt:open-qry-close-locally"; - ;; on homehost and this is a write, we already have a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server (needed to sync written data back) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE8 [label="force\nserver"]; - ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; - ;;DOT CASE8 -> "rmt:open-qry-close-locally"; - ;; on homehost, no server contact made and this is a write, passively start a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; have homehost - (not (remote-server-url runremote)) ;; no connection yet - (not (member cmd api:read-only-queries))) ;; not a read-only query - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-info - (begin - (remote-server-url-set! 
runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed - (remote-server-id-set! runremote (server:record->id server-info))) - (if (common:force-server?) - (server:start-and-wait *toppath*) - (server:kind-run *toppath*))) + + (http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))) + +(define (http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode) + ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity + ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; + ;; DOT SET_HOMEHOST -> MUTEXLOCK; + ;; ensure we have a homehost record + (if (or (not (pair? (remote-hh-dat runremote))) ;; not on homehost + (not (cdr (remote-hh-dat runremote)))) ;; not on homehost + (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little + (let ((hh-data (server:choose-server areapath 'homehost))) + (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) + + ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) + (cond + #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds + (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") + (set! *runremote* #f) + ;; BUG: close-connections should go here? + (mutex-unlock! *rmt-mutex*) + (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat)) + + ;;DOT EXIT; + ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } + ;; give up if more than 150 attempts + ((> attemptnum 150) + (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") + (exit 1)) + + ;;DOT CASE2 [label="local\nreadonly\nquery"]; + ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} + ;;DOT CASE2 -> "rmt:open-qry-close-locally"; + ;; readonly mode, read request- handle it - case 2 + ((and readonly-mode + (member cmd api:read-only-queries)) + (mutex-unlock! 
*rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") + (rmt:open-qry-close-locally cmd 0 params) + ) + + ;;DOT CASE3 [label="write in\nread-only mode"]; + ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} + ;;DOT CASE3 -> "#f"; + ;; readonly mode, write request. Do nothing, return #f + (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) + + ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. + ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. + ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) + ;; + ;;DOT CASE4 [label="reset\nconnection"]; + ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} + ;;DOT CASE4 -> "rmt:send-receive"; + ;; reset the connection if it has been unused too long + ((and runremote + (remote-api-url runremote) + (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on + (+ (remote-last-access runremote) + (remote-server-timeout runremote)))) + (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses in " (remote-server-timeout runremote) " seconds, forcing new connection.") + (http-transport:close-connections runremote) + ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections + ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. + (mutex-unlock! 
*rmt-mutex*) + (rmt:send-receive cmd rid params attemptnum: attemptnum)) + + ;;DOT CASE5 [label="local\nread"]; + ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; + ;;DOT CASE5 -> "rmt:open-qry-close-locally"; + + ;; on homehost and this is a read + ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required + (rmt:on-homehost? runremote) + (member cmd api:read-only-queries)) ;; this is a read + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") + (rmt:open-qry-close-locally cmd 0 params)) + + ;;DOT CASE6 [label="init\nremote"]; + ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; + ;;DOT CASE6 -> "rmt:send-receive"; + ;; on homehost and this is a write, we already have a server, but server has died + + ;; reinstate this keep-alive section but inject a time condition into the (add ... + ;; + ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ;; (not (member cmd api:read-only-queries)) ;; this is a write + ;; (remote-server-url runremote) ;; have a server + ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. + ;; (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") + ;; (http-transport:close-connections area-dat: runremote) ;; make sure to clean up + ;; (set! *runremote* (make-remote)) + ;; (let* ((server-info (remote-server-info *runremote*))) + ;; (if server-info + ;; (begin + ;; (remote-server-url-set! *runremote* (server:record->url server-info)) + ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) + ;; (remote-force-server-set! runremote (common:force-server?)) + ;; (mutex-unlock! 
*rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") + ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) + + ;;DOT CASE7 [label="homehost\nwrite"]; + ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; + ;;DOT CASE7 -> "rmt:open-qry-close-locally"; + ;; on homehost and this is a write, we already have a server + ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required + (cdr (remote-hh-dat runremote)) ;; on homehost + (not (member cmd api:read-only-queries)) ;; this is a write + (remote-server-url runremote)) ;; have a server (needed to sync written data back) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") + (rmt:open-qry-close-locally cmd 0 params)) + + ;;DOT CASE8 [label="force\nserver"]; + ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; + ;;DOT CASE8 -> "rmt:open-qry-close-locally"; + ;; on homehost, no server contact made and this is a write, passively start a server + ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required + (cdr (remote-hh-dat runremote)) ;; have homehost + (not (remote-server-url runremote)) ;; no connection yet + (not (member cmd api:read-only-queries))) ;; not a read-only query + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") + (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (if server-info + (begin + (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed + (remote-server-id-set! 
runremote (server:record->id server-info))) + (if (common:force-server?) + (server:start-and-wait *toppath*) + (server:kind-run *toppath*))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") (rmt:open-qry-close-locally cmd 0 params))) - ;;DOT CASE9 [label="force server\nnot on homehost"]; - ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; - ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; - ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one - (not (remote-api-url runremote))) - (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost - (not (remote-api-url runremote)))) ;; and no connection - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " runremote: " (remote->alist runremote)) - (mutex-unlock! *rmt-mutex*) - (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? - (server:start-and-wait *toppath*)) - ;; was: (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http - (set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http - (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as - - ;;DOT CASE10 [label="on homehost"]; - ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; - ;;DOT CASE10 -> "rmt:open-qry-close-locally"; - ;; all set up if get this far, dispatch the query - ((and (not (remote-force-server runremote)) - (cdr (remote-hh-dat runremote))) ;; we are on homehost - (mutex-unlock! 
*rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") - (rmt:open-qry-close-locally cmd (if rid rid 0) params)) - - ;;DOT CASE11 [label="send_receive"]; - ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; - ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; - ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; - ;; not on homehost, do server query - (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) - ;;DOT } + ;;DOT CASE9 [label="force server\nnot on homehost"]; + ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; + ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; + ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one + (not (remote-api-url runremote))) + (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost + (not (remote-api-url runremote)))) ;; and no connection + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " runremote: " (remote->alist runremote)) + (mutex-unlock! *rmt-mutex*) + (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? + (server:start-and-wait *toppath*)) + ;; was: (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http + (set! 
runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http + (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as + + ;;DOT CASE10 [label="on homehost"]; + ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; + ;;DOT CASE10 -> "rmt:open-qry-close-locally"; + ;; all set up if get this far, dispatch the query + ((and (not (remote-force-server runremote)) + (cdr (remote-hh-dat runremote))) ;; we are on homehost + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") + (rmt:open-qry-close-locally cmd (if rid rid 0) params)) + + ;;DOT CASE11 [label="send_receive"]; + ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; + ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; + ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; + ;; not on homehost, do server query + (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))) +;;DOT } ;; bunch of small functions factored out of send-receive to make debug easier ;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) @@ -537,20 +540,21 @@ (define (rmt:get-test-id run-id testname item-path) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) -;; run-id is NOT used -;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? 
test-id) (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) +(define (rmt:get-test-state-status-by-id run-id test-id) + (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id))) + (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) (assert (number? run-id) "FATAL: Run id required.") @@ -799,10 +803,13 @@ (define (rmt:get-run-state run-id) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-state #f (list run-id))) +(define (rmt:get-run-state-status run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-state-status #f (list run-id))) (define (rmt:set-run-status run-id run-status #!key (msg #f)) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1863,11 +1863,12 @@ (newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path (tests:testqueue-set-items! new-test-record #f) (tests:testqueue-set-itemdat! new-test-record my-itemdat) (tests:testqueue-set-item_path! new-test-record my-item-path) (hash-table-set! test-records newtestname new-test-record) - (set! tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath + ;; BUG: This next line sucks up a lot of horsepower + (set! 
tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath items-in-testpatt))) ;; At this point we have possibly added items to tal but all must be handed off to Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1966,13 +1966,13 @@ ;;====================================================================== ;; teststep-set-status! used to be here (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) - (let* ((testdat (rmt:get-test-info-by-id run-id test-id))) + (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat - (equal? (test:get-state testdat) "KILLREQ")))) + (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb) (if tdb (let ((res 0)) (sqlite3:for-each-row