Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -118,11 +118,11 @@ (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain)) ) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer") + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector @@ -150,11 +150,11 @@ ;; SERVERS ((start-server) (apply server:kind-run params)) ((kill-server) (set! *server-run* #f)) ;; TESTS - ((test-set-state-status-by-id) (apply db:test-set-state-status-by-id dbstruct params)) + ((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) ((delete-test-records) (apply db:delete-test-records dbstruct params)) ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -111,11 +111,11 @@ (archive-id (if archive-info (car archive-info) -1)) (disk-groups (make-hash-table)) (test-groups (make-hash-table)) ;; these two (disk and test groups) could be combined nicely (bup-exe (or (configf:lookup *configdat* "archive" "bup") "bup")) (compress (or (configf:lookup *configdat* "archive" "compress") "9")) - (linktree (configf:lookup *configdat* "setup" "linktree"))) + (linktree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) (if (not archive-dir) ;; no archive disk found, this is fatal (begin (debug:print 0 *default-log-port* "FATAL: No archive disks found. Please add disks with at least " min-space " MB space to the [archive-disks] section of megatest.config") (debug:print 0 *default-log-port* " use [archive] minspace to specify minimum available space") @@ -211,11 +211,11 @@ (define (archive:bup-restore archive-command run-id run-name tests rp-mutex bup-mutex) ;; move the getting of archive space down into the below block so that a single run can ;; allocate as needed should a disk fill up ;; (let* ((bup-exe (or (configf:lookup *configdat* "archive" "bup") "bup")) - (linktree (configf:lookup *configdat* "setup" "linktree"))) + (linktree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) ;; from the test info bin the path to the test by stem ;; (for-each (lambda (test-dat) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -95,11 +95,11 @@ ((http)(http-transport:client-connect host port)))) (ping-res (case *transport-type* ((http)(rmt:login-no-auto-client-setup start-res))))) (if (and start-res ping-res) - (begin + (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -17,10 +17,11 @@ (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) (declare (unit common)) +(declare (uses keys)) (include "common_records.scm") ;; (require-library margs) ;; (include "margs.scm") @@ -32,16 +33,19 @@ ;; (old-exit) ;; (old-exit code))) (define getenv get-environment-variable) (define (safe-setenv key val) - (if (and (string? val)(string? key)) - (handle-exceptions - exn - (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val) - (setenv key val)) - (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))) + (if (substring-index ":" key) ;; variables containing : are for internal use and cannot be environment variables. + (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\"") + (if (and (string? val) + (string? key)) + (handle-exceptions + exn + (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val) + (setenv key val)) + (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) ;; GLOBAL GLETCHES @@ -952,12 +956,17 @@ rtestpatt) (else args-testpatt)))) (define (common:get-linktree) (or (getenv "MT_LINKTREE") - (if *configdat* - (configf:lookup *configdat* "setup" "linktree")))) + (or (and *configdat* + (configf:lookup *configdat* "setup" "linktree")) + (if *toppath* + (conc *toppath* "/lt") + (if (file-exists? "megatest.config") ;; we are in the toppath (new area, mtutils compatible) + (conc (current-directory) "/lt") + #f))))) (define (common:args-get-runname) (let ((res (or (args:get-arg "-runname") (args:get-arg ":runname") (getenv "MT_RUNNAME")))) @@ -1010,22 +1019,34 @@ (else (let* ((currhost (get-host-name)) (bestadrs (server:get-best-guess-address currhost)) ;; first look in config, then look in file .homehost, create it if not found (homehost (or (configf:lookup *configdat* "server" "homehost" ) - (let ((hhf (conc *toppath* "/.homehost"))) - (if (file-exists? hhf) - (with-input-from-file hhf read-line) - (if (file-write-access? *toppath*) - (begin - (with-output-to-file hhf - (lambda () - (print bestadrs))) - (begin - (mutex-unlock! *homehost-mutex*) - (car (common:get-homehost)))) - #f))))) + (handle-exceptions + exn + (if (> trynum 0) + (let ((delay-time (* (- 5 trynum) 5))) + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: Failed to read .homehost file, delaying " delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn)) + (thread-sleep! delay-time) + (common:get-homehost trynum: (- trynum 1))) + (begin + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: Failed to read .homehost file after trying five times. Giving up and exiting, message: " ((condition-property-accessor 'exn 'message) exn)) + (exit 1))) + (let ((hhf (conc *toppath* "/.homehost"))) + (if (file-exists? hhf) + (with-input-from-file hhf read-line) + (if (file-write-access? *toppath*) + (begin + (with-output-to-file hhf + (lambda () + (print bestadrs))) + (begin + (mutex-unlock! *homehost-mutex*) + (car (common:get-homehost)))) + #f)))))) (at-home (or (equal? homehost currhost) (equal? homehost bestadrs)))) (set! *home-host* (cons homehost at-home)) (mutex-unlock! *homehost-mutex*) *home-host*)))) @@ -1202,23 +1223,23 @@ ;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 ;; (define (common:lazy-modification-time fpath) (handle-exceptions - exn - 0 - (file-modification-time fpath))) + exn + 0 + (file-modification-time fpath))) ;; find timestamp of newest file associated with a sqlite db file (define (common:lazy-sqlite-db-modification-time fpath) (let* ((glob-list (handle-exceptions - exn - '("/no/such/file") - (glob (conc fpath "*")))) + exn + `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn))) + (glob (conc fpath "*")))) (file-list (if (eq? 0 (length glob-list)) - '("/no/such/file") - glob-list))) + '("/no/such/file") + glob-list))) (apply max (map common:lazy-modification-time file-list)))) @@ -1611,11 +1632,12 @@ (let* ((key (car keyval)) (val (cdr keyval)) (delim (if (string-search whitesp val) "\"" ""))) - (print (if (member key ignorevars) + (print (if (or (member key ignorevars) + (string-search ":" key)) ;; internal only values to be skipped. "# export " "export ") key "=" delim (mungeval val) delim))) envvars))))) Index: common_records.scm ================================================================== --- common_records.scm +++ common_records.scm @@ -26,13 +26,27 @@ (define-syntax define-simple-syntax (syntax-rules () ((_ (name arg ...) body ...) (define-syntax name (syntax-rules () ((name arg ...) (begin body ...))))))) +;; (define-syntax common:handle-exceptions +;; (syntax-rules () +;; ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...)))) + +(define-syntax common:debug-handle-exceptions + (syntax-rules () + ((_ debug exn errstmt body ...) + (if debug + (begin body ...) + (handle-exceptions exn errstmt body ...))))) + (define-syntax common:handle-exceptions (syntax-rules () - ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...)))) + ((_ exn errstmt body ...) + (begin body ...)))) + +;; (define handle-exceptions common:handle-exceptions) ;; iup callbacks are not dumping the stack, this is a work-around ;; (define-simple-syntax (debug:catch-and-dump proc procname) (handle-exceptions @@ -97,11 +111,11 @@ (list? n)) (not (null? (lset-intersection! eq? *verbosity* n)))) ((and (number? *verbosity*) (list? n)) (member *verbosity* n)))) - + (define (debug:setup) (let ((debugstr (or (args:get-arg "-debug") (getenv "MT_DEBUG_MODE")))) (set! *verbosity* (debug:calc-verbosity debugstr)) (debug:check-verbosity *verbosity* debugstr) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -15,10 +15,11 @@ (use regex regex-case) ;; directory-utils) (declare (unit configf)) (declare (uses process)) (declare (uses env)) +(declare (uses keys)) (include "common_records.scm") ;; return list (path fullpath configname) (define (find-config configname #!key (toppath #f)) @@ -653,19 +654,31 @@ (lambda (section) (hash-table-set! ht (car section)(cdr section))) adat) ht)) +;; if (define (configf:read-alist fname) - (configf:alist->config - (with-input-from-file fname read))) + (handle-exceptions + exn + #f + (configf:alist->config + (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) - (with-output-to-file fname - (lambda () - (pp (configf:config->alist cdat))))) - + (let ((dat (configf:config->alist cdat))) + (with-output-to-file fname ;; first write out the file + (lambda () + (pp dat))) + (if (file-exists? fname) ;; now verify it is readable + (if (configf:read-alist fname) + #t ;; data is good. + (begin + (delete-file fname) + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + #f)) + #f))) ;; convert hierarchial list to ini format ;; (define (configf:config->ini data) (map Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -474,13 +474,13 @@ (make-hash-table)))) (testconfig (begin ;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) (runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process (handle-exceptions - exn + exn ;; NOTE: I've no idea why this was written this way. Research, study and fix needed! (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f) - (tests:get-testconfig (db:test-get-testname testdat) test-registry #t)))) + (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t)))) (viewlog (lambda (x) (if (file-exists? logfile) ;(system (conc "firefox " logfile "&")) (dashboard-tests:run-html-viewer logfile) (message-window (conc "File " logfile " not found"))))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -2712,11 +2712,11 @@ ;; (define *monitor-db-path* #f) (define *last-monitor-update-time* 0) ;; Force creation of the db in case it isn't already there. -(tasks:open-db) +;; (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -898,11 +898,10 @@ (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) - ;; (tdbdat (tasks:open-db)) (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) (data-synced 0)) ;; count of changed records (I hope) ;; kill servers (if (member 'killservers options) @@ -2580,24 +2579,33 @@ ;; (db:delay-if-busy) ;; ;; NB// This call only operates on toplevel tests. Consider replacing it with more general call ;; (define (db:set-tests-state-status dbstruct run-id testnames currstate currstatus newstate newstatus) - (for-each (lambda (testname) - (let ((qry (conc "UPDATE tests SET state=?,status=? WHERE " - (if currstate (conc "state='" currstate "' AND ") "") - (if currstatus (conc "status='" currstatus "' AND ") "") - " run_id=? AND testname LIKE ?;")) - (test-id (db:get-test-id dbstruct run-id testname ""))) - (db:with-db - dbstruct - run-id - #t - (lambda (db) - (sqlite3:execute db qry newstate newstatus run-id testname))) - (if test-id (mt:process-triggers dbstruct run-id test-id newstate newstatus)))) - testnames)) + (let ((test-ids '())) + (for-each + (lambda (testname) + (let ((qry (conc "UPDATE tests SET state=?,status=? WHERE " + (if currstate (conc "state='" currstate "' AND ") "") + (if currstatus (conc "status='" currstatus "' AND ") "") + " run_id=? AND testname LIKE ?;")) + (test-id (db:get-test-id dbstruct run-id testname ""))) + (db:with-db + dbstruct + run-id + #t + (lambda (db) + (sqlite3:execute db qry + (or newstate currstate "NOT_STARTED") + (or newstatus currstate "UNKNOWN") + run-id testname))) + (if test-id + (begin + (set! test-ids (cons test-id test-ids)) + (mt:process-triggers dbstruct run-id test-id newstate newstatus))))) + testnames) + test-ids)) ;; ;; speed up for common cases with a little logic ;; ;; NB// Ultimately this will be deprecated in deference to mt:test-set-state-status-by-id ;; ;; NOTE: run-id is not used Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -609,12 +609,11 @@ (iup:vbox ;; (iup:label "Run statistics" #:expand "HORIZONTAL") stats-matrix))) (define (dcommon:servers-table commondat tabdat) - (let* ((tdbdat (tasks:open-db)) - (colnum 0) + (let* ((colnum 0) (rownum 0) (servers-matrix (iup:matrix #:expand "YES" #:numcol 7 #:numcol-visible 7 #:numlin-visible 5 @@ -621,11 +620,10 @@ )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) (let ((servers (server:get-list *toppath* limit: 10))) - ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -57,11 +57,11 @@ ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (server:get-best-guess-address hostname) #f))) (if ipstr ipstr hostn))) ;; hostname))) (start-port (portlogger:open-run-close portlogger:find-port)) - (link-tree-path (configf:lookup *configdat* "setup" "linktree"))) + (link-tree-path (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port) (root-path (if link-tree-path link-tree-path (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP! (handle-directory spiffy-directory-listing) @@ -112,40 +112,40 @@ ;; (define (http-transport:try-start-server ipaddrstr portnum) (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))) (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions - exn - (begin - (print-error-message exn) - (if (< portnum 64000) - (begin - (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - (portlogger:open-run-close portlogger:set-failed portnum) - (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") - (thread-sleep! 0.1) - - ;; get_next_port goes here - (http-transport:try-start-server ipaddrstr - (portlogger:open-run-close portlogger:find-port))) - (begin - (print "ERROR: Tried and tried but could not start the server")))) - ;; any error in following steps will result in a retry - (set! *server-info* (list ipaddrstr portnum)) - (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) - ;; This starts the spiffy server - ;; NEED WAY TO SET IP TO #f TO BIND ALL - ;; (start-server bind-address: ipaddrstr port: portnum) - (if config-hostname ;; this is a hint to bind directly - (start-server port: portnum bind-address: (if (equal? config-hostname "-") - ipaddrstr - config-hostname)) - (start-server port: portnum)) - (portlogger:open-run-close portlogger:set-port portnum "released") - (debug:print 1 *default-log-port* "INFO: server has been stopped")))) + exn + (begin + (print-error-message exn) + (if (< portnum 64000) + (begin + (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + (portlogger:open-run-close portlogger:set-failed portnum) + (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") + (thread-sleep! 0.1) + + ;; get_next_port goes here + (http-transport:try-start-server ipaddrstr + (portlogger:open-run-close portlogger:find-port))) + (begin + (print "ERROR: Tried and tried but could not start the server")))) + ;; any error in following steps will result in a retry + (set! *server-info* (list ipaddrstr portnum)) + (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) + ;; This starts the spiffy server + ;; NEED WAY TO SET IP TO #f TO BIND ALL + ;; (start-server bind-address: ipaddrstr port: portnum) + (if config-hostname ;; this is a hint to bind directly + (start-server port: portnum bind-address: (if (equal? config-hostname "-") + ipaddrstr + config-hostname)) + (start-server port: portnum)) + (portlogger:open-run-close portlogger:set-port portnum "released") + (debug:print 1 *default-log-port* "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -229,30 +229,30 @@ ;; ((exn http client-error) e (print e))) (set! res (vector success (db:string->obj (handle-exceptions - exn - (begin - (set! success #f) - (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (if runremote - (remote-conndat-set! runremote #f)) - ;; Killing associated server to allow clean retry.") - ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? - (mutex-unlock! *http-mutex*) + exn + (begin + (set! success #f) + (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (if runremote + (remote-conndat-set! runremote #f)) + ;; Killing associated server to allow clean retry.") + ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? + (mutex-unlock! *http-mutex*) ;;; (signal (make-composite-condition ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" - (db:obj->string #f)) - (with-input-from-request ;; was dat - fullurl - (list (cons 'key "thekey") - (cons 'cmd cmd) - (cons 'params sparams)) - read-string)) + (db:obj->string #f)) + (with-input-from-request ;; was dat + fullurl + (list (cons 'key "thekey") + (cons 'cmd cmd) + (cons 'params sparams)) + read-string)) transport: 'http) 0)) ;; added this speculatively ;; Shouldn't this be a call to the managed call-all-connections stuff above? (close-all-connections!) (mutex-unlock! *http-mutex*) @@ -440,17 +440,16 @@ (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) - (let ((tdbdat (tasks:open-db))) + (begin ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; - ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up (portlogger:open-run-close portlogger:set-port port "released") (thread-sleep! 1) ;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -12,11 +12,11 @@ ;; launch a task - this runs on the originating host, tests themselves ;; ;;====================================================================== (use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables csv) -(use typed-records pathname-expand) +(use typed-records pathname-expand matchable) (import (prefix base64 base64:)) (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) @@ -413,13 +413,15 @@ (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) - (transport (assoc/default 'transport cmdinfo)) + ;; (transport (assoc/default 'transport cmdinfo)) ;; not used ;; (serverinf (assoc/default 'serverinf cmdinfo)) - (port (assoc/default 'port cmdinfo)) + ;; (port (assoc/default 'port cmdinfo)) + (serverurl (assoc/default 'serverurl cmdinfo)) + (homehost (assoc/default 'homehost cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) @@ -443,10 +445,51 @@ runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) (if contour (setenv "MT_CONTOUR" contour)) + ;; On NFS it can be slow and unreliable to get needed startup information. + ;; i. Check if we are on the homehost, if so, proceed + ;; ii. Check if host and port passed in via CMDINFO are valid and if + ;; possible use them. + (let ((bestadrs (server:get-best-guess-address (get-host-name))) + (needcare #f)) + (if (equal? homehost bestadrs) ;; we are likely on the homehost + (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) + (let ((host-port (if serverurl (string-split serverurl ":") #f))) + (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* + (if (string? homehost) + (if (and host-port + (> (length host-port) 1)) + (let* ((host (car host-port)) + (port (cadr host-port)) + (start-res (http-transport:client-connect host port)) + (ping-res (rmt:login-no-auto-client-setup start-res))) + (if (and start-res + ping-res) + (let ((url (http-transport:server-dat-make-url start-res))) + (remote-conndat-set! *runremote* start-res) + (remote-server-url-set! *runremote* url) + (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")) + (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") + )) + (begin + (debug:print-info 0 *default-log-port* (if host-port + (conc "received invalid host-port information " host-port) + "no host-port information received")) + ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. + (set! needcare #t))) + (begin + (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") + (set! needcare #t))))) + (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host + (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory + (handle-exceptions + exn + (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) + (create-directory logdir #t))))) + ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) @@ -576,11 +619,11 @@ (list "MT_ITEM_INFO" (conc itemdat)) (list "MT_ITEMPATH" item-path) (list "MT_RUNNAME" runname) (list "MT_MEGATEST" megatest) (list "MT_TARGET" target) - (list "MT_LINKTREE" (configf:lookup *configdat* "setup" "linktree")) + (list "MT_LINKTREE" (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree")) (list "MT_TESTSUITENAME" (common:get-testsuite-name)))) (if mt-bindir-path (setenv "PATH" (conc (getenv "PATH") ":" mt-bindir-path))) ;; (change-directory top-path) ;; Can setup as client for server mode now @@ -682,11 +725,11 @@ ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg (if (and *configdat* (or (args:get-arg "-run") (args:get-arg "-runtests") (args:get-arg "-execute"))) - (let* ((linktree (get-environment-variable "MT_LINKTREE")) + (let* ((linktree (common:get-linktree)) ;; (get-environment-variable "MT_LINKTREE")) (target (common:args-get-target exit-if-bad: #t)) (runname (or (args:get-arg "-runname") (args:get-arg ":runname") (getenv "MT_RUNNAME"))) (fulldir (conc linktree "/" @@ -742,11 +785,11 @@ (define (launch:setup-body #!key (force #f) (areapath #f)) (let* ((toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target exit-if-bad: #t)) (linktree (common:get-linktree)) - (contour (args:get-arg "-contour")) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree (if contour (conc "/" contour) "") "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) @@ -800,12 +843,15 @@ (exit 1))) (setenv "MT_RUN_AREA_HOME" *toppath*) ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it (let* ((keys (rmt:get-keys)) (key-vals (keys:target->keyval keys target)) - (linktree (or (getenv "MT_LINKTREE") - (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f))) + (linktree (common:get-linktree)) + ; (or (getenv "MT_LINKTREE") + ; (if *configdat* + ; (configf:lookup *configdat* "setup" "linktree") + ; (conc *toppath* "/lt")))) (second-pass (find-and-read-config mtconfig environ-patt: "env-override" given-toppath: toppath pathenvvar: "MT_RUN_AREA_HOME")) @@ -840,12 +886,11 @@ (set! *configstatus* 'partial)) (begin (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") (exit 2)))))) ;; additional house keeping - (let* ((linktree (or (getenv "MT_LINKTREE") - (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))) + (let* ((linktree (common:get-linktree))) (if linktree (begin (if (not (file-exists? linktree)) (begin (handle-exceptions @@ -917,11 +962,11 @@ (runname (if (string? run-info) ;; if we pass in a string as run-info use it as run-name. run-info (db:get-value-by-header (db:get-rows run-info) (db:get-header run-info) "runname"))) - (contour (args:get-arg "-contour")) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) ;; convert back to db: from rdb: - this is always run at server end (target (string-intersperse (map cadr keyvals) "/")) (not-iterated (equal? "" item-path)) @@ -932,12 +977,14 @@ ;; nb// if itempath is not "" then it is prefixed with "/" (toptest-path (conc disk-path (if contour (conc "/" contour) "") "/" testtop-base)) (test-path (conc disk-path (if contour (conc "/" contour) "") "/" test-base)) ;; ensure this exists first as links to subtests must be created there - (linktree (let ((rd (config-lookup *configdat* "setup" "linktree"))) - (if rd rd (conc *toppath* "/runs")))) + (linktree (common:get-linktree)) + ;; WAS: (let ((rd (config-lookup *configdat* "setup" "linktree"))) + ;; (if rd rd (conc *toppath* "/runs")))) + ;; which seems wrong ... (lnkbase (conc linktree (if contour (conc "/" contour) "") "/" target "/" runname)) (lnkpath (conc lnkbase "/" testname)) (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) (lnktarget (conc lnkpath "/" item-path))) @@ -1088,11 +1135,11 @@ ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ((item-path (item-list->path itemdat)) - (contour (args:get-arg "-contour"))) + (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (string->number (or (configf:lookup *configdat* "setup" "launch-delay") "5")))) (if (> launch-delay delta) (begin (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds") @@ -1185,12 +1232,19 @@ (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) - (list 'transport (conc *transport-type*)) + ;; (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) + (list 'homehost (let* ((hhdat (common:get-homehost))) + (if hhdat + (car hhdat) + #f))) + (list 'serverurl (if *runremote* + (remote-server-url *runremote*) + #f)) ;; (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -530,11 +530,11 @@ (begin (set! *didsomething* #t) ;; suppress the help output. (if (getenv "MT_TARGET") ;; no point in trying if no target (if (args:get-arg "-runname") (let* ((toppath (launch:setup)) - (linktree (if toppath (configf:lookup *configdat* "setup" "linktree"))) + (linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree"))) (runtop (conc linktree "/" (getenv "MT_TARGET") "/" (args:get-arg "-runname"))) (files (if (file-exists? runtop) (append (glob (conc runtop "/.megatest*")) (glob (conc runtop "/.runconfig*"))) '()))) @@ -785,12 +785,11 @@ (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl - (let* ((tdbdat (tasks:open-db)) - (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) + (let* ((servers (server:get-list *toppath*)) (fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n") (servers-to-kill '()) (kill-switch (if (args:get-arg "-kill-server") "-9" "")) (killinfo (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") )) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) @@ -814,15 +813,15 @@ (killed #f) (status (< last-update 20))) ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server - (if (equal? state "dead") - (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. - (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) - (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds - (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) + ;; (if (equal? state "dead") + ;; (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. + ;; (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) + ;; (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds + ;; (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update (if status "alive" "dead") transport) (if (or (equal? id sid) (equal? sid 0)) ;; kill all/any (begin @@ -1183,11 +1182,11 @@ runs-spec) (newline))))) (for-each (lambda (test) - (handle-exceptions + (common:debug-handle-exceptions #f exn (begin (debug:print-error 0 *default-log-port* "Bad data in test record? " test) (print "exn=" (condition->list exn)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -208,11 +208,11 @@ (tal (cdr test-dirs))) ;; Setting MT_LINKTREE here is almost certainly unnecessary. (let ((tconfig-file (conc hed "/" test-name "/testconfig"))) (if (and (file-exists? tconfig-file) (file-read-access? tconfig-file)) - (let ((link-tree-path (configf:lookup *configdat* "setup" "linktree")) + (let ((link-tree-path (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree")) (old-link-tree (get-environment-variable "MT_LINKTREE"))) (if link-tree-path (setenv "MT_LINKTREE" link-tree-path)) (let ((newtcfg (read-config tconfig-file #f #f))) ;; NOTE: Does NOT run [system ...] (hash-table-set! *testconfigs* test-name newtcfg) (if old-link-tree Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -12,11 +12,11 @@ ;; fake out readline usage of toplevel-command (define (toplevel-command . a) #f) (use srfi-1 posix srfi-69 readline ;; regex regex-case srfi-69 apropos json http-client directory-utils rpc typed-records;; (srfi 18) extras) - srfi-18 extras format pkts regex regex-case + srfi-18 extras format pkts pkts regex regex-case (prefix dbi dbi:)) ;; zmq extras) (declare (uses common)) (declare (uses megatest-version)) (declare (uses margs)) @@ -343,11 +343,11 @@ (exists (lookup-by-uuid pdb uuid #f))) (if (not exists) (let* ((pktdat (string-intersperse (with-input-from-file pkt read-lines) "\n")) - (apkt (convert-pkt->alist pktdat)) + (apkt (pkt->alist pktdat)) (ptype (alist-ref 'T apkt))) (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") ))) @@ -354,11 +354,11 @@ pkts)))) (string-split pktsdirs))))) (define (get-pkt-alists pkts) (map (lambda (x) - (alist-ref 'pkta x)) ;; 'pkta pulls out the alist from the read pkt + (alist-ref 'apkt x)) ;; 'pkta pulls out the alist from the read pkt pkts)) ;; given list of pkts (alist mode) return list of D cards as Unix epoch, sorted descending ;; also delete duplicates by target i.e. (car pkt) (define (get-pkt-times pkts) @@ -391,11 +391,12 @@ ((number? sched-in) sched-in) (else (current-seconds)))) (args-data (if args-alist args-alist (hash-table->alist args:arg-hash))) - (alldat (apply append (list 'a action + (alldat (apply append (list 'T "cmd" + 'a action 'U (current-user-name) 'D sched) (map (lambda (x) (let* ((param (car x)) (value (cdr x)) @@ -761,11 +762,11 @@ (val (cdr a)) (par (lookup-param-by-key key))) ;; (print "key: " key " val: " val " par: " par) (if par (conc res " " (param-translate par) " " val) - (if (member key '(a Z U D)) ;; a is the action + (if (member key '(a Z U D T)) ;; a is the action res (begin (print "ERROR: Unknown key in packet \"" key "\" with value \"" val "\"") res))))) (conc "megatest " (if (not (member action '("sync"))) @@ -807,11 +808,11 @@ (pkts (find-pkts pdb '(cmd) '())) (torun (make-hash-table)) ;; target => ( ... info ... ) (rgentargs (hash-table-keys rgconf))) ;; these are the targets registered for automatically triggering (for-each (lambda (pktdat) - (let* ((pkta (alist-ref 'pkta pktdat)) + (let* ((pkta (alist-ref 'apkt pktdat)) (action (alist-ref 'a pkta)) (cmdline (pkt->cmdline pkta)) (uuid (alist-ref 'Z pkta)) (logf (conc logdir "/" uuid "-run.log")) (fullcmd (conc "NBFAKE_LOG=" logf " nbfake " cmdline))) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -50,25 +50,29 @@ db)) (define (portlogger:open-run-close proc . params) (let* ((fname (conc "/tmp/." (current-user-name) "-portlogger.db")) (avail (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away - (handle-exceptions - exn - (begin - ;; (release-dot-lock fname) - (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it - (print-call-chain (current-error-port))) + ;;(handle-exceptions + ;; exn + ;; (begin + ;; ;; (release-dot-lock fname) + ;; (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) + ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + ;; (if (file-exists? fname) + ;; (begin + ;; (debug:print 0 *default-log-port* "Removing portlogger database file " fname) + ;; (delete-file fname))) ;; just get rid of the portlogger file + ;; (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) (sqlite3:finalize! db) ;; (release-dot-lock fname) - res)))) + res))) +;; ) ;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) (define (portlogger:take-port db portnum) (let* ((qry1 (sqlite3:prepare db "INSERT INTO ports (port,state) VALUES (?,?);")) (qry2 (sqlite3:prepare db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;")) @@ -99,24 +103,24 @@ (sqlite3:finalize! qry3) res)) (define (portlogger:get-prev-used-port db) (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* "Continuing anyway.") - #f) - (sqlite3:fold-row - (lambda (var curr) - (or curr var curr)) - #f - db - "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))) + exn + (begin + (debug:print 0 *default-log-port* "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + (print-call-chain (current-error-port)) + (debug:print 0 *default-log-port* "Continuing anyway.") + #f) + (sqlite3:fold-row + (lambda (var curr) + (or curr var curr)) + #f + db + "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))) (define (portlogger:find-port db) (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport"))) (if (and val (string->number val)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -33,17 +33,18 @@ ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; (define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down. (let* ((runremote (or area-dat *runremote*)) - (cinfo (remote-conndat runremote)) - (run-id 0)) - (if cinfo - cinfo - (if (server:check-if-running areapath) - (client:setup areapath) - #f)))) + (cinfo (if (remote? runremote) + (remote-conndat runremote) + #f))) + (if cinfo + cinfo + (if (server:check-if-running areapath) + (client:setup areapath) + #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; @@ -446,11 +447,11 @@ ;; WARNING: This currently bypasses the transaction wrapped writes system (define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment))) -(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) +(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus))) (define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) (if (number? run-id) (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)) Index: runconfigs.config ================================================================== --- runconfigs.config +++ runconfigs.config @@ -1,5 +1,12 @@ +# To get emacs font highlighing in the various megatest configs do this: +# +# Install emacs-goodies-el: +# sudo apt install emacs-goodies-el +# Add to your ~/.emacs file: +# (add-to-list 'auto-mode-alist '("config\\'" . conf-space-mode)) +# # example of a cron entry to run sync using db spec pgdb, with pgdb setting in file local.config # [a/b/c] all:scheduled:sync cron= 0/5 * * * *;dbdest=pgdb;appendconf=/mfs/matt/.sysmaint/local.config Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -21,10 +21,11 @@ (declare (uses runconfig)) (declare (uses tests)) (declare (uses server)) (declare (uses mt)) (declare (uses archive)) +(declare (uses keys)) ;; (declare (uses filedb)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") @@ -56,11 +57,11 @@ (common:args-get-target) (get-environment-variable "MT_TARGET"))) (keys (if inkeys inkeys (rmt:get-keys))) (keyvals (if inkeyvals inkeyvals (keys:target->keyval keys target))) (vals (hash-table-ref/default *env-vars-by-run-id* run-id #f)) - (link-tree (configf:lookup *configdat* "setup" "linktree"))) + (link-tree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) (if testname (setenv "MT_TEST_NAME" testname)) (if itempath (setenv "MT_ITEMPATH" itempath)) ;; get the info from the db and put it in the cache (if link-tree @@ -211,11 +212,11 @@ (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; Generated by a call to (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;; Put fully qualified test/testpath names in this list to be done (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (config-reruns (let ((x (configf:lookup *configdat* "setup" "reruns"))) (if x (string->number x) #f))) (allowed-tests #f)) ;; check if readonly @@ -237,12 +238,12 @@ (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (set! *time-to-exit* #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () - (let ((tdbdat (tasks:open-db))) - (rmt:tasks-set-state-given-param-key task-key "killed")) + ;; (let ((tdbdat (tasks:open-db))) + (rmt:tasks-set-state-given-param-key task-key "killed") ;; ) (print "Killed by signal " signum ". Exiting") (thread-sleep! 3) (exit)))) (th2 (make-thread (lambda () (thread-sleep! 5) @@ -318,13 +319,18 @@ ;; ;; (rmt:general-call 'delete-tests-in-state run-id "NOT_STARTED") ;; Now convert anything in allow-auto-rerun to NOT_STARTED ;; - (for-each (lambda (state) - (rmt:set-tests-state-status run-id test-names state #f "NOT_STARTED" state)) - (string-split (or (configf:lookup *configdat* "setup" "allow-auto-rerun") ""))))) + (for-each + (lambda (state-status) + (let* ((ss-lst (string-split-fields "/" state-status #:infix)) + (state (if (> (length ss-lst) 0)(car ss-lst) #f)) + (status (if (> (length ss-lst) 1)(cadr ss-lst) #f))) + (rmt:set-tests-state-status run-id test-names state status "NOT_STARTED" status))) + ;; list of state/status pairs separated by spaces + (string-split (or (configf:lookup *configdat* "setup" "allow-auto-rerun") ""))))) ;; Ensure all tests are registered in the test_meta table (runs:update-all-test_meta #f) ;; now add non-directly referenced dependencies (i.e. waiton) @@ -1075,11 +1081,11 @@ (string->number mcj) 1))) ;; length of the register queue ahead (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns @@ -1170,13 +1176,13 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ;; every couple minutes verify the server is there for this run - (if (and (common:low-noise-print 60 "try start server" run-id) - (tasks:need-server run-id)) - (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood + ;; (if (and (common:low-noise-print 60 "try start server" run-id) + ;; (tasks:need-server run-id)) + ;; (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) @@ -1647,11 +1653,11 @@ ;; NB// should pass in keys? ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) @@ -1705,11 +1711,11 @@ (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) - (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) + ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -52,18 +52,19 @@ (begin (thread-sleep! 1) (loop (file-exists? fullpath) (- count 1))) (begin + (debug:print 0 *default-log-port* "ERROR: removing the journal file " fullpath ", this is not good. Look for disk full, write access and other issues.") (if remove (system (conc "rm -rf " fullpath))) #f))) #t)))))) (define (tasks:get-task-db-path) (let ((dbdir (or (configf:lookup *configdat* "setup" "monitordir") (configf:lookup *configdat* "setup" "dbdir") - (conc (configf:lookup *configdat* "setup" "linktree") "/.db")))) + (conc (common:get-linktree) "/.db")))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Couldn't create path to " dbdir) (exit 1)) @@ -93,11 +94,11 @@ (tasks:open-db numretries (- numretries 1))) (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " exn=" (condition->list exn)))) - (let* ((dbpath (tasks:get-task-db-path)) + (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path)) (dbfile (conc dbpath "/monitor.db")) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away (exists (file-exists? dbpath)) (write-access (file-write-access? dbpath)) (mdb (cond ;; what the hek is *toppath* doing here? Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -19,13 +19,15 @@ # The NEWTARGET causes some tests to fail. Do not use until this is fixed. NEWTARGET = "$(OS)/$(FS)/$(VER)" TARGET = "ubuntu/nfs/none" -all : build unit test1 test2 test3 test4 test5 test6 test7 test8 test9 +all : build unit test4 +# test1 test2 test3 test4 test5 test6 test7 test8 test9 -unit : basicserver.log runs.log misc.log tests.log +unit : all-rmt.log +# basicserver.log runs.log misc.log tests.log rel : cd release;dashboard -rows 25 & ## basicserver.log : unittests/basicserver.scm Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -99,12 +99,12 @@ # htmlviewercmd firefox -new-window htmlviewercmd arora # -runtests automatically deletes the records for tests with the listed states on starting up a run allowing them to re-run # (nb// this is in addition to NOT_STARTED which is automatically re-run) -# -allow-auto-rerun INCOMPLETE ZERO_ITEMS +# format is STATE/STATUS +allow-auto-rerun /INCOMPLETE /ZERO_ITEMS # could add: STUCK STUCK/DEAD UNKNOWN KILLED KILLREQ PREQ_DISCARD [validvalues] state start end 0 1 - 2 status pass fail n/a 0 1 running - 2 ADDED tests/unittests/all-rmt.scm Index: tests/unittests/all-rmt.scm ================================================================== --- /dev/null +++ tests/unittests/all-rmt.scm @@ -0,0 +1,122 @@ + +;;====================================================================== +;; A L L - R M T +;;====================================================================== + +;; Run like this: +;; +;; ./rununittest.sh all-rmt 1 + +;; Definitions: +;; NTN - no test needed +;; DEP - function is deprecated, no point in testing +;; NED - function nested under others, no test needed. +;; DEF - deferred + +(print "start dir: " (current-directory)) + +(define toppath (current-directory)) +(test #f #t (string?(server:start-and-wait *toppath*))) + +(test "setup for run" #t (begin (launch:setup) + (string? (getenv "MT_RUN_AREA_HOME")))) +(test #f #t (vector? (client:setup toppath))) + +(test #f #t (vector? (rmt:get-connection-info toppath))) ;; TODO: push areapath down. +(test #f #t (string? (server:check-if-running "."))) +;; DEF (test #f #f (rmt:send-receive-no-auto-client-setup *runremote* 'get-keys #f '())) +;; DEF (rmt:kill-server run-id) +;; DEF (rmt:start-server run-id) +(test #f '(#t "successful login")(rmt:login #f)) +;; DEF (rmt:login-no-auto-client-setup connection-info) +(test #f #t (pair? (rmt:get-latest-host-load (get-host-name)))) +(test #f #t (list? (rmt:get-changed-record-ids 0))) +(test #f #f (begin (runs:update-all-test_meta #f) #f)) +(test #f '("test1" "test2")(sort (alist-ref "tagtwo" (hash-table->alist (rmt:get-tests-tags)) equal?) string<=)) +(test #f '() (rmt:get-key-val-pairs 0)) +(test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) +(test #f '("SYSTEM" "RELEASE") (rmt:get-keys-write)) ;; dummy query to force server start +(test #f '() (rmt:get-key-vals 1)) +(test #f (vector '("SYSTEM" "RELEASE") '()) (rmt:get-targets)) +(test #f "" (rmt:get-target 1)) +(test #f #t (rmt:register-test 1 "foo" "")) +(test #f 1 (rmt:get-test-id 1 "foo" "")) +(test #f "foo" (vector-ref (rmt:get-test-info-by-id 1 1) 2)) +(test #f "/tmp/badname" (rmt:test-get-rundir-from-test-id 1 1)) +(test #f '(1) (db:set-tests-state-status *db* 1 '("foo") "COMPLETED" "PASS" "NOT_STARTED" "PASS")) +(test #f '(1) (rmt:set-tests-state-status 1 '("foo") "COMPLETED" "PASS" "NOT_STARTED" "PASS")) +(test #f #t (mt:test-set-state-status-by-id 1 1 "COMPLETED" "PASS" "Just testing!")) +(test #f #t (list? (rmt:get-tests-for-run 1 "%" '() '() #f #f #f #f #f #f 0 #f))) +(test #f #t (list? (rmt:get-tests-for-runs-mindata '(1) "%" '() '() #f))) +(test #f #f (begin (rmt:delete-test-records 1 2) #f)) +(test #f #t (begin (rmt:test-set-state-status 1 1 "COMPLETED" "FAIL" "Another message") #t)) +(test #f 0 (rmt:test-toplevel-num-items 1 "foo")) +(test #f '()(rmt:get-matching-previous-test-run-records 1 "foo" "")) +(test #f '("/tmp/badname" "logs/final.log") (rmt:test-get-logfile-info 1 "foo")) +(test #f '()(rmt:test-get-records-for-index-file 1 "foo")) +(test #f #t (vector? (rmt:get-testinfo-state-status 1 1))) +(test #f #t (rmt:test-set-log! 1 1 "/tmp/another/logfile/eh")) +(test #f #f (begin (rmt:test-set-top-process-pid 1 1 123) #f)) +(test #f 123 (rmt:test-get-top-process-pid 1 1)) +(define keys (rmt:get-keys)) +(test #f '()(rmt:get-run-ids-matching-target keys "%/%" #f "%" "%" "%" "%")) +(test #f '()(rmt:test-get-paths-matching-keynames-target-new keys "%/%" #f "%" "%" "%" "%")) +(test #f '()(rmt:get-prereqs-not-met 1 '() "foo" "")) +(test #f 0 (rmt:get-count-tests-running-for-run-id 1)) +(test #f 0 (rmt:get-count-tests-running 1)) +(test #f 0 (rmt:get-count-tests-running-for-testname 1 "foo")) +(test #f 0 (rmt:get-count-tests-running-in-jobgroup 1 "nada")) +(test #f #f (begin (rmt:set-state-status-and-roll-up-items 1 "foo" "" "COMPLETED" "FAIL" "Just yet another message") #f)) +(test #f #t (rmt:top-test-set-per-pf-counts 1 "foo")) +(test #f '() (rmt:get-raw-run-stats 1)) +(test #f #t (vector? (rmt:get-run-info 1))) +(test #f 0 (rmt:get-num-runs "%")) +(define keypatts '(("SYSTEM" "ubuntu")("RELEASE" "v1.234")) ) +(test #f 1 (rmt:register-run '(("SYSTEM" "ubuntu")("RELEASE" "v1.234")) "bar" "NEW" "JUSTFINE" "bobafett" "quick")) +(test #f "bar" (rmt:get-run-name-from-id 1)) +(test #f #t (begin (rmt:delete-run 2) #t)) ;; delete a non-existant run +(test #f #t (begin (rmt:update-run-stats 1 '()) #t)) +(test #f #t (begin (rmt:delete-old-deleted-test-records) #t)) +(test #f #t (vector? (rmt:get-runs "%" 10 0 keypatts))) +(test #f '(1)(rmt:get-all-run-ids)) +(test #f '()(rmt:get-prev-run-ids 1)) +(test #f #t (begin (rmt:lock/unlock-run 1 #t #f "mikey") #t)) +(test #f "JUSTFINE" (rmt:get-run-status 1)) +(test #f #t (begin (rmt:set-run-status 1 "NOTFINE" msg: "A message") #t)) +(test #f #t (begin (rmt:update-run-event_time 1) #t)) +;; (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default +;; (rmt:find-and-mark-incomplete run-id ovr-deadtime) +;; (rmt:get-main-run-stats run-id) +;; (rmt:get-var varname) +;; (rmt:set-var varname value) +;; (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) +;; (rmt:get-previous-test-run-record run-id test-name item-path) +;; (rmt:get-run-stats) +;; (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) +;; (rmt:get-steps-for-test run-id test-id) +;; (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) +;; (rmt:testmeta-add-record testname) +;; (rmt:testmeta-get-record testname) +;; (rmt:testmeta-update-field test-name fld val) +;; (rmt:test-data-rollup run-id test-id status) +;; (rmt:csv->test-data run-id test-id csvdata) +;; (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) +;; (rmt:tasks-add action owner target runname testpatt params) +;; (rmt:tasks-set-state-given-param-key param-key new-state) +;; (rmt:tasks-get-last target runname) +;; (rmt:archive-get-allocations testname itempath dneeded) +;; (rmt:archive-register-block-name bdisk-id archive-path) +;; (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) +;; (rmt:archive-register-disk bdisk-name bdisk-path df) +;; (rmt:test-set-archive-block-id run-id test-id archive-block-id) +;; (rmt:test-get-archive-block-info archive-block-id) +;; NED (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) +;; NED (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected +;; DEF (test #f #f (rmt:print-db-stats)) +;; DEF (rmt:get-max-query-average run-id) +;; NED (rmt:general-call stmtname run-id . params) +;; DEP (rmt:sdb-qry qry val run-id) +;; DEF (rmt:runtests user run-id testpatt params) +;; DEP (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) +;; DEP (rmt:synchash-get run-id proc synckey keynum params) +;; DEP (test #f #f (rmt:update-pass-fail-counts 1 "foo"))