Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -118,11 +118,11 @@ (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain)) ) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer") + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -248,11 +248,11 @@ ;; (define (common:rotate-logs) (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print-info 0 *default-log-port* "failed to rotate log " file ", probably handled by another process.") (let* ((fullname (conc "logs/" file)) (file-age (- (current-seconds)(file-modification-time fullname)))) (if (or (and (string-match "^.*.log" file) @@ -291,11 +291,11 @@ (cond ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) ((and (file-exists? mtconf) (file-exists? dbfile) (not read-only) (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "Failed to switch versions.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) @@ -397,13 +397,13 @@ (define (common:get-megatest-exe) (or (getenv "MT_MEGATEST") "megatest")) (define (common:read-encoded-string instr) - (handle-exceptions + (common:debug-handle-exceptions #t exn - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) #f) @@ -840,11 +840,11 @@ (loop (car tal)(cdr tal)))))))) (define (common:get-install-area) (let ((exe-path (car (argv)))) (if (file-exists? exe-path) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (pathname-directory (pathname-directory (pathname-directory exe-path)))) @@ -858,11 +858,11 @@ (let loop ((hed (car dirs)) (tal (cdr dirs))) (let ((res (or (and (directory? hed) (file-write-access? hed) hed) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (create-directory hed #t))))) (if (and (string? res) (directory? res)) @@ -874,18 +874,18 @@ ;; return the youngest timestamp . filename ;; (define (common:get-youngest glob-list) (let ((all-files (apply append (map (lambda (patt) - (handle-exceptions + (common:debug-handle-exceptions #t exn '() (glob patt))) glob-list)))) (fold (lambda (fname res) (let ((last-mod (car res)) - (curmod (handle-exceptions + (curmod (common:debug-handle-exceptions #t exn 0 (file-modification-time fname)))) (if (> curmod last-mod) (list curmod fname) @@ -1202,23 +1202,23 @@ ;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 ;; (define (common:lazy-modification-time fpath) (handle-exceptions - exn - 0 - (file-modification-time fpath))) + exn + 0 + (file-modification-time fpath))) ;; find timestamp of newest file associated with a sqlite db file (define (common:lazy-sqlite-db-modification-time fpath) (let* ((glob-list (handle-exceptions - exn - '("/no/such/file") - (glob (conc fpath "*")))) + exn + `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn))) + (glob (conc fpath "*")))) (file-list (if (eq? 0 (length glob-list)) - '("/no/such/file") - glob-list))) + '("/no/such/file") + glob-list))) (apply max (map common:lazy-modification-time file-list)))) @@ -1233,11 +1233,11 @@ ;; make "nice-path" available in config files and the repl (define nice-path common:nice-path) (define (common:read-link-f path) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed.") path) ;; just give up (with-input-from-pipe Index: common_records.scm ================================================================== --- common_records.scm +++ common_records.scm @@ -26,13 +26,27 @@ (define-syntax define-simple-syntax (syntax-rules () ((_ (name arg ...) body ...) (define-syntax name (syntax-rules () ((name arg ...) (begin body ...))))))) +;; (define-syntax common:handle-exceptions +;; (syntax-rules () +;; ((_ exn-in errstmt ...)(common:debug-handle-exceptions #t exn-in errstmt ...)))) + +(define-syntax common:debug-handle-exceptions + (syntax-rules () + ((_ debug exn errstmt body ...) + (if debug + (begin body ...) + (handle-exceptions exn errstmt body ...))))) + (define-syntax common:handle-exceptions (syntax-rules () - ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...)))) + ((_ exn errstmt body ...) + (begin body ...)))) + +;; (define handle-exceptions common:handle-exceptions) ;; iup callbacks are not dumping the stack, this is a work-around ;; (define-simple-syntax (debug:catch-and-dump proc procname) (handle-exceptions @@ -97,11 +111,11 @@ (list? n)) (not (null? (lset-intersection! eq? *verbosity* n)))) ((and (number? *verbosity*) (list? n)) (member *verbosity* n)))) - + (define (debug:setup) (let ((debugstr (or (args:get-arg "-debug") (getenv "MT_DEBUG_MODE")))) (set! *verbosity* (debug:calc-verbosity debugstr)) (debug:check-verbosity *verbosity* debugstr) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -49,11 +49,11 @@ (config:assoc-safe-add (hash-table-ref/default cfgdat section-name '()) var value metadata: metadata))) (define (config:eval-string-in-environment str) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "problem evaluating \"" str "\" in the shell environment") #f) (let ((cmdres (process:cmd-run->list (conc "echo " str)))) @@ -109,11 +109,11 @@ (conc "(lambda (ht)(config-lookup ht \"" sect "\" \"" var "\"))"))) ((runconfigs-get rget) (conc "(lambda (ht)(runconfigs-get ht \"" cmd "\"))")) ;; ((rget) (conc "(lambda (ht)(runconfigs-get ht \"" cmd "\"))")) (else "(lambda (ht)(print \"ERROR\") \"ERROR\")")))) ;; (print "fullcmd=" fullcmd) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "WARNING: failed to process config input \"" l "\"") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print "exn=" (condition->list exn)) @@ -653,19 +653,31 @@ (lambda (section) (hash-table-set! ht (car section)(cdr section))) adat) ht)) +;; if (define (configf:read-alist fname) - (configf:alist->config - (with-input-from-file fname read))) + (handle-exceptions + exn + #f + (configf:alist->config + (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) - (with-output-to-file fname - (lambda () - (pp (configf:config->alist cdat))))) - + (let ((dat (configf:config->alist cdat))) + (with-output-to-file fname ;; first write out the file + (lambda () + (pp dat))) + (if (file-exists? fname) ;; now verify it is readable + (if (configf:read-alist fname) + #t ;; data is good. + (begin + (delete-file fname) + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + #f)) + #f))) ;; convert hierarchial list to ini format ;; (define (configf:config->ini data) (map Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -465,22 +465,22 @@ (item-path (db:test-get-item-path testdat)) ;; this next block was added to fix a bug where variables were ;; needed. Revisit this. (runconfig (let ((runconfigf (conc *toppath* "/runconfigs.config"))) ;; no rush but it would be good to convert this call to use runconfig:read (if (file-exists? runconfigf) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f ;; do nothing, just keep on trucking .... (setup-env-defaults runconfigf run-id (make-hash-table) keydat environ-patt: keystring)) (make-hash-table)))) (testconfig (begin ;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) (runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process - (handle-exceptions - exn + (common:debug-handle-exceptions #t + exn ;; NOTE: I've no idea why this was written this way. Research, study and fix needed! (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f) - (tests:get-testconfig (db:test-get-testname testdat) test-registry #t)))) + (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t)))) (viewlog (lambda (x) (if (file-exists? logfile) ;(system (conc "firefox " logfile "&")) (dashboard-tests:run-html-viewer logfile) (message-window (conc "File " logfile " not found"))))) @@ -513,11 +513,11 @@ (> (current-milliseconds)(+ last-update 250))) ;; every half seconds if db touched (> (current-milliseconds)(+ last-update 10000)) ;; force update even 10 seconds request-update)) (newtestdat (if need-update ;; NOTE: BUG HIDER, try to eliminate this exception handler - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print-info 0 *default-log-port* "test db access issue in examine test for run-id " run-id ", test-id " test-id ": " ((condition-property-accessor 'exn 'message) exn)) (rmt:get-test-info-by-id run-id test-id ))))) ;; (print "INFO: need-update= " need-update " curr-mod-time = " curr-mod-time) (cond Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1899,11 +1899,11 @@ (viewgen (configf:lookup views-cfgdat view-name "viewgen")) (updater (configf:lookup views-cfgdat view-name "updater")) (result-child #f)) (if (and (file-exists? source) (file-read-access? source)) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print-call-chain) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* "ERROR: failed to load " source ", try loading in the repl: megatest -repl") @@ -1911,11 +1911,11 @@ (load source)) (begin (debug:print 0 *default-log-port* "ERROR: cannot find file to load: \"" source "\" for user view " view-name))) ;; now run the user supplied definition for the tab view (if success - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print-call-chain) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* "ERROR: failed call procedure " viewgen @@ -1928,11 +1928,11 @@ ((eval (string->symbol viewgen)) commondat tabs tab-num view-name views-cfgdat *configdat*)))) ;; and finally set the updater (if success (dboard:commondat-add-updater commondat (lambda () - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print-call-chain) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* "ERROR: failed call procedure \"" updater @@ -2712,14 +2712,14 @@ ;; (define *monitor-db-path* #f) (define *last-monitor-update-time* 0) ;; Force creation of the db in case it isn't already there. -(tasks:open-db) +;; (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) @@ -3013,11 +3013,11 @@ (reverse (sqlite3:fold-row (lambda (res t var val) (cons (vector t var val) res)) '() db all-dat-qrystr))) - (let ((zeropt (handle-exceptions + (let ((zeropt (common:debug-handle-exceptions #t exn #f (sqlite3:first-row db all-dat-qrystr)))) (if zeropt ;; NOTE: Add zeropt to the beginning of the list as the list was reversed above. (hash-table-set! res-ht Index: datashare.scm ================================================================== --- datashare.scm +++ datashare.scm @@ -228,11 +228,11 @@ (file-read-access? path)) (let* ((dbpath (conc path "/datashare.db")) (writeable (file-write-access? dbpath)) (dbexists (file-exists? dbpath)) (handler (make-busy-timeout 136000))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 2 *default-log-port* "ERROR: problem accessing db " dbpath ((condition-property-accessor 'exn 'message) exn)) (exit)) @@ -243,11 +243,11 @@ (datashare:initialize-db db))) db) (print "ERROR: invalid path for storing database: " path)))) (define (open-run-close-exception-handling proc idb . params) - (handle-exceptions + (common:debug-handle-exceptions #t exn (let ((sleep-time (random 30)) (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (case err-status ((busy) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -74,11 +74,11 @@ (print-call-chain (current-error-port)))) ;; convert to -inline ;; (define (db:first-result-default db stmt default . params) - (handle-exceptions + (common:debug-handle-exceptions #t exn (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default @@ -145,11 +145,11 @@ (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) ;; there is no recovering at this time. exit @@ -190,11 +190,11 @@ ;; ;; If run-id is #f return to create and retrieve the path where the db will live. ;; (define (db:dbfile-path . junk) ;; run-id) (let* ((dbdir (common:get-db-tmp-area))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "Couldn't create path to " dbdir) (exit 1)) (if (not (directory? dbdir))(create-directory dbdir #t))) @@ -259,11 +259,11 @@ ;; ;; ;; (define (db:open-rundb dbstruct run-id #!key (attemptnum 0)(do-not-open #f)) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) ;; (let* ((dbfile (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) ;; (dbexists (file-exists? dbfile)) ;; (db (db:lock-create-open dbfile (lambda (db) -;; (handle-exceptions +;; (common:debug-handle-exceptions #t ;; exn ;; (begin ;; ;; (release-dot-lock dbpath) ;; (if (> attemptnum 2) ;; (debug:print-error 0 *default-log-port* "tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath) @@ -540,11 +540,11 @@ ;; handle special cases, megatest.db and monitor.db ;; ;; NOPE: apply this same approach to all db files ;; (else ;; ((equal? fname "megatest.db") ;; this file can be regenerated if needed - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin ;; (db:move-and-recreate-db dbdat) (if (> numtries 0) (db:repair-db dbdat numtries: (- numtries 1)) @@ -584,11 +584,11 @@ ;; if last-update specified ("field-name" . time-in-seconds) ;; then sync only records where field-name >= time-in-seconds ;; IFF field-name exists ;; (define (db:sync-tables tbls last-update fromdb todb . slave-dbs) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable in db:sync-tables.") (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) @@ -898,11 +898,10 @@ (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) - ;; (tdbdat (tasks:open-db)) (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) (data-synced 0)) ;; count of changed records (I hope) ;; kill servers (if (member 'killservers options) @@ -1052,11 +1051,11 @@ (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling END" ) res) #f)) (define (open-run-close-exception-handling proc idb . params) - (handle-exceptions + (common:debug-handle-exceptions #t exn (let ((sleep-time (random 30)) (err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (case err-status ((busy) @@ -2580,24 +2579,33 @@ ;; (db:delay-if-busy) ;; ;; NB// This call only operates on toplevel tests. Consider replacing it with more general call ;; (define (db:set-tests-state-status dbstruct run-id testnames currstate currstatus newstate newstatus) - (for-each (lambda (testname) - (let ((qry (conc "UPDATE tests SET state=?,status=? WHERE " - (if currstate (conc "state='" currstate "' AND ") "") - (if currstatus (conc "status='" currstatus "' AND ") "") - " run_id=? AND testname LIKE ?;")) - (test-id (db:get-test-id dbstruct run-id testname ""))) - (db:with-db - dbstruct - run-id - #t - (lambda (db) - (sqlite3:execute db qry newstate newstatus run-id testname))) - (if test-id (mt:process-triggers dbstruct run-id test-id newstate newstatus)))) - testnames)) + (let ((test-ids '())) + (for-each + (lambda (testname) + (let ((qry (conc "UPDATE tests SET state=?,status=? WHERE " + (if currstate (conc "state='" currstate "' AND ") "") + (if currstatus (conc "status='" currstatus "' AND ") "") + " run_id=? AND testname LIKE ?;")) + (test-id (db:get-test-id dbstruct run-id testname ""))) + (db:with-db + dbstruct + run-id + #t + (lambda (db) + (sqlite3:execute db qry + (or newstate currstate "NOT_STARTED") + (or newstatus currstate "UNKNOWN") + run-id testname))) + (if test-id + (begin + (set! test-ids (cons test-id test-ids)) + (mt:process-triggers dbstruct run-id test-id newstate newstatus))))) + testnames) + test-ids)) ;; ;; speed up for common cases with a little logic ;; ;; NB// Ultimately this will be deprecated in deference to mt:test-set-state-status-by-id ;; ;; NOTE: run-id is not used @@ -3668,11 +3676,11 @@ (and dbdat (db:dbdat-get-db dbdat)) (if dbdat (let* ((dbpath (db:dbdat-get-path dbdat)) (db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline (dbfj (conc dbpath "-journal"))) - (if (handle-exceptions + (if (common:debug-handle-exceptions #t exn (begin (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj) (thread-sleep! 1) (db:delay-if-busy count (- count 1))) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -609,12 +609,11 @@ (iup:vbox ;; (iup:label "Run statistics" #:expand "HORIZONTAL") stats-matrix))) (define (dcommon:servers-table commondat tabdat) - (let* ((tdbdat (tasks:open-db)) - (colnum 0) + (let* ((colnum 0) (rownum 0) (servers-matrix (iup:matrix #:expand "YES" #:numcol 7 #:numcol-visible 7 #:numlin-visible 5 @@ -621,11 +620,10 @@ )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) (let ((servers (server:get-list *toppath* limit: 10))) - ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -112,40 +112,40 @@ ;; (define (http-transport:try-start-server ipaddrstr portnum) (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))) (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions - exn - (begin - (print-error-message exn) - (if (< portnum 64000) - (begin - (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - (portlogger:open-run-close portlogger:set-failed portnum) - (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") - (thread-sleep! 0.1) - - ;; get_next_port goes here - (http-transport:try-start-server ipaddrstr - (portlogger:open-run-close portlogger:find-port))) - (begin - (print "ERROR: Tried and tried but could not start the server")))) - ;; any error in following steps will result in a retry - (set! *server-info* (list ipaddrstr portnum)) - (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) - ;; This starts the spiffy server - ;; NEED WAY TO SET IP TO #f TO BIND ALL - ;; (start-server bind-address: ipaddrstr port: portnum) - (if config-hostname ;; this is a hint to bind directly - (start-server port: portnum bind-address: (if (equal? config-hostname "-") - ipaddrstr - config-hostname)) - (start-server port: portnum)) - (portlogger:open-run-close portlogger:set-port portnum "released") - (debug:print 1 *default-log-port* "INFO: server has been stopped")))) + exn + (begin + (print-error-message exn) + (if (< portnum 64000) + (begin + (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + (portlogger:open-run-close portlogger:set-failed portnum) + (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") + (thread-sleep! 0.1) + + ;; get_next_port goes here + (http-transport:try-start-server ipaddrstr + (portlogger:open-run-close portlogger:find-port))) + (begin + (print "ERROR: Tried and tried but could not start the server")))) + ;; any error in following steps will result in a retry + (set! *server-info* (list ipaddrstr portnum)) + (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) + ;; This starts the spiffy server + ;; NEED WAY TO SET IP TO #f TO BIND ALL + ;; (start-server bind-address: ipaddrstr port: portnum) + (if config-hostname ;; this is a hint to bind directly + (start-server port: portnum bind-address: (if (equal? config-hostname "-") + ipaddrstr + config-hostname)) + (start-server port: portnum)) + (portlogger:open-run-close portlogger:set-port portnum "released") + (debug:print 1 *default-log-port* "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -229,30 +229,30 @@ ;; ((exn http client-error) e (print e))) (set! res (vector success (db:string->obj (handle-exceptions - exn - (begin - (set! success #f) - (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (if runremote - (remote-conndat-set! runremote #f)) - ;; Killing associated server to allow clean retry.") - ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? - (mutex-unlock! *http-mutex*) + exn + (begin + (set! success #f) + (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (if runremote + (remote-conndat-set! runremote #f)) + ;; Killing associated server to allow clean retry.") + ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? + (mutex-unlock! *http-mutex*) ;;; (signal (make-composite-condition ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" - (db:obj->string #f)) - (with-input-from-request ;; was dat - fullurl - (list (cons 'key "thekey") - (cons 'cmd cmd) - (cons 'params sparams)) - read-string)) + (db:obj->string #f)) + (with-input-from-request ;; was dat + fullurl + (list (cons 'key "thekey") + (cons 'cmd cmd) + (cons 'params sparams)) + read-string)) transport: 'http) 0)) ;; added this speculatively ;; Shouldn't this be a call to the managed call-all-connections stuff above? (close-all-connections!) (mutex-unlock! *http-mutex*) @@ -440,17 +440,16 @@ (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) - (let ((tdbdat (tasks:open-db))) + (begin ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; - ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up (portlogger:open-run-close portlogger:set-port port "released") (thread-sleep! 1) ;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -363,11 +363,11 @@ (pids (delete-duplicates (filter number? (list pid1 pid2))))) (if (not (null? pids)) (begin (for-each (lambda (pid) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-info 0 *default-log-port* "Unable to kill process with pid " pid ", possibly already killed.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) (debug:print 0 *default-log-port* "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") @@ -378,11 +378,11 @@ (process-signal pid-num signal/term)) (process:get-sub-pids pid)) (thread-sleep! 5) ;; (if (process:process-alive? pid) (map (lambda (pid-num) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) @@ -846,18 +846,18 @@ (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))) (if linktree (begin (if (not (file-exists? linktree)) (begin - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (exit 1)) (create-directory linktree #t)))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) (let ((tlink (conc *toppath* "/lt"))) @@ -952,11 +952,11 @@ (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... (if (and (not (directory-exists? lnkbase)) (not (file-exists? lnkbase))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) (print-error-message exn (current-error-port))) (create-directory lnkbase #t))) @@ -971,28 +971,28 @@ ;; to the iteration. use pathname-directory to trim the path by one ;; level (if (not not-iterated) ;; i.e. iterated (let ((iterated-parent (pathname-directory (conc lnkpath "/" item-path)))) (debug:print-info 2 *default-log-port* "Creating iterated parent " iterated-parent) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (create-directory iterated-parent #t)))) (if (symbolic-link? lnkpath) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (delete-file lnkpath))) (if (not (or (file-exists? lnkpath) (symbolic-link? lnkpath))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (create-symbolic-link toptest-path lnkpath))) @@ -1021,11 +1021,11 @@ ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f ;; don't care to catch and deal with errors here for now. (create-directory toptest-path #t)) (hash-table-set! *toptest-paths* testname toptest-path))))) @@ -1033,11 +1033,11 @@ ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 *default-log-port* "Setting up sub test run area") (debug:print 2 *default-log-port* " - creating run area in " test-path) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (create-directory test-path #t)) @@ -1044,11 +1044,11 @@ (debug:print 2 *default-log-port* " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) Index: lock-queue.scm ================================================================== --- lock-queue.scm +++ lock-queue.scm @@ -39,11 +39,11 @@ (db (sqlite3:open-database actualfname)) (handler (make-busy-timeout 136000))) (if dbexists (vector db actualfname) (begin - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (thread-sleep! 10) (if (> count 0) (lock-queue:open-db fname count: (- count 1)) @@ -69,11 +69,11 @@ (sqlite3:set-busy-handler! db handler) (vector db actualfname))) (define (lock-queue:set-state dbdat test-id newstate #!key (remtries 10)) (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200) - (handle-exceptions + (common:debug-handle-exceptions #t exn (if (> remtries 0) (begin (debug:print 0 *default-log-port* "WARNING: exception on lock-queue:set-state. Trying again in 30 seconds.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) @@ -87,11 +87,11 @@ test-id))) (define (lock-queue:any-younger? dbdat mystart test-id #!key (remtries 10)) ;; no need to wait on journal on read only queries ;; (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200) - (handle-exceptions + (common:debug-handle-exceptions #t exn (if (> remtries 0) (begin (debug:print 0 *default-log-port* "WARNING: exception on lock-queue:any-younger. Removing lockdb and trying again in 5 seconds.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) @@ -116,11 +116,11 @@ (let* ((res #f) (db (lock-queue:db-dat-get-db dbdat)) (lckqry (sqlite3:prepare db "SELECT test_id,run_lock FROM runlocks WHERE run_lock='locked';")) (mklckqry (sqlite3:prepare db "INSERT INTO runlocks (test_id,run_lock) VALUES (?,'locked');"))) (let ((result - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "WARNING: failed to get queue lock. Removing lock db and returning fail") ;; Will try again in a few seconds") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) @@ -148,11 +148,11 @@ result))) (define (lock-queue:release-lock fname test-id #!key (count 10)) (let* ((dbdat (lock-queue:open-db fname))) (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:release-lock; waiting on journal") - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "WARNING: Failed to release queue lock. Will try again in few seconds") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! (/ count 10)) @@ -161,11 +161,11 @@ (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat)) (lock-queue:release-lock fname test-id count: (- count 1))) (let ((journal (conc fname "-journal"))) ;; If we've tried ten times and failed there is a serious problem ;; try to remove the lock db and allow it to be recreated - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (if (file-exists? journal)(delete-file journal)) (if (file-exists? fname) (delete-file fname)) #f)))) @@ -173,11 +173,11 @@ (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))))) (define (lock-queue:steal-lock dbdat test-id #!key (count 10)) (debug:print-info 0 *default-log-port* "Attempting to steal lock at " (lock-queue:db-dat-get-path dbdat)) (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:steal-lock; waiting on journal") - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "WARNING: Failed to steal queue lock. Will try again in few seconds") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) @@ -194,11 +194,11 @@ (define (lock-queue:wait-turn fname test-id #!key (count 10)(waiting-msg #f)) (let* ((dbdat (lock-queue:open-db fname)) (mystart (current-seconds)) (db (lock-queue:db-dat-get-db dbdat))) ;; (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file") - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -467,11 +467,11 @@ (exit-handler (lambda (#!optional (exit-code 0)) (printf "Preparing to exit with exit code ~A ...\n" exit-code) (for-each (lambda (pid) - (handle-exceptions + (common:debug-handle-exceptions #t exn #t (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) (if (or (eq? pid-val pid) (eq? pid-val 0)) @@ -538,11 +538,11 @@ (debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.") (begin (debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) (for-each (lambda (f) - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f) (delete-file f))) files)))) (debug:print-error 0 *default-log-port* "-clean-cache requires -runname.")) @@ -781,12 +781,11 @@ (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl - (let* ((tdbdat (tasks:open-db)) - (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) + (let* ((servers (server:get-list *toppath*)) (fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n") (servers-to-kill '()) (kill-switch (if (args:get-arg "-kill-server") "-9" "")) (killinfo (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") )) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) @@ -810,15 +809,15 @@ (killed #f) (status (< last-update 20))) ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server - (if (equal? state "dead") - (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. - (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) - (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds - (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) + ;; (if (equal? state "dead") + ;; (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. + ;; (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) + ;; (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds + ;; (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update (if status "alive" "dead") transport) (if (or (equal? id sid) (equal? sid 0)) ;; kill all/any (begin @@ -1179,11 +1178,11 @@ runs-spec) (newline))))) (for-each (lambda (test) - (handle-exceptions + (common:debug-handle-exceptions #f exn (begin (debug:print-error 0 *default-log-port* "Bad data in test record? " test) (print "exn=" (condition->list exn)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -201,15 +201,15 @@ (or (configf:lookup torun contour runkey) '())))) (define (fossil:clone-or-sync url name dest-dir) (let ((targ-file (conc dest-dir "/" name))) ;; do not force usage of .fossil extension - (handle-exceptions + (common:debug-handle-exceptions #t exn (print "ERROR: failed to create directory " dest-dir " message: " ((condition-property-accessor 'exn 'message) exn)) (create-directory dest-dir #t)) - (handle-exceptions + (common:debug-handle-exceptions #t exn (print "ERROR: failed to clone or sync 1ossil " url " message: " ((condition-property-accessor 'exn 'message) exn)) (if (file-exists? targ-file) (system (conc "fossil pull --once " url " -R " targ-file)) (system (conc "fossil clone " url " " targ-file)) @@ -216,19 +216,19 @@ )))) (define (fossil:last-change-node-and-time fossils-dir fossil-name branch) (let* ((fossil-file (conc fossils-dir "/" fossil-name)) (timeline-port (if (file-read-access? fossil-file) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print "ERROR: failed to get timeline from " fossil-file " message: " ((condition-property-accessor 'exn 'message) exn)) #f) (open-input-pipe (conc "fossil timeline -t ci -W 0 -n 0 -R " fossil-file))) #f)) (get-line (lambda () - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print "ERROR: failed to read from file " fossil-file " message: " ((condition-property-accessor 'exn 'message) exn)) #f) (read-line timeline-port)))) @@ -391,11 +391,12 @@ ((number? sched-in) sched-in) (else (current-seconds)))) (args-data (if args-alist args-alist (hash-table->alist args:arg-hash))) - (alldat (apply append (list 'a action + (alldat (apply append (list 'T "cmd" + 'a action 'U (current-user-name) 'D sched) (map (lambda (x) (let* ((param (car x)) (value (cdr x)) @@ -449,11 +450,11 @@ (new-target (if area-xlatr (let ((xlatr-key (string->symbol area-xlatr))) (if (alist-ref xlatr-key *target-mappers*) (begin (print "Using target mapper: " area-xlatr) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print "FAILED TO RUN TARGET MAPPER FOR " area ", called " area-xlatr) (print " function is: " (alist-ref xlatr-key *target-mappers*)) (print " message: " ((condition-property-accessor 'exn 'message) exn)) @@ -594,11 +595,11 @@ (lambda (cmd) (print "cmd: " cmd) (let* ((script (car cmd)) (params (cdr cmd)) (cmd (conc script " " contour " " runkey " " std-runname " " action " " params)) - (res (handle-exceptions + (res (common:debug-handle-exceptions #t exn #f (print "Running " cmd) (with-input-from-pipe cmd read-lines)))) (if (and res (not (null? res))) @@ -787,11 +788,11 @@ ;; (define (dispatch-commands mtconf toppath) ;; we are expecting a directory "logs", check and create it, create the log in /tmp if not able to create logs dir (let ((logdir (if (if (not (directory? "logs")) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (create-directory "logs") #t) #t) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -50,25 +50,29 @@ db)) (define (portlogger:open-run-close proc . params) (let* ((fname (conc "/tmp/." (current-user-name) "-portlogger.db")) (avail (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away - (handle-exceptions - exn - (begin - ;; (release-dot-lock fname) - (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it - (print-call-chain (current-error-port))) + ;;(common:debug-handle-exceptions #t + ;; exn + ;; (begin + ;; ;; (release-dot-lock fname) + ;; (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) + ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + ;; (if (file-exists? fname) + ;; (begin + ;; (debug:print 0 *default-log-port* "Removing portlogger database file " fname) + ;; (delete-file fname))) ;; just get rid of the portlogger file + ;; (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) (sqlite3:finalize! db) ;; (release-dot-lock fname) - res)))) + res))) +;; ) ;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) (define (portlogger:take-port db portnum) (let* ((qry1 (sqlite3:prepare db "INSERT INTO ports (port,state) VALUES (?,?);")) (qry2 (sqlite3:prepare db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;")) @@ -99,24 +103,24 @@ (sqlite3:finalize! qry3) res)) (define (portlogger:get-prev-used-port db) (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* "exn=" (condition->list exn)) - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* "Continuing anyway.") - #f) - (sqlite3:fold-row - (lambda (var curr) - (or curr var curr)) - #f - db - "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))) + exn + (begin + (debug:print 0 *default-log-port* "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "exn=" (condition->list exn)) + (print-call-chain (current-error-port)) + (debug:print 0 *default-log-port* "Continuing anyway.") + #f) + (sqlite3:fold-row + (lambda (var curr) + (or curr var curr)) + #f + db + "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))) (define (portlogger:find-port db) (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport"))) (if (and val (string->number val)) @@ -153,11 +157,11 @@ (define (portlogger:main . args) (let* ((dbfname (conc "/tmp/." (current-user-name) "-portlogger.db")) (db (portlogger:open-db dbfname)) (numargs (length args)) (result - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -23,11 +23,11 @@ (loop (conc res (read-char port))) res))) (define (process:cmd-run-with-stderr->list cmd . params) ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) -;; (handle-exceptions +;; (common:debug-handle-exceptions #t ;; exn ;; (begin ;; (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) ;; (print " " ((condition-property-accessor 'exn 'message) exn)) ;; #f) @@ -48,11 +48,11 @@ (close-output-port fho) result))))) ;; ) (define (process:cmd-run-proc-each-line cmd proc . params) ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) @@ -140,21 +140,21 @@ (let ((pid (string->number inl))) (if proc (proc pid)) (loop (read-line) (cons pid res)))))))) (define (process:alive? pid) - (handle-exceptions + (common:debug-handle-exceptions #t exn ;; possibly pid is a process not a child, look in /proc to see if it is running still (file-exists? (conc "/proc/" pid)) (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) (and (number? rpid) (equal? rpid pid))))) (define (process:alive-on-host? host pid) (let ((cmd (conc "ssh " host " ps -o pid= -p " pid))) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f ;; anything goes wrong - assume the process in NOT running. (with-input-from-pipe cmd (lambda () Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -194,11 +194,11 @@ (server:start-and-wait *toppath*) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) -;; (handle-exceptions +;; (common:debug-handle-exceptions #t ;; exn ;; (begin ;; (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats") ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print "exn=" (condition->list exn)) @@ -293,11 +293,11 @@ (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) - (res (handle-exceptions + (res (common:debug-handle-exceptions #t exn #f (http-transport:client-api-send-receive run-id connection-info cmd params)))) (if (and res (vector-ref res 0)) (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! @@ -446,11 +446,11 @@ ;; WARNING: This currently bypasses the transaction wrapped writes system (define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment))) -(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) +(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus))) (define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) (if (number? run-id) (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)) Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -24,11 +24,11 @@ (include "common_records.scm") (include "db_records.scm") ;; procstr is the name of the procedure to be called as a string (define (rpc-transport:autoremote procstr params) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 1 *default-log-port* "Remote failed for " proc " " params) (apply (eval (string->symbol procstr)) params)) ;; (if *runremote* @@ -136,21 +136,21 @@ (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") )))))) (define (rpc-transport:find-free-port-and-open port) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") (rpc-transport:find-free-port-and-open (+ port 1))) (rpc:default-server-port port) (tcp-read-timeout 240000) (tcp-listen (rpc:default-server-port) 10000))) (define (rpc-transport:ping run-id host port) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print "SERVER_NOT_FOUND") (exit 1)) (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) @@ -204,11 +204,11 @@ ;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f))) ;; (if (and port ;; (string->number port)) ;; (let ((portn (string->number port))) ;; (debug:print-info 2 *default-log-port* "Setting up to connect to host " host ":" port) -;; (handle-exceptions +;; (common:debug-handle-exceptions #t ;; exn ;; (begin ;; (debug:print-error 0 *default-log-port* "Failed to open a connection to the server at host: " host " port: " port) ;; (debug:print 0 *default-log-port* " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) ;; ;; (open-run-close Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -211,11 +211,11 @@ (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; Generated by a call to (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;; Put fully qualified test/testpath names in this list to be done (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (config-reruns (let ((x (configf:lookup *configdat* "setup" "reruns"))) (if x (string->number x) #f))) (allowed-tests #f)) ;; check if readonly @@ -237,12 +237,12 @@ (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (set! *time-to-exit* #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () - (let ((tdbdat (tasks:open-db))) - (rmt:tasks-set-state-given-param-key task-key "killed")) + ;; (let ((tdbdat (tasks:open-db))) + (rmt:tasks-set-state-given-param-key task-key "killed") ;; ) (print "Killed by signal " signum ". Exiting") (thread-sleep! 3) (exit)))) (th2 (make-thread (lambda () (thread-sleep! 5) @@ -318,13 +318,18 @@ ;; ;; (rmt:general-call 'delete-tests-in-state run-id "NOT_STARTED") ;; Now convert anything in allow-auto-rerun to NOT_STARTED ;; - (for-each (lambda (state) - (rmt:set-tests-state-status run-id test-names state #f "NOT_STARTED" state)) - (string-split (or (configf:lookup *configdat* "setup" "allow-auto-rerun") ""))))) + (for-each + (lambda (state-status) + (let* ((ss-lst (string-split-fields "/" state-status #:infix)) + (state (if (> (length ss-lst) 0)(car ss-lst) #f)) + (status (if (> (length ss-lst) 1)(cadr ss-lst) #f))) + (rmt:set-tests-state-status run-id test-names state status "NOT_STARTED" status))) + ;; list of state/status pairs separated by spaces + (string-split (or (configf:lookup *configdat* "setup" "allow-auto-rerun") ""))))) ;; Ensure all tests are registered in the test_meta table (runs:update-all-test_meta #f) ;; now add non-directly referenced dependencies (i.e. waiton) @@ -422,11 +427,11 @@ (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) (th1 (make-thread (lambda () (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)) - ;; (handle-exceptions + ;; (common:debug-handle-exceptions #t ;; exn ;; (begin ;; (print-call-chain (current-error-port)) ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) ;; (if (> run-queue-retries 0) @@ -438,11 +443,11 @@ (th2 (make-thread (lambda () ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) run-ids))) "runs: mark-incompletes"))) @@ -1075,11 +1080,11 @@ (string->number mcj) 1))) ;; length of the register queue ahead (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns @@ -1170,13 +1175,13 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ;; every couple minutes verify the server is there for this run - (if (and (common:low-noise-print 60 "try start server" run-id) - (tasks:need-server run-id)) - (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood + ;; (if (and (common:low-noise-print 60 "try start server" run-id) + ;; (tasks:need-server run-id)) + ;; (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) @@ -1647,11 +1652,11 @@ ;; NB// should pass in keys? ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) @@ -1705,11 +1710,11 @@ (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) - (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) + ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) @@ -1872,18 +1877,18 @@ (debug:print 0 *default-log-port* "WARNING: directory " real-dir " does not exist") (debug:print 0 *default-log-port* "WARNING: no real directory corrosponding to link " run-dir ", nothing done"))) (if (symbolic-link? run-dir) (begin (debug:print-info 1 *default-log-port* "Removing symlink " run-dir) - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") (delete-file run-dir))) (if (directory? run-dir) (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0) (debug:print 0 *default-log-port* "WARNING: refusing to remove " run-dir " as it is not empty") - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") (delete-directory run-dir))) (if (and run-dir (not (member run-dir (list "n/a" "/tmp/badname")))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -183,11 +183,11 @@ (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) - (let* ((mod-time (handle-exceptions + (let* ((mod-time (common:debug-handle-exceptions #t exn 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) @@ -287,11 +287,11 @@ (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. (define (server:dotserver-age-seconds areapath) (let ((server-file (conc areapath "/.server"))) (begin - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (- (current-seconds) (file-modification-time server-file)))))) Index: sharedat.scm ================================================================== --- sharedat.scm +++ sharedat.scm @@ -112,11 +112,11 @@ (directory? path) (file-read-access? path)) (let* ((dbpath (conc path "/spublish.db")) (writeable (file-write-access? dbpath)) (dbexists (file-exists? dbpath))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 2 *default-log-port* "ERROR: problem accessing db " dbpath ((condition-property-accessor 'exn 'message) exn)) (exit 1)) Index: spublish.scm ================================================================== --- spublish.scm +++ spublish.scm @@ -112,11 +112,11 @@ (directory? path) (file-read-access? path)) (let* ((dbpath (conc path "/spublish.db")) (writeable (file-write-access? dbpath)) (dbexists (file-exists? dbpath))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 2 *default-log-port* "ERROR: problem accessing db " dbpath ((condition-property-accessor 'exn 'message) exn)) (exit 1)) Index: sretrieve.scm ================================================================== --- sretrieve.scm +++ sretrieve.scm @@ -121,11 +121,11 @@ (directory? path) (file-read-access? path)) (let* ((dbpath (conc path "/" *exe-name* ".db")) (writeable (file-write-access? dbpath)) (dbexists (file-exists? dbpath))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 2 *default-log-port* "ERROR: problem accessing db " dbpath ((condition-property-accessor 'exn 'message) exn)) (exit 1)) @@ -485,11 +485,11 @@ ;; /.config file using ;; as an input (if (file-exists? upstream-file) (if (or (not (file-exists? package-config)) ;; if not created call the updater, otherwise call only if upstream newer (> (file-modification-time upstream-file)(file-modification-time package-config))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (debug:print-error 0 *default-log-port* "failed to run script " conversion-script " with params " upstream-file " " package-config) (let ((pid (process-run conversion-script (list upstream-file package-config)))) (process-wait pid))) (debug:print 0 *default-log-port* "Skipping update of " package-config " from " upstream-file)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -52,19 +52,20 @@ (begin (thread-sleep! 1) (loop (file-exists? fullpath) (- count 1))) (begin + (debug:print 0 *default-log-port* "ERROR: removing the journal file " fullpath ", this is not good. Look for disk full, write access and other issues.") (if remove (system (conc "rm -rf " fullpath))) #f))) #t)))))) (define (tasks:get-task-db-path) (let ((dbdir (or (configf:lookup *configdat* "setup" "monitordir") (configf:lookup *configdat* "setup" "dbdir") (conc (configf:lookup *configdat* "setup" "linktree") "/.db")))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print-error 0 *default-log-port* "Couldn't create path to " dbdir) (exit 1)) (if (not (directory? dbdir))(create-directory dbdir #t))) @@ -80,11 +81,11 @@ ;; ==> open in-mem version ;; (define (tasks:open-db #!key (numretries 4)) (if *task-db* *task-db* - (handle-exceptions + (common:debug-handle-exceptions #t exn (if (> numretries 0) (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) @@ -93,11 +94,11 @@ (tasks:open-db numretries (- numretries 1))) (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " exn=" (condition->list exn)))) - (let* ((dbpath (tasks:get-task-db-path)) + (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path)) (dbfile (conc dbpath "/monitor.db")) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away (exists (file-exists? dbpath)) (write-access (file-write-access? dbpath)) (mdb (cond ;; what the hek is *toppath* doing here? @@ -468,11 +469,11 @@ (define (tasks:param-key->id dbstruct task-params) (db:with-db dbstruct #f #f (lambda (db) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f (sqlite3:first-result db "SELECT id FROM tasks_queue WHERE params LIKE ?;" task-params))))) @@ -484,19 +485,19 @@ (define (tasks:get-records-given-param-key dbstruct param-key state-patt action-patt test-patt) (db:with-db dbstruct #f #f (lambda (db) - (handle-exceptions + (common:debug-handle-exceptions #t exn '() (sqlite3:first-row db "SELECT id,action,owner,state,target,name,testpatt,keylock,params WHERE params LIKE ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;" param-key state-patt action-patt test-patt))))) (define (tasks:find-task-queue-records dbstruct target run-name test-patt state-patt action-patt) - ;; (handle-exceptions + ;; (common:debug-handle-exceptions #t ;; exn ;; '() ;; (sqlite3:first-row (let ((db (db:delay-if-busy (db:get-db dbstruct))) (res '())) @@ -528,11 +529,11 @@ (pid (string->number (caddr match-dat)))) (debug:print 0 *default-log-port* "Sending SIGINT to process " pid " on host " hostname) (if (equal? (get-host-name) hostname) (if (process:alive? pid) (begin - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "Kill of process " pid " on host " hostname " failed.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) #t) @@ -593,11 +594,11 @@ ;; (define (tasks:set-area dbh configdat #!key (toppath #f)) ;; could I safely put *toppath* in for the default for toppath? when would it be evaluated? (let loop ((area-name (or (configf:lookup configdat "setup" "area-name") (common:get-area-name))) (modifier 'none)) - (let ((success (handle-exceptions + (let ((success (common:debug-handle-exceptions #t exn (begin (debug:print 0 *default-log-port* "ERROR: cannot create area entry, " ((condition-property-accessor 'exn 'message) exn)) #f) ;; FIXME: I don't care for now but I should look at *why* there was an exception (pgdb:add-area dbh area-name (or toppath *toppath*))))) @@ -639,11 +640,11 @@ (pgdb:refresh-run-info dbh new-run-id state status owner event-time comment fail-count pass-count) new-run-id) - (if (handle-exceptions + (if (common:debug-handle-exceptions #t exn (begin (print-call-chain) #f) (pgdb:insert-run dbh spec-id target run-name state status owner event-time comment fail-count pass-count)) ;; area-id)) Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -52,11 +52,11 @@ (directory? work-area) (file-read-access? work-area)) (let* ((dbpath (conc work-area "/testdat.db")) (dbexists (file-exists? dbpath)) (work-area-writeable (file-write-access? work-area)) - (db (handle-exceptions ;; open the db if area writeable or db pre-existing. open in-mem otherwise. if exception, open in-mem + (db (common:debug-handle-exceptions #t ;; open the db if area writeable or db pre-existing. open in-mem otherwise. if exception, open in-mem exn (begin (print-call-chain (current-error-port)) (debug:print 2 *default-log-port* "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" ((condition-property-accessor 'exn 'message) exn)) @@ -81,11 +81,11 @@ (debug:print-info 11 *default-log-port* "Initialized test database " dbpath) (tdb:testdb-initialize db))) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") (debug:print-info 11 *default-log-port* "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct - (handle-exceptions + (common:debug-handle-exceptions #t exn (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " dbpath ".\n " Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1098,11 +1098,11 @@ (if fnamepatt (apply append (map (lambda (p) (if (directory-exists? p) (let ((glob-query (conc p "/" fnamepatt))) - (handle-exceptions + (common:debug-handle-exceptions #t exn (with-input-from-pipe (conc "echo " glob-query) read-lines) ;; we aren't going to try too hard. If glob breaks it is likely because someone tried to do */*/*.log or similar (glob glob-query))) @@ -1151,11 +1151,11 @@ (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read (file-exists? cache-file))) (cached-dat (if (and (not force-create) cache-exists) - (handle-exceptions + (common:debug-handle-exceptions #t exn #f ;; any issues, just give up with the cached version and re-read (configf:read-alist cache-file)) #f)) (test-full-name (if (and item-path (not (string-null? item-path))) @@ -1529,11 +1529,11 @@ ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) (define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (remtries 10)) - (handle-exceptions + (common:debug-handle-exceptions #t exn (if (> remtries 0) (begin (print-call-chain (current-error-port)) (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times") Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -99,12 +99,12 @@ # htmlviewercmd firefox -new-window htmlviewercmd arora # -runtests automatically deletes the records for tests with the listed states on starting up a run allowing them to re-run # (nb// this is in addition to NOT_STARTED which is automatically re-run) -# -allow-auto-rerun INCOMPLETE ZERO_ITEMS +# format is STATE/STATUS +allow-auto-rerun /INCOMPLETE /ZERO_ITEMS # could add: STUCK STUCK/DEAD UNKNOWN KILLED KILLREQ PREQ_DISCARD [validvalues] state start end 0 1 - 2 status pass fail n/a 0 1 running - 2