Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -561,11 +561,11 @@ request-update)) (newtestdat (if need-update ;; NOTE: BUG HIDER, try to eliminate this exception handler (handle-exceptions exn - (debug:print-info 0 "test db access issue: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print-info 0 "test db access issue in examine test for run-id " run-id ", test-id " test-id ": " ((condition-property-accessor 'exn 'message) exn)) (db:get-test-info-by-id dbstruct run-id test-id ))))) ;; (debug:print-info 0 "need-update= " need-update " curr-mod-time = " curr-mod-time) (cond ((and need-update newtestdat) (set! testdat newtestdat) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1458,11 +1458,11 @@ (define (dashboard:get-youngest-run-db-mod-time) (handle-exceptions exn (begin - (debug:print 0 "WARNING: error in accessing databases: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn)) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (apply max (map (lambda (filen) (file-modification-time filen)) (glob (conc *dbdir* "/*.db")))))) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -31,11 +31,11 @@ (sqlite3:set-busy-handler! db handler) (sqlite3:execute db "PRAGMA synchronous = 0;") (if (not exists) (sqlite3:execute db - "CREATE TABLE ports ( + "CREATE TABLE IF NOT EXISTS ports ( port INTEGER PRIMARY KEY, state TEXT DEFAULT 'not-used', fail_count INTEGER DEFAULT 0, update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")) db)) @@ -88,16 +88,25 @@ (sqlite3:finalize! qry2) (sqlite3:finalize! qry3) res)) (define (portlogger:get-prev-used-port db) - (sqlite3:fold-row - (lambda (var curr) - (or curr var curr)) - #f - db - "SELECT (port) FROM ports WHERE state='released' LIMIT 1;")) + (handle-exceptions + exn + (begin + (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "exn=" (condition->list exn)) + (print-call-chain) + (debug:print 0 "Continuing anyway.") + #f) + (sqlite3:fold-row + (lambda (var curr) + (or curr var curr)) + #f + db + "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))) (define (portlogger:find-port db) (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport"))) (if (and val (string->number val)) @@ -104,11 +113,19 @@ (string->number val) 32768))) (portnum (or (portlogger:get-prev-used-port db) (+ lowport ;; top of registered ports is 49152 but lets use ports in the registered range (random (- 64000 lowport)))))) - (portlogger:take-port db portnum) + (handle-exceptions + exn + (begin + (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "exn=" (condition->list exn)) + (print-call-chain) + (debug:print 0 "Continuing anyway.")) + (portlogger:take-port db portnum)) portnum)) ;; set port to "released", "failed" etc. ;; (define (portlogger:set-port db portnum value) @@ -122,20 +139,30 @@ ;;====================================================================== ;; MAIN ;;====================================================================== (define (portlogger:main . args) - (let* ((db (portlogger:open-db (conc "/tmp/." (current-user-name) "-portlogger.db"))) + (let* ((dbfname (conc "/tmp/." (current-user-name) "-portlogger.db")) + (db (portlogger:open-db dbfname)) (numargs (length args)) - (result (cond - ((> numargs 1) ;; most commands - (case (string->symbol (car args)) ;; commands with two or more params - ((take)(portlogger:take-port db (string->number (cadr args)))) - ((set) (portlogger:set-port db - (string->number (cadr args)) - (caddr args)) - (caddr args)) - ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed)))))) + (result + (handle-exceptions + exn + (begin + (debug:print 0 "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) + (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain)) + (cond + ((> numargs 1) ;; most commands + (case (string->symbol (car args)) ;; commands with two or more params + ((take)(portlogger:take-port db (string->number (cadr args)))) + ((set) (portlogger:set-port db + (string->number (cadr args)) + (caddr args)) + (caddr args)) + ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed))))))) (sqlite3:finalize! db) result)) ;; (print (apply portlogger:main (cdr (argv)))) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -51,10 +51,12 @@ ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) (handle-exceptions exn (begin (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) #f) (let-values (((fh fho pid) (if (null? params) (process cmd) (process cmd params)))) (let loop ((curr (read-line fh)) @@ -124,11 +126,10 @@ (if (eof-object? inl) (reverse res) (let ((pid (string->number inl))) (if proc (proc pid)) (loop (read-line) (cons pid res)))))))) - (define (process:alive? pid) (handle-exceptions exn ;; possibly pid is a process not a child, look in /proc to see if it is running still Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -104,10 +104,12 @@ (mutex-lock! *db-stats-mutex*) (handle-exceptions exn (begin (debug:print 0 "WARNING: stats collection failed in update-db-stats") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) #f) ;; if this fails we don't care, it is just stats (let* ((cmd (if (eq? rawcmd 'general-call) (car params) rawcmd)) (stat-vec (hash-table-ref/default *db-stats* cmd #f))) (if (not stat-vec) (let ((newvec (vector 0 0))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -650,28 +650,31 @@ (lambda (record) (let* ((param-key (list-ref record 8)) (match-dat (string-search hostpid-rx param-key))) (if match-dat (let ((hostname (cadr match-dat)) - (pid (caddr match-dat))) + (pid (string->number (caddr match-dat)))) (debug:print 0 "Sending SIGINT to process " pid " on host " hostname) (if (equal? (get-host-name) hostname) - (begin - (handle-exceptions - exn - (begin - (debug:print 0 "Kill of process " pid " on host " hostname " failed.") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - #t) - (process-signal (string->number pid) signal/int) - (thread-sleep! 5) - (process-signal (string->number pid) signal/kill))) + (if (process:alive? pid) + (begin + (handle-exceptions + exn + (begin + (debug:print 0 "Kill of process " pid " on host " hostname " failed.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + #t) + (process-signal pid signal/int) + (thread-sleep! 5) + (if (process:alive? pid) + (process-signal pid signal/kill))))) ;; (call-with-environment-variables (let ((old-targethost (getenv "TARGETHOST"))) (setenv "TARGETHOST" hostname) (system (conc "nbfake kill " pid)) - (if old-targethost (setenv "TARGETHOST" old-targethost))))) + (if old-targethost (setenv "TARGETHOST" old-targethost)) + (unsetenv "TARGETHOST")))) (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in monitor.db")))) records))) ;;====================================================================== Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -70,11 +70,12 @@ (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin - (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" + (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " + dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table ;; and raise an exception ? (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;"))