Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -160,11 +160,11 @@ exn (handle-exceptions exn (begin (debug:print 0 "ERROR: received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain) + (print-call-chain (current-error-port)) #f) (read (open-input-string (base64:base64-decode instr)))) (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;;====================================================================== Index: datashare.scm ================================================================== --- datashare.scm +++ datashare.scm @@ -255,11 +255,11 @@ (else (print "EXCEPTION: database overloaded or unreadable.") (print " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (print " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain) + (print-call-chain (current-error-port)) (thread-sleep! sleep-time) (print "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up"))) (apply open-run-close-exception-handling proc idb params)) (apply open-run-close-no-exception-handling proc idb params))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -329,11 +329,11 @@ exn (begin (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " db: " rundb) - (print-call-chain) + (print-call-chain (current-error-port)) #f) (sqlite3:interrupt! rundb) (sqlite3:finalize! rundb #t)))) ;; (mutex-unlock! *db-sync-mutex*) ) @@ -436,11 +436,11 @@ (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) - (print-call-chain)) + (print-call-chain (current-error-port))) (cond ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1) ((not todb) (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2) ((not (sqlite3:database? (db:dbdat-get-db fromdb))) (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3) @@ -649,11 +649,11 @@ (else (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain) + (print-call-chain (current-error-port)) (thread-sleep! sleep-time) (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up"))) (apply open-run-close-exception-handling proc idb params)) (apply open-run-close-no-exception-handling proc idb params))) @@ -1549,11 +1549,11 @@ ;; not-in #t = above behaviour, #f = must match (define (db:get-tests-for-run dbstruct run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals) (if (not (number? run-id)) (begin ;; no need to treat this as an error by default (debug:print 4 "WARNING: call to db:get-tests-for-run with bad run-id=" run-id) - ;; (print-call-chain) + ;; (print-call-chain (current-error-port)) '()) (let* ((qryvalstr (case qryvals ((shortlist) "id,run_id,testname,item_path,state,status") ((#f) db:test-record-qry-selector) ;; "id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment") (else qryvals))) @@ -2673,11 +2673,11 @@ ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default (begin (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain) + (print-call-chain (current-error-port)) default))) (apply sqlite3:first-result db stmt params))) ;;====================================================================== ;; Extract ods file from the db Index: lock-queue.scm ================================================================== --- lock-queue.scm +++ lock-queue.scm @@ -189,19 +189,19 @@ (handle-exceptions exn (begin (debug:print 0 "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain) + (print-call-chain (current-error-port)) (thread-sleep! 10) (if (> count 0) (begin (sqlite3:finalize! db) (lock-queue:wait-turn fname test-id count: (- count 1))) (begin (debug:print 0 "Giving up calls to lock-queue:wait-turn for test-id " test-id " at path " fname ", printing call chain") - (print-call-chain) + (print-call-chain (current-error-port)) #f))) (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file") (sqlite3:execute db "INSERT OR REPLACE INTO queue (test_id,start_time,state) VALUES (?,?,'waiting');" Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -168,11 +168,11 @@ ;; speed up for common cases with a little logic (define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) (if (not (and run-id test-id)) (begin (debug:print 0 "ERROR: bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate) - (print-call-chain) + (print-call-chain (current-error-port)) #f) (begin (cond ((and newstate newstatus newcomment) (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id)) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -58,11 +58,11 @@ ;; (release-dot-lock fname) (debug:print 0 "ERROR: portlogger:open-run-close failed. " proc " " params) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 "exn=" (condition->list exn)) (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it - (print-call-chain)) + (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) (sqlite3:finalize! db) ;; (release-dot-lock fname) @@ -104,11 +104,11 @@ exn (begin (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 "exn=" (condition->list exn)) - (print-call-chain) + (print-call-chain (current-error-port)) (debug:print 0 "Continuing anyway.") #f) (sqlite3:fold-row (lambda (var curr) (or curr var curr)) @@ -129,11 +129,11 @@ exn (begin (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 "exn=" (condition->list exn)) - (print-call-chain) + (print-call-chain (current-error-port)) (debug:print 0 "Continuing anyway.")) (portlogger:take-port db portnum)) portnum)) ;; set port to "released", "failed" etc. @@ -160,11 +160,11 @@ (begin (debug:print 0 "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain)) + (print-call-chain (current-error-port))) (cond ((> numargs 1) ;; most commands (case (string->symbol (car args)) ;; commands with two or more params ((take)(portlogger:take-port db (string->number (cadr args)))) ((set) (portlogger:set-port db Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -97,11 +97,11 @@ ;; (begin ;; (hash-table-delete! *runremote* run-id) ;; (thread-sleep! 0.5) ;; (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1))) ;; (begin - ;; (print-call-chain) + ;; (print-call-chain (current-error-port)) ;; (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over") ;; (exit 1))))) (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.") (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection @@ -300,11 +300,11 @@ (define (rmt:get-test-info-by-id run-id test-id) (if (and (number? run-id)(number? test-id)) (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) (begin (debug:print 0 "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) - (print-call-chain) + (print-call-chain (current-error-port)) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) @@ -325,11 +325,11 @@ (define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals) (if (number? run-id) (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals)) (begin (debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id) - (print-call-chain) + (print-call-chain (current-error-port)) '()))) (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) (let ((run-id-list (if run-ids run-ids Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -670,27 +670,15 @@ ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) - (if #t ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs - (begin - (rmt:general-call 'register-test run-id run-id test-name item-path) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)) - (let ((th (make-thread (lambda () - (mutex-lock! registry-mutex) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start) - (mutex-unlock! registry-mutex) - ;; If haven't done it before register a top level test if this is an itemized test - (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) - (rmt:general-call 'register-test run-id run-id test-name "")) - (rmt:general-call 'register-test run-id run-id test-name item-path) - (mutex-lock! registry-mutex) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) - (mutex-unlock! registry-mutex)) - (conc test-name "/" item-path)))) - (thread-start! th))) + ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs + (rmt:general-call 'register-test run-id run-id test-name item-path) + (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) + (rmt:general-call 'register-test run-id run-id test-name "")) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) (list hed tal (append reg (list hed)) reruns) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -23,37 +23,39 @@ ;;====================================================================== ;; wait up to aprox n seconds for a journal to go away ;; (define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f)) - (let ((fullpath (conc path "-journal"))) - (handle-exceptions - exn - #t ;; if stuff goes wrong just allow it to move on - (let loop ((journal-exists (file-exists? fullpath)) - (count n)) ;; wait ten times ... - (if journal-exists - (begin - (if (and waiting-msg - (eq? (modulo n 30) 0)) - (debug:print 0 waiting-msg)) - (if (> count 0) - (begin - (thread-sleep! 1) - (loop (file-exists? fullpath) - (- count 1))) - (begin - (if remove (system (conc "rm -rf " fullpath))) - #f))) - #t))))) + (if (not (string? path)) + (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)") + (let ((fullpath (conc path "-journal"))) + (handle-exceptions + exn + #t ;; if stuff goes wrong just allow it to move on + (let loop ((journal-exists (file-exists? fullpath)) + (count n)) ;; wait ten times ... + (if journal-exists + (begin + (if (and waiting-msg + (eq? (modulo n 30) 0)) + (debug:print 0 waiting-msg)) + (if (> count 0) + (begin + (thread-sleep! 1) + (loop (file-exists? fullpath) + (- count 1))) + (begin + (if remove (system (conc "rm -rf " fullpath))) + #f))) + #t)))))) (define (tasks:get-task-db-path) - (if *task-db* - (vector-ref *task-db* 1) - (let* ((linktree (configf:lookup *configdat* "setup" "linktree")) - (dbpath (conc linktree "/.db/monitor.db"))) - dbpath))) + (let* ((linktree (configf:lookup *configdat* "setup" "linktree")) + (dbpath (conc linktree "/.db"))) + dbpath)) + + ;; If file exists AND ;; file readable ;; ==> open it ;; If file exists AND @@ -67,26 +69,28 @@ *task-db* (handle-exceptions exn (if (> numretries 0) (begin - (print-call-chain) + (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (print "exn=" (condition->list exn)) + (debug:print 0 " exn=" (condition->list exn)) (thread-sleep! 1) (tasks:open-db numretries (- numretries 1))) (begin - (print-call-chain) + (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (print "exn=" (condition->list exn)))) + (debug:print 0 " exn=" (condition->list exn)))) (let* ((dbpath (tasks:get-task-db-path)) + (dbfile (conc dbpath "/monitor.db")) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away (exists (file-exists? dbpath)) (write-access (file-write-access? dbpath)) - (mdb (cond - ((file-write-access? *toppath*)(sqlite3:open-database dbpath)) - ((file-read-access? dbpath) (sqlite3:open-database dbpath)) + (mdb (cond ;; what the hek is *toppath* doing here? + ((and (string? *toppath*)(file-write-access? *toppath*)) + (sqlite3:open-database dbfile)) + ((file-read-access? dbpath) (sqlite3:open-database dbfile)) (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout 36000))) (if (and exists (not write-access)) (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control @@ -308,11 +312,11 @@ exn (begin (debug:print 0 "WARNING: tasks:get-server db access error.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " for run " run-id) - (print-call-chain) + (print-call-chain (current-error-port)) (if (> retries 0) (begin (debug:print 0 " trying call to tasks:get-server again in 10 seconds") (thread-sleep! 10) (tasks:get-server mdb run-id retries: (- retries 0))) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -660,11 +660,11 @@ ;; (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") ;; (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") ;; (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print "exn=" (condition->list exn)) ;; (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - ;; (print-call-chain))) + ;; (print-call-chain (current-error-port)))) ;; (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb)) ;; (cpuload (get-cpu-load)) ;; (diskfree (get-df (current-directory))) ;; (uname (get-uname "-srvpio")) ;; (hostname (get-host-name))) @@ -688,11 +688,11 @@ (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain))) + (print-call-chain (current-error-port)))) (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) ))) ;;====================================================================== ;; A R C H I V I N G