Index: common_records.scm ================================================================== --- common_records.scm +++ common_records.scm @@ -9,10 +9,29 @@ ;; PURPOSE. ;;====================================================================== ;; (use trace) +;; Some of these routines use: +;; +;; http://www.cs.toronto.edu/~gfb/scheme/simple-macros.html +;; +;; Syntax for defining macros in a simple style similar to function definiton, +;; when there is a single pattern for the argument list and there are no keywords. +;; +;; (define-simple-syntax (name arg ...) body ...) +;; + +(define-syntax define-simple-syntax + (syntax-rules () + ((_ (name arg ...) body ...) + (define-syntax name (syntax-rules () ((name arg ...) (begin body ...))))))) + +(define-syntax common:handle-exceptions + (syntax-rules () + ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...)))) + (define (debug:calc-verbosity vstr) (cond ((number? vstr) vstr) ((not (string? vstr)) 1) ;; ((string-match "^\\s*$" vstr) 1) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -32,10 +32,28 @@ (define *rundb-mutex* (make-mutex)) ;; prevent problems opening/closing rundb's (define *number-of-writes* 0) (define *number-non-write-queries* 0) +;;====================================================================== +;; SQLITE3 HELPERS +;;====================================================================== + +;; convert to -inline +(define (db:first-result-default db stmt default . params) + (handle-exceptions + exn + (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) + (if (eq? err-status 'done) + default + (begin + (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain (current-error-port)) + default))) + (apply sqlite3:first-result db stmt params))) + ;; Get/open a database ;; if run-id => get run specific db ;; if #f => get main db ;; if db already open - return inmem ;; if db not open, open inmem, rundb and sync then return inmem @@ -429,19 +447,21 @@ (mutex-lock! *db-sync-mutex*) (handle-exceptions exn (begin (debug:print 0 "EXCEPTION: database probably overloaded or unreadable in db:sync-tables.") + (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) - (print-call-chain (current-error-port)) - (exit 1)) + (if *server-run* ;; we are inside a server + (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. + (exit 1))) (cond ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1) ((not todb) (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2) ((not (sqlite3:database? (db:dbdat-get-db fromdb))) (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3) @@ -1863,12 +1883,15 @@ (db:with-db dbstruct run-id #f (lambda (db) - (sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;" - test-id)))) + (db:first-result-default + db + "SELECT attemptnum FROM tests WHERE id=?;" + #f + test-id)))) (define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time" "host" "cpuload" "diskfree" "uname" "rundir" "item_path" "run_duration" "final_logf" "comment" "shortdir" "attemptnum")) @@ -2660,28 +2683,10 @@ (if (not ever-seen) (set! result (append (if (null? tests)(list waitontest-name) tests) result))))) waitons) (delete-duplicates result)))) -;;====================================================================== -;; SQLITE3 HELPERS -;;====================================================================== - -;; convert to -inline -(define (db:first-result-default db stmt default . params) - (handle-exceptions - exn - (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) - ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) - (if (eq? err-status 'done) - default - (begin - (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - default))) - (apply sqlite3:first-result db stmt params))) - ;;====================================================================== ;; Extract ods file from the db ;;====================================================================== ;; NOT REWRITTEN YET!!!!! Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -463,39 +463,41 @@ ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) - (begin - (debug:print-info 0 "Starting to shutdown the server.") - ;; need to delete only *my* server entry (future use) - (set! *time-to-exit* #t) - (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) - ;; - ;; start_shutdown - ;; - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") - (portlogger:open-run-close portlogger:set-port port "released") - (thread-sleep! 5) - (debug:print-info 0 "Max cached queries was " *max-cache-size*) - (debug:print-info 0 "Number of cached writes " *number-of-writes*) - (debug:print-info 0 "Average cached write time " - (if (eq? *number-of-writes* 0) - "n/a (no writes)" - (/ *writes-total-delay* - *number-of-writes*)) - " ms") - (debug:print-info 0 "Number non-cached queries " *number-non-write-queries*) - (debug:print-info 0 "Average non-cached time " - (if (eq? *number-non-write-queries* 0) - "n/a (no queries)" - (/ *total-non-write-delay* - *number-non-write-queries*)) - " ms") - (debug:print-info 0 "Server shutdown complete. Exiting") - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") - (exit)))))) + (http-transport:server-shutdown server-id port))))) + +(define (http-transport:server-shutdown server-id port) + (debug:print-info 0 "Starting to shutdown the server.") + ;; need to delete only *my* server entry (future use) + (set! *time-to-exit* #t) + (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) + ;; + ;; start_shutdown + ;; + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") + (portlogger:open-run-close portlogger:set-port port "released") + (thread-sleep! 5) + (debug:print-info 0 "Max cached queries was " *max-cache-size*) + (debug:print-info 0 "Number of cached writes " *number-of-writes*) + (debug:print-info 0 "Average cached write time " + (if (eq? *number-of-writes* 0) + "n/a (no writes)" + (/ *writes-total-delay* + *number-of-writes*)) + " ms") + (debug:print-info 0 "Number non-cached queries " *number-non-write-queries*) + (debug:print-info 0 "Average non-cached time " + (if (eq? *number-non-write-queries* 0) + "n/a (no queries)" + (/ *total-non-write-delay* + *number-non-write-queries*)) + " ms") + (debug:print-info 0 "Server shutdown complete. Exiting") + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") + (exit)) ;; all routes though here end in exit ... ;; ;; start_server? ;; Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -64,11 +64,11 @@ (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) (transport (assoc/default 'transport cmdinfo)) - (serverinf (assoc/default 'serverinf cmdinfo)) + ;; (serverinf (assoc/default 'serverinf cmdinfo)) (port (assoc/default 'port cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) @@ -787,11 +787,11 @@ (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) (list 'transport (conc *transport-type*)) - (list 'serverinf *server-info*) + ;; (list 'serverinf *server-info*) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -28,11 +28,16 @@ (if (not (string? path)) (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)") (let ((fullpath (conc path "-journal"))) (handle-exceptions exn - #t ;; if stuff goes wrong just allow it to move on + (begin + (print-call-chain (current-error-port)) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 " exn=" (condition->list exn)) + (debug:print 0 "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain") + #t) ;; if stuff goes wrong just allow it to move on (let loop ((journal-exists (file-exists? fullpath)) (count n)) ;; wait ten times ... (if journal-exists (begin (if (and waiting-msg @@ -308,21 +313,22 @@ (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn - (begin + (begin + (print-call-chain (current-error-port)) (debug:print 0 "WARNING: tasks:get-server db access error.") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 " for run " run-id) - (print-call-chain (current-error-port)) - (if (> retries 0) - (begin - (debug:print 0 " trying call to tasks:get-server again in 10 seconds") - (thread-sleep! 10) - (tasks:get-server mdb run-id retries: (- retries 0))) - (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\""))) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 " for run " run-id) + (print-call-chain (current-error-port)) + (if (> retries 0) + (begin + (debug:print 0 " trying call to tasks:get-server again in 10 seconds") + (thread-sleep! 10) + (tasks:get-server mdb run-id retries: (- retries 0))) + (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\""))) (sqlite3:for-each-row (lambda (id interface port pubport transport pid hostname) (set! res (vector id interface port pubport transport pid hostname))) mdb ;; removed: @@ -375,38 +381,10 @@ (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) -;; (if status ;; #t means alive -;; (begin -;; (if (equal? hostname (get-host-name)) -;; (handle-exceptions -;; exn -;; (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n" -;; " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) -;; (debug:print 1 "Sending signal/term to " pid " on " hostname) -;; (process-signal pid signal/term) -;; (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill -;; ;;(process-signal pid signal/kill) -;; ) ;; local machine, send sig term -;; (begin -;; ;;(debug:print-info 1 "Stopping remote servers not yet supported.")))) -;; (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide") -;; (let ((serverdat (list hostname port))) -;; (hash-table-set! *runremote* run-id (http-transport:client-connect hostname port)) -;; (cdb:kill-server serverdat pid))))) ;; remote machine, try telling server to commit suicide -;; (begin -;; (if status -;; (if (equal? hostname (get-host-name)) -;; (begin -;; (debug:print-info 1 "Sending signal/term to " pid " on " hostname) -;; (process-signal pid signal/term) ;; local machine, send sig term -;; (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill -;; (process-signal pid signal/kill)) -;; (debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname)))))) - ;;====================================================================== ;; Tasks and Task monitors ;;====================================================================== Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -54,10 +54,11 @@ (string->number (args:get-arg "-override-timeout")) 136000)))) (handle-exceptions exn (begin + (print-call-chain (current-error-port)) (debug:print 2 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" ((condition-property-accessor 'exn 'message) exn)) (set! db (sqlite3:open-database ":memory:")) ;; open an in-memory db to allow readonly access (set! dbexists #f)) ;; must force re-creation of tables, more tom-foolery (set! db (sqlite3:open-database dbpath))) @@ -71,10 +72,11 @@ (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin + (print-call-chain (current-error-port)) (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -647,31 +647,10 @@ ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) - ;; (handle-exceptions - ;; exn - ;; (if (> remtries 0) - ;; (begin - ;; (set! remtries (- remtries 1)) - ;; (thread-sleep! 10) - ;; (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) - ;; (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) - ;; (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") - ;; (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") - ;; (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - ;; (print "exn=" (condition->list exn)) - ;; (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - ;; (print-call-chain (current-error-port)))) - ;; (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb)) - ;; (cpuload (get-cpu-load)) - ;; (diskfree (get-df (current-directory))) - ;; (uname (get-uname "-srvpio")) - ;; (hostname (get-host-name))) - ;; ;(tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) - ;; (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) (define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) @@ -679,10 +658,12 @@ (remtries 10)) (handle-exceptions exn (if (> remtries 0) (begin + (print-call-chain (current-error-port)) + (debug:print-info 0 "WARNING: failed to set meta info. Will try " remtries " more times") (set! remtries (- remtries 1)) (thread-sleep! 10) (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")