Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -163,10 +163,15 @@ chmod a+x $@ deploytarg/nbfind : utils/nbfind $(INSTALL) $< $@ chmod a+x $@ + +$(PREFIX)/bin/mtest-reaper: helpers/mtest-reaper.scm helpers/ducttape-lib.scm helpers/inteldate.scm helpers/mimetypes.scm + make -C helpers $@ PREFIX=$(PREFIX) INSTALL=$(INSTALL) ARCHSTR=$(ARCHSTR) + +mtest-reaper: $(PREFIX)/bin/mtest-reaper # install dashboard as dboard so wrapper script can be called dashboard $(PREFIX)/bin/.$(ARCHSTR)/dboard : dboard $(FILES) utils/mk_wrapper utils/mk_wrapper $(PREFIX) dboard $(PREFIX)/bin/dashboard chmod a+x $(PREFIX)/bin/dashboard @@ -177,10 +182,11 @@ $(PREFIX)/bin/nbfind $(PREFIX)/bin/loadrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/.$(ARCHSTR) : mkdir -p $(PREFIX)/bin/.$(ARCHSTR) + mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib test: tests/tests.scm cd tests;csi -I .. -b -n tests.scm ext-tests/.fslckout : $(MTQA_FOSSIL) @@ -282,5 +288,6 @@ echo "(use postgresql)(hash-table-set! *available-db* 'postgresql #t)" >> altdb.scm;\ fi portlogger-example : portlogger-example.scm api.o archive.o client.o common.o configf.o daemon.o dashboard-tests.o db.o dcommon.o ezsteps.o filedb.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o megatest-version.o mt.o ods.o portlogger.o process.o rmt.o rpc-transport.o runconfig.o runs.o sdb.o server.o synchash.o tasks.o tdb.o tests.o tree.o csc $(CSCOPTS) portlogger-example.scm api.o archive.o client.o common.o configf.o daemon.o dashboard-tests.o db.o dcommon.o ezsteps.o filedb.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o megatest-version.o mt.o ods.o portlogger.o process.o rmt.o rpc-transport.o runconfig.o runs.o sdb.o server.o synchash.o tasks.o tdb.o tests.o tree.o + Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -39,10 +39,11 @@ get-run-status get-run-stats get-targets get-target ;; register-run + get-tests-tags get-tests-for-run get-test-id get-tests-for-runs-mindata get-run-name-from-id get-runs @@ -61,10 +62,12 @@ synchash-get )) (define api:write-queries '( + get-keys-write ;; dummy "write" query to force server start + ;; SERVERS start-server kill-server ;; TESTS @@ -111,10 +114,11 @@ ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain))) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer") (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (if (not (vector? dat)) ;; it is an error to not receive a vector (vector #f #f "remote must be called with a vector") @@ -167,10 +171,11 @@ ((mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) ;; TESTMETA ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) + ((get-tests-tags) (db:get-tests-tags dbstruct)) ;; TASKS ((tasks-add) (apply tasks:add dbstruct params)) ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) ((tasks-get-last) (apply tasks:get-last dbstruct params)) @@ -186,10 +191,11 @@ ;;====================================================================== ;; KEYS ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) ((get-keys) (db:get-keys dbstruct)) + ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server ((get-key-vals) (apply db:get-key-vals dbstruct params)) ((get-target) (apply db:get-target dbstruct params)) ((get-targets) (db:get-targets dbstruct)) ;; ARCHIVES @@ -239,10 +245,11 @@ ;; TEST DATA ((read-test-data) (apply db:read-test-data dbstruct params)) ;; MISC + ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) ((login) (apply db:login dbstruct params)) ((general-call) (let ((stmtname (car params)) (run-id (cadr params)) (realparams (cddr params))) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -197,19 +197,19 @@ (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") (if (> remaining-tries 8) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) - (server:try-running run-id)) + (server:try-running *toppath*)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -90,14 +90,13 @@ (define *db-stats-mutex* (make-mutex)) ;; db access (define *db-last-access* (current-seconds)) ;; last db access, used in server (define *db-write-access* #t) ;; db sync -(define *db-last-write* 0) ;; used to record last touch of db (define *db-last-sync* 0) ;; last time the sync to megatest.db happened (define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another -(define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* and *db-last-write* +(define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* ;; task db (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) (define *db-cache-path* #f) @@ -135,10 +134,18 @@ (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) +(defstruct remote + (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) + (server-url (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (last-server-check 0) ;; last time we checked to see if the server was alive + (conndat #f) + (transport *transport-type*) + (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds + ;; launching and hosts (defstruct host (reachable #f) (last-update 0) (last-used 0) @@ -531,47 +538,59 @@ ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== (define (common:run-sync?) - (let ((ohh (common:on-homehost?)) - (srv (args:get-arg "-server"))) - ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) (and (common:on-homehost?) - (args:get-arg "-server")))) + (args:get-arg "-server"))) + +;; (let ((ohh (common:on-homehost?)) +;; (srv (args:get-arg "-server"))) +;; (and ohh srv))) + ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags ;; if #t use timestamps : or 'timestamps (define (common:sync-to-megatest.db dbstruct) (let ((start-time (current-seconds)) (res (db:multi-db-sync dbstruct 'new2old))) (let ((sync-time (- (current-seconds) start-time))) - (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds") + (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) (if (common:low-noise-print 30 "sync new to old") - (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds"))) + (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)))) res)) + + + +(define *wdnum* 0) +(define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) + (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) - (last-time (current-seconds))) - (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync) - (if legacy-sync + (last-time (current-seconds)) + (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) + ) + (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) + (if (and legacy-sync (not *time-to-exit*)) (let ((dbstruct (db:setup))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () + ;;(BB> "watchdog loop. pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) - (let* ((need-sync (>= *db-last-write* *db-last-sync*)) ;; no sync since last write + (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) - (should-sync (> (- (current-seconds) *db-last-sync*) 5)) ;; sync every five seconds minimum + (should-sync (and (not *time-to-exit*) + (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum (will-sync (and (or need-sync should-sync) (not sync-in-progress))) (start-time (current-seconds))) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) @@ -599,55 +618,71 @@ ;; keep going unless time to exit ;; (if (not *time-to-exit*) (let delay-loop ((count 0)) + ;;(BB> "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + (if (and (not *time-to-exit*) (< count 4)) ;; was 11, changing to 4. (begin (thread-sleep! 1) (delay-loop (+ count 1)))) - (loop))) + (if (not *time-to-exit*) (loop)))) (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*))))))) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) (define (std-exit-procedure) + (on-exit (lambda () 0)) + ;;(BB> "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds - (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated + (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated (if *task-db* (let ((db (cdr *task-db*))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) (sqlite3:finalize! db #t) ;; (vector-set! *task-db* 0 #f) (set! *task-db* #f))))) - (close-output-port *default-log-port*) + (if (and *runremote* + (remote-conndat *runremote*)) + (begin + (http-client#close-all-connections!))) ;; for http-client + (if (not (eq? *default-log-port* (current-error-port))) + (close-output-port *default-log-port*)) (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) (th2 (make-thread (lambda () (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") (if no-hurry - (thread-sleep! 5) ;; give the clean up few seconds to do it's stuff - (thread-sleep! 2)) - (debug:print 4 *default-log-port* " ... done") - ) + (begin + (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff + (begin + (thread-sleep! 2))) + (debug:print 4 *default-log-port* " ... done") + ) "clean exit"))) (thread-start! th1) (thread-start! th2) - (thread-join! th1)))) + (thread-join! th1) + ) + ) + + 0) (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) + ;;(BB> "got signal "signum) (debug:print-error 0 *default-log-port* "Received signal " signum " exiting promptly") ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway (exit)) (set-signal-handler! signal/int std-signal-handler) ;; ^C @@ -779,20 +814,24 @@ (define (common:args-get-status) (or (args:get-arg "-status")(args:get-arg ":status"))) (define (common:args-get-testpatt rconf) - (let* ((rtestpatt (if rconf (runconfigs-get rconf "TESTPATT") #f)) - (args-testpatt (or (args:get-arg "-testpatt") - (args:get-arg "-runtests") - "%")) - (testpatt (or (and (equal? args-testpatt "%") - rtestpatt) - args-testpatt))) - (if rtestpatt (debug:print-info 0 *default-log-port* "TESTPATT from runconfigs: " rtestpatt)) - testpatt)) - + (let* ((tagexpr (args:get-arg "-tagexpr")) + (tags-testpatt (if tagexpr (string-join (runs:get-tests-matching-tags tagexpr) ",") #f)) + (testpatt-key (if (args:get-arg "-mode") (args:get-arg "-mode") "TESTPATT")) + (args-testpatt (or (args:get-arg "-testpatt") (args:get-arg "-runtests") "%")) + (rtestpatt (if rconf (runconfigs-get rconf testpatt-key) #f))) + (cond + (tags-testpatt + (debug:print-info 0 *default-log-port* "-tagexpr "tagexpr" selects testpatt "tags-testpatt) + tags-testpatt) + ((and (equal? args-testpatt "%") rtestpatt) + (debug:print-info 0 *default-log-port* "testpatt defined in "testpatt-key" from runconfigs: " rtestpatt) + rtestpatt) + (else args-testpatt)))) + (define (common:get-linktree) (or (getenv "MT_LINKTREE") (if *configdat* (configf:lookup *configdat* "setup" "linktree")))) @@ -1084,11 +1123,12 @@ (lambda ()(list (read)(read)(read))))) (with-input-from-file "/proc/loadavg" (lambda ()(list (read)(read)(read)))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads -;; returns list (normalized-proc-load normalized-core-load 1m 5m 15m ncores nthreads) +;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. +;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; (define (common:get-normalized-cpu-load remote-host) (let ((data (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end") @@ -1143,51 +1183,89 @@ (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; -(define (common:get-least-loaded-host hosts-raw) +;; return list of +;; ( reachable? cpuload update-time ) +(define (common:get-host-info hostname) + (let* ((loadinfo (rmt:get-latest-host-load hostname)) + (load (car loadinfo)) + (load-sample-time (cdr loadinfo)) + (load-sample-age (- (current-seconds) load-sample-time)) + (loadinfo-timeout-seconds 20) + (host-last-update-timeout-seconds 10) + (host-rec (hash-table-ref/default *host-loads* hostname #f)) + ) + (cond + ((< load-sample-age loadinfo-timeout-seconds) + (list #t + load-sample-time + load)) + ((and host-rec + (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) + (list #t + (host-last-update host-rec) + (host-last-cpuload host-rec ))) + ((common:unix-ping hostname) + (list #t + (current-seconds) + (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) + (else + (list #f 0 -1))))) + +(define (common:update-host-loads-table hosts-raw) (let* ((hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw))) - (if (null? hosts) - #f - ;; - ;; stategy: - ;; sort by last-used and normalized-load - ;; if last-updated > 15 seconds then re-update - ;; take the host with the lowest load with the lowest last-used (i.e. not used for longest time) - ;; - (let ((best-host #f) - (curr-time (current-seconds))) - (for-each - (lambda (hostname) - (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - ;; if host hasn't been pinged in 15 sec update it's data - (ping-good (if (< (- curr-time (host-last-update rec)) 15) - (host-reachable rec) - (or (host-reachable rec) - (begin - (host-reachable-set! rec (common:unix-ping hostname)) - (host-last-update-set! rec curr-time) - (host-last-cpuload-set! rec (common:get-normalized-cpu-load hostname)) - (host-reachable rec)))))) - (cond - ((not best-host) - (set! best-host hostname)) - ((and ping-good - (< (alist-ref 'adj-core-load (host-last-cpuload rec)) - (alist-ref 'adj-core-load - (host-last-cpuload (hash-table-ref *host-loads* best-host))))) - (set! best-host hostname))))) - hosts) - best-host)))) + (for-each + (lambda (hostname) + (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h)))) + (host-info (common:get-host-info hostname)) + (is-reachable (car host-info)) + (last-reached-time (cadr host-info)) + (load (caddr host-info))) + (host-reachable-set! rec is-reachable) + (host-last-update-set! rec last-reached-time) + (host-last-cpuload-set! rec load))) + hosts))) + +(define (common:get-least-loaded-host hosts-raw) + (let* ((hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw)) + (best-host #f) + (best-load 99999) + (curr-time (current-seconds))) + (common:update-host-loads-table hosts) + (for-each + (lambda (hostname) + (let* ((rec + (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h)))) + (reachable (host-reachable rec)) + (load (host-last-cpuload rec))) + (cond + ((not reachable) #f) + ((< (+ load (/ (random 250) 1000)) ;; add a random factor to keep from getting in a rut + (+ best-load (/ (random 250) 1000)) ) + (set! best-load load) + (set! best-host hostname))))) + hosts) + best-host)) + + + (define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (first (car loadavg)) (next (cadr loadavg)) Index: common_records.scm ================================================================== --- common_records.scm +++ common_records.scm @@ -121,10 +121,11 @@ (db:log-event (apply conc params)) (apply print params) ))))) ;; Brandon's debug printer shortcut (indulge me :) +(define *BB-process-starttime* (current-milliseconds)) (define (BB> . in-args) (let* ((stack (get-call-chain)) (location #f)) (for-each (lambda (frame) @@ -131,12 +132,59 @@ (let* ((this-loc (vector-ref frame 0)) (this-func (cadr (string-split this-loc " ")))) (if (equal? this-func "BB>") (set! location this-loc)))) stack) - (let ((dp-args (append (list 0 *default-log-port* location" " ) in-args))) + (let ((dp-args (append (list 0 *default-log-port* (conc location "@"(/ (- (current-milliseconds) *BB-process-starttime*) 1000)" ") ) in-args))) (apply debug:print dp-args)))) + +(define *BBpp_custom_expanders_list* (make-hash-table)) + + + +;; register hash tables with BBpp. +(hash-table-set! *BBpp_custom_expanders_list* HASH_TABLE: + (cons hash-table? hash-table->alist)) + +;; test name converter +(define (BBpp_custom_converter arg) + (let ((res #f)) + (for-each + (lambda (custom-type-name) + (let* ((custom-type-info (hash-table-ref *BBpp_custom_expanders_list* custom-type-name)) + (custom-type-test (car custom-type-info)) + (custom-type-converter (cdr custom-type-info))) + (when (and (not res) (custom-type-test arg)) + (set! res (custom-type-converter arg))))) + (hash-table-keys *BBpp_custom_expanders_list*)) + (if res (BBpp_ res) arg))) + +(define (BBpp_ arg) + (cond + ;;((SOMESTRUCT? arg) (cons SOMESTRUCT: (SOMESTRUCT->alist arg))) + ;;((dboard:tabdat? arg) (cons dboard:tabdat: (dboard:tabdat->alist arg))) + ((hash-table? arg) + (let ((al (hash-table->alist arg))) + (BBpp_ (cons HASH_TABLE: al)))) + ((null? arg) '()) + ;;((list? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg)))) + ((pair? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg)))) + (else (BBpp_custom_converter arg)))) + +;; Brandon's pretty printer. It expands hashes and custom types in addition to regular pp +(define (BBpp arg) + (pp (BBpp_ arg))) + +;(use define-macro) +(define-syntax inspect + (syntax-rules () + [(_ x) + ;; (with-output-to-port (current-error-port) + (printf "~a is: ~a\n" 'x (with-output-to-string (lambda () (BBpp x)))) + ;; ) + ] + [(_ x y ...) (begin (inspect x) (inspect y ...))])) (define (debug:print-error n e . params) ;; normal print (if (debug:debug-mode n) (with-output-to-port (or e (current-error-port)) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -473,11 +473,11 @@ (testconfig (begin ;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) (runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process (handle-exceptions exn - (tests:get-testconfig (db:test-get-testname testdat) test-registry #f) + (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f) (tests:get-testconfig (db:test-get-testname testdat) test-registry #t)))) (viewlog (lambda (x) (if (file-exists? logfile) ;(system (conc "firefox " logfile "&")) (dashboard-tests:run-html-viewer logfile) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -290,10 +290,21 @@ ;; runs summary view tests-tree ;; used in newdashboard ) +;; register tabdat with BBpp +;; this is used by BBpp (Brandon's pretty printer) to convert dboard:tabdat into a composition of lists that pp will handle +(hash-table-set! *BBpp_custom_expanders_list* TABDAT: + (cons dboard:tabdat? + (lambda (tabdat-item) + (filter + (lambda (alist-entry) + (member (car alist-entry) + '(allruns-by-id allruns))) ;; FIELDS OF INTEREST + (dboard:tabdat->alist tabdat-item))))) + (define (dboard:tabdat-target-string vec) (let ((targ (dboard:tabdat-target vec))) (if (list? targ)(string-intersperse targ "/") "no-target-specified"))) (define (dboard:tabdat-test-patts-use vec) @@ -360,10 +371,24 @@ ((last-update 0) : fixnum) ;; last query to db got records from before last-update ((data-changed #f) : boolean) ((run-data-offset 0) : number) ;; get only 100 items per call, set back to zero when received less that 100 items (db-path #f) ) + +;; register dboard:rundat with BBpp +;; this is used by BBpp (Brandon's pretty printer) to convert dboard:rundat into a composition of lists that pp will handle +(hash-table-set! *BBpp_custom_expanders_list* RUNDAT: + (cons dboard:rundat? + (lambda (tabdat-item) + (filter + (lambda (alist-entry) + (member (car alist-entry) + '(run run-data-offset ))) ;; FIELDS OF INTEREST + (dboard:rundat->alist tabdat-item))))) + + + (define (dboard:rundat-make-init #!key (run #f)(key-vals #f)(tests #f));; -100 is before time began (make-dboard:rundat run: run tests: (or tests (make-hash-table)) @@ -623,10 +648,12 @@ (for-each (lambda (run) (hash-table-set! ht (db:get-value-by-header run header "id") run)) runs-tree) ;; (vector-ref runs-dat 1)) ht)) (tb (dboard:tabdat-runs-tree tabdat))) + ;;(BB> "In update-rundat") + ;;(inspect allruns runs-hash) (dboard:tabdat-last-runs-update-set! tabdat (- (current-seconds) 2)) (dboard:tabdat-header-set! tabdat header) ;; ;; trim runs to only those that are changing often here ;; @@ -740,11 +767,17 @@ (run-struct (or run-struct (dboard:rundat-make-init run: run tests: tests-ht key-vals: key-vals))) - (new-res (if (null? all-test-ids) res (cons run-struct res))) + (new-res (if (null? all-test-ids) + res + (delete-duplicates + (cons run-struct res) + (lambda (a b) + (eq? (db:get-value-by-header (dboard:rundat-run a) header "id") + (db:get-value-by-header (dboard:rundat-run b) header "id")))))) (elapsed-time (- (current-seconds) start-time))) (if (null? all-test-ids) (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) @@ -3391,10 +3424,13 @@ ;;(print "RA => calling runs-tab-updater with commondat " commondat " tab-num " tab-num) ;;(tabdat-values tabdat) ;;RA added ;; (pp (dboard:tabdat->alist tabdat)) ;; (if (dashboard:database-changed? commondat tabdat context-key: 'runs-rundat) (dashboard:do-update-rundat tabdat) + ;;(BB> "dashboard:runs-tab-updater") + ;;(inspect tabdat) + (let ((uidat (dboard:commondat-uidat commondat))) ;;(print "RA => Calling update-buttons with tabdat : " tabdat " uidat " uidat) (update-buttons tabdat uidat (dboard:tabdat-numruns tabdat) (dboard:tabdat-num-tests tabdat))) )) "dashboard:runs-tab-updater")) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -335,10 +335,11 @@ (let ((update_info (cons (if force-sync 0 *db-last-sync*) "last_update"))) (mutex-unlock! *db-multi-sync-mutex*) (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb)) (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-sync* start-t) + (set! *db-last-access* start-t) (mutex-unlock! *db-multi-sync-mutex*))) ;; close all opened run-id dbs (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) @@ -1497,26 +1498,33 @@ (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE") - (sqlite3:execute - db - (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" - (string-intersperse (map conc all-ids) ",") - ");") - run-id)))) - - ;; Now do rollups for the toplevel tests - ;; - ;; (db:delay-if-busy dbdat) - (for-each - (lambda (toptest) - (let ((test-name (list-ref toptest 3))) -;; (run-id (list-ref toptest 5))) - (db:top-test-set-per-pf-counts dbstruct run-id test-name))) - toplevels))) + (for-each + (lambda (test-id) + (db:test-set-status-state dbstruct run-id test-id "COMPLETE" "DEAD" "Test failed to complete")) + all-ids)))))) + +;; ALL REPLACED BY THE BLOCK ABOVE +;; +;; (sqlite3:execute +;; db +;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" +;; (string-intersperse (map conc all-ids) ",") +;; ");") +;; run-id)))) +;; +;; ;; Now do rollups for the toplevel tests +;; ;; +;; ;; (db:delay-if-busy dbdat) +;; (for-each +;; (lambda (toptest) +;; (let ((test-name (list-ref toptest 3))) +;; ;; (run-id (list-ref toptest 5))) +;; (db:top-test-set-per-pf-counts dbstruct run-id test-name))) +;; toplevels))) ;; BUG: Probably broken - does not explicitly use run-id in the query ;; (define (db:top-test-set-per-pf-counts dbstruct run-id test-name) (db:general-call (db:get-db dbstruct run-id) 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name))) @@ -2039,11 +2047,12 @@ (if (string? netstate) (begin (hash-table-set! totals netstate (+ (hash-table-ref/default totals netstate 0) count)) (hash-table-set! curr netstate (+ (hash-table-ref/default curr netstate 0) count)))))) db - "SELECT state,status,count(id) FROM tests AS t GROUP BY state,status ORDER BY state,status DESC;") + "SELECT state,status,count(id) FROM tests AS t WHERE run_id=? GROUP BY state,status ORDER BY state,status DESC;" + run-id) ;; add the per run counts to res (for-each (lambda (state) (set! res (cons (list run-name state (hash-table-ref curr state)) res))) (sort (hash-table-keys curr) string>=)) (set! curr (make-hash-table)))))) @@ -2588,10 +2597,13 @@ )) 0))))) ;; DEBUG FIXME - need to merge this v.155 query correctly ;; AND testname in (SELECT testname FROM test_meta WHERE jobgroup=?) ;; AND NOT (uname = 'n/a' AND item_path = '');" + +;; tags: '("tag%" "tag2" "%ag6") +;; ;; done with run when: ;; 0 tests in LAUNCHED, NOT_STARTED, REMOTEHOSTSTART, RUNNING (define (db:estimated-tests-remaining dbstruct run-id) (db:with-db @@ -3469,10 +3481,28 @@ (set! res (cons (vector state status count) res))) db "SELECT state,status,count(state) FROM tests WHERE run_id=? AND testname=? AND item_path='' GROUP BY state,status;" run-id testname) res)) + + +(define (db:get-latest-host-load dbstruct raw-hostname) + (let* ((hostname (string-substitute "\\..*$" "" raw-hostname)) + (res (cons -1 0)) + (mydb (db:dbdat-get-db (db:get-db dbstruct 0))) + ) + (db:with-db + dbstruct + 0 + #f + (lambda (db) + (sqlite3:for-each-row + (lambda (cpuload update-time) (set! res (cons cpuload update-time))) + db + "SELECT tr.cpuload, tr.update_time FROM test_rundat tr, tests t WHERE t.host=? AND tr.cpuload != -1 AND tr.test_id=t.id ORDER BY tr.update_time DESC LIMIT 1;" + hostname))) res )) + (define (db:set-top-level-from-items dbstruct run-id testname) (let* ((dbdat (db:get-db dbstruct run-id)) (db (db:dbdat-get-db dbdat)) (summ (db:get-state-status-summary db run-id testname)) @@ -3609,10 +3639,29 @@ res)))) ;;====================================================================== ;; Tests meta data ;;====================================================================== + +;; returns a hash table of tags to tests +;; +(define (db:get-tests-tags dbstruct) + (let* ((dbdat (db:get-db dbstruct #f)) + (db (db:dbdat-get-db dbdat)) + (res (make-hash-table))) + (sqlite3:for-each-row + (lambda (testname tags-in) + (let ((tags (string-split tags-in ","))) + (for-each + (lambda (tag) + (hash-table-set! res tag + (delete-duplicates + (cons testname (hash-table-ref/default res tag '()))))) + tags))) + db + "SELECT testname,tags FROM test_meta") + res)) ;; read the record given a testname (define (db:testmeta-get-record dbstruct testname) (let ((res #f)) (db:with-db Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -217,11 +217,11 @@ (let* ((fullurl (if (vector? serverdat) (http-transport:server-dat-get-api-req serverdat) (begin (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") (exit 1)))) - (res #f) + (res (vector #f "uninitialized")) (success #t) (sparams (db:obj->string params transport: 'http))) (debug:print-info 11 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here (max-retry-attempts 1) @@ -383,30 +383,34 @@ (server-going #f)) (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) - + ;;(BB> "http-transport: top of loop; count="count" server-state="server-state" bad-sync-count="bad-sync-count" server-going="server-going) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* ;; Removed code is pasted below (keeping it around until we are clear it is not needed). ;; no *dbstruct-db* yet, set running after our first pass through and start the db (if (eq? server-state 'available) (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers (if (equal? new-server-id server-id) (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") + ;;(BB> "http-transport: ->dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") + ;;(BB> "http-transport: ->running") (server:write-dotserver *toppath* (conc iface ":" port)) - (delete-file* (conc *toppath* "/.starting-server"))) + (thread-start! *watchdog*) + (server:complete-attempt *toppath*)) (begin ;; gotta exit nicely + ;;(BB> "http-transport: ->collision") (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") (http-transport:server-shutdown server-id port)))))) - + ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) @@ -486,11 +490,12 @@ ;; (thread-sleep! rem-time) ;; (thread-sleep! 4))) ;; fallback for if the math is changed ... (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) - (debug:print-info 0 *default-log-port* "Starting to shutdown the server.") + ;;(BB> "http-transport:server-shutdown called") + (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up @@ -513,21 +518,19 @@ " ms") (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") ;; if the .server file contained :myport then we can remove it (server:remove-dotserver-file *toppath* port) + ;;(BB> "http-transport:server-shutdown -> exit") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) - (with-output-to-file - (conc *toppath* "/.starting-server") - (lambda () - (print (current-process-id) " on " (get-host-name)))) + (server:attempting-start *toppath*) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) @@ -539,11 +542,11 @@ (server:check-if-running run-id)) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) (begin ;; ok, no server detected, clean out any lingering records - (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) + (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin @@ -552,11 +555,11 @@ (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") - (delete-file* (conc *toppath* "/.starting-server")) + (server:complete-attempt *toppath*) )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") DELETED inteldate.scm Index: inteldate.scm ================================================================== --- inteldate.scm +++ /dev/null @@ -1,180 +0,0 @@ -(use srfi-19) -(use test) -(use format) -(use regex) -(declare (unit inteldate)) -;; utility procedures to convert among -;; different ways to express date (inteldate, seconds since epoch, isodate) -;; -;; samples: -;; isodate -> "2016-01-01" -;; inteldate -> "16ww01.5" -;; seconds -> 1451631600 - -;; procedures provided: -;; ==================== -;; seconds->isodate -;; seconds->inteldate -;; -;; isodate->seconds -;; isodate->inteldate -;; -;; inteldate->seconds -;; inteldate->isodate - -;; srfi-19 used extensively; this doc is better tha the eggref: -;; http://srfi.schemers.org/srfi-19/srfi-19.html - -;; Author: brandon.j.barclay@intel.com 16ww18.6 - -(define (date->seconds date) - (inexact->exact - (string->number - (date->string date "~s")))) - -(define (seconds->isodate seconds) - (let* ((date (seconds->date seconds)) - (result (date->string date "~Y-~m-~d"))) - result)) - -(define (isodate->seconds isodate) - "Takes a string input of the form 'YY-MM-DD' or 'YYYY-MM-DD' and returns epoch time; for YY, assume after Y2K" - (let* ((numlist (map string->number (string-split isodate "-"))) - (raw-year (car numlist)) - (year (if (< raw-year 100) (+ raw-year 2000) raw-year)) - (month (list-ref numlist 1)) - (day (list-ref numlist 2)) - (date (make-date 0 0 0 0 day month year)) - (seconds (date->seconds date))) - - seconds)) - -;; adapted from perl Intel::WorkWeek perl module -;; intel year consists of numbered weeks starting from week 1 -;; week 1 is the week containing jan 1 of the year -;; days of week are numbered starting from 0 on sunday -;; intel year does not match calendar year in workweek 1 -;; before jan1. -(define (seconds->inteldate-values seconds) - (define (date-difference->seconds d1 d2) - (- (date->seconds d1) (date->seconds d2))) - - (let* ((thisdate (seconds->date seconds)) - (thisdow (string->number (date->string thisdate "~w"))) - - (year (date-year thisdate)) - ;; intel workweek 1 begins on sunday of week containing jan1 - (jan1 (make-date 0 0 0 0 1 1 year)) - (jan1dow (date-week-day jan1)) - (ww01 (date-subtract-duration jan1 (seconds->time (* 60 60 24 jan1dow)))) - - (ww01_delta_seconds (date-difference->seconds thisdate ww01)) - (wwnum_initial (inexact->exact (add1 (floor (/ ww01_delta_seconds 24 3600 7) )))) - - ;; we could be in ww1 of next year - (this-saturday (seconds->date - (+ seconds - (* 60 60 24 (- 6 thisdow))))) - (this-week-ends-next-year? - (> (date-year this-saturday) year)) - (intelyear - (if this-week-ends-next-year? - (add1 year) - year)) - (intelweek - (if this-week-ends-next-year? - 1 - wwnum_initial))) - (values intelyear intelweek thisdow))) - -(define (seconds->inteldate seconds) - (define (string-leftpad in width pad-char) - (let* ((unpadded-str (->string in)) - (padlen_temp (- width (string-length unpadded-str))) - (padlen (if (< padlen_temp 0) 0 padlen_temp)) - (padding - (fold conc "" - (map (lambda (x) (->string pad-char)) (iota padlen))))) - (conc padding unpadded-str))) - (define (zeropad num width) - (string-leftpad num width #:0)) - - (let-values (((intelyear intelweek day-of-week-num) - (seconds->inteldate-values seconds))) - (let ((intelyear-str - (zeropad - (->string - (if (> intelyear 1999) - (- intelyear 2000) intelyear)) - 2)) - (intelweek-str - (zeropad (->string intelweek) 2)) - (dow-str (->string day-of-week-num))) - (conc intelyear-str "ww" intelweek-str "." dow-str)))) - -(define (isodate->inteldate isodate) - (seconds->inteldate - (isodate->seconds isodate))) - -(define (inteldate->seconds inteldate) - (let ((match (string-match "^(\\d+)ww(\\d+).(\\d)$" inteldate))) - (if - (not match) - #f - (let* ( - (intelyear-raw (string->number (list-ref match 1))) - (intelyear (if (< intelyear-raw 100) - (+ intelyear-raw 2000) - intelyear-raw)) - (intelww (string->number (list-ref match 2))) - (dayofweek (string->number (list-ref match 3))) - - (day-of-seconds (* 60 60 24 )) - (week-of-seconds (* day-of-seconds 7)) - - - ;; get seconds at ww1.0 - (new-years-date (make-date 0 0 0 0 1 1 intelyear)) - (new-years-seconds - (date->seconds new-years-date)) - (new-years-dayofweek (date-week-day new-years-date)) - (ww1.0_seconds (- new-years-seconds - (* day-of-seconds - new-years-dayofweek))) - (workweek-adjustment (* week-of-seconds (sub1 intelww))) - (weekday-adjustment (* dayofweek day-of-seconds)) - - (result (+ ww1.0_seconds workweek-adjustment weekday-adjustment))) - result)))) - -(define (inteldate->isodate inteldate) - (seconds->isodate (inteldate->seconds inteldate))) - -(define (inteldate-tests) - (test-group - "date conversion tests" - (let ((test-table - '(("16ww01.5" . "2016-01-01") - ("16ww18.5" . "2016-04-29") - ("1999ww33.5" . "1999-08-13") - ("16ww18.4" . "2016-04-28") - ("16ww18.3" . "2016-04-27") - ("13ww01.0" . "2012-12-30") - ("13ww52.6" . "2013-12-28") - ("16ww53.3" . "2016-12-28")))) - (for-each - (lambda (test-pair) - (let ((inteldate (car test-pair)) - (isodate (cdr test-pair))) - (test - (conc "(isodate->inteldate "isodate ") => "inteldate) - inteldate - (isodate->inteldate isodate)) - - (test - (conc "(inteldate->isodate "inteldate ") => "isodate) - isodate - (inteldate->isodate inteldate)))) - test-table)))) - -;(inteldate-tests) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -122,10 +122,22 @@ (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 (pid (process-run "/bin/bash" (list "-c" cmd)))) + + (with-output-to-file "Makefile.ezsteps" + (lambda () + (print stepname ".log :") + (print "\t" cmd) + (if (file-exists? (conc stepname ".logpro")) + (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) + (print) + (print stepname " : " stepname ".log") + (print)) + #:append) + (rmt:test-set-top-process-pid run-id test-id pid) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) @@ -268,11 +280,11 @@ ;; do all the ezsteps (if any) (if ezsteps (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic ;; ezstep names need a full re-eval here. - (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs))) + (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) (ezstepslst (if (hash-table? testconfig) (hash-table-ref/default testconfig "ezsteps" '()) #f))) (if testconfig (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... @@ -316,15 +328,15 @@ (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) - (cpu-load (get-cpu-load)) + (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory)))) - (let ((new-cpu-load (let* ((load (get-cpu-load)) + (let ((new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) - (if (> delta 0.6) ;; don't bother updating with small changes + (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (get-df (current-directory))) (delta (abs (- df disk-free)))) (if (> delta 200) ;; ignore changes under 200 Meg @@ -846,11 +858,14 @@ (directory-exists? *toppath*)) (begin (setenv "MT_RUN_AREA_HOME" *toppath*) (setenv "MT_TESTSUITE_NAME" (common:get-testsuite-name))) (begin - (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) + (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") + ;;(exit 1) + #f + )) *toppath*)) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) @@ -861,11 +876,11 @@ (if res (cdr res) (begin (if (common:low-noise-print 20 "No valid disks or no disk with enough space") (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) - (exit 1))))))) + (exit 1))))))) ;; TODO - move the exit to the calling location and return #f ;; Desired directory structure: ;; ;; - - -. ;; | @@ -1053,30 +1068,35 @@ ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) + (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ((item-path (item-list->path itemdat))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (string->number (or (configf:lookup *configdat* "setup" "launch-delay") "5")))) (if (> launch-delay delta) (begin (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds") (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) - (set! *last-launch* (current-seconds)) (change-directory *toppath*) (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) - (list - (list "MT_RUN_AREA_HOME" *toppath*) - (list "MT_TEST_NAME" test-name) - (list "MT_RUNNAME" runname) - (list "MT_ITEMPATH" item-path) - )) - (let* ((tregistry (tests:get-all)) - (tconfig (or (tests:get-testconfig test-name tregistry #t force-create: #t) - test-conf)) ;; force re-read now that all vars are set + (append + (list + (list "MT_RUN_AREA_HOME" *toppath*) + (list "MT_TEST_NAME" test-name) + (list "MT_RUNNAME" runname) + (list "MT_ITEMPATH" item-path) + ) + itemdat)) + (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed + ;; for tconfig, why do we allow fallback to test-conf? + (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) + (begin + (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") + test-conf))) ;; force re-read now that all vars are set (useshell (let ((ush (config-lookup *configdat* "jobtools" "useshell"))) (if ush (if (equal? ush "no") ;; must use "no" to NOT use shell #f ush) @@ -1112,11 +1132,10 @@ (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) - ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) (if launcher (set! launcher (string-split launcher))) @@ -1130,10 +1149,11 @@ ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "LAUNCHED" #f) + ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) @@ -1184,10 +1204,11 @@ ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) + (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. (let* ((commonprevvals (alist->env-vars (hash-table-ref/default *configdat* "env-override" '()))) (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" (append (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) @@ -1211,10 +1232,11 @@ (conc cmdstr " >> mt_launch.log 2>&1"))) (car fullcmd)) (if useshell '() (cdr fullcmd))))) + (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6302) +(define megatest-version 1.6303) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -93,10 +93,12 @@ -reqtarg key1/key2/... : run for key1, key2, etc. but key1/key2 must be in runconfig -testpatt patt1/patt2,patt3/... : % is wildcard -runname : required, name for this particular test run -state : Applies to runs, tests or steps depending on context -status : Applies to runs, tests or steps depending on context + -mode key : load testpatt from in runconfigs instead of default TESTPATT + -tagexpr tag1,tag2%,.. : select tests with tags matching expression Test helpers (for use inside tests) -step stepname -test-status : set the state and status of a test (use :state and :status) -setlog logfname : set the path/filename to the final log relative to the test @@ -209,11 +211,13 @@ ":state" "-state" ":status" "-status" "-list-runs" - "-testpatt" + "-testpatt" + "-mode" + "-tagexpr" "-itempatt" "-setlog" "-set-toplog" "-runstep" "-logpro" @@ -345,11 +349,13 @@ ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) -(thread-start! *watchdog*) +(if (not (args:get-arg "-server")) + (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog +;;(BB> "thread-start! watchdog") (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log")) (set! *default-log-port* oup))) @@ -1851,13 +1857,11 @@ (begin (if (not (launch:setup)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) - ;; now can find our db - ;; keep this one local - (open-run-close runs:update-all-test_meta #f) + (runs:update-all-test_meta #f) (set! *didsomething* #t))) ;;====================================================================== ;; Start a repl ;;====================================================================== @@ -1986,17 +1990,22 @@ ;;====================================================================== ;; Exit and clean up ;;====================================================================== -(if *runremote* (close-all-connections!)) ;; for http-client - (if (not *didsomething*) (debug:print 0 *default-log-port* help)) +;;(BB> "thread-join! watchdog") + +;; join the watchdog thread if it has been thread-start!ed (it may not have been started in the case of a server that never enters running state) +;; (symbols returned by thread-state: created ready running blocked suspended sleeping terminated dead) +(if (thread? *watchdog*) + (case (thread-state *watchdog*) + ((ready running blocked sleeping terminated dead) + (thread-join! *watchdog*)))) (set! *time-to-exit* #t) -(thread-join! *watchdog*) (if (not (eq? *globalexitstatus* 0)) (if (or (args:get-arg "-run")(args:get-arg "-runtests")(args:get-arg "-runall")) (begin (debug:print 0 *default-log-port* "NOTE: Subprocesses with non-zero exit code detected: " *globalexitstatus*) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -24,18 +24,10 @@ ;; generate entries for ~/.megatestrc with the following ;; ;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u -(defstruct remote - (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f)) - (last-server-check 0) ;; last time we checked to see if the server was alive - (conndat #f) - (transport *transport-type*) - (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds - ;;====================================================================== ;; S U P P O R T F U N C T I O N S ;;====================================================================== ;; if a server is either running or in the process of starting call client:setup @@ -94,23 +86,38 @@ ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) + + ;; on homehost and this is a write, we already have a server, but server has died + ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost + (not (member cmd api:read-only-queries)) ;; this is a write + (remote-server-url *runremote*) ;; have a server + (not (server:read-dotserver *toppath*))) ;; server has died. + (set! *runremote* #f) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") + (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) - ;; on homehost and this is a write, we have a server (we know because case 4 checked) - ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost - (not (member cmd api:read-only-queries))) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - (rmt:open-qry-close-locally cmd 0 params)) + + ;; commented by bb; this was blocking server passive start on write on homehost (case 5) + ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked) + ;; ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost + ;; (not (member cmd api:read-only-queries))) + ;; (mutex-unlock! *rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") + ;; (rmt:open-qry-close-locally cmd 0 params)) + + ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call @@ -257,11 +264,11 @@ ;; (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) - (set! *db-last-write* start-time) ;; the oldest "write" + (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) @@ -320,10 +327,15 @@ ;; added run-id to make looking up the correct db possible ;; (define (rmt:general-call stmtname run-id . params) (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) + +;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host +(define (rmt:get-latest-host-load hostname) + (rmt:send-receive 'get-latest-host-load 0 (list hostname))) + ;; (define (rmt:sync-inmem->db run-id) ;; (rmt:send-receive 'sync-inmem->db run-id '())) (define (rmt:sdb-qry qry val run-id) ;; add caching if qry is 'getid or 'getstr @@ -330,10 +342,17 @@ (rmt:send-receive 'sdb-qry run-id (list qry val))) ;; NOT COMPLETED (define (rmt:runtests user run-id testpatt params) (rmt:send-receive 'runtests run-id testpatt)) + +;;====================================================================== +;; T E S T M E T A +;;====================================================================== + +(define (rmt:get-tests-tags) + (rmt:send-receive 'get-tests-tags #f '())) ;;====================================================================== ;; K E Y S ;;====================================================================== @@ -345,10 +364,15 @@ (define (rmt:get-keys) (if *db-keys* *db-keys* (let ((res (rmt:send-receive 'get-keys #f '()))) (set! *db-keys* res) res))) + +(define (rmt:get-keys-write) ;; dummy query to force server start + (let ((res (rmt:send-receive 'get-keys-write #f '()))) + (set! *db-keys* res) + res)) ;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe ;; to cache the resuls in a hash ;; (define (rmt:get-key-vals run-id) Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -176,11 +176,11 @@ (if ping-res (let ((server-dat (list iface port #f #f #f))) (hash-table-set! *runremote* run-id server-dat) server-dat) (begin - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))) (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id))) (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if server-db-info @@ -191,15 +191,15 @@ (if start-res (begin (hash-table-set! *runremote* run-id server-dat) server-dat) (begin - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))) (begin - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))))))) ;; ;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f))) ;; (if (and port Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1958,10 +1958,23 @@ (if (and val (not (equal? (vector-ref currrecord idx) val))) (begin (print "Updating " test-name " " fld " to " val) (rmt:testmeta-update-field test-name fld val))))) '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10))))) + +;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..." +;; +(define (runs:get-tests-matching-tags tagpatt) + (let* ((tagdata (rmt:get-tests-tags)) + (res '())) ;; list of tests that match one or more tags + (for-each + (lambda (tag) + (if (patt-list-match tag tagpatt) + (set! res (append (hash-table-ref tagdata tag))))) + (hash-table-keys tagdata)) + res)) + ;; Update test_meta for all tests (define (runs:update-all-test_meta db) (let ((test-names (tests:get-all))) ;; (tests:get-valid-tests))) (for-each ADDED sample-sauth-paths.scm Index: sample-sauth-paths.scm ================================================================== --- /dev/null +++ sample-sauth-paths.scm @@ -0,0 +1,4 @@ +(define *db-path* "/path/to/db") +(define *exe-path* "/path/to/store/suids") +(define *exe-src* "/path/to/spublish/and/sretrieve/executables") +(define *sauth-path* "/path/to/production/sauthorize/exe") ADDED sauth-common.scm Index: sauth-common.scm ================================================================== --- /dev/null +++ sauth-common.scm @@ -0,0 +1,263 @@ + +;; Create the sqlite db +(define (sauthorize:db-do proc) + (if (or (not *db-path*) + (not (file-exists? *db-path*))) + (begin + (print 0 "[database]\nlocation " *db-path* " \n\n Is missing from the config file!") + (exit 1))) + (if (and *db-path* + (directory? *db-path*) + (file-read-access? *db-path*)) + (let* ((dbpath (conc *db-path* "/sauthorize.db")) + (writeable (file-write-access? dbpath)) + (dbexists (file-exists? dbpath))) + (handle-exceptions + exn + (begin + (debug:print 2 "ERROR: problem accessing db " dbpath + ((condition-property-accessor 'exn 'message) exn)) + (exit 1)) + ; (print "calling proc " proc "db path " dbpath ) + (call-with-database + dbpath + (lambda (db) + ;(print 0 "calling proc " proc " on db " db) + (set-busy-handler! db (busy-timeout 10000)) ;; 10 sec timeout + (if (not dbexists)(sauthorize:initialize-db db)) + (proc db))))) + (print 0 "ERROR: invalid path for storing database: " *db-path*))) + +;;execute a query +(define (sauthorize:db-qry db qry) + (exec (sql db qry))) + + +(define (sauthorize:do-as-calling-user proc) + (let ((eid (current-effective-user-id)) + (cid (current-user-id))) + (if (not (eq? eid cid)) ;; running suid + (set! (current-effective-user-id) cid)) + ;(print 0 "cid " cid " eid:" eid) + (proc) + (if (not (eq? eid cid)) + (set! (current-effective-user-id) eid)))) + + +(define (run-cmd cmd arg-list) + ; (print (current-effective-user-id)) + ;(handle-exceptions +; exn +; (print 0 "ERROR: failed to run script " cmd " with params " arg-list " " (exn assert)) + (let ((pid (process-run cmd arg-list))) + (process-wait pid)) +) +;) + + +(define (regster-log inl usr-id area-id cmd) + (sauth-common:shell-do-as-adm + (lambda () + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "INSERT INTO actions (cmd,user_id,area_id,action_type ) VALUES ('sretrieve " inl "'," usr-id "," area-id ", 'cat' )"))))))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Check user types +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + +;;check if a user is an admin +(define (is-admin username) + (let* ((admin #f)) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT users.is_admin FROM users where users.username = '" username "'"))))) + (if (not (null? data-row)) + (let ((col (car data-row))) + (if (equal? col "yes") + (set! admin #t))))))) +admin)) + + +;;check if a user is an read-admin +(define (is-read-admin username) + (let* ((admin #f)) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT users.is_admin FROM users where users.username = '" username "'"))))) + (if (not (null? data-row)) + (let ((col (car data-row))) + (if (equal? col "read-admin") + (set! admin #t))))))) +admin)) + + +;;check if user has specifc role for a area +(define (is-user role username area) + (let* ((has-access #f)) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT permissions.access_type, permissions.expiration FROM users , areas, permissions where permissions.user_id = users.id and permissions.area_id = areas.id and users.username = '" username "' and areas.code = '" area "'"))))) + (if (not (null? data-row)) + (begin + (let* ((access-type (car data-row)) + (exdate (cadr data-row))) + (if (not (null? exdate)) + (begin + (let ((valid (is-access-valid exdate))) + ;(print valid) + (if (and (equal? access-type role) + (equal? valid #t)) + (set! has-access #t)))) + (print "Access expired")))))))) + ;(print has-access) +has-access)) + +(define (is-access-valid exp-str) + (let* ((ret-val #f ) + (date-parts (string-split exp-str "/")) + (yr (string->number (car date-parts))) + (month (string->number(car (cdr date-parts)))) + (day (string->number(caddr date-parts))) + (exp-date (make-date 0 0 0 0 day month yr ))) + ;(print exp-date) + ;(print (current-date)) + (if (> (date-compare exp-date (current-date)) 0) + (set! ret-val #t)) + ;(print ret-val) + ret-val)) + + +;check if area exists +(define (area-exists area) + (let* ((area-defined #f)) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT id FROM areas where areas.code = '" area "'"))))) + (if (not (null? data-row)) + (set! area-defined #t))))) +area-defined)) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Get Record from database +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;gets area id by code +(define (get-area area) + (let* ((area-defined '())) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT id FROM areas where areas.code = '" area "'"))))) + (set! area-defined data-row)))) +area-defined)) + +;get id of users table by user name +(define (get-user user) + (let* ((user-defined '())) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT id FROM users where users.username = '" user "'"))))) + (set! user-defined data-row)))) +user-defined)) + +;get permissions id by userid and area id +(define (get-perm userid areaid) + (let* ((user-defined '())) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT id FROM permissions where user_id = " userid " and area_id = " areaid))))) + (set! user-defined data-row)))) + +user-defined)) + +(define (get-restrictions base-path usr) +(let* ((user-defined '())) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT restriction FROM areas, users, permissions where areas.id = permissions.area_id and users.id = permissions.user_id and users.username = '" usr "' and areas.basepath = '" base-path "'"))))) + ;(print data-row) + (set! user-defined data-row)))) + ; (print user-defined) + (if (null? user-defined) + "" + (car user-defined)))) + + +(define (get-obj-by-path path) + (let* ((obj '())) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT code,exe_name, id, basepath FROM areas where areas.basepath = '" path "'"))))) + (set! obj data-row)))) +obj)) + +(define (get-obj-by-code code ) + (let* ((obj '())) + (sauthorize:db-do (lambda (db) + (let* ((data-row (query fetch (sql db (conc "SELECT code, exe_name, id, basepath FROM areas where areas.code = '" code "'"))))) + (set! obj data-row)))) +;(print obj) +obj)) + + + +;; function to validate the users input for target path and resolve the path +;; TODO: Check for restriction in subpath +(define (sauth-common:resolve-path new current allowed-sheets) + (let* ((target-path (append current (string-split new "/"))) + (target-path-string (string-join target-path "/")) + (normal-path (normalize-pathname target-path-string)) + (normal-list (string-split normal-path "/")) + (ret '())) + (if (string-contains normal-path "..") + (begin + (print "ERROR: Path " new " resolved outside target area ") + #f) + (if(equal? normal-path ".") + ret + (if (not (member (car normal-list) allowed-sheets)) + (begin + (print "ERROR: Permision denied to " new ) + #f) + normal-list))))) + +(define (sauth-common:get-target-path base-path-list ext-path top-areas base-path) + (let* ((resolved-path (sauth-common:resolve-path ext-path base-path-list top-areas )) + (usr (current-user-name) ) ) + (if (not (equal? resolved-path #f)) + (if (null? resolved-path) + #f + (let* ((sheet (car resolved-path)) + (restricted-areas (get-restrictions base-path usr)) + (restrictions (conc ".*" (string-join (string-split restricted-areas ",") ".*|.*") ".*")) + (target-path (if (null? (cdr resolved-path)) + base-path + (conc base-path "/" (string-join (cdr resolved-path) "/"))))) + ; (print restricted-areas) + (if (and (not (equal? restricted-areas "" )) + (string-match (regexp restrictions) target-path)) + (begin + (print "Access denied to " (string-join resolved-path "/")) + ;(exit 1) + #f) + target-path))) + #f))) + +(define (sauth-common:shell-ls-cmd base-path-list ext-path top-areas base-path tail-cmd-list) + (if (and (null? base-path-list) (equal? ext-path "") ) + (print (string-intersperse top-areas " ")) + (let* ((resolved-path (sauth-common:resolve-path ext-path base-path-list top-areas ))) + ;(print resolved-path) + (if (not (equal? resolved-path #f)) + (if (null? resolved-path) + (print (string-intersperse top-areas " ")) + (let* ((target-path (sauth-common:get-target-path base-path-list ext-path top-areas base-path))) + (print target-path) + (if (not (equal? target-path #f)) + (begin + (cond + ((null? tail-cmd-list) + (run (pipe + (ls "-lrt" ,target-path)))) + ((not (equal? (car tail-cmd-list) "|")) + (print "ls cmd cannot accept " (string-join tail-cmd-list) " as an argument!!")) + (else + (run (pipe + (ls "-lrt" ,target-path) + (begin (system (string-join (cdr tail-cmd-list)))))) + ) +))) +)))))) + ADDED sauthorize.scm Index: sauthorize.scm ================================================================== --- /dev/null +++ sauthorize.scm @@ -0,0 +1,544 @@ + +;; Copyright 2006-2013, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +(use defstruct) +(use scsh-process) + +(use srfi-18) +(use srfi-19) +(use refdb) + +(use sql-de-lite srfi-1 posix regex regex-case srfi-69) +(declare (uses common)) + +(declare (uses configf)) +(declare (uses margs)) +(declare (uses megatest-version)) + +(include "megatest-fossil-hash.scm") +;;; please create this file before using sautherise. For sample file is avaliable sample-sauth-paths.scm. +(include "sauth-paths.scm") +(include "sauth-common.scm") + +;; +;; GLOBALS +;; +(define *verbosity* 1) +(define *logging* #f) +(define *exe-name* (pathname-file (car (argv)))) +(define *sretrieve:current-tab-number* 0) +(define *args-hash* (make-hash-table)) +(define sauthorize:help (conc "Usage: " *exe-name* " [action [params ...]] + + list : list areas $USER's can access + log : get listing of recent activity. + sauth list-area-user : list the users that can access the area. + sauth open --group : Open up an area. User needs to be the owner of the area to open it. + --code + --retrieve|--publish + sauth grant --area : Grant permission to read or write to a area that is alrady opend up. + --expiration yyyy/mm/dd --retrieve|--publish + [--restrict ] + sauth read-shell : Open sretrieve shell for reading. + sauth write-shell : Open spublish shell for writing. + +Part of the Megatest tool suite. +Learn more at http://www.kiatoa.com/fossils/megatest + +Version: " megatest-fossil-hash)) ;; " + +;;====================================================================== +;; RECORDS +;;====================================================================== + +;;====================================================================== +;; DB +;;====================================================================== + +;; replace (strftime('%s','now')), with datetime('now')) +(define (sauthorize:initialize-db db) + (for-each + (lambda (qry) + (exec (sql db qry))) + (list + "CREATE TABLE IF NOT EXISTS actions + (id INTEGER PRIMARY KEY, + cmd TEXT NOT NULL, + user_id INTEGER NOT NULL, + datetime TIMESTAMP DEFAULT (datetime('now','localtime')), + area_id INTEGER NOT NULL, + comment TEXT DEFAULT '' NOT NULL, + action_type TEXT NOT NULL);" + "CREATE TABLE IF NOT EXISTS users + (id INTEGER PRIMARY KEY, + username TEXT NOT NULL, + is_admin TEXT NOT NULL, + datetime TIMESTAMP DEFAULT (datetime('now','localtime')) + );" + "CREATE TABLE IF NOT EXISTS areas + (id INTEGER PRIMARY KEY, + basepath TEXT NOT NULL, + code TEXT NOT NULL, + exe_name TEXT NOT NULL, + datetime TIMESTAMP DEFAULT (datetime('now','localtime')) + );" + "CREATE TABLE IF NOT EXISTS permissions + (id INTEGER PRIMARY KEY, + access_type TEXT NOT NULL, + user_id INTEGER NOT NULL, + datetime TIMESTAMP DEFAULT (datetime('now','localtime')), + area_id INTEGER NOT NULL, + restriction TEXT DEFAULT '' NOT NULL, + expiration TIMESTAMP DEFAULT NULL);" + ))) + + + + +(define (get-access-type args) + (let loop ((hed (car args)) + (tal (cdr args))) + (cond + ((equal? hed "--retrieve") + "retrieve") + ((equal? hed "--publish") + "publish") + ((equal? hed "--area-admin") + "area-admin") + ((equal? hed "--writer-admin") + "writer-admin") + ((equal? hed "--read-admin") + "read-admin") + + ((null? tal) + #f) + (else + (loop (car tal)(cdr tal)))))) + + + +;; check if user can gran access to an area +(define (can-grant-perm username access-type area) + (let* ((isadmin (is-admin username)) + (is-area-admin (is-user "area-admin" username area )) + (is-read-admin (is-user "read-admin" username area) ) + (is-writer-admin (is-user "writer-admin" username area) ) ) + (cond + ((equal? isadmin #t) + #t) + ((equal? is-area-admin #t ) + #t) + ((and (equal? is-writer-admin #t ) (equal? access-type "retrieve")) + #t) + ((and (equal? is-read-admin #t ) (equal? access-type "retrieve")) + #t) + + (else + #f)))) + +(define (sauthorize:list-areausers area ) + (sauthorize:db-do (lambda (db) + (print "Users having access to " area ":") + (query (for-each-row + (lambda (row) + (let* ((exp-date (cadr row))) + (if (is-access-valid exp-date) + (apply print (intersperse row " | ")))))) + (sql db (conc "SELECT users.username, permissions.expiration, permissions.access_type FROM users, areas, permissions where permissions.user_id = users.id and permissions.area_id = areas.id and areas.code = '" area "'")))))) + + + + +; check if executable exists +(define (exe-exist exe access-type) + (let* ((filepath (conc *exe-path* "/" access-type "/" exe))) + ; (print filepath) + (if (file-exists? filepath) + #t + #f))) + +(define (copy-exe access-type exe-name group) + (run-cmd "/bin/chmod" (list "g+w" (conc *exe-path* "/" access-type))) + (let* ((spath (conc *exe-src* "/s" access-type)) + (dpath (conc *exe-path* "/" access-type "/" exe-name))) + (sauthorize:do-as-calling-user + (lambda () + (run-cmd "/bin/cp" (list spath dpath )) + (if (equal? access-type "publish") + (run-cmd "/bin/chmod" (list "u+s,o+rx" dpath)) + (begin + (if (equal? group "none") + (run-cmd "/bin/chmod" (list "u+s,o+rx" dpath)) + (begin + (run-cmd "/bin/chgrp" (list group dpath)) + (run-cmd "/bin/chmod" (list "g+s,o+rx" dpath)))))))) + (run-cmd "chmod" (list "g-w" (conc *exe-path* "/" access-type))))) + +(define (get-exe-name path group) + (let ((name "")) + (sauthorize:do-as-calling-user + (lambda () + (if (equal? (current-effective-user-id) (file-owner path)) + (set! name (conc (current-user-name) "_" group)) + (begin + (print "You cannot open areas that you dont own!!") + (exit 1))))) +name)) + +;check if a paths/codes are vaid and if area is alrady open +(define (open-area group path code access-type) + (let* ((exe-name (get-exe-name path group)) + (path-obj (get-obj-by-path path)) + (code-obj (get-obj-by-code code))) + ;(print path-obj) + (cond + ((not (null? path-obj)) + (if (equal? code (car path-obj)) + (begin + (if (equal? exe-name (cadr path-obj)) + (begin + (if (not (exe-exist exe-name access-type)) + (copy-exe access-type exe-name group) + (begin + (print "Area already open!!") + (exit 1)))) + (begin + (if (not (exe-exist exe-name access-type)) + (copy-exe access-type exe-name group)) + ;; update exe-name in db + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "update areas set exe_name = '" exe-name "' where id = " (caddr path-obj))))) + ))) + (begin + (print "Path " path " is registered with --code " (car path-obj) ". To open this area please execute following cmd: \n sauthorize open " path " --group " group " --code " (car path-obj) " --" access-type ) + (exit 1)))) + + ((not (null? code-obj)) + (print "Code " code " is used for diffrent path. Please try diffrent value of --code" ) + (exit 1)) + (else + ; (print (exe-exist exe-name access-type)) + (if (not (exe-exist exe-name access-type)) + (copy-exe access-type exe-name group)) + (sauthorize:db-do (lambda (db) + ;(print (conc "insert into areas (code, basepath, exe_name) values ('" code "', '" path "', '" exe-name "') ")) + (sauthorize:db-qry db (conc "insert into areas (code, basepath, exe_name) values ('" code "', '" path "', '" exe-name "') ")))))))) + +(define (user-has-open-perm user path access) + (let* ((has-access #f) + (eid (current-user-id))) + (cond + ((is-admin user) + (set! has-access #t )) + ((and (is-read-admin user) (equal? access "retrieve")) + (set! has-access #t )) + (else + (print "User " user " does not have permission to open areas"))) + has-access)) + + +;;check if user has group access +(define (is-group-washed req_grpid current-grp-list) + (let loop ((hed (car current-grp-list)) + (tal (cdr current-grp-list))) + (cond + ((equal? hed req_grpid) + #t) + ((null? tal) + #f) + (else + (loop (car tal)(cdr tal)))))) + +;create executables with appropriate suids +(define (sauthorize:open user path group code access-type) + (let* ((gpid (group-information group)) + (req_grpid (if (equal? group "none") + group + (if (equal? gpid #f) + #f + (caddr gpid)))) + (current-grp-list (get-groups)) + (valid-grp (if (equal? group "none") + group + (is-group-washed req_grpid current-grp-list)))) + (if (and (not (equal? group "none")) (equal? valid-grp #f )) + (begin + (print "Group " group " is not washed in the current xterm!!") + (exit 1)))) + (if (not (file-write-access? path)) + (begin + (print "You can open areas owned by yourself. You do not have permissions to open path." path) + (exit 1))) + (if (user-has-open-perm user path access-type) + (begin + ;(print "here") + (open-area group path code access-type) + (sauthorize:grant user user code "2017/12/25" "read-admin" "") + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "INSERT INTO actions (cmd,user_id,area_id,action_type ) VALUES ('sauthorize open " path " --code " code " --group " group " --" access-type "'," (car (get-user user)) "," (car (get-area code)) ", 'open' )")))) + (print "Area has " path " been opened for " access-type )))) + +(define (sauthorize:grant auser guser area exp-date access-type restrict) + ; check if user exist + (let* ((area-obj (get-area area)) + (auser-obj (get-user auser)) + (user-obj (get-user guser))) + + (if (null? user-obj) + (begin + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "insert into users (username, is_admin) values ('" guser "', 'no') ")))) + (set! user-obj (get-user guser)))) + (let* ((perm-obj (get-perm (car user-obj) (car area-obj)))) + (if(null? perm-obj) + (begin + ;; insert permissions + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "insert into permissions (access_type, user_id, area_id, restriction, expiration ) values ('" access-type "', " (car user-obj) ", " (car area-obj) ", '" restrict "', '" exp-date "')"))))) + (begin + ;update permissions + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "update permissions set access_type = '" access-type "' , restriction = '" restrict "', expiration = '" exp-date "' where user_id = " (car user-obj) " and area_id = " (car area-obj))))))) + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "INSERT INTO actions (cmd,user_id,area_id,action_type ) VALUES ('sauthorize grant " guser " --area " area " --expiration " exp-date " --" access-type " --restrict " restrict "'," (car auser-obj) "," (car area-obj) ", 'grant' )")))) + (print "Permission has been sucessfully granted to user " guser)))) + +(define (sauthorize:process-action username action . args) + (case (string->symbol action) + ((grant) + (if (< (length args) 6) + (begin + (print "ERROR: Missing arguments; " (string-intersperse args ", ")) + (exit 1))) + (let* ((remargs (args:get-args args '("--area" "--expiration" "--restrict") '() args:arg-hash 0)) + (guser (car args)) + (restrict (or (args:get-arg "--restrict") "")) + (area (or (args:get-arg "--area") "")) + (exp-date (or (args:get-arg "--expiration") "")) + (access-type (get-access-type remargs))) + ; (print "version " guser " restrict " restrict ) + ; (print "area " area " exp-date " exp-date " access-type " access-type) + (cond + ((equal? guser "") + (print "Username not found!! Try \"sauthorize help\" for useage ") + (exit 1)) + ((equal? area "") + (print "Area not found!! Try \"sauthorize help\" for useage ") + (exit 1)) + ((equal? access-type #f) + (print "Access type not found!! Try \"sauthorize help\" for useage ") + (exit 1)) + ((equal? exp-date "") + (print "Date of expiration not found!! Try \"sauthorize help\" for useage ") + (exit 1))) + (if (not (area-exists area)) + (begin + (print "Area does not exisit!!") + (exit 1))) + (if (can-grant-perm username access-type area) + (begin + (print "calling sauthorize:grant ") + (sauthorize:grant username guser area exp-date access-type restrict)) + (begin + (print "User " username " does not have permission to grant permissions to area " area "!!") + (exit 1))))) + ((list-area-user) + (if (not (equal? (length args) 1)) + (begin + (print "Missing argument area code to list-area-user ") + (exit 1))) + (let* ((area (car args))) + (if (not (area-exists area)) + (begin + (print "Area does not exisit!!") + (exit 1))) + + (sauthorize:list-areausers area ) + )) + ((read-shell) + (if (not (equal? (length args) 1)) + (begin + (print "Missing argument area code to read-shell ") + (exit 1))) + (let* ((area (car args)) + (code-obj (get-obj-by-code area))) + (if (or (null? code-obj) + (not (exe-exist (cadr code-obj) "retrieve"))) + (begin + (print "Area " area " is not open for reading!!") + (exit 1))) + (sauthorize:do-as-calling-user + (lambda () + (run-cmd (conc *exe-path* "/retrieve/" (cadr code-obj) ) (list "shell" area )))))) + ((write-shell) + (if (not (equal? (length args) 1)) + (begin + (print "Missing argument area code to read-shell ") + (exit 1))) + (let* ((area (car args)) + (code-obj (get-obj-by-code area))) + (if (or (null? code-obj) + (not (exe-exist (cadr code-obj) "publish"))) + (begin + (print "Area " area " is not open for Writing!!") + (exit 1))) + (sauthorize:do-as-calling-user + (lambda () + (run-cmd (conc *exe-path* "/publish/" (cadr code-obj) ) (list "shell" area)))))) + ((publish) + (if (< (length args) 2) + (begin + (print "Missing argument to publish. \n publish [opts] ") + (exit 1))) + (let* ((action (car args)) + (area (cadr args)) + (cmd-args (cddr args)) + (code-obj (get-obj-by-code area))) + (if (or (null? code-obj) + (not (exe-exist (cadr code-obj) "publish"))) + (begin + (print "Area " area " is not open for writing!!") + (exit 1))) + (sauthorize:do-as-calling-user + (lambda () + (run-cmd (conc *exe-path* "/publish/" (cadr code-obj) ) (append (list action area ) cmd-args)))))) + + ((retrieve) + (if (< (length args) 2) + (begin + (print "Missing argument to publish. \n publish [opts] ") + (exit 1))) + (let* ((action (car args)) + (area (cadr args)) + (cmd-args (cddr args)) + (code-obj (get-obj-by-code area))) + (if (or (null? code-obj) + (not (exe-exist (cadr code-obj) "retrieve"))) + (begin + (print "Area " area " is not open for reading!!") + (exit 1))) + (sauthorize:do-as-calling-user + (lambda () + (run-cmd (conc *exe-path* "/retrieve/" (cadr code-obj) ) (append (list action area ) cmd-args)))))) + + + + ((open) + (if (< (length args) 6) + (begin + (print "sauthorize open cmd takes 6 arguments!! \n Useage: sauthorize open --group --code --retrieve|--publish") + (exit 1))) + (let* ((remargs (args:get-args args '("--group" "--code") '() args:arg-hash 0)) + (path (car args)) + (group (or (args:get-arg "--group") "")) + (area (or (args:get-arg "--code") "")) + (access-type (get-access-type remargs))) + (cond + ((equal? path "") + (print "path not found!! Try \"sauthorize help\" for useage ") + (exit 1)) + ((equal? area "") + (print "--code not found!! Try \"sauthorize help\" for useage ") + (exit 1)) + ((equal? access-type #f) + (print "Access type not found!! Try \"sauthorize help\" for useage ") + (exit 1)) + ((and (not (equal? access-type "publish")) + (not (equal? access-type "retrieve"))) + (print "Access type can be eiter --retrieve or --publish !! Try \"sauthorize help\" for useage ") + (exit 1))) + + (sauthorize:open username path group area access-type))) + ((area-admin) + (let* ((usr (car args)) + (usr-obj (get-user usr)) + (user-id (car (get-user username)))) + + (if (is-admin username) + (begin + ; (print usr-obj) + (if (null? usr-obj) + (begin + (sauthorize:db-do (lambda (db) + ;(print (conc "INSERT INTO users (username,is_admin) VALUES ( '" usr "', 'read-admin' )")) + (sauthorize:db-qry db (conc "INSERT INTO users (username,is_admin) VALUES ( '" usr "', 'read-admin' )"))))) + (begin + ; (print (conc "update users set is_admin = 'no' where id = " (car usr-obj) )) + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "update users set is_admin = 'read-admin' where id = " (car usr-obj))))))) + (print "User " usr " is updated with area-admin access!")) + (print "Admin only function")) + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "INSERT INTO actions (cmd,user_id,area_id,action_type ) VALUES ('area-admin " usr " ', " user-id ",0, 'area-admin ')" )))))) + + ((register-log) + (if (< (length args) 4) + (print "Invalid arguments")) + ;(print args) + (let* ((cmd-line (car args)) + (user-id (cadr args)) + (area-id (caddr args)) + (user-obj (get-user username)) + (cmd (cadddr args))) + + (if (and (not (null? user-obj)) (equal? user-id (number->string(car user-obj)))) + (begin + (sauthorize:db-do (lambda (db) + (sauthorize:db-qry db (conc "INSERT INTO actions (cmd,user_id,area_id,action_type ) VALUES ('" cmd-line"', " user-id "," area-id ", '" cmd "')" ))))) + (print "You ar not authorised to run this cmd") + +))) + + + (else (print 0 "Unrecognised command " action)))) + +(define (main) + (let* ((args (argv)) + (prog (car args)) + (rema (cdr args)) + (username (current-user-name))) + ;; preserve the exe data in the config file + (cond + ;; one-word commands + ((eq? (length rema) 1) + (case (string->symbol (car rema)) + ((help -h -help --h --help) + (print sauthorize:help)) + ((list) + + (sauthorize:db-do (lambda (db) + (print "My Area accesses: ") + (query (for-each-row + (lambda (row) + (let* ((exp-date (car row))) + (if (is-access-valid exp-date) + (apply print (intersperse (cdr row) " | ")))))) + (sql db (conc "SELECT permissions.expiration, areas.basepath, areas.code, permissions.access_type FROM users, areas, permissions where permissions.user_id = users.id and permissions.area_id = areas.id and users.username = '" username "'")))))) + + ((log) + (sauthorize:db-do (lambda (db) + (print "Logs : ") + (query (for-each-row + (lambda (row) + + (apply print (intersperse row " | ")))) + (sql db "SELECT actions.cmd, users.username, actions.action_type, actions.datetime, areas.code FROM actions, users, areas where actions.user_id = users.id and actions.area_id = areas.id "))))) + (else + (print "ERROR: Unrecognised command. Try \"sauthorize help\"")))) + ;; multi-word commands + ((null? rema)(print sauthorize:help)) + ((>= (length rema) 2) + (apply sauthorize:process-action username (car rema)(cdr rema))) + (else (debug:print 0 "ERROR: Unrecognised command. Try \"sauthorize help\""))))) + +(main) + + + Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -47,11 +47,23 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) - (BB> "server:launch fired for run-id="run-id" transport-type="transport-type) + ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type) + + (let ((attempt-in-progress (server:start-attempted? *toppath*))) + (when attempt-in-progress + (debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=). Aborting server launch attempt in this process ("(current-process-id)")") + (exit))) + + (let ((dotserver-url (server:check-if-running *toppath*))) + (when dotserver-url + (debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=). Aborting server launch attempt in this process ("(current-process-id)")") + (exit) + )) + (case transport-type ((http)(http-transport:launch run-id)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) @@ -103,52 +115,60 @@ ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; -(define (server:run areapath) ;; areapath is ignored for now. +(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) + (attempt-in-progress (server:start-attempted? areapath)) + (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) - (logfile (conc *toppath* "/logs/server.log")) + (logfile (conc areapath "/logs/server.log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") (conc " -daemonize -log " logfile) "") " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there - (push-directory *toppath*) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) - + (push-directory areapath) + (cond + (attempt-in-progress + (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) + (dot-server-url + (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) + (else + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal? curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" logfile) + (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))))) + (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) - (set! *my-client-signature* sig) - *my-client-signature*))) + (set! *my-client-signature* sig) + *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) @@ -156,27 +176,36 @@ (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) -;; The generic run a server command. Dispatches the call to server 0 if run-id != 0 -;; -;; (define (server:try-running run-id) -;; (if (eq? run-id 0) -;; (server:run run-id) -;; (rmt:start-server run-id))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. +(define (server:attempting-start areapath) + (with-output-to-file + (conc areapath "/.starting-server") + (lambda () + (print (current-process-id) " on " (get-host-name))))) + +(define (server:complete-attempt areapath) + (delete-file* (conc areapath "/.starting-server"))) + (define (server:start-attempted? areapath) (let ((flagfile (conc areapath "/.starting-server"))) (handle-exceptions exn #f ;; if things go wrong pretend we can't see the file - (and (file-exists? flagfile) - (< (- (current-seconds) - (file-modification-time flagfile)) - 15))))) ;; exists and less than 15 seconds old + (cond + ((and (file-exists? flagfile) + (< (- (current-seconds) + (file-modification-time flagfile)) + 15)) ;; exists and less than 15 seconds old + (with-input-from-file flagfile (lambda () (read-line)))) + ((file-exists? flagfile) ;; it is stale. + (server:complete-attempt areapath) + #f) + (else #f))))) (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (handle-exceptions exn @@ -231,11 +260,13 @@ ((http)(server:ping-server dotserver)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver - #f)) + (begin + (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver + #f))) #f))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -323,12 +323,12 @@ (res '())) (sqlite3:for-each-row (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb - (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") - run-id) + (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;") + ) (vector header res))) (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) @@ -402,11 +402,11 @@ (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) - (server:kind-run run-id) + (server:kind-run *toppath*) (thread-sleep! (min delay-time 1)) (if (not (or (server:start-attempted? *toppath*) (server:read-dotserver *toppath*))) ;; no point in trying (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)) #f)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -141,11 +141,11 @@ ;; returns waitons waitors tconfigdat ;; (define (tests:get-waitons test-name all-tests-registry) - (let* ((config (tests:get-testconfig test-name all-tests-registry 'return-procs))) + (let* ((config (tests:get-testconfig test-name #f all-tests-registry 'return-procs))) (let ((instr (if config (config-lookup config "requirements" "waiton") (begin ;; No config means this is a non-existant test (debug:print-error 0 *default-log-port* "non-existent required test \"" test-name "\"") (exit 1)))) @@ -291,11 +291,11 @@ ;; Check for waiver eligibility ;; (define (tests:check-waiver-eligibility testdat prev-testdat) (let* ((test-registry (make-hash-table)) - (testconfig (tests:get-testconfig (db:test-get-testname testdat) test-registry #f)) + (testconfig (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f)) (test-rundir ;; (sdb:qry 'passstr (db:test-get-rundir testdat)) ;; ) (prev-rundir ;; (sdb:qry 'passstr (db:test-get-rundir prev-testdat)) ;; ) (waivers (if testconfig (configf:section-vars testconfig "waivers") '())) @@ -979,11 +979,11 @@ ;; if .testconfig exists in test directory read and return it ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" ;; else read the testconfig file ;; if have path to test directory save the config as .testconfig and return it ;; -(define (tests:get-testconfig test-name test-registry system-allowed #!key (force-create #f)) +(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)) (let* ((cache-path (tests:get-test-path-from-environment)) (cache-file (and cache-path (conc cache-path "/.testconfig"))) (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read (file-exists? cache-file))) @@ -991,14 +991,17 @@ cache-exists) (handle-exceptions exn #f ;; any issues, just give up with the cached version and re-read (configf:read-alist cache-file)) - #f))) + #f)) + (test-full-name (if (and item-path (not (string-null? item-path))) + (conc test-name "/" item-path) + test-name))) (if cached-dat cached-dat - (let ((dat (hash-table-ref/default *testconfigs* test-name #f))) + (let ((dat (hash-table-ref/default *testconfigs* test-full-name #f))) (if (and dat ;; have a locally cached version (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data? dat ;; no cached data available (let* ((treg (or test-registry @@ -1012,11 +1015,11 @@ environ-patt: (if system-allowed "pre-launch-env-vars" #f)) #f))) (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data - (if tcfg (hash-table-set! *testconfigs* test-name tcfg)) + (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) (if (and testexists cache-file (file-write-access? cache-path)) (let ((tpath (conc cache-path "/.testconfig"))) (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) @@ -1240,11 +1243,12 @@ (define (tests:get-full-data test-names test-records required-tests all-tests-registry) (if (not (null? test-names)) (let loop ((hed (car test-names)) (tal (cdr test-names))) ;; 'return-procs tells the config reader to prep running system but return a proc (debug:print-info 4 *default-log-port* "hed=" hed " at top of loop") - (let* ((config (tests:get-testconfig hed all-tests-registry 'return-procs)) + ;; don't know item-path at this time, let the testconfig get the top level testconfig + (let* ((config (tests:get-testconfig hed #f all-tests-registry 'return-procs)) (waitons (let ((instr (if config (config-lookup config "requirements" "waiton") (begin ;; No config means this is a non-existant test (debug:print-error 0 *default-log-port* "non-existent required test \"" hed "\", grep through your testconfigs to find and remove or create the test. Discarding and continuing.") "")))) Index: tests/fullrun/tests/all_toplevel/testconfig ================================================================== --- tests/fullrun/tests/all_toplevel/testconfig +++ tests/fullrun/tests/all_toplevel/testconfig @@ -1,8 +1,8 @@ [ezsteps] calcresults megatest -list-runs $MT_RUNNAME -target $MT_TARGET -check_triggers cat $MT_RUN_AREA_HOME/triggers_$MT_RUN_NAME.dat +check_triggers cat $MT_RUN_AREA_HOME/triggers_$MT_RUNNAME.dat [logpro] check_triggers ;; (expect:error in "LogFileBody" = 0 "No errors" #/error/i) ADDED thunk-utils.scm Index: thunk-utils.scm ================================================================== --- /dev/null +++ thunk-utils.scm @@ -0,0 +1,121 @@ +(use srfi-18) + + +;; wrap a proc with a mutex so that two threads may not call proc simultaneously. +;; will catch exceptions to ensure mutex is unlocked even if exception is thrown. +;; will generate a unique mutex for proc unless one is specified with canned-mutex: option +;; +;; example 1: (define thread-safe-+ (make-synchronized-proc +)) +;; example 2: (define thread-safe-plus +;; (make-synchronized-proc +;; (lambda (x y) +;; (+ x y)))) + +(define (make-synchronized-proc proc + #!key (canned-mutex #f)) + (let* ((guard-mutex (if canned-mutex canned-mutex (make-mutex))) + (guarded-proc ;; we are guarding the thunk against exceptions. We will record whether result of evaluation is an exception or a regular result. + (lambda args + (mutex-lock! guard-mutex) + (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision with a proc that returns a pair having the first element be our flag. gensym guarantees the symbol is unique. + (res + (condition-case + (apply proc args) ;; this is what we are guarding the execution of + [x () (cons EXCEPTION x)] + ))) + (mutex-unlock! guard-mutex) + (cond + ((and (pair? res) (eq? (car res) EXCEPTION)) + (raise (cdr res))) + (else + res)))))) + guarded-proc)) + + +;; retry an operation (depends on srfi-18) +;; ================== +;; idea here is to avoid spending time on coding retrying something. Trying to be generic here. +;; +;; Exception handling: +;; ------------------- +;; if evaluating the thunk results in exception, it will be retried. +;; on last try, if final-failure-returns-actual is true, the exception will be re-thrown to caller. +;; +;; look at options below #!key to see how to configure behavior +;; +;; + +(define (retry-thunk + the-thunk + #!key ;;;; options below + (accept-result? (lambda (x) x)) ;; retry if predicate applied to thunk's result is false + (retries 4) ;; how many tries + (failure-value #f) ;; return this on final failure, unless following option is enabled: + (final-failure-returns-actual #f) ;; on failure, on the last try, just return the result, not failure-value + + (retry-delay 0.1) ;; delay between tries + (back-off-factor 1) ;; multiply retry-delay by this factor on retry + (random-delay 0.1) ;; add a random portion of this value to wait + + (chatty #f) ;; print status as we go, for debugging. + ) + + (when chatty (print) (print "Entered retry-thunk") (print "-=-=-=-=-=-")) + (let* ((guarded-thunk ;; we are guarding the thunk against exceptions. We will record whether result of evaluation is an exception or a regular result. + (lambda () + (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision + (res + (condition-case + (the-thunk) ;; this is what we are guarding the execution of + [x () (cons EXCEPTION x)] + ))) + (cond + ((and (pair? res) (eq? (car res) EXCEPTION)) + (if chatty + (print " - the-thunk threw exception >"(cdr res)"<")) + (cons 'exception (cdr res))) + (else + (if chatty + (print " - the-thunk returned result >"res"<")) + (cons 'regular-result res))))))) + + (let loop ((guarded-res (guarded-thunk)) + (retries-left retries) + (fail-wait retry-delay)) + (if chatty (print " ==========")) + (let* ((wait-time (+ fail-wait (+ (* fail-wait back-off-factor) + (* random-delay + (/ (random 1024) 1024) )))) + (res-type (car guarded-res)) + (res-value (cdr guarded-res))) + (cond + ((and (eq? res-type 'regular-result) (accept-result? res-value)) + (if chatty (print " + return result that satisfied accept-result? >"res-value"<")) + res-value) + + ((> retries-left 0) + (if chatty (print " - sleep "wait-time)) + (thread-sleep! wait-time) + (if chatty (print " + retry ["retries-left" tries left]")) + (loop (guarded-thunk) + (sub1 retries-left) + wait-time)) + + ((eq? res-type 'regular-result) + (if final-failure-returns-actual + (begin + (if chatty (print " + last try failed- return the result >"res-value"<")) + res-value) + (begin + (if chatty (print " + last try failed- return canned failure value >"failure-value"<")) + failure-value))) + + (else ;; no retries left; result was not accepted and res-type can only be 'exception + (if final-failure-returns-actual + (begin + (if chatty (print " + last try failed with exception- re-throw it >"res-value"<")) + (abort res-value)); re-raise the exception. TODO: find a way for call-history to show as though from entry to this function + (begin + (if chatty (print " + last try failed with exception- return canned failure value >"failure-value"<")) + failure-value)))))))) +