Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -38,11 +38,12 @@ ARCHSTR=$(shell lsb_release -sr) # ARCHSTR=$(shell bash -c "echo \$$MACHTYPE") PNGFILES = $(shell cd docs/manual;ls *png) -all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut ndboard +#all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut ndboard +all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut mtest: $(OFILES) readline-fix.scm megatest.o csc $(CSCOPTS) $(OFILES) megatest.o -o mtest dboard : $(OFILES) $(GOFILES) dashboard.scm @@ -234,11 +235,12 @@ install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \ $(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \ $(PREFIX)/bin/nbfind $(PREFIX)/bin/loadrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \ - $(PREFIX)/share/db/mt-pg.sql $(PREFIX)/bin/.$(ARCHSTR)/ndboard $(PREFIX)/bin/tcmt + $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql +# $(PREFIX)/bin/.$(ARCHSTR)/ndboard # $(PREFIX)/bin/newdashboard $(PREFIX)/bin/.$(ARCHSTR) : mkdir -p $(PREFIX)/bin/.$(ARCHSTR) ADDED codescanlib.scm Index: codescanlib.scm ================================================================== --- /dev/null +++ codescanlib.scm @@ -0,0 +1,127 @@ + +;; gotta compile with csc, doesn't work with csi -s for whatever reason + +(use srfi-69) +(use matchable) +(use utils) +(use ports) +(use extras) +(use srfi-1) +(use posix) +(use srfi-12) + +;; turn a scheme file into a list of sexps; sexps of interest will be in the form of (define (procname args ...) body ...); returns '() if reading or parsing raises an exception +(define (load-scm-file scm-file) + ;;(print "load "scm-file) + (handle-exceptions + exn + '() + (with-input-from-string + (conc "(" + (with-input-from-file scm-file read-all) + ")" ) + read))) + +;; extract a list of procname, filename, args and body of procedures defined in filename, input from load-scm-file +;; -- be advised: +;; * 
this may be fooled by macros, since this code does not take them into account. +;; * this code only checks for the form (define (procname args ...) body ...) +;; so it excludes from reckoning +;; - generated functions, as in things like foo-set! from defstructs, +;; - define-inline, ( +;; - define procname (lambda .. +;; - etc... +(define (get-toplevel-procs+file+args+body filename) + (let* ((scm-tree (load-scm-file filename)) + (procs + (filter identity + (map + (match-lambda + [('define ('uses args ...) body ...) #f] ;; filter out (define (uses ... + [('define ('unit args ...) body ...) #f] ;; filter out (define (unit ... + [('define ('prefix args ...) body ...) #f] ;; filter out (define (prefix ... + [('define (defname args ...) body ...) ;; match (define (procname args ...) body ...) + (if (atom? defname) ;; filter out things we don't understand (procname is a list, what??) + (list defname filename args body) + #f)] + [else #f] ) scm-tree)))) + procs)) + + +;; given a sexp, return a flat list of atoms in that sexp +(define (get-atoms-in-body body) + (cond + ((null? body) '()) + ((atom? body) (list body)) + (else + (apply append (map get-atoms-in-body body))))) + +;; given a file, return a list of procname, file, list of atoms in said procname +(define (get-procs+file+atoms file) + (let* ((toplevel-proc-items (get-toplevel-procs+file+args+body file)) + (res + (map + (lambda (item) + (let* ((proc (car item)) + (file (cadr item)) + (args (caddr item)) + (body (cadddr item)) + (atoms (append (get-atoms-in-body args) (get-atoms-in-body body)))) + (list proc file atoms))) + toplevel-proc-items))) + res)) + +;; uniquify a list of atoms +(define (unique-atoms lst) + (let loop ((lst (flatten lst)) (res '())) + (if (null? 
lst) + (reverse res) + (let ((c (car lst))) + (loop (cdr lst) (if (member c res) res (cons c res))))))) + +;; given a list of procname, filename, list of procs called from procname, cross reference and reverse +;; returning alist mapping procname to procname that calls said procname +(define (get-callers-alist all-procs+file+calls) + (let* ((all-procs (map car all-procs+file+calls)) + (caller-ht (make-hash-table))) + ;; let's cross reference with a hash table + (for-each (lambda (proc) (hash-table-set! caller-ht proc '())) all-procs) + (for-each (lambda (item) + (let* ((proc (car item)) + (file (cadr item)) + (calls (caddr item))) + (for-each (lambda (callee) + (hash-table-set! caller-ht callee + (cons proc + (hash-table-ref caller-ht callee)))) + calls))) + all-procs+file+calls) + (map (lambda (x) + (let ((k (car x)) + (r (unique-atoms (cdr x)))) + (cons k r))) + (hash-table->alist caller-ht)))) + +;; create a handy cross-reference of callees to callers in the form of an alist. +(define (get-xref all-scm-files) + (let* ((all-procs+file+atoms + (apply append (map get-procs+file+atoms all-scm-files))) + (all-procs (map car all-procs+file+atoms)) + (all-procs+file+calls ; proc calls things in calls list + (map (lambda (item) + (let* ((proc (car item)) + (file (cadr item)) + (atoms (caddr item)) + (calls + (filter identity + (map + (lambda (x) + (if (and ;; (not (equal? x proc)) ;; uncomment to prevent listing self + (member x all-procs)) + x + #f)) + atoms)))) + (list proc file calls))) + all-procs+file+atoms)) + (callers (get-callers-alist all-procs+file+calls))) + callers)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -7,11 +7,11 @@ ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. 
;;====================================================================== -(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack +(use srfi-1 data-structures posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack matchable) (require-extension regex posix) (require-extension (srfi 18) extras tcp rpc) @@ -30,10 +30,27 @@ ;; (define (exit . code) ;; (if (null? code) ;; (old-exit) ;; (old-exit code))) + +;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . +;; arguments - thunk, message +(define (common:fail-safe thunk warning-message-on-exception) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception) + (debug:print-info 0 *default-log-port* + (string-substitute "\n?Error:" "nonfatal condition:" + (with-output-to-string + (lambda () + (print-error-message exn) )))) + (debug:print-info 0 *default-log-port* " -- continuing after nonfatal condition...") + #f) + (thunk))) + (define getenv get-environment-variable) (define (safe-setenv key val) (if (substring-index ":" key) ;; variables containing : are for internal use and cannot be environment variables. (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\"") (if (and (string? val) @@ -44,10 +61,20 @@ (setenv key val)) (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) + + +;; returns list of fd count, socket count +(define (get-file-descriptor-count #!key (pid (current-process-id ))) + (list + (length (glob (conc "/proc/" pid "/fd/*"))) + (length (filter identity (map socket? 
(glob (conc "/proc/" pid "/fd/*"))))) + ) +) + ;; GLOBALS ;; CONTEXTS (defstruct cxt @@ -160,11 +187,11 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (server:get-timeout)) ;; default from server:get-timeout + (server-timeout (server:expiration-timeout)) (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts @@ -1492,26 +1519,26 @@ (adjload (* maxload numcpus)) (loadjmp (- first next))) (cond ((and (> first adjload) (> count 0)) - (debug:print-info 0 *default-log-port* "server start delayed " waitdelay " seconds due to load " first " exceeding max of " adjload " (normalized load-limit: " maxload ") " (if msg msg "")) + (debug:print-info 0 *default-log-port* "server start delayed " waitdelay " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! waitdelay) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1))) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (thread-sleep! waitdelay) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1)))))) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. 
#f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f)) (numcpus (common:get-num-cpus hh))) - (common:wait-for-normalized-load maxload msg: msg remote-host: hh))) + (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) (let ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) @@ -1527,11 +1554,11 @@ proc) (with-input-from-file "/proc/cpuinfo" proc)))) ;; wait for normalized cpu load to drop below maxload ;; -(define (common:wait-for-normalized-load maxload #!key (msg #f)(remote-host #f)) +(define (common:wait-for-normalized-load maxload msg remote-host) (let ((num-cpus (common:get-num-cpus remote-host))) (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -469,12 +469,12 @@ (testconfig (begin ;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) (runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process (handle-exceptions exn ;; NOTE: I've no idea why this was written this way. Research, study and fix needed! - (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f) - (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t)))) + (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f allow-write-cache: #f) + (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t allow-write-cache: #f)))) (viewlog (lambda (x) (if (common:file-exists? 
logfile) ;(system (conc "firefox " logfile "&")) (dcommon:run-html-viewer logfile) (message-window (conc "File " logfile " not found"))))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -47,11 +47,11 @@ (include "vg_records.scm") (define help (conc "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version " - license GPL, Copyright (C) Matt Welland 2012-2016 + license GPL, Copyright (C) Matt Welland 2012-2017 Usage: dashboard [options] -h : this help -test run-id,test-id : control test identified by testid -skip-version-check : skip the version check @@ -349,11 +349,11 @@ (dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db")) ;; HACK ALERT: this is a hack, please fix. (dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat)))) - (dboard:tabdat-keys-set! tabdat (db:dispatch-query (db:get-access-mode) rmt:get-keys db:get-keys)) + (dboard:tabdat-keys-set! tabdat (rmt:get-keys)) (dboard:tabdat-dbkeys-set! tabdat (append (dboard:tabdat-keys tabdat) (list "runname"))) (dboard:tabdat-tot-runs-set! 
tabdat (rmt:get-num-runs "%")) ) ;; RADT => Matrix defstruct addition @@ -577,12 +577,11 @@ (db-modified (>= db-mod-time last-db-time)) (multi-get (> (dboard:rundat-run-data-offset run-dat) 0)) ;; multi-get in progress (tmptests (if (or do-not-use-db-file-timestamps (dboard:tabdat-filters-changed tabdat) db-modified) - (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run - run-id testnamepatt states statuses ;; run-id testpatt states statuses + (rmt:get-tests-for-run run-id testnamepatt states statuses ;; run-id testpatt states statuses (dboard:rundat-run-data-offset run-dat) ;; query offset num-to-get (dboard:tabdat-hide-not-hide tabdat) ;; no-in sort-by ;; sort-by sort-order ;; sort-order @@ -652,17 +651,15 @@ ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; (define (update-rundat tabdat runnamepatt numruns testnamepatt keypatts) (let* ((access-mode (dboard:tabdat-access-mode tabdat)) - (keys (db:dispatch-query access-mode rmt:get-keys db:get-keys)) + (keys (rmt:get-keys)) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) - (allruns (db:dispatch-query access-mode rmt:get-runs db:get-runs - runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) + (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) - (allruns-tree (db:dispatch-query access-mode rmt:get-runs-by-patt db:get-runs-by-patt - keys "%" #f #f #f #f last-runs-update));;'("id" "runname") + (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f last-runs-update)) ;;'("id" "runname") (header (db:get-header allruns)) (runs (db:get-rows allruns)) ;; RA => Filtered as per runpatt selected (runs-tree (db:get-rows allruns-tree)) ;; RA => Returns complete list of runs (start-time (current-seconds)) (runs-hash (let ((ht (make-hash-table))) @@ -735,15 +732,13 @@ ;; (define (dboard:update-rundat tabdat 
runnamepatt numruns testnamepatt keypatts) (let* ((access-mode (dboard:tabdat-access-mode tabdat)) (keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys))) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) - (allruns (db:dispatch-query access-mode rmt:get-runs db:get-runs - runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) + (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) - (allruns-tree (db:dispatch-query access-mode rmt:get-runs-by-patt db:get-runs-by-patt - keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") + (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") (header (db:get-header allruns)) (runs (db:get-rows allruns)) ;; RA => Filtered as per runpatt selected (runs-tree (db:get-rows allruns-tree)) ;; RA => Returns complete list of runs (start-time (current-seconds)) (runs-hash (let ((ht (make-hash-table))) @@ -802,11 +797,19 @@ (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin - (if (> elapsed-time 2)(print "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")) + (when (> elapsed-time 2) + (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") + (let* ((old-val (iup:attribute *tim* "TIME")) + (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) + (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) + (iup:attribute-set! *tim* "TIME" new-val)) + + + ) (dboard:tabdat-allruns-set! 
tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) @@ -1631,35 +1634,35 @@ (define (tree-path->run-id tabdat path) (if (not (null? path)) (hash-table-ref/default (dboard:tabdat-path-run-ids tabdat) path #f) #f)) -(define (dboard:get-tests-dat tabdat run-id last-update) - (let* ((access-mode (dboard:tabdat-access-mode tabdat)) - (tdat (if run-id (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run - run-id - (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") - (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)) ;; '() - (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)) ;; '() - #f #f ;; offset limit - (dboard:tabdat-hide-not-hide tabdat) ;; not-in - #f #f ;; sort-by sort-order - #f ;; get all? "id,testname,item_path,state,status,event_time,run_duration" ;; qryval - (if (dboard:tabdat-filters-changed tabdat) - 0 - last-update) - *dashboard-mode*) - '()))) ;; get 'em all - ;; (debug:print 0 *default-log-port* "dboard:get-tests-dat: got " (length tdat) " test records for run " run-id) - (sort tdat (lambda (a b) - (let* ((aval (vector-ref a 2)) - (bval (vector-ref b 2)) - (anum (string->number aval)) - (bnum (string->number bval))) - (if (and anum bnum) - (< anum bnum) - (string<= aval bval))))))) +;; (define (dboard:get-tests-dat tabdat run-id last-update) +;; (let* ((access-mode (dboard:tabdat-access-mode tabdat)) +;; (tdat (if run-id (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run +;; run-id +;; (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") +;; (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)) ;; '() +;; (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)) ;; '() +;; #f #f ;; offset limit +;; (dboard:tabdat-hide-not-hide tabdat) ;; not-in +;; #f #f ;; sort-by sort-order +;; #f ;; 
get all? "id,testname,item_path,state,status,event_time,run_duration" ;; qryval +;; (if (dboard:tabdat-filters-changed tabdat) +;; 0 +;; last-update) +;; *dashboard-mode*) +;; '()))) ;; get 'em all +;; ;; (debug:print 0 *default-log-port* "dboard:get-tests-dat: got " (length tdat) " test records for run " run-id) +;; (sort tdat (lambda (a b) +;; (let* ((aval (vector-ref a 2)) +;; (bval (vector-ref b 2)) +;; (anum (string->number aval)) +;; (bnum (string->number bval))) +;; (if (and anum bnum) +;; (< anum bnum) +;; (string<= aval bval))))))) (define (dashboard:safe-cadr-assoc name lst) (let ((res (assoc name lst))) (if (and res (> (length res) 1)) @@ -1675,12 +1678,11 @@ (time-a (db:get-value-by-header record-a runs-header "event_time")) (time-b (db:get-value-by-header record-b runs-header "event_time"))) (< time-a time-b))))) (changed #f) (last-runs-update (dboard:tabdat-last-runs-update tabdat)) - (runs-dat (db:dispatch-query access-mode rmt:get-runs-by-patt db:get-runs-by-patt - (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update))) + (runs-dat (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update))) (dboard:tabdat-last-runs-update-set! 
tabdat (- (current-seconds) 2)) (for-each (lambda (run-id) (let* ((run-record (hash-table-ref/default runs-hash run-id #f)) (key-vals (map (lambda (key)(db:get-value-by-header run-record runs-header key)) (dboard:tabdat-keys tabdat))) @@ -1747,12 +1749,11 @@ (define (dashboard:get-runs-hash tabdat) (let* ((access-mode (dboard:tabdat-access-mode tabdat)) (last-runs-update 0);;(dboard:tabdat-last-runs-update tabdat)) - (runs-dat (db:dispatch-query access-mode rmt:get-runs-by-patt db:get-runs-by-patt - (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update)) + (runs-dat (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update)) (runs-header (vector-ref runs-dat 0)) ;; 0 is header, 1 is list of records (runs (vector-ref runs-dat 1)) (run-id (dboard:tabdat-curr-run-id tabdat)) (runs-hash (let ((ht (make-hash-table))) (for-each (lambda (run) @@ -1764,13 +1765,11 @@ (define (dashboard:runs-summary-updater commondat tabdat tb cell-lookup run-matrix) ;; (if (dashboard:database-changed? commondat tabdat context-key: 'runs-summary-rundat) (dashboard:do-update-rundat tabdat) ;; ) (dboard:runs-summary-control-panel-updater tabdat) (let* ((last-runs-update (dboard:tabdat-last-runs-update tabdat)) - (runs-dat (db:dispatch-query (dboard:tabdat-access-mode tabdat) - rmt:get-runs-by-patt db:get-runs-by-patt - (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update)) + (runs-dat (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update)) (runs-header (vector-ref runs-dat 0)) ;; 0 is header, 1 is list of records (runs (vector-ref runs-dat 1)) (run-id (dboard:tabdat-curr-run-id tabdat)) (runs-hash (dashboard:get-runs-hash tabdat)) ;; (runs-hash (let ((ht (make-hash-table))) @@ -2219,10 +2218,20 @@ (iup:hbox (iup:button "Quit" #:action (lambda (obj) (exit)) #:expand "NO" #:size "40x15") (iup:button "Refresh" #:action (lambda (obj) + (dboard:tabdat-last-data-update-set! tabdat 0) + (dboard:tabdat-last-runs-update-set! 
tabdat 0) + (dboard:tabdat-run-update-times-set! tabdat (make-hash-table)) + (dboard:tabdat-last-test-dat-set! tabdat (make-hash-table)) + (dboard:tabdat-allruns-set! tabdat '()) + (dboard:tabdat-allruns-by-id-set! tabdat (make-hash-table)) + (dboard:tabdat-done-runs-set! tabdat '()) + (dboard:tabdat-not-done-runs-set! tabdat '()) + (dboard:tabdat-view-changed-set! tabdat #t) + (dboard:commondat-please-update-set! commondat #t) (mark-for-update tabdat)) #:expand "NO" #:size "40x15") (iup:button "Collapse" #:action (lambda (obj) (debug:catch-and-dump (lambda () @@ -2437,11 +2446,11 @@ ;; (rmt:test-set-state-status-by-id run-id test-id "KILLREQ" #f #f) (common:run-a-command (conc "megatest -set-state-status KILLREQ,n/a -target " target " -runname " runname " -testpatt " item-test-path - " -state RUNNING,REMOTEHOSTSTART,LAUNCHED")))) + " -state RUNNING,REMOTEHOSTSTART,LAUNCHED,NOT_STARTED")))) (iup:menu-item "Run" (iup:menu @@ -2479,11 +2488,11 @@ (lambda (obj) (common:run-a-command (conc "megatest -set-state-status KILLREQ,n/a -target " target " -runname " runname " -testpatt % " - " -state RUNNING,REMOTEHOSTSTART,LAUNCHED")))) + " -state RUNNING,REMOTEHOSTSTART,LAUNCHED,NOT_STARTED")))) (iup:menu-item "Delete Run Data" #:action (lambda (obj) (common:run-a-command @@ -2976,13 +2985,11 @@ ;; run times tab data updater ;; (define (dashboard:run-times-tab-run-data-updater commondat tabdat tab-num) (let* ((access-mode (dboard:tabdat-access-mode tabdat)) (last-runs-update (dboard:tabdat-last-runs-update tabdat)) - (runs-dat (db:dispatch-query access-mode - rmt:get-runs-by-patt db:get-runs-by-patt - (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update)) + (runs-dat (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f last-runs-update)) (runs-header (vector-ref runs-dat 0)) ;; 0 is header, 1 is list of records (runs-hash (let ((ht (make-hash-table))) (for-each (lambda (run) (hash-table-set! 
ht (db:get-value-by-header run runs-header "id") run)) (vector-ref runs-dat 1)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -479,20 +479,22 @@ '("run_duration" #f) '("comment" #f) '("event_time" #f) '("fail_count" #f) '("pass_count" #f) - '("archived" #f)) + '("archived" #f) + '("last_update" #f)) (list "test_steps" '("id" #f) '("test_id" #f) '("stepname" #f) '("state" #f) '("status" #f) '("event_time" #f) '("comment" #f) - '("logfile" #f)) + '("logfile" #f) + '("last_update" #f)) (list "test_data" '("id" #f) '("test_id" #f) '("category" #f) '("variable" #f) @@ -500,11 +502,12 @@ '("expected" #f) '("tol" #f) '("units" #f) '("comment" #f) '("status" #f) - '("type" #f)))) + '("type" #f) + '("last_update" #f)))) ;; needs db to get keys, this is for syncing all tables ;; (define (db:sync-main-list dbstruct) (let ((keys (db:get-keys dbstruct))) @@ -516,11 +519,11 @@ (list "metadat" '("var" #f) '("val" #f)) (append (list "runs" '("id" #f)) (map (lambda (k)(list k #f)) (append keys - (list "runname" "state" "status" "owner" "event_time" "comment" "fail_count" "pass_count" "contour")))) + (list "runname" "state" "status" "owner" "event_time" "comment" "fail_count" "pass_count" "contour" "last_update")))) (list "test_meta" '("id" #f) '("testname" #f) '("owner" #f) '("description" #f) @@ -1042,26 +1045,40 @@ data-synced)) (define (db:tmp->megatest.db-sync dbstruct last-update) (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) - (refndb (dbr:dbstruct-refndb dbstruct))) - (db:sync-tables (db:sync-all-tables-list dbstruct) last-update tmpdb refndb mtdb))) + (refndb (dbr:dbstruct-refndb dbstruct)) + (res (db:sync-tables (db:sync-all-tables-list dbstruct) last-update tmpdb refndb mtdb))) + (stack-push! 
(dbr:dbstruct-dbstack dbstruct) tmpdb) + res)) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags ;; if #t use timestamps : or 'timestamps +;; +;; NB// no-sync-db is the db handle, not a flag! +;; (define (db:sync-to-megatest.db dbstruct #!key (no-sync-db #f)) (let* ((start-time (current-seconds)) - (last-update (if no-sync-db - (db:no-sync-get/default no-sync-db "LAST_UPDATE" 0) - 0)) ;; (or (db:get-var dbstruct "LAST_UPDATE") 0)) + (last-full-update (if no-sync-db + (db:no-sync-get/default no-sync-db "LAST_FULL_UPDATE" 0) + 0)) + (full-sync-needed (> (- start-time last-full-update) 3600)) ;; every hour do a full sync + (last-update (if full-sync-needed + 0 + (if no-sync-db + (db:no-sync-get/default no-sync-db "LAST_UPDATE" 0) + 0))) ;; (or (db:get-var dbstruct "LAST_UPDATE") 0)) (sync-needed (> (- start-time last-update) 6)) - (res (if sync-needed ;; don't sync if a sync already occurred in the past 6 seconds + (res (if (or sync-needed ;; don't sync if a sync already occurred in the past 6 seconds + full-sync-needed) (begin (if no-sync-db - (db:no-sync-set no-sync-db "LAST_UPDATE" start-time)) + (begin + (if full-sync-needed (db:no-sync-set no-sync-db "LAST_FULL_UPDATE" start-time)) + (db:no-sync-set no-sync-db "LAST_UPDATE" start-time))) (db:tmp->megatest.db-sync dbstruct last-update)) 0)) (sync-time (- (current-seconds) start-time))) (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) (if (common:low-noise-print 30 "sync new to old") @@ -1362,10 +1379,11 @@ "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND last_df > ?;") dneeded)) + (stack-push! 
(dbr:dbstruct-dbstack dbstruct) dbdat) blocks)) ;; returns id of the record, register a disk allocated to archiving and record it's last known ;; available space ;; @@ -1418,11 +1436,13 @@ res) (begin (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du) VALUES (?,?,?);" bdisk-id archive-path (or du 0)) - (db:archive-register-block-name dbstruct bdisk-id archive-path du: du))))) + (db:archive-register-block-name dbstruct bdisk-id archive-path du: du))) + (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat) + res)) ;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id ;; (define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id) @@ -1849,14 +1869,18 @@ ;;====================================================================== (define (db:open-no-sync-db) (let* ((dbpath (db:dbfile-path)) (dbname (conc dbpath "/no-sync.db")) + (db-exists (common:file-exists? dbname)) (db (sqlite3:open-database dbname))) (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) (sqlite3:execute db "PRAGMA synchronous = 0;") - (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") + (if (not db-exists) ;; one-time setup only: the table and journal_mode=WAL persist in the db file; "PRAGMA synchronous" is per-connection so it stays outside this guard and runs on every open + (begin + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") + (sqlite3:execute db "PRAGMA journal_mode=WAL;"))) db)) ;; if we are not a server create a db handle. this is not finalized ;; so watch for problems. I'm still not clear if it is needed to manually ;; finalize sqlite3 dbs with the sqlite3 egg. @@ -4262,10 +4286,11 @@ (begin (debug:print 0 *default-log-port* "WARNING: path given, " outputfile " is relative, prefixing with current directory") (conc (current-directory) "/" outputfile))) results) ;; brutal clean up + (stack-push! 
(dbr:dbstruct-dbstack dbstruct) dbdat) (system "rm -rf tempdir"))) ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%") Index: docs/megatest-state-status.dot ================================================================== --- docs/megatest-state-status.dot +++ docs/megatest-state-status.dot @@ -31,11 +31,11 @@ label="{RUNNING|{n/a| PASS | FAIL}}"; ] "COMPLETED" [ shape="record"; - label = "{COMPLETED|{PASS | FAIL | CHECK| SKIP}}"; + label = "{COMPLETED|{PASS | SKIP | WAIVED | FAIL | CHECK| ABORT}}"; ] "RUNNING" -> "COMPLETED"; "RUNNING" -> "INCOMPLETE" [label="test dead for > 24hrs"]; DELETED emergency-patch-1.scm Index: emergency-patch-1.scm ================================================================== --- emergency-patch-1.scm +++ /dev/null @@ -1,203 +0,0 @@ - - -;; These are called by the server on recipt of /api calls -;; - keep it simple, only return the actual result of the call, i.e. no meta info here -;; -;; - returns #( flag result ) -;; -(define (api:execute-requests dbstruct dat) - (handle-exceptions - exn - (let ((call-chain (get-call-chain))) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens - (cond - ((not (vector? dat)) ;; it is an error to not receive a vector - (vector #f (vector #f "remote must be called with a vector"))) - ((> *api-process-request-count* 20) ;; 20) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") - (set! *server-overloaded* #t) - (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! - (else - (let* ((cmd-in (vector-ref dat 0)) - (cmd (if (symbol? 
cmd-in) - cmd-in - (string->symbol cmd-in))) - (params (vector-ref dat 1)) - (start-t (current-milliseconds)) - (readonly-mode (dbr:dbstruct-read-only dbstruct)) - (readonly-command (member cmd api:read-only-queries)) - (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) - (res - (if writecmd-in-readonly-mode - (conc "attempt to run write command "cmd" on a read-only database") - (case cmd - ;;=============================================== - ;; READ/WRITE QUERIES - ;;=============================================== - - ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl - - ;; SERVERS - ((start-server) (apply server:kind-run params)) - ((kill-server) (set! *server-run* #f)) - - ;; TESTS - - ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) - ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items. 
- ((test-set-state-status-by-id) - - ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) - (db:set-state-status-and-roll-up-items - dbstruct - (list-ref params 0) ; run-id - (list-ref params 1) ; test-name - #f ; item-path - (list-ref params 2) ; state - (list-ref params 3) ; status - (list-ref params 4) ; comment - )) - - ((delete-test-records) (apply db:delete-test-records dbstruct params)) - ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) - ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) - ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) - ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) - ((top-test-set-per-pf-counts) (apply db:top-test-set-per-pf-counts dbstruct params)) - ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) - - ;; RUNS - ((register-run) (apply db:register-run dbstruct params)) - ((set-tests-state-status) (apply db:set-tests-state-status dbstruct params)) - ((delete-run) (apply db:delete-run dbstruct params)) - ((lock/unlock-run) (apply db:lock/unlock-run dbstruct params)) - ((update-run-event_time) (apply db:update-run-event_time dbstruct params)) - ((update-run-stats) (apply db:update-run-stats dbstruct params)) - ((set-var) (apply db:set-var dbstruct params)) - ((del-var) (apply db:del-var dbstruct params)) - - ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! 
dbstruct params)) - - ;; TEST DATA - ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) - ((csv->test-data) (apply db:csv->test-data dbstruct params)) - - ;; MISC - ((sync-inmem->db) (let ((run-id (car params))) - (db:sync-touched dbstruct run-id force-sync: #t))) - ((mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) - - ;; TESTMETA - ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) - ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) - ((get-tests-tags) (db:get-tests-tags dbstruct)) - - ;; TASKS - ((tasks-add) (apply tasks:add dbstruct params)) - ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) - ((tasks-get-last) (apply tasks:get-last dbstruct params)) - - ;; NO SYNC DB - ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) - ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) - ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) - - ;; ARCHIVES - ;; ((archive-get-allocations) - ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) - ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) - ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) - - ;;====================================================================== - ;; READ ONLY QUERIES - ;;====================================================================== - - ;; KEYS - ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) - ((get-keys) (db:get-keys dbstruct)) - ((get-key-vals) (apply db:get-key-vals dbstruct params)) - ((get-target) (apply db:get-target dbstruct params)) - ((get-targets) (db:get-targets dbstruct)) - - ;; ARCHIVES - ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) - - ;; TESTS - ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct 
params)) - ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) - ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) - ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) - ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) - ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) - ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) - ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) - ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) - ((test-get-logfile-info) (apply db:test-get-logfile-info dbstruct params)) - ((test-get-records-for-index-file) (apply db:test-get-records-for-index-file dbstruct params)) - ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) - ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) - ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) - ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) - ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) - ((synchash-get) (apply synchash:server-get dbstruct params)) - ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) - - ;; RUNS - ((get-run-info) (apply db:get-run-info dbstruct params)) - ((get-run-status) (apply db:get-run-status dbstruct params)) - ((set-run-status) (apply db:set-run-status dbstruct params)) - ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) - ((get-test-id) (apply db:get-test-id dbstruct params)) - ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) - ((get-runs) (apply db:get-runs dbstruct params)) - 
((get-num-runs) (apply db:get-num-runs dbstruct params)) - ((get-all-run-ids) (db:get-all-run-ids dbstruct)) - ((get-prev-run-ids) (apply db:get-prev-run-ids dbstruct params)) - ((get-run-ids-matching-target) (apply db:get-run-ids-matching-target dbstruct params)) - ((get-runs-by-patt) (apply db:get-runs-by-patt dbstruct params)) - ((get-run-name-from-id) (apply db:get-run-name-from-id dbstruct params)) - ((get-main-run-stats) (apply db:get-main-run-stats dbstruct params)) - ((get-var) (apply db:get-var dbstruct params)) - ((get-run-stats) (apply db:get-run-stats dbstruct params)) - - ;; STEPS - ((get-steps-data) (apply db:get-steps-data dbstruct params)) - ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) - - ;; TEST DATA - ((read-test-data) (apply db:read-test-data dbstruct params)) - ((read-test-data*) (apply db:read-test-data* dbstruct params)) - - ;; MISC - ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) - ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) - ((login) (apply db:login dbstruct params)) - ((general-call) (let ((stmtname (car params)) - (run-id (cadr params)) - (realparams (cddr params))) - (db:general-call dbstruct stmtname realparams))) - ((sdb-qry) (apply sdb:qry params)) - ((ping) (current-process-id)) - ((get-changed-record-ids) (apply db:get-changed-record-ids dbstruct params)) - - ;; TESTMETA - ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) - - ;; TASKS - ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) - (else - (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) - (conc "ERROR: BAD api call " cmd)))))) - - ;; save all stats - (let ((delta-t (- (current-milliseconds) - start-t))) - (hash-table-set! 
*db-api-call-time* cmd - (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) - (if writecmd-in-readonly-mode - (vector #f res) - (vector #t res))))))) DELETED emergency-patch-2.scm Index: emergency-patch-2.scm ================================================================== --- emergency-patch-2.scm +++ /dev/null @@ -1,311 +0,0 @@ -(include "common_records.scm") -(include "key_records.scm") -(include "db_records.scm") -(include "run_records.scm") -(include "test_records.scm") - -(define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f)) - (let* ((loadavg (common:get-cpu-load remote-host)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjload (* maxload numcpus)) - (loadjmp (- first next))) - (cond - ((and (> first adjload) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding max of " adjload " " (if msg msg "")) - (thread-sleep! waitdelay) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1))) - ((and (> loadjmp numcpus) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) - (thread-sleep! waitdelay) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1)))))) - -(define (common:wait-for-homehost-load maxload msg) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. 
- #f - (common:get-homehost))) - (hh (if hh-dat (car hh-dat) #f)) - (numcpus (common:get-num-cpus hh))) - (common:wait-for-normalized-load maxload msg: msg remote-host: hh))) - -;; wait for normalized cpu load to drop below maxload -;; -(define (common:wait-for-normalized-load maxload #!key (msg #f)(remote-host #f)) - (let ((num-cpus (common:get-num-cpus remote-host))) - (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) - -;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) -(define (runs:process-expanded-tests runsdat testdat) - ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). - (let* ((hed (runs:testdat-hed testdat)) - (tal (runs:testdat-tal testdat)) - (reg (runs:testdat-reg testdat)) - (reruns (runs:testdat-reruns testdat)) - (test-name (runs:testdat-test-name testdat)) - (item-path (runs:testdat-item-path testdat)) - (jobgroup (runs:testdat-jobgroup testdat)) - (waitons (runs:testdat-waitons testdat)) - (item-path (runs:testdat-item-path testdat)) - (testmode (runs:testdat-testmode testdat)) - (newtal (runs:testdat-newtal testdat)) - (itemmaps (runs:testdat-itemmaps testdat)) - (test-record (runs:testdat-test-record testdat)) - (prereqs-not-met (runs:testdat-prereqs-not-met testdat)) - - (reglen (runs:dat-reglen runsdat)) - (regfull (runs:dat-regfull runsdat)) - (runname (runs:dat-runname runsdat)) - (max-concurrent-jobs (runs:dat-max-concurrent-jobs runsdat)) - (run-id (runs:dat-run-id runsdat)) - (test-patts (runs:dat-test-patts runsdat)) - (required-tests (runs:dat-required-tests runsdat)) - (test-registry (runs:dat-test-registry runsdat)) - (registry-mutex (runs:dat-registry-mutex runsdat)) - (flags (runs:dat-flags runsdat)) - (keyvals (runs:dat-keyvals runsdat)) - (run-info (runs:dat-run-info 
runsdat)) - (all-tests-registry (runs:dat-all-tests-registry runsdat)) - (run-limits-info (runs:dat-can-run-more-tests runsdat)) - ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - (have-resources (car run-limits-info)) - (num-running (list-ref run-limits-info 1)) - (num-running-in-jobgroup(list-ref run-limits-info 2)) - (max-concurrent-jobs (list-ref run-limits-info 3)) - (job-group-limit (list-ref run-limits-info 4)) - ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) - ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) - (fails (if (list? prereqs-not-met) - (runs:calc-fails prereqs-not-met) - (begin - (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) - '()))) - (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! - (not (equal? x hed))) - (runs:calc-not-completed prereqs-not-met))) - (loop-list (list hed tal reg reruns)) - ;; configure the load runner - (numcpus (common:get-num-cpus #f)) - (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable - (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "1.2"))) ;; use a non-number string to disable - (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) - (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" - (string-intersperse - (map (lambda (t) - (if (vector? t) - (conc (db:test-get-state t) "/" (db:test-get-status t)) - (conc " WARNING: t is not a vector=" t ))) - prereqs-not-met) - ", ") ") fails: " fails - "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) - - - - (if (and (not (null? 
prereqs-not-met)) - (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) - (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) - - ;; Don't know at this time if the test have been launched at some time in the past - ;; i.e. is this a re-launch? - (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info) - - (cond - - ;; Check item path against item-patts, - ;; - ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run - ;; else the run is stuck, temporarily or permanently - ;; but should check if it is due to lack of resources vs. prerequisites - (debug:print-info 1 *default-log-port* "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) - (if (or (not (null? tal))(not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) - #f)) - - ;; Register tests - ;; - ((not (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) - (debug:print-info 4 *default-log-port* "Pre-registering test " test-name "/" item-path " to create placeholder" ) - ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs - (let register-loop ((numtries 15)) - (rmt:register-test run-id test-name item-path) - (if (rmt:get-test-id run-id test-name item-path) - (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'done) - (if (> numtries 0) - (begin - (thread-sleep! 0.5) - (register-loop (- numtries 1))) - (debug:print-error 0 *default-log-port* "failed to register test " (db:test-make-full-name test-name item-path))))) - (if (not (eq? 
(hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f) 'done)) - (begin - (rmt:register-test run-id test-name "") - (if (rmt:get-test-id run-id test-name "") - (hash-table-set! test-registry (db:test-make-full-name test-name "") 'done)))) - (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) - (if (and (null? tal)(null? reg)) - (list hed tal (append reg (list hed)) reruns) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - ;; NB// Here we are building reg as we register tests - ;; if regfull we must pop the front item off reg - (if regfull - (append (cdr reg) (list hed)) - (append reg (list hed))) - reruns))) - - ;; At this point hed test registration must be completed. - ;; - ((eq? (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f) - 'start) - (debug:print-info 0 *default-log-port* "Waiting on test registration(s): " - (string-intersperse - (filter (lambda (x) - (eq? (hash-table-ref/default test-registry x #f) 'start)) - (hash-table-keys test-registry)) - ", ")) - (thread-sleep! 0.051) - (list hed tal reg reruns)) - - ;; If no resources are available just kill time and loop again - ;; - ((not have-resources) ;; simply try again after waiting a second - (if (runs:lownoise "no resources" 60) - (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) - ;; Have gone back and forth on this but db starvation is an issue. - ;; wait one second before looking again to run jobs. - (thread-sleep! 1) - ;; could have done hed tal here but doing car/cdr of newtal to rotate tests - (list (car newtal)(cdr newtal) reg reruns)) - - ;; This is the final stage, everything is in place so launch the test - ;; - ((and have-resources - (or (null? prereqs-not-met) - (and (member 'toplevel testmode) ;; 'toplevel) - (null? non-completed) - (not (member 'exclusive testmode))))) - ;; (hash-table-delete! 
*max-tries-hash* (db:test-make-full-name test-name item-path)) - ;; we are going to reset all the counters for test retries by setting a new hash table - ;; this means they will increment only when nothing can be run - (set! *max-tries-hash* (make-hash-table)) - ;; well, first lets see if cpu load throttling is enabled. If so wait around until the - ;; average cpu load is under the threshold before continuing - (if maxload ;; only gate if maxload is specified - (common:wait-for-cpuload maxload numcpus waitdelay)) - (if maxhomehostload - (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) - - (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) - (runs:incremental-print-results run-id) - (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) - (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! *global-delta*) - (if (or (not (null? tal))(not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) - #f)) - - ;; must be we have unmet prerequisites - ;; - (else - (debug:print 4 *default-log-port* "FAILS: " fails) - ;; If one or more of the prereqs-not-met are FAIL then we can issue - ;; a message and drop hed from the items to be processed. - ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) - (if (and (not (null? prereqs-not-met)) - (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) - (debug:print-info 1 *default-log-port* "waiting on tests; " (string-intersperse - (runs:mixed-list-testname-and-testrec->list-of-strings - prereqs-not-met) ", "))) - (if (or (null? fails) - (member 'toplevel testmode)) - (begin - ;; couldn't run, take a breather - (if (runs:lownoise "Waiting for more work to do..." 
60) - (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) - (thread-sleep! 1) - (list (car newtal)(cdr newtal) reg reruns)) - ;; the waiton is FAIL so no point in trying to run hed ever again - (if (or (not (null? reg))(not (null? tal))) - (if (vector? hed) - (begin - (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path - " from the launch list as it has prerequistes that are FAIL") - (let ((test-id (rmt:get-test-id run-id hed ""))) - (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) - (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! *global-delta*) - ;; This next is for the items - (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) - (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? - )) - (let ((nth-try (hash-table-ref/default test-registry hed 0))) - (cond - ((member "RUNNING" (map db:test-get-state prereqs-not-met)) - (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) - (thread-sleep! 4) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)) - ((or (not nth-try) - (and (number? nth-try) - (< nth-try 10))) - (hash-table-set! test-registry hed (if (number? 
nth-try) - (+ nth-try 1) - 0)) - (if (runs:lownoise (conc "not removing test " hed) 60) - (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) - ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") - (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) - ;; (list hed tal reg reruns) - ;; (list (car newtal)(cdr newtal) reg reruns) - ;; (hash-table-set! test-registry hed 'removed) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)) - ((symbol? nth-try) - (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW - (if (null? tal) - #f ;; yes, really - (list (car tal)(cdr tal) reg reruns)) - (begin - (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry.")) - (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) - (hash-table-set! test-registry hed 0) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)))) - (else - (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now.")) - ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) - (hash-table-set! 
test-registry hed 'removed) - (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f) - ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug. - (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL - (list (if (null? tal)(car newtal)(car tal)) - tal - reg - reruns))))) - ;; can't drop this - maybe running? Just keep trying - (let ((runable-tests (runs:runable-tests prereqs-not-met))) - (if (null? runable-tests) - #f ;; I think we are truly done here - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns))))))))) DELETED emergency-patch-3.scm Index: emergency-patch-3.scm ================================================================== --- emergency-patch-3.scm +++ /dev/null @@ -1,81 +0,0 @@ - ;; To build patch: - ;;;;;;;;;;;;;;;;;;;;;;;;; - ;; ldd /p/foundry/env/pkgs/megatest/1.64/19/bin/.11/mtest - ;; linux-vdso.so.1 => (0x00002aaaaaaab000) - ;; libchicken.so.7 => /p/foundry/env/pkgs/megatest/1.64/chicken-4.10.0//lib/libchicken.so.7 (0x00002aaaaaaad000) - ;; libm.so.6 => /lib64/libm.so.6 (0x00002aaaab0a6000) - ;; libdl.so.2 => /lib64/libdl.so.2 (0x00002aaaab31f000) - ;; libc.so.6 => /lib64/libc.so.6 (0x00002aaaab523000) - ;; /lib64/ld-linux-x86-64.so.2 (0x0000555555554000) - ;; - ;; /p/foundry/env/pkgs/megatest/1.64/chicken-4.10.0/bin/csc -s emergency-patch-3.scm - ;; - - - ;; to test patch: - ;;;;;;;;;;;;;;;;;;;;;;;;; - ;; in .megatestrc, add: - ;; (if (and (> megatest-version 1.64) - ;; (< megatest-version 1.6421)) - ;; (begin - ;; (load "/p/foundry/env/pkgs/megatest/1.64/19/share/epatch-1.so") - ;; (load "/p/foundry/env/pkgs/megatest/1.64/19/share/epatch-2.so"))) - ;; - - - ;; to productize patch: - ;;;;;;;;;;;;;;;;;;;;;;;;; - ;; -(use directory-utils regex) - -(include "common_records.scm") -(include "key_records.scm") 
-(include "db_records.scm") -(include "run_records.scm") -(include "test_records.scm") - -;; Given a run id start a server process ### NOTE ### > file 2>&1 -;; if the run-id is zero and the target-host is set -;; try running on that host -;; incidental: rotate logs in logs/ dir. -;; -(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((curr-host (get-host-name)) - ;; (attempt-in-progress (server:start-attempted? areapath)) - ;; (dot-server-url (server:check-if-running areapath)) - (curr-ip (server:get-best-guess-address curr-host)) - (curr-pid (current-process-id)) - (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) - (testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") - ;; " -log " logfile - " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) - (load-limit (configf:lookup-number *configdat* "server" "load-limit" default: 0.9))) - ;; we want the remote server to start in *toppath* so push there - (push-directory areapath) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? 
curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load load-limit " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) ADDED emergency-patches/emergency-patch-1.scm Index: emergency-patches/emergency-patch-1.scm ================================================================== --- /dev/null +++ emergency-patches/emergency-patch-1.scm @@ -0,0 +1,203 @@ + + +;; These are called by the server on recipt of /api calls +;; - keep it simple, only return the actual result of the call, i.e. no meta info here +;; +;; - returns #( flag result ) +;; +(define (api:execute-requests dbstruct dat) + (handle-exceptions + exn + (let ((call-chain (get-call-chain))) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) + (print-call-chain (current-error-port)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens + (cond + ((not (vector? dat)) ;; it is an error to not receive a vector + (vector #f (vector #f "remote must be called with a vector"))) + ((> *api-process-request-count* 20) ;; 20) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") + (set! *server-overloaded* #t) + (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! 
+ (else + (let* ((cmd-in (vector-ref dat 0)) + (cmd (if (symbol? cmd-in) + cmd-in + (string->symbol cmd-in))) + (params (vector-ref dat 1)) + (start-t (current-milliseconds)) + (readonly-mode (dbr:dbstruct-read-only dbstruct)) + (readonly-command (member cmd api:read-only-queries)) + (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) + (res + (if writecmd-in-readonly-mode + (conc "attempt to run write command "cmd" on a read-only database") + (case cmd + ;;=============================================== + ;; READ/WRITE QUERIES + ;;=============================================== + + ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl + + ;; SERVERS + ((start-server) (apply server:kind-run params)) + ((kill-server) (set! *server-run* #f)) + + ;; TESTS + + ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) + ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items. 
+ ((test-set-state-status-by-id) + + ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) + (db:set-state-status-and-roll-up-items + dbstruct + (list-ref params 0) ; run-id + (list-ref params 1) ; test-name + #f ; item-path + (list-ref params 2) ; state + (list-ref params 3) ; status + (list-ref params 4) ; comment + )) + + ((delete-test-records) (apply db:delete-test-records dbstruct params)) + ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) + ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) + ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) + ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) + ((top-test-set-per-pf-counts) (apply db:top-test-set-per-pf-counts dbstruct params)) + ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) + + ;; RUNS + ((register-run) (apply db:register-run dbstruct params)) + ((set-tests-state-status) (apply db:set-tests-state-status dbstruct params)) + ((delete-run) (apply db:delete-run dbstruct params)) + ((lock/unlock-run) (apply db:lock/unlock-run dbstruct params)) + ((update-run-event_time) (apply db:update-run-event_time dbstruct params)) + ((update-run-stats) (apply db:update-run-stats dbstruct params)) + ((set-var) (apply db:set-var dbstruct params)) + ((del-var) (apply db:del-var dbstruct params)) + + ;; STEPS + ((teststep-set-status!) (apply db:teststep-set-status! 
dbstruct params)) + + ;; TEST DATA + ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) + ((csv->test-data) (apply db:csv->test-data dbstruct params)) + + ;; MISC + ((sync-inmem->db) (let ((run-id (car params))) + (db:sync-touched dbstruct run-id force-sync: #t))) + ((mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) + + ;; TESTMETA + ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) + ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) + ((get-tests-tags) (db:get-tests-tags dbstruct)) + + ;; TASKS + ((tasks-add) (apply tasks:add dbstruct params)) + ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) + ((tasks-get-last) (apply tasks:get-last dbstruct params)) + + ;; NO SYNC DB + ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) + ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) + ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) + + ;; ARCHIVES + ;; ((archive-get-allocations) + ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) + ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) + ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) + + ;;====================================================================== + ;; READ ONLY QUERIES + ;;====================================================================== + + ;; KEYS + ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) + ((get-keys) (db:get-keys dbstruct)) + ((get-key-vals) (apply db:get-key-vals dbstruct params)) + ((get-target) (apply db:get-target dbstruct params)) + ((get-targets) (db:get-targets dbstruct)) + + ;; ARCHIVES + ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) + + ;; TESTS + ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct 
params)) + ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) + ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) + ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) + ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) + ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) + ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) + ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) + ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) + ((test-get-logfile-info) (apply db:test-get-logfile-info dbstruct params)) + ((test-get-records-for-index-file) (apply db:test-get-records-for-index-file dbstruct params)) + ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) + ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) + ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) + ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) + ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) + ((synchash-get) (apply synchash:server-get dbstruct params)) + ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) + + ;; RUNS + ((get-run-info) (apply db:get-run-info dbstruct params)) + ((get-run-status) (apply db:get-run-status dbstruct params)) + ((set-run-status) (apply db:set-run-status dbstruct params)) + ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) + ((get-test-id) (apply db:get-test-id dbstruct params)) + ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) + ((get-runs) (apply db:get-runs dbstruct params)) + 
((get-num-runs) (apply db:get-num-runs dbstruct params)) + ((get-all-run-ids) (db:get-all-run-ids dbstruct)) + ((get-prev-run-ids) (apply db:get-prev-run-ids dbstruct params)) + ((get-run-ids-matching-target) (apply db:get-run-ids-matching-target dbstruct params)) + ((get-runs-by-patt) (apply db:get-runs-by-patt dbstruct params)) + ((get-run-name-from-id) (apply db:get-run-name-from-id dbstruct params)) + ((get-main-run-stats) (apply db:get-main-run-stats dbstruct params)) + ((get-var) (apply db:get-var dbstruct params)) + ((get-run-stats) (apply db:get-run-stats dbstruct params)) + + ;; STEPS + ((get-steps-data) (apply db:get-steps-data dbstruct params)) + ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) + + ;; TEST DATA + ((read-test-data) (apply db:read-test-data dbstruct params)) + ((read-test-data*) (apply db:read-test-data* dbstruct params)) + + ;; MISC + ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) + ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) + ((login) (apply db:login dbstruct params)) + ((general-call) (let ((stmtname (car params)) + (run-id (cadr params)) + (realparams (cddr params))) + (db:general-call dbstruct stmtname realparams))) + ((sdb-qry) (apply sdb:qry params)) + ((ping) (current-process-id)) + ((get-changed-record-ids) (apply db:get-changed-record-ids dbstruct params)) + + ;; TESTMETA + ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) + + ;; TASKS + ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) + (else + (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) + (conc "ERROR: BAD api call " cmd)))))) + + ;; save all stats + (let ((delta-t (- (current-milliseconds) + start-t))) + (hash-table-set! 
*db-api-call-time* cmd + (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) + (if writecmd-in-readonly-mode + (vector #f res) + (vector #t res))))))) ADDED emergency-patches/emergency-patch-2.scm Index: emergency-patches/emergency-patch-2.scm ================================================================== --- /dev/null +++ emergency-patches/emergency-patch-2.scm @@ -0,0 +1,311 @@ +(include "common_records.scm") +(include "key_records.scm") +(include "db_records.scm") +(include "run_records.scm") +(include "test_records.scm") + +(define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f)) ;; Throttle on cpu load: while the load from (common:get-cpu-load remote-host) is too high, log, sleep waitdelay seconds and re-check, at most count times. maxload is per cpu and is scaled by numcpus; msg, when given, is appended to the log line; callers pass remote-host #f for the local host. The return value is not meaningful. + (let* ((loadavg (common:get-cpu-load remote-host)) + (first (car loadavg)) ;; the first two entries of the load list are compared below + (next (cadr loadavg)) + (adjload (* maxload numcpus)) ;; scale the per-cpu limit to the whole host + (loadjmp (- first next))) ;; how far the first figure is above the second + (cond + ((and (> first adjload) ;; case 1: load is above the scaled limit + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding max of " adjload " " (if msg msg "")) + (thread-sleep! waitdelay) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ;; carry msg and remote-host along so the retry checks the same host + ((and (> loadjmp numcpus) ;; case 2: load jumped by more than numcpus + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load jump " loadjmp " > numcpus " numcpus " " (if msg msg "")) + (thread-sleep! waitdelay) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) ;; same as above: do not fall back to the defaults on retry + +(define (common:wait-for-homehost-load maxload msg) + (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
+ #f + (common:get-homehost))) + (hh (if hh-dat (car hh-dat) #f)) + (numcpus (common:get-num-cpus hh))) ;; NOTE(review): numcpus is never referenced below - confirm this lookup is still wanted + (common:wait-for-normalized-load maxload msg: msg remote-host: hh))) + +;; wait for normalized cpu load to drop below maxload +;; +(define (common:wait-for-normalized-load maxload #!key (msg #f)(remote-host #f)) + (let ((num-cpus (common:get-num-cpus remote-host))) + (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) ;; 15 is passed as the waitdelay argument of common:wait-for-cpuload + +;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) +(define (runs:process-expanded-tests runsdat testdat) ;; decide what to do with the queue head (hed); every branch of the cond below yields either a (hed tal reg reruns) list or #f + ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). + (let* ((hed (runs:testdat-hed testdat)) + (tal (runs:testdat-tal testdat)) + (reg (runs:testdat-reg testdat)) + (reruns (runs:testdat-reruns testdat)) + (test-name (runs:testdat-test-name testdat)) + (item-path (runs:testdat-item-path testdat)) + (jobgroup (runs:testdat-jobgroup testdat)) + (waitons (runs:testdat-waitons testdat)) + (item-path (runs:testdat-item-path testdat)) ;; NOTE(review): item-path is already bound a few lines up; this rebinding looks redundant + (testmode (runs:testdat-testmode testdat)) + (newtal (runs:testdat-newtal testdat)) + (itemmaps (runs:testdat-itemmaps testdat)) + (test-record (runs:testdat-test-record testdat)) + (prereqs-not-met (runs:testdat-prereqs-not-met testdat)) + + (reglen (runs:dat-reglen runsdat)) + (regfull (runs:dat-regfull runsdat)) + (runname (runs:dat-runname runsdat)) + (max-concurrent-jobs (runs:dat-max-concurrent-jobs runsdat)) + (run-id (runs:dat-run-id runsdat)) + (test-patts (runs:dat-test-patts runsdat)) + (required-tests (runs:dat-required-tests runsdat)) + (test-registry (runs:dat-test-registry runsdat)) + (registry-mutex (runs:dat-registry-mutex runsdat)) + (flags (runs:dat-flags runsdat)) + (keyvals (runs:dat-keyvals runsdat)) + (run-info (runs:dat-run-info
runsdat)) + (all-tests-registry (runs:dat-all-tests-registry runsdat)) + (run-limits-info (runs:dat-can-run-more-tests runsdat)) + ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (have-resources (car run-limits-info)) + (num-running (list-ref run-limits-info 1)) + (num-running-in-jobgroup(list-ref run-limits-info 2)) + (max-concurrent-jobs (list-ref run-limits-info 3)) ;; rebinds max-concurrent-jobs, replacing the value read from runsdat above + (job-group-limit (list-ref run-limits-info 4)) + ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) + (fails (if (list? prereqs-not-met) + (runs:calc-fails prereqs-not-met) + (begin + (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) + '()))) + (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! + (not (equal? x hed))) + (runs:calc-not-completed prereqs-not-met))) + (loop-list (list hed tal reg reruns)) ;; NOTE(review): loop-list is not referenced in the rest of this procedure - confirm it can go + ;; configure the load runner + (numcpus (common:get-num-cpus #f)) + (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable + (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "1.2"))) ;; use a non-number string to disable + (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) + (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" + (string-intersperse + (map (lambda (t) + (if (vector? t) + (conc (db:test-get-state t) "/" (db:test-get-status t)) + (conc " WARNING: t is not a vector=" t ))) + prereqs-not-met) + ", ") ") fails: " fails + "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) + + + + (if (and (not (null?
prereqs-not-met)) + (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) + (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) + + ;; Don't know at this time if the test have been launched at some time in the past + ;; i.e. is this a re-launch? + (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info) + + (cond + + ;; Check item path against item-patts, + ;; + ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run + ;; else the run is stuck, temporarily or permanently + ;; but should check if it is due to lack of resources vs. prerequisites + (debug:print-info 1 *default-log-port* "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) + (if (or (not (null? tal))(not (null? reg))) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns) + #f)) ;; tal and reg are both empty, nothing left to hand back + + ;; Register tests + ;; + ((not (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) + (debug:print-info 4 *default-log-port* "Pre-registering test " test-name "/" item-path " to create placeholder" ) + ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs + (let register-loop ((numtries 15)) ;; re-register, sleeping 0.5s between attempts, until the test id can be read back or the tries run out + (rmt:register-test run-id test-name item-path) + (if (rmt:get-test-id run-id test-name item-path) + (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'done) + (if (> numtries 0) + (begin + (thread-sleep! 0.5) + (register-loop (- numtries 1))) + (debug:print-error 0 *default-log-port* "failed to register test " (db:test-make-full-name test-name item-path))))) + (if (not (eq?
(hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f) 'done)) + (begin ;; also register the test with an empty item-path when that entry is not yet marked done + (rmt:register-test run-id test-name "") + (if (rmt:get-test-id run-id test-name "") + (hash-table-set! test-registry (db:test-make-full-name test-name "") 'done)))) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + (if (and (null? tal)(null? reg)) + (list hed tal (append reg (list hed)) reruns) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + ;; NB// Here we are building reg as we register tests + ;; if regfull we must pop the front item off reg + (if regfull + (append (cdr reg) (list hed)) + (append reg (list hed))) + reruns))) + + ;; At this point hed test registration must be completed. + ;; + ((eq? (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f) + 'start) + (debug:print-info 0 *default-log-port* "Waiting on test registration(s): " + (string-intersperse + (filter (lambda (x) + (eq? (hash-table-ref/default test-registry x #f) 'start)) + (hash-table-keys test-registry)) + ", ")) + (thread-sleep! 0.051) + (list hed tal reg reruns)) ;; hand the queue back unchanged + + ;; If no resources are available just kill time and loop again + ;; + ((not have-resources) ;; simply try again after waiting a second + (if (runs:lownoise "no resources" 60) + (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) + ;; Have gone back and forth on this but db starvation is an issue. + ;; wait one second before looking again to run jobs. + (thread-sleep! 1) + ;; could have done hed tal here but doing car/cdr of newtal to rotate tests + (list (car newtal)(cdr newtal) reg reruns)) + + ;; This is the final stage, everything is in place so launch the test + ;; + ((and have-resources + (or (null? prereqs-not-met) + (and (member 'toplevel testmode) ;; 'toplevel) + (null? non-completed) + (not (member 'exclusive testmode))))) + ;; (hash-table-delete!
*max-tries-hash* (db:test-make-full-name test-name item-path)) + ;; we are going to reset all the counters for test retries by setting a new hash table + ;; this means they will increment only when nothing can be run + (set! *max-tries-hash* (make-hash-table)) + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the + ;; average cpu load is under the threshold before continuing + (if maxload ;; only gate if maxload is specified + (common:wait-for-cpuload maxload numcpus waitdelay)) + (if maxhomehostload ;; string->number gives #f for a non-number setting, which skips this gate + (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) + + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) + (runs:incremental-print-results run-id) + (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (if (or (not (null? tal))(not (null? reg))) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns) + #f)) + + ;; must be we have unmet prerequisites + ;; + (else + (debug:print 4 *default-log-port* "FAILS: " fails) + ;; If one or more of the prereqs-not-met are FAIL then we can issue + ;; a message and drop hed from the items to be processed. + ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) + (if (and (not (null? prereqs-not-met)) + (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) + (debug:print-info 1 *default-log-port* "waiting on tests; " (string-intersperse + (runs:mixed-list-testname-and-testrec->list-of-strings + prereqs-not-met) ", "))) + (if (or (null? fails) + (member 'toplevel testmode)) + (begin + ;; couldn't run, take a breather + (if (runs:lownoise "Waiting for more work to do..."
60) + (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) + (thread-sleep! 1) + (list (car newtal)(cdr newtal) reg reruns)) + ;; the waiton is FAIL so no point in trying to run hed ever again + (if (or (not (null? reg))(not (null? tal))) + (if (vector? hed) + (begin + (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path + " from the launch list as it has prerequistes that are FAIL") + (let ((test-id (rmt:get-test-id run-id hed ""))) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + ;; This next is for the items + (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) + (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? + )) + (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; the registry entry keyed by hed holds either a retry count or a state symbol such as 'removed + (cond + ((member "RUNNING" (map db:test-get-state prereqs-not-met)) ;; NOTE(review): prereqs-not-met may hold non-vector entries (see the vector? check in the debug print near the top) - confirm db:test-get-state tolerates them + (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) + (thread-sleep! 4) + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns)) + ((or (not nth-try) + (and (number? nth-try) + (< nth-try 10))) + (hash-table-set! test-registry hed (if (number?
nth-try) + (+ nth-try 1) + 0)) + (if (runs:lownoise (conc "not removing test " hed) 60) + (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) + ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + ;; (list hed tal reg reruns) + ;; (list (car newtal)(cdr newtal) reg reruns) + ;; (hash-table-set! test-registry hed 'removed) + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns)) + ((symbol? nth-try) + (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW + (if (null? tal) + #f ;; yes, really + (list (car tal)(cdr tal) reg reruns)) + (begin + (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry.")) + (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) + (hash-table-set! test-registry hed 0) ;; restart the retry count + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns)))) + (else ;; remaining case, e.g. a retry count that has reached 10 + (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now.")) + ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) + (hash-table-set!
test-registry hed 'removed) + (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f) + ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug. + (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL + (list (if (null? tal)(car newtal)(car tal)) + tal + reg + reruns))))) + ;; can't drop this - maybe running? Just keep trying + (let ((runable-tests (runs:runable-tests prereqs-not-met))) ;; reached only when both reg and tal are empty + (if (null? runable-tests) + #f ;; I think we are truly done here + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns))))))))) ADDED emergency-patches/emergency-patch-3.scm Index: emergency-patches/emergency-patch-3.scm ================================================================== --- /dev/null +++ emergency-patches/emergency-patch-3.scm @@ -0,0 +1,81 @@ + ;; To build patch: + ;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ldd /p/foundry/env/pkgs/megatest/1.64/19/bin/.11/mtest + ;; linux-vdso.so.1 => (0x00002aaaaaaab000) + ;; libchicken.so.7 => /p/foundry/env/pkgs/megatest/1.64/chicken-4.10.0//lib/libchicken.so.7 (0x00002aaaaaaad000) + ;; libm.so.6 => /lib64/libm.so.6 (0x00002aaaab0a6000) + ;; libdl.so.2 => /lib64/libdl.so.2 (0x00002aaaab31f000) + ;; libc.so.6 => /lib64/libc.so.6 (0x00002aaaab523000) + ;; /lib64/ld-linux-x86-64.so.2 (0x0000555555554000) + ;; + ;; /p/foundry/env/pkgs/megatest/1.64/chicken-4.10.0/bin/csc -s emergency-patch-3.scm + ;; + + + ;; to test patch: + ;;;;;;;;;;;;;;;;;;;;;;;;; + ;; in .megatestrc, add: + ;; (if (and (> megatest-version 1.64) + ;; (< megatest-version 1.6421)) + ;; (begin + ;; (load "/p/foundry/env/pkgs/megatest/1.64/19/share/epatch-1.so") + ;; (load "/p/foundry/env/pkgs/megatest/1.64/19/share/epatch-2.so"))) + ;; + + + ;; to productize patch: + ;;;;;;;;;;;;;;;;;;;;;;;;; + ;; +(use directory-utils regex) + +(include
"common_records.scm") +(include "key_records.scm") +(include "db_records.scm") +(include "run_records.scm") +(include "test_records.scm") + +;; Start a server process for the testsuite area at areapath ### NOTE ### > file 2>&1 +;; the command is (common:get-megatest-exe) -server <target-host or "-">, launched through nbfake +;; TARGETHOST is exported only when target-host matches neither the current host name nor curr-ip +;; incidental: rotate logs in logs/ dir. +;; before launching, wait for the normalized load to drop below server/load-limit (default 0.9) +(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area + (let* ((curr-host (get-host-name)) + ;; (attempt-in-progress (server:start-attempted? areapath)) + ;; (dot-server-url (server:check-if-running areapath)) + (curr-ip (server:get-best-guess-address curr-host)) + (curr-pid (current-process-id)) + (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + (target-host (car homehost)) ;; NOTE(review): assumes common:get-homehost returns a non-empty list - confirm it cannot be #f here + (testsuite (common:get-testsuite-name)) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc (common:get-megatest-exe) + " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") + ;; " -log " logfile + " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) + (load-limit (configf:lookup-number *configdat* "server" "load-limit" default: 0.9))) + ;; we want the remote server to start in *toppath* so push there + (push-directory areapath) + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal?
curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" logfile) + (common:wait-for-normalized-load load-limit msg: " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever; msg is a #!key argument of common:wait-for-normalized-load so it must be passed as msg: + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))) ADDED get-config-settings.sh Index: get-config-settings.sh ================================================================== --- /dev/null +++ get-config-settings.sh @@ -0,0 +1,2 @@ + grep configf:lookup *.scm | sed 's/^.*:lookup//; s/^-number//; s/^ //' | grep -v '^\(section\|test-conf\|tconfig\|testconfig\|dat\|config\|views-cfgdat\)' | perl -pe 's/^\s*(\*configdat\*|configdat|mtconf)//; s/^\s+//; s/\).*$//; s/"//g' | awk '{print $1,$2}' | sort | grep -v section | sort | uniq + Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -239,10 +239,11 @@ (msg ((condition-property-accessor 'exn 'message) exn))) (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg) (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) + (debug:print 0 *default-log-port* " call-chain: " call-chain) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? (mutex-unlock!
*http-mutex*) @@ -301,10 +302,11 @@ exn (begin (print-call-chain *default-log-port*) (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) (close-connection! api-dat) + ;;(close-idle-connections!) #t)) #f))) (define (make-http-transport:server-dat)(make-vector 6)) @@ -377,11 +379,11 @@ (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) - (server-timeout (server:get-timeout)) + (server-timeout (server:expiration-timeout)) (server-going #f) (server-log-file (args:get-arg "-log"))) ;; always set when we are a server (let loop ((count 0) (server-state 'available) (bad-sync-count 0) @@ -431,21 +433,15 @@ (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) - (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) - (adjusted-timeout (if (> hrs-since-start 1) - (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour - server-timeout))) - (if (common:low-noise-print 120 "server timeout") - (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) + (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (+ last-access server-timeout) - (current-seconds)) - (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. + (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn @@ -492,33 +488,45 @@ ;; all routes though here end in exit ... ;; ;; start_server? 
;; (define (http-transport:launch) - ;; lets not even bother to start if there are already three or more server files ready to go - (let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) - (if (> num-alive 3) - (begin - (debug:print 0 *default-log-port* "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up") - (exit)))) - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - )) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server monitor thread started") - (http-transport:keep-running) - "Keep running")))) - (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit))) + (start-time-old (> (- (current-seconds) start-time) 5)) + (cleanup-proc (lambda (msg) + (let* ((serv-fname (conc "server-" (current-process-id) "-" (get-host-name) ".log")) + (full-serv-fname (conc *toppath* "/logs/" serv-fname)) + (new-serv-fname (conc *toppath* "/logs/" "defunct-" serv-fname))) + (debug:print 0 *default-log-port* msg) + (if (common:file-exists? 
full-serv-fname) + (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname)) + (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname)) + (exit))))) + (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting") + (exit))) + ;; lets not even bother to start if there are already three or more server files ready to go + (let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) + (if (> num-alive 3) + (begin + (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) + (exit)))) + (let* ((th2 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server run thread started") + (http-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + )) "Server run")) + (th3 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server monitor thread started") + (http-transport:keep-running) + "Keep running")))) + (thread-start! th2) + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. + (thread-start! th3) + (set! *didsomething* #t) + (thread-join! th2) + (exit)))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -451,11 +451,11 @@ (keyvals #f) (fullrunscript (if (not runscript) #f (if (substring-index "/" runscript) runscript ;; use unadultered if contains slashes - (let ((fulln (conc testpath "/" runscript))) + (let ((fulln (conc work-area "/" runscript))) (if (and (common:file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) @@ -467,67 +467,95 @@ (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! 
*toppath* top-path) (setenv "MT_TEST_RUN_DIR" work-area) - ;; On NFS it can be slow and unreliable to get needed startup information. - ;; i. Check if we are on the homehost, if so, proceed - ;; ii. Check if host and port passed in via CMDINFO are valid and if - ;; possible use them. - (let ((bestadrs (server:get-best-guess-address (get-host-name))) - (needcare #f)) - (if (equal? homehost bestadrs) ;; we are likely on the homehost - (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) - (let ((host-port (if serverurl (string-split serverurl ":") #f))) - (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* - (if (string? homehost) - (if (and host-port - (> (length host-port) 1)) - (let* ((host (car host-port)) - (port (cadr host-port)) - (start-res (http-transport:client-connect host port)) - (ping-res (rmt:login-no-auto-client-setup start-res))) - (if (and start-res - ping-res) - ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) - (begin - (remote-conndat-set! *runremote* start-res) - ;; (remote-server-url-set! *runremote* url) - ;; (if (server:ping url) - (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) - (begin - (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) - (set! *runremote* #f)) - ;; (remote-conndat-set! *runremote* #f)) - )) - (begin - (set! *runremote* #f) - (debug:print-info 0 *default-log-port* (if host-port - (conc "received invalid host-port information " host-port) - "no host-port information received")) - ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. - (set! needcare #t))) - (begin - (set! *runremote* #f) - (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") - (set! 
needcare #t))))) - (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host - (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory - (handle-exceptions - exn - (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) - (create-directory logdir #t))))) - + ;; ;; On NFS it can be slow and unreliable to get needed startup information. + ;; ;; i. Check if we are on the homehost, if so, proceed + ;; ;; ii. Check if host and port passed in via CMDINFO are valid and if + ;; ;; possible use them. + ;; (let ((bestadrs (server:get-best-guess-address (get-host-name))) + ;; (needcare #f)) + ;; (if (equal? homehost bestadrs) ;; we are likely on the homehost + ;; (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) + ;; (let ((host-port (if serverurl (string-split serverurl ":") #f))) + ;; (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* + ;; (if (string? homehost) + ;; (if (and host-port + ;; (> (length host-port) 1)) + ;; (let* ((host (car host-port)) + ;; (port (cadr host-port)) + ;; (start-res (http-transport:client-connect host port)) + ;; (ping-res (rmt:login-no-auto-client-setup start-res))) + ;; (if (and start-res + ;; ping-res) + ;; ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) + ;; (begin + ;; (remote-conndat-set! *runremote* start-res) + ;; ;; (remote-server-url-set! *runremote* url) + ;; ;; (if (server:ping url) + ;; (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) + ;; (set! *runremote* #f)) + ;; ;; (remote-conndat-set! *runremote* #f)) + ;; )) + ;; (begin + ;; (set! 
*runremote* #f) + ;; (debug:print-info 0 *default-log-port* (if host-port + ;; (conc "received invalid host-port information " host-port) + ;; "no host-port information received")) + ;; ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. + ;; (set! needcare #t))) + ;; (begin + ;; (set! *runremote* #f) + ;; (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") + ;; (set! needcare #t))))) + ;; (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host + ;; (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory + ;; (handle-exceptions + ;; exn + ;; (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (create-directory logdir #t))))) + ;; ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) - (if (or (common:file-exists? top-path) + (if (or (common:directory-exists? work-area) (> count 10)) - (change-directory top-path) + (change-directory work-area) (begin - (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") + (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") (thread-sleep! 10) (loop (+ count 1))))) + + (if (not (string=? (common:real-path work-area)(common:real-path (current-directory)))) + (begin + (debug:print 0 *default-log-port* + "INFO: we are expecting to be in directory " work-area "\n" + " but we are actually in the directory " (current-directory) "\n" + " doing another change dir.") + (change-directory work-area))) + + ;; spot check that the files in testpath are available. Too often NFS delays cause problems here. 
+ (let ((files (glob (conc testpath "/*"))) + (bad-files '())) + (for-each + (lambda (fullname) + (let* ((fname (pathname-strip-directory fullname)) + (targn (conc work-area "/" fname))) + (if (not (file-exists? targn)) + (set! bad-files (cons fname bad-files))))) + files) + (if (not (null? bad-files)) + (begin + (debug:print 0 *default-log-port* "INFO: test data from " testpath " not copied properly or filesystem problems causing data to not be found. Re-running the copy command.") + (debug:print 0 *default-log-port* "INFO: missing files from " work-area ": " (string-intersperse bad-files ", ")) + (launch:test-copy testpath work-area)))) + + ;; one more time, change to the work-area directory + (change-directory work-area) + (launch:setup) ;; should be properly in the top-path now (set! tconfigreg (tests:get-all)) (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) @@ -559,11 +587,12 @@ (begin (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond - ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. + ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! 
run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) @@ -948,12 +977,24 @@ (read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ... sections: sections))) (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) (mtcachef (car cachefiles)) (rccachef (cdr cachefiles))) - (if rccachef (configf:write-alist runconfigdat rccachef)) - (if mtcachef (configf:write-alist *configdat* mtcachef)) + ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "/p/fdk/gwa/lefkowit/mtTesting/qa/primbeqa/links/p1222/11/PDK_r1.1.1/prim/clean/pcell_testgen/.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 + ;; TODO - consider 1) using simple-lock to bracket cache write + ;; 2) cache in hash on server, since need to do rmt: anyway to lock. + + (if rccachef + (common:fail-safe + (lambda () + (configf:write-alist runconfigdat rccachef)) + (conc "Could not write cache file - "rccachef))) + (if mtcachef + (common:fail-safe + (lambda () + (configf:write-alist *configdat* mtcachef)) + (conc "Could not write cache file - "mtcachef))) (set! *runconfigdat* runconfigdat) (if (and rccachef mtcachef) (set! *configstatus* 'fulldata)))) ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table (set! *configdat* (make-hash-table)) ))) @@ -1018,12 +1059,26 @@ ;; one more attempt to cache the configs for future reading (let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) (mtcachef (car cachefiles)) (rccachef (cdr cachefiles))) - (if (and rccachef *runconfigdat* (not (common:file-exists? rccachef))) (configf:write-alist *runconfigdat* rccachef)) - (if (and mtcachef *configdat* (not (common:file-exists? 
mtcachef))) (configf:write-alist *configdat* mtcachef)) + + ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "/p/fdk/gwa/lefkowit/mtTesting/qa/primbeqa/links/p1222/11/PDK_r1.1.1/prim/clean/pcell_testgen/.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 + ;; TODO - consider 1) using simple-lock to bracket cache write + ;; 2) cache in hash on server, since need to do rmt: anyway to lock. + (if (and rccachef *runconfigdat* (not (common:file-exists? rccachef))) + (common:fail-safe + (lambda () + (configf:write-alist *runconfigdat* rccachef)) + (conc "Could not write cache file - "rccachef)) + ) + (if (and mtcachef *configdat* (not (common:file-exists? mtcachef))) + (common:fail-safe + (lambda () + (configf:write-alist *configdat* mtcachef)) + (conc "Could not write cache file - "mtcachef)) + ) (if (and rccachef mtcachef *runconfigdat* *configdat*) (set! *configstatus* 'fulldata))) ;; if have -append-config then read and append here (let ((cfname (args:get-arg "-append-config"))) @@ -1043,10 +1098,26 @@ (cdr res) (begin (if (common:low-noise-print 20 "No valid disks or no disk with enough space") (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. 
Current setting is: " minspace)) (exit 1))))))) ;; TODO - move the exit to the calling location and return #f + +(define (launch:test-copy test-src-path test-path) + (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd"))) + (if cmd + ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH + (string-substitute "TEST_TARG_PATH" test-path + (string-substitute "TEST_SRC_PATH" test-src-path cmd #t) #t) + #f))) + (cmd (if ovrcmd + ovrcmd + (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" + " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) + (status (system cmd))) + (if (not (eq? status 0)) + (debug:print 2 *default-log-port* "ERROR: problem with running \"" cmd "\"")))) + ;; Desired directory structure: ;; ;; - - -. ;; | @@ -1208,23 +1279,11 @@ (if (not (directory? test-path)) (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes (if (and test-src-path (directory? test-path)) (begin - (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd"))) - (if cmd - ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH - (string-substitute "TEST_TARG_PATH" test-path - (string-substitute "TEST_SRC_PATH" test-src-path cmd #t) #t) - #f))) - (cmd (if ovrcmd - ovrcmd - (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" - " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) - (status (system cmd))) - (if (not (eq? 
status 0)) - (debug:print 2 *default-log-port* "ERROR: problem with running \"" cmd "\""))) + (launch:test-copy test-src-path test-path) (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; @@ -1238,23 +1297,23 @@ ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex - (let* ((lock-key (conc "test-" test-id)) - (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) - (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds - (if (car lock) - #t - (if (> (current-seconds) expire-time) - (begin - (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) - (rmt:no-sync-del! lock-key) ;; destroy the lock - (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; - (begin - (thread-sleep! 1) - (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + (let* ( ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 
1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) (item-path (item-list->path itemdat)) (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) (if (> launch-delay delta) @@ -1427,11 +1486,11 @@ (car fullcmd)) (if useshell '() (cdr fullcmd))))) (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. - (rmt:no-sync-del! lock-key) ;; release the lock for starting this test + ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. 
(declare (unit megatest-version)) -(define megatest-version 1.6424) +(define megatest-version 1.6429) Index: megatest.config ================================================================== --- megatest.config +++ megatest.config @@ -22,5 +22,7 @@ all areas=fullrun,ext-tests snazy areas=%; selector=QUICKPATT/ [nopurpose] +[server] +timeout 1 Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -53,24 +53,48 @@ (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") +(define *usage-log-file* #f) ;; put path to file for logging usage in this var in the ~/.megatestrc file +(define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file + +;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file +;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) (load debugcontrolf))) + +;; usage logging, careful with this, it is not designed to deal with all real world challenges! +;; +(if (and *usage-log-file* + (file-write-access? 
*usage-log-file*)) + (with-output-to-file + *usage-log-file* + (lambda () + (print + (if *usage-use-seconds* + (current-seconds) + (time->string + (seconds->local-time (current-seconds)) + "%Yww%V.%w %H:%M:%S")) + " " + (current-user-name) " " + (current-directory) " " + "\"" (string-intersperse (argv) " ") "\"")) + #:append)) ;; Disabled help items ;; -rollup : (currently disabled) fill run (set by :runname) with latest test(s) ;; from prior runs with same keys ;; -daemonize : fork into background and disconnect from stdin/out (define help (conc " Megatest, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version " - license GPL, Copyright Matt Welland 2006-2015 - + license GPL, Copyright Matt Welland 2006-2017 + Usage: megatest [options] -h : this help -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") @@ -90,10 +114,11 @@ -get-run-status : gets status for run specified by target and runname -run-wait : wait on run specified by target and runname -preclean : remove the existing test directory before running the test -clean-cache : remove the cached megatest.config and runconfigs.config files -no-cache : do not use the cached config files. + -one-pass : launch as many tests as you can but do not wait for more to be ready Selectors (e.g. use for -runtests, -remove-runs, -set-state-status, -list-runs etc.) -target key1/key2/... : run for key1, key2, etc. -reqtarg key1/key2/... : run for key1, key2, etc. but key1/key2 must be in runconfigs -testpatt patt1/patt2,patt3/... : % is wildcard @@ -323,10 +348,11 @@ "-lock" "-unlock" "-list-servers" "-kill-servers" "-run-wait" ;; wait on a run to complete (i.e. 
no RUNNING) + "-one-pass" ;; "-local" ;; run some commands using local db access "-generate-html" ;; misc queries "-list-disks" @@ -1557,11 +1583,11 @@ (lambda (target runname keys keyvals) (if (args:get-arg "-rerun-clean") ;; first set states/statuses correct (let ((states (or (configf:lookup *configdat* "validvalues" "cleanrerun-states") "KILLREQ,KILLED,UNKNOWN,INCOMPLETE,STUCK,NOT_STARTED")) (statuses (or (configf:lookup *configdat* "validvalues" "cleanrerun-statuses") - "FAIL,INCOMPLETE,ABORT,CHECK"))) + "FAIL,INCOMPLETE,ABORT,CHECK,DEAD"))) (hash-table-set! args:arg-hash "-preclean" #t) (runs:operate-on 'set-state-status target (common:args-get-runname) ;; (or (args:get-arg "-runname")(args:get-arg ":runname")) "%" ;; (common:args-get-testpatt #f) ;; (args:get-arg "-testpatt") Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -128,24 +128,25 @@ ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) ;; - ;; ;;DOT CASE4 [label="reset\nconnection"]; - ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;; ;;DOT CASE4 -> "rmt:send-receive"; - ;; ;; reset the connection if it has been unused too long - ;; ((and runremote - ;; (remote-conndat runremote) - ;; (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? 
Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #.\n message: Server closed connection before sending response" - ;; (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) - ;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - ;; (http-transport:close-connections area-dat: runremote) - ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - ;; (mutex-unlock! *rmt-mutex*) - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) - + ;;DOT CASE4 [label="reset\nconnection"]; + ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} + ;;DOT CASE4 -> "rmt:send-receive"; + ;; reset the connection if it has been unused too long + ((and runremote + (remote-conndat runremote) + (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on + (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) + (remote-server-timeout runremote)))) + (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") + (http-transport:close-connections area-dat: runremote) + (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. + (mutex-unlock! *rmt-mutex*) + (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a read ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. 
server NOT required @@ -229,13 +230,13 @@ ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; ;; not on homehost, do server query (else - (mutex-unlock! *rmt-mutex*) + ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") - (mutex-lock! *rmt-mutex*) + ;; (mutex-lock! *rmt-mutex*) (let* ((conninfo (remote-conndat runremote)) (dat (case (remote-transport runremote) ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away (http-transport:client-api-send-receive 0 conninfo cmd params) ((commfail)(vector #f "communications fail")) @@ -243,22 +244,26 @@ (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) - (if (and (vector? conninfo) (> 5 (vector-length conninfo))) + (if (and (vector? conninfo) (< 5 (vector-length conninfo))) (http-transport:server-dat-update-last-access conninfo) ;; refresh access time - (begin + (begin + (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) (set! conninfo #f) - (remote-conndat-set! runremote #f))) + (remote-conndat-set! *runremote* #f) + (http-transport:close-connections area-dat: runremote))) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) (eq? (vector-length res) 2) (eq? 
(vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. + ;; this is the case where the returned data is bad or the server is overloaded and we want + ;; to ease off the queries (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) (http-transport:close-connections area-dat: runremote) (set! *runremote* #f) ;; force starting over @@ -266,14 +271,16 @@ (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (remote-conndat-set! runremote #f) + (mutex-lock! *rmt-mutex*) + (remote-conndat-set! runremote #f) (http-transport:close-connections area-dat: runremote) (remote-server-url-set! runremote #f) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") + (mutex-unlock! 
*rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") ;; (if (not (server:check-if-running *toppath*)) ;; (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;;DOT } Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -266,10 +266,49 @@ (debug:print-info 0 *default-log-port* "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 *default-log-port* "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 *default-log-port* "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) +(define (runs:run-post-hook run-id) + (let* ((run-post-hook (configf:lookup *configdat* "runs" "post-hook")) + (existing-tests (if run-post-hook + (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses + #f #f ;; offset limit + #f ;; not-in + #f ;; sort-by + #f ;; sort-order + #f ;; get full data (not 'shortlist) + 0 ;; (runs:gendat-inc-results-last-update *runs:general-data*) ;; last update time + 'dashboard) + '())) + (log-dir (conc *toppath* "/logs")) + (log-file (conc "post-hook-" (string-translate (getenv "MT_TARGET") "/" "-") "-" (getenv "MT_RUNNAME") ".log")) + (full-log-fname (conc log-dir "/" log-file))) + (if run-post-hook + ;; (if (null? existing-tests) + ;; (debug:print 0 *default-log-port* "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) + (let* ((use-log-dir (if (not (directory-exists? 
log-dir)) + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + #f) + (create-directory log-dir #t) + #t) + #t)) + (start-time (current-seconds)) + (actual-logf (if use-log-dir full-log-fname log-file))) + (handle-exceptions + exn + (begin + (print-call-chain *default-log-port*) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) + (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) + (system (conc run-post-hook " >> " actual-logf " 2>&1")) + (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) + ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected. ;; FIXME: error out if /patt specified ;; (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names @@ -326,13 +365,13 @@ (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand)) - ;; force the starting of a server - (debug:print 0 *default-log-port* "waiting on server...") - (server:start-and-wait *toppath*) + ;; force the starting of a server -- removed BB 17ww28 - no longer needed. + ;;(debug:print 0 *default-log-port* "waiting on server...") + ;;(server:start-and-wait *toppath*) (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (set! runconf (if (common:file-exists? 
runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin @@ -521,20 +560,10 @@ (begin (print-call-chain) (print " message: " ((condition-property-accessor 'exn 'message) exn))) (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) - ;; (handle-exceptions - ;; exn - ;; (begin - ;; (print-call-chain (current-error-port)) - ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) - ;; (if (> run-queue-retries 0) - ;; (begin - ;; (set! run-queue-retries (- run-queue-retries 1)) - ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)))) - ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) (th2 (make-thread (lambda () ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) @@ -594,12 +623,21 @@ (cdr reg) (if (null? tal) ;; if tal is null and reg not full then '() as reg contents moved to tal '() reg))) +;; this is the list of parameters to the named loop "loop" near the top of runs:run-tests-queue, look around line 1216 +;; +(define (runs:loop-values tal reg reglen regfull reruns) + (list (runs:queue-next-hed tal reg reglen regfull) ;; hed + (runs:queue-next-tal tal reg reglen regfull) ;; tal + (runs:queue-next-reg tal reg reglen regfull) ;; reg + reruns)) ;; reruns + (define runs:nothing-left-in-queue-count 0) +;; BB: for future reference - suspect target vars are not expanded to env vars at this point (item expansion using [items]\nwhatever [system echo $TARGETVAR] doesnt work right whereas [system echo #{targetvar}] does.. Tal and Randy have tix on this. 
on first pass, var not set, on second pass, ok. (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) (if (list? res) res @@ -635,14 +673,11 @@ (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a) '(DONOTRUN removed CANNOTRUN))) ;; *common:cant-run-states-sym*) ;; '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here (debug:print-info 1 *default-log-port* "Test " hed " set to \"" (hash-table-ref test-registry (db:test-make-full-name hed item-path)) "\". Removing it from the queue") (if (or (not (null? tal)) (not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) + (runs:loop-values tal reg reglen regfull reruns) (begin (debug:print-info 0 *default-log-port* "Nothing left in the queue!") ;; If get here twice then we know we've tried to expand all items ;; since there must be a logic issue with the handling of loops in the ;; items expand phase we will brute force an exit here. @@ -711,14 +746,12 @@ (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites"))) (if (and (null? trimmed-tal) (null? trimmed-reg)) #f - (list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull) - (runs:queue-next-tal trimmed-tal trimmed-reg reglen regfull) - (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) - reruns))) + (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) + )) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? 
fails) (null? prereq-fails) (null? non-completed)) @@ -733,14 +766,12 @@ (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns)))) + (runs:loop-values tal reg reglen regfull reruns) + ))) ((and (or (not (null? fails)) (not (null? prereq-fails))) (member 'normal testmode)) @@ -753,27 +784,21 @@ (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - (cons hed reruns))) + (runs:loop-values tal reg reglen regfull (cons hed reruns)) + ) #f)) ;; #f flags do not loop ((and (not (null? fails))(member 'toplevel testmode)) (if (or (not (null? reg))(not (null? tal))) (list (car newtal)(append (cdr newtal) reg) '() reruns) #f)) ((null? runnables) #f) ;; if we get here and non-completed is null then it is all over. (else (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. 
Watch for issues with test " hed ", continuing for now") - ;; (list (runs:queue-next-hed tal reg reglen regfull) - ;; (runs:queue-next-tal tal reg reglen regfull) - ;; (runs:queue-next-reg tal reg reglen regfull) - ;; reruns) (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (if (null? inlst) '() @@ -873,14 +898,11 @@ ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run ;; else the run is stuck, temporarily or permanently ;; but should check if it is due to lack of resources vs. prerequisites (debug:print-info 1 *default-log-port* "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) (if (or (not (null? tal))(not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) + (runs:loop-values tal reg reglen regfull reruns) #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) @@ -901,11 +923,11 @@ (if (rmt:get-test-id run-id test-name "") (hash-table-set! test-registry (db:test-make-full-name test-name "") 'done)))) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) (list hed tal (append reg (list hed)) reruns) - (list (runs:queue-next-hed tal reg reglen regfull) + (list (runs:queue-next-hed tal reg reglen regfull) ;; cannot replace with a call to runs:loop-values as the logic is different for reg (runs:queue-next-tal tal reg reglen regfull) ;; NB// Here we are building reg as we register tests ;; if regfull we must pop the front item off reg (if regfull (append (cdr reg) (list hed)) @@ -958,14 +980,11 @@ (runs:incremental-print-results run-id) (hash-table-set! 
test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) + (runs:loop-values tal reg reglen regfull reruns) #f)) ;; must be we have unmet prerequisites ;; (else @@ -997,25 +1016,18 @@ (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? - )) + (runs:loop-values tal reg reglen regfull reruns)) (let ((nth-try (hash-table-ref/default test-registry hed 0))) (cond ((member "RUNNING" (map db:test-get-state prereqs-not-met)) (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) (thread-sleep! 4) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)) + (runs:loop-values newtal reg reglen regfull reruns)) ;; keep using newtal here, as the replaced (list ...) form did, so this branch queues exactly what it did before the refactor ((or (not nth-try) (and (number? nth-try) (< nth-try 10))) (hash-table-set! test-registry hed (if (number? 
nth-try) (+ nth-try 1) @@ -1022,17 +1034,11 @@ 0)) (if (runs:lownoise (conc "not removing test " hed) 60) (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) - ;; (list hed tal reg reruns) - ;; (list (car newtal)(cdr newtal) reg reruns) - ;; (hash-table-set! test-registry hed 'removed) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)) + (runs:loop-values newtal reg reglen regfull reruns)) ((symbol? nth-try) (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW (if (null? tal) #f ;; yes, really (list (car tal)(cdr tal) reg reruns)) @@ -1039,14 +1045,11 @@ (begin (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry.")) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) (hash-table-set! test-registry hed 0) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)))) + (runs:loop-values newtal reg reglen regfull reruns)))) ;; reruns must be passed through, as in the replaced (list ...) form and every other call site (else (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now.")) ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) (hash-table-set! 
test-registry hed 'removed) @@ -1056,17 +1059,14 @@ (list (if (null? tal)(car newtal)(car tal)) tal reg reruns))))) ;; can't drop this - maybe running? Just keep trying - (let ((runable-tests (runs:runable-tests prereqs-not-met))) + (let ((runable-tests (runs:runable-tests prereqs-not-met))) ;; SUSPICIOUS: Should look at more than just prereqs-not-met? (if (null? runable-tests) #f ;; I think we are truly done here - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns))))))))) + (runs:loop-values newtal reg reglen regfull reruns))))))))) ;; scan a list of tests looking to see if any are potentially runnable ;; (define (runs:runable-tests tests) (filter (lambda (t) @@ -1271,17 +1271,18 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) + ;; -- removed BB 17ww28 - no longer needed. ;; every 15 minutes verify the server is there for this run - (if (and (common:low-noise-print 240 "try start server" run-id) - (not (or (and *runremote* - (remote-server-url *runremote*) - (server:ping (remote-server-url *runremote*))) - (server:check-if-running *toppath*)))) - (server:kind-run *toppath*)) + ;; (if (and (common:low-noise-print 240 "try start server" run-id) + ;; (not (or (and *runremote* + ;; (remote-server-url *runremote*) + ;; (server:ping (remote-server-url *runremote*))) + ;; (server:check-if-running *toppath*)))) + ;; (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) @@ -1409,11 +1410,11 @@ (loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS - ((or (procedure? items)(eq? 
items 'have-procedure)) + ((or (procedure? items)(eq? items 'have-procedure)) ;; BB - target vars are env vars here? to allow expansion of [items]\nsomething [system echo $SOMETARGVAR], which is wonky (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list @@ -1466,10 +1467,11 @@ (thread-sleep! 5) ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! + (runs:run-post-hook run-id) (debug:print-info 1 *default-log-port* "All tests launched"))) (define (runs:calc-fails prereqs-not-met) (filter (lambda (test) (and (vector? test) ;; not (string? test)) @@ -1963,23 +1965,24 @@ ;; (resolve-pathname run-dir) (common:nice-path run-dir) #f)) (clean-mode (or mode 'remove-all)) (test-id (db:test-get-id test)) - (lock-key (conc "test-" test-id)) - (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) - (expire-time (+ (current-seconds) 30))) ;; give up on getting the lock and steal it after 15 seconds - (if (car lock) - #t - (if (> (current-seconds) expire-time) - (begin - (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to clean test with id " test-id) - (rmt:no-sync-del! lock-key) ;; destroy the lock - (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; - (begin - (thread-sleep! 
1) - (loop (rmt:no-sync-get-lock lock-key) expire-time))))))) + ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 30))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to clean test with id " test-id) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))) + ) (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) @@ -2016,11 +2019,12 @@ ;; Only delete the records *after* removing the directory. If things fail we have a record (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))) - (rmt:no-sync-del! lock-key))) + ;; (rmt:no-sync-del! 
lock-key) + )) ;;====================================================================== ;; Routines for manipulating runs ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -117,11 +117,11 @@ " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) - (load-limit (configf:lookup-number *configdat* "server" "load-limit" default: 0.9))) + (load-limit (configf:lookup-number *configdat* "jobtools" "maxhomehostload" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) @@ -134,11 +134,11 @@ (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load load-limit " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! 
log-rotate) (pop-directory))) @@ -249,11 +249,11 @@ (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) + (< (- now start-time) (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) 180) (random 360))) ;; under one hour running time +/- 180 )) #f)) @@ -448,26 +448,25 @@ (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) -;; timeout is in hours -(define (server:get-timeout) - (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; timeout is a plain number of hours read from [server] timeout, returned in seconds; default is 1 minute +;; +(define (server:expiration-timeout) + (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) - (string->number tmo)) - (* 60 60 (string->number tmo)) - ;; (* 3 24 60 60) ;; default to three days - ;;(* 60 60 1) ;; default to one hour - (* 60 5) ;; default to five minutes - ))) + (string->number tmo)) ;; guard must match the conversion below: with the common:hms-string->seconds guard a value such as 1h passed the test and then (* 3600 #f) raised an error + (* 3600 (string->number tmo)) + 60))) ;; moving this here as it needs access to db and cannot be in common. ;; (define (server:writable-watchdog dbstruct) (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) + (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (no-sync-db (db:open-no-sync-db)) (sync-duration 0) ;; run time of the sync in milliseconds (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) @@ -486,72 +485,82 @@ ;; sync for filesystem local db writes ;; (mutex-lock! 
*db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) + (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) (should-sync (and (not *time-to-exit*) - (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum + (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed (start-time (current-seconds)) + (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) (mt-mod-time (file-modification-time mtpath)) (last-sync-start (if (common:file-exists? start-file) (file-modification-time start-file) 0)) (last-sync-end (if (common:file-exists? end-file) (file-modification-time end-file) 10)) - (recently-synced (and (< (- start-time mt-mod-time) 4) ;; not useful if sync didn't modify megatest.db! + (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period + (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! (< mt-mod-time last-sync-start))) (sync-done (<= last-sync-start last-sync-end)) - (will-sync (and (or need-sync should-sync) - sync-done + (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) + (will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting + (or need-sync should-sync) + (or sync-done sync-stale) (not sync-in-progress) (not recently-synced)))) (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. 
need-sync="need-sync" sync-in-progress=" sync-in-progress " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync - " sync-done=" sync-done) + " sync-done=" sync-done " sync-period=" sync-period) + (if (and (> sync-period 5) + (common:low-noise-print 30 "sync-period")) + (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync - (let ((sync-start (current-milliseconds))) + (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! + (sync-start (current-milliseconds))) (with-output-to-file start-file (lambda ()(print (current-process-id)))) - + ;; put lock here - (if (< sync-duration 300) + ;; (if (or (not max-sync-duration) + ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive (set! sync-duration (- (current-milliseconds) sync-start)) (if (> res 0) ;; some records were transferred, keep the db alive (begin (mutex-lock! *heartbeat-mutex*) (set! *db-last-access* (current-seconds)) (mutex-unlock! 
*heartbeat-mutex*) (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) - (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))) - ;; TODO: factor this next routine out into a function - (with-input-from-pipe ;; this should not block other threads but need to verify this - (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) - (lambda () - (let loop ((inl (read-line)) - (res #f)) - (if (eof-object? inl) - (begin - (set! sync-duration (- (current-milliseconds) sync-start)) - (cond - ((not res) - (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) - ((> res 0) - (mutex-lock! *heartbeat-mutex*) - (set! *db-last-access* (current-seconds)) - (mutex-unlock! *heartbeat-mutex*)))) - (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) - (if matches - (string->number (cadr matches)) - #f)))) - (loop (read-line) - (or num-synced res)))))))))) + (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) +;; ;; TODO: factor this next routine out into a function +;; (with-input-from-pipe ;; this should not block other threads but need to verify this +;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) +;; (lambda () +;; (let loop ((inl (read-line)) +;; (res #f)) +;; (if (eof-object? inl) +;; (begin +;; (set! sync-duration (- (current-milliseconds) sync-start)) +;; (cond +;; ((not res) +;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) +;; ((> res 0) +;; (mutex-lock! *heartbeat-mutex*) +;; (set! *db-last-access* (current-seconds)) +;; (mutex-unlock! 
*heartbeat-mutex*)))) +;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) +;; (if matches +;; (string->number (cadr matches)) +;; #f)))) +;; (loop (read-line) +;; (or num-synced res)))))))))) (if will-sync (begin (mutex-lock! *db-multi-sync-mutex*) (set! *db-sync-in-progress* #f) (set! *db-last-sync* start-time) ADDED show-uncalled-procedures.scm Index: show-uncalled-procedures.scm ================================================================== --- /dev/null +++ show-uncalled-procedures.scm @@ -0,0 +1,13 @@ +(include "codescanlib.scm") + +(define (show-danglers) + (let* ((all-scm-files (glob "*.scm")) + (xref (get-xref all-scm-files)) + (dangling-procs + (map car (filter (lambda (x) (equal? 1 (length x))) xref)))) + (for-each print dangling-procs) ;; our product. + )) + +(show-danglers) + + Index: tcmt.scm ================================================================== --- tcmt.scm +++ tcmt.scm @@ -13,11 +13,14 @@ ;; ;; 1. Run the megatest process and pass it all the needed parameters ;; 2. Every five seconds check for state/status changes and print the info ;; -(use srfi-1 posix srfi-69 srfi-18 regex) +(use srfi-1 posix srfi-69 srfi-18 regex defstruct) + +(use trace) +;; (trace-call-sites #t) (declare (uses margs)) (declare (uses rmt)) (declare (uses common)) (declare (uses megatest-version)) @@ -29,92 +32,248 @@ (define remargs (args:get-args (argv) `( "-target" "-reqtarg" "-runname" + "-delay" ;; how long to wait for unexpected changes to ) `("-tc-repl" ) args:arg-hash 0)) -;; ##teamcity[testStarted name='suite.testName'] +(defstruct testdat + (tc-type #f) + (state #f) + (status #f) + (overall #f) + (flowid #f) + tctname + tname + (event-time #f) + details + comment + duration + (start-printed #f) + (end-printed #f)) + +;;====================================================================== +;; GLOBALS +;;====================================================================== + +;; Gotta have a global? 
Stash it in the *global* hash table. +;; +(define *global* (make-hash-table)) + +(define (tcmt:print tdat flush-mode) + (let* ((comment (if (testdat-comment tdat) + (conc " message='" (testdat-comment tdat) "'") + "")) + (details (if (testdat-details tdat) + (conc " details='" (testdat-details tdat) "'") + "")) + (flowid (conc " flowId='" (testdat-flowid tdat) "'")) + (duration (conc " duration='" (* 1e3 (testdat-duration tdat)) "'")) + (tcname (conc " name='" (testdat-tctname tdat) "'")) + (state (string->symbol (testdat-state tdat))) + (status (string->symbol (testdat-status tdat))) + (startp (testdat-start-printed tdat)) + (endp (testdat-end-printed tdat)) + (etime (testdat-event-time tdat)) + (overall (case state + ((RUNNING) state) + ((COMPLETED) state) + (else 'UNK))) + (tstmp (conc " timestamp='" (time->string (seconds->local-time etime) "%FT%T.000") "'"))) + (case overall + ((RUNNING) + (if (not startp) + (begin + (print "##teamcity[testStarted " tcname flowid tstmp "]") + (testdat-start-printed-set! tdat #t)))) + ((COMPLETED) + (if (not startp) ;; start stanza never printed + (begin + (print "##teamcity[testStarted " tcname flowid tstmp "]") + (testdat-start-printed-set! tdat #t))) + (if (not endp) + (begin + (if (not (member status '(PASS WARN SKIP WAIVED))) + (print "##teamcity[testFailed " tcname flowid comment details "]")) + (print "##teamcity[testFinished" tcname flowid comment details duration "]") + (testdat-end-printed-set! tdat #t)))) + (else + (if flush-mode + (begin + (if (not startp) + (begin + (print "##teamcity[testStarted " tcname flowid tstmp "]") + (testdat-start-printed-set! tdat #t))) + (if (not endp) + (begin + (print "##teamcity[testFailed " tcname flowid comment details "]") + (print "##teamcity[testFinished" tcname flowid comment details duration "]") + (testdat-end-printed-set! 
tdat #t))))))) + ;; (print "ERROR: tc-type \"" (testdat-tc-type tdat) "\" not recognised for " tcname))) + (flush-output))) + +;; ;; returns values: flag newlst +;; (define (remove-duplicate-completed tdats) +;; (let* ((flag #f) +;; (state (testdat-state tdat)) +;; (status (testdat-status tdat)) +;; (event-time (testdat-event-time tdat)) +;; (tname (testdat-tname tdat))) +;; (let loop ((hed (car tdats)) +;; (tal (cdr tdats)) +;; (new '())) +;; (if (and (equal? state "COMPLETED") +;; (equal? tname (testdat-tname hed)) +;; (equal? state (testdat-state hed))) ;; we have a duplicate COMPLETED call +;; (begin +;; (set! flag #t) ;; A changed completed + +;; process the queue of tests gathered so far. List includes one entry for every test so far seen +;; the last record for a test is preserved. Items are only removed from the list if over 15 seconds +;; have passed since it happened. This allows for compression of COMPLETED/FAIL followed by some other +;; state/status +;; +(define (process-queue data age flush-mode) + ;; here we process tqueue and gather those over 15 seconds (configurable?) old + (let* ((print-time (- (current-seconds) age)) ;; print stuff over 15 seconds old + (tqueue-raw (hash-table-ref/default data 'tqueue '())) + (tqueue (reverse (delete-duplicates tqueue-raw ;; REMOVE duplicates by testname and state + (lambda (a b) + (and (equal? (testdat-tname a)(testdat-tname b)) ;; need oldest to newest + (equal? (testdat-state a) (testdat-state b)))))))) ;; "COMPLETED") + ;; (equal? (testdat-state b) "COMPLETED"))))))) + (if (not (null? tqueue)) + (hash-table-set! + data + 'tqueue + (let loop ((hed (car tqueue)) ;; by this point all duplicates by state COMPLETED are removed + (tal (cdr tqueue)) + (rem '())) + (if (> print-time (testdat-event-time hed)) ;; event happened over 15 seconds ago + (begin + (tcmt:print hed flush-mode) + (if (null? tal) + rem ;; return rem to be processed in the future + (loop (car tal)(cdr tal) rem))) + (if (null? 
tal) + (cons hed rem) ;; return rem + hed for future processing + (loop (car tal)(cdr tal)(cons hed rem))))))))) + + ;; ##teamcity[testStarted name='suite.testName'] ;; ##teamcity[testStdOut name='suite.testName' out='text'] ;; ##teamcity[testStdErr name='suite.testName' out='error text'] ;; ##teamcity[testFailed name='suite.testName' message='failure message' details='message and stack trace'] ;; ##teamcity[testFinished name='suite.testName' duration='50'] ;; +;; flush; #f, normal call. #t, last call, print out something for NOT_STARTED, etc. +;; + +;;;;;;; (begin +;;;;;;; (case (string->symbol newstat) +;;;;;;; ((UNK) ) ;; do nothing +;;;;;;; ((RUNNING) (print "##teamcity[testStarted name='" tctname "' flowId='" flowid "']")) +;;;;;;; ((PASS SKIP WARN WAIVED) (print "##teamcity[testFinished name='" tctname "' duration='" (* 1e3 duration) "'" cmtstr details " flowId='" flowid "']")) +;;;;;;; (else +;;;;;;; (print "##teamcity[testFailed name='" tctname "' " cmtstr details " flowId='" flowid "']"))) +;;;;;;; (flush-output) + +;; (trace rmt:get-tests-for-run) -(define (print-changes-since data run-ids last-update tsname target runname) +(define (update-queue-since data run-ids last-update tsname target runname flowid flush) ;; (let ((now (current-seconds))) - (handle-exceptions - exn - (begin (print-call-chain) (print "Error message: " ((condition-property-accessor 'exn 'message) exn))) - (for-each - (lambda (run-id) - (let* ((tests (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f last-update #f))) - ;; (print "DEBUG: got tests=" tests) - (for-each - (lambda (testdat) - (let* ((testn (db:test-get-fullname testdat)) - (testname (db:test-get-testname testdat)) - (itempath (db:test-get-item-path testdat)) - (tctname (if (string=? itempath "") testname (conc testname "." 
(string-translate itempath "/" ".")))) - (state (db:test-get-state testdat)) - (status (db:test-get-status testdat)) - (duration (or (any->number (db:test-get-run_duration testdat)) 0)) - (comment (db:test-get-comment testdat)) - (logfile (db:test-get-final_logf testdat)) - (prevstat (hash-table-ref/default data testn #f)) - (newstat (if (equal? state "RUNNING") - "RUNNING" - (if (equal? state "COMPLETED") - status - "UNK"))) - (cmtstr (if comment - (conc " message='" comment "' ") - " ")) - (details (if (string-match ".*html$" logfile) - (conc " details='" *toppath* "/lt/" target "/" runname "/" testname (if (equal? itempath "") "/" (conc "/" itempath "/")) logfile "' ") - ""))) - - ;; (print "DEBUG: testn=" testn " state=" state " status=" status " prevstat=" prevstat " newstat=" newstat) - (if (or (not prevstat) - (not (equal? prevstat newstat))) - (begin - (case (string->symbol newstat) - ((UNK) ) ;; do nothing - ((RUNNING) (print "##teamcity[testStarted name='" tctname "']")) - ((PASS SKIP WARN WAIVED) (print "##teamcity[testFinished name='" tctname "' duration='" (* 1e3 duration) "'" cmtstr details " ]")) - (else - (print "##teamcity[testFailed name='" tctname "' " cmtstr details " ]"))) - (flush-output) - (hash-table-set! data testn newstat))))) - tests))) - run-ids)) - now)) - +;; (handle-exceptions +;; exn +;; (begin (print-call-chain) (print "Error message: " ((condition-property-accessor 'exn 'message) exn))) + (for-each + (lambda (run-id) + (let* ((tests (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f last-update #f))) + ;; (print "DEBUG: got tests=" tests) + (for-each + (lambda (test-rec) + (let* ((tqueue (hash-table-ref/default data 'tqueue '())) ;; NOTE: the key is a symbol! This allows keeping disparate info in the one hash, lazy but a quick solution for right now. 
+ (is-top (db:test-get-is-toplevel test-rec)) + (tname (db:test-get-fullname test-rec)) + (testname (db:test-get-testname test-rec)) + (itempath (db:test-get-item-path test-rec)) + (tctname (if (string=? itempath "") testname (conc testname "." (string-translate itempath "/" ".")))) + (state (db:test-get-state test-rec)) + (status (db:test-get-status test-rec)) + (etime (db:test-get-event_time test-rec)) + (duration (or (any->number (db:test-get-run_duration test-rec)) 0)) + (comment (db:test-get-comment test-rec)) + (logfile (db:test-get-final_logf test-rec)) + (hostn (db:test-get-host test-rec)) + (pid (db:test-get-process_id test-rec)) + (newstat (cond + ((equal? state "RUNNING") "RUNNING") + ((equal? state "COMPLETED") status) + (flush (conc state "/" status)) + (else "UNK"))) + (cmtstr (if (and (not flush) comment) + comment + (if flush + (conc "Test ended in state/status=" state "/" status (if (string-match "^\\s*$" comment) + ", no Megatest comment found." + (conc ", Megatest comment=\"" comment "\""))) ;; special case, we are handling stragglers + #f))) + (details (if (string-match ".*html$" logfile) + (conc *toppath* "/lt/" target "/" runname "/" testname (if (equal? itempath "") "/" (conc "/" itempath "/")) logfile) + #f)) + (prev-tdat (hash-table-ref/default data tname #f)) + (tdat (if is-top + #f + (let ((new (or prev-tdat (make-testdat)))) ;; recycle the record so we keep track of already printed items + (testdat-flowid-set! new (or (testdat-flowid new) + (if (eq? pid 0) + tctname + (conc hostn "-" pid)))) + (testdat-tctname-set! new tctname) + (testdat-tname-set! new tname) + (testdat-state-set! new state) + (testdat-status-set! new status) + (testdat-comment-set! new cmtstr) + (testdat-details-set! new details) + (testdat-duration-set! new duration) + (testdat-event-time-set! new etime) ;; (current-seconds)) + (testdat-overall-set! new newstat) + (hash-table-set! data tname new) + new)))) + (if (not is-top) + (hash-table-set! 
data 'tqueue (cons tdat tqueue))) + (hash-table-set! data tname tdat) + )) + tests))) + run-ids) + now)) + (define (monitor pid) - (let ((run-ids #f) - (testdat (make-hash-table)) - (keys #f) - (last-update 0) - (target (or (args:get-arg "-target") - (args:get-arg "-reqtarg"))) - (runname (args:get-arg "-runname")) - (tsname #f)) + (let* ((run-ids '()) + (testdats (make-hash-table)) ;; each entry is a list of testdat structs + (keys #f) + (last-update 0) + (target (or (args:get-arg "-target") + (args:get-arg "-reqtarg"))) + (runname (args:get-arg "-runname")) + (tsname #f) + (flowid (conc target "/" runname)) + (tdelay (string->number (or (args:get-arg "-delay") "15")))) (if (and target runname) (begin (launch:setup) (set! keys (rmt:get-keys)))) (set! tsname (common:get-testsuite-name)) - (print "TCMT: for testsuite=" tsname " found runname=" runname ", target=" target ", keys=" keys " and successfully ran launch:setup.") + (print "TCMT: for testsuite=" tsname " found runname=" runname ", target=" target ", keys=" keys " and successfully ran launch:setup. Using " flowid " as the flowId.") (let loop () - (handle-exceptions - exn - ;; (print "Process done.") - (begin (print-call-chain) (print "Error message: " ((condition-property-accessor 'exn 'message) exn))) + ;;;;;; (handle-exceptions + ;;;;;; exn + ;;;;;; ;; (print "Process done.") + ;;;;;; (begin (print-call-chain) (print "Error message: " ((condition-property-accessor 'exn 'message) exn))) (let-values (((pidres exittype exitstatus) (process-wait pid #t))) (if (and keys (or (not run-ids) (null? run-ids))) @@ -131,20 +290,28 @@ (run-ids-in (map (lambda (row) (db:get-value-by-header row header "id")) rows))) (set! run-ids run-ids-in))) ;; (print "TCMT: pidres=" pidres " exittype=" exittype " exitstatus=" exitstatus " run-ids=" run-ids) - (if keys - (set! last-update (print-changes-since testdat run-ids last-update tsname target runname))) (if (eq? pidres 0) (begin - (thread-sleep! 3) + (if keys + (begin + (set! 
last-update (- (update-queue-since testdats run-ids last-update tsname target runname flowid #f) 5)) + (process-queue testdats tdelay #f))) + (thread-sleep! 3) (loop)) (begin ;; (print "TCMT: pidres=" pidres " exittype=" exittype " exitstatus=" exitstatus " run-ids=" run-ids) + (print "TCMT: processing any tests that did not formally complete.") + (update-queue-since testdats run-ids 0 tsname target runname flowid #t) ;; call in flush mode + (process-queue testdats 0 #t) (print "TCMT: All done.") - ))))))) + )))))) +;;;;; ) + +;; (trace print-changes-since) ;; (if (not (eq? pidres 0)) ;; (not exitstatus)) ;; (begin ;; (thread-sleep! 3) ;; (loop)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1160,11 +1160,11 @@ ;; if .testconfig exists in test directory read and return it ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" ;; else read the testconfig file ;; if have path to test directory save the config as .testconfig and return it ;; -(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)) +(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)) (let* ((use-cache (common:use-cache?)) (cache-path (tests:get-test-path-from-environment)) (cache-file (and cache-path (conc cache-path "/.testconfig"))) (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read @@ -1201,11 +1201,12 @@ #f))) (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) (if (and testexists cache-file - (file-write-access? cache-path)) + (file-write-access? 
cache-path) + allow-write-cache) (let ((tpath (conc cache-path "/.testconfig"))) (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) (if (not (common:in-running-test?)) (configf:write-alist tcfg tpath)))) tcfg)))))) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -252,10 +252,12 @@ # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log # launcher #{ shell if which bsub > /dev/null;then echo bsub -q priority -o openlava.log;else echo sleeprunner;fi} # launcher nbfake +maxload 1.1 +maxhomehostload 1.1 [configf:settings trim-trailing-spaces yes] # Override the rollup for specific tests [testrollup] ADDED tests/fullrun/test-teamcity-run.sh Index: tests/fullrun/test-teamcity-run.sh ================================================================== --- /dev/null +++ tests/fullrun/test-teamcity-run.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +(cd ../..;make install) && RN=tcmt_m;megatest -remove-runs -target ubuntu/nfs/none -runname tcmt_m -testpatt %;\ + tcmt -run -target ubuntu/nfs/none -runname tcmt_m -testpatt % -rerun-clean 2>&1 | tee all.log | grep teamcity | tee teamcity.log + ADDED trackback.scm Index: trackback.scm ================================================================== --- /dev/null +++ trackback.scm @@ -0,0 +1,36 @@ +(include "codescanlib.scm") + +;; show call paths for named procedure +(define (traceback-proc in-procname) + (letrec* ((all-scm-files (glob "*.scm")) + (xref (get-xref all-scm-files)) + (have (alist-ref (string->symbol in-procname) xref eq? #f)) + (lookup (lambda (path procname depth) + (let* ((upcone-temp (filter (lambda (x) + (eq? procname (car x))) + xref)) + (upcone-temp2 (cond + ((null? upcone-temp) '()) + (else (cdar upcone-temp)))) + (upcone (filter + (lambda (x) (not (eq? x procname))) + upcone-temp2)) + (uppath (cons procname path)) + (updepth (add1 depth))) + (if (null? 
upcone) + (print uppath) + (for-each (lambda (x) + (if (not (member procname path)) + (lookup uppath x updepth) )) + upcone)))))) + (if have + (lookup '() (string->symbol in-procname) 0) + (print "no such func - "in-procname)))) + + +(if (eq? 1 (length (command-line-arguments))) + (traceback-proc (car (command-line-arguments))) + (print "Usage: trackback ")) + +(exit 0) + Index: utils/Makefile.git.installall ================================================================== --- utils/Makefile.git.installall +++ utils/Makefile.git.installall @@ -47,11 +47,11 @@ PROXY= # http://code.call-cc.org/dev-snapshots/2015/06/07/chicken-4.10.0rc1.tar.gz # http://code.call-cc.org/releases/4.10.0/chicken-4.10.0.tar.gz # Select version of chicken, sqlite3 etc -CHICKEN_VERSION=4.10.1 +CHICKEN_VERSION=4.12.0rc2 SQLITE3_VERSION=3090200 # http://www.sqlite.org/2014/sqlite-autoconf-3080500.tar.gz # http://www.sqlite.org/2015/sqlite-autoconf-3081101.tar.gz # Override IUPBRANCH to use other than trunk IUPBRANCH=trunk @@ -174,10 +174,13 @@ cd chicken-core; pwd cd chicken-core; fossil open ../chicken-scheme.fossil cd chicken-core; fossil up 337f5be # wget http://code.call-cc.org/dev-snapshots/2015/08/29/chicken-4.10.1.tar.gz +chicken-4.12.0rc2.tar.gz : + wget https://code.call-cc.org/dev-snapshots/2017/02/06/chicken-4.12.0rc2.tar.gz + # git clone git://code.call-cc.org/chicken-core # git clone http://code.call-cc.org/git/chicken-core.git $(PRODCHICKEN)/bin/chicken : wget http://code.call-cc.org/dev-snapshots/2015/08/29/chicken-4.10.1.tar.gz Index: utils/Makefile.installall ================================================================== --- utils/Makefile.installall +++ utils/Makefile.installall @@ -45,11 +45,11 @@ # http://code.call-cc.org/dev-snapshots/2015/06/07/chicken-4.10.0rc1.tar.gz # http://code.call-cc.org/releases/4.10.0/chicken-4.10.0.tar.gz # Select version of chicken, sqlite3 etc # CHICKEN_VERSION=4.10.0 -CHICKEN_VERSION=4.11.0rc2 +CHICKEN_VERSION=4.11.0 
SQLITE3_VERSION=3090200 # http://www.sqlite.org/2014/sqlite-autoconf-3080500.tar.gz # http://www.sqlite.org/2015/sqlite-autoconf-3081101.tar.gz # Override IUPBRANCH to use other than trunk IUPBRANCH=trunk @@ -56,11 +56,11 @@ IUPCONFIG=ubuntu-15.04.inc # iup-3.15 # Eggs to install (straightforward ones) EGGS=matchable readline apropos base64 regex-literals format regex-case test coops trace csv \ - dot-locking posix-utils posix-extras directory-utils hostinfo tcp-server rpc csv-xml fmt \ + dot-locking posix-utils posix-extras hostinfo tcp-server rpc csv-xml fmt \ json md5 awful http-client spiffy uri-common intarweb spiffy-request-vars pathname-expand \ spiffy-directory-listing ssax sxml-serializer sxml-modifications sql-de-lite \ srfi-19 refdb ini-file sparse-vectors z3 call-with-environment-variables hahn linenoise \ crypt parley @@ -97,11 +97,11 @@ else ARCHSIZE=64_ endif CSCLIBS=$(shell echo $(LD_LIBRARY_PATH) | sed 's/:/ -L/g') -CSC_OPTIONS="-I$(PREFIX)/include -L$(CSCLIBS) -C \"-fPIC\"" +CSC_OPTIONS="-I$(PREFIX)/include -L$(CSCLIBS) -C -fPIC" # CSC_OPTIONS=-I $(PREFIX)/include -L $(CSCLIBS) nogui : base mutils #all : nogui libiup $(PREFIX)/lib/sqlite3.so @@ -137,11 +137,12 @@ mkdir -p eggflags touch $(EGGFLAGS) # some setup stuff # -$(PREFIX)/setup-chicken4x.sh : $(EGGFLAGS) +#$(PREFIX)/setup-chicken4x.sh : $(EGGFLAGS) +$(PREFIX)/setup-chicken4x.sh : mkdir -p $(PREFIX) (echo 'export PATH=$(PREFIX)/bin:$$PATH' > $(PREFIX)/setup-chicken4x.sh) (echo "export LD_LIBRARY_PATH=$(LD_LIBRARY_PATH)" >> $(PREFIX)/setup-chicken4x.sh) $(PREFIX)/setup-chicken4x.csh : $(EGGFLAGS) @@ -149,35 +150,26 @@ (echo "setenv PATH $(PREFIX):'$$'PATH" > $(PREFIX)/setup-chicken4x.csh) (echo "setenv LD_LIBRARY_PATH $(LD_LIBRARY_PATH)" >> $(PREFIX)/setup-chicken4x.csh) # NOTE: the touch chicken-core/chicken.scm compensates for the time stamp from the tar file chicken-core/chicken.scm : chicken-$(CHICKEN_VERSION).tar.gz - tar xf chicken-$(CHICKEN_VERSION).tar.gz + tar xzf 
chicken-$(CHICKEN_VERSION).tar.gz ln -sf chicken-$(CHICKEN_VERSION) chicken-core if [[ -e chicken-core/chicken.scm ]];then touch chicken-core/chicken.scm;fi -chicken-4.9.0rc1.tar.gz : - wget http://code.call-cc.org/dev-snapshots/2014/04/17/chicken-4.9.0rc1.tar.gz - -chicken-4.9.0.1.tar.gz : - wget http://code.call-cc.org/releases/4.9.0/chicken-4.9.0.1.tar.gz - -chicken-4.10.0rc1.tar.gz : - wget http://code.call-cc.org/dev-snapshots/2015/06/07/chicken-4.10.0rc1.tar.gz - -chicken-4.10.0.tar.gz : - wget http://code.call-cc.org/releases/4.10.0/chicken-4.10.0.tar.gz - -chicken-4.11.0rc2.tar.gz : - wget http://code.call-cc.org/dev-snapshots/2016/04/28/chicken-4.11.0rc2.tar.gz +chicken-4.11.0.tar.gz : + wget http://code.call-cc.org/releases/4.11.0/chicken-4.11.0.tar.gz # git clone git://code.call-cc.org/chicken-core # git clone http://code.call-cc.org/git/chicken-core.git $(CHICKEN_INSTALL) : chicken-core/chicken.scm $(PREFIX)/setup-chicken4x.sh $(PREFIX)/setup-chicken4x.csh - cd chicken-core;make PLATFORM=linux PREFIX=$(PREFIX) - cd chicken-core;make PLATFORM=linux PREFIX=$(PREFIX) install + pwd; env; which make + cd chicken-core; make PLATFORM=linux PREFIX=$(PREFIX) + cd chicken-core; make PLATFORM=linux PREFIX=$(PREFIX) install + #cd chicken-core;env -i PATH=${PATH} LD_LIBRARY_PATH=${LD_LIBRARY_PATH} make PLATFORM=linux PREFIX=$(PREFIX) + #cd chicken-core;env -i PATH=${PATH} LD_LIBRARY_PATH=${LD_LIBRARY_PATH} make PLATFORM=linux PREFIX=$(PREFIX) install #====================================================================== # S Q L I T E 3 #====================================================================== # https://www.sqlite.org/2015/sqlite-autoconf-3090200.tar.gz @@ -235,11 +227,12 @@ $(PREFIX)/lib/chicken/8/margs.so : opensrc/margs/margs.scm cd opensrc/margs;chicken-install opensrc/histstore/hs : opensrc/histstore/histstore.scm chkn eggs $(CHICKEN_EGG_DIR)/sqlite3.so - cd opensrc/histstore;$(PREFIX)/bin/csc histstore.scm -o hs + env | grep CSC + cd 
opensrc/histstore; $(PREFIX)/bin/csc histstore.scm -o hs $(PREFIX)/bin/hs : opensrc/histstore/hs cp -f opensrc/histstore/hs $(PREFIX)/bin/hs # stml @@ -254,11 +247,11 @@ stml/requirements.scm : stml/requirements.scm.template cp stml/install.cfg.template stml/install.cfg cp stml/requirements.scm.template stml/requirements.scm $(PREFIX)/lib/chicken/8/stml.so : stml/requirements.scm - cd stml;make + cd stml; make #====================================================================== # F F C A L L (Used by IUP) #====================================================================== @@ -280,34 +273,47 @@ iuplib.fossil : fossil clone http://www.kiatoa.com/fossils/iuplib iuplib.fossil cd-5.9_Linux26g4_64_lib.tar.gz : - wget -c http://sourceforge.net/projects/canvasdraw/files/5.9/Linux%20Libraries/cd-5.9_Linux26g4_64_lib.tar.gz/download + wget --no-check-certificate -c http://sourceforge.net/projects/canvasdraw/files/5.9/Linux%20Libraries/cd-5.9_Linux26g4_64_lib.tar.gz/download mv download cd-5.9_Linux26g4_64_lib.tar.gz +cd-5.10_Linux26g4_64_lib.tar.gz : + cp /p/fdk/gwa/jmoon18/cd-5.10_Linux26g4_64_lib.tar.gz cd-5.10_Linux26g4_64_lib.tar.gz + iup-3.17_Linux26g4_64_lib.tar.gz : - wget -c http://sourceforge.net/projects/iup/files/3.17/Linux%20Libraries/iup-3.17_Linux26g4_64_lib.tar.gz/download - mv download iup-3.17_Linux26g4_64_lib.tar.gz + cp /p/fdk/gwa/jmoon18/iup-3.17_Linux26g4_64_lib.tar.gz iup-3.17_Linux26g4_64_lib.tar.gz +# wget --no-check-certificate -c http://sourceforge.net/projects/iup/files/3.17/Linux%20Libraries/iup-3.17_Linux26g4_64_lib.tar.gz/download +# mv download iup-3.17_Linux26g4_64_lib.tar.gz + +iup-3.19.1_Linux26g4_64_lib.tar.gz : + cp /p/fdk/gwa/jmoon18/iup-3.19.1_Linux26g4_64_lib.tar.gz iup-3.19.1_Linux26g4_64_lib.tar.gz im-3.10_Linux26g4_64_lib.tar.gz : - wget -c http://sourceforge.net/projects/imtoolkit/files/3.10/Linux%20Libraries/im-3.10_Linux26g4_64_lib.tar.gz/download + wget --no-check-certificate -c 
http://sourceforge.net/projects/imtoolkit/files/3.10/Linux%20Libraries/im-3.10_Linux26g4_64_lib.tar.gz/download mv download im-3.10_Linux26g4_64_lib.tar.gz +im-3.11_Linux26g4_64_lib.tar.gz : + cp /p/fdk/gwa/jmoon18/im-3.11_Linux26g4_64_lib.tar.gz im-3.11_Linux26g4_64_lib.tar.gz + lua-5.3.2_Linux26g4_64_lib.tar.gz : - wget -c http://sourceforge.net/projects/luabinaries/files/5.3.2/Linux%20Libraries/lua-5.3.2_Linux26g4_64_lib.tar.gz/download + wget --no-check-certificate -c http://sourceforge.net/projects/luabinaries/files/5.3.2/Linux%20Libraries/lua-5.3.2_Linux26g4_64_lib.tar.gz/download mv download lua-5.3.2_Linux26g4_64_lib.tar.gz +lua-5.3.3_Linux26g4_64_lib.tar.gz : + cp /p/fdk/gwa/jmoon18/lua-5.3.3_Linux26g4_64_lib.tar.gz lua-5.3.3_Linux26g4_64_lib.tar.gz + iup/installall.sh : $(PREFIX)/lib/libiup.so \ - cd-5.9_Linux26g4_64_lib.tar.gz \ + cd-5.10_Linux26g4_64_lib.tar.gz \ iup-3.17_Linux26g4_64_lib.tar.gz \ - im-3.10_Linux26g4_64_lib.tar.gz \ - lua-5.3.2_Linux26g4_64_lib.tar.gz # iuplib.fossil + im-3.11_Linux26g4_64_lib.tar.gz \ + lua-5.3.3_Linux26g4_64_lib.tar.gz # iuplib.fossil mkdir -p iup pwd - tar -xzvf cd-5.9_Linux26g4_64_lib.tar.gz -C iup/ - tar -xzvf im-3.10_Linux26g4_64_lib.tar.gz -C iup/ + tar -xzvf cd-5.10_Linux26g4_64_lib.tar.gz -C iup/ + tar -xzvf im-3.11_Linux26g4_64_lib.tar.gz -C iup/ tar -xzvf iup-3.17_Linux26g4_64_lib.tar.gz -C iup/ mkdir -p $(PREFIX)/include/ $(PREFIX)/lib/ cp iup/include/* $(PREFIX)/include/ cp iup/*.so $(PREFIX)/lib/ cp iup/*.a $(PREFIX)/lib/ @@ -331,6 +337,6 @@ $(CHICKEN_EGG_DIR)/canvas-draw.so : $(PREFIX)/lib/libiup.so $(PREFIX)/lib/libavcall.a CSC_OPTIONS=$(CSC_OPTIONS) $(CHICKEN_INSTALL) $(PROX) -D no-library-checks canvas-draw clean : - rm -rf chicken-4.8.0 eggflags ffcall sqlite-autoconf-$(SQLITE3_VERSION) + rm -rf chicken-4.11.0 eggflags ffcall sqlite-autoconf-$(SQLITE3_VERSION) ADDED utils/checkPreReqs Index: utils/checkPreReqs ================================================================== --- /dev/null +++ 
utils/checkPreReqs @@ -0,0 +1,30 @@ +#!/bin/bash +SYSTEM_TYPE=$(lsb_release -irs |tr ' ' '_' |tr '\n' '-')$(uname -i) +file=`/bin/mktemp` +case $SYSTEM_TYPE in +Ubuntu-17.04-x86_64-std) + apt list --installed | cut -d/ -f 1 > $file + ;; +Ubuntu-16.04-x86_64) + apt list --installed | cut -d/ -f 1 > $file + ;; +Ubuntu-16.04-i686) + apt list --installed | cut -d/ -f 1 > $file + ;; +SUSE_LINUX_11-x86_64) + rpm -qa > $file + ;; +CentOS_5.11-x86_64-std) + rpm -qa > $file + ;; +esac + + + +for package in libmysqlclient-dev libsqlite3-dev sqlite3 postgresql libreadline-dev libwebkitgtk-dev libpangox-1.0-0 zlib1g-dev libfreetype6 cmake libssl-dev uuid-dev libmotif3 mysql-client; do + grep --silent $package $file + if [ "$?" != "0" ]; then + echo "sudo apt install $package" + fi +done +rm $file Index: utils/installall.sh ================================================================== --- utils/installall.sh +++ utils/installall.sh @@ -17,15 +17,16 @@ if [[ $OPTION=="" ]]; then export OPTION=std fi echo You may need to do the following first: -echo sudo apt-get install libreadline-dev -echo sudo apt-get install libwebkitgtk-dev -echo sudo apt-get install libpangox-1.0-0 zlib1g-dev libfreetype6-dev cmake -echo sudo apt-get install libssl-dev uuid-dev -echo sudo apt-get install libmotif3 -OR- set KTYPE=26g4 +echo sudo apt install libreadline-dev +echo sudo apt install libwebkitgtk-dev +echo sudo apt install libpangox-1.0-0 zlib1g-dev libfreetype6-dev cmake +echo sudo apt install libssl-dev uuid-dev +echo sudo apt install libmotif3 -OR- set KTYPE=26g4 +echo sudo apt install cmake echo echo Set OPTION to std, currently OPTION=$OPTION echo echo Additionally, if you want mysql-client, you will need to make sure echo mysql_config is in your path @@ -35,16 +36,25 @@ echo You are using proxy="$proxy" echo echo "Set additional_libpath to help find gtk or other libraries, don't forget a leading :" SYSTEM_TYPE=$(lsb_release -irs |tr ' ' '_' |tr '\n' '-')$(uname -i)-$OPTION + 
CHICKEN_VERSION=4.11.0 CHICKEN_BASEVER=4.11.0 # Set up variables # case $SYSTEM_TYPE in +Ubuntu-17.04-x86_64-std) + KTYPE=32 + CDVER=5.10 + IUPVER=3.17 + IMVER=3.11 + CHICKEN_VERSION=4.12.0 + CHICKEN_BASEVER=4.12.0 + ;; Ubuntu-16.04-x86_64-std) KTYPE=32 CDVER=5.10 IUPVER=3.17 IMVER=3.11 @@ -54,10 +64,12 @@ Ubuntu-16.04-i686-std) KTYPE=32 CDVER=5.10 IUPVER=3.17 IMVER=3.11 + CHICKEN_VERSION=4.12.0 + CHICKEN_BASEVER=4.12.0 ;; SUSE_LINUX_11-x86_64-std) KTYPE=26g4 CDVER=5.10 IUPVER=3.17 @@ -69,14 +81,18 @@ IUPVER=3.5 IMVER=3.6.3 ;; esac +echo SYSTEM_TYPE=$SYSTEM_TYPE echo KTYPE=$KTYPE echo CDVER=$CDVER echo IUPVER=$IUPVER echo IMVER=$IMVER +echo CHICKEN_VERSION=$CHICKEN_VERSION +echo CHICKEN_BASEVER=$CHICKEN_BASEVER + # NOTES: # # Centos with security setup may need to do commands such as following as root: # # NB// fix the paths first @@ -101,10 +117,11 @@ if [[ $proxy == "" ]]; then echo 'Please set the environment variable "proxy" to host.com:port (e.g. foo.com:1234) to use a proxy' echo PROX="" else export http_proxy=http://$proxy + export https_proxy=http://$proxy export PROX="-proxy $proxy" fi if [[ $KTYPE == "" ]]; then echo 'Using KTYPE=26' @@ -153,22 +170,25 @@ make PLATFORM=linux PREFIX=$PREFIX make PLATFORM=linux PREFIX=$PREFIX install cd $BUILDHOME fi cd $BUILDHOME -#wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz -#mv 1.0.0 1.0.0.tar.gz -# if ! [[ -e $PREFIX/lib64/libnanomsg.so.1.0.0 ]]; then -# wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz -# mv 1.0.0 1.0.0.tar.gz -# tar xf 1.0.0.tar.gz -# cd nanomsg-1.0.0 -# ./configure --prefix=$PREFIX -# make -# make install -# fi -# cd $BUILDHOME +#if [[ ! -e 1.0.0.tar.gz ]];then +# wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz +# mv 1.0.0 1.0.0.tar.gz +#fi +if ! 
[[ -e $PREFIX/lib64/libnanomsg.so.1.0.0 ]]; then + wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz + #mv 1.0.0 1.0.0.tar.gz + tar xf 1.0.0.tar.gz + cd nanomsg-1.0.0 + ./configure --prefix=$PREFIX + make + make install + CSC_OPTIONS="-I$PREFIX/include -L$PREFIX/lib" $CHICKEN_INSTALL $PROX nanomsg +fi +cd $BUILDHOME export SQLITE3_VERSION=3090200 if ! [[ -e $PREFIX/bin/sqlite3 ]]; then echo Install sqlite3 sqlite3_tgz=sqlite-autoconf-$SQLITE3_VERSION.tar.gz @@ -202,11 +222,11 @@ # Some eggs are quoted since they are reserved to Bash # for f in matchable readline apropos base64 regex-literals format "regex-case" "test" coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo tcp rpc csv-xml fmt json md5; do # $CHICKEN_INSTALL $PROX -keep-installed matchable readline apropos base64 regex-literals format "regex-case" "test" coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo tcp rpc csv-xml fmt json md5 awful http-client spiffy uri-common intarweb http-client spiffy-request-vars md5 message-digest http-client spiffy-directory-listing for egg in matchable readline apropos base64 regex-literals format "regex-case" "test" \ coops trace csv dot-locking posix-utils posix-extras directory-utils hostinfo \ - tcp rpc csv-xml fmt json md5 awful http-client spiffy uri-common intarweb http-client \ + tcp rpc csv-xml fmt json md5 awful http-client:0.7.1 spiffy uri-common intarweb http-client \ spiffy-request-vars s md5 message-digest spiffy-directory-listing ssax sxml-serializer \ sxml-modifications logpro z3 call-with-environment-variables \ pathname-expand typed-records simple-exceptions numbers crypt parley srfi-42 \ alist-lib ansi-escape-sequences args basic-sequences bindings chicken-doc chicken-doc-cmd \ cock condition-utils debug define-record-and-printer easyffi easyffi-base \ @@ -356,12 +376,16 @@ # CSC_OPTIONS="-I$PREFIX/include -L$CSCLIBS" $CHICKEN_INSTALL $PROX -D no-library-checks 
-deploy -prefix $DEPLOYTARG canvas-draw cd $BUILDHOME # install ducttape -cd ../ducttape -$CHICKEN_INSTALL +if [[ -e ../ducttape ]];then + cd ../ducttape + $CHICKEN_INSTALL +else + echo "ducttape egg not found at ../ducttape. You will need to cd into the ducttape directory in the megatest distribution and run \"chicken-install\"" +fi cd $BUILDHOME echo You may need to add $LD_LIBRARY_PATH to your LD_LIBRARY_PATH variable, a setup-chicken4x.sh echo file can be found in the current directory which should work for setting up to run chicken4x ADDED utils/mtrept.sh Index: utils/mtrept.sh ================================================================== --- /dev/null +++ utils/mtrept.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Rollup counts of calls to Megatest from a logging dat file +# +# Usage: mtrept.sh file [host] + +if [[ "$2"x != "x" ]];then + host_name_grep="grep $2 | " +else + host_name_grep="" +fi +if [[ "$1"x == "x" ]];then + datfile=/p/fdk/gwa/$USER/.logger/all.dat +else + datfile=$1 +fi +datcopy=/tmp/$USER/all.$PID.dat + +if [[ -e $datfile ]];then + cp $datfile $datcopy + list_runs=$(grep list-runs $datcopy |$host_name_grep wc -l) + show_config=$(grep show-config $datcopy |$host_name_grep wc -l) + list_targets=$(grep list-targets $datcopy |$host_name_grep wc -l) + mt_run=$(grep ' -run ' $datcopy |$host_name_grep wc -l) + execute=$(grep ' -execute' $datcopy|$host_name_grep wc -l) + server=$(grep ' -server' $datcopy|$host_name_grep wc -l) + sync_to=$(grep ' -sync-to' $datcopy|$host_name_grep wc -l) + step=$(grep ' -step' $datcopy|$host_name_grep wc -l) + state_status=$(grep ' -set-state-status' $datcopy|$host_name_grep wc -l) + test_status=$(grep ' -test-status' $datcopy|$host_name_grep wc -l) + other=$(egrep -v ' -(list-runs|show-config|list-targets|run|execute|server|sync-to|step|set-state-status|test-status)' $datcopy |$host_name_grep wc -l) + start_time=$(head -1 $datcopy|awk '{print $1}') + end_time=$(tail -1 $datcopy | awk '{print $1}') + minutes=$(echo 
"($end_time-$start_time)/60.0" | bc) + hours=$(echo "($minutes/60)"|bc) + total_calls=$(cat $datcopy |$host_name_grep wc -l) + + if [[ $hours -gt 2 ]];then + echo "Over $hours hour period we have;" + else + echo "Over $minutes minutes we have;" + fi + echo " list-runs: $list_runs" + echo " show-config: $show_config" + echo " list-targets: $list_targets" + echo " execute: $execute" + echo " run: $mt_run" + echo " server: $server" + echo " step: $step" + echo " test-status: $test_status" + echo " sync-to: $sync_to" + echo " state-status: $state_status" + echo " other: $other" + echo " total: $total_calls" +else + echo "Could not find input file $datfile" +fi + ADDED utils/watch-close-wait.sh Index: utils/watch-close-wait.sh ================================================================== --- /dev/null +++ utils/watch-close-wait.sh @@ -0,0 +1,8 @@ +psline=$(ps -F -u $USER | grep "mtest" |grep " -run " | egrep " -(target|reqtarg) "| head -1) +id=$(echo $psline|awk '{print $2}') +echo "Watching process for command line: $psline" +echo " with PID=$id" +while true;do + echo "CLOSE_WAIT: $(lsof -n | grep CLOSE_WAIT | grep $id | wc -l) ALL OPEN: $(lsof -n |grep $id|wc -l) ALL CLOSE_WAIT: $(netstat -ap 2> /dev/null| grep -i close_wait| wc -l)" + sleep 1 +done ADDED utils/whodunit.scm Index: utils/whodunit.scm ================================================================== --- /dev/null +++ utils/whodunit.scm @@ -0,0 +1,48 @@ +(use posix srfi-69) + +(define *numsamples* (or (and (> (length (argv)) 1) + (string->number (cadr (argv)))) + 3)) + +(define (topdata) + (with-input-from-pipe + (conc "top -b -n " *numsamples* " -d 0.1") + read-lines)) + +(define (cleanup-data topdat)list + (let loop ((hed (car topdat)) + (tal (cdr topdat)) + (res '())) + (let* ((line-list (string-split hed)) + (nums (map (lambda (indat)(or (string->number indat) indat)) line-list)) + (not-data (or (null? nums) + (not (number? 
(car nums))))) + (new-res (if not-data res (cons nums res)))) + (if (null? tal) + new-res + (loop (car tal)(cdr tal) new-res))))) + +(print "Getting " *numsamples* " samples of cpu usage data.") +(define data (cleanup-data (topdata))) +(define pidhash (make-hash-table)) +(define userhash (make-hash-table)) + +;; sum up and normalize the +(for-each + (lambda (indat) + (let ((pid (car indat)) + (usr (cadr indat)) + (cpu (list-ref indat 8))) + (hash-table-set! userhash usr (+ cpu (hash-table-ref/default userhash usr 0))))) + data) + +(for-each + (lambda (usr) + (print usr + (if (< (string-length usr) 8) "\t\t" "\t") + (inexact->exact (round (/ (hash-table-ref userhash usr) *numsamples*))))) + (sort (hash-table-keys userhash) + (lambda (a b) + (> (hash-table-ref userhash a) + (hash-table-ref userhash b))))) +