Index: codescanlib.scm ================================================================== --- codescanlib.scm +++ codescanlib.scm @@ -46,11 +46,11 @@ #f)] [else #f] ) scm-tree)))) procs)) -;; given a sexp, return a flat lost of atoms in that sexp +;; given a sexp, return a flat list of atoms in that sexp (define (get-atoms-in-body body) (cond ((null? body) '()) ((atom? body) (list body)) (else Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -531,11 +531,12 @@ (5 "WAIVED") (6 "CHECK") (7 "STUCK/DEAD") (8 "DEAD") (9 "FAIL") - (10 "ABORT"))) + (10 "PREQ_FAIL") + (11 "ABORT"))) (define *common:ended-states* ;; states which indicate the test is stopped and will not proceed '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE")) (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1,6 +1,6 @@ -;;====================================================================== +;====================================================================== ;; Copyright 2006-2016, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; @@ -198,21 +198,21 @@ ;; open an sql database inside a file lock ;; returns: db existed-prior-to-opening ;; RA => Returns a db handler; sets the lock if opened in writable mode ;; -;;(define *db-open-mutex* (make-mutex)) +(define *db-open-mutex* (make-mutex)) (define (db:lock-create-open fname initproc) (let* ((parent-dir (or (pathname-directory fname)(current-directory))) ;; no parent? go local (raw-fname (pathname-file fname)) (dir-writable (file-write-access? parent-dir)) (file-exists (common:file-exists? fname)) (file-write (if file-exists (file-write-access? fname) dir-writable ))) - ;;(mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped. + (mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped. (if file-write ;; dir-writable (condition-case (let* ((lockfname (conc fname ".lock")) (readyfname (conc parent-dir "/.ready-" raw-fname)) (readyexists (common:file-exists? readyfname))) @@ -246,11 +246,11 @@ (condition-case (begin (debug:print 2 *default-log-port* "WARNING: opening db in non-writable dir " fname) (let ((db (sqlite3:open-database fname))) - ;;(mutex-unlock! *db-open-mutex*) + (mutex-unlock! *db-open-mutex*) db)) (exn (io-error) (debug:print 0 *default-log-port* "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) (exn (corrupt) (debug:print 0 *default-log-port* "ERROR: database " fname " is corrupt. Repair it to proceed.")) (exn (busy) (debug:print 0 *default-log-port* "ERROR: database " fname " is locked. Try copying to another location, remove original and copy back.")) (exn (permission)(debug:print 0 *default-log-port* "ERROR: database " fname " has some permissions problem.")) @@ -3493,15 +3493,19 @@ (map dbr:counts-status state-status-counts))) *common:std-statuses* >)) (non-completes (filter (lambda (x) (not (equal? x "COMPLETED"))) all-curr-states)) - (num-non-completes (length non-completes)) - + (preq-fails (filter (lambda (x) + (equal? x "PREQ_FAIL")) + all-curr-statuses)) + (num-non-completes (length non-completes)) (newstate (cond ((> running 0) "RUNNING") ;; anything running, call the situation running + ;;((> (length preq-fails) 0) + ;; "NOT_STARTED") ((> bad-not-started 0) ;; we have an ugly situation, it is completed in the sense we cannot do more. "COMPLETED") ((> num-non-completes 0) ;; (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED @@ -3510,20 +3514,38 @@ ;; (if (> running 0) ;; "RUNNING" ;; (if (> bad-not-started 0) ;; "COMPLETED" ;; (car all-curr-states)))) - (newstatus (if (or (> bad-not-started 0) - (and (equal? newstate "NOT_STARTED") - (> num-non-completes 0))) - "STARTED" - (car all-curr-statuses)))) + ;; (newstatus (cond + ;; ;;((> (length preq-fails) 0) + ;; ;; "PREQ_FAIL") + ;; ((or (> bad-not-started 0) + ;; (and (equal? newstate "NOT_STARTED") + ;; (> num-non-completes 0))) + ;; "STARTED") + ;; (else + ;; (car all-curr-statuses))))) + (newstatus (if (or (> bad-not-started 0) + (and (equal? newstate "NOT_STARTED") + (> num-non-completes 0))) + "STARTED" + (car all-curr-statuses)))) + ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states) ;; " newstate: " newstate " newstatus: " newstatus) ;; NB// Pass the db so it is part of the transaction + (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path"> bad-not-started="bad-not-started" newstate="newstate" newstatus="newstatus" num-non-completes="num-non-completes" non-completes="non-completes "len(sscs)="(length state-status-counts) " state-status-counts: " + (apply conc + (map (lambda (x) + (conc + (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) + state-status-counts)) + + ); end debug:print (if tl-test-id - (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)))))))) + (db:test-set-state-status dbstruct run-id tl-test-id newstate newstatus #f)))))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status* Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -426,11 +426,11 @@ ;; ;; What happended, this code is now duplicated in tests!? ;; ;;====================================================================== - (if (not (null? test-names)) + (if (not (null? test-names)) ;; BEGIN test-names loop (let loop ((hed (car test-names)) ;; NOTE: This is the main loop that iterates over the test-names (tal (cdr test-names))) ;; 'return-procs tells the config reader to prep running system but return a proc (change-directory *toppath*) ;; PLEASE OPTIMIZE ME!!! I think this should be a no-op but there are several places where change-directories could be happening. (setenv "MT_TEST_NAME" hed) ;; (let*-values (((waitons waitors config)(tests:get-waitons hed all-tests-registry))) @@ -463,11 +463,11 @@ (waiton-tconfig (if waiton-record (vector-ref waiton-record 1) #f)) (waiton-itemized (and waiton-tconfig (or (hash-table-ref/default waiton-tconfig "items" #f) (hash-table-ref/default waiton-tconfig "itemstable" #f)))) (itemmaps (tests:get-itemmaps config)) ;; (configf:lookup config "requirements" "itemmap")) - (new-test-patts (tests:extend-test-patts test-patts hed waiton itemmaps))) + (new-test-patts (tests:extend-test-patts test-patts hed waiton itemmaps))) ;; BB: items expanded here. (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items") ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%" ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt ;; is this satisfied by merely appending "/" to the waiton name added to the list? ;; @@ -503,11 +503,11 @@ (delete-duplicates (append waitons waitors))) (let ((remtests (delete-duplicates (append waitons tal)))) (if (not (null? remtests)) (begin ;; (debug:print-info 0 *default-log-port* "Preprocessing continues for " (string-intersperse remtests ", ")) - (loop (car remtests)(cdr remtests)))))))) + (loop (car remtests)(cdr remtests)))))))) ;; END test-names loop (if (not (null? required-tests)) (debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records)) @@ -532,11 +532,11 @@ ;; (begin ;; (set! run-queue-retries (- run-queue-retries 1)) ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)))) ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) - (th2 (make-thread (lambda () + (th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions @@ -750,11 +750,13 @@ ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (if (not (null? prereq-fails)) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) + (begin + (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed) + (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))))) ;; BB: this works, btu equivalent for itemwait mode does not work. (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) @@ -794,10 +796,13 @@ ;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) (define (runs:process-expanded-tests runsdat testdat) ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). + (debug:print 2 *default-log-port* "runs:process-expanded-tests; testdat:" ) + (debug:print 2 *default-log-port* (with-output-to-string + (lambda () (pp (runs:testdat->alist testdat) )))) (let* ((hed (runs:testdat-hed testdat)) (tal (runs:testdat-tal testdat)) (reg (runs:testdat-reg testdat)) (reruns (runs:testdat-reruns testdat)) (test-name (runs:testdat-test-name testdat)) @@ -865,11 +870,11 @@ ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch? (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info) - (cond + (cond ; cond 894- 1067 ;; Check item path against item-patts, ;; ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run ;; else the run is stuck, temporarily or permanently @@ -987,37 +992,43 @@ (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) (thread-sleep! 1) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (or (not (null? reg))(not (null? tal))) - (if (vector? hed) + (if (or (vector? hed) (not (null? fails))) ;; BB: why do we need a vector? in my case, fails is populated (prereq failed), reg is not nul, and we really want to drop this one (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items - (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) + + (if (not (null? fails)) + ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) + ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) ) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? )) - (let ((nth-try (hash-table-ref/default test-registry hed 0))) + (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector... + (debug:print 2 *default-log-port* "nth-try("hed")="nth-try) (cond ((member "RUNNING" (map db:test-get-state prereqs-not-met)) (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) (thread-sleep! 4) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) (runs:queue-next-reg newtal reg reglen regfull) reruns)) - ((or (not nth-try) + ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try (and (number? nth-try) (< nth-try 10))) (hash-table-set! test-registry hed (if (number? nth-try) (+ nth-try 1) 0)) @@ -1030,18 +1041,18 @@ ;; (hash-table-set! test-registry hed 'removed) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) (runs:queue-next-reg newtal reg reglen regfull) reruns)) - ((symbol? nth-try) + ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed. This is first "try" (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW (if (null? tal) #f ;; yes, really (list (car tal)(cdr tal) reg reruns)) (begin (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) - (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry.")) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry.")) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) (hash-table-set! test-registry hed 0) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) (runs:queue-next-reg newtal reg reglen regfull) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -458,14 +458,15 @@ (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (no-sync-db (db:open-no-sync-db)) (sync-duration 0) ;; run time of the sync in milliseconds - (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) + ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) + ) (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls (debug:print-info 2 *default-log-port* "Periodic sync thread started.") - (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) + (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) (let* (;;(dbstruct (db:setup)) (mtdb (dbr:dbstruct-mtdb dbstruct)) (mtpath (db:dbdat-get-path mtdb)) (tmp-area (common:get-db-tmp-area)) @@ -579,7 +580,7 @@ (delay-loop (+ count 1)))) (if (not *time-to-exit*) (loop)))) ;; time to exit, close the no-sync db here (db:no-sync-close-db no-sync-db) (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))