Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -282,12 +282,14 @@ (begin (debug:print-info 11 "open-test-db END (unsucessful)" testpath) #f))) ;; find and open the testdat.db file for an existing test -(define (db:open-test-db-by-test-id db test-id) - (let* ((test-path (cdb:remote-run db:test-get-rundir-from-test-id db test-id))) +(define (db:open-test-db-by-test-id db test-id #!key (testpath #f)) + (let* ((test-path (if testpath + testpath + (cdb:remote-run db:test-get-rundir-from-test-id db test-id)))) (debug:print 3 "TEST PATH: " test-path) (open-test-db test-path))) (define (db:testdb-initialize db) (debug:print 11 "db:testdb-initialize START") @@ -842,13 +844,13 @@ ) (debug:print-info 11 "db:get-tests-for-run START run-ids=" run-ids ", testpatt=" testpatt ", states=" states ", statuses=" statuses ", not-in=" not-in ", sort-by=" sort-by) res)) ;; this one is a bit broken BUG FIXME -(define (db:delete-test-step-records db test-id) +(define (db:delete-test-step-records db test-id #!key (testpath #f)) ;; Breaking it into two queries for better file access interleaving - (let* ((tdb (db:open-test-db-by-test-id db test-id))) + (let* ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath))) ;; test db's can go away - must check every time (if tdb (begin (sqlite3:execute tdb "DELETE FROM test_steps;") (sqlite3:execute tdb "DELETE FROM test_data;") @@ -981,12 +983,15 @@ (define db:get-test-id db:get-test-id-not-cached) ;; given a test-info record, patch in the latest data from the testdat.db file ;; found in the test run directory -(define (db:patch-tdb-data-into-test-info db test-id res) - (let ((tdb (db:open-test-db-by-test-id db test-id))) +;; +;; NOT USED +;; +(define (db:patch-tdb-data-into-test-info db test-id res #!key (testpath #f)) + (let ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath))) ;; get state and status from megatest.db in real time ;; other fields that perhaps should be updated: ;; fail_count ;; pass_count ;; final_logf @@ -1223,10 +1228,11 @@ (vector-ref tmp 2)))) ((zmq) (handle-exceptions exn (begin + (debug:print-info 0 "cdb:client-call timeout or error. Trying again in 5 seconds") (thread-sleep! 5) (if (> numretries 0)(apply cdb:client-call serverdat qtype immediate (- numretries 1) params))) (let* ((push-socket (vector-ref serverdat 0)) (sub-socket (vector-ref serverdat 1)) (client-sig (client:get-signature)) @@ -1244,31 +1250,32 @@ (receive-message* sub-socket) ;; now get the actual message (let ((myres (db:string->obj (receive-message* sub-socket)))) (if (equal? query-sig (vector-ref myres 1)) (set! res (vector-ref myres 2)) - (loop)))))) - (timeout (lambda () - (let loop ((n numretries)) - (thread-sleep! 15) - (if (not res) - (if (> numretries 0) - (begin - (debug:print 2 "WARNING: no reply to query " params ", trying resend") - (debug:print-info 11 "re-sending message") - (send-message push-socket zdat) - (debug:print-info 11 "message re-sent") - (loop (- n 1))) - ;; (apply cdb:client-call *runremote* qtype immediate (- numretries 1) params)) - (begin - (debug:print 0 "ERROR: cdb:client-call timed out " params ", exiting.") - (exit 5)))))))) + (loop))))))) + ;; (timeout (lambda () + ;; (let loop ((n numretries)) + ;; (thread-sleep! 15) + ;; (if (not res) + ;; (if (> numretries 0) + ;; (begin + ;; (debug:print 2 "WARNING: no reply to query " params ", trying resend") + ;; (debug:print-info 11 "re-sending message") + ;; (send-message push-socket zdat) + ;; (debug:print-info 11 "message re-sent") + ;; (loop (- n 1))) + ;; ;; (apply cdb:client-call *runremote* qtype immediate (- numretries 1) params)) + ;; (begin + ;; (debug:print 0 "ERROR: cdb:client-call timed out " params ", exiting.") + ;; (exit 5)))))))) (debug:print-info 11 "Starting threads") (let ((th1 (make-thread send-receive "send receive")) - (th2 (make-thread timeout "timeout"))) + ;; (th2 (make-thread timeout "timeout")) + ) (thread-start! th1) - (thread-start! th2) + ;; (thread-start! th2) (thread-join! th1) (debug:print-info 11 "cdb:client-call returning res=" res) res)))))) (define (cdb:set-verbosity serverdat val) @@ -1624,13 +1631,13 @@ ;;====================================================================== ;; T E S T D A T A ;;====================================================================== -(define (db:csv->test-data db test-id csvdata) +(define (db:csv->test-data db test-id csvdata #!key (testpath #f)) (debug:print 4 "test-id " test-id ", csvdata: " csvdata) - (let ((tdb (db:open-test-db-by-test-id db test-id))) + (let ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath))) (if tdb (let ((csvlist (csv->list (make-csv-reader (open-input-string csvdata) '((strip-leading-whitespace? #t) (strip-trailing-whitespace? #t)) )))) ;; (csv->list csvdata))) @@ -1685,12 +1692,12 @@ test-id category variable value expected tol units (if comment comment "") status type) (sqlite3:finalize! tdb))) csvlist))))) ;; get a list of test_data records matching categorypatt -(define (db:read-test-data db test-id categorypatt) - (let ((tdb (db:open-test-db-by-test-id db test-id))) +(define (db:read-test-data db test-id categorypatt #!key (testpath #f)) + (let ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath))) (if tdb (let ((res '())) (sqlite3:for-each-row (lambda (id test_id category variable value expected tol units comment status type) (set! res (cons (vector id test_id category variable value expected tol units comment status type) res))) @@ -1699,28 +1706,28 @@ (sqlite3:finalize! tdb) (reverse res)) '()))) ;; NOTE: Run this local with #f for db !!! -(define (db:load-test-data db test-id) +(define (db:load-test-data db test-id #!key (testpath #f)) (let loop ((lin (read-line))) (if (not (eof-object? lin)) (begin (debug:print 4 lin) (db:csv->test-data db test-id lin) (loop (read-line))))) ;; roll up the current results. ;; FIXME: Add the status to - (db:test-data-rollup db test-id #f)) + (db:test-data-rollup db test-id #f testpath: testpath)) ;; WARNING: Do NOT call this for the parent test on an iterated test ;; Roll up test_data pass/fail results ;; look at the test_data status field, ;; if all are pass (any case) and the test status is PASS or NULL or '' then set test status to PASS. ;; if one or more are fail (any case) then set test status to PASS, non "pass" or "fail" are ignored -(define (db:test-data-rollup db test-id status) - (let ((tdb (db:open-test-db-by-test-id db test-id)) +(define (db:test-data-rollup db test-id status #!key (testpath #f)) + (let ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath)) (fail-count 0) (pass-count 0)) (if tdb (begin (sqlite3:for-each-row @@ -1769,12 +1776,12 @@ (define (db:step-get-time-as-string vec) (seconds->time-string (db:step-get-event_time vec))) ;; db-get-test-steps-for-run -(define (db:get-steps-for-test db test-id) - (let* ((tdb (db:open-test-db-by-test-id db test-id)) +(define (db:get-steps-for-test db test-id #!key (testpath #f)) + (let* ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath)) (res '())) (if tdb (begin (sqlite3:for-each-row (lambda (id test-id stepname state status event-time logfile) @@ -1786,12 +1793,12 @@ (reverse res)) '()))) ;; get a pretty table to summarize steps ;; -(define (db:get-steps-table db test-id) - (let ((steps (db:get-steps-for-test db test-id))) +(define (db:get-steps-table db test-id #!key (testpath #f)) + (let ((steps (db:get-steps-for-test db test-id testpath: testpath))) ;; organise the steps for better readability (let ((res (make-hash-table))) (for-each (lambda (step) (debug:print 6 "step=" step) @@ -1846,12 +1853,12 @@ (else #f))))) res))) ;; get a pretty table to summarize steps ;; -(define (db:get-steps-table-list db test-id) - (let ((steps (db:get-steps-for-test db test-id))) +(define (db:get-steps-table-list db test-id #!key (testpath #f)) + (let ((steps (db:get-steps-for-test db test-id testpath: testpath))) ;; organise the steps for better readability (let ((res (make-hash-table))) (for-each (lambda (step) (debug:print 6 "step=" step) @@ -1988,14 +1995,14 @@ (if (not ever-seen) (set! result (append (if (null? tests)(list waitontest-name) tests) result))))) waitons) (delete-duplicates result)))) -(define (db:teststep-set-status! db test-id teststep-name state-in status-in comment logfile) +(define (db:teststep-set-status! db test-id teststep-name state-in status-in comment logfile #!key (testpath #f)) (debug:print 4 "test-id: " test-id " teststep-name: " teststep-name) ;; db:open-test-db-by-test-id does cdb:remote-run - (let* ((tdb (db:open-test-db-by-test-id db test-id)) + (let* ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath)) (state (items:check-valid-items "state" state-in)) (status (items:check-valid-items "status" status-in))) (if (or (not state)(not status)) (debug:print 3 "WARNING: Invalid " (if status "status" "state") " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config")) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -133,11 +133,11 @@ (alist->env-vars env-ovrd) (set-megatest-env-vars run-id) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info - (tests:set-meta-info #f test-id run-id test-name itemdat 0) + (tests:set-meta-info #f test-id run-id test-name itemdat 0 testpath) (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) @@ -208,11 +208,11 @@ ;; call the command using mt_ezstep (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "-") " " stepcmd)) (debug:print 4 "script: " script) ;; DO NOT remote - (db:teststep-set-status! #f test-id stepname "start" "-" #f #f) + (db:teststep-set-status! #f test-id stepname "start" "-" #f #f testpath: testpath) ;; now launch (let ((pid (process-run script))) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) @@ -226,11 +226,11 @@ (processloop (+ i 1)))) )) (let ((exinfo (vector-ref exit-info 2)) (logfna (if logpro-used (conc stepname ".html") ""))) ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect) - (db:teststep-set-status! #f test-id stepname "end" exinfo #f logfna)) + (db:teststep-set-status! #f test-id stepname "end" exinfo #f logfna testpath: testpath)) (if logpro-used (cdb:test-set-log! *runremote* test-id (conc stepname ".html"))) ;; set the test final status (let* ((this-step-status (cond ((and (eq? (vector-ref exit-info 2) 2) logpro-used) 'warn) @@ -276,11 +276,11 @@ (kill-tries 0)) (let loop ((minutes (calc-minutes))) (begin (set! kill-job? (test-get-kill-request test-id)) ;; run-id test-name itemdat)) ;; open-run-close not needed for test-set-meta-info - (tests:set-meta-info #f test-id run-id test-name itemdat minutes) + (tests:set-meta-info #f test-id run-id test-name itemdat minutes testpath) (if kill-job? (begin (mutex-lock! m) (let* ((pid (vector-ref exit-info 0))) (if (number? pid) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -34,10 +34,13 @@ (include "db_records.scm") (include "megatest-fossil-hash.scm") ;; (use trace dot-locking) ;; (trace +;; db:teststep-set-status! +;; db:open-test-db-by-test-id +;; db:test-get-rundir-from-test-id ;; cdb:tests-register-test ;; cdb:tests-update-uname-host ;; cdb:tests-update-run-duration ;; ;; cdb:client-call ;; ;; cdb:remote-run @@ -811,11 +814,11 @@ (begin (debug:print 0 "Failed to setup, exiting") (exit 1))) (if (and state status) ;; DO NOT remote run, makes calls to the testdat.db test db. - (db:teststep-set-status! db test-id step state status msg logfile) + (db:teststep-set-status! db test-id step state status msg logfile testpath: testpath) (begin (debug:print 0 "ERROR: You must specify :state and :status with every call to -step") (exit 6)))))) (if (args:get-arg "-step") @@ -827,11 +830,12 @@ (args:get-arg "-setlog") (args:get-arg "-m")) ;; (if db (sqlite3:finalize! db)) (set! *didsomething* #t))) -(if (or (args:get-arg "-setlog") ;; since setting up is so costly lets piggyback on -test-status +(if (or (and (args:get-arg "-setlog") ;; since setting up is so costly lets piggyback on -test-status + (not (args:get-arg "-step"))) ;; -setlog may have been processed already in the "-step" previous (args:get-arg "-set-toplog") (args:get-arg "-test-status") (args:get-arg "-set-values") (args:get-arg "-load-test-data") (args:get-arg "-runstep") @@ -898,11 +902,11 @@ (fullcmd (conc "(" (string-intersperse (cons cmd params) " ") ") " redir " " logfile))) ;; mark the start of the test ;; DO NOT run remote - (db:teststep-set-status! db test-id stepname "start" "n/a" (args:get-arg "-m") logfile) + (db:teststep-set-status! db test-id stepname "start" "n/a" (args:get-arg "-m") logfile testpath: testpath) ;; run the test step (debug:print-info 2 "Running \"" fullcmd "\"") (change-directory startingdir) (set! exitstat (system fullcmd)) ;; cmd params)) (set! *globalexitstatus* exitstat) @@ -918,11 +922,11 @@ (set! *globalexitstatus* exitstat) ;; no necessary (change-directory testpath) (cdb:test-set-log! *runremote* test-id htmllogfile))) (let ((msg (args:get-arg "-m"))) ;; DO NOT run remote - (db:teststep-set-status! db test-id stepname "end" exitstat msg logfile)) + (db:teststep-set-status! db test-id stepname "end" exitstat msg logfile testpath: testpath)) ))) (if (or (args:get-arg "-test-status") (args:get-arg "-set-values")) (let ((newstatus (cond ((number? status) (if (equal? status 0) "PASS" "FAIL")) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -444,11 +444,13 @@ ", ")) (thread-sleep! 0.1) (loop hed tal reruns)) ((not have-resources) ;; simply try again after waiting a second (debug:print-info 1 "no resources to run new tests, waiting ...") - ;; (thread-sleep! (+ 2 *global-delta*)) + ;; Have gone back and forth on this but db starvation is an issue. + ;; wait one second before looking again to run jobs. + (thread-sleep! 1) ;; (+ 2 *global-delta*)) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (loop (car newtal)(cdr newtal) reruns)) ((and have-resources (or (null? prereqs-not-met) (and (eq? testmode 'toplevel) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -571,22 +571,23 @@ (cdb:tests-update-uname-host *runremote* test-id uname hostname)) ;;(sqlite3:execute db "UPDATE tests SET uname=?,host=? WHERE id=?;" uname hostname test-id)) ;;(sqlite3:finalize! db)) ) -(define (tests:set-meta-info db test-id run-id testname itemdat minutes) +(define (tests:set-meta-info db test-id run-id testname itemdat minutes testpath) ;; DOES cdb:remote-run under the hood! - (let* ((tdb (db:open-test-db-by-test-id db test-id)) + (let* ((tdb (db:open-test-db-by-test-id db test-id testpath: testpath)) (num-records (test:tdb-get-rundat-count tdb)) (cpuload (get-cpu-load)) (diskfree (get-df (current-directory)))) (if (eq? (modulo num-records 10) 0) ;; every ten records update central (let ((uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info test-id cpuload diskfree minutes num-records uname hostname))) (sqlite3:execute tdb "INSERT INTO test_rundat (update_time,cpuload,diskfree,run_duration) VALUES (strftime('%s','now'),?,?,?);" - cpuload diskfree minutes))) + cpuload diskfree minutes) + (sqlite3:finalize! tdb))) ;;====================================================================== ;; A R C H I V I N G ;;====================================================================== Index: tests/fullrun/config/mt_include_1.config ================================================================== --- tests/fullrun/config/mt_include_1.config +++ tests/fullrun/config/mt_include_1.config @@ -1,8 +1,8 @@ [setup] # exectutable /path/to/megatest -max_concurrent_jobs 200 +max_concurrent_jobs 25 linktree #{getenv MT_RUN_AREA_HOME}/tmp/mt_links [jobtools] useshell yes