Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,13 @@ # along with Megatest. If not, see . TODO ==== +23WW46 - v1.80 branch +. Use file semaphore to kill tests, eliminate db load of the KILLREQ query +. Merge this change to revolution branch 23WW45 - the revolution branch . Add "fast" db start option (no handshaking over NFS) . Add server-ro to server types (just "server" is fine for read/write). . Create pause-server and resume-server calls . Create rsync or cp sync to MTRAH function Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -226,11 +226,11 @@ ((ping) #t) ;; we are fine (else (assert ok "FATAL: database file and run-id not aligned."))))) (ttdat *server-info*) (server-state (tt-state ttdat)) - (maxthreads 10) ;; make this a parameter? + (maxthreads 20) ;; make this a parameter? (status (cond ((and (> numthreads maxthreads) (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server. 'busy) ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. @@ -242,11 +242,11 @@ (result (case status ((busy) (if (eq? cmd 'ping) (normal-proc cmd run-id params) ;; numthreads must be greater than 5 for busy - (* 1 (- numthreads maxthreads)) ;; was 15 + (* 0.1 (- numthreads maxthreads)) ;; was 15 )) ;; (- numthreads 29)) ;; call back in as many seconds ((loaded) ;; (if (eq? (rmt:transport-mode) 'tcp) ;; (thread-sleep! 0.5)) (normal-proc cmd run-id params)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2227,21 +2227,24 @@ qry run-id (or last-update 0)))))) (define (db:get-testinfo-state-status dbstruct run-id test-id) - (let ((res #f)) - (db:with-db dbstruct run-id #f - (lambda (dbdat db) - (sqlite3:for-each-row - (lambda (run-id testname item-path state status) - ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment - (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) - db - "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" - test-id run-id))) - res)) + (db:with-db + dbstruct run-id #f + (lambda (dbdat db) + (let* ((res #f) + (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"))) + (sqlite3:for-each-row + (lambda (run-id testname item-path state status) + ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment + (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) + ;; db + ;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" + stmth + test-id run-id) + res)))) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; (define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in) @@ -2630,18 +2633,20 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) - (let ((res (cons #f #f))) -;; (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;"))) - (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test - (lambda (state status) - (cons state status)) - db - "SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue - test-id run-id) + (let ((res (cons #f #f)) + (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=? AND run_id=?;"))) + (db:with-mutex-for-stmth + (lambda() + (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test + (lambda (state status) + (cons state status)) + ;; db + stmth ;;"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue + test-id run-id))) res)))) ;; Use db:test-get* to access ;; Get test data using test_ids. NB// Only works within a single run!! ;; Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -526,17 +526,18 @@ reason TEXT DEFAULT 'none', CONSTRAINT no_sync_processes UNIQUE (host,pid));" )))))) (on-tmp (equal? (car (string-split dbpath "/")) "tmp")) (db (if on-tmp - (dbfile:cautious-open-database dbname init-proc 0 "WAL" force-init: #t) - (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) + (dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1 + ;; (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) + (dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t) ;; (sqlite3:open-database dbname) ))) (if on-tmp ;; done in cautious-open-database (begin - (sqlite3:execute db "PRAGMA synchronous = 0;") + ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database? (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)))) db)) ;; mtest processes registry calls @@ -1579,7 +1580,15 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +(define *mutex-stmth-call* (make-mutex)) + +(define (db:with-mutex-for-stmth proc) + (mutex-lock! *mutex-stmth-call*) + (let* ((res (proc))) + (mutex-unlock! *mutex-stmth-call*) + res)) ) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -236,11 +236,10 @@ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -256,15 +255,16 @@ (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-state-status-by-id run-id test-id)) (state (car test-info));; (db:test-get-state test-info)) (status (cdr test-info));; (db:test-get-status test-info)) + (killreq (equal? state "KILLREQ")) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond - ((test-get-kill-request run-id test-id) + (killreq (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) @@ -276,16 +276,11 @@ (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty (launch:handle-zombie-tests run-id)) (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) - ) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) @@ -331,18 +326,17 @@ ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) - (if (hash-table-ref/default misc-flags 'keep-going #f) + (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (begin - (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses - (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) - (or new-cpu-load cpu-load) - (or new-disk-free disk-free) - (if do-sync (current-seconds) last-sync))))))) + (thread-sleep! 6) ;; was 3 + (loop (calc-minutes) + (or new-cpu-load cpu-load) + (or new-disk-free disk-free) + (if do-sync (current-seconds) last-sync)))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -256,11 +256,11 @@ (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (let* ((raw-dly (if (number? result) result 0.1)) - (dly (* raw-dly (/ attemptnum 2)))) + (dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2)))) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1)) (thread-sleep! dly) (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1964,11 +1964,12 @@ ;; test steps ;;====================================================================== ;; teststep-set-status! used to be here -(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) +;; NOT NEEDED +#;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb)