Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -59,26 +59,26 @@ mofiles/processmod.o : mofiles/commonmod.o mofiles/servermod.o : mofiles/commonmod.o mofiles/rmtmod.o : mofiles/mtmod.o mofiles/apimod.o mofiles/dbmod.o : mofiles/mtmod.o # mofiles/mtmod.o : mofiles/tcp-transportmod.o -mofiles/megatestmod.o : mofiles/pkts.o +mofiles/megatestmod.o : mofiles/pkts.o mofiles/servermod.o mofiles/mtmod.o : mofiles/testsmod.o mofiles/dbfile.o : \ mofiles/debugprint.o mofiles/commonmod.o mofiles/configfmod.o mofiles/apimod.o : mofiles/commonmod.o mofiles/tcp-transportmod.o mofiles/configfmod.o mofiles/megatestmod.o -mofiles/rmtmod.o : mofiles/tasksmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/api.o : mofiles/apimod.o mofiles/commonmod.o : mofiles/debugprint.o mofiles/stml2.o configf.o : commonmod.import.o mofiles/dbfile.o : mofiles/debugprint.o mofiles/rmtmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/commonmod.o mofiles/debugprint.o db.o : mofiles/dbmod.o mofiles/dbfile.o mofiles/debugprint.o : mofiles/mtargs.o mofiles/tcp-transportmod.o : mofiles/portlogger.o +mofiles/tasksmod.o : mofiles/rmtmod.o # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ # rmtmod.scm apimod.scm Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -3855,11 +3855,11 @@ (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") (exit 1) ) ) - #;(if (not (common:on-homehost?)) + #;(if (not (rmt:on-homehost?)) (begin (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost)) (debug:print 0 *default-log-port* "It will be slower.") )) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1773,5 +1773,92 @@ ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) + +;;====================================================================== +;; Maintenance +;;====================================================================== + +(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) + (let* ((cfg-deadtime (configf:lookup-number *configdat* "setup" "deadtime")) + (test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period"))) + (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id))) + +;; select end_time-now from +;; (select testname,item_path,event_time+run_duration as +;; end_time,strftime('%s','now') as now from tests where state in +;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); +;; +;; NOT EASY TO MIGRATE TO db{file,mod} +;; +(define (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period) + (let* ((incompleted '()) + (oldlaunched '()) + (toplevels '()) + ;; The default running-deadtime is 720 seconds = 12 minutes. + ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) + (deadtime-trim (or ovr-deadtime cfg-deadtime)) + (server-start-allowance 200) + (server-overloaded-budget 200) + (launch-monitor-off-time (or test-stats-update-period 30)) + (launch-monitor-on-time-budget 30) + (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) + (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) + (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) + (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) + (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period) + + (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) + (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) + + (let* ((dat (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime))) + (set! oldlaunched (list-ref dat 1)) + (set! toplevels (list-ref dat 2)) + (set! incompleted (list-ref dat 0))) + + (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " + (length toplevels) " old LAUNCHED toplevel tests and " + (length incompleted) " tests marked RUNNING but apparently dead.") + + ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. + ;; + ;; (db:delay-if-busy dbdat) + (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all + (all-ids (append min-incompleted-ids (map car oldlaunched)))) + (if (> (length all-ids) 0) + (begin + ;; (launch:is-test-alive "localhost" 435) + (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") + " as DEAD") + (for-each + (lambda (test-id) + (let* ((tinfo (rmt:get-test-info-by-id run-id test-id)) + (run-dir (db:test-get-rundir tinfo)) + (host (db:test-get-host tinfo)) + (pid (db:test-get-process_id tinfo)) + (result (rmt:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (rmt:set-state-status-and-roll-up-items + run-id test-id 'foo "COMPLETED" "PASS" + "Test stopped responding but it has PASSED; marking it PASS in the DB.")) + (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. + (commonmod:is-test-alive host pid)))) + (if is-alive + (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host + " has a process on pid " pid ", NOT setting to DEAD.") + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id + " final state/status is not COMPLETED/PASS. It is " result) + (rmt:set-state-status-and-roll-up-items + run-id test-id 'foo "COMPLETED" "DEAD" + "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) + ;; call end of eud of run detection for posthook - from merge, is it needed? + ;; (launch:end-of-run-check run-id) + all-ids) + ))))) + Index: megatestmod.scm ================================================================== --- megatestmod.scm +++ megatestmod.scm @@ -32,10 +32,12 @@ (declare (uses commonmod)) (declare (uses configfmod)) (declare (uses processmod)) (declare (uses mtmod)) (declare (uses pkts)) +(declare (uses servermod)) +(declare (uses dbi)) (use srfi-69) (module megatestmod * @@ -50,27 +52,16 @@ (prefix sqlite3 sqlite3:) data-structures extras files - matchable - md5 - message-digest pathname-expand posix posix-extras - regex - regex-case - sparse-vectors - srfi-1 - srfi-18 - srfi-69 - typed-records - z3 - - debugprint - (prefix mtargs args:) + (prefix dbi dbi:) + + directory-utils ) (use srfi-69)) (chicken-5 (import (prefix sqlite3 sqlite3:) ;; data-structures @@ -99,27 +90,34 @@ pathname-expand system-information ))) -(import regex +(import call-with-environment-variables + matchable + md5 + message-digest + regex regex-case + sparse-vectors srfi-1 + srfi-13 srfi-18 srfi-69 typed-records - directory-utils - call-with-environment-variables + z3 + (prefix mtargs args:) commonmod configfmod - debugprint + dbfile dbmod - dbfile - processmod + debugprint mtmod pkts + processmod + servermod ) (define read-config (lambda ()(assert #f "FATAL: read-config proc not set!"))) (define (read-config-set! proc) @@ -314,40 +312,10 @@ (begin (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".") #t) #f))) -;;====================================================================== -;; calculate a delay number based on a droop curve -;; inputs are: -;; - load-in, load as from uptime, NOT normalized -;; - numcpus, number of cpus, ideally use the real cpus, not threads -;; -(define (common:get-delay load-in numcpus) - (let* ((ratio (/ load-in numcpus)) - (new-option (configf:lookup *configdat* "load" "new-load-method")) - (paramstr (or (configf:lookup *configdat* "load" "exp-params") - "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) - (paramlst (map string->number (string-split paramstr)))) - (if new-option - (begin - (cond ((and (>= ratio 0) (< ratio .5)) - 0) - ((and (>= ratio 0.5) (<= ratio .9)) - (* ratio (/ 5 .9))) - ((and (> ratio .9) (<= ratio 1.1)) - (+ 5 (* (- ratio .9) (/ 55 .2)))) - ((> ratio 1.1) - 60))) - (match paramlst - ((r1 r2 s1 s2) - (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) - (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) - (else - (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) - 30))))) - ;; -mrw- this appears to not be used ;; ;; (define (common:print-delay-table) ;; (let loop ((x 0)) ;; (print x "," (common:get-delay x 1)) @@ -368,76 +336,10 @@ ;; (if (number? newval) ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) -;;====================================================================== -;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the -;; [host-rules] section. -;; -(define (common:get-least-loaded-host hosts-raw host-type configdat) - (let* ((rdat (configf:lookup configdat "host-rules" host-type)) - (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate - (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load - (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs - (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second - (hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw)) - ;; (best-host #f) - (get-rec (lambda (hostname) - ;; (print "get-rec hostname=" hostname) - (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h))))) - (best-load 99999) - (curr-time (current-seconds)) - (get-hosts-sorted (lambda (hosts) - (sort hosts (lambda (a b) - (let ((a-rec (get-rec a)) - (b-rec (get-rec b))) - ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) - ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) - (< (host-last-used a-rec) - (host-last-used b-rec)))))))) - (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) - (if (null? hosts) - #f ;; no hosts to select from. All done and giving up now. - (let ((hosts-sorted (get-hosts-sorted hosts))) - (common:update-host-loads-table hosts) - (let loop ((hostname (car hosts-sorted)) - (tal (cdr hosts-sorted)) - (best-host #f)) - (let* ((rec (get-rec hostname)) - (reachable (host-reachable rec)) - (load (host-last-cpuload rec)) - (last-used (host-last-used rec)) - (delta (- curr-time last-used)) - (job-rate (if (> delta 0) - (/ 1 delta) - 999)) ;; jobs per second - (new-best - (cond - ((not reachable) - (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") - best-host) - ((and (< load maxnload) ;; load is acceptable - (< job-rate maxjobrate)) ;; job rate is acceptable - (set! best-load load) - hostname) - (else best-host)))) - (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) - (if new-best - (begin ;; found a host, return it - (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) - (host-last-used-set! rec curr-time) - new-best) - (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) - ;;====================================================================== ;; given path get free space, allows override in [setup] ;; with free-space-script /path/to/some/script.sh ;; (define (get-df path) @@ -544,84 +446,10 @@ (map car disks)) (if (and best (> bestsize minsize)) best #f))) ;; #f means no disk candidate found -;;====================================================================== -;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S -;;====================================================================== -;; -;; [hosts] -;; arm cubie01 cubie02 -;; x86_64 zeus xena myth01 -;; allhosts #{g hosts arm} #{g hosts x86_64} -;; -;; [host-types] -;; general #MTLOWESTLOAD #{g hosts allhosts} -;; arm #MTLOWESTLOAD #{g hosts arm} -;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo -;; -;; [host-rules] -;; # maxnload => max normalized load -;; # maxnjobs => max jobs per cpu -;; # maxjobrate => max jobs per second -;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 -;; -;; [launchers] -;; envsetup general -;; xor/%/n 4C16G -;; % nbgeneral -;; -;; [jobtools] -;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. -;; flexi-launcher yes -;; launcher nbfake -;; -(define (common:get-launcher configdat testname itempath) - (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) - (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher - (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) - (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) - (if (null? launchers) - fallback-launcher - (let loop ((hed (car launchers)) - (tal (cdr launchers))) - (let ((patt (car hed)) - (host-type (cadr hed))) - (if (tests:match patt testname itempath) - (begin - (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) - (if launcher - (let* ((launcher-parts (string-split launcher)) - (launcher-exe (car launcher-parts))) - (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline - (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) - (count 100)) - (if targ-host - (conc "remrun " targ-host) - (if (> count 0) - (begin - (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) - (thread-sleep! (- 101 count)) - (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) - (- count 1))) - (begin - (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) - (exit))))) - launcher)) - (begin - (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal))))))) - ;; no match, try again - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal)))))))) - fallback-launcher))) - (define (common:get-pkts-dirs mtconf use-lt) (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs") (and use-lt (conc (or *toppath* (current-directory)) @@ -741,44 +569,10 @@ (if (null? (car uname-res)) "unknown" (caar uname-res)))) -;;====================================================================== -;; faux-lock is deprecated. Please use simple-lock below -;; -(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) - (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count - (if (> wait-time 0) - (begin - (thread-sleep! 1) - (if (eq? wait-time 1) ;; only one second left, steal the lock - (begin - (debug:print-info 0 *default-log-port* "stealing lock for " keyname) - (common:faux-unlock keyname force: #t))) - (common:faux-lock keyname wait-time: (- wait-time 1))) - #f) - (begin - (rmt:no-sync-set keyname (conc (current-process-id))) - (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) - -(define (common:faux-unlock keyname #!key (force #f)) - (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) - (begin - (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) - #t) - #f)) - -;;====================================================================== -;; simple lock. improve and converge on this one. -;; -(define (common:simple-lock keyname) - (rmt:no-sync-get-lock keyname)) - -(define (common:simple-unlock keyname #!key (force #f)) - (rmt:no-sync-del! keyname)) - ;;====================================================================== ;; use-lt is use linktree "lt" link to find pkts dir (define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already (if (or (not add-only) (hash-table-exists? *pkts-info* 'last-parent)) @@ -1128,69 +922,10 @@ " items: " items " itemstable: " itemstable) (items:get-items-from-config tconfig)) (else #f)))) ;; not iterated -;; returns waitons waitors tconfigdat -;; -(define (tests:get-waitons test-name all-tests-registry global-waitons) - (let* ((config (tests:get-testconfig test-name #f all-tests-registry 'return-procs))) ;; assuming no problems with immediate evaluation, this could be simplified ('return-procs -> #t) - (let ((instr (if config - (configf:lookup config "requirements" "waiton") - (begin ;; No config means this is a non-existant test - (debug:print-error 0 *default-log-port* "non-existent required test \"" test-name "\"") - (exit 1)))) - (instr2 (if config - (configf:lookup config "requirements" "waitor") - ""))) - (debug:print-info 8 *default-log-port* "waitons string is " instr ", waitors string is " instr2) - (let* ((newwaitons-tmp - (string-split (cond - ((procedure? instr) ;; here - (let ((res (instr))) - (debug:print-info 8 *default-log-port* "waiton procedure results in string " res " for test " test-name) - res)) - ((string? instr) instr) - (else - ;; NOTE: This is actually the case of *no* waitons! ;; (debug:print-error 0 *default-log-port* "something went wrong in processing waitons for test " test-name) - "")))) - (newwaitors - (string-split (cond - ((procedure? instr2) - (let ((res (instr2))) - (debug:print-info 8 *default-log-port* "waitor procedure results in string " res " for test " test-name) - res)) - ((string? instr2) instr2) - (else - ;; NOTE: This is actually the case of *no* waitons! ;; (debug:print-error 0 *default-log-port* "something went wrong in processing waitons for test " test-name) - "")))) - (newwaitons (if (and (list? global-waitons) - (not (null? global-waitons))) - (begin - (debug:print 0 *default-log-port* "Adding global waitons " global-waitons) - (append newwaitons-tmp (filter (lambda (x) ;; remove self from global waitons - (not (equal? x test-name))) - global-waitons))) - newwaitons-tmp))) - (values - ;; the waitons - (filter (lambda (x) - (if (hash-table-ref/default all-tests-registry x #f) - #t - (begin - (debug:print-error 0 *default-log-port* "test " test-name " has unrecognised waiton testname " x) - #f))) - newwaitons) - (filter (lambda (x) - (if (hash-table-ref/default all-tests-registry x #f) - #t - (begin - (debug:print-error 0 *default-log-port* "test " test-name " has unrecognised waiton testname " x) - #f))) - newwaitors) - config))))) - ;; given waiting-test that is waiting on waiton-test extend test-patt appropriately ;; ;; genlib/testconfig sim/testconfig ;; genlib/sch sim/sch/cell1 ;; @@ -1243,177 +978,6 @@ (new-patts (if (member waiton-test patts) patts (cons waiton-test patts)))) (string-intersperse (delete-duplicates new-patts) ","))))) -;; Check for waiver eligibility -;; -(define (tests:check-waiver-eligibility testdat prev-testdat) - (let* ((test-registry (make-hash-table)) - (testconfig (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f)) - (test-rundir ;; (sdb:qry 'passstr - (db:test-get-rundir testdat)) ;; ) - (prev-rundir ;; (sdb:qry 'passstr - (db:test-get-rundir prev-testdat)) ;; ) - (waivers (if testconfig (configf:section-vars testconfig "waivers") '())) - (waiver-rx (regexp "^(\\S+)\\s+(.*)$")) - (diff-rule "diff %file1% %file2%") - (logpro-rule "diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html")) - (if (not (common:file-exists? test-rundir)) - (begin - (debug:print-error 0 *default-log-port* "test run directory is gone, cannot propagate waiver") - #f) - (begin - (push-directory test-rundir) - (let ((result (if (null? waivers) - #f - (let loop ((hed (car waivers)) - (tal (cdr waivers))) - (debug:print 0 *default-log-port* "INFO: Applying waiver rule \"" hed "\"") - (let* ((waiver (configf:lookup testconfig "waivers" hed)) - (wparts (if waiver (string-match waiver-rx waiver) #f)) - (waiver-rule (if wparts (cadr wparts) #f)) - (waiver-glob (if wparts (caddr wparts) #f)) - (logpro-file (if waiver - (let ((fname (conc hed ".logpro"))) - (if (common:file-exists? fname) - fname - (begin - (debug:print 0 *default-log-port* "INFO: No logpro file " fname " falling back to diff") - #f))) - #f)) - ;; if rule by name of waiver-rule is found in testconfig - use it - ;; else if waivername.logpro exists use logpro-rule - ;; else default to diff-rule - (rule-string (let ((rule (configf:lookup testconfig "waiver_rules" waiver-rule))) - (if rule - rule - (if logpro-file - logpro-rule - (begin - (debug:print 0 *default-log-port* "INFO: No logpro file " logpro-file " found, using diff rule") - diff-rule))))) - ;; (string-substitute "%file1%" "foofoo.txt" "This is %file1% and so is this %file1%." #t) - (processed-cmd (string-substitute - "%file1%" (conc test-rundir "/" waiver-glob) - (string-substitute - "%file2%" (conc prev-rundir "/" waiver-glob) - (string-substitute - "%waivername%" hed rule-string #t) #t) #t)) - (res #f)) - (debug:print 0 *default-log-port* "INFO: waiver command is \"" processed-cmd "\"") - (if (eq? (system processed-cmd) 0) - (if (null? tal) - #t - (loop (car tal)(cdr tal))) - #f)))))) - (pop-directory) - result))))) - - -;; if .testconfig exists in test directory read and return it -;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" -;; else read the testconfig file -;; if have path to test directory save the config as .testconfig and return it -;; -(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)(wait-a-minute #f)) - (let* ((use-cache (common:use-cache?)) - (cache-path (tests:get-test-path-from-environment)) - (cache-file (and cache-path (conc cache-path "/.testconfig"))) - (cache-exists (and cache-file - (not force-create) ;; if force-create then pretend there is no cache to read - (common:file-exists? cache-file))) - (cached-dat (if (and (not force-create) - cache-exists - use-cache) - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "failed to read " cache-file ", exn=" exn) - #f) ;; any issues, just give up with the cached version and re-read - (configf:read-alist cache-file)) - #f)) - (test-full-name (if (and item-path (not (string-null? item-path))) - (conc test-name "/" item-path) - test-name))) - (if cached-dat - cached-dat - (let ((dat (hash-table-ref/default *testconfigs* test-full-name #f))) - (if (and dat ;; have a locally cached version - (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data? - dat - ;; no cached data available - (let* ((treg (or test-registry - (tests:get-all))) - (test-path (or (hash-table-ref/default treg test-name #f) - (let* ((local-tcdir (conc (getenv "MT_LINKTREE") "/" - (getenv "MT_TARGET") "/" - (getenv "MT_RUNNAME") "/" - test-name "/" item-path)) - (local-tcfg (conc local-tcdir "/testconfig"))) - (if (common:file-exists? local-tcfg) - local-tcdir - #f)) - (conc *toppath* "/tests/" test-name))) - (test-configf (conc test-path "/testconfig")) - (testexists (let loopa ((tries-left 30)) - (cond - ( - (and (common:file-exists? test-configf)(file-read-access? test-configf)) - #t) - ( - (common:file-exists? test-configf) - (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf) - #f) - ( - (and wait-a-minute (> tries-left 0)) - (thread-sleep! 10) - (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds. Tries left: "tries-left) ;; BB: this fires - (loopa (sub1 tries-left))) - (else - (debug:print 2 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires - #f)))) - (tcfg (if testexists - (read-config test-configf #f system-allowed - environ-patt: (if system-allowed - "pre-launch-env-vars" - #f)) - #f))) - (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data - (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) - (if (and testexists - cache-file - (file-write-access? cache-path) - allow-write-cache) - (let ((tpath (conc cache-path "/.testconfig"))) - (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) - (if (and tcfg (not (common:in-running-test?))) - (configf:write-alist tcfg tpath)))) - tcfg)))))) - -(define (configf:write-alist cdat fname) - (if (not (common:faux-lock fname)) - (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) - (let* ((dat (configf:config->alist cdat)) - (res - (begin - (with-output-to-file fname ;; first write out the file - (lambda () - (pp dat))) - - (if (common:file-exists? fname) ;; now verify it is readable - (if (configf:read-alist fname) - #t ;; data is good. - (begin - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn) - #f) - (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") - (delete-file fname)) - #f)) - #f)))) - (common:faux-unlock fname) - res)) - ) Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -19,28 +19,43 @@ ;;====================================================================== (declare (unit rmtmod)) (declare (uses debugprint)) (declare (uses commonmod)) +(declare (uses configfmod)) (declare (uses dbfile)) ;; needed for records (declare (uses dbmod)) (declare (uses mtmod)) (declare (uses tcp-transportmod)) (declare (uses apimod)) +(declare (uses servermod)) (module rmtmod * -(import scheme chicken data-structures extras matchable srfi-1 srfi-69) -(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18) +(import scheme + chicken + data-structures + regex + extras + matchable + srfi-1 + srfi-69 + (prefix sqlite3 sqlite3:) + posix + typed-records + srfi-18) (import commonmod + configfmod tcp-transportmod dbfile dbmod debugprint apimod - mtmod) + mtmod + servermod + ) (include "db_records.scm") (defstruct alldat (areapath #f) @@ -222,85 +237,10 @@ (if (null? res) #f res))))) ;; (string-split (car res))))))) <== I would have preferred a single line STATE STATUS without "'s ;; (string-split (car res))))))) ;; DUNNO WHICH IS CORRECT -;; select end_time-now from -;; (select testname,item_path,event_time+run_duration as -;; end_time,strftime('%s','now') as now from tests where state in -;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); -;; -;; NOT EASY TO MIGRATE TO db{file,mod} -;; -(define (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period) - (let* ((incompleted '()) - (oldlaunched '()) - (toplevels '()) - ;; The default running-deadtime is 720 seconds = 12 minutes. - ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) - (deadtime-trim (or ovr-deadtime cfg-deadtime)) - (server-start-allowance 200) - (server-overloaded-budget 200) - (launch-monitor-off-time (or test-stats-update-period 30)) - (launch-monitor-on-time-budget 30) - (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) - (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) - (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) - (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) - (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period) - - (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) - (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) - - (let* ((dat (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime))) - (set! oldlaunched (list-ref dat 1)) - (set! toplevels (list-ref dat 2)) - (set! incompleted (list-ref dat 0))) - - (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " - (length toplevels) " old LAUNCHED toplevel tests and " - (length incompleted) " tests marked RUNNING but apparently dead.") - - ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. - ;; - ;; (db:delay-if-busy dbdat) - (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all - (all-ids (append min-incompleted-ids (map car oldlaunched)))) - (if (> (length all-ids) 0) - (begin - ;; (launch:is-test-alive "localhost" 435) - (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") - " as DEAD") - (for-each - (lambda (test-id) - (let* ((tinfo (rmt:get-test-info-by-id run-id test-id)) - (run-dir (db:test-get-rundir tinfo)) - (host (db:test-get-host tinfo)) - (pid (db:test-get-process_id tinfo)) - (result (rmt:get-status-from-final-status-file run-dir))) - (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") - (rmt:set-state-status-and-roll-up-items - run-id test-id 'foo "COMPLETED" "PASS" - "Test stopped responding but it has PASSED; marking it PASS in the DB.")) - (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. - (commonmod:is-test-alive host pid)))) - (if is-alive - (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host - " has a process on pid " pid ", NOT setting to DEAD.") - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id - " final state/status is not COMPLETED/PASS. It is " result) - (rmt:set-state-status-and-roll-up-items - run-id test-id 'foo "COMPLETED" "DEAD" - "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) - ;; call end of eud of run detection for posthook - from merge, is it needed? - ;; (launch:end-of-run-check run-id) - all-ids) - ))))) - ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id (define *ttdat* #f) @@ -998,21 +938,10 @@ (tt-ro-mode-checked-set! runremote #t) ro-mode) ro-mode)))))) -;;====================================================================== -;; Maintenance -;;====================================================================== - -(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) - (let* ((cfg-deadtime (configf:lookup-number *configdat* "setup" "deadtime")) - (test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period"))) - (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period) - ;;call end of eud of run detection for posthook - (launch:end-of-run-check run-id))) - ;;====================================================================== ;; S U P P O R T F U N C T I O N S ;;====================================================================== (define (rmt:on-homehost? runremote) @@ -1071,18 +1000,14 @@ ;; From 1.70 to 1.80, db's are compatible. (define (common:api-changed?) - (let* ( - (megatest-major-version (substring (->string megatest-version) 0 4)) - (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) - ) - (and (not (equal? megatest-major-version "1.80")) - (not (equal? megatest-major-version megatest-run-version))) - ) -) + (let* ((megatest-major-version (substring (->string megatest-version) 0 4)) + (run-major-version (substring (conc (common:get-last-run-version)) 0 4))) + (and (not (equal? megatest-major-version "1.80")) + (not (equal? megatest-major-version run-major-version))))) ;;====================================================================== ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; @@ -1218,111 +1143,11 @@ (rmt:set-state-status-and-roll-up-items run-id test-name item-path new-state new-status new-comment) ;; (mt:process-triggers run-id test-id new-state new-status) #t);) ;;(mt:test-set-state-status-by-id run-id test-id new-state new-status new-comment))) -;; Do not rpc this one, do the underlying calls!!! -(define (tests:test-set-status! run-id test-id state status comment dat #!key (work-area #f)) - (let* ((real-status status) - (otherdat (if dat dat (make-hash-table))) - (testdat (rmt:get-test-info-by-id run-id test-id)) - (test-name (db:test-get-testname testdat)) - (item-path (db:test-get-item-path testdat)) - ;; before proceeding we must find out if the previous test (where all keys matched except runname) - ;; was WAIVED if this test is FAIL - - ;; NOTES: - ;; 1. Is the call to test:get-previous-run-record remotified? - ;; 2. Add test for testconfig waiver propagation control here - ;; - (prev-test (if (equal? status "FAIL") - (rmt:get-previous-test-run-record run-id test-name item-path) - #f)) - (waived (if prev-test - (if prev-test ;; true if we found a previous test in this run series - (let ((prev-status (db:test-get-status prev-test)) - (prev-state (db:test-get-state prev-test)) - (prev-comment (db:test-get-comment prev-test))) - (debug:print 4 *default-log-port* "prev-status " prev-status ", prev-state " prev-state ", prev-comment " prev-comment) - (if (and (equal? prev-state "COMPLETED") - (equal? prev-status "WAIVED")) - (if comment - comment - prev-comment) ;; waived is either the comment or #f - #f)) - #f) - #f))) - (if (and waived - (tests:check-waiver-eligibility testdat prev-test)) - (set! real-status "WAIVED")) - - (debug:print 4 *default-log-port* "real-status " real-status ", waived " waived ", status " status) - - ;; update the primary record IF state AND status are defined - (if (and state status) - (begin - (rmt:set-state-status-and-roll-up-items run-id test-id item-path state real-status (if waived waived comment)) - ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-state-status - )) - - ;; if status is "AUTO" then call rollup (note, this one modifies data in test - ;; run area, it does remote calls under the hood. - ;; (if (and test-id state status (equal? status "AUTO")) - ;; (rmt:test-data-rollup run-id test-id status)) - - ;; add metadata (need to do this way to avoid SQL injection issues) - - ;; :first_err - ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f))) - ;; (if val - ;; (sqlite3:execute db "UPDATE tests SET first_err=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) - ;; - ;; ;; :first_warn - ;; (let ((val (hash-table-ref/default otherdat ":first_warn" #f))) - ;; (if val - ;; (sqlite3:execute db "UPDATE tests SET first_warn=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) - - (let ((category (hash-table-ref/default otherdat ":category" "")) - (variable (hash-table-ref/default otherdat ":variable" "")) - (value (hash-table-ref/default otherdat ":value" #f)) - (expected (hash-table-ref/default otherdat ":expected" "n/a")) - (tol (hash-table-ref/default otherdat ":tol" "n/a")) - (units (hash-table-ref/default otherdat ":units" "")) - (type (hash-table-ref/default otherdat ":type" "")) - (dcomment (hash-table-ref/default otherdat ":comment" ""))) - (debug:print 4 *default-log-port* - "category: " category ", variable: " variable ", value: " value - ", expected: " expected ", tol: " tol ", units: " units) - (if (and value) ;; require only value; BB was- all three required - (let ((dat (conc category "," - variable "," - value "," - expected "," - tol "," - units "," - dcomment ",," ;; extra comma for status - type ))) - ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. - (rmt:csv->test-data run-id test-id - dat) - ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start" - ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue. - (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :) - ;; BB - commentiong out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. - ))) - - ;; need to update the top test record if PASS or FAIL and this is a subtest - ;;;;;; (if (not (equal? item-path "")) - ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;) - - (if (or (and (string? comment) - (string-match (regexp "\\S+") comment)) - waived) - (let ((cmt (if waived waived comment))) - (rmt:general-call 'set-test-comment run-id cmt test-id))))) - (define (tests:test-set-toplog! run-id test-name logf) (rmt:general-call 'tests:test-set-toplog run-id logf run-id test-name)) ) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1198,11 +1198,11 @@ ((runs:dat-load-mgmt-function runsdat)) (runs:dat-load-mgmt-function-set! runsdat (lambda () ;; jobtools maxload is useful for where the full Megatest run is done on one machine - (if (and (not (common:on-homehost?)) + (if (and (not (rmt:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload Index: servermod.scm ================================================================== --- servermod.scm +++ servermod.scm @@ -18,25 +18,37 @@ (declare (unit servermod)) (declare (uses commonmod)) (declare (uses configfmod)) +(declare (uses mtmod)) (declare (uses debugprint)) (declare (uses mtargs)) (module servermod * + +(import scheme + chicken) (use (srfi 18) extras s11n) (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) (use directory-utils posix-extras matchable utils) (use spiffy uri-common intarweb http-client spiffy-request-vars) -(import commonmod +(import ports + data-structures + files + srfi-4 + typed-records + + commonmod configfmod debugprint - (prefix mtargs args:)) + (prefix mtargs args:) + mtmod + ) (include "common_records.scm") (include "db_records.scm") (define (server:make-server-url hostport) @@ -159,14 +171,10 @@ (unsetenv "TARGETHOST_LOGF") ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) -;; given a path to a server log return: host port startseconds server-id -;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let -;; example of what it's looking for in the log file: -;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) @@ -199,150 +207,20 @@ (string->number port) (string->number start) server-id (string->number pid))) (else - (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat)))) (begin (if dbprep-found (begin (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) bad-dat)))))))) -;; ;; get a list of servers from the log files, with all relevant data -;; ;; ( mod-time host port start-time pid ) -;; ;; -;; (define (server:get-list areapath #!key (limit #f)) -;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) -;; (day-seconds (* 24 60 60))) -;; ;; if the directory exists continue to get the list -;; ;; otherwise attempt to create the logs dir and then -;; ;; continue -;; (if (if (directory-exists? (conc areapath "/logs")) -;; '() -;; (if (file-write-access? areapath) -;; (begin -;; (condition-case -;; (create-directory (conc areapath "/logs") #t) -;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) -;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) -;; (directory-exists? (conc areapath "/logs"))) -;; '())) -;; -;; ;; Get the list of server logs. -;; (let* ( -;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. -;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) -;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) -;; (num-serv-logs (length server-logs))) -;; (if (or (null? server-logs) (= num-serv-logs 0)) -;; (let () -;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) -;; '() -;; ) -;; (let loop ((hed (string-chomp (car server-logs))) -;; (tal (cdr server-logs)) -;; (res '())) -;; (let* ((mod-time (handle-exceptions -;; exn -;; (begin -;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) -;; (current-seconds)) ;; 0 -;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted -;; (down-time (- (current-seconds) mod-time)) -;; (serv-dat (if (or (< num-serv-logs 10) -;; (< down-time 900)) ;; day-seconds)) -;; (server:logf-get-start-info hed) -;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at -;; (serv-rec (cons mod-time serv-dat)) -;; (fmatch (string-match fname-rx hed)) -;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) -;; (new-res (if (null? serv-dat) -;; res -;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let -;; (if (null? tal) -;; (if (and limit -;; (> (length new-res) limit)) -;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work -;; new-res) -;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) - -#;(define (server:get-num-alive srvlst) - (let ((num-alive 0)) - (for-each - (lambda (server) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) - (match-let (((mod-time host port start-time server-id pid) - server)) - (let* ((uptime (- (current-seconds) mod-time)) - (runtime (if start-time - (- mod-time start-time) - 0))) - (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) - srvlst) - num-alive)) - -;; ;; given a list of servers get a list of valid servers, i.e. at least -;; ;; 10 seconds old, has started and is less than 1 hour old and is -;; ;; active (i.e. mod-time < 10 seconds -;; ;; -;; ;; mod-time host port start-time pid -;; ;; -;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; ;; and servers should stick around for about two hours or so. -;; ;; -;; (define (server:get-best srvlst) -;; (let* ((nums (server:get-num-servers)) -;; (now (current-seconds)) -;; (slst (sort -;; (filter (lambda (rec) -;; (if (and (list? rec) -;; (> (length rec) 2)) -;; (let ((start-time (list-ref rec 3)) -;; (mod-time (list-ref rec 0))) -;; ;; (print "start-time: " start-time " mod-time: " mod-time) -;; (and start-time mod-time -;; (> (- now start-time) 0) ;; been running at least 0 seconds -;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds -;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set -;; (< (- now start-time) -;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) -;; 180) -;; (random 360)))) ;; under one hour running time +/- 180 -;; )) -;; #f)) -;; srvlst) -;; (lambda (a b) -;; (< (list-ref a 3) -;; (list-ref b 3)))))) -;; (if (> (length slst) nums) -;; (take slst nums) -;; slst))) - -;; ;; switch from server:get-list to server:get-servers-info -;; ;; -;; (define (server:get-first-best areapath) -;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; (if (and srvrs -;; (not (null? srvrs))) -;; (car srvrs) -;; #f))) -;; -;; (define (server:get-rand-best areapath) -;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; (if (and (list? srvrs) -;; (not (null? srvrs))) -;; (let* ((len (length srvrs)) -;; (idx (random len))) -;; (list-ref srvrs idx)) -;; #f))) (define (server:record->id servr) (handle-exceptions exn (begin @@ -364,43 +242,10 @@ servr)) (if (and host port) (conc host ":" port) #f)))) - -;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. -;; if it is old enough, overwrite it and wait 0.25 seconds. -;; if it then has the wrong server key, wait + 1 and call this function recursively. -;; -#;(define (server:wait-for-server-start-last-flag areapath) - (let* ((start-flag (conc areapath "/logs/server-start-last")) - ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) - (server-key (conc (get-host-name) "-" (current-process-id)))) - (if (file-exists? start-flag) - (let* ((fmodtime (file-modification-time start-flag)) - (delta (- (current-seconds) fmodtime)) - (old-enough (> delta idletime)) - (new-server-key "")) - ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. - ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. - (if (and old-enough - (begin - (debug:print-info 2 *default-log-port* "Writing " start-flag) - (with-output-to-file start-flag (lambda () (print server-key))) - (thread-sleep! 0.25) - (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) - (equal? server-key new-server-key))) - #t - ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. - (begin - (debug:print-info 0 *default-log-port* "Gating server start, last start: " - (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) - - (thread-sleep! ( + 1 idletime)) - (server:wait-for-server-start-last-flag areapath))))))) - ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") @@ -564,170 +409,10 @@ exn (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) (delete-file sfile)))))) sfiles))) -;; would like to eventually get rid of this -;; -(define (common:on-homehost?) - (if (eq? (rmt:transport-mode) 'http) - (server:choose-server *toppath* 'home?) - #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work - -;; kind start up of server, wait before allowing another server for a given -;; area to be launched -;; -(define (server:kind-run areapath) - ;; look for $MT_RUN_AREA_HOME/logs/server-start-last - ;; and wait for it to be at least seconds old - ;; (server:wait-for-server-start-last-flag areapath) - (let loop () - (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) - (begin - (if (common:low-noise-print 30 "our-host-load") - (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) - (loop)))) - (if (< (server:choose-server areapath 'count) 20) - (server:run areapath)) - #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((lock-file (conc areapath "/logs/server-start.lock"))) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) - (common:simple-file-lock-and-wait lock-file expire-time: 25) - (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) - (system (conc "touch " start-flag)) ;; lazy but safe - (server:run areapath) - (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". - (common:simple-file-release-lock lock-file))) - (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) - -;; this one seems to be the general entry point -;; -(define (server:start-and-wait areapath #!key (timeout 60)) - (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-info (server:check-if-running areapath)) - (try-num 0)) - (if (or server-info - (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - (server:record->url server-info) - (let* ( (servers (server:choose-server areapath 'all-valid)) - (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) - (if (and (> try-num 0) ;; first time through simply wait a little while then try again - (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:run areapath)) - (thread-sleep! 5) - (loop (server:check-if-running areapath) - (+ try-num 1))))))) - -(define (server:get-num-servers #!key (numservers 2)) - (let ((ns (string->number - (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) - (or ns numservers))) - -;; no longer care if multiple servers are started by accident. older servers will drop off in time. -;; -(define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) - (if (or (and servers - (null? servers)) - (not servers)) - ;; (and (list? servers) - ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers - #f - (let loop ((hed (car servers)) - (tal (cdr servers))) - (let ((res (server:check-server hed))) - (if res - hed - (if (null? tal) - #f - (loop (car tal)(cdr tal))))))))) - -;; ping the given server -;; -(define (server:check-server server-record) - (let* ((server-url (server:record->url server-record)) - (server-id (server:record->id server-record)) - (res (server:ping server-url server-id))) - (if res - server-url - #f))) - -(define (server:kill servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) - #f) - (match-let (((hostname port start-time server-id pid) - servr)) - (tasks:kill-server hostname pid)))) - -;; ;; called in megatest.scm, host-port is string hostname:port -;; ;; -;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running -;; ;; in the same process as the server. -;; ;; -;; (define (server:ping host:port server-id #!key (do-exit #f)) -;; (let* ((host-port (cond -;; ((string? host:port) -;; (let ((slst (string-split host:port ":"))) -;; (if (eq? (length slst) 2) -;; (list (car slst)(string->number (cadr slst))) -;; #f))) -;; (else -;; #f)))) -;; (cond -;; ((and (list? host-port) -;; (eq? (length host-port) 2)) -;; (let* ((myrunremote (make-and-init-remote *toppath*)) -;; (iface (car host-port)) -;; (port (cadr host-port)) -;; (server-dat (client:connect iface port server-id myrunremote)) -;; (login-res (rmt:login-no-auto-client-setup myrunremote))) -;; (http-transport:close-connections myrunremote) -;; (if (and (list? login-res) -;; (car login-res)) -;; (begin -;; ;; (print "LOGIN_OK") -;; (if do-exit (exit 0)) -;; #t) -;; (begin -;; ;; (print "LOGIN_FAILED") -;; (if do-exit (exit 1)) -;; #f)))) -;; (else -;; (if host:port -;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) -;; (if do-exit -;; (exit 1) -;; #f))))) -;; -;; ;; run ping in separate process, safest way in some cases -;; ;; -;; (define (server:ping-server ifaceport) -;; (with-input-from-pipe -;; (conc (common:get-megatest-exe) " -ping " ifaceport) -;; (lambda () -;; (let loop ((inl (read-line)) -;; (res "NOREPLY")) -;; (if (eof-object? inl) -;; (case (string->symbol res) -;; ((NOREPLY) #f) -;; ((LOGIN_OK) #t) -;; (else #f)) -;; (loop (read-line) inl)))))) -;; -;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). -;; ;; -;; (define (server:login toppath) -;; (lambda (toppath) -;; (set! *db-last-access* (current-seconds)) ;; might not be needed. -;; (if (equal? *toppath* toppath) -;; #t -;; #f))) - ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; This is currently broken. Just use the number of hours with no unit. ;; Default is 600 seconds. ;; (define (server:expiration-timeout) @@ -875,86 +560,13 @@ (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) -(define (common:wait-for-homehost-load maxnormload msg) - (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... - (if (not *toppath*) - (begin - (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") - (thread-sleep! 30) - (if (< (- (current-seconds) start-time) 300) - (loop start-time))))) - (case (rmt:transport-mode) - ((http) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. - #f - (server:choose-server *toppath* 'homehost))) - (hh (if hh-dat (car hh-dat) #f))) - (common:wait-for-normalized-load maxnormload msg hh))) - (else - (common:wait-for-normalized-load maxnormload msg (get-host-name))))) - -;;====================================================================== -;; Force a megatest cleanup-db if version is changed and skip-version-check not specified -;; Do NOT check if not on homehost! -;; -(define (common:exit-on-version-changed) - (if (and *toppath* ;; do nothing if *toppath* not yet provided - (common:on-homehost?)) - (if (common:api-changed?) - (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db")) - (read-only (not (file-write-access? dbfile))) - (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) - (debug:print 0 *default-log-port* - "WARNING: Version mismatch!\n" - " expected: " (common:version-signature) "\n" - " got: " (common:get-last-run-version)) - (cond - ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) - (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db - (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - (exit 1)) - (common:cleanup-db dbstruct))) - ((not (common:file-exists? mtconf)) - (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " .mtdb/main.db does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own .mtdb/main.db in this area. Cannot proceed with megatest version migration.") - (exit 1)) - (read-only - (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") - (exit 1)) - (else - (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))))) -;;====================================================================== -;; (begin -;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") -;; (exit 1)))) - ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== - -;; (define (common:run-sync?) -;; (and *toppath* ;; gate if called before *toppath* is set -;; (common:on-homehost?) -;; (args:get-arg "-server"))) - (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) ;;(debug:print-info 13 *default-log-port* "got signal "signum) @@ -973,65 +585,38 @@ (set-signal-handler! signal/int std-signal-handler) ;; ^C (set-signal-handler! signal/term std-signal-handler) ;;====================================================================== -;; ideally put all this info into the db, no need to preserve it across moving homehost -;; -;; return list of -;; ( reachable? cpuload update-time ) -(define (common:get-host-info hostname) - (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data - (load (car loadinfo)) - (load-sample-time (cdr loadinfo)) - (load-sample-age (- (current-seconds) load-sample-time)) - (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds - (host-last-update-timeout-seconds 4) - (host-rec (hash-table-ref/default *host-loads* hostname #f)) - ) - (cond - ((< load-sample-age loadinfo-timeout-seconds) - (list #t - load-sample-time - load)) - ((and host-rec - (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) - (list #t - (host-last-update host-rec) - (host-last-cpuload host-rec ))) - ((common:unix-ping hostname) - (list #t - (current-seconds) - (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds - (else - (list #f 0 -1) ;; bad host, don't use! - )))) - -;;====================================================================== -;; see defstruct host at top of file. -;; host: reachable last-update last-used last-cpuload -;; -(define (common:update-host-loads-table hosts-raw) - (let* ((hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw))) - (for-each - (lambda (hostname) - (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - (host-info (common:get-host-info hostname)) - (is-reachable (car host-info)) - (last-reached-time (cadr host-info)) - (load (caddr host-info))) - (host-reachable-set! rec is-reachable) - (host-last-update-set! rec last-reached-time) - (host-last-cpuload-set! rec load))) - hosts))) +;; calculate a delay number based on a droop curve +;; inputs are: +;; - load-in, load as from uptime, NOT normalized +;; - numcpus, number of cpus, ideally use the real cpus, not threads +;; +(define (common:get-delay load-in numcpus) + (let* ((ratio (/ load-in numcpus)) + (new-option (configf:lookup *configdat* "load" "new-load-method")) + (paramstr (or (configf:lookup *configdat* "load" "exp-params") + "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) + (paramlst (map string->number (string-split paramstr)))) + (if new-option + (begin + (cond ((and (>= ratio 0) (< ratio .5)) + 0) + ((and (>= ratio 0.5) (<= ratio .9)) + (* ratio (/ 5 .9))) + ((and (> ratio .9) (<= ratio 1.1)) + (+ 5 (* (- ratio .9) (/ 55 .2)))) + ((> ratio 1.1) + 60))) + (match paramlst + ((r1 r2 s1 s2) + (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) + (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) + (else + (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) + 30))))) ;;====================================================================== ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load ;; count - count down to zero, at some point we'd give up if the load never drops ;; num-tries - count down to zero number tries to get numcpus @@ -1097,11 +682,11 @@ ;; overloaded and count expired (i.e. went to zero) (else (if (> num-tries 0) ;; should be "num-tries-left". (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " - effective-normalized-load " continuing.")) + normalized-effective-load " continuing.")) (debug:print 0 *default-log-port* "Load on " effective-host ", " first" could not be retrieved. Giving up and continuing.")))))) ;;====================================================================== ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load Index: tasksmod.scm ================================================================== --- tasksmod.scm +++ tasksmod.scm @@ -1108,7 +1108,637 @@ (begin (debug:print 0 *default-log-port* "ERROR: unable to create an area record") #f))))) +;;====================================================================== +;; see defstruct host at top of file. +;; host: reachable last-update last-used last-cpuload +;; +(define (common:update-host-loads-table hosts-raw) + (let* ((hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw))) + (for-each + (lambda (hostname) + (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h)))) + (host-info (common:get-host-info hostname)) + (is-reachable (car host-info)) + (last-reached-time (cadr host-info)) + (load (caddr host-info))) + (host-reachable-set! rec is-reachable) + (host-last-update-set! rec last-reached-time) + (host-last-cpuload-set! rec load))) + hosts))) + +;;====================================================================== +;; ideally put all this info into the db, no need to preserve it across moving homehost +;; +;; return list of +;; ( reachable? cpuload update-time ) +(define (common:get-host-info hostname) + (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data + (load (car loadinfo)) + (load-sample-time (cdr loadinfo)) + (load-sample-age (- (current-seconds) load-sample-time)) + (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds + (host-last-update-timeout-seconds 4) + (host-rec (hash-table-ref/default *host-loads* hostname #f)) + ) + (cond + ((< load-sample-age loadinfo-timeout-seconds) + (list #t + load-sample-time + load)) + ((and host-rec + (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) + (list #t + (host-last-update host-rec) + (host-last-cpuload host-rec ))) + ((common:unix-ping hostname) + (list #t + (current-seconds) + (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds + (else + (list #f 0 -1) ;; bad host, don't use! + )))) + +;;====================================================================== +;; Force a megatest cleanup-db if version is changed and skip-version-check not specified +;; Do NOT check if not on homehost! +;; +(define (common:exit-on-version-changed) + (if (and *toppath* ;; do nothing if *toppath* not yet provided + (rmt:on-homehost?)) + (if (common:api-changed?) + (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db")) + (read-only (not (file-write-access? dbfile))) + (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) + (debug:print 0 *default-log-port* + "WARNING: Version mismatch!\n" + " expected: " (common:version-signature) "\n" + " got: " (common:get-last-run-version)) + (cond + ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) + ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) + (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db + (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain (current-error-port)) + (exit 1)) + (common:cleanup-db dbstruct))) + ((not (common:file-exists? mtconf)) + (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (common:file-exists? dbfile)) + (debug:print 0 *default-log-port* " .mtdb/main.db does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (eq? (current-user-id)(file-owner mtconf))) + (debug:print 0 *default-log-port* " You do not own .mtdb/main.db in this area. Cannot proceed with megatest version migration.") + (exit 1)) + (read-only + (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") + (exit 1)) + (else + (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") + (exit 1))))))) +;;====================================================================== +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") +;; (exit 1)))) + +(define (common:wait-for-homehost-load maxnormload msg) + (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... + (if (not *toppath*) + (begin + (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") + (thread-sleep! 30) + (if (< (- (current-seconds) start-time) 300) + (loop start-time))))) + (case (rmt:transport-mode) + ((http) + (let* ((hh-dat (if (rmt:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. + #f + (server:choose-server *toppath* 'homehost))) + (hh (if hh-dat (car hh-dat) #f))) + (common:wait-for-normalized-load maxnormload msg hh))) + (else + (common:wait-for-normalized-load maxnormload msg (get-host-name))))) + + +(define (configf:write-alist cdat fname) + (if (not (common:faux-lock fname)) + (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) + (let* ((dat (configf:config->alist cdat)) + (res + (begin + (with-output-to-file fname ;; first write out the file + (lambda () + (pp dat))) + + (if (common:file-exists? fname) ;; now verify it is readable + (if (configf:read-alist fname) + #t ;; data is good. + (begin + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn) + #f) + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + (delete-file fname)) + #f)) + #f)))) + (common:faux-unlock fname) + res)) + +;;====================================================================== +;; faux-lock is deprecated. Please use simple-lock below +;; +(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) + (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count + (if (> wait-time 0) + (begin + (thread-sleep! 1) + (if (eq? wait-time 1) ;; only one second left, steal the lock + (begin + (debug:print-info 0 *default-log-port* "stealing lock for " keyname) + (common:faux-unlock keyname force: #t))) + (common:faux-lock keyname wait-time: (- wait-time 1))) + #f) + (begin + (rmt:no-sync-set keyname (conc (current-process-id))) + (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) + +(define (common:faux-unlock keyname #!key (force #f)) + (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) + (begin + (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) + #t) + #f)) + +;;====================================================================== +;; simple lock. improve and converge on this one. +;; +(define (common:simple-lock keyname) + (rmt:no-sync-get-lock keyname)) + +(define (common:simple-unlock keyname #!key (force #f)) + (rmt:no-sync-del! keyname)) + +;; returns waitons waitors tconfigdat +;; +(define (tests:get-waitons test-name all-tests-registry global-waitons) + (let* ((config (tests:get-testconfig test-name #f all-tests-registry 'return-procs))) ;; assuming no problems with immediate evaluation, this could be simplified ('return-procs -> #t) + (let ((instr (if config + (configf:lookup config "requirements" "waiton") + (begin ;; No config means this is a non-existant test + (debug:print-error 0 *default-log-port* "non-existent required test \"" test-name "\"") + (exit 1)))) + (instr2 (if config + (configf:lookup config "requirements" "waitor") + ""))) + (debug:print-info 8 *default-log-port* "waitons string is " instr ", waitors string is " instr2) + (let* ((newwaitons-tmp + (string-split (cond + ((procedure? instr) ;; here + (let ((res (instr))) + (debug:print-info 8 *default-log-port* "waiton procedure results in string " res " for test " test-name) + res)) + ((string? instr) instr) + (else + ;; NOTE: This is actually the case of *no* waitons! ;; (debug:print-error 0 *default-log-port* "something went wrong in processing waitons for test " test-name) + "")))) + (newwaitors + (string-split (cond + ((procedure? instr2) + (let ((res (instr2))) + (debug:print-info 8 *default-log-port* "waitor procedure results in string " res " for test " test-name) + res)) + ((string? instr2) instr2) + (else + ;; NOTE: This is actually the case of *no* waitons! ;; (debug:print-error 0 *default-log-port* "something went wrong in processing waitons for test " test-name) + "")))) + (newwaitons (if (and (list? global-waitons) + (not (null? global-waitons))) + (begin + (debug:print 0 *default-log-port* "Adding global waitons " global-waitons) + (append newwaitons-tmp (filter (lambda (x) ;; remove self from global waitons + (not (equal? x test-name))) + global-waitons))) + newwaitons-tmp))) + (values + ;; the waitons + (filter (lambda (x) + (if (hash-table-ref/default all-tests-registry x #f) + #t + (begin + (debug:print-error 0 *default-log-port* "test " test-name " has unrecognised waiton testname " x) + #f))) + newwaitons) + (filter (lambda (x) + (if (hash-table-ref/default all-tests-registry x #f) + #t + (begin + (debug:print-error 0 *default-log-port* "test " test-name " has unrecognised waiton testname " x) + #f))) + newwaitors) + config))))) + +;; Check for waiver eligibility +;; +(define (tests:check-waiver-eligibility testdat prev-testdat) + (let* ((test-registry (make-hash-table)) + (testconfig (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f)) + (test-rundir ;; (sdb:qry 'passstr + (db:test-get-rundir testdat)) ;; ) + (prev-rundir ;; (sdb:qry 'passstr + (db:test-get-rundir prev-testdat)) ;; ) + (waivers (if testconfig (configf:section-vars testconfig "waivers") '())) + (waiver-rx (regexp "^(\\S+)\\s+(.*)$")) + (diff-rule "diff %file1% %file2%") + (logpro-rule "diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html")) + (if (not (common:file-exists? test-rundir)) + (begin + (debug:print-error 0 *default-log-port* "test run directory is gone, cannot propagate waiver") + #f) + (begin + (push-directory test-rundir) + (let ((result (if (null? waivers) + #f + (let loop ((hed (car waivers)) + (tal (cdr waivers))) + (debug:print 0 *default-log-port* "INFO: Applying waiver rule \"" hed "\"") + (let* ((waiver (configf:lookup testconfig "waivers" hed)) + (wparts (if waiver (string-match waiver-rx waiver) #f)) + (waiver-rule (if wparts (cadr wparts) #f)) + (waiver-glob (if wparts (caddr wparts) #f)) + (logpro-file (if waiver + (let ((fname (conc hed ".logpro"))) + (if (common:file-exists? fname) + fname + (begin + (debug:print 0 *default-log-port* "INFO: No logpro file " fname " falling back to diff") + #f))) + #f)) + ;; if rule by name of waiver-rule is found in testconfig - use it + ;; else if waivername.logpro exists use logpro-rule + ;; else default to diff-rule + (rule-string (let ((rule (configf:lookup testconfig "waiver_rules" waiver-rule))) + (if rule + rule + (if logpro-file + logpro-rule + (begin + (debug:print 0 *default-log-port* "INFO: No logpro file " logpro-file " found, using diff rule") + diff-rule))))) + ;; (string-substitute "%file1%" "foofoo.txt" "This is %file1% and so is this %file1%." #t) + (processed-cmd (string-substitute + "%file1%" (conc test-rundir "/" waiver-glob) + (string-substitute + "%file2%" (conc prev-rundir "/" waiver-glob) + (string-substitute + "%waivername%" hed rule-string #t) #t) #t)) + (res #f)) + (debug:print 0 *default-log-port* "INFO: waiver command is \"" processed-cmd "\"") + (if (eq? (system processed-cmd) 0) + (if (null? tal) + #t + (loop (car tal)(cdr tal))) + #f)))))) + (pop-directory) + result))))) + + +;; if .testconfig exists in test directory read and return it +;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" +;; else read the testconfig file +;; if have path to test directory save the config as .testconfig and return it +;; +(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)(wait-a-minute #f)) + (let* ((use-cache (common:use-cache?)) + (cache-path (tests:get-test-path-from-environment)) + (cache-file (and cache-path (conc cache-path "/.testconfig"))) + (cache-exists (and cache-file + (not force-create) ;; if force-create then pretend there is no cache to read + (common:file-exists? cache-file))) + (cached-dat (if (and (not force-create) + cache-exists + use-cache) + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "failed to read " cache-file ", exn=" exn) + #f) ;; any issues, just give up with the cached version and re-read + (configf:read-alist cache-file)) + #f)) + (test-full-name (if (and item-path (not (string-null? item-path))) + (conc test-name "/" item-path) + test-name))) + (if cached-dat + cached-dat + (let ((dat (hash-table-ref/default *testconfigs* test-full-name #f))) + (if (and dat ;; have a locally cached version + (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data? + dat + ;; no cached data available + (let* ((treg (or test-registry + (tests:get-all))) + (test-path (or (hash-table-ref/default treg test-name #f) + (let* ((local-tcdir (conc (getenv "MT_LINKTREE") "/" + (getenv "MT_TARGET") "/" + (getenv "MT_RUNNAME") "/" + test-name "/" item-path)) + (local-tcfg (conc local-tcdir "/testconfig"))) + (if (common:file-exists? local-tcfg) + local-tcdir + #f)) + (conc *toppath* "/tests/" test-name))) + (test-configf (conc test-path "/testconfig")) + (testexists (let loopa ((tries-left 30)) + (cond + ( + (and (common:file-exists? test-configf)(file-read-access? test-configf)) + #t) + ( + (common:file-exists? test-configf) + (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf) + #f) + ( + (and wait-a-minute (> tries-left 0)) + (thread-sleep! 10) + (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds. Tries left: "tries-left) ;; BB: this fires + (loopa (sub1 tries-left))) + (else + (debug:print 2 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires + #f)))) + (tcfg (if testexists + (read-config test-configf #f system-allowed + environ-patt: (if system-allowed + "pre-launch-env-vars" + #f)) + #f))) + (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data + (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) + (if (and testexists + cache-file + (file-write-access? cache-path) + allow-write-cache) + (let ((tpath (conc cache-path "/.testconfig"))) + (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) + (if (and tcfg (not (common:in-running-test?))) + (configf:write-alist tcfg tpath)))) + tcfg)))))) + +;;====================================================================== +;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the +;; [host-rules] section. +;; +(define (common:get-least-loaded-host hosts-raw host-type configdat) + (let* ((rdat (configf:lookup configdat "host-rules" host-type)) + (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate + (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load + (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs + (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second + (hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw)) + ;; (best-host #f) + (get-rec (lambda (hostname) + ;; (print "get-rec hostname=" hostname) + (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h))))) + (best-load 99999) + (curr-time (current-seconds)) + (get-hosts-sorted (lambda (hosts) + (sort hosts (lambda (a b) + (let ((a-rec (get-rec a)) + (b-rec (get-rec b))) + ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) + ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) + (< (host-last-used a-rec) + (host-last-used b-rec)))))))) + (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) + (if (null? hosts) + #f ;; no hosts to select from. All done and giving up now. + (let ((hosts-sorted (get-hosts-sorted hosts))) + (common:update-host-loads-table hosts) + (let loop ((hostname (car hosts-sorted)) + (tal (cdr hosts-sorted)) + (best-host #f)) + (let* ((rec (get-rec hostname)) + (reachable (host-reachable rec)) + (load (host-last-cpuload rec)) + (last-used (host-last-used rec)) + (delta (- curr-time last-used)) + (job-rate (if (> delta 0) + (/ 1 delta) + 999)) ;; jobs per second + (new-best + (cond + ((not reachable) + (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") + best-host) + ((and (< load maxnload) ;; load is acceptable + (< job-rate maxjobrate)) ;; job rate is acceptable + (set! best-load load) + hostname) + (else best-host)))) + (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) + (if new-best + (begin ;; found a host, return it + (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) + (host-last-used-set! rec curr-time) + new-best) + (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) + +;;====================================================================== +;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S +;;====================================================================== +;; +;; [hosts] +;; arm cubie01 cubie02 +;; x86_64 zeus xena myth01 +;; allhosts #{g hosts arm} #{g hosts x86_64} +;; +;; [host-types] +;; general #MTLOWESTLOAD #{g hosts allhosts} +;; arm #MTLOWESTLOAD #{g hosts arm} +;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; [host-rules] +;; # maxnload => max normalized load +;; # maxnjobs => max jobs per cpu +;; # maxjobrate => max jobs per second +;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 +;; +;; [launchers] +;; envsetup general +;; xor/%/n 4C16G +;; % nbgeneral +;; +;; [jobtools] +;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. +;; flexi-launcher yes +;; launcher nbfake +;; +(define (common:get-launcher configdat testname itempath) + (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) + (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher + (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) + (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) + (if (null? launchers) + fallback-launcher + (let loop ((hed (car launchers)) + (tal (cdr launchers))) + (let ((patt (car hed)) + (host-type (cadr hed))) + (if (tests:match patt testname itempath) + (begin + (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) + (let ((launcher (configf:lookup configdat "host-types" host-type))) + (if launcher + (let* ((launcher-parts (string-split launcher)) + (launcher-exe (car launcher-parts))) + (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline + (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) + (count 100)) + (if targ-host + (conc "remrun " targ-host) + (if (> count 0) + (begin + (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) + (thread-sleep! (- 101 count)) + (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) + (- count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) + (exit))))) + launcher)) + (begin + (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal))))))) + ;; no match, try again + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal)))))))) + fallback-launcher))) + +;; Do not rpc this one, do the underlying calls!!! +(define (tests:test-set-status! run-id test-id state status comment dat #!key (work-area #f)) + (let* ((real-status status) + (otherdat (if dat dat (make-hash-table))) + (testdat (rmt:get-test-info-by-id run-id test-id)) + (test-name (db:test-get-testname testdat)) + (item-path (db:test-get-item-path testdat)) + ;; before proceeding we must find out if the previous test (where all keys matched except runname) + ;; was WAIVED if this test is FAIL + + ;; NOTES: + ;; 1. Is the call to test:get-previous-run-record remotified? + ;; 2. Add test for testconfig waiver propagation control here + ;; + (prev-test (if (equal? status "FAIL") + (rmt:get-previous-test-run-record run-id test-name item-path) + #f)) + (waived (if prev-test + (if prev-test ;; true if we found a previous test in this run series + (let ((prev-status (db:test-get-status prev-test)) + (prev-state (db:test-get-state prev-test)) + (prev-comment (db:test-get-comment prev-test))) + (debug:print 4 *default-log-port* "prev-status " prev-status ", prev-state " prev-state ", prev-comment " prev-comment) + (if (and (equal? prev-state "COMPLETED") + (equal? prev-status "WAIVED")) + (if comment + comment + prev-comment) ;; waived is either the comment or #f + #f)) + #f) + #f))) + (if (and waived + (tests:check-waiver-eligibility testdat prev-test)) + (set! real-status "WAIVED")) + + (debug:print 4 *default-log-port* "real-status " real-status ", waived " waived ", status " status) + + ;; update the primary record IF state AND status are defined + (if (and state status) + (begin + (rmt:set-state-status-and-roll-up-items run-id test-id item-path state real-status (if waived waived comment)) + ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-state-status + )) + + ;; if status is "AUTO" then call rollup (note, this one modifies data in test + ;; run area, it does remote calls under the hood. + ;; (if (and test-id state status (equal? status "AUTO")) + ;; (rmt:test-data-rollup run-id test-id status)) + + ;; add metadata (need to do this way to avoid SQL injection issues) + + ;; :first_err + ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f))) + ;; (if val + ;; (sqlite3:execute db "UPDATE tests SET first_err=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) + ;; + ;; ;; :first_warn + ;; (let ((val (hash-table-ref/default otherdat ":first_warn" #f))) + ;; (if val + ;; (sqlite3:execute db "UPDATE tests SET first_warn=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) + + (let ((category (hash-table-ref/default otherdat ":category" "")) + (variable (hash-table-ref/default otherdat ":variable" "")) + (value (hash-table-ref/default otherdat ":value" #f)) + (expected (hash-table-ref/default otherdat ":expected" "n/a")) + (tol (hash-table-ref/default otherdat ":tol" "n/a")) + (units (hash-table-ref/default otherdat ":units" "")) + (type (hash-table-ref/default otherdat ":type" "")) + (dcomment (hash-table-ref/default otherdat ":comment" ""))) + (debug:print 4 *default-log-port* + "category: " category ", variable: " variable ", value: " value + ", expected: " expected ", tol: " tol ", units: " units) + (if (and value) ;; require only value; BB was- all three required + (let ((dat (conc category "," + variable "," + value "," + expected "," + tol "," + units "," + dcomment ",," ;; extra comma for status + type ))) + ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. + (rmt:csv->test-data run-id test-id + dat) + ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start" + ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue. + (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :) + ;; BB - commentiong out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. + ))) + + ;; need to update the top test record if PASS or FAIL and this is a subtest + ;;;;;; (if (not (equal? item-path "")) + ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;) + + (if (or (and (string? comment) + (string-match (regexp "\\S+") comment)) + waived) + (let ((cmt (if waived waived comment))) + (rmt:general-call 'set-test-comment run-id cmt test-id))))) + + ) Index: testsmod.scm ================================================================== --- testsmod.scm +++ testsmod.scm @@ -107,14 +107,14 @@ ))) (import directory-utils debugprint - commonmod - configfmod - dbmod - dbfile + ;; commonmod + ;; configfmod + ;; dbmod + ;; dbfile ;; megatestmod ) ) ADDED utils/run-plot.sh Index: utils/run-plot.sh ================================================================== --- /dev/null +++ utils/run-plot.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +IGNORE_UNITS=portlogger,stml2,debugprint,mtargs,ods + +FILES=$(ls *mod.scm|grep -v import) + +if [[ utils/plot-uses.scm -nt utils/plot-uses ]];then + oldcsc csc utils/plot-uses.scm +fi + +./utils/plot-uses todot $IGNORE_UNITS $FILES > unitdeps.dot +dot unitdeps.dot -Tpdf -o unitdeps.pdf