Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -102,11 +102,12 @@ (list-ref params 4) ; comment )) ((delete-test-records) (apply db:delete-test-records dbstruct params)) ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) - ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) + ((test-set-state-status) (apply db:test-set-state-status dbstruct params) + ) ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) ((set-state-status-and-roll-up-run) (apply db:set-state-status-and-roll-up-run dbstruct params)) ((top-test-set-per-pf-counts) (apply db:top-test-set-per-pf-counts dbstruct params)) ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -75,5 +75,1154 @@ (cond ;; verbosity arg ((args:get-arg "-q") 'v) ((args:get-arg "-q") 'q) (else #f)))) +;;====================================================================== +;; L O C K I N G M E C H A N I S M S +;;====================================================================== + +;; faux-lock is deprecated. Please use simple-lock below +;; +(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) + (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count + (if (> wait-time 0) + (begin + (thread-sleep! 1) + (if (eq? wait-time 1) ;; only one second left, steal the lock + (begin + (debug:print-info 0 *default-log-port* "stealing lock for " keyname) + (common:faux-unlock keyname force: #t))) + (common:faux-lock keyname wait-time: (- wait-time 1))) + #f) + (begin + (rmt:no-sync-set keyname (conc (current-process-id))) + (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) + +(define (common:faux-unlock keyname #!key (force #f)) + (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) + (begin + (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) + #t) + #f)) + +;; simple lock. improve and converge on this one. +;; +(define (common:simple-lock keyname) + (rmt:no-sync-get-lock keyname)) + +(define (common:simple-unlock keyname #!key (force #f)) + (rmt:no-sync-del! keyname)) + + +(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f)) + (let* ((pre-cmd (dtests:get-pre-command)) + (post-cmd (dtests:get-post-command)) + (fullcmd (if (or pre-cmd post-cmd) + (conc pre-cmd cmd post-cmd) + (conc "viewscreen " cmd)))) + (debug:print-info 02 *default-log-port* "Running command: " fullcmd) + (cond + (with-vars (common:without-vars fullcmd)) + (with-orig-env (common:with-orig-env fullcmd)) + (else (common:without-vars fullcmd "MT_.*"))))) + +;; ideally put all this info into the db, no need to preserve it across moving homehost +;; +;; return list of +;; ( reachable? cpuload update-time ) +(define (common:get-host-info hostname) + (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data + (load (car loadinfo)) + (load-sample-time (cdr loadinfo)) + (load-sample-age (- (current-seconds) load-sample-time)) + (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds + (host-last-update-timeout-seconds 4) + (host-rec (hash-table-ref/default *host-loads* hostname #f)) + ) + (cond + ((< load-sample-age loadinfo-timeout-seconds) + (list #t + load-sample-time + load)) + ((and host-rec + (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) + (list #t + (host-last-update host-rec) + (host-last-cpuload host-rec ))) + ((common:unix-ping hostname) + (list #t + (current-seconds) + (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds + (else + (list #f 0 -1) ;; bad host, don't use! + )))) + +;; see defstruct host at top of file. +;; host: reachable last-update last-used last-cpuload +;; +(define (common:update-host-loads-table hosts-raw) + (let* ((hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw))) + (for-each + (lambda (hostname) + (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h)))) + (host-info (common:get-host-info hostname)) + (is-reachable (car host-info)) + (last-reached-time (cadr host-info)) + (load (caddr host-info))) + (host-reachable-set! rec is-reachable) + (host-last-update-set! rec last-reached-time) + (host-last-cpuload-set! rec load))) + hosts))) + +;; hash-table tree to html list tree +;; +;; tipfunc takes two parameters: y the tip value and path the path to that point +;; +(define (common:htree->html ht path tipfunc) + (let ((datlist (sort (hash-table->alist ht) + (lambda (a b) + (string< (car a)(car b)))))) + (if (null? datlist) + (tipfunc #f path) ;; really shouldn't get here + (s:ul + (map (lambda (x) + (let* ((levelname (car x)) + (y (cdr x)) + (newpath (append path (list levelname))) + (leaf (or (not (hash-table? y)) + (null? (hash-table-keys y))))) + (if leaf + (s:li (tipfunc y newpath)) + (s:li + (list + levelname + (common:htree->html y newpath tipfunc)))))) + datlist))))) + +;; logic for getting homehost. Returns (host . at-home) +;; IF *toppath* is not set, wait up to five seconds trying every two seconds +;; (this is to accomodate the watchdog) +;; +(define (common:get-homehost #!key (trynum 5)) + ;; called often especially at start up. use mutex to eliminate collisions + (mutex-lock! *homehost-mutex*) + (cond + (*home-host* + (mutex-unlock! *homehost-mutex*) + *home-host*) + ((not *toppath*) + (mutex-unlock! *homehost-mutex*) + (launch:setup) ;; safely mutexed now + (if (> trynum 0) + (begin + (thread-sleep! 2) + (common:get-homehost trynum: (- trynum 1))) + #f)) + (else + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost)) + ;; first look in config, then look in file .homehost, create it if not found + (homehost (or (configf:lookup *configdat* "server" "homehost" ) + (handle-exceptions + exn + (if (> trynum 0) + (let ((delay-time (* (- 5 trynum) 5))) + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " + delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) + ", exn=" exn) + (thread-sleep! delay-time) + (common:get-homehost trynum: (- trynum 1))) + (begin + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) + "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " + ((condition-property-accessor 'exn 'message) exn)) + (exit 1))) + (let ((hhf (conc *toppath* "/.homehost"))) + (if (common:file-exists? hhf) + (with-input-from-file hhf read-line) + (if (file-write-access? *toppath*) + (begin + (with-output-to-file hhf + (lambda () + (print bestadrs))) + (begin + (mutex-unlock! *homehost-mutex*) + (car (common:get-homehost)))) + #f)))))) + (at-home (or (equal? homehost currhost) + (equal? homehost bestadrs)))) + (set! *home-host* (cons homehost at-home)) + (mutex-unlock! *homehost-mutex*) + *home-host*)))) + +;; am I on the homehost? +;; +(define (common:on-homehost?) + (let ((hh (common:get-homehost))) + (if hh + (cdr hh) + #f))) + +;;====================================================================== +;; D A S H B O A R D U S E R V I E W S +;;====================================================================== + +;; first read ~/views.config if it exists, then read $MTRAH/views.config if it exists +;; +(define (common:load-views-config) + (let* ((view-cfgdat (make-hash-table)) + (home-cfgfile (conc (get-environment-variable "HOME") "/.mtviews.config")) + (mthome-cfgfile (conc *toppath* "/.mtviews.config"))) + (if (common:file-exists? mthome-cfgfile) + (read-config mthome-cfgfile view-cfgdat #t)) + ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas + (if (common:file-exists? home-cfgfile) + (read-config home-cfgfile view-cfgdat #t)) + view-cfgdat)) + +;;====================================================================== +;; T A R G E T S , S T A T E , S T A T U S , +;; R U N N A M E A N D T E S T P A T T +;;====================================================================== + +;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t)))) +;; +(define (common:get-runconfig-targets #!key (configf #f)) + (let ((targs (sort (map car (hash-table->alist + (or configf ;; NOTE: There is no value in using runconfig:read here. + (read-config (conc *toppath* "/runconfigs.config") + #f #t) + (make-hash-table)))) + stringsymbol force-setting) #f)) + (force-result (case force-type + ((#f) #f) + ((always) #t) + ((test) (if (args:get-arg "-execute") ;; we are in a test + #t + #f)) + (else + (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") + #t)))) ;; default to requiring server + (if force-result + (begin + (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".") + #t) + #f))) + +(define (common:in-running-test?) + (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO"))) + +(define (debug:setup debug-arg verbose-arg) ;; debug-arg= #f, #t or 'noprop + (let ((debugstr (or debug-arg ;; (args:get-arg "-debug") + ;; (args:get-arg "-debug-noprop") + (get-environment-variable "MT_DEBUG_MODE")))) + (debug:calc-verbosity debugstr verbose-arg) + ;; (debug:check-verbosity *verbosity* debugstr) + ;; if we were handed a bad verbosity rule then we will override it with 1 and continue + (if (not (verbosity))(set! (verbosity) 1)) + (if (and (not (eq? debug-arg 'noprop)) + (or debug-arg + (not (get-environment-variable "MT_DEBUG_MODE")))) + (setenv "MT_DEBUG_MODE" (if (list? (verbosity)) + (string-intersperse (map conc (verbosity)) ",") + (conc (verbosity))))))) + +;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the +;; [host-rules] section. +;; +(define (common:get-least-loaded-host hosts-raw host-type configdat) + (let* ((rdat (configf:lookup configdat "host-rules" host-type)) + (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate + (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load + (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs + (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second + (hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw)) + ;; (best-host #f) + (get-rec (lambda (hostname) + ;; (print "get-rec hostname=" hostname) + (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h))))) + (best-load 99999) + (curr-time (current-seconds)) + (get-hosts-sorted (lambda (hosts) + (sort hosts (lambda (a b) + (let ((a-rec (get-rec a)) + (b-rec (get-rec b))) + ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) + ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) + (< (host-last-used a-rec) + (host-last-used b-rec)))))))) + (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) + (if (null? hosts) + #f ;; no hosts to select from. All done and giving up now. + (let ((hosts-sorted (get-hosts-sorted hosts))) + (common:update-host-loads-table hosts) + (let loop ((hostname (car hosts-sorted)) + (tal (cdr hosts-sorted)) + (best-host #f)) + (let* ((rec (get-rec hostname)) + (reachable (host-reachable rec)) + (load (host-last-cpuload rec)) + (last-used (host-last-used rec)) + (delta (- curr-time last-used)) + (job-rate (if (> delta 0) + (/ 1 delta) + 999)) ;; jobs per second + (new-best + (cond + ((not reachable) + (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") + best-host) + ((and (< load maxnload) ;; load is acceptable + (< job-rate maxjobrate)) ;; job rate is acceptable + (set! best-load load) + hostname) + (else best-host)))) + (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) + (if new-best + (begin ;; found a host, return it + (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) + (host-last-used-set! rec curr-time) + new-best) + (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) + +;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' +(define (common:get-disks #!key (configf #f)) + (hash-table-ref/default + (or configf (read-config "megatest.config" #f #t)) + "disks" '("none" ""))) + + +;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp +;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) +;; +(define (common:readonly-watchdog dbstruct) + (thread-sleep! 0.05) ;; delay for startup + (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.") + ;; sync megatest.db to /tmp/.../megatst.db + (let* ((sync-cool-off-duration 3) + (golden-mtdb (dbr:dbstruct-mtdb dbstruct)) + (golden-mtpath (db:dbdat-get-path golden-mtdb)) + (tmp-mtdb (dbr:dbstruct-tmpdb dbstruct)) + (tmp-mtpath (db:dbdat-get-path tmp-mtdb))) + (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.") + (let loop ((last-sync-time 0)) + (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath) + (let* ((duration-since-last-sync (- (current-seconds) last-sync-time))) + (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync) + (if (and (not *time-to-exit*) + (< duration-since-last-sync sync-cool-off-duration)) + (thread-sleep! (- sync-cool-off-duration duration-since-last-sync))) + (if (not *time-to-exit*) + (let ((golden-mtdb-mtime (file-modification-time golden-mtpath)) + (tmp-mtdb-mtime (file-modification-time tmp-mtpath))) + (if (> golden-mtdb-mtime tmp-mtdb-mtime) + (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back + (let ((res (db:multi-db-sync dbstruct 'old2new))) + (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred.")))) + (loop (current-seconds))) + #t))) + (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath))) + + +;; Force a megatest cleanup-db if version is changed and skip-version-check not specified +;; Do NOT check if not on homehost! +;; +(define (common:exit-on-version-changed) + (if (common:on-homehost?) + (if (common:api-changed?) + (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) + (read-only (not (file-write-access? dbfile))) + (dbstruct (db:setup #t))) + (debug:print 0 *default-log-port* + "WARNING: Version mismatch!\n" + " expected: " (common:version-signature) "\n" + " got: " (common:get-last-run-version)) + (cond + ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) + ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) + (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db + (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain (current-error-port)) + (exit 1)) + (common:cleanup-db dbstruct))) + ((not (common:file-exists? mtconf)) + (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (common:file-exists? dbfile)) + (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (eq? (current-user-id)(file-owner mtconf))) + (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") + (exit 1)) + (read-only + (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") + (exit 1)) + (else + (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") + (exit 1))))))) +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") +;; (exit 1)))) + + +;; Rotate logs, logic: +;; if > 500k and older than 1 week: +;; remove previous compressed log and compress this log +;; WARNING: This proc operates assuming that it is in the directory above the +;; logs directory you wish to log-rotate. +;; +(define (common:rotate-logs) + (let* ((all-files (make-hash-table)) + (stats (make-hash-table)) + (inc-stat (lambda (key) + (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) + (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age + (if (not (directory-exists? "logs"))(create-directory "logs")) + (directory-fold + (lambda (file rem) + (handle-exceptions + exn + (begin + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) + (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (print-call-chain (current-error-port)) ;; + ) + (let* ((fullname (conc "logs/" file)) + (mod-time (file-modification-time fullname)) + (file-age (- (current-seconds) mod-time)) + (file-old (> file-age (* 48 60 60))) + (file-big (> (file-size fullname) 200000))) + (hash-table-set! all-files file mod-time) + (if (or (and (string-match "^.*.log" file) + file-old + file-big) + (and (string-match "^server-.*.log" file) + file-old)) + (let ((gzfile (conc fullname ".gz"))) + (if (common:file-exists? gzfile) + (begin + (debug:print-info 0 *default-log-port* "removing " gzfile) + (delete-file* gzfile) + (hash-table-delete! all-files gzfile) ;; needed? + )) + (debug:print-info 0 *default-log-port* "compressing " file) + (system (conc "gzip " fullname)) + (inc-stat "gzipped") + (hash-table-set! all-files (conc file ".gz") file-age) ;; add the .gz file and remove the base file + (hash-table-delete! all-files file) + ) + (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) + (file-exists? fullname)) ;; just in case it was gzipped - will get it next time + (handle-exceptions + exn + #f + (if (directory? fullname) + (begin + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (inc-stat "directories")) + (begin + (delete-file* fullname) + (inc-stat "deleted"))) + (hash-table-delete! all-files file))))))) + '() + "logs") + (for-each + (lambda (category) + (let ((quant (hash-table-ref/default stats category 0))) + (if (> quant 0) + (debug:print-info 0 *default-log-port* category " log files: " quant)))) + `("deleted" "gzipped" "directories")) + (let ((num-logs (hash-table-size all-files))) + (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300 + (let ((files (take (sort (hash-table-keys all-files) + (lambda (a b) + (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) + (- num-logs max-allowed)))) + (for-each + (lambda (file) + (let* ((fullname (conc "logs/" file))) + (if (directory? fullname) + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (handle-exceptions + exn + (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) + (delete-file* fullname))))) + files) + (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) + +;; calculate a delay number based on a droop curve +;; inputs are: +;; - load-in, load as from uptime, NOT normalized +;; - numcpus, number of cpus, ideally use the real cpus, not threads +;; +(define (common:get-delay load-in numcpus) + (let* ((ratio (/ load-in numcpus)) + (new-option (configf:lookup *configdat* "load" "new-load-method")) + (paramstr (or (configf:lookup *configdat* "load" "exp-params") + "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) + (paramlst (map string->number (string-split paramstr)))) + (if new-option + (begin + (cond ((and (>= ratio 0) (< ratio .5)) + 0) + ((and (>= ratio 0.5) (<= ratio .9)) + (* ratio (/ 5 .9))) + ((and (> ratio .9) (<= ratio 1.1)) + (+ 5 (* (- ratio .9) (/ 55 .2)))) + ((> ratio 1.1) + 60))) + (match paramlst + ((r1 r2 s1 s2) + (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) + (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) + (else + (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) + 30))))) + +(define (common:print-delay-table) + (let loop ((x 0)) + (print x "," (common:get-delay x 1)) + (if (< x 2) + (loop (+ x 0.1))))) + +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; count - count down to zero, at some point we'd give up if the load never drops +;; num-tries - count down to zero number tries to get numcpus +;; +(define (common:wait-for-cpuload maxnormload numcpus-in + #!key (count 1000) + (msg #f)(remote-host #f)(num-tries 5)) + (let* ((loadavg (common:get-cpu-load remote-host)) + ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (numcpus (if (<= 1 numcpus-in) + (common:get-num-cpus remote-host) + numcpus-in)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug + ;; where numcpus + ;; (or could be + ;; maxload) is + ;; zero, crude + ;; fallback is to + ;; at least use 1 + ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit + ;; etc. + (effective-load (common:get-intercept first next)) + (recommended-delay (common:get-delay effective-load numcpus)) + (effective-host (or remote-host "localhost")) + (normalized-effective-load (/ effective-load numcpus)) + (will-wait (> normalized-effective-load maxnormload))) + (if (> recommended-delay 0) + (let* ((actual-delay (min recommended-delay 30))) + (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) + (debug:print-info 0 *default-log-port* "Load control, delaying " + actual-delay " seconds to maintain safe load. current normalized effective load is " + normalized-effective-load".")) + (thread-sleep! actual-delay))) + + (cond + ;; bad data, try again to get the data + ((not will-wait) + (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) + (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) + ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable + (> num-tries 0)) + (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " + first ", we'll sleep 10s and try " num-tries " more times.") + (thread-sleep! 10) + (common:wait-for-cpuload maxnormload numcpus-in + count: count remote-host: remote-host num-tries: (- num-tries 1))) + ;; need to wait for load to drop + ((and will-wait ;; (> first adjmaxload) + (> count 0)) + (debug:print-info 0 *default-log-port* + "Delaying 15" ;; adjwait + " seconds due to normalized effective load " normalized-effective-load ;; first + " exceeding max of " adjmaxload + " on server " (or remote-host (get-host-name)) + " (normalized load-limit: " maxnormload ") " (if msg msg "")) + (thread-sleep! 15) ;; adjwait) + (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) + ;; put the message here to indicate came out of waiting + (debug:print-info 1 *default-log-port* + "On host: " effective-host + ", effective load: " effective-load + ", numcpus: " numcpus + ", normalized effective load: " normalized-effective-load + )) + ;; overloaded and count expired (i.e. went to zero) + (else + (if (> num-tries 0) ;; should be "num-tries-left". + (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) + (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " + normalized-effective-load " continuing.")) + (debug:print 0 *default-log-port* "Load on " effective-host ", " + first" could not be retrieved. Giving up and continuing.")))))) + +;; wait for normalized cpu load to drop below maxload +;; +(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) + (let ((num-cpus (common:get-num-cpus remote-host))) + (if num-cpus + (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) + (begin + (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again + (if (> rem-tries 0) + (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) + #f))))) + +;; given path get free space, allows override in [setup] +;; with free-space-script /path/to/some/script.sh +;; +(define (get-df path) + (if (configf:lookup *configdat* "setup" "free-space-script") + (with-input-from-pipe + (conc (configf:lookup *configdat* "setup" "free-space-script") " " path) + (lambda () + (let ((res (read-line))) + (if (string? res) + (string->number res))))) + (get-unix-df path))) + +(define (common:check-space-in-dir dirpath required) + (let* ((dbspace (if (directory? dirpath) + (get-df dirpath) + 0))) + (list (> dbspace required) + dbspace + required + dirpath))) + + +(define (get-free-inodes path) + (if (configf:lookup *configdat* "setup" "free-inodes-script") + (with-input-from-pipe + (conc (configf:lookup *configdat* "setup" "free-inodes-script") " " path) + (lambda () + (let ((res (read-line))) + (if (string? res) + (string->number res))))) + (get-unix-inodes path))) + +;;====================================================================== +;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S +;;====================================================================== +;; +;; [hosts] +;; arm cubie01 cubie02 +;; x86_64 zeus xena myth01 +;; allhosts #{g hosts arm} #{g hosts x86_64} +;; +;; [host-types] +;; general #MTLOWESTLOAD #{g hosts allhosts} +;; arm #MTLOWESTLOAD #{g hosts arm} +;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; [host-rules] +;; # maxnload => max normalized load +;; # maxnjobs => max jobs per cpu +;; # maxjobrate => max jobs per second +;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 +;; +;; [launchers] +;; envsetup general +;; xor/%/n 4C16G +;; % nbgeneral +;; +;; [jobtools] +;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. +;; flexi-launcher yes +;; launcher nbfake +;; +(define (common:get-launcher configdat testname itempath) + (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) + (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher + (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) + (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) + (if (null? launchers) + fallback-launcher + (let loop ((hed (car launchers)) + (tal (cdr launchers))) + (let ((patt (car hed)) + (host-type (cadr hed))) + (if (tests:match patt testname itempath) + (begin + (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) + (let ((launcher (configf:lookup configdat "host-types" host-type))) + (if launcher + (let* ((launcher-parts (string-split launcher)) + (launcher-exe (car launcher-parts))) + (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline + (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) + (count 100)) + (if targ-host + (conc "remrun " targ-host) + (if (> count 0) + (begin + (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) + (thread-sleep! (- 101 count)) + (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) + (- count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) + (exit))))) + launcher)) + (begin + (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal))))))) + ;; no match, try again + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal)))))))) + fallback-launcher))) + +(define (common:get-pkts-dirs mtconf use-lt) + (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs") + (and use-lt + (conc (or *toppath* + (current-directory)) + "/lt/.pkts")))) + (pktsdirs (if pktsdirs-str + (string-split pktsdirs-str " ") + #f))) + pktsdirs)) + +(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f)) + (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) + (pktsdir (if pktsdirs (car pktsdirs) #f)) + (toppath (or (configf:lookup mtconf "scratchdat" "toppath") + toppath-in)) + (pdbpath (or (configf:lookup mtconf "setup" "pdbpath") pktsdir))) + (cond + ((not (and pktsdir toppath pdbpath)) + (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.") + (debug:print 0 *default-log-port* " you need to have pktsdirs in the [setup] section.")) + ((not (common:file-exists? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir)) + ((not (equal? (file-owner pktsdir)(current-effective-user-id))) + (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name))) + (else + (let* ((pdb (open-queue-db pdbpath "pkts.db" + schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));")))) + (proc pktsdirs pktsdir pdb) + (dbi:close pdb)))))) + +;; check space in dbdir and in megatest dir +;; returns: ok/not dbspace required-space +;; +(define (common:check-db-dir-space) + (let* ((required (string->number + ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. + (or (configf:lookup *configdat* "setup" "dbdir-space-required") + "1000000"))) + (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir)) + (tdbspace (common:check-space-in-dir dbdir required)) + (mdbspace (common:check-space-in-dir *toppath* required))) + (sort (list tdbspace mdbspace) (lambda (a b) + (< (cadr a)(cadr b)))))) + +;; check available space in dbdir, exit if insufficient +;; +(define (common:check-db-dir-and-exit-if-insufficient) + (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now + (is-ok (car spacedat)) + (dbspace (cadr spacedat)) + (required (caddr spacedat)) + (dbdir (cadddr spacedat))) + (if (not is-ok) + (begin + (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.") + (exit 1))))) + +;; paths is list of lists ((name path) ... ) +;; +(define (common:get-disk-with-most-free-space disks minsize) + (let* ((best #f) + (bestsize 0) + (default-min-inodes-string "1000000") + (default-min-inodes (string->number default-min-inodes-string)) + (min-inodes (or (string->number (if (configf:lookup *configdat* "setup" "min_inodes") (configf:lookup *configdat* "setup" "min_inodes") default-min-inodes-string)) default-min-inodes))) + + (for-each + (lambda (disk-num) + (let* ((dirpath (cadr (assoc disk-num disks))) + (freespc (cond + ((not (directory? dirpath)) + (if (common:low-noise-print 300 "disks not a dir " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) + -1) + ((not (file-write-access? dirpath)) + (if (common:low-noise-print 300 "disks not writeable " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) + -1) + ((not (eq? (string-ref dirpath 0) #\/)) + (if (common:low-noise-print 300 "disks not a proper path " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) + -1) + (else + (get-df dirpath)))) + (free-inodes (cond + ((not (directory? dirpath)) + (if (common:low-noise-print 300 "disks not a dir " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) + -1) + ((not (file-write-access? dirpath)) + (if (common:low-noise-print 300 "disks not writeable " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) + -1) + ((not (eq? (string-ref dirpath 0) #\/)) + (if (common:low-noise-print 300 "disks not a proper path " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) + -1) + (else + (get-free-inodes dirpath)))) + ;;(free-inodes (get-free-inodes dirpath)) + ) + (debug:print 2 *default-log-port* "INFO: disk " disk-num " path " dirpath " free space " freespc " free inodes " free-inodes) + (if (and (> freespc bestsize)(> free-inodes min-inodes )) + (begin + (set! best (cons disk-num dirpath)) + (set! bestsize freespc))) + ;;(print "Processing: " disk-num " bestsize: " bestsize " best: " best " freespc: " freespc " min-inodes: " min-inodes " free-inodes: " free-inodes) + )) + (map car disks)) + (if (and best (> bestsize minsize)) + best + #f))) ;; #f means no disk candidate found + +;; from metadat lookup MEGATEST_VERSION +;; +(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB + (rmt:get-var "MEGATEST_VERSION")) + +(define (common:set-last-run-version) + (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) + +(define (common:get-last-run-version-number) + (string->number + (substring (common:get-last-run-version) 0 6))) + +;; postive number if megatest version > db version +;; negative number if megatest version < db version +(define (common:version-db-delta) + (- megatest-version (common:get-last-run-version-number))) + +(define (common:version-changed?) + (not (equal? (common:get-last-run-version) + (common:version-signature)))) + +(define (common:api-changed?) + (not (equal? (substring (->string megatest-version) 0 4) + (substring (conc (common:get-last-run-version)) 0 4)))) + + +;; Move me elsewhere ... +;; RADT => Why do we meed the version check here, this is called only if version misma +;; +(define (common:cleanup-db dbstruct #!key (full #f)) + (apply db:multi-db-sync + dbstruct + 'schema + ;; 'new2old + 'killservers + 'adj-target + ;; 'old2new + 'new2old + ;; (if full + '(dejunk) + ;; '()) + ) + (if (common:api-changed?) + (common:set-last-run-version))) + +(define common:get-area-name common:get-testsuite-name) + +(define (common:load-pkts-to-db mtconf #!key (use-lt #f)) + (common:with-queue-db + mtconf + (lambda (pktsdirs pktsdir pdb) + (for-each + (lambda (pktsdir) ;; look at all + (cond + ((not (common:file-exists? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist.")) + ((not (directory? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory.")) + ((not (file-read-access? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable.")) + (else + (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir) + (let ((pkts (glob (conc pktsdir "/*.pkt")))) + (for-each + (lambda (pkt) + (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) + (exists (lookup-by-uuid pdb uuid #f))) + (if (not exists) + (let* ((pktdat (string-intersperse + (with-input-from-file pkt read-lines) + "\n")) + (apkt (pkt->alist pktdat)) + (ptype (alist-ref 'T apkt))) + (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) + (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) + (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") + ))) + pkts))))) + pktsdirs)) + use-lt: use-lt)) + +;; use-lt is use linktree "lt" link to find pkts dir +(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already + (if (or add-only + (hash-table-exists? *pkts-info* 'last-parent)) + (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) + (pktalist (if parent + (cons `(parent . ,parent) + pktalist-in) + pktalist-in))) + (let-values (((uuid pkt) + (alist->pkt pktalist common:pkts-spec))) + (hash-table-set! *pkts-info* 'last-parent uuid) + (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) + (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) + (pktsdir (car pktsdirs))) ;; assume it is there + (hash-table-set! *pkts-info* 'pkts-dir pktsdir) + pktsdir)))) + (handle-exceptions + exn + (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! + (if (not (file-exists? pktsdir)) + (create-directory pktsdir #t)) + (with-output-to-file + (conc pktsdir "/" uuid ".pkt") + (lambda () + (print pkt))))))))) + Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -41,10 +41,12 @@ srfi-1 srfi-13 srfi-69 stack typed-records + directory-utils + z3 ) ;;====================================================================== ;; CONTENTS @@ -271,25 +273,10 @@ (not (null? (lset-intersection! eq? vb n)))) ((and (number? vb) (list? n)) (member vb n))))) -(define (debug:setup debug-arg verbose-arg) ;; debug-arg= #f, #t or 'noprop - (let ((debugstr (or debug-arg ;; (args:get-arg "-debug") - ;; (args:get-arg "-debug-noprop") - (get-environment-variable "MT_DEBUG_MODE")))) - (debug:calc-verbosity debugstr verbose-arg) - ;; (debug:check-verbosity *verbosity* debugstr) - ;; if we were handed a bad verbosity rule then we will override it with 1 and continue - (if (not (verbosity))(set! (verbosity) 1)) - (if (and (not (eq? debug-arg 'noprop)) - (or debug-arg - (not (get-environment-variable "MT_DEBUG_MODE")))) - (setenv "MT_DEBUG_MODE" (if (list? (verbosity)) - (string-intersperse (map conc (verbosity)) ",") - (conc (verbosity))))))) - (define (debug:print n e . params) (if (debug:debug-mode n) (with-output-to-port (or e (current-error-port)) (lambda () ;; (if *logging* @@ -646,59 +633,10 @@ #t (if (null? tal) #f (loop (car tal)(cdr tal)))))))) -;;====================================================================== -;; File locking -;;====================================================================== - -;; dot-locking egg seems not to work, using this for now -;; if lock is older than expire-time then remove it and try again -;; to get the lock -;; -(define (common:simple-file-lock fname #!key (expire-time 300)) - (let ((fmod-time (handle-exceptions - ext - (current-seconds) - (file-modification-time fname)))) - (if (common:file-exists? fname) - (if (> (- (current-seconds) fmod-time) expire-time) - (begin - (handle-exceptions exn #f (delete-file* fname)) - (common:simple-file-lock fname expire-time: expire-time)) - #f) - (let ((key-string (conc (get-host-name) "-" (current-process-id)))) - (with-output-to-file fname - (lambda () - (print key-string))) - (thread-sleep! 0.25) - (if (common:file-exists? fname) - (handle-exceptions exn - #f - (with-input-from-file fname - (lambda () - (equal? key-string (read-line))))) - #f))))) - -(define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) - (let ((end-time (+ expire-time (current-seconds)))) - (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) - (if got-lock - #t - (if (> end-time (current-seconds)) - (begin - (thread-sleep! 3) - (loop (common:simple-file-lock fname expire-time: expire-time))) - #f))))) - -(define (common:simple-file-release-lock fname) - (handle-exceptions - exn - #f ;; I don't really care why this failed (at least for now) - (delete-file* fname))) - ;;====================================================================== ;; Keys ;;====================================================================== (define (keys->keystr keys) ;; => key1,key2,key3,additiona1, ... @@ -985,11 +923,11 @@ ((dashboard) progname) (else exe))))) '("../../" "../"))))) (if (null? res) (begin - (debug:print 0 *current-log-port* "Failed to find this executable! Using what can be found on the path") + (debug:print 0 *default-log-port* "Failed to find this executable! Using what can be found on the path") progname) (car res)))) (define *common:logpro-exit-code->status-sym-alist* '( ( 0 . pass ) @@ -1078,60 +1016,10 @@ (conc megatest-version "-" megatest-fossil-hash)) (define (common:version-signature) (conc megatest-version "-" (substring megatest-fossil-hash 0 4))) -;; from metadat lookup MEGATEST_VERSION -;; -(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB - (rmt:get-var "MEGATEST_VERSION")) - -(define (common:get-last-run-version-number) - (string->number - (substring (common:get-last-run-version) 0 6))) - -(define (common:set-last-run-version) - (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) - -;; postive number if megatest version > db version -;; negative number if megatest version < db version -(define (common:version-db-delta) - (- megatest-version (common:get-last-run-version-number))) - -(define (common:version-changed?) - (not (equal? (common:get-last-run-version) - (common:version-signature)))) - -(define (common:api-changed?) - (not (equal? (substring (->string megatest-version) 0 4) - (substring (conc (common:get-last-run-version)) 0 4)))) - - -(define (common:get-sync-lock-filepath) - (let* ((tmp-area (common:get-db-tmp-area)) - (lockfile (conc tmp-area "/megatest.db.sync-lock"))) - lockfile)) - -;; Move me elsewhere ... -;; RADT => Why do we meed the version check here, this is called only if version misma -;; -(define (common:cleanup-db dbstruct #!key (full #f)) - (apply db:multi-db-sync - dbstruct - 'schema - ;; 'new2old - 'killservers - 'adj-target - ;; 'old2new - 'new2old - ;; (if full - '(dejunk) - ;; '()) - ) - (if (common:api-changed?) - (common:set-last-run-version))) - (define (common:snapshot-file filepath #!key (subdir ".") ) (if (file-exists? filepath) (let* ((age-sec (lambda (file) (if (file-exists? file) (- (current-seconds) (file-modification-time file)) @@ -1209,142 +1097,10 @@ (copy hrsfile daysfile)) #t) #f)) - -;; Rotate logs, logic: -;; if > 500k and older than 1 week: -;; remove previous compressed log and compress this log -;; WARNING: This proc operates assuming that it is in the directory above the -;; logs directory you wish to log-rotate. -;; -(define (common:rotate-logs) - (let* ((all-files (make-hash-table)) - (stats (make-hash-table)) - (inc-stat (lambda (key) - (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) - (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age - (if (not (directory-exists? "logs"))(create-directory "logs")) - (directory-fold - (lambda (file rem) - (handle-exceptions - exn - (begin - (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) - (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - ;; (print-call-chain (current-error-port)) ;; - ) - (let* ((fullname (conc "logs/" file)) - (mod-time (file-modification-time fullname)) - (file-age (- (current-seconds) mod-time)) - (file-old (> file-age (* 48 60 60))) - (file-big (> (file-size fullname) 200000))) - (hash-table-set! all-files file mod-time) - (if (or (and (string-match "^.*.log" file) - file-old - file-big) - (and (string-match "^server-.*.log" file) - file-old)) - (let ((gzfile (conc fullname ".gz"))) - (if (common:file-exists? gzfile) - (begin - (debug:print-info 0 *default-log-port* "removing " gzfile) - (delete-file* gzfile) - (hash-table-delete! all-files gzfile) ;; needed? - )) - (debug:print-info 0 *default-log-port* "compressing " file) - (system (conc "gzip " fullname)) - (inc-stat "gzipped") - (hash-table-set! all-files (conc file ".gz") file-age) ;; add the .gz file and remove the base file - (hash-table-delete! all-files file) - ) - (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) - (file-exists? fullname)) ;; just in case it was gzipped - will get it next time - (handle-exceptions - exn - #f - (if (directory? fullname) - (begin - (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") - (inc-stat "directories")) - (begin - (delete-file* fullname) - (inc-stat "deleted"))) - (hash-table-delete! all-files file))))))) - '() - "logs") - (for-each - (lambda (category) - (let ((quant (hash-table-ref/default stats category 0))) - (if (> quant 0) - (debug:print-info 0 *default-log-port* category " log files: " quant)))) - `("deleted" "gzipped" "directories")) - (let ((num-logs (hash-table-size all-files))) - (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300 - (let ((files (take (sort (hash-table-keys all-files) - (lambda (a b) - (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) - (- num-logs max-allowed)))) - (for-each - (lambda (file) - (let* ((fullname (conc "logs/" file))) - (if (directory? fullname) - (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") - (handle-exceptions - exn - (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) - (delete-file* fullname))))) - files) - (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) - -;; Force a megatest cleanup-db if version is changed and skip-version-check not specified -;; Do NOT check if not on homehost! -;; -(define (common:exit-on-version-changed) - (if (common:on-homehost?) - (if (common:api-changed?) - (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) - (read-only (not (file-write-access? dbfile))) - (dbstruct (db:setup #t))) - (debug:print 0 *default-log-port* - "WARNING: Version mismatch!\n" - " expected: " (common:version-signature) "\n" - " got: " (common:get-last-run-version)) - (cond - ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) - (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db - (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - (exit 1)) - (common:cleanup-db dbstruct))) - ((not (common:file-exists? mtconf)) - (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") - (exit 1)) - (read-only - (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") - (exit 1)) - (else - (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))))) -;; (begin -;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") -;; (exit 1)))) - ;;====================================================================== ;; S P A R S E A R R A Y S ;;====================================================================== (define (make-sparse-array) @@ -1384,10 +1140,59 @@ (let ((val (begin (mutex-lock! *db-access-mutex*) *db-access-allowed* (mutex-unlock! *db-access-mutex*)))) val)) + +;;====================================================================== +;; File locking +;;====================================================================== + +;; dot-locking egg seems not to work, using this for now +;; if lock is older than expire-time then remove it and try again +;; to get the lock +;; +(define (common:simple-file-lock fname #!key (expire-time 300)) + (let ((fmod-time (handle-exceptions + ext + (current-seconds) + (file-modification-time fname)))) + (if (common:file-exists? fname) + (if (> (- (current-seconds) fmod-time) expire-time) + (begin + (handle-exceptions exn #f (delete-file* fname)) + (common:simple-file-lock fname expire-time: expire-time)) + #f) + (let ((key-string (conc (get-host-name) "-" (current-process-id)))) + (with-output-to-file fname + (lambda () + (print key-string))) + (thread-sleep! 0.25) + (if (common:file-exists? fname) + (handle-exceptions exn + #f + (with-input-from-file fname + (lambda () + (equal? key-string (read-line))))) + #f))))) + +(define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) + (let ((end-time (+ expire-time (current-seconds)))) + (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) + (if got-lock + #t + (if (> end-time (current-seconds)) + (begin + (thread-sleep! 3) + (loop (common:simple-file-lock fname expire-time: expire-time))) + #f))))) + +(define (common:simple-file-release-lock fname) + (handle-exceptions + exn + #f ;; I don't really care why this failed (at least for now) + (delete-file* fname))) ;;====================================================================== ;; U S E F U L S T U F F ;;====================================================================== @@ -1577,28 +1382,14 @@ ;;====================================================================== (define *verbosity* 1) (define *logging* #f) -(define (get-with-default val default) - (let ((val (args:get-arg val))) - (if val val default))) - (define (assoc/default key lst . default) (let ((res (assoc key lst))) (if res (cadr res)(if (null? default) #f (car default))))) -(define (common:get-testsuite-name) - (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. - (configf:lookup *configdat* "setup" "testsuite" ) - (getenv "MT_TESTSUITE_NAME") - (pathname-file (or (if (string? *toppath* ) - (pathname-file *toppath*) - #f) - (common:get-topath #f))) - "please-set-setup-area-name")) ;; (pathname-file (current-directory))))) - ;; safe getting of toppath (define (common:get-toppath areapath) (or *toppath* (if areapath (begin @@ -1620,44 +1411,14 @@ (debug:print-error 0 *default-log-port* "Unable to find megatest home directory.") #f) (loop (pathname-directory thepath))))) )) -(define common:get-area-name common:get-testsuite-name) - -(define (common:get-db-tmp-area . junk) - (if *db-cache-path* - *db-cache-path* - (if *toppath* ;; common:get-create-writeable-dir - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) - (exit 1)) - (let* ((tsname (common:get-testsuite-name)) - (dbpath (common:get-create-writeable-dir - (list (conc "/tmp/" (current-user-name) - "/megatest_localdb/" - tsname "/" - (string-translate *toppath* "/" ".")) - (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name - "/megatest_localdb/" - tsname - (string-translate *toppath* "/" ".")) - )))) - (set! *db-cache-path* dbpath) - dbpath)) - #f))) - ;;====================================================================== ;; E X I T H A N D L I N G ;;====================================================================== -(define (common:run-sync?) - (and (common:on-homehost?) - (args:get-arg "-server"))) - ;; (let ((ohh (common:on-homehost?)) ;; (srv (args:get-arg "-server"))) ;; (and ohh srv))) ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) @@ -1668,118 +1429,10 @@ (define (common:human-time) (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S")) - -;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp -;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) -;; -(define (common:readonly-watchdog dbstruct) - (thread-sleep! 0.05) ;; delay for startup - (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.") - ;; sync megatest.db to /tmp/.../megatst.db - (let* ((sync-cool-off-duration 3) - (golden-mtdb (dbr:dbstruct-mtdb dbstruct)) - (golden-mtpath (db:dbdat-get-path golden-mtdb)) - (tmp-mtdb (dbr:dbstruct-tmpdb dbstruct)) - (tmp-mtpath (db:dbdat-get-path tmp-mtdb))) - (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.") - (let loop ((last-sync-time 0)) - (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath) - (let* ((duration-since-last-sync (- (current-seconds) last-sync-time))) - (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync) - (if (and (not *time-to-exit*) - (< duration-since-last-sync sync-cool-off-duration)) - (thread-sleep! (- sync-cool-off-duration duration-since-last-sync))) - (if (not *time-to-exit*) - (let ((golden-mtdb-mtime (file-modification-time golden-mtpath)) - (tmp-mtdb-mtime (file-modification-time tmp-mtpath))) - (if (> golden-mtdb-mtime tmp-mtdb-mtime) - (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back - (let ((res (db:multi-db-sync dbstruct 'old2new))) - (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred.")))) - (loop (current-seconds))) - #t))) - (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath))) - -;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage -(define (common:watchdog) - (debug:print-info 13 *default-log-port* "common:watchdog entered.") - (if (launch:setup) - (if (common:on-homehost?) - (let ((dbstruct (db:setup #t))) - (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct) - (cond - ((dbr:dbstruct-read-only dbstruct) - (debug:print-info 13 *default-log-port* "loading read-only watchdog") - (common:readonly-watchdog dbstruct)) - (else - (debug:print-info 13 *default-log-port* "loading writable-watchdog.") - (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "brute-force-sync"))) - (cond - ((equal? syncer "brute-force-sync") - (server:writable-watchdog-bruteforce dbstruct)) - ((equal? syncer "delta-sync") - (server:writable-watchdog-deltasync dbstruct)) - (else - (debug:print-error 0 *default-log-port* "Unknown server/sync-method specified ("syncer") - valid values are brute-force-sync and delta-sync.") - (exit 1))) - ;;(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] Syncer started (method="syncer")") - ))) - (debug:print-info 13 *default-log-port* "watchdog done.")) - (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) - - -(define (std-exit-procedure) - ;;(common:telemetry-log-close) - (on-exit (lambda () 0)) - ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) - (let ((no-hurry (if *time-to-exit* ;; hurry up - #f - (begin - (set! *time-to-exit* #t) - #t)))) - (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") - (if (and no-hurry (debug:debug-mode 18)) - (rmt:print-db-stats)) - (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds - (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated - (if *task-db* - (let ((db (cdr *task-db*))) - (if (sqlite3:database? db) - (begin - (sqlite3:interrupt! db) - (sqlite3:finalize! db #t) - ;; (vector-set! *task-db* 0 #f) - (set! *task-db* #f))))) - (http-client#close-all-connections!) - ;; (if (and *runremote* - ;; (remote-conndat *runremote*)) - ;; (begin - ;; (http-client#close-all-connections!))) ;; for http-client - (if (not (eq? *default-log-port* (current-error-port))) - (close-output-port *default-log-port*)) - (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) - (th2 (make-thread (lambda () - (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") - (if no-hurry - (begin - (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff - (begin - (thread-sleep! 2))) - (debug:print 4 *default-log-port* " ... done") - ) - "clean exit"))) - (thread-start! th1) - (thread-start! th2) - (thread-join! th1) - ) - ) - - 0) - (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) ;;(debug:print-info 13 *default-log-port* "got signal "signum) (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") @@ -1829,16 +1482,10 @@ (set! res #t)))) (string-split patts ",")) res) #t)) -;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' -(define (common:get-disks #!key (configf #f)) - (hash-table-ref/default - (or configf (read-config "megatest.config" #f #t)) - "disks" '("none" ""))) - ;; return first command that exists, else #f ;; (define (common:which cmds) (if (null? cmds) #f @@ -1927,119 +1574,14 @@ inlst (begin (if message (debug:print-error 0 *default-log-port* message)) (or ovrd '())))) -;;====================================================================== -;; T A R G E T S , S T A T E , S T A T U S , -;; R U N N A M E A N D T E S T P A T T -;;====================================================================== - -;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t)))) -;; -(define (common:get-runconfig-targets #!key (configf #f)) - (let ((targs (sort (map car (hash-table->alist - (or configf ;; NOTE: There is no value in using runconfig:read here. - (read-config (conc *toppath* "/runconfigs.config") - #f #t) - (make-hash-table)))) - string trynum 0) - (begin - (thread-sleep! 2) - (common:get-homehost trynum: (- trynum 1))) - #f)) - (else - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost)) - ;; first look in config, then look in file .homehost, create it if not found - (homehost (or (configf:lookup *configdat* "server" "homehost" ) - (handle-exceptions - exn - (if (> trynum 0) - (let ((delay-time (* (- 5 trynum) 5))) - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " - delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) - ", exn=" exn) - (thread-sleep! delay-time) - (common:get-homehost trynum: (- trynum 1))) - (begin - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) - "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " - ((condition-property-accessor 'exn 'message) exn)) - (exit 1))) - (let ((hhf (conc *toppath* "/.homehost"))) - (if (common:file-exists? hhf) - (with-input-from-file hhf read-line) - (if (file-write-access? *toppath*) - (begin - (with-output-to-file hhf - (lambda () - (print bestadrs))) - (begin - (mutex-unlock! *homehost-mutex*) - (car (common:get-homehost)))) - #f)))))) - (at-home (or (equal? homehost currhost) - (equal? homehost bestadrs)))) - (set! *home-host* (cons homehost at-home)) - (mutex-unlock! *homehost-mutex*) - *home-host*)))) - -;; am I on the homehost? -;; -(define (common:on-homehost?) - (let ((hh (common:get-homehost))) - (if hh - (cdr hh) - #f))) - -;; do we honor the caches of the config files? -;; -(define (common:use-cache?) - (let ((res #t)) ;; priority by order of evaluation - (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files! - (if (equal? (configf:lookup *configdat* "setup" "use-cache") "no") - (set! res #f) - (if (equal? (configf:lookup *configdat* "setup" "use-cache") "yes") - (set! res #t)))) - (if (args:get-arg "-no-cache")(set! res #f)) ;; overrides setting in "setup" - (if (getenv "MT_USE_CACHE") - (if (equal? (getenv "MT_USE_CACHE") "yes") - (set! res #t) - (if (equal? (getenv "MT_USE_CACHE") "no") - (set! res #f)))) ;; overrides -no-cache switch - res)) - -;; force use of server? -;; -(define (common:force-server?) - (let* ((force-setting (configf:lookup *configdat* "server" "force")) - (force-type (if force-setting (string->symbol force-setting) #f)) - (force-result (case force-type - ((#f) #f) - ((always) #t) - ((test) (if (args:get-arg "-execute") ;; we are in a test - #t - #f)) - (else - (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") - #t)))) ;; default to requiring server - (if force-result - (begin - (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".") - #t) - #f))) - ;;====================================================================== ;; M I S C L I S T S ;;====================================================================== ;; items in lista are matched value and position in listb @@ -2228,35 +1667,10 @@ (hash-table-set! ht hed (make-hash-table)) (loop ht hed tal))))) lst) resh)) -;; hash-table tree to html list tree -;; -;; tipfunc takes two parameters: y the tip value and path the path to that point -;; -(define (common:htree->html ht path tipfunc) - (let ((datlist (sort (hash-table->alist ht) - (lambda (a b) - (string< (car a)(car b)))))) - (if (null? datlist) - (tipfunc #f path) ;; really shouldn't get here - (s:ul - (map (lambda (x) - (let* ((levelname (car x)) - (y (cdr x)) - (newpath (append path (list levelname))) - (leaf (or (not (hash-table? y)) - (null? (hash-table-keys y))))) - (if leaf - (s:li (tipfunc y newpath)) - (s:li - (list - levelname - (common:htree->html y newpath tipfunc)))))) - datlist))))) - ;; hash-table tree to alist tree ;; (define (common:htree->atree ht) (map (lambda (x) (cons (car x) @@ -2366,45 +1780,10 @@ onemin (let* ((load-change (- onemin fivemin)) (tchange (- 300 60))) (max (+ onemin (* 60 (/ load-change tchange))) 0)))) -;; calculate a delay number based on a droop curve -;; inputs are: -;; - load-in, load as from uptime, NOT normalized -;; - numcpus, number of cpus, ideally use the real cpus, not threads -;; -(define (common:get-delay load-in numcpus) - (let* ((ratio (/ load-in numcpus)) - (new-option (configf:lookup *configdat* "load" "new-load-method")) - (paramstr (or (configf:lookup *configdat* "load" "exp-params") - "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) - (paramlst (map string->number (string-split paramstr)))) - (if new-option - (begin - (cond ((and (>= ratio 0) (< ratio .5)) - 0) - ((and (>= ratio 0.5) (<= ratio .9)) - (* ratio (/ 5 .9))) - ((and (> ratio .9) (<= ratio 1.1)) - (+ 5 (* (- ratio .9) (/ 55 .2)))) - ((> ratio 1.1) - 60))) - (match paramlst - ((r1 r2 s1 s2) - (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) - (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) - (else - (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) - 30))))) - -(define (common:print-delay-table) - (let loop ((x 0)) - (print x "," (common:get-delay x 1)) - (if (< x 2) - (loop (+ x 0.1))))) - (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) @@ -2421,11 +1800,11 @@ ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")) - (delfile (lambda () + (delfile (lambda (exn) (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn) (delete-file* fullpath) #f))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) @@ -2444,15 +1823,15 @@ 0) (file-change-time fullpath))))) (if (< real-age age) (handle-exceptions exn - (delfile) + (delfile exn) (let* ((res (with-input-from-file fullpath read))) (if (eof-object? res) (begin - (delfile) + (delfile "n/a") #f) res))) (begin (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") @@ -2468,11 +1847,11 @@ (fullpath (conc fulldir "/" key "-" dtype ".log"))) (if (not (file-exists? fulldir))(create-directory fulldir #t)) (handle-exceptions exn (begin - (debug:print 0 *default-log-path* "failed to write file " fullpath ", exn=" exn) + (debug:print 0 *default-log-port* "failed to write file " fullpath ", exn=" exn) #f) (with-output-to-file fullpath (lambda ()(pp dat))))) #f)) (define (common:raw-get-remote-host-load remote-host) @@ -2596,137 +1975,10 @@ (define (common:unix-ping hostname) (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) -;; ideally put all this info into the db, no need to preserve it across moving homehost -;; -;; return list of -;; ( reachable? cpuload update-time ) -(define (common:get-host-info hostname) - (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data - (load (car loadinfo)) - (load-sample-time (cdr loadinfo)) - (load-sample-age (- (current-seconds) load-sample-time)) - (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds - (host-last-update-timeout-seconds 4) - (host-rec (hash-table-ref/default *host-loads* hostname #f)) - ) - (cond - ((< load-sample-age loadinfo-timeout-seconds) - (list #t - load-sample-time - load)) - ((and host-rec - (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) - (list #t - (host-last-update host-rec) - (host-last-cpuload host-rec ))) - ((common:unix-ping hostname) - (list #t - (current-seconds) - (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds - (else - (list #f 0 -1) ;; bad host, don't use! - )))) - -;; see defstruct host at top of file. -;; host: reachable last-update last-used last-cpuload -;; -(define (common:update-host-loads-table hosts-raw) - (let* ((hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw))) - (for-each - (lambda (hostname) - (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - (host-info (common:get-host-info hostname)) - (is-reachable (car host-info)) - (last-reached-time (cadr host-info)) - (load (caddr host-info))) - (host-reachable-set! rec is-reachable) - (host-last-update-set! rec last-reached-time) - (host-last-cpuload-set! rec load))) - hosts))) - -;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the -;; [host-rules] section. -;; -(define (common:get-least-loaded-host hosts-raw host-type configdat) - (let* ((rdat (configf:lookup configdat "host-rules" host-type)) - (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate - (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load - (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs - (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second - (hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw)) - ;; (best-host #f) - (get-rec (lambda (hostname) - ;; (print "get-rec hostname=" hostname) - (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h))))) - (best-load 99999) - (curr-time (current-seconds)) - (get-hosts-sorted (lambda (hosts) - (sort hosts (lambda (a b) - (let ((a-rec (get-rec a)) - (b-rec (get-rec b))) - ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) - ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) - (< (host-last-used a-rec) - (host-last-used b-rec)))))))) - (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) - (if (null? hosts) - #f ;; no hosts to select from. All done and giving up now. - (let ((hosts-sorted (get-hosts-sorted hosts))) - (common:update-host-loads-table hosts) - (let loop ((hostname (car hosts-sorted)) - (tal (cdr hosts-sorted)) - (best-host #f)) - (let* ((rec (get-rec hostname)) - (reachable (host-reachable rec)) - (load (host-last-cpuload rec)) - (last-used (host-last-used rec)) - (delta (- curr-time last-used)) - (job-rate (if (> delta 0) - (/ 1 delta) - 999)) ;; jobs per second - (new-best - (cond - ((not reachable) - (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") - best-host) - ((and (< load maxnload) ;; load is acceptable - (< job-rate maxjobrate)) ;; job rate is acceptable - (set! best-load load) - hostname) - (else best-host)))) - (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) - (if new-best - (begin ;; found a host, return it - (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) - (host-last-used-set! rec curr-time) - new-best) - (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) - -(define (common:wait-for-homehost-load maxnormload msg) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. - #f - (common:get-homehost))) - (hh (if hh-dat (car hh-dat) #f))) - (common:wait-for-normalized-load maxnormload msg hh))) - (define *numcpus-cache* (make-hash-table)) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; hosts had better not be changing the number of cpus too often! (or (hash-table-ref/default *numcpus-cache* actual-host #f) @@ -2752,97 +2004,10 @@ (common:write-cached-info actual-host "num-cpus" result)) result)))) (hash-table-set! *numcpus-cache* actual-host numcpus) numcpus)))) -;; wait for normalized cpu load to drop below maxload -;; -(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) - (let ((num-cpus (common:get-num-cpus remote-host))) - (if num-cpus - (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) - (begin - (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again - (if (> rem-tries 0) - (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) - #f))))) - -;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load -;; count - count down to zero, at some point we'd give up if the load never drops -;; num-tries - count down to zero number tries to get numcpus -;; -(define (common:wait-for-cpuload maxnormload numcpus-in - #!key (count 1000) - (msg #f)(remote-host #f)(num-tries 5)) - (let* ((loadavg (common:get-cpu-load remote-host)) - ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (numcpus (if (<= 1 numcpus-in) - (common:get-num-cpus remote-host) - numcpus-in)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug - ;; where numcpus - ;; (or could be - ;; maxload) is - ;; zero, crude - ;; fallback is to - ;; at least use 1 - ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit - ;; etc. - (effective-load (common:get-intercept first next)) - (recommended-delay (common:get-delay effective-load numcpus)) - (effective-host (or remote-host "localhost")) - (normalized-effective-load (/ effective-load numcpus)) - (will-wait (> normalized-effective-load maxnormload))) - (if (> recommended-delay 0) - (let* ((actual-delay (min recommended-delay 30))) - (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) - (debug:print-info 0 *default-log-port* "Load control, delaying " - actual-delay " seconds to maintain safe load. current normalized effective load is " - normalized-effective-load".")) - (thread-sleep! actual-delay))) - - (cond - ;; bad data, try again to get the data - ((not will-wait) - (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) - (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) - ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable - (> num-tries 0)) - (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " - first ", we'll sleep 10s and try " num-tries " more times.") - (thread-sleep! 10) - (common:wait-for-cpuload maxnormload numcpus-in - count: count remote-host: remote-host num-tries: (- num-tries 1))) - ;; need to wait for load to drop - ((and will-wait ;; (> first adjmaxload) - (> count 0)) - (debug:print-info 0 *default-log-port* - "Delaying 15" ;; adjwait - " seconds due to normalized effective load " normalized-effective-load ;; first - " exceeding max of " adjmaxload - " on server " (or remote-host (get-host-name)) - " (normalized load-limit: " maxnormload ") " (if msg msg "")) - (thread-sleep! 15) ;; adjwait) - (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) - ;; put the message here to indicate came out of waiting - (debug:print-info 1 *default-log-port* - "On host: " effective-host - ", effective load: " effective-load - ", numcpus: " numcpus - ", normalized effective load: " normalized-effective-load - )) - ;; overloaded and count expired (i.e. went to zero) - (else - (if (> num-tries 0) ;; should be "num-tries-left". - (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) - (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " - effective-normalized-load " continuing.")) - (debug:print 0 *default-log-port* "Load on " effective-host ", " - first" could not be retrieved. Giving up and continuing.")))))) - ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load ;; ;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) ;; (let* ((loadavg (common:get-cpu-load remote-host)) ;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again @@ -2958,33 +2123,10 @@ ;;====================================================================== (define (common:get-disk-space-used fpath) (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read)) -;; given path get free space, allows override in [setup] -;; with free-space-script /path/to/some/script.sh -;; -(define (get-df path) - (if (configf:lookup *configdat* "setup" "free-space-script") - (with-input-from-pipe - (conc (configf:lookup *configdat* "setup" "free-space-script") " " path) - (lambda () - (let ((res (read-line))) - (if (string? res) - (string->number res))))) - (get-unix-df path))) - -(define (get-free-inodes path) - (if (configf:lookup *configdat* "setup" "free-inodes-script") - (with-input-from-pipe - (conc (configf:lookup *configdat* "setup" "free-inodes-script") " " path) - (lambda () - (let ((res (read-line))) - (if (string? res) - (string->number res))))) - (get-unix-inodes path))) - (define (get-unix-df path) (let* ((df-results (process:cmd-run->list (conc "df " path))) (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) (freespc #f)) ;; (write df-results) @@ -3009,102 +2151,10 @@ (if (number? newval) (set! freenodes newval)))))) (car df-results)) freenodes)) -(define (common:check-space-in-dir dirpath required) - (let* ((dbspace (if (directory? dirpath) - (get-df dirpath) - 0))) - (list (> dbspace required) - dbspace - required - dirpath))) - -;; check space in dbdir and in megatest dir -;; returns: ok/not dbspace required-space -;; -(define (common:check-db-dir-space) - (let* ((required (string->number - ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. - (or (configf:lookup *configdat* "setup" "dbdir-space-required") - "1000000"))) - (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir)) - (tdbspace (common:check-space-in-dir dbdir required)) - (mdbspace (common:check-space-in-dir *toppath* required))) - (sort (list tdbspace mdbspace) (lambda (a b) - (< (cadr a)(cadr b)))))) - -;; check available space in dbdir, exit if insufficient -;; -(define (common:check-db-dir-and-exit-if-insufficient) - (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now - (is-ok (car spacedat)) - (dbspace (cadr spacedat)) - (required (caddr spacedat)) - (dbdir (cadddr spacedat))) - (if (not is-ok) - (begin - (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.") - (exit 1))))) - -;; paths is list of lists ((name path) ... ) -;; -(define (common:get-disk-with-most-free-space disks minsize) - (let* ((best #f) - (bestsize 0) - (default-min-inodes-string "1000000") - (default-min-inodes (string->number default-min-inodes-string)) - (min-inodes (or (string->number (if (configf:lookup *configdat* "setup" "min_inodes") (configf:lookup *configdat* "setup" "min_inodes") default-min-inodes-string)) default-min-inodes))) - - (for-each - (lambda (disk-num) - (let* ((dirpath (cadr (assoc disk-num disks))) - (freespc (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 300 "disks not a dir " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 300 "disks not writeable " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) - -1) - ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 300 "disks not a proper path " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) - -1) - (else - (get-df dirpath)))) - (free-inodes (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 300 "disks not a dir " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 300 "disks not writeable " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) - -1) - ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 300 "disks not a proper path " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) - -1) - (else - (get-free-inodes dirpath)))) - ;;(free-inodes (get-free-inodes dirpath)) - ) - (debug:print 2 *default-log-port* "INFO: disk " disk-num " path " dirpath " free space " freespc " free inodes " free-inodes) - (if (and (> freespc bestsize)(> free-inodes min-inodes )) - (begin - (set! best (cons disk-num dirpath)) - (set! bestsize freespc))) - ;;(print "Processing: " disk-num " bestsize: " bestsize " best: " best " freespc: " freespc " min-inodes: " min-inodes " free-inodes: " free-inodes) - )) - (map car disks)) - (if (and best (> bestsize minsize)) - best - #f))) ;; #f means no disk candidate found - ;; convert a spec string to a list of vectors #( rx action rx-string ) (define (common:spec-string->list-of-specs spec-string actions) (let ((spec-strings (string-split-fields "\\s*;\\s*" spec-string #:infix)) (actions-regex (regexp (conc "^(.*)\\s+(" (string-intersperse (map conc actions) "|") ")")))) (filter @@ -3315,23 +2365,10 @@ vars (lambda (var val) (setenv var val))) vars)) - -(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f)) - (let* ((pre-cmd (dtests:get-pre-command)) - (post-cmd (dtests:get-post-command)) - (fullcmd (if (or pre-cmd post-cmd) - (conc pre-cmd cmd post-cmd) - (conc "viewscreen " cmd)))) - (debug:print-info 02 *default-log-port* "Running command: " fullcmd) - (cond - (with-vars (common:without-vars fullcmd)) - (with-orig-env (common:with-orig-env fullcmd)) - (else (common:without-vars fullcmd "MT_.*"))))) - ;;====================================================================== ;; C O L O R S ;;====================================================================== (define (common:name->iup-color name) @@ -3364,53 +2401,14 @@ (number->string x 16)) (map string->number (string-split instr))) "/")) -;;====================================================================== -;; L O C K I N G M E C H A N I S M S -;;====================================================================== - -;; faux-lock is deprecated. Please use simple-lock below -;; -(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) - (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count - (if (> wait-time 0) - (begin - (thread-sleep! 1) - (if (eq? wait-time 1) ;; only one second left, steal the lock - (begin - (debug:print-info 0 *default-log-port* "stealing lock for " keyname) - (common:faux-unlock keyname force: #t))) - (common:faux-lock keyname wait-time: (- wait-time 1))) - #f) - (begin - (rmt:no-sync-set keyname (conc (current-process-id))) - (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) - -(define (common:faux-unlock keyname #!key (force #f)) - (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) - (begin - (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) - #t) - #f)) - -;; simple lock. improve and converge on this one. -;; -(define (common:simple-lock keyname) - (rmt:no-sync-get-lock keyname)) - -(define (common:simple-unlock keyname #!key (force #f)) - (rmt:no-sync-del! keyname)) - - -;;====================================================================== -;; -;;====================================================================== - -(define (common:in-running-test?) - (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO"))) + +;;====================================================================== +;; +;;====================================================================== (define (common:get-color-from-status status) (cond ((equal? status "PASS") "green") ((equal? status "FAIL") "red") @@ -3482,11 +2480,10 @@ (if (null? patts) '("") patts)) comparator))) - ;; if itempath is #f then look only at the testname part ;; (define (tests:match->sqlqry patterns) (if (string? patterns) (let ((patts (string-split patterns ","))) @@ -3505,10 +2502,136 @@ ;; (print "tests:match => patt-parts: " patt-parts ", test-patt: " test-patt ", item-patt: " item-patt) (if (null? tal) (string-intersperse (append (reverse res)(list qry)) " OR ") (loop (car tal)(cdr tal)(cons qry res))))))) #f)) + +;; A routine to map itempaths using a itemmap +;; patha and pathb must be strings or this will fail +;; +;; path-b is waiting on path-a +;; +(define (db:compare-itempaths test-b-name path-a path-b itemmaps ) + (debug:print-info 6 *default-log-port* "ITEMMAPS: " itemmaps) + (let* ((itemmap (tests:lookup-itemmap itemmaps test-b-name))) + (if itemmap + (let ((path-b-mapped (db:multi-pattern-apply path-b itemmap))) + (debug:print-info 6 *default-log-port* "ITEMMAP is " itemmap ", path: " path-b ", mapped path: " path-b-mapped) + (equal? path-a path-b-mapped)) + (equal? path-b path-a)))) + +;; A routine to convert test/itempath using a itemmap +;; NOTE: to process only an itempath (i.e. no prepended testname) +;; just call db:multi-pattern-apply +;; +(define (db:convert-test-itempath path-in itemmap) + (debug:print-info 6 *default-log-port* "ITEMMAP is " itemmap) + (let* ((path-parts (string-split path-in "/")) + (test-name (if (null? path-parts) "" (car path-parts))) + (item-path (string-intersperse (if (null? path-parts) '() (cdr path-parts)) "/"))) + (conc test-name "/" + (db:multi-pattern-apply item-path itemmap)))) + +;; patterns are: +;; "rx1" "replacement1"\n +;; "rx2" "replacement2" +;; etc. +;; +(define (db:multi-pattern-apply item-path itemmap) + (let ((all-patts (string-split itemmap "\n"))) + (if (null? all-patts) + item-path + (let loop ((hed (car all-patts)) + (tal (cdr all-patts)) + (res item-path)) + (let* ((parts (string-split hed)) + (patt (car parts)) + + (repl (if (> (length parts) 1)(cadr parts) "")) + + (newr (if (and patt repl) + (begin + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* + "WARNING: itemmap has problem \"" itemmap "\", patt: " patt ", repl: " repl ", exn=" exn) + res) + (string-substitute patt repl res)) + + + ) + (begin + (debug:print 0 *default-log-port* + "WARNING: itemmap has problem \"" itemmap "\", patt: " patt ", repl: " repl) + res)))) + (if (null? tal) + newr + (loop (car tal)(cdr tal) newr))))))) + + +(define (keys->valslots keys) ;; => ?,?,? .... + (string-intersperse (map (lambda (x) "?") keys) ",")) + +;; (define-inline (keys->key/field keys . additional) +;; (string-join (map (lambda (k)(conc k " TEXT")) +;; (append keys additional)) ",")) + +(define (item-list->path itemdat) + (if (list? itemdat) + (string-intersperse (map cadr itemdat) "/") + "")) + +(define (launch:is-test-alive host pid) + (if (and host pid (not (equal? host "n/a"))) + (let* ((cmd (conc "ssh " host " pstree -A " pid)) + (output (with-input-from-pipe cmd read-lines))) + (debug:print 2 *default-log-port* "Running " cmd " received " output) + (if (eq? (length output) 0) + #f + #t)) + #t)) + +(define (common:get-sync-lock-filepath) + (let* ((tmp-area (common:get-db-tmp-area)) + (lockfile (conc tmp-area "/megatest.db.sync-lock"))) + lockfile)) + +(define (common:get-testsuite-name) + (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. + (configf:lookup *configdat* "setup" "testsuite" ) + (getenv "MT_TESTSUITE_NAME") + (pathname-file (or (if (string? *toppath* ) + (pathname-file *toppath*) + #f) + (common:get-toppath #f))) + "please-set-setup-area-name")) ;; (pathname-file (current-directory))))) + +(define (common:get-db-tmp-area . junk) + (if *db-cache-path* + *db-cache-path* + (if *toppath* ;; common:get-create-writeable-dir + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) + (exit 1)) + (let* ((tsname (common:get-testsuite-name)) + (dbpath (common:get-create-writeable-dir + (list (conc "/tmp/" (current-user-name) + "/megatest_localdb/" + tsname "/" + (string-translate *toppath* "/" ".")) + (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name + "/megatest_localdb/" + tsname + (string-translate *toppath* "/" ".")) + )))) + (set! *db-cache-path* dbpath) + dbpath)) + #f))) + ;; ;;====================================================================== ;; ;; N A N O M S G C L I E N T ;; ;;====================================================================== @@ -3579,84 +2702,10 @@ ;; (define (mddb:get-dashboards) ;; (let ((db (mddb:open-db))) ;; (query fetch-column ;; (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;")))) -;;====================================================================== -;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S -;;====================================================================== -;; -;; [hosts] -;; arm cubie01 cubie02 -;; x86_64 zeus xena myth01 -;; allhosts #{g hosts arm} #{g hosts x86_64} -;; -;; [host-types] -;; general #MTLOWESTLOAD #{g hosts allhosts} -;; arm #MTLOWESTLOAD #{g hosts arm} -;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo -;; -;; [host-rules] -;; # maxnload => max normalized load -;; # maxnjobs => max jobs per cpu -;; # maxjobrate => max jobs per second -;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 -;; -;; [launchers] -;; envsetup general -;; xor/%/n 4C16G -;; % nbgeneral -;; -;; [jobtools] -;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. -;; flexi-launcher yes -;; launcher nbfake -;; -(define (common:get-launcher configdat testname itempath) - (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) - (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher - (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) - (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) - (if (null? launchers) - fallback-launcher - (let loop ((hed (car launchers)) - (tal (cdr launchers))) - (let ((patt (car hed)) - (host-type (cadr hed))) - (if (tests:match patt testname itempath) - (begin - (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) - (if launcher - (let* ((launcher-parts (string-split launcher)) - (launcher-exe (car launcher-parts))) - (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline - (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) - (count 100)) - (if targ-host - (conc "remrun " targ-host) - (if (> count 0) - (begin - (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) - (thread-sleep! (- 101 count)) - (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) - (- count 1))) - (begin - (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) - (exit))))) - launcher)) - (begin - (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal))))))) - ;; no match, try again - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal)))))))) - fallback-launcher))) - ;;====================================================================== ;; NMSG AND NEW API ;;====================================================================== ;; nm based server experiment, keep around for now. @@ -3691,27 +2740,10 @@ (nn-send soc "queued")) ;; reply with "queued" (print "ERROR: ["(common:human-time)"] BAD request " dat)) (loop (nn-recv soc))))) (nn-close soc))) -;;====================================================================== -;; D A S H B O A R D U S E R V I E W S -;;====================================================================== - -;; first read ~/views.config if it exists, then read $MTRAH/views.config if it exists -;; -(define (common:load-views-config) - (let* ((view-cfgdat (make-hash-table)) - (home-cfgfile (conc (get-environment-variable "HOME") "/.mtviews.config")) - (mthome-cfgfile (conc *toppath* "/.mtviews.config"))) - (if (common:file-exists? mthome-cfgfile) - (read-config mthome-cfgfile view-cfgdat #t)) - ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas - (if (common:file-exists? home-cfgfile) - (read-config home-cfgfile view-cfgdat #t)) - view-cfgdat)) - ;;====================================================================== ;; H I E R A R C H I C A L H A S H T A B L E S ;;====================================================================== ;; Every element including top element is a vector: @@ -3780,102 +2812,10 @@ (state . s) (target . t) (status . u) (parent . P))))) -(define (common:get-pkts-dirs mtconf use-lt) - (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs") - (and use-lt - (conc (or *toppath* - (current-directory)) - "/lt/.pkts")))) - (pktsdirs (if pktsdirs-str - (string-split pktsdirs-str " ") - #f))) - pktsdirs)) - -;; use-lt is use linktree "lt" link to find pkts dir -(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already - (if (or add-only - (hash-table-exists? *pkts-info* 'last-parent)) - (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) - (pktalist (if parent - (cons `(parent . ,parent) - pktalist-in) - pktalist-in))) - (let-values (((uuid pkt) - (alist->pkt pktalist common:pkts-spec))) - (hash-table-set! *pkts-info* 'last-parent uuid) - (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) - (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) - (pktsdir (car pktsdirs))) ;; assume it is there - (hash-table-set! *pkts-info* 'pkts-dir pktsdir) - pktsdir)))) - (handle-exceptions - exn - (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! - (if (not (file-exists? pktsdir)) - (create-directory pktsdir #t)) - (with-output-to-file - (conc pktsdir "/" uuid ".pkt") - (lambda () - (print pkt))))))))) - -(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f)) - (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) - (pktsdir (if pktsdirs (car pktsdirs) #f)) - (toppath (or (configf:lookup mtconf "scratchdat" "toppath") - toppath-in)) - (pdbpath (or (configf:lookup mtconf "setup" "pdbpath") pktsdir))) - (cond - ((not (and pktsdir toppath pdbpath)) - (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.") - (debug:print 0 *default-log-port* " you need to have pktsdirs in the [setup] section.")) - ((not (common:file-exists? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir)) - ((not (equal? (file-owner pktsdir)(current-effective-user-id))) - (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name))) - (else - (let* ((pdb (open-queue-db pdbpath "pkts.db" - schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));")))) - (proc pktsdirs pktsdir pdb) - (dbi:close pdb)))))) - -(define (common:load-pkts-to-db mtconf #!key (use-lt #f)) - (common:with-queue-db - mtconf - (lambda (pktsdirs pktsdir pdb) - (for-each - (lambda (pktsdir) ;; look at all - (cond - ((not (common:file-exists? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist.")) - ((not (directory? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory.")) - ((not (file-read-access? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable.")) - (else - (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir) - (let ((pkts (glob (conc pktsdir "/*.pkt")))) - (for-each - (lambda (pkt) - (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) - (exists (lookup-by-uuid pdb uuid #f))) - (if (not exists) - (let* ((pktdat (string-intersperse - (with-input-from-file pkt read-lines) - "\n")) - (apkt (pkt->alist pktdat)) - (ptype (alist-ref 'T apkt))) - (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) - (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) - (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") - ))) - pkts))))) - pktsdirs)) - use-lt: use-lt)) - (define (common:get-pkt-alists pkts) (map (lambda (x) (alist-ref 'apkt x)) ;; 'pkta pulls out the alist from the read pkt pkts)) @@ -4027,8 +2967,326 @@ ;; ) ;; (begin ;; (define *common:telemetry-log-state* 'closed) ;; (udp-close-socket *common:telemetry-log-socket*) ;; (set! *common:telemetry-log-socket* #f))))) + +(define (process:conservative-read port) + (let loop ((res "")) + (if (not (eof-object? (peek-char port))) + (loop (conc res (read-char port))) + res))) + +(define (process:cmd-run-with-stderr->list cmd . params) + ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) +;; (handle-exceptions +;; exn +;; (begin +;; (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) +;; (print " " ((condition-property-accessor 'exn 'message) exn)) +;; #f) + (let-values (((fh fho pid fhe) (if (null? params) + (process* cmd) + (process* cmd params)))) + (let loop ((curr (read-line fh)) + (result '())) + (let ((errstr (process:conservative-read fhe))) + (if (not (string=? errstr "")) + (set! result (append result (list errstr))))) + (if (not (eof-object? curr)) + (loop (read-line fh) + (append result (list curr))) + (begin + (close-input-port fh) + (close-input-port fhe) + (close-output-port fho) + result))))) ;; ) + +(define (process:cmd-run-with-stderr-and-exitcode->list cmd . params) + ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) +;; (handle-exceptions +;; exn +;; (begin +;; (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) +;; (print " " ((condition-property-accessor 'exn 'message) exn)) +;; #f) + (let-values (((fh fho pid fhe) (if (null? params) + (process* cmd) + (process* cmd params)))) + (let loop ((curr (read-line fh)) + (result '())) + (let ((errstr (process:conservative-read fhe))) + (if (not (string=? errstr "")) + (set! result (append result (list errstr))))) + (if (not (eof-object? curr)) + (loop (read-line fh) + (append result (list curr))) + (begin + (let-values (((anotherpid normalexit? exitstatus) (process-wait pid))) + (close-input-port fh) + (close-input-port fhe) + (close-output-port fho) + (list result (if normalexit? exitstatus -1)))))))) + +(define (process:cmd-run-proc-each-line cmd proc . params) + ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) + (handle-exceptions + exn + (begin + (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 5 *default-log-port* "exn=" (condition->list exn)) + #f) + (let-values (((fh fho pid) (if (null? params) + (process cmd) + (process cmd params)))) + (let loop ((curr (read-line fh)) + (result '())) + (if (not (eof-object? curr)) + (loop (read-line fh) + (append result (list (proc curr)))) + (begin + (close-input-port fh) + ;;(close-input-port fhe) + (close-output-port fho) + result)))))) + +(define (process:cmd-run-proc-each-line-alt cmd proc) + (let* ((fh (open-input-pipe cmd)) + (res (port-proc->list fh proc)) + (status (close-input-pipe fh))) + (if (eq? status 0) res #f))) + +(define (process:cmd-run->list cmd #!key (delta-env-alist-or-hash-table '())) + (common:with-env-vars + delta-env-alist-or-hash-table + (lambda () + (let* ((fh (open-input-pipe cmd)) + (res (port->list fh)) + (status (close-input-pipe fh))) + (list res status))))) + +(define (port->list fh) + (if (eof-object? fh) #f + (let loop ((curr (read-line fh)) + (result '())) + (if (not (eof-object? curr)) + (loop (read-line fh) + (append result (list curr))) + result)))) + +(define (port-proc->list fh proc) + (if (eof-object? fh) #f + (let loop ((curr (proc (read-line fh))) + (result '())) + (if (not (eof-object? curr)) + (loop (let ((l (read-line fh))) + (if (eof-object? l) l (proc l))) + (append result (list curr))) + result)))) + +;; here is an example line where the shell is sh or bash +;; "find / -print 2&>1 > findall.log" +(define (run-n-wait cmdline #!key (params #f)(print-cmd #f)(run-dir #f)) + (if print-cmd + (debug:print 0 *default-log-port* + (if (string? print-cmd) + print-cmd + "") + (if run-dir (conc "Run in " run-dir ";") "") + cmdline + (if params + (conc " " (string-intersperse params " ")) + ""))) + (if (and run-dir + (directory-exists? run-dir)) + (push-directory run-dir)) + (let ((pid (if params + (process-run cmdline params) + (process-run cmdline)))) + (let loop ((i 0)) + (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) + (if (eq? pid-val 0) + (begin + (thread-sleep! 2) + (loop (+ i 1))) + (begin + (if (and run-dir + (directory-exists? run-dir)) + (pop-directory)) + (values pid-val exit-status exit-code))))))) + +;;====================================================================== +;; MISC PROCESS RELATED STUFF +;;====================================================================== + +(define (process:children proc) + (with-input-from-pipe + (conc "ps h --ppid " (current-process-id) " -o pid") + (lambda () + (let loop ((inl (read-line)) + (res '())) + (if (eof-object? inl) + (reverse res) + (let ((pid (string->number inl))) + (if proc (proc pid)) + (loop (read-line) (cons pid res)))))))) + +(define (process:alive? pid) + (handle-exceptions + exn + ;; possibly pid is a process not a child, look in /proc to see if it is running still + (common:file-exists? (conc "/proc/" pid)) + (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) + (and (number? rpid) + (equal? rpid pid))))) + +(define (process:alive-on-host? host pid) + (let ((cmd (conc "ssh " host " ps -o pid= -p " pid))) + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "failed to identify if process " pid ", on host " host " is alive. exn=" exn) + #f) ;; anything goes wrong - assume the process in NOT running. + (with-input-from-pipe + cmd + (lambda () + (let loop ((inl (read-line))) + (if (eof-object? inl) + #f + (let* ((clean-str (string-substitute "^[^\\d]*([0-9]+)[^\\d]*$" "\\1" inl)) + (innum (string->number clean-str))) + (and innum + (eq? pid innum)))))))))) + +(define (process:get-sub-pids pid) + (with-input-from-pipe + (conc "pstree -A -p " pid) ;; | tr 'a-z\\-+`()\\.' ' ' " pid) + (lambda () + (let loop ((inl (read-line)) + (res '())) + (if (eof-object? inl) + (reverse res) + (let ((nums (map string->number + (string-split-fields "\\d+" inl)))) + (loop (read-line) + (append res nums)))))))) + +;;====================================================================== +;; lookup routines - replicated from configf +;;====================================================================== + +(define (config:assoc-safe-add alist key val #!key (metadata #f)) + (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist))) + (append newalist (list (if metadata + (list key val metadata) + (list key val)))))) + +(define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f)) + (hash-table-set! cfgdat section-name + (config:assoc-safe-add + (hash-table-ref/default cfgdat section-name '()) + var value metadata: metadata))) + +(define (configf:lookup cfgdat section var) + (if (hash-table? cfgdat) + (let ((sectdat (hash-table-ref/default cfgdat section '()))) + (if (null? sectdat) + #f + (let ((match (assoc var sectdat))) + (if match ;; (and match (list? match)(> (length match) 1)) + (cadr match) + #f)) + )) + #f)) + +;; use to have definitive setting: +;; [foo] +;; var yes +;; +;; (configf:var-is? cfgdat "foo" "var" "yes") => #t +;; +(define (configf:var-is? cfgdat section var expected-val) + (equal? (configf:lookup cfgdat section var) expected-val)) + +(define config-lookup configf:lookup) + +;; safely look up a value that is expected to be a number, return +;; a default (#f unless provided) +;; +(define (configf:lookup-number cfdat section varname #!key (default #f)) + (let* ((val (configf:lookup *configdat* section varname)) + (res (if val + (string->number (string-substitute "\\s+" "" val #t)) + #f))) + (cond + (res res) + (val (debug:print 0 *default-log-port* "ERROR: no number found for [" section "], " varname ", got: " val)) + (else default)))) + +(define (configf:section-vars cfgdat section) + (let ((sectdat (hash-table-ref/default cfgdat section '()))) + (if (null? sectdat) + '() + (map car sectdat)))) + +(define (configf:get-section cfgdat section) + (hash-table-ref/default cfgdat section '())) + +(define (configf:set-section-var cfgdat section var val) + (let ((sectdat (configf:get-section cfgdat section))) + (hash-table-set! cfgdat section + (config:assoc-safe-add sectdat var val)))) + + ;;(append (filter (lambda (x)(not (assoc var sectdat))) sectdat) + ;; (list var val)))) + +;;====================================================================== +;; stuff from tests.scm +;;====================================================================== + +;; given a list of itemmaps (testname . map), return the first match +;; +(define (tests:lookup-itemmap itemmaps testname) + (let ((best-matches (filter (lambda (itemmap) + (tests:match (car itemmap) testname #f)) + itemmaps))) + (if (null? best-matches) + #f + (let ((res (car best-matches))) + ;; (debug:print 0 *default-log-port* "res=" res) + (cond + ((string? res) res) ;;; FIX THE ROOT CAUSE HERE .... + ((null? res) #f) + ((string? (cdr res)) (cdr res)) ;; it is a pair + ((string? (cadr res))(cadr res)) ;; it is a list + (else cadr res)))))) + +;;====================================================================== +;; stuff from tasks.scm +;;====================================================================== + +;; no elegance here ... +;; +(define (tasks:kill-server hostname pid #!key (kill-switch "")) + (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) + (setenv "TARGETHOST" hostname) + (let* ((logdir (if (directory-exists? "logs") + "logs/" + "")) + (logfile (if logdir (conc "logs/server-"pid"-"hostname".log") #f)) + (gzfile (if logfile (conc logfile ".gz")))) + (setenv "TARGETHOST_LOGF" (conc logdir "server-kills.log")) + + (system (conc "nbfake kill "kill-switch" "pid)) + + (when logfile + (thread-sleep! 0.5) + (if (common:file-exists? gzfile) (delete-file gzfile)) + (system (conc "gzip " logfile)) + + (unsetenv "TARGETHOST_LOGF") + (unsetenv "TARGETHOST")))) + ) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -49,22 +49,10 @@ (let ((remcwd (take dir (- (length dir) 1)))) (if (null? remcwd) (list #f #f #f) ;; #f #f) (loop remcwd))))))))) -(define (config:assoc-safe-add alist key val #!key (metadata #f)) - (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist))) - (append newalist (list (if metadata - (list key val metadata) - (list key val)))))) - -(define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f)) - (hash-table-set! cfgdat section-name - (config:assoc-safe-add - (hash-table-ref/default cfgdat section-name '()) - var value metadata: metadata))) - (define (config:eval-string-in-environment str) ;; (if (or (string-null? str) ;; (equal? "!" (substring str 0 1))) ;; null string or starts with ! are preserved but NOT set in the environment str (handle-exceptions @@ -505,10 +493,26 @@ (if (and toppath pathenvvar)(setenv pathenvvar toppath)) (let ((configdat (if configfile (read-config configfile #f #t environ-patt: environ-patt post-section-procs: (list (cons "^fields$" set-fields)) #f)))) (if toppath (change-directory curr-dir)) (list configdat toppath configfile fname)))) + +;;====================================================================== +;; lookup and manipulation routines +;;====================================================================== + +(define (config:assoc-safe-add alist key val #!key (metadata #f)) + (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist))) + (append newalist (list (if metadata + (list key val metadata) + (list key val)))))) + +(define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f)) + (hash-table-set! cfgdat section-name + (config:assoc-safe-add + (hash-table-ref/default cfgdat section-name '()) + var value metadata: metadata))) (define (configf:lookup cfgdat section var) (if (hash-table? cfgdat) (let ((sectdat (hash-table-ref/default cfgdat section '()))) (if (null? sectdat) @@ -559,10 +563,14 @@ (hash-table-set! cfgdat section (config:assoc-safe-add sectdat var val)))) ;;(append (filter (lambda (x)(not (assoc var sectdat))) sectdat) ;; (list var val)))) + +;;====================================================================== +;; setup +;;====================================================================== (define (setup) (let* ((configf (find-config "megatest.config")) (config (if configf (read-config configf #f #t) #f))) (if config Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -23,11 +23,12 @@ ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) -(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest + base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) (declare (unit db)) (declare (uses common)) @@ -44,5 +45,370 @@ (include "common_records.scm") (include "db_records.scm") (include "key_records.scm") (include "run_records.scm") + +;; MUST RESOLVE mt:process-triggers before these can move to dbmod. + +;; set tests with state currstate and status currstatus to newstate and newstatus +;; use currstate = #f and or currstatus = #f to apply to any state or status respectively +;; WARNING: SQL injection risk. NB// See new but not yet used "faster" version below +;; +;; AND NOT (item_path='' AND testname in (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));"))) +;; (debug:print 0 *default-log-port* "QRY: " qry) +;; (db:delay-if-busy) +;; +;; NB// This call only operates on toplevel tests. Consider replacing it with more general call +;; +(define (db:set-tests-state-status dbstruct run-id testnames currstate currstatus newstate newstatus) + (let ((test-ids '())) + (for-each + (lambda (testname) + (let ((qry (conc "UPDATE tests SET state=?,status=? WHERE " + (if currstate (conc "state='" currstate "' AND ") "") + (if currstatus (conc "status='" currstatus "' AND ") "") + " run_id=? AND testname LIKE ?;")) + (test-id (db:get-test-id dbstruct run-id testname ""))) + (db:with-db + dbstruct + run-id + #t + (lambda (db) + (sqlite3:execute db qry + (or newstate currstate "NOT_STARTED") + (or newstatus currstate "UNKNOWN") + run-id testname))) + (if test-id + (begin + (set! test-ids (cons test-id test-ids)) + (mt:process-triggers dbstruct run-id test-id newstate newstatus))))) + testnames) + test-ids)) + +;; state is the priority rollup of all states +;; status is the priority rollup of all completed statesfu +;; +;; if test-name is an integer work off that instead of test-name test-path +;; +(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) + ;; establish info on incoming test followed by info on top level test + ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met + (let* ((testdat (if (number? test-name) + (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id + (db:get-test-info dbstruct run-id test-name item-path))) + (test-id (db:test-get-id testdat)) + (test-name (if (number? test-name) + (db:test-get-testname testdat) + test-name)) + (item-path (db:test-get-item-path testdat)) + (tl-testdat (db:get-test-info dbstruct run-id test-name "")) + (tl-test-id (if tl-testdat + (db:test-get-id tl-testdat) + #f))) + (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) + (db:general-call dbstruct 'set-test-start-time (list test-id))) + (mutex-lock! *db-transaction-mutex*) + (db:with-db + dbstruct #f #f + (lambda (db) + (let ((tr-res + (sqlite3:with-transaction + db + (lambda () + ;; NB// Pass the db so it is part fo the transaction + (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status + (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item + (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test + (state-stauses (db:roll-up-rules state-status-counts state status)) + (newstate (car state-stauses)) + (newstatus (cadr state-stauses))) + (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " + (apply conc + (map (lambda (x) + (conc + (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) + state-status-counts))); end debug:print + + (if tl-test-id + (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct + )))))) + (mutex-unlock! *db-transaction-mutex*) + (if (and test-id state status (equal? status "AUTO")) + (db:test-data-rollup dbstruct run-id test-id status)) + tr-res))))) + +;; ;; speed up for common cases with a little logic +;; ;; NB// Ultimately this will be deprecated in deference to mt:test-set-state-status-by-id +;; +;; NOTE: run-id is not used +;; ;; +(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment) + (db:with-db + dbstruct + ;; run-id + #f + #t + (lambda (db) + (cond + ((and newstate newstatus newcomment) + (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment) + test-id)) + ((and newstate newstatus) + (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id)) + (else + (if newstate (sqlite3:execute db "UPDATE tests SET state=? WHERE id=?;" newstate test-id)) + (if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) + (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) + test-id)))))) + (mt:process-triggers dbstruct run-id test-id newstate newstatus)) + +(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) + (let* ((incompleted '()) + (oldlaunched '()) + (toplevels '()) + ;; The default running-deadtime is 720 seconds = 12 minutes. + ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) + (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) + (server-start-allowance 200) + (server-overloaded-budget 200) + (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30)) + (launch-monitor-on-time-budget 30) + (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) + (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) + (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) + (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) + (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) + ) + (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) + (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) + + (db:with-db + dbstruct #f #f + (lambda (db) + (let* ((stmth1 (db:get-cache-stmth + dbstruct db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) + AND state IN ('RUNNING');")) + (stmth2 (db:get-cache-stmth + dbstruct db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) + AND state IN ('REMOTEHOSTSTART');")) + (stmth3 (db:get-cache-stmth + dbstruct db + "SELECT id,rundir,uname,testname,item_path FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 + AND state IN ('LAUNCHED');"))) + ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes + ;; + ;; HOWEVER: this code in run:test seems to work fine + ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) + ;; (db:test-get-run_duration testdat))) + ;; 600) + ;; (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" + test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) + " event-time="event-time" run-duration="run-duration)))) + stmth1 + run-id running-deadtime) ;; default time 720 seconds + + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id + " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time + " run-duration="run-duration) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) + stmth2 + run-id remotehoststart-deadtime) ;; default time 230 seconds + + ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config + ;; + ;; (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id + " 1 day since event_time marked") + (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) + stmth3 + run-id) + + (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " + (length toplevels) " old LAUNCHED toplevel tests and " + (length incompleted) " tests marked RUNNING but apparently dead.")) + + ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. + ;; + ;; (db:delay-if-busy dbdat) + (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all + (all-ids (append min-incompleted-ids (map car oldlaunched)))) + (if (> (length all-ids) 0) + (begin + ;; (launch:is-test-alive "localhost" 435) + (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") + " as DEAD") + (for-each + (lambda (test-id) + (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) + (tinfo (db:get-test-info-by-id dbstruct run-id test-id)) + (run-dir (db:test-get-rundir tinfo)) + (host (db:test-get-host tinfo)) + (pid (db:test-get-process_id tinfo)) + (result (db:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (db:set-state-status-and-roll-up-items + dbstruct run-id test-id 'foo "COMPLETED" "PASS" + "Test stopped responding but it has PASSED; marking it PASS in the DB.")) + (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. + (launch:is-test-alive host pid)))) + (if is-alive + (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host + " has a process on pid " pid ", NOT setting to DEAD.") + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id + " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items + dbstruct run-id test-id 'foo "COMPLETED" "DEAD" + "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) + ;; call end of eud of run detection for posthook - from merge, is it needed? + ;; (launch:end-of-run-check run-id) + all-ids) + + ;; MOVE TO rmt:find-and-mark-incomplete - for now always call launch:end-of-run-check after + ;; calling rmt:find-and-mark-incompletes + + ;;ALWAYS CALL after rmt:find-and-mark-incompletes + ;; call end of eud of run detection for posthook + ;; (launch:end-of-run-check run-id) + + ))))))) + +(define (db:test-set-state-status-process-triggers dbstruct run-id test-id newstate newstatus newcomment) + (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment) + (mt:process-triggers dbstruct run-id test-id newstate newstatus)) + +;; ; This is to be the big daddy call NOPE: Replaced by db:set-state-status-and-roll-up-items +;; ; +(define (db:test-set-state-status dbstruct run-id test-id state status msg) + (let ((dbdat (db:get-db dbstruct run-id))) + (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) + (db:general-call dbdat 'set-test-start-time (list test-id))) + ;; (if msg + ;; (db:general-call dbdat 'state-status-msg (list state status msg test-id)) + ;; (db:general-call dbdat 'state-status (list state status test-id))) + (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg) + ;; process the test_data table + (if (and test-id state status (equal? status "AUTO")) + (db:test-data-rollup dbstruct run-id test-id status)) + (mt:process-triggers dbstruct run-id test-id state status))) + +;; options: +;; +;; 'killservers - kills all servers +;; 'dejunk - removes junk records +;; 'adj-testids - move test-ids into correct ranges +;; 'old2new - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db +;; 'new2old - sync /tmp/.../megatest.db to megatest.db and /tmp/.../megatest_ref.db (and update data_synced) +;; 'closeall - close all opened dbs +;; 'schema - attempt to apply schema changes +;; run-ids: '(1 2 3 ...) or #f (for all) +;; +(define (db:multi-db-sync dbstruct . options) + ;; (if (not (launch:setup)) + ;; (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") + (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) + (tmpdb (db:get-db dbstruct)) + (refndb (dbr:dbstruct-refndb dbstruct)) + (allow-cleanup #t) ;; (if run-ids #f #t)) + (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) + (data-synced 0)) ;; count of changed records (I hope) + + (for-each + (lambda (option) + + (case option + ;; kill servers + ((killservers) + (for-each + (lambda (server) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " server ", exn=" exn) + #f) + (match-let (((mod-time host port start-time server-id pid) server)) + (if (and host pid) + (tasks:kill-server host pid))))) + servers) + + ;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock + (delete-file* (common:get-sync-lock-filepath)) + ) + + ;; clear out junk records + ;; + ((dejunk) + ;; (db:delay-if-busy mtdb) ;; ok to delay on mtdb + (when (file-write-access? (db:dbdat-get-path mtdb)) (db:clean-up mtdb)) + (db:clean-up tmpdb) + (db:clean-up refndb)) + + ;; sync runs, test_meta etc. + ;; + ((old2new) + (set! data-synced + (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb) + data-synced))) + + ;; now ensure all newdb data are synced to megatest.db + ;; do not use the run-ids list passed in to the function + ;; + ((new2old) + (set! data-synced + (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb) + data-synced))) + + ((adj-target) + (db:adj-target (db:dbdat-get-db mtdb)) + (db:adj-target (db:dbdat-get-db tmpdb)) + (db:adj-target (db:dbdat-get-db refndb))) + + ((schema) + (db:patch-schema-maindb (db:dbdat-get-db mtdb)) + (db:patch-schema-maindb (db:dbdat-get-db tmpdb)) + (db:patch-schema-maindb (db:dbdat-get-db refndb)) + (db:patch-schema-rundb (db:dbdat-get-db mtdb)) + (db:patch-schema-rundb (db:dbdat-get-db tmpdb)) + (db:patch-schema-rundb (db:dbdat-get-db refndb)))) + + (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) + options) + data-synced)) + +(define (db:get-access-mode) + (if (args:get-arg "-use-db-cache") 'cached 'rmt)) + Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -26,22 +26,38 @@ * (import commonmod) (import ods) -(import scheme chicken data-structures extras) -(import (prefix sqlite3 sqlite3:) - posix typed-records srfi-18 srfi-69 - stack - files - srfi-13 - srfi-1 - ) +(import scheme chicken data-structures extras ports) +(import + (prefix base64 base64:) + (prefix sqlite3 sqlite3:) + csv + csv-xml + call-with-environment-variables + directory-utils + files + matchable + md5 + posix + typed-records + srfi-18 + srfi-69 + regex + s11n + srfi-1 + srfi-13 + stack + z3 + ) ;;====================================================================== ;; R E C O R D S ;;====================================================================== + +(include "db_records.scm") ;; each db entry is a pair ( db . dbfilepath ) ;; I propose this record evolves into the area record ;; (defstruct dbr:dbstruct @@ -50,10 +66,11 @@ (mtdb #f) (refndb #f) (homehost #f) ;; not used yet (on-homehost #f) ;; not used yet (read-only #f) + (locdbs #f) (stmt-cache (make-hash-table)) ) ;; goal is to converge on one struct for an area but for now it is too confusing ;; record for keeping state,status and count for doing roll-ups in @@ -1088,13 +1105,13 @@ (cond (*dbstruct-db* *dbstruct-db*);; TODO: when multiple areas are supported, this optimization will be a hazard (else ;;(common:on-homehost?) (debug:print-info 13 *default-log-port* "db:setup entered (first time, not cached.)") (let* ((dbstruct (make-dbr:dbstruct))) - (when (not *toppath*) - (debug:print-info 13 *default-log-port* "in db:setup, *toppath* not set; calling launch:setup") - (launch:setup areapath: areapath)) + (assert *toppath* "ERROR: db:setup called without first calling launch:setup.") + ;;(debug:print-info 13 *default-log-port* "in db:setup, *toppath* not set; calling launch:setup") + ;; (launch:setup areapath: areapath)) (common:get-db-tmp-area) (debug:print-info 13 *default-log-port* "Begin db:open-db") (db:open-db dbstruct areapath: areapath do-sync: do-sync) (debug:print-info 13 *default-log-port* "Done db:open-db") (set! *dbstruct-db* dbstruct) @@ -1129,11 +1146,11 @@ exn (begin (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn) (thread-sleep! 3) (sqlite3:interrupt! db) - (db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1))) + (db:safely-close-sqlite3-db db stmt-cache try-num: (- try-num 1))) (if (sqlite3:database? db) (let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f)))) (if stmts (map sqlite3:finalize! (hash-table-values stmts))) (sqlite3:finalize! db) #t) @@ -1298,13 +1315,10 @@ (set! field-num (+ field-num 1)))) fields))) (define *global-db-store* (make-hash-table)) -(define (db:get-access-mode) - (if (args:get-arg "-use-db-cache") 'cached 'rmt)) - ;; Add db direct ;; (define (db:dispatch-query access-mode rmt-cmd db-cmd . params) (if (eq? access-mode 'cached) (debug:print 2 *default-log-port* "not doing cached calls right now")) @@ -1313,14 +1327,15 @@ ;;) ;; return the target db handle so it can be used ;; (define (db:cache-for-read-only source target #!key (use-last-update #f)) + (assert *toppath* "ERROR: db:cache-for-read-only called without calling launch:setup first.") (if (and (hash-table-ref/default *global-db-store* target #f) (>= (file-modification-time target)(file-modification-time source))) (hash-table-ref *global-db-store* target) - (let* ((toppath (launch:setup)) + (let* ((toppath *toppath*) ;; (launch:setup)) (targ-db-last-mod (if (common:file-exists? target) (file-modification-time target) 0)) (cache-db (or (hash-table-ref/default *global-db-store* target #f) (db:open-megatest-db path: target))) @@ -1363,93 +1378,10 @@ ;; use-last-update: #t))) ;; (thread-start! th1) ;; (apply proc cache-db params) ;; )))) -;; options: -;; -;; 'killservers - kills all servers -;; 'dejunk - removes junk records -;; 'adj-testids - move test-ids into correct ranges -;; 'old2new - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db -;; 'new2old - sync /tmp/.../megatest.db to megatest.db and /tmp/.../megatest_ref.db (and update data_synced) -;; 'closeall - close all opened dbs -;; 'schema - attempt to apply schema changes -;; run-ids: '(1 2 3 ...) or #f (for all) -;; -(define (db:multi-db-sync dbstruct . options) - ;; (if (not (launch:setup)) - ;; (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") - (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) - (tmpdb (db:get-db dbstruct)) - (refndb (dbr:dbstruct-refndb dbstruct)) - (allow-cleanup #t) ;; (if run-ids #f #t)) - (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) - (data-synced 0)) ;; count of changed records (I hope) - - (for-each - (lambda (option) - - (case option - ;; kill servers - ((killservers) - (for-each - (lambda (server) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) - #f) - (match-let (((mod-time host port start-time server-id pid) server)) - (if (and host pid) - (tasks:kill-server host pid))))) - servers) - - ;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock - (delete-file* (common:get-sync-lock-filepath)) - ) - - ;; clear out junk records - ;; - ((dejunk) - ;; (db:delay-if-busy mtdb) ;; ok to delay on mtdb - (when (file-write-access? (db:dbdat-get-path mtdb)) (db:clean-up mtdb)) - (db:clean-up tmpdb) - (db:clean-up refndb)) - - ;; sync runs, test_meta etc. - ;; - ((old2new) - (set! data-synced - (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb) - data-synced))) - - ;; now ensure all newdb data are synced to megatest.db - ;; do not use the run-ids list passed in to the function - ;; - ((new2old) - (set! data-synced - (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb) - data-synced))) - - ((adj-target) - (db:adj-target (db:dbdat-get-db mtdb)) - (db:adj-target (db:dbdat-get-db tmpdb)) - (db:adj-target (db:dbdat-get-db refndb))) - - ((schema) - (db:patch-schema-maindb (db:dbdat-get-db mtdb)) - (db:patch-schema-maindb (db:dbdat-get-db tmpdb)) - (db:patch-schema-maindb (db:dbdat-get-db refndb)) - (db:patch-schema-rundb (db:dbdat-get-db mtdb)) - (db:patch-schema-rundb (db:dbdat-get-db tmpdb)) - (db:patch-schema-rundb (db:dbdat-get-db refndb)))) - - (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) - options) - data-synced)) - (define (db:tmp->megatest.db-sync dbstruct last-update) (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (res (db:sync-tables (db:sync-all-tables-list dbstruct) last-update tmpdb refndb mtdb))) @@ -1668,11 +1600,11 @@ ;;====================================================================== ;; L O G G I N G D B ;;====================================================================== -(define (open-logging-db) +#;(define (open-logging-db) (let* ((dbpath (conc (if *toppath* (conc *toppath* "/") "") "logging.db")) ;; fname) (dbexists (common:file-exists? dbpath)) (db (sqlite3:open-database dbpath)) (handler (sqlite3:make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) @@ -1683,15 +1615,15 @@ (sqlite3:execute db "CREATE TABLE IF NOT EXISTS log (id INTEGER PRIMARY KEY,event_time TIMESTAMP DEFAULT (strftime('%s','now')),logline TEXT,pwd TEXT,cmdline TEXT,pid INTEGER);") (db:set-sync db) ;; (sqlite3:execute db (conc "PRAGMA synchronous = 0;")) )) db)) -(define (db:log-local-event . loglst) +#;(define (db:log-local-event . loglst) (let ((logline (apply conc loglst))) (db:log-event logline))) -(define (db:log-event logline) +#;(define (db:log-event logline) (let ((db (open-logging-db))) (sqlite3:execute db "INSERT INTO log (logline,pwd,cmdline,pid) VALUES (?,?,?,?);" logline (current-directory) (string-intersperse (argv) " ") @@ -1777,149 +1709,10 @@ ;; select end_time-now from ;; (select testname,item_path,event_time+run_duration as ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); -(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) - (let* ((incompleted '()) - (oldlaunched '()) - (toplevels '()) - ;; The default running-deadtime is 720 seconds = 12 minutes. - ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) - (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) - (server-start-allowance 200) - (server-overloaded-budget 200) - (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30)) - (launch-monitor-on-time-budget 30) - (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) - (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) - (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) - (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) - (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) - ) - (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) - (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) - - (db:with-db - dbstruct #f #f - (lambda (db) - (let* ((stmth1 (db:get-cache-stmth - dbstruct db - "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests - WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) - AND state IN ('RUNNING');")) - (stmth2 (db:get-cache-stmth - dbstruct db - "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests - WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) - AND state IN ('REMOTEHOSTSTART');")) - (stmth3 (db:get-cache-stmth - dbstruct db - "SELECT id,rundir,uname,testname,item_path FROM tests - WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 - AND state IN ('LAUNCHED');"))) - ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes - ;; - ;; HOWEVER: this code in run:test seems to work fine - ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) - ;; (db:test-get-run_duration testdat))) - ;; 600) - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) - (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" - test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) - " event-time="event-time" run-duration="run-duration)))) - stmth1 - run-id running-deadtime) ;; default time 720 seconds - - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id - " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time - " run-duration="run-duration) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) - stmth2 - run-id remotehoststart-deadtime) ;; default time 230 seconds - - ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config - ;; - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id - " 1 day since event_time marked") - (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) - stmth3 - run-id) - - (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " - (length toplevels) " old LAUNCHED toplevel tests and " - (length incompleted) " tests marked RUNNING but apparently dead.")) - - ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. - ;; - ;; (db:delay-if-busy dbdat) - (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all - (all-ids (append min-incompleted-ids (map car oldlaunched)))) - (if (> (length all-ids) 0) - (begin - ;; (launch:is-test-alive "localhost" 435) - (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") - " as DEAD") - (for-each - (lambda (test-id) - (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) - (tinfo (db:get-test-info-by-id dbstruct run-id test-id)) - (run-dir (db:test-get-rundir tinfo)) - (host (db:test-get-host tinfo)) - (pid (db:test-get-process_id tinfo)) - (result (db:get-status-from-final-status-file run-dir))) - (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") - (db:set-state-status-and-roll-up-items - dbstruct run-id test-id 'foo "COMPLETED" "PASS" - "Test stopped responding but it has PASSED; marking it PASS in the DB.")) - (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. - (launch:is-test-alive host pid)))) - (if is-alive - (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host - " has a process on pid " pid ", NOT setting to DEAD.") - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id - " final state/status is not COMPLETED/PASS. It is " result) - (db:set-state-status-and-roll-up-items - dbstruct run-id test-id 'foo "COMPLETED" "DEAD" - "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) - ;; call end of eud of run detection for posthook - from merge, is it needed? - ;; (launch:end-of-run-check run-id) - all-ids) - ;;call end of eud of run detection for posthook - (launch:end-of-run-check run-id) - ))))))) - ;; BUG: Probably broken - does not explicitly use run-id in the query ;; (define (db:top-test-set-per-pf-counts dbstruct run-id test-name) (db:general-call dbstruct 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name))) @@ -2892,11 +2685,11 @@ ;; Get run-ids for runs with same target but different runnames and NOT run-id ;; (define (db:get-prev-run-ids dbstruct run-id) (let* ((keyvals (db:get-key-val-pairs dbstruct run-id)) (kvalues (map cadr keyvals)) - (keys (rmt:get-keys)) + (keys (db:get-keys)) (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND "))) (let ((prev-run-ids '())) (if (null? keyvals) '() (begin @@ -3134,70 +2927,10 @@ (lambda () (sqlite3:execute db "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE state='DELETED' AND event_timetest-data dbstruct run-id test-id csvdata) (debug:print 4 *default-log-port* "test-id " test-id ", csvdata: " csvdata) (db:with-db dbstruct #f #f @@ -3935,77 +3672,10 @@ (print-call-chain (current-error-port)) msg))) ;; crude reply for when things go awry ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize)))) (else msg))) ;; rpc -;; ; This is to be the big daddy call NOPE: Replaced by db:set-state-status-and-roll-up-items -;; ; -;; define (db:test-set-state-status dbstruct run-id test-id state status msg) -;; (let ((dbdat (db:get-db dbstruct run-id))) -;; (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) -;; (db:general-call dbdat 'set-test-start-time (list test-id))) -;; ;; (if msg -;; ;; (db:general-call dbdat 'state-status-msg (list state status msg test-id)) -;; ;; (db:general-call dbdat 'state-status (list state status test-id))) -;; (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg) -;; ;; process the test_data table -;; (if (and test-id state status (equal? status "AUTO")) -;; (db:test-data-rollup dbstruct run-id test-id status)) -;; (mt:process-triggers dbstruct run-id test-id state status))) - -;; state is the priority rollup of all states -;; status is the priority rollup of all completed statesfu -;; -;; if test-name is an integer work off that instead of test-name test-path -;; -(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) - ;; establish info on incoming test followed by info on top level test - ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met - (let* ((testdat (if (number? test-name) - (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id - (db:get-test-info dbstruct run-id test-name item-path))) - (test-id (db:test-get-id testdat)) - (test-name (if (number? test-name) - (db:test-get-testname testdat) - test-name)) - (item-path (db:test-get-item-path testdat)) - (tl-testdat (db:get-test-info dbstruct run-id test-name "")) - (tl-test-id (if tl-testdat - (db:test-get-id tl-testdat) - #f))) - (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) - (db:general-call dbstruct 'set-test-start-time (list test-id))) - (mutex-lock! *db-transaction-mutex*) - (db:with-db - dbstruct #f #f - (lambda (db) - (let ((tr-res - (sqlite3:with-transaction - db - (lambda () - ;; NB// Pass the db so it is part fo the transaction - (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status - (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item - (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test - (state-stauses (db:roll-up-rules state-status-counts state status)) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) - (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " - (apply conc - (map (lambda (x) - (conc - (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) - state-status-counts))); end debug:print - - (if tl-test-id - (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct - )))))) - (mutex-unlock! *db-transaction-mutex*) - (if (and test-id state status (equal? status "AUTO")) - (db:test-data-rollup dbstruct run-id test-id status)) - tr-res))))) - (define (db:roll-up-rules state-status-counts state status) (let* ((running (length (filter (lambda (x) (member (dbr:counts-state x) *common:running-states*)) state-status-counts))) (bad-not-started (length (filter (lambda (x) @@ -4582,75 +4252,10 @@ res)))) ;;====================================================================== ;; M I S C M A N A G E M E N T I T E M S ;;====================================================================== - -;; A routine to map itempaths using a itemmap -;; patha and pathb must be strings or this will fail -;; -;; path-b is waiting on path-a -;; -(define (db:compare-itempaths test-b-name path-a path-b itemmaps ) - (debug:print-info 6 *default-log-port* "ITEMMAPS: " itemmaps) - (let* ((itemmap (tests:lookup-itemmap itemmaps test-b-name))) - (if itemmap - (let ((path-b-mapped (db:multi-pattern-apply path-b itemmap))) - (debug:print-info 6 *default-log-port* "ITEMMAP is " itemmap ", path: " path-b ", mapped path: " path-b-mapped) - (equal? path-a path-b-mapped)) - (equal? path-b path-a)))) - -;; A routine to convert test/itempath using a itemmap -;; NOTE: to process only an itempath (i.e. no prepended testname) -;; just call db:multi-pattern-apply -;; -(define (db:convert-test-itempath path-in itemmap) - (debug:print-info 6 *default-log-port* "ITEMMAP is " itemmap) - (let* ((path-parts (string-split path-in "/")) - (test-name (if (null? path-parts) "" (car path-parts))) - (item-path (string-intersperse (if (null? path-parts) '() (cdr path-parts)) "/"))) - (conc test-name "/" - (db:multi-pattern-apply item-path itemmap)))) - -;; patterns are: -;; "rx1" "replacement1"\n -;; "rx2" "replacement2" -;; etc. -;; -(define (db:multi-pattern-apply item-path itemmap) - (let ((all-patts (string-split itemmap "\n"))) - (if (null? all-patts) - item-path - (let loop ((hed (car all-patts)) - (tal (cdr all-patts)) - (res item-path)) - (let* ((parts (string-split hed)) - (patt (car parts)) - - (repl (if (> (length parts) 1)(cadr parts) "")) - - (newr (if (and patt repl) - (begin - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* - "WARNING: itemmap has problem \"" itemmap "\", patt: " patt ", repl: " repl ", exn=" exn) - res) - (string-substitute patt repl res)) - - - ) - (begin - (debug:print 0 *default-log-port* - "WARNING: itemmap has problem \"" itemmap "\", patt: " patt ", repl: " repl) - res)))) - (if (null? tal) - newr - (loop (car tal)(cdr tal) newr))))))) - - ;; the new prereqs calculation, looks also at itempath if specified ;; all prereqs must be met ;; if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met @@ -4989,8 +4594,51 @@ ;; brutal clean up (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat) (system "rm -rf tempdir"))) ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%") + +;;====================================================================== +;; mt routines that fit moved here +;;====================================================================== + +;;====================================================================== +;; T R I G G E R S +;;====================================================================== + +(define (mt:run-trigger cmd test-id test-rundir trigger logname test-name item-path event-time actual-state actual-status) + ;; Putting the commandline into ( )'s means no control over the shell. + ;; stdout and stderr will be caught in the NBFAKE or mt_launch.log files + ;; or equivalent. No need to do this. Just run it? + (let* ((fullcmd (conc "nbfake " + cmd " " + test-id " " + test-rundir " " + trigger " " + test-name " " + item-path " " ;; has / prepended to deal with toplevel tests + actual-state " " + actual-status " " + event-time + )) + (prev-nbfake-log (get-environment-variable "NBFAKE_LOG"))) + (setenv "NBFAKE_LOG" (conc (cond + ((and (directory-exists? test-rundir) + (file-write-access? test-rundir)) + test-rundir) + ((and (directory-exists? *toppath*) + (file-write-access? *toppath*)) + *toppath*) + (else (conc "/tmp/" (current-user-name)))) + "/" logname)) + (debug:print-info 0 *default-log-port* "TRIGGERED on " trigger ", running command " fullcmd " output at " (get-environment-variable "NBFAKE_LOG")) + ;; (call-with-environment-variables + ;; `(("NBFAKE_LOG" . ,(conc test-rundir "/" logname))) + ;; (lambda () + (process-run fullcmd) + (if prev-nbfake-log + (setenv "NBFAKE_LOG" prev-nbfake-log) + (unsetenv "NBFAKE_LOG")) + )) ;; )) ) Index: key_records.scm ================================================================== --- key_records.scm +++ key_records.scm @@ -16,17 +16,5 @@ ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . ;;====================================================================== -(define-inline (keys->valslots keys) ;; => ?,?,? .... - (string-intersperse (map (lambda (x) "?") keys) ",")) - -;; (define-inline (keys->key/field keys . additional) -;; (string-join (map (lambda (k)(conc k " TEXT")) -;; (append keys additional)) ",")) - -(define-inline (item-list->path itemdat) - (if (list? itemdat) - (string-intersperse (map cadr itemdat) "/") - "")) - Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -772,20 +772,10 @@ (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) -(define (launch:is-test-alive host pid) - (if (and host pid (not (equal? host "n/a"))) - (let* ((cmd (conc "ssh " host " pstree -A " pid)) - (output (with-input-from-pipe cmd read-lines))) - (debug:print 2 *default-log-port* "Running " cmd " received " output) - (if (eq? (length output) 0) - #f - #t)) - #t)) - (define (launch:kill-tests-if-dead run-id) (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) (let loop ((running-test (car running-tests)) (tal (cdr running-tests)) (kill-cnt 0)) Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -136,107 +136,10 @@ (begin (debug:print 0 *default-log-port* "Discarding test " testn "(" test-dat ") due to " failed-test) res) (cons testn res))))))))) -;;====================================================================== -;; T R I G G E R S -;;====================================================================== - -(define (mt:run-trigger cmd test-id test-rundir trigger logname test-name item-path event-time actual-state actual-status) - ;; Putting the commandline into ( )'s means no control over the shell. - ;; stdout and stderr will be caught in the NBFAKE or mt_launch.log files - ;; or equivalent. No need to do this. Just run it? - (let* ((fullcmd (conc "nbfake " - cmd " " - test-id " " - test-rundir " " - trigger " " - test-name " " - item-path " " ;; has / prepended to deal with toplevel tests - actual-state " " - actual-status " " - event-time - )) - (prev-nbfake-log (get-environment-variable "NBFAKE_LOG"))) - (setenv "NBFAKE_LOG" (conc (cond - ((and (directory-exists? test-rundir) - (file-write-access? test-rundir)) - test-rundir) - ((and (directory-exists? *toppath*) - (file-write-access? *toppath*)) - *toppath*) - (else (conc "/tmp/" (current-user-name)))) - "/" logname)) - (debug:print-info 0 *default-log-port* "TRIGGERED on " trigger ", running command " fullcmd " output at " (get-environment-variable "NBFAKE_LOG")) - ;; (call-with-environment-variables - ;; `(("NBFAKE_LOG" . ,(conc test-rundir "/" logname))) - ;; (lambda () - (process-run fullcmd) - (if prev-nbfake-log - (setenv "NBFAKE_LOG" prev-nbfake-log) - (unsetenv "NBFAKE_LOG")) - )) ;; )) - -(define (mt:process-triggers dbstruct run-id test-id newstate newstatus) - (if test-id - (let* ((test-dat (db:get-test-info-by-id dbstruct run-id test-id))) - (if test-dat - (let* ((test-rundir (db:test-get-rundir test-dat)) ;; ) ;; ) - (test-name (db:test-get-testname test-dat)) - (item-path (db:test-get-item-path test-dat)) - (duration (db:test-get-run_duration test-dat)) - (comment (db:test-get-comment test-dat)) - (event-time (db:test-get-event_time test-dat)) - (tconfig #f) - (state (if newstate newstate (db:test-get-state test-dat))) - (status (if newstatus newstatus (db:test-get-status test-dat)))) - ;; (mutex-lock! *triggers-mutex*) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus - "\n error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn - "\n test-rundir="test-rundir - "\n test-name="test-name - "\n item-path="item-path - "\n state="state - "\n status="status - "\n") - (print-call-chain (current-error-port)) - #f) - (if (and test-name - test-rundir) ;; #f means no dir set yet - ;; (common:file-exists? test-rundir) - ;; (directory? test-rundir)) - (call-with-environment-variables - (list (cons "MT_TEST_NAME" (or test-name "no such test")) - (cons "MT_TEST_RUN_DIR" (or test-rundir "no test directory yet")) - (cons "MT_ITEMPATH" (or item-path ""))) - (lambda () - (if (directory-exists? test-rundir) - (push-directory test-rundir) - (push-directory *toppath*)) - (set! tconfig (mt:lazy-read-test-config test-name)) - (for-each (lambda (trigger) - (let* ((munged-trigger (string-translate trigger "/ " "--")) - (logname (conc "last-trigger-" munged-trigger ".log"))) - ;; first any triggers from the testconfig - (let ((cmd (configf:lookup tconfig "triggers" trigger))) - (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "tconfig-" logname) test-name item-path event-time state status))) - ;; next any triggers from megatest.config - (let ((cmd (configf:lookup *configdat* "triggers" trigger))) - (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "mtconfig-" logname) test-name item-path event-time state status))))) - (list - (conc state "/" status) - (conc state "/") - (conc "/" status))) - (pop-directory)) - ))) - ;; (mutex-unlock! *triggers-mutex*) - ))))) - ;;====================================================================== ;; S T A T E A N D S T A T U S F O R T E S T S ;;====================================================================== ;; speed up for common cases with a little logic @@ -303,6 +206,66 @@ (if (null? tal) (begin (debug:print-error 0 *default-log-port* "No readable testconfig found for " test-name) #f) (loop (car tal)(cdr tal)))))))))) + +;; cannot move to dbmod until lazy-read-testconfig is unravelled. +;; +(define (mt:process-triggers dbstruct run-id test-id newstate newstatus) + (if test-id + (let* ((test-dat (db:get-test-info-by-id dbstruct run-id test-id))) + (if test-dat + (let* ((test-rundir (db:test-get-rundir test-dat)) ;; ) ;; ) + (test-name (db:test-get-testname test-dat)) + (item-path (db:test-get-item-path test-dat)) + (duration (db:test-get-run_duration test-dat)) + (comment (db:test-get-comment test-dat)) + (event-time (db:test-get-event_time test-dat)) + (tconfig #f) + (state (if newstate newstate (db:test-get-state test-dat))) + (status (if newstatus newstatus (db:test-get-status test-dat)))) + ;; (mutex-lock! *triggers-mutex*) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus + "\n error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn + "\n test-rundir="test-rundir + "\n test-name="test-name + "\n item-path="item-path + "\n state="state + "\n status="status + "\n") + (print-call-chain (current-error-port)) + #f) + (if (and test-name + test-rundir) ;; #f means no dir set yet + ;; (common:file-exists? test-rundir) + ;; (directory? test-rundir)) + (call-with-environment-variables + (list (cons "MT_TEST_NAME" (or test-name "no such test")) + (cons "MT_TEST_RUN_DIR" (or test-rundir "no test directory yet")) + (cons "MT_ITEMPATH" (or item-path ""))) + (lambda () + (if (directory-exists? test-rundir) + (push-directory test-rundir) + (push-directory *toppath*)) + (set! tconfig (mt:lazy-read-test-config test-name)) + (for-each (lambda (trigger) + (let* ((munged-trigger (string-translate trigger "/ " "--")) + (logname (conc "last-trigger-" munged-trigger ".log"))) + ;; first any triggers from the testconfig + (let ((cmd (configf:lookup tconfig "triggers" trigger))) + (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "tconfig-" logname) test-name item-path event-time state status))) + ;; next any triggers from megatest.config + (let ((cmd (configf:lookup *configdat* "triggers" trigger))) + (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "mtconfig-" logname) test-name item-path event-time state status))))) + (list + (conc state "/" status) + (conc state "/") + (conc "/" status))) + (pop-directory)) + ))) + ;; (mutex-unlock! *triggers-mutex*) + ))))) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -26,205 +26,5 @@ (declare (unit process)) (declare (uses commonmod)) (import commonmod) -(define (process:conservative-read port) - (let loop ((res "")) - (if (not (eof-object? (peek-char port))) - (loop (conc res (read-char port))) - res))) - -(define (process:cmd-run-with-stderr->list cmd . params) - ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) -;; (handle-exceptions -;; exn -;; (begin -;; (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) -;; (print " " ((condition-property-accessor 'exn 'message) exn)) -;; #f) - (let-values (((fh fho pid fhe) (if (null? params) - (process* cmd) - (process* cmd params)))) - (let loop ((curr (read-line fh)) - (result '())) - (let ((errstr (process:conservative-read fhe))) - (if (not (string=? errstr "")) - (set! result (append result (list errstr))))) - (if (not (eof-object? curr)) - (loop (read-line fh) - (append result (list curr))) - (begin - (close-input-port fh) - (close-input-port fhe) - (close-output-port fho) - result))))) ;; ) - -(define (process:cmd-run-with-stderr-and-exitcode->list cmd . params) - ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) -;; (handle-exceptions -;; exn -;; (begin -;; (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) -;; (print " " ((condition-property-accessor 'exn 'message) exn)) -;; #f) - (let-values (((fh fho pid fhe) (if (null? params) - (process* cmd) - (process* cmd params)))) - (let loop ((curr (read-line fh)) - (result '())) - (let ((errstr (process:conservative-read fhe))) - (if (not (string=? errstr "")) - (set! result (append result (list errstr))))) - (if (not (eof-object? curr)) - (loop (read-line fh) - (append result (list curr))) - (begin - (let-values (((anotherpid normalexit? exitstatus) (process-wait pid))) - (close-input-port fh) - (close-input-port fhe) - (close-output-port fho) - (list result (if normalexit? exitstatus -1)))))))) - -(define (process:cmd-run-proc-each-line cmd proc . params) - ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) - (handle-exceptions - exn - (begin - (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) - #f) - (let-values (((fh fho pid) (if (null? params) - (process cmd) - (process cmd params)))) - (let loop ((curr (read-line fh)) - (result '())) - (if (not (eof-object? curr)) - (loop (read-line fh) - (append result (list (proc curr)))) - (begin - (close-input-port fh) - ;;(close-input-port fhe) - (close-output-port fho) - result)))))) - -(define (process:cmd-run-proc-each-line-alt cmd proc) - (let* ((fh (open-input-pipe cmd)) - (res (port-proc->list fh proc)) - (status (close-input-pipe fh))) - (if (eq? status 0) res #f))) - -(define (process:cmd-run->list cmd #!key (delta-env-alist-or-hash-table '())) - (common:with-env-vars - delta-env-alist-or-hash-table - (lambda () - (let* ((fh (open-input-pipe cmd)) - (res (port->list fh)) - (status (close-input-pipe fh))) - (list res status))))) - -(define (port->list fh) - (if (eof-object? fh) #f - (let loop ((curr (read-line fh)) - (result '())) - (if (not (eof-object? curr)) - (loop (read-line fh) - (append result (list curr))) - result)))) - -(define (port-proc->list fh proc) - (if (eof-object? fh) #f - (let loop ((curr (proc (read-line fh))) - (result '())) - (if (not (eof-object? curr)) - (loop (let ((l (read-line fh))) - (if (eof-object? l) l (proc l))) - (append result (list curr))) - result)))) - -;; here is an example line where the shell is sh or bash -;; "find / -print 2&>1 > findall.log" -(define (run-n-wait cmdline #!key (params #f)(print-cmd #f)(run-dir #f)) - (if print-cmd - (debug:print 0 *default-log-port* - (if (string? print-cmd) - print-cmd - "") - (if run-dir (conc "Run in " run-dir ";") "") - cmdline - (if params - (conc " " (string-intersperse params " ")) - ""))) - (if (and run-dir - (directory-exists? run-dir)) - (push-directory run-dir)) - (let ((pid (if params - (process-run cmdline params) - (process-run cmdline)))) - (let loop ((i 0)) - (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) - (if (eq? pid-val 0) - (begin - (thread-sleep! 2) - (loop (+ i 1))) - (begin - (if (and run-dir - (directory-exists? run-dir)) - (pop-directory)) - (values pid-val exit-status exit-code))))))) - -;;====================================================================== -;; MISC PROCESS RELATED STUFF -;;====================================================================== - -(define (process:children proc) - (with-input-from-pipe - (conc "ps h --ppid " (current-process-id) " -o pid") - (lambda () - (let loop ((inl (read-line)) - (res '())) - (if (eof-object? inl) - (reverse res) - (let ((pid (string->number inl))) - (if proc (proc pid)) - (loop (read-line) (cons pid res)))))))) - -(define (process:alive? pid) - (handle-exceptions - exn - ;; possibly pid is a process not a child, look in /proc to see if it is running still - (common:file-exists? (conc "/proc/" pid)) - (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) - (and (number? rpid) - (equal? rpid pid))))) - -(define (process:alive-on-host? host pid) - (let ((cmd (conc "ssh " host " ps -o pid= -p " pid))) - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "failed to identify if process " pid ", on host " host " is alive. exn=" exn) - #f) ;; anything goes wrong - assume the process in NOT running. - (with-input-from-pipe - cmd - (lambda () - (let loop ((inl (read-line))) - (if (eof-object? inl) - #f - (let* ((clean-str (string-substitute "^[^\\d]*([0-9]+)[^\\d]*$" "\\1" inl)) - (innum (string->number clean-str))) - (and innum - (eq? pid innum)))))))))) - -(define (process:get-sub-pids pid) - (with-input-from-pipe - (conc "pstree -A -p " pid) ;; | tr 'a-z\\-+`()\\.' ' ' " pid) - (lambda () - (let loop ((inl (read-line)) - (res '())) - (if (eof-object? inl) - (reverse res) - (let ((nums (map string->number - (string-split-fields "\\d+" inl)))) - (loop (read-line) - (append res nums)))))))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -802,11 +802,12 @@ (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) (define (rmt:find-and-mark-incomplete run-id ovr-deadtime) ;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime)) - (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) + (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime)) + ) ;; ) (define (rmt:get-main-run-stats run-id) (rmt:send-receive 'get-main-run-stats #f (list run-id))) (define (rmt:get-var varname) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -807,11 +807,14 @@ (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) - (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) + (rmt:find-and-mark-incomplete run-id #f) + (launch:end-of-run-check run-id) + + ))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) @@ -1906,10 +1909,12 @@ (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -193,32 +193,10 @@ (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:need-server run-id) (equal? (configf:lookup *configdat* "server" "required") "yes")) -;; no elegance here ... -;; -(define (tasks:kill-server hostname pid #!key (kill-switch "")) - (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) - (setenv "TARGETHOST" hostname) - (let* ((logdir (if (directory-exists? "logs") - "logs/" - "")) - (logfile (if logdir (conc "logs/server-"pid"-"hostname".log") #f)) - (gzfile (if logfile (conc logfile ".gz")))) - (setenv "TARGETHOST_LOGF" (conc logdir "server-kills.log")) - - (system (conc "nbfake kill "kill-switch" "pid)) - - (when logfile - (thread-sleep! 0.5) - (if (common:file-exists? gzfile) (delete-file gzfile)) - (system (conc "gzip " logfile)) - - (unsetenv "TARGETHOST_LOGF") - (unsetenv "TARGETHOST")))) - ;;====================================================================== ;; M O N I T O R S ;;====================================================================== Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -243,14 +243,10 @@ (loop (read-line))))) ;; roll up the current results. ;; FIXME: Add the status too (rmt:test-data-rollup run-id test-id #f)) -(define (tdb:get-prev-tol-for-test tdb test-id category variable) - ;; Finish me? - (values #f #f #f)) - ;;====================================================================== ;; S T E P S ;;====================================================================== (define (tdb:step-get-time-as-string vec) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -122,27 +122,10 @@ '()) (if itemmap-table itemmap-table '())))) -;; given a list of itemmaps (testname . map), return the first match -;; -(define (tests:lookup-itemmap itemmaps testname) - (let ((best-matches (filter (lambda (itemmap) - (tests:match (car itemmap) testname #f)) - itemmaps))) - (if (null? best-matches) - #f - (let ((res (car best-matches))) - ;; (debug:print 0 *default-log-port* "res=" res) - (cond - ((string? res) res) ;;; FIX THE ROOT CAUSE HERE .... - ((null? res) #f) - ((string? (cdr res)) (cdr res)) ;; it is a pair - ((string? (cadr res))(cadr res)) ;; it is a list - (else cadr res)))))) - ;; return items given config ;; (define (tests:get-items tconfig) (let ((items (hash-table-ref/default tconfig "items" #f)) ;; items 4 (itemstable (hash-table-ref/default tconfig "itemstable" #f)))