ADDED utils/wip/mtest-dbstop.scm Index: utils/wip/mtest-dbstop.scm ================================================================== --- /dev/null +++ utils/wip/mtest-dbstop.scm @@ -0,0 +1,12 @@ +#!/p/foundry/env/pkgs/chicken/4.10.0_ext/bin/csi -s + +(use chicken) +(use data-structures) + + +(include "/nfs/site/home/bjbarcla/bin2/mtest-repair-lib.scm") +(glib-color-mode 1) + +(set! *this-cmd* "/nfs/site/home/bjbarcla/bin2/mtest-dbstop.scm") +(kill-in-db) + ADDED utils/wip/mtest-diag.scm Index: utils/wip/mtest-diag.scm ================================================================== --- /dev/null +++ utils/wip/mtest-diag.scm @@ -0,0 +1,165 @@ +#!/p/foundry/env/pkgs/chicken/4.10.0_ext/bin/csi -s + +(use chicken) +(use data-structures) + + +(include "/nfs/site/home/bjbarcla/bin2/mtest-repair-lib.scm") +(glib-color-mode 1) + +;;; check mtver in xterm +(let ((mt-ver (do-or-die "megatest -version"))) + (when (member mt-ver '("1.6309-738c" "1.6029")) + (iwarn "This xterm has an older version of megatest.") + (ierr "Please load latest megatest version to proceed.") + (print "eg.: source ../scripts/newrel-setup.csh 1.63/11b") + (exit 3))) + + +;;;; kill netbatch jobs from this megatest +;; TODO! + + +(define *diag* #t) +;;(define *user* (get-environment-variable "USER")) +(define *user* (do-or-die "ls -ld . | awk '{print $3}'")) +(print "user="*user*) +;;;; delete .homehost .homehost.config +;;;; if not on homehost, ssh homehost, cd here, killall mtest dboard +(if (not *diag*) + (when (file-exists? ".homehost.config") + (delete-db ".homehost.config"))) + +(when (file-exists? ".homehost") + (let* ((homehost (with-input-from-file ".homehost" (lambda () (read))))) + (let* ((homehostname (do-or-die "host `cat .homehost` | sed 's/.$//' | awk '{print $NF}' | awk -F. '{print $1}'")) + (thishostname (get-environment-variable "HOST"))) + (when (not (equal? homehostname thishostname)) + (let* ((this-exe-compiled (car (argv))) + (this-exe "/nfs/site/home/bjbarcla/bin2/mtest-diag.scm") + (cmd (conc "ssh "homehostname" 'cd "(get-environment-variable "PWD")" && "this-exe"'"))) + (iwarn "Running on the homehost -- "homehostname) + ;;(iwarn "eg: % ssh "homehostname" 'cd "(get-environment-variable "PWD")" && "(car (argv))"'") + (print "cmd="cmd) + ;;(inote "sleeping for 5 seconds. hit ctrl-c now to run on homehost or wait to proceed.") + (system cmd) + (exit 0)))))) + + + + +;;;; kill megatests and dashboards in this area +(define (kill-mtest-dboard) + (if *diag* + #f + (let* ((this-toppath (pid->cwd (current-process-id))) + (tmppath (toppath->tmppath this-toppath)) + (config (let ((res (conc this-toppath "/megatest.config"))) + (when (not (file-exists? res)) + (ierr "This is not a megatest run area; "res" does not exist. Aborting.") + (exit 2)) + res)) + (mtest-procs (get-my-mtest-procs)) + (dashboard-procs (get-my-dashboard-procs)) + (all-pids (map proc-PID (append mtest-procs dashboard-procs))) + (our-pids (filter (lambda (pid) + (equal? (pid->cwd pid) this-toppath)) + all-pids))) + + (if (null? our-pids) + (inote "No mtest or dboard processes on this host in in this runarea.") + (begin + (iwarn "Killing all megatest and dashboard processes on this host.") + (gracefully-kill-pids our-pids))) + ))) + +(kill-mtest-dboard) + + +;;;; delete /tmp/.$USER-portlogger.db +(let ((plfile (conc "/tmp/."*user* "-portlogger.db"))) + + (if (safe-file-exists? plfile) + (if *diag* + (print "plfile exists - "plfile) + (begin + (inote "removing portlogger file") + (system (conc "rm "plfile)))))) + + +;;;; move logs dir aside +(when (not *diag*) + (system (conc "mv logs logs-aside-`date +%s`")) + (system "mkdir logs")) + + +;;;; fixes for dependency diagram +(when (not *diag*) + (inote "Removing dep graph tmp files if they exist") + (system (conc "rm /tmp/."*user*"-*.dot")) + + ;;#ln -s /p/fdk/gwa/$USER/fossil/ext/_ext ext + (let* ((toppath (pid->cwd (current-process-id))) + (flow (car (string-split + (car (reverse (string-split toppath "/"))) + "."))) + (extdir (conc "/p/fdk/gwa/"*user* + "/fossil/ext/"flow"_ext"))) + (when (and (safe-file-exists? extdir) + (not (safe-file-exists? "ext"))) + (inote "Linking in ext dir") + (system (conc "ln -s "extdir" ext"))))) + + +;;;; check for 0 byte megatest{,_ref}.db in tmp. delete them +;;;; check for wal-mode megatest{,_ref}.db in tmp. delete them +(define (repair-dbs) + (let* ((this-toppath (pid->cwd (current-process-id))) + (tmppath (toppath->tmppath this-toppath)) + (golden-mtest-file (conc this-toppath "/megatest.db")) + (golden-mtest-file-ok (check-db "megatest.db")) + (tmp-mtest-file (conc tmppath "/megatest.db")) + (tmp-mtestref-file (conc tmppath "/megatest_ref.db")) + (tmp-mtest-file-ok (check-db tmp-mtest-file)) + (tmp-mtestref-file-ok (check-db tmp-mtestref-file)) + ) +;;;; check for megatest{,_ref}.db in tmp that die on .schema. delete them + (when (safe-file-exists? tmppath) + (if tmp-mtest-file-ok + (inote "tmp megatest db file ok") + (if *diag* + (print "diag: tmp megatest db broken - "tmp-mtest-file) + (delete-db tmp-mtest-file))) + (if tmp-mtestref-file-ok + (inote "tmp megatestref db file ok") + (if *diag* + (print "diag: tmpref megatest db broken - "tmp-mtestref-file) + (delete-db tmp-mtestref-file)))) + +;;;; check for megatest.db + (if golden-mtest-file-ok + (inote "golden megatest db file ok") + (if (not (file-exists? golden-mtest-file)) + (inote "megatest.db not present. Continuing.") + (begin + ;;;; if golden megatest db is broken, stop now! + (ierr "Golden megatest.db is broken. Please delete it or replace it from a backup version in .snapshot. If critical, contact env team to assist.") + (sendmail "bjbarcla" "!!Bad golden megatest.db" this-toppath) + (inote "Backups in .snapshot:") + (system "ls -l .snapshot/*/megatest.db") + (ierr "Not proceeding with any more checks.") + (exit 3)))) + + + + )) + +(repair-dbs) + + + + + + + + ADDED utils/wip/mtest-nbstop.scm Index: utils/wip/mtest-nbstop.scm ================================================================== --- /dev/null +++ utils/wip/mtest-nbstop.scm @@ -0,0 +1,34 @@ +#!/p/foundry/env/pkgs/chicken/4.10.0_ext/bin/csi -s + +(use chicken) +(use data-structures) + + +(include "/nfs/site/home/bjbarcla/bin2/mtest-repair-lib.scm") +(glib-color-mode 1) + +(set! *this-cmd* "/nfs/site/home/bjbarcla/bin2/mtest-nbstop.scm") + +(inote "Killing local mtest/dboard in this run area.") +(kill-mtest-dboard) + +;;;; move logs dir aside +(inote "move logs") +(system (conc "mv logs logs-aside-`date +%s`")) +(system "mkdir logs") + + + +(inote "Killing netbatch mtest jobs launched from this run area.") +(let ((jobcount (kill-mtest-jobs-in-netbatch))) + (when (> jobcount 0) + (inote "Marking in-flight tests killed in db") + (when (db-islocked? "megatest.db") + (iwarn "Unlocking megatest.db") + (db-unlock "megaetest.db")) + (kill-in-db))) + +(inote "Final reaping of mtest/dboard") +(kill-mtest-dboard) + + ADDED utils/wip/mtest-reaper.scm Index: utils/wip/mtest-reaper.scm ================================================================== --- /dev/null +++ utils/wip/mtest-reaper.scm @@ -0,0 +1,142 @@ +#!/p/foundry/env/pkgs/chicken/4.10.0_ext/bin/csi -s + +(use general-lib) +(use typed-records) +(use regex-literals) +(use regex) +(use sql-de-lite) + +(defstruct proc + (USER "") + (PID -1) + (%CPU -1.0) + (%MEM -1.0) + (VSZ -1) + (RSS -1) + (TTY "") + (STAT "") + (START "") + (TIME "") + (COMMAND "")) + +(define (linux-get-process-info-records) + (let* ((raw (do-or-die "/bin/ps auwx")) + (all-lines (string-split raw "\n")) + (lines (cdr all-lines)) ;; skip title lines + (re #/^(\S+)\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/)) + (filter + proc? + (map + (lambda (line) + (let ((match (string-match re line))) + (if match + (make-proc + USER: (list-ref match 1) + PID: (string->number (list-ref match 2)) + %CPU: (string->number (list-ref match 3)) + %MEM: (string->number (list-ref match 4)) + VSZ: (string->number (list-ref match 5)) + RSS: (string->number (list-ref match 6)) + TTY: (string->number (list-ref match 7)) + STAT: (list-ref match 8) + START: (list-ref match 9) + TIME: (list-ref match 10) + COMMAND: (list-ref match 11)) + #f))) + lines)))) + +(define (get-my-mtest-server-procs) + (let* ((procs (linux-get-process-info-records)) + (my-mtest-procs + (filter + (lambda (a-proc) + (and + (equal? (get-environment-variable "USER") (proc-USER a-proc)) + (string-match #/^.*\/mtest\s+.*-server.*/ (proc-COMMAND a-proc)))) + procs))) + my-mtest-procs)) + + +(define (pid->environ-hash pid) + (let* ((envfile (conc "/proc/"pid"/environ")) + (ht (make-hash-table)) + (rawdata (with-input-from-file envfile read-string)) + (lines (string-split rawdata (make-string 1 #\nul )))) + (for-each + (lambda (line) + (let ((match (string-match #/(^[^=]+)=(.*)/ line))) + (if match + (hash-table-set! ht (list-ref match 1) (list-ref match 2))))) + lines) + ht)) + +(define (pid->cwd pid) + (read-symbolic-link (conc "/proc/"pid"/cwd"))) + +(define (pid->mtest-monitor-db-file pid) + (let* ((env (pid->environ-hash pid)) + (ltdir (hash-table-ref/default env "MT_LINKTREE" #f)) + (radir (hash-table-ref/default env "MT_RUN_AREA_HOME" #f)) + (cwd (pid->cwd pid))) + (let ((res + (cond + (ltdir (conc ltdir "/.db/monitor.db")) + (radir (conc + (do-or-die + (conc "megatest -start-dir "radir" -show-config -section setup -var linktree")) + "/.db/monitor.db")) + (cwd (conc + (do-or-die + (conc "megatest -start-dir "cwd" -show-config -section setup -var linktree")) + "/.db/monitor.db")) + + (else #f)))) + res))) + + + + + +(define (get-mdb-status mdb-file pid) + ;; select state from servers where pid='4465'; + + (cond + ((not (string? mdb-file)) (conc "mdb-file could not be determined for pid " pid ">>"mdb-file )) + ((not (file-exists? mdb-file)) (conc "mdb-file does not exist for pid "pid" : "mdb-file)) + (else + (let ((dbh (open-database mdb-file))) + + (set-busy-handler! dbh 10000) + (let* ((sql-str "select state from servers where pid=?;") + (stm (sql dbh sql-str)) + (alists (query fetch-alists stm (->string pid)))) + (if (null? alists) + "server pid not in monitor.db" + (cdr (car (car alists))))))))) + + +(define (mtest-server-pid->status pid) + (let* ((mdb-file (pid->mtest-monitor-db-file pid))) + (if mdb-file + (get-mdb-status mdb-file pid) + "no monitor.db file could be found" + ))) + + +(define (kill pid) + (print "KILL "pid) + (do-or-die (conc "kill -9 "pid))) + +(define (reap-defunct-mtest-server-pid pid) + (let ((status (mtest-server-pid->status pid))) + (print pid"->"(mtest-server-pid->status pid)) + (if (member status (list "running" "dbprep" "available" "collision")) + (print "pid="pid" in status "status" -- not killing") + (kill pid)))) + +(let* ((procs (get-my-mtest-server-procs)) + (pids (map proc-PID procs)) + ) + + (for-each reap-defunct-mtest-server-pid pids)) + ADDED utils/wip/mtest-repair-lib.scm Index: utils/wip/mtest-repair-lib.scm ================================================================== --- /dev/null +++ utils/wip/mtest-repair-lib.scm @@ -0,0 +1,508 @@ +(use general-lib) +(use typed-records) +(use regex-literals) +(use regex) +(use sql-de-lite) +(use posix) +(use files) +(use s11n) +(use ports) +(use z3) +(use base64) +(use matchable) + + + + +(define (cli-arg arg #!key (default #f) (is-list #f)) + (let* ((temp (skim-cmdline-opts-withargs-by-regex arg))) + (if (> (length temp) 0) + (if is-list + temp + (car temp)) + default))) + +(define (cli-switch arg) + (let ((temp (skim-cmdline-opts-noarg-by-regex arg))) + (if (> (length temp) 0) + (car temp) + #f))) + + + +(defstruct nbjob + pool + jobid + owner + mtver + user + status + cmdline + execute) + +(define (cmdline->execute cmdline) + (let* ((match (string-match ".*-execute\\s+(\\S+)" cmdline))) + (if match + (with-input-from-string (z3:decode-buffer (base64-decode (cadr match))) read) + #f))) + + +(define (nbjob-execute-ref nbjob key #!key (default #f)) + (let ((execute (nbjob-execute nbjob))) + (if (list? execute) + (let* ((match (alist-ref key execute))) + (if match + (if (list? match) (car match) match) + default)) + default))) + +(define (nbjob-process pool nbstatus-line) + (let ((toks (string-split nbstatus-line ","))) + (if (eq? 4 (length toks)) + (if (equal? (list-ref toks 1) "Jobid") + #f + (begin + (let ((res + (make-nbjob + pool: pool + status: (list-ref toks 0) + jobid: (list-ref toks 1) + user: (list-ref toks 2) + cmdline: (list-ref toks 3) + execute: (cmdline->execute (list-ref toks 3)) + ))) + res))) + #f))) + +(define (get-mtest-nb-jobs user nbpools #!key (cmdline-filter "megatest")) + (let* ((res + (apply append + (map (lambda (pool) + (let* (;;(user-filter ".*") + (user-filter user) + (cmd + (conc "nbstatus jobs --tar "pool" --fields status,jobid,user,cmdline --format csv " + "'USER=~\""user-filter + "\"&&cmdline=~\""cmdline-filter"\"'")) + (res (do-or-die cmd))) + (filter nbjob? + (map (lambda (line) + (nbjob-process pool line)) + (string-split res "\n"))))) + nbpools)))) + res)) + +;;(define foo (get-mtest-nb-jobs "bjbarcla" '("pdx_normal" "pdx_critical"))) + +(define (cmdline->execute cmdline) + (let* ((match (string-match ".*-execute\\s+(\\S+)" cmdline))) + (if match + (with-input-from-string (z3:decode-buffer (base64-decode (cadr match))) read) + #f))) + +;;;; kill jobs in netbatch for this area +(define (kill-mtest-jobs-in-netbatch) + (let ((pwd (get-environment-variable "PWD")) + (jobs (get-mtest-nb-jobs (get-environment-variable "USER") '("pdx_normal" "pdx_critical") ))) + + (for-each + (lambda (job) + (let* ((jobid (nbjob-jobid job)) + (pool (nbjob-pool job)) + (status (nbjob-status job)) + (cmd (conc "nbjob --target "pool" remove "jobid))) + ;;(print status) + (print cmd) + (system cmd))) + ;(pp (nbjob->alist job)) + (filter + (lambda (job) + (equal? (nbjob-execute-ref job 'toppath) pwd)) + jobs)) + (length jobs) + + )) + + +;;;; kill megatest jobs in running in netbatch +(define (kill-in-db #!key (megatest_exe "megatest")) + (let* ((all-targ-patt (do-or-die "sqlite3 megatest.db \"select id from keys\" | tr \"\\n1234567890\" \"/%%%%%%%%%%\" | sed 's/\\/$//'")) + ) + (for-each (lambda (state) + (let* ((cmd (conc megatest_exe " -state "state" -set-state-status KILLED,n/a -testpatt % -target "all-targ-patt" -runname %"))) + (print cmd) + (system cmd))) + '("REMOTEHOSTSTART" "LAUNCHED" "RUNNING" "KEEP_TRYING" "PREQ_FAIL")))) + + + +;;;; kill megatests and dashboards in this area running on this host +(define (kill-mtest-dboard) + + (let* ((this-toppath (pid->cwd (current-process-id))) + (tmppath (toppath->tmppath this-toppath)) + (config (let ((res (conc this-toppath "/megatest.config"))) + (when (not (file-exists? res)) + (ierr "This is not a megatest run area; "res" does not exist. Aborting.") + (exit 2)) + res)) + (mtest-procs (get-my-mtest-procs)) + (dashboard-procs (get-my-dashboard-procs)) + (all-pids (map proc-PID (append mtest-procs dashboard-procs))) + (our-pids (filter (lambda (pid) + ;;(print (pid-COMMAND pid)) + (and + (equal? (pid->cwd pid) this-toppath) + + )) + all-pids))) + + (if (null? our-pids) + (inote "No mtest or dboard processes on this host in in this runarea.") + (begin + (iwarn "Killing all megatest and dashboard processes on this host.") + (gracefully-kill-pids our-pids))) + )) + + + +(define (db-mt-version dbpath) + (let* ((cmd (conc "sqlite3 "dbpath" 'select val from metadat where var=\"MEGATEST_VERSION\"'")) + (res (do-or-die cmd))) + res)) + +;; TODO +(define (db-islocked? dbpath) + (let-values (((ec so se) (isys (conc "sqlite3 "dbpath" vacuum")))) + (let* ((message se) + (is-locked (string-match "^.*database is locked.*$" message))) + (inote "dbfile - "dbpath "; message - "message) + is-locked))) + +(define (db-unlock dbpath) + (system (conc "/nfs/site/bjbarcla/bin/unlock_db.sh " dbpath)) + + ;; (let* ((temp-dbpath (conc "/tmp/"(get-environment-variable "USER")"-"(current-process-id)".db"))) + ;; (inote "Unlocking "dbpath) + ;; (do-or-die (conc "cp "dbpath" "temp-dbpath)) + ;; (do-or-die (conc "rm -f "dbpath)) + ;; (let* ((cmd (conc "sqlite3 "temp-dbpath" .dump | sqlite3 "dbpath))) + ;; (inote "Running: "cmd) + ;; (system cmd)) + ;; ;;(do-or-die "sqlite3 "temp-dbpath" .dump | sqlite3 "dbpath) + ;; (if (db-islocked? dbpath) + ;; (begin + ;; (ierr "Could not unlock "dbpath) + ;; (exit 5)) + ;; (inote "Unlocked "dbpath)) + ;; #t) + + ) + + +(define *user* (do-or-die "ls -ld . | awk '{print $3}'")) + +(define (false-on-exception thunk) + (handle-exceptions exn #f (thunk) )) + +(define (safe-file-exists? path-string) + (false-on-exception (lambda () (file-exists? path-string)))) + +(defstruct proc + (USER "") + (PID -1) + (%CPU -1.0) + (%MEM -1.0) + (VSZ -1) + (RSS -1) + (TTY "") + (STAT "") + (START "") + (TIME "") + (COMMAND "")) + +(define (toppath->tmppath toppath) + (let* ((user *user*) + (area (car (string-split + (car (reverse (string-split toppath "/"))) + "."))) + (dotified-path (string-substitute "/" "." toppath "all"))) + (conc "/tmp/" user "/megatest_localdb/" area "/" dotified-path))) + + +(define (delete-db dbfile) + (let* ((db-files (glob (conc dbfile "*")))) + (for-each + (lambda (file) + (inote "delete file " file) + (if (delete-file* file) + (inote "Removed file - " file) + (iwarn "Could Not Remove file - " file)) + ) + db-files))) + +(define (check-db dbfile) + (let* ((has-wal (safe-file-exists? (conc dbfile "-wal"))) + (has-shm (safe-file-exists? (conc dbfile "-shm"))) + (has-journal (safe-file-exists? (conc dbfile "-journal"))) + (has-db (safe-file-exists? dbfile)) + (ok-flag #t)) + (when has-journal + (iwarn "Journal exists - "(conc dbfile "-journal")) + ) + (when has-wal + (set! ok-flag #f) + (iwarn "Wal-mode db exists: "(conc dbfile "-wal"))) + (if (not has-db) + (begin + (inote "db does not exists " dbfile) + (set! ok-flag #f)) + (let* ((db-size (file-size dbfile))) + (inote "db size = " db-size " -- " dbfile) + (when (member db-size (list 0 1024)) + (iwarn "db has bad size - "db-size" -- "dbfile) + (set! ok-flag #f)))) + ok-flag)) + + +(define (linux-get-process-info-records) + (let* ((raw (do-or-die "/bin/ps auwx")) + (all-lines (string-split raw "\n")) + (lines (cdr all-lines)) ;; skip title lines + (re (regexp "^(\\S+)\\s+(\\d+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(\\S+)\\s+(.*)$"))) + (filter + proc? + (map + (lambda (line) + (let ((match (string-match re line))) + (if match + (make-proc + USER: (list-ref match 1) + PID: (string->number (list-ref match 2)) + %CPU: (string->number (list-ref match 3)) + %MEM: (string->number (list-ref match 4)) + VSZ: (string->number (list-ref match 5)) + RSS: (string->number (list-ref match 6)) + TTY: (string->number (list-ref match 7)) + STAT: (list-ref match 8) + START: (list-ref match 9) + TIME: (list-ref match 10) + COMMAND: (list-ref match 11)) + #f))) + lines)))) + +(define (get-my-mtest-server-procs) + (let* ((procs (linux-get-process-info-records)) + (my-mtest-procs + (filter + (lambda (a-proc) + (and + (equal? *user* (proc-USER a-proc)) + (string-match "^.*/mtest\\s+.*-server.*" (proc-COMMAND a-proc)))) + procs))) + my-mtest-procs)) + +(define (get-my-mtest-procs) + (let* ((procs (linux-get-process-info-records)) + (my-mtest-procs + (filter + (lambda (a-proc) + (and + (equal? *user* (proc-USER a-proc)) + (string-match "^.*/m(ega)?test .*" (proc-COMMAND a-proc)) + (not (string-match "^.*/mtest-repair.*" (proc-COMMAND a-proc))))) + procs))) + my-mtest-procs)) + +(define (get-my-dashboard-procs) + (let* ((procs (linux-get-process-info-records)) + (my-dboard-procs + (filter + (lambda (a-proc) + (and + (equal? *user* (proc-USER a-proc)) + (string-match "^.*/dboard.*" (proc-COMMAND a-proc)))) + procs))) + my-dboard-procs)) + + +(define (pid->environ-hash pid) + (let* ((envfile (conc "/proc/"pid"/environ")) + (ht (make-hash-table)) + (rawdata (with-input-from-file envfile read-string)) + (lines (string-split rawdata (make-string 1 #\nul )))) + (for-each + (lambda (line) + (let ((match (string-match "(^[^=]+)=(.*)" line))) + (if match + (hash-table-set! ht (list-ref match 1) (list-ref match 2))))) + lines) + ht)) + +(define (pid->cwd pid) + (read-symbolic-link (conc "/proc/"pid"/cwd"))) + +(define (pid->mtest-monitor-db-file pid #!key (megatest_exe "megatest")) + (let* ((env (pid->environ-hash pid)) + (ltdir (hash-table-ref/default env "MT_LINKTREE" #f)) + (radir (hash-table-ref/default env "MT_RUN_AREA_HOME" #f)) + (cwd (pid->cwd pid))) + (let ((res + (cond + (ltdir (conc ltdir "/.db/monitor.db")) + (radir (conc + (do-or-die + (conc megatest_exe " -start-dir "radir" -show-config -section setup -var linktree")) + "/.db/monitor.db")) + (cwd (conc + (do-or-die + (conc megatest_exe " -start-dir "cwd" -show-config -section setup -var linktree")) + "/.db/monitor.db")) + + (else #f)))) + res))) + + + + + +(define (get-mdb-status mdb-file pid) + ;; select state from servers where pid='4465'; + + (cond + ((not (string? mdb-file)) (conc "mdb-file could not be determined for pid " pid ">>"mdb-file )) + ((not (safe-file-exists? mdb-file)) (conc "mdb-file does not exist for pid "pid" : "mdb-file)) + (else + (let ((dbh (open-database mdb-file))) + + (set-busy-handler! dbh 10000) + (let* ((sql-str "select state from servers where pid=?;") + (stm (sql dbh sql-str)) + (alists (query fetch-alists stm (->string pid)))) + (if (null? alists) + "server pid not in monitor.db" + (cdr (car (car alists))))))))) + + +(define (mtest-server-pid->status pid) + (let* ((mdb-file (pid->mtest-monitor-db-file pid))) + (if mdb-file + (get-mdb-status mdb-file pid) + "no monitor.db file could be found" + ))) + + +(define (gracefully-kill-pids pids) + (for-each (lambda (pid) + (print "kill "pid) + (system (conc "kill "pid))) + pids) + (sleep 5) + (let* ((procs-left (linux-get-process-info-records)) + (pids-left (map proc-PID procs-left))) + (for-each (lambda (pid) + (when (member pid pids-left) + (print "kill -9"pid) + (system (conc "kill -9 "pid)))) + pids))) + + + +(define (kill pid) + (print "KILL "pid) + (do-or-die (conc "kill -9 "pid))) + +(define (reap-defunct-mtest-server-pid pid) + (let ((status (mtest-server-pid->status pid))) + (print pid"->"(mtest-server-pid->status pid)) + (if (member status (list "running" "dbprep" "available" "collision")) + (print "pid="pid" in status "status" -- not killing") + (kill pid)))) + +;; (let* ((procs (get-my-mtest-server-procs)) +;; (pids (map proc-PID procs)) +;; ) + +;; (for-each reap-defunct-mtest-server-pid pids)) + + + +(define (mtdbver->mtrelver mtdbver) + (let* ((table-alist '( + ("1.5402" . "1.54/02") + ("1.5406" . "1.54/05") + ("1.5408" . "1.54/07") + ("1.5409" . "1.54/09") + ("1.5412" . "1.54/12") + ("1.5413" . "1.54/13") + ("1.5414" . "1.54/14") + ("1.5415" . "1.54/15") + ("1.5416" . "1.54/16") + ("1.5417" . "1.54/17") + ("1.5418" . "1.54/18") + ("1.5419" . "1.54/19") + ("1.5421" . "1.54/21") + ("1.5411" . "1.54/support-for-skip") + ("1.5522" . "1.55/22") + ("1.5523" . "1.55/23") + ("1.5524" . "1.55/24") + ("1.5525" . "1.55/25") + ("1.6001" . "1.60/01") + ("1.6002" . "1.60/02") + ("1.6003" . "1.60/03") + ("1.6004" . "1.60/04") + ("1.6005" . "1.60/05") + ("1.6006" . "1.60/06") + ("1.6007" . "1.60/07") + ("1.6008" . "1.60/08") + ("1.6009" . "1.60/09") + ("1.6009" . "1.60/11") + ("1.6012" . "1.60/12") + ("1.6013" . "1.60/13") + ("1.6014" . "1.60/14") + ("1.6015" . "1.60/15") + ("1.6016" . "1.60/16") + ("1.6017" . "1.60/17") + ("1.6018" . "1.60/18") + ("1.6019" . "1.60/19") + ("1.6021" . "1.60/21") + ("1.6022" . "1.60/22") + ("1.6023" . "1.60/23") + ("1.6024" . "1.60/24") + ("1.6025" . "1.60/25") + ("1.6026" . "1.60/26") + ("1.6027" . "1.60/27") + ("1.6028" . "1.60/28") + ;;("1.6029" . "1.60/29") + ("1.6029" . "1.60/29a") + ("1.6031" . "1.60/31") + ("1.6101" . "1.61/01") + ("1.6101" . "1.61/01a") + ("1.6102-c2ba" . "1.61/02") + ("1.6103-3e88" . "1.61/03") + ("1.6104-ee53" . "1.61/04") + ("1.6105-232b" . "1.61/05") + ("1.6201-e652" . "1.62/01") + ("1.6204-c74d" . "1.62/04") + ("1.6205-aff0" . "1.62/05") + ("1.6207-6f59" . "1.62/07") + ("1.6301-fbf0" . "1.63/01") + ("1.6302-da4a" . "1.63/02") + ("1.6303-aa5f" . "1.63/03") + ("1.6304-fa49" . "1.63/04") + ("1.6305-a03b" . "1.63/05") + ("1.6306-7a12" . "1.63/06") + ("1.6307-fb5d" . "1.63/07") + ("1.6308-35e0" . "1.63/08") + ("1.6309-738c" . "1.63/09") + ("1.6309-880c" . "1.63/09a") + ("1.6309-b566" . "1.63/09b") + ("1.6311-fb43" . "1.63/11") + ("1.6311-fb43" . "1.63/11b") + ("1.6311-8a6c" . "1.63/11b") + ("1.6402-03c5" . "1.64/02") + ) + ) + (res (alist-ref mtdbver table-alist equal?))) + res)) + ADDED utils/wip/mtest-repair.scm Index: utils/wip/mtest-repair.scm ================================================================== --- /dev/null +++ utils/wip/mtest-repair.scm @@ -0,0 +1,139 @@ +#!/p/foundry/env/pkgs/chicken/4.10.0_ext/bin/csi -s + +(use chicken) +(use data-structures) + + +(include "/nfs/site/home/bjbarcla/bin2/mtest-repair-lib.scm") +(glib-color-mode 1) + +;;(define this-cmd (car (argv))) +(define this-cmd "/nfs/site/home/bjbarcla/bin2/mtest-repair.scm") + +;;; check mtver in xterm + +;; note - 11b is 1.6311-fb43 +(let ((mt-ver (do-or-die "megatest -version"))) + (when (member mt-ver '("1.6309-738c" "1.6029" "1.6309-b566")) + (iwarn "This xterm has an older version of megatest.") + (ierr "Please load latest megatest version to proceed.") + (print "eg.: source ../scripts/newrel-setup.csh 1.63/11b") + (exit 3))) + + +;;;; kill netbatch jobs from this megatest +;;(kill-mtest-dboard) +;;(system "/nfs/site/home/bjbarcla/bin2/mtest-nbstop.scm") + + +;;;; delete .homehost .homehost.config +;;;; if not on homehost, ssh homehost, cd here, killall mtest dboard +(when (file-exists? ".homehost.config") + (delete-db ".homehost.config")) + +(when (file-exists? ".homehost") + (let* ((homehost (with-input-from-file ".homehost" (lambda () (read))))) + (let* ((homehostname (do-or-die "host `cat .homehost` | sed 's/.$//' | awk '{print $NF}' | awk -F. '{print $1}'")) + (thishostname (get-environment-variable "HOST"))) + (when (not (equal? homehostname thishostname)) + (iwarn "Please also run this on the homehost -- "homehostname) + + (iwarn "eg: % ssh "homehostname" 'cd "(get-environment-variable "PWD")" && "this-cmd"'") + (print "") + (inote "sleeping for 5 seconds. hit ctrl-c now to run on homehost or wait to proceed.") + (sleep 5))))) + + + + + +(kill-mtest-dboard) + + +;;;; delete /tmp/.$USER-portlogger.db +(let ((plfile (conc "/tmp/."(get-environment-variable "USER") "-portlogger.db"))) + (when (safe-file-exists? plfile) + (inote "removing portlogger file") + (system (conc "rm "plfile)))) + + +;;;; move logs dir aside +(system (conc "mv logs logs-aside-`date +%s`")) +(system "mkdir logs") + +;;;; fixes for dependency diagram +(inote "Removing dep graph tmp files if they exist") +(system (conc "rm /tmp/."(get-environment-variable "USER")"-*.dot")) + +;;#ln -s /p/fdk/gwa/$USER/fossil/ext/_ext ext +(let* ((toppath (pid->cwd (current-process-id))) + (flow (car (string-split + (car (reverse (string-split toppath "/"))) + "."))) + (extdir (conc "/p/fdk/gwa/"(get-environment-variable "USER") + "/fossil/ext/"flow"_ext"))) + (when (and (safe-file-exists? extdir) + (not (safe-file-exists? "ext"))) + (inote "Linking in ext dir") + (system (conc "ln -s "extdir" ext")))) + + +;;;; check for 0 byte megatest{,_ref}.db in tmp. delete them +;;;; check for wal-mode megatest{,_ref}.db in tmp. delete them +(define (repair-dbs) + (let* ((this-toppath (pid->cwd (current-process-id))) + (tmppath (toppath->tmppath this-toppath)) + (golden-mtest-file (conc this-toppath "/megatest.db")) + (golden-mtest-file-ok (check-db "megatest.db")) + (tmp-mtest-file (conc tmppath "/megatest.db")) + (tmp-mtestref-file (conc tmppath "/megatest_ref.db")) + (tmp-mtest-file-ok (check-db tmp-mtest-file)) + (tmp-mtestref-file-ok (check-db tmp-mtestref-file)) + (alldbs (list tmp-mtest-file tmp-mtestref-file golden-mtest-file)) + ) +;;;; check for megatest{,_ref}.db in tmp that die on .schema. delete them + (when (safe-file-exists? tmppath) + (if tmp-mtest-file-ok + (inote "tmp megatest db file ok") + (delete-db tmp-mtest-file)) + (if tmp-mtestref-file-ok + (inote "tmp megatestref db file ok") + (delete-db tmp-mtestref-file))) + + ;;;;; check for locked dbs + (for-each (lambda (dbfile) + (let* ((locked (db-islocked? dbfile))) + (if (db-islocked? dbfile) + (begin + (iwarn "db locked - "dbfile) + (db-unlock dbfile)) + (inote "db not locked - "dbfile)))) + alldbs) + +;;;; check for megatest.db + (if golden-mtest-file-ok + (inote "golden megatest db file ok") + (if (not (file-exists? golden-mtest-file)) + (inote "megatest.db not present. Continuing.") + (begin + ;;;; if golden megatest db is broken, stop now! + (ierr "Golden megatest.db is broken. Please delete it or replace it from a backup version in .snapshot. If critical, contact env team to assist.") + (sendmail "bjbarcla" "!!Bad golden megatest.db" this-toppath) + (inote "Backups in .snapshot:") + (system "ls -l .snapshot/*/megatest.db") + (ierr "Not proceeding with any more checks.") + (exit 3)))) + )) + +;; TODO: check for and fix locked megatest.db and locked monitor.db (ritika working on) + + +(repair-dbs) + + + + + + + +