Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1499,11 +1499,11 @@ (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next))) (cond ((and (> first adjload) (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding max of " adjload " " (if msg msg "")) + (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding normalized max of " maxload "(adjusted load: " adjload ") " (if msg msg "")) (thread-sleep! waitdelay) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1))) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) ADDED emergency-patch-3.scm Index: emergency-patch-3.scm ================================================================== --- /dev/null +++ emergency-patch-3.scm @@ -0,0 +1,81 @@ + ;; To build patch: + ;;;;;;;;;;;;;;;;;;;;;;;;; + ;; ldd /p/foundry/env/pkgs/megatest/1.64/19/bin/.11/mtest + ;; linux-vdso.so.1 => (0x00002aaaaaaab000) + ;; libchicken.so.7 => /p/foundry/env/pkgs/megatest/1.64/chicken-4.10.0//lib/libchicken.so.7 (0x00002aaaaaaad000) + ;; libm.so.6 => /lib64/libm.so.6 (0x00002aaaab0a6000) + ;; libdl.so.2 => /lib64/libdl.so.2 (0x00002aaaab31f000) + ;; libc.so.6 => /lib64/libc.so.6 (0x00002aaaab523000) + ;; /lib64/ld-linux-x86-64.so.2 (0x0000555555554000) + ;; + ;; /p/foundry/env/pkgs/megatest/1.64/chicken-4.10.0/bin/csc -s emergency-patch-3.scm + ;; + + + ;; to test patch: + ;;;;;;;;;;;;;;;;;;;;;;;;; + ;; in .megatestrc, add: + ;; (if (and (> megatest-version 1.64) + ;; (< megatest-version 1.6421)) + ;; (begin + ;; (load "/p/foundry/env/pkgs/megatest/1.64/19/share/epatch-1.so") + ;; (load "/p/foundry/env/pkgs/megatest/1.64/19/share/epatch-2.so"))) + ;; + + + ;; to productize patch: + ;;;;;;;;;;;;;;;;;;;;;;;;; + ;; +(use directory-utils regex) + +(include "common_records.scm") +(include "key_records.scm") +(include "db_records.scm") +(include "run_records.scm") +(include "test_records.scm") + +;; Given a run id start a server process ### NOTE ### > file 2>&1 +;; if the run-id is zero and the target-host is set +;; try running on that host +;; incidental: rotate logs in logs/ dir. +;; +(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area + (let* ((curr-host (get-host-name)) + ;; (attempt-in-progress (server:start-attempted? areapath)) + ;; (dot-server-url (server:check-if-running areapath)) + (curr-ip (server:get-best-guess-address curr-host)) + (curr-pid (current-process-id)) + (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + (target-host (car homehost)) + (testsuite (common:get-testsuite-name)) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc (common:get-megatest-exe) + " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") + ;; " -log " logfile + " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) + (load-limit (configf:lookup-number *configdat* "server" "load-limit" default: 0.9))) + ;; we want the remote server to start in *toppath* so push there + (push-directory areapath) + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal? curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" logfile) + (common:wait-for-normalized-load load-limit " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -73,11 +73,12 @@ -version : print megatest version (currently " megatest-version ") Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt - Optionally use :state and :status + Optionally use :state and :status, use -keep-records to remove only + the run data. -set-state-status X,Y : set state to X and status to Y, requires controls per -remove-runs -rerun FAIL,WARN... : force re-run for tests with specificed status(s) -rerun-clean : set all tests not COMPLETED+PASS,WARN,WAIVED to NOT_STARTED,n/a and then run the specified testpatt with -preclean -rerun-all : set all tests to NOT_STARTED,n/a and run with -preclean @@ -338,10 +339,11 @@ "-test-paths" ;; get path(s) to a test, ordered by youngest first "-runall" ;; run all tests, respects -testpatt, defaults to % "-run" ;; alias for -runall "-remove-runs" + "-keep-records" ;; use with -remove-runs to remove only the run data "-rebuild-db" "-cleanup-db" "-rollup" "-update-meta" "-create-megatest-area" @@ -961,11 +963,11 @@ ;; Remove old run(s) ;;====================================================================== ;; since several actions can be specified on the command line the removal ;; is done first -(define (operate-on action) +(define (operate-on action #!key (mode #f)) ;; #f is "use default" (let* ((runrec (runs:runrec-make-record)) (target (common:args-get-target))) (cond ((not target) (debug:print-error 0 *default-log-port* "Missing required parameter for " action ", you must specify -target or -reqtarg") @@ -990,19 +992,22 @@ target (common:args-get-runname) ;; (or (args:get-arg "-runname")(args:get-arg ":runname")) (common:args-get-testpatt #f) ;; (args:get-arg "-testpatt") state: (common:args-get-state) status: (common:args-get-status) - new-state-status: (args:get-arg "-set-state-status")))) + new-state-status: (args:get-arg "-set-state-status") + mode: mode))) (set! *didsomething* #t))))) (if (args:get-arg "-remove-runs") (general-run-call "-remove-runs" "remove runs" (lambda (target runname keys keyvals) - (operate-on 'remove-runs)))) + (operate-on 'remove-runs mode: (if (args:get-arg "-keep-records") + 'remove-data-only + 'remove-all))))) (if (args:get-arg "-set-state-status") (general-run-call "-set-state-status" "set state and status" @@ -2157,15 +2162,14 @@ ;; 'new2old ) (set! *didsomething* #t))) (if (args:get-arg "-sync-to-megatest.db") - (begin - (db:multi-db-sync - (db:setup #f) - 'new2old - ) + (let ((res (db:multi-db-sync + (db:setup #f) + 'new2old))) + (print "Synced " res " records to megatest.db") (set! *didsomething* #t))) (if (args:get-arg "-sync-to") (let ((toppath (launch:setup))) (tasks:sync-to-postgres *configdat* (args:get-arg "-sync-to")) ADDED nexttag.rb Index: nexttag.rb ================================================================== --- /dev/null +++ nexttag.rb @@ -0,0 +1,46 @@ +#!/usr/bin/env ruby + + +def get_next_tag(branch) + + + + abort "Not on a version branch like v1.64 (got: >#{branch}<)" unless branch.match(/^v\d\.\d\d$/) + + #puts "this branch: #{branch}" + + tag_pat = /#{branch}(\d\d)/ + remote=`fsl remote`.chomp.sub(/^file:\/\//,'') # get tagset from origin + cmd="fossil tag -R '#{remote}' list" + tags = `#{cmd}`.split /\n/ + abort "fossil command failed [#{cmd}]" if $? != 0 + branch_tags = tags.find_all{|x| x.match(tag_pat) }.sort + if branch_tags.length == 0 + return branch + "01" + else + latest_tag = branch_tags.last + m1 = latest_tag.match(tag_pat) + minor_digits = m1[1].to_i + 1 + if (minor_digits % 10) == 0 + minor_digits += 1 + end + new_tag=sprintf("%s%02d", branch, minor_digits) + return new_tag + end +end + +branch = `fossil branch`.sub(/\A.*\* /m,'').sub(/\n.*\z/m,'') +tag= get_next_tag(branch) + +puts "TODO: Write to megatest-version.scm:" +puts ";; Always use two or four digit decimal +;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. + +(declare (unit megatest-version)) + +(define megatest-version #{tag.sub(/^v/,'')}) + +" + +puts "TODO: fossil tag add #{tag} #{branch}" +puts "" Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -241,11 +241,15 @@ (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) - (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time + (if (and (vector? conninfo) (> 5 (vector-length conninfo))) + (http-transport:server-dat-update-last-access conninfo) ;; refresh access time + (begin + (set! conninfo #f) + (remote-conndat-set! runremote #f))) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1748,11 +1748,11 @@ ;; 'remove-runs ;; 'set-state-status ;; ;; NB// should pass in keys? ;; -(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '())) +(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode #f)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) ;; (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) @@ -1760,11 +1760,12 @@ (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) (statuses (if status (string-split status ",") '())) (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))) (rp-mutex (make-mutex)) - (bup-mutex (make-mutex))) + (bup-mutex (make-mutex)) + (keep-records (args:get-arg "-keep-records"))) ;; used in conjunction with -remove-runs to keep the records, TODO: consolidate this with "mode". (let* ((write-access-actions '(remove-runs set-state-status archive run-wait)) (dbfile (conc *toppath* "/megatest.db")) (readonly-mode (not (file-write-access? dbfile)))) (when (and readonly-mode @@ -1936,13 +1937,15 @@ (let* ((dparts (string-split lasttpath "/")) (runpath (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/")))) (debug:print 1 *default-log-port* "Removing run: " runkey " " (db:get-value-by-header run header "runname") " and related record") - (rmt:delete-run run-id) - (rmt:delete-old-deleted-test-records) - ;; (rmt:set-var "DELETED_TESTS" (current-seconds)) + (if (not keep-records) + (begin + (rmt:delete-run run-id) + (rmt:delete-old-deleted-test-records))) + ;; (rmt:set-var "DELETED_TESTS" (current-seconds)) ;; need to figure out the path to the run dir and remove it if empty ;; (if (null? (glob (conc runpath "/*"))) ;; (begin ;; (debug:print 1 *default-log-port* "Removing run dir " runpath) ;; (system (conc "rmdir -p " runpath)))) @@ -1956,12 +1959,13 @@ (define (runs:remove-test-directory test mode) ;; remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (common:file-exists? run-dir) ;; (resolve-pathname run-dir) (common:nice-path run-dir) - #f))) - (case mode + #f)) + (clean-mode (or mode 'remove-all))) + (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir @@ -1993,12 +1997,12 @@ (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 *default-log-port* "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 *default-log-port* "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) )) ;; Only delete the records *after* removing the directory. If things fail we have a record - (case mode - ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f)) + (case clean-mode + ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))))) ;;====================================================================== ;; Routines for manipulating runs Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -116,11 +116,12 @@ " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) + (load-limit (configf:lookup-number *configdat* "server" "load-limit" default: 0.9))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) @@ -133,11 +134,11 @@ (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (common:wait-for-normalized-load load-limit " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) @@ -474,10 +475,11 @@ (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (no-sync-db (db:open-no-sync-db)) + (sync-duration 0) ;; run time of the sync in milliseconds (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls (debug:print-info 2 *default-log-port* "Periodic sync thread started.") (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) @@ -503,18 +505,43 @@ ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync - (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive - (if (> res 0) ;; some records were transferred, keep the db alive - (begin - (mutex-lock! *heartbeat-mutex*) - (set! *db-last-access* (current-seconds)) - (mutex-unlock! *heartbeat-mutex*) - (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) - (debug:print-info 2 *default-log-port* "sync called but zero records transferred")))) + (let ((sync-start (current-milliseconds))) + (if (< sync-duration 300) + (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive + (set! sync-duration (- (current-milliseconds) sync-start)) + (if (> res 0) ;; some records were transferred, keep the db alive + (begin + (mutex-lock! *heartbeat-mutex*) + (set! *db-last-access* (current-seconds)) + (mutex-unlock! *heartbeat-mutex*) + (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) + (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))) + ;; TODO: factor this next routine out into a function + (with-input-from-pipe ;; this should not block other threads but need to verify this + "megatest -sync-to-megatest.db" + (lambda () + (let loop ((inl (read-line)) + (res #f)) + (if (eof-object? inl) + (begin + (set! sync-duration (- (current-milliseconds) sync-start)) + (cond + ((not res) + (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) + ((> res 0) + (mutex-lock! *heartbeat-mutex*) + (set! *db-last-access* (current-seconds)) + (mutex-unlock! *heartbeat-mutex*)))) + (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) + (if matches + (string->number (cadr matches)) + #f)))) + (loop (read-line) + (or num-synced res)))))))))) (if will-sync (begin (mutex-lock! *db-multi-sync-mutex*) (set! *db-sync-in-progress* #f) (set! *db-last-sync* start-time)