Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -28,11 +28,11 @@ ezsteps.scm lock-queue.scm sdb.scm rmt.scm api.scm \ subrun.scm portlogger.scm archive.scm env.scm \ diff-report.scm cgisetup/models/pgdb.scm # module source files -MSRCFILES = +MSRCFILES = adjutant.scm mutils.scm # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ # rmtmod.scm apimod.scm Index: adjutant.scm ================================================================== --- adjutant.scm +++ adjutant.scm @@ -22,12 +22,23 @@ (module adjutant * (import scheme chicken data-structures extras files) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69 - md5 message-digest + md5 message-digest matchable regex srfi-1) -(define (adjutant-run) - (print "Running the adjutant!")) +(define (adjutant-run host-type rmt:no-sync-take-job) + (print "Running the adjutant!") + (let loop ((wait-count 0)) + (if (< wait-count 10) ;; 6 x 10 seconds = one minute + (let* ((dat (rmt:no-sync-take-job host-type))) + (match dat + ((id ht vars exekey cmdline state event-time last-update) + (system cmdline) + (loop 0)) + (else + (thread-sleep! 10) + (loop (+ wait-count 1))))) + (print "I'm bored. Exiting.")))) ) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -3187,18 +3187,19 @@ ;; # maxjobrate => max jobs per second ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general -;; xor/%/n 4C16G +;; xor/%/n 2/2G/arm ;; % nbgeneral ;; ;; [jobtools] ;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. ;; flexi-launcher yes ;; launcher nbfake ;; mode adjutant|normal (default is normal) +;; ;; ;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant) ;; (define (common:get-launcher configdat testname itempath mode) (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2145,25 +2145,25 @@ (if (not db-exists) (begin (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") (sqlite3:execute db "PRAGMA journal_mode=WAL;"))) ;; MOVE THIS TABLE CREATION TO THE (begin above in about six months (it is Sep 2020 right now). - (sqlite3:execute db "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, state TEXT, event_time INTEGER, last_update INTEGER);") + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, cmdline TEXT, state TEXT, event_time INTEGER, last_update INTEGER);") ;; not sure I'll use this next one. I prefer if tests simply append to a file: ;; last-update-seconds cpuload tmpspace rundirspace (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_extra_data (id INTEGER PRIMARY KEY, run_id INTEGER, test_id INTEGER, last_seen_running INTEGER);") (sqlite3:execute db "PRAGMA synchronous = 0;") db)) -(define (db:no-sync-add-job db-in host-type vars-list exekey) - (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,state,event_time,last_update) VALUES (?,?,?,?,?,?);" - host-type vars-list exekey "waiting" (current-seconds)(current-seconds))) +(define (db:no-sync-add-job db-in host-type vars-list exekey cmdline) + (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,cmdline,state,event_time,last_update) VALUES (?,?,?,?,?,?,?);" + host-type vars-list exekey cmdline "waiting" (current-seconds)(current-seconds))) ;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match (define (db:no-sync-take-job db-in host-type) (let* ((db (db:no-sync-db db-in)) - (stmt1 "SELECT id,host_type,vars,exekey,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;") + (stmt1 "SELECT id,host_type,vars,exekey,cmdline,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;") (stmt1h (sqlite3:prepare db stmt1)) (stmt2 "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;") (stmt2h (sqlite3:prepare db stmt2)) (res (sqlite3:with-transaction db Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1355,12 +1355,20 @@ (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) - +(defstruct launch:ajt + (vars '()) + (exekey #f) + (host-type #f) + (test-sig #f) + (cmdline #f)) +;; append vars +(define (launch:ajt-add-vars dat vars) + (launch:ajt-vars-set! dat (append (launch:ajt-vars dat) vars))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host @@ -1370,31 +1378,38 @@ (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* (;; locking code removed from here commented out and pasted at end of file (item-path (item-list->path itemdat)) (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour"))) - ;; launch-mode will be 'adjutant or 'normal - (launch-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal")))) + ;; launcher-mode will be 'adjutant or 'normal + (launcher-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal"))) + (ajtdat (make-launch:ajt))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) - (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) - (append - (list - (list "MT_RUN_AREA_HOME" *toppath*) - (list "MT_TEST_NAME" test-name) - (list "MT_RUNNAME" runname) - (list "MT_ITEMPATH" item-path) - (list "MT_CONTOUR" contour) - ) - itemdat)) + (let ((var-list (append + (list + (list "MT_RUN_AREA_HOME" *toppath*) + (list "MT_TEST_NAME" test-name) + (list "MT_RUNNAME" runname) + (list "MT_ITEMPATH" item-path) + (list "MT_CONTOUR" contour) + ) + itemdat))) + ;; consolidate this code with the code in megatest.scm for + ;; "-execute", *maybe* - the longer they are set the longer + ;; each launch takes (must be non-overlapping with the vars) + (alist->env-vars var-list) + ;; the var-list into the ajtdat adjutant record whether it is needed or not. + (launch:ajt-add-vars ajtdat var-list)) + (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") @@ -1432,10 +1447,19 @@ (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '()) (if (configf:lookup *configdat* "misc" "profilesw") (list (configf:lookup *configdat* "misc" "profilesw")) '())))) + ;; save the test-sig in the ajtdat record + (launch:ajt-test-sig-set! ajtdat test-sig) + ;; go ahead and figure out if we have a host-type from the + ;; launcher call above and save it in the ajtdat record + (if (and (eq? launcher-mode 'adjutant) + (list? launcher) + (> (length launcher) 1)) + (launch:ajt-host-type-set! ajtdat (car launcher))) + ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) ;; (if launcher (set! launcher (string-split launcher))) ;; yuk! @@ -1493,64 +1517,92 @@ (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) + ;; save the cmdparms in the ajtdat + (launch:ajt-exekey-set! ajtdat cmdparms) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir + + ;; save the command line for adjutant mode (might never be needed but best to assemble it here) + (launch:ajt-cmdline-set! ajtdat (string-intersperse + (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) (cond - ;; ((and launcher hosts) ;; must be using ssh hostname - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) (else (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) - ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) + (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. - (let* ((commonprevvals (alist->env-vars - (hash-table-ref/default *configdat* "env-override" '()))) - (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" - (append (list (list "MT_TEST_RUN_DIR" work-area) - (list "MT_TEST_NAME" test-name) - (list "MT_ITEM_INFO" (conc itemdat)) - (list "MT_RUNNAME" runname) - (list "MT_TARGET" mt_target) - (list "MT_ITEMPATH" item-path) - ) - itemdat))) - (testprevvals (alist->env-vars - (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) + (let* ((env-override-vars (hash-table-ref/default *configdat* "env-override" '())) + (commonprevvals (alist->env-vars env-override-vars)) + (misc-vars (append (list (list "MT_TEST_RUN_DIR" work-area) + (list "MT_TEST_NAME" test-name) + (list "MT_ITEM_INFO" (conc itemdat)) + (list "MT_RUNNAME" runname) + (list "MT_TARGET" mt_target) + (list "MT_ITEMPATH" item-path)) + itemdat)) + (miscprevvals (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute" + (test-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '())) + (testprevvals (alist->env-vars test-vars)) + ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) - (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. - process:cmd-run-with-stderr-and-exitcode->list - process-run) - (if useshell - (let ((cmdstr (string-intersperse fullcmd " "))) - (if launchwait - cmdstr - (conc cmdstr " >> mt_launch.log 2>&1 &"))) - (car fullcmd)) - (if useshell - '() - (cdr fullcmd)))) + ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. + (launch-results-prev (if (eq? launcher-mode 'adjutant) + '(0 #t) ;; just some fake data to fool downstream but non-applicable code + (apply (if launchwait + process:cmd-run-with-stderr-and-exitcode->list + process-run) + (if useshell + (let ((cmdstr (string-intersperse fullcmd " "))) + (if launchwait + cmdstr + (conc cmdstr " >> mt_launch.log 2>&1 &"))) + (car fullcmd)) + (if useshell + '() + (cdr fullcmd))))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) - (if (not success) + + (launch:ajt-add-vars ajtdat env-override-vars) + (launch:ajt-add-vars ajtdat misc-vars) + (launch:ajt-add-vars ajtdat test-vars) + + ;; if in adjutant mode we register the job in the jobs_queue + ;; then fire off an adjutant runner + ;; + (if (eq? launcher-mode 'adjutant) + (let* ((adjutant-runner-cmd (append (cdr launcher) + (list remote-megatest "-adjutant" + (launch:ajt-host-type ajtdat) + "-start-dir" *toppath*))) + (adj-cmd (conc (string-intersperse (map conc adjutant-runner-cmd) " ") + "&"))) + (rmt:no-sync-add-job + (launch:ajt-host-type ajtdat) + (conc (launch:ajt-vars ajtdat)) + (launch:ajt-exekey ajtdat) + (launch:ajt-cmdline ajtdat)) + (print "adj-cmd: " adj-cmd) + (system adj-cmd) + )) + + (if (not success) (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) - (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () @@ -1572,10 +1624,14 @@ (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) + ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. + ;; the unlock previously was further up. This seemed wrong as we should not proceed until the + ;; vars have been reset. + (mutex-unlock! *launch-setup-mutex*) launch-results)) (change-directory *toppath*) (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -39,10 +39,14 @@ (declare (uses mt)) (declare (uses api)) (declare (uses tasks)) ;; only used for debugging. (declare (uses env)) (declare (uses diff-report)) + +(declare (uses adjutant)) +(import adjutant) + ;; (declare (uses ftail)) ;; (import ftail) (define *db* #f) ;; this is only for the repl, do not use in general!!!! @@ -51,11 +55,11 @@ (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:) - readline apropos json http-client directory-utils typed-records + readline apropos json http-client directory-utils typed-records matchable http-client srfi-18 extras format) ;; Added for csv stuff - will be removed ;; (use sparse-vectors) @@ -199,11 +203,11 @@ -update-meta : update the tests metadata for all tests -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are overwritten by values set in config files. -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname - -adjutant C,M : start the server/adjutant with allocated cores C and Mem M (Gig), + -adjutant host-type : start the server/adjutant with given host-type use 0,0 to auto use full machine -transport http|rpc : use http or rpc for transport (default is http) -log logfile : send stdout and stderr to logfile -list-servers : list the servers -kill-servers : kill all servers @@ -912,12 +916,25 @@ ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; (if (args:get-arg "-adjutant") - (begin - (adjutant-run) + (let* ((host-type (args:get-arg "-adjutant"))) + (launch:setup) ;; dang it, wish this wasn't needed + (print "Running the adjutant!") + (let loop ((wait-count 0)) + (if (< wait-count 10) ;; 6 x 10 seconds = one minute + (let* ((dat (rmt:no-sync-take-job host-type))) + (match dat + ((id ht vars exekey cmdline state event-time last-update) + (system cmdline) + (loop 0)) + (else + (thread-sleep! 10) + (loop (+ wait-count 1))))) + (print "I'm bored. Exiting."))) + ;; (adjutant-run (args:get-arg "-ajutant") rmt:no-sync-take-job) (set! *didsomething* #t))) (if (or (args:get-arg "-list-servers") (args:get-arg "-kill-servers")) (let ((tl (launch:setup))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -928,12 +928,12 @@ (rmt:send-receive 'no-sync-del! #f `(,var))) (define (rmt:no-sync-get-lock keyname) (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) -(define (rmt:no-sync-add-job host-type vars-list exekey) - (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey))) +(define (rmt:no-sync-add-job host-type vars-list exekey cmdline) + (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey ,cmdline))) (define (rmt:no-sync-take-job host-type) (rmt:send-receive 'no-sync-take-job #f `(,host-type))) (define (rmt:no-sync-job-records-clean)