Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -251,17 +251,20 @@ ;; NO SYNC DB ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) + ((no-sync-add-job) (apply db:no-sync-add-job *no-sync-db* params)) + ((no-sync-take-job) (apply db:no-sync-take-job *no-sync-db* params)) + ((no-sync-job-records-clean) (apply db:no-sync-job-records-clean *no-sync-db* params)) ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) - + ;;====================================================================== ;; READ ONLY QUERIES ;;====================================================================== ;; KEYS Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -3171,13 +3171,17 @@ ;; arm cubie01 cubie02 ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] +;; C/M/A lets megatest know this launcher provides C cores, M bytes memory for architecture A +;; 2/2G/arm smart -cores 2 -memory 2G -arch arm ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; NOTE: host-rules is ONLY used for MTLOWESTLOAD ;; ;; [host-rules] ;; # maxnload => max normalized load ;; # maxnjobs => max jobs per cpu ;; # maxjobrate => max jobs per second @@ -3190,12 +3194,15 @@ ;; ;; [jobtools] ;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. ;; flexi-launcher yes ;; launcher nbfake +;; mode adjutant|normal (default is normal) +;; +;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant) ;; -(define (common:get-launcher configdat testname itempath) +(define (common:get-launcher configdat testname itempath mode) (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) (if (null? launchers) @@ -3202,32 +3209,36 @@ fallback-launcher (let loop ((hed (car launchers)) (tal (cdr launchers))) (let ((patt (car hed)) (host-type (cadr hed))) - (if (tests:match patt testname itempath) + (if (tests:match patt testname itempath) ;; have a launcher match for this test (begin (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) + (let ((launcher (configf:lookup configdat "host-types" host-type))) ;; find the actual launcher from the host-types table + ;; if we are in adjutant mode then we want to return both host-type and launcher (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) (count 100)) (if targ-host (conc "remrun " targ-host) (if (> count 0) + (begin (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) (thread-sleep! (- 101 count)) (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) (- count 1))) (begin (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) (exit))))) - launcher)) + (case mode + ((adjutant) (list host-type launcher)) + (else launcher)))) (begin (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) (if (null? tal) fallback-launcher (loop (car tal)(cdr tal))))))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2142,14 +2142,62 @@ (db-exists (common:file-exists? dbname)) (db (sqlite3:open-database dbname))) (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) (if (not db-exists) (begin - (sqlite3:execute db "PRAGMA synchronous = 0;") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") (sqlite3:execute db "PRAGMA journal_mode=WAL;"))) + ;; MOVE THIS TABLE CREATION TO THE (begin above in about six months (it is Sep 2020 right now). + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, state TEXT, event_time INTEGER, last_update INTEGER);") + ;; not sure I'll use this next one. I prefer if tests simply append to a file: + ;; last-update-seconds cpuload tmpspace rundirspace + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_extra_data (id INTEGER PRIMARY KEY, run_id INTEGER, test_id INTEGER, last_seen_running INTEGER);") + (sqlite3:execute db "PRAGMA synchronous = 0;") db)) + +(define (db:no-sync-add-job db-in host-type vars-list exekey) + (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,state,event_time,last_update) VALUES (?,?,?,?,?,?);" + host-type vars-list exekey "waiting" (current-seconds)(current-seconds))) + +;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match +(define (db:no-sync-take-job db-in host-type) + (let* ((db (db:no-sync-db db-in)) + (stmt1 "SELECT id,host_type,vars,exekey,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;") + (stmt1h (sqlite3:prepare db stmt1)) + (stmt2 "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;") + (stmt2h (sqlite3:prepare db stmt2)) + (res (sqlite3:with-transaction + db + (lambda () + (let* ((matching-jobs (sqlite3:fold-row + (lambda (res . row) ;; id host-type vars exekey state event-time last-update) + (cons row res)) + '() + stmt1h + host-type))) + (if (null? matching-jobs) + #f + (let ((choosen-one (let loop ((tal matching-jobs) + (res #f)) ;; put bestest one in here + (if (null? tal) + res + (let ((curr (car tal)) + (rem (cdr tal))) + curr) ;; here we will compare with res, if better candidate the loop with curr else loop with res + )))) + (if choosen-one ;; we need to mark it as taken + (sqlite3:execute stmt2h (current-seconds) (car choosen-one))) + choosen-one))))))) + (sqlite3:finalize! stmt1h) ;; it'd be nice to cache these and finalize on exit. + (sqlite3:finalize! stmt2h) + res)) + +;; clean out old jobs in queue, i.e. taken and event_time > 24 hrs ago +;; +(define (db:no-sync-job-records-clean db) + (sqlite3:execute (db:no-sync-db db) "DELETE FROM jobs_queue WHERE state='taken' AND event_time < ?;" (- (current-seconds)(* 24 3600)))) + ;; if we are not a server create a db handle. this is not finalized ;; so watch for problems. I'm still not clear if it is needed to manually ;; finalize sqlite3 dbs with the sqlite3 egg. ;; @@ -2167,12 +2215,13 @@ (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) (define (db:no-sync-del! db var) (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var)) -(define (db:no-sync-get/default db var default) - (let ((res default)) +(define (db:no-sync-get/default db-in var default) + (let ((db (db:no-sync-db db-in)) + (res default)) (sqlite3:for-each-row (lambda (val) (set! res val)) (db:no-sync-db db) "SELECT val FROM no_sync_metadat WHERE var=?;" Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1367,25 +1367,15 @@ ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex - (let* ( ;; (lock-key (conc "test-" test-id)) - ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) - ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds - ;; (if (car lock) - ;; #t - ;; (if (> (current-seconds) expire-time) - ;; (begin - ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) - ;; (rmt:no-sync-del! lock-key) ;; destroy the lock - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; - ;; (begin - ;; (thread-sleep! 1) - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + (let* (;; locking code removed from here commented out and pasted at end of file (item-path (item-list->path itemdat)) - (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + ;; launch-mode will be 'adjutant or 'normal + (launch-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal")))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. @@ -1422,26 +1412,17 @@ ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup *configdat* "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup *configdat* "setup" "runtimelim"))) - ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to - ;; allow running from dashboard. Extract the path - ;; from the called megatest and convert dashboard - ;; or dboard to megatest (local-megatest (common:find-local-megatest)) - #;(local-megatest (let* ((lm (car (argv))) - (dir (pathname-directory lm)) - (exe (pathname-strip-directory lm))) - (conc (if dir (conc dir "/") "") - (case (string->symbol exe) - ((dboard) "../megatest") - ((mtest) "../megatest") - ((dashboard) "megatest") - (else exe))))) - (launcher (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools" "launcher")) - (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (launcher (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode))) + (if (string? l) + (string-split l) + l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher) + ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) @@ -1455,11 +1436,11 @@ '())))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) - (if launcher (set! launcher (string-split launcher))) + ;; (if launcher (set! launcher (string-split launcher))) ;; yuk! ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) @@ -1518,11 +1499,11 @@ ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir - (cond + (cond ;; ((and launcher hosts) ;; must be using ssh hostname ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) @@ -1620,5 +1601,21 @@ ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) + + + ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -927,10 +927,19 @@ (define (rmt:no-sync-del! var) (rmt:send-receive 'no-sync-del! #f `(,var))) (define (rmt:no-sync-get-lock keyname) (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) + +(define (rmt:no-sync-add-job host-type vars-list exekey) + (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey))) + +(define (rmt:no-sync-take-job host-type) + (rmt:send-receive 'no-sync-take-job #f `(,host-type))) + +(define (rmt:no-sync-job-records-clean) + (rmt:set-receive 'no-sync-job-records-clean #f '())) ;;====================================================================== ;; A R C H I V E S ;;======================================================================