Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -415,10 +415,12 @@ (mutex-unlock! *heartbeat-mutex*) ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout) ;; ;; no_traffic, no running tests, if server 0, no running servers + ;; + ;; (let ((wait-on-running (configf:lookup *configdat* "server" "wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) ;; (if (and *server-run* (or (> (+ last-access server-timeout) (current-seconds)) (and (eq? run-id 0) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -91,18 +91,18 @@ fulln runscript))))) ;; assume it is on the path (rollup-status 0)) (change-directory top-path) - - - - ;; ADD here - is test already RUNNING? If so ---- ABORT RUN ATTEMPT - - - - + ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, + ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* + ;; + (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) + (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) + (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") + (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed"))) + (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup-for-run force: #t)) @@ -173,12 +173,10 @@ ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) (tests:set-full-meta-info #f test-id run-id 0 work-area 10) - ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) - (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (thread-sleep! 0.3) ;; NFS slowness has caused grief here (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (not (file-execute-access? fullrunscript))) @@ -524,11 +522,11 @@ ;; ;; All log file links should be stored relative to the top of link path ;; ;; - [ - ] ;; -(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat) +(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat #!key (remtries 2)) (let* ((item-path (item-list->path itemdat)) (runname (db:get-value-by-header (db:get-rows run-info) (db:get-header run-info) "runname")) ;; convert back to db: from rdb: - this is always run at server end @@ -546,13 +544,14 @@ ;; ensure this exists first as links to subtests must be created there (linktree (let ((rd (config-lookup *configdat* "setup" "linktree"))) (if rd rd (conc *toppath* "/runs")))) - (lnkbase (conc linktree "/" target "/" runname)) - (lnkpath (conc lnkbase "/" testname)) - (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path))) + (lnkbase (conc linktree "/" target "/" runname)) + (lnkpath (conc lnkbase "/" testname)) + (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) + (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path) @@ -631,11 +630,11 @@ (hash-table-set! *toptest-paths* testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test - (let ((lnktarget (conc lnkpath "/" item-path))) + (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 "Setting up sub test run area") (debug:print 2 " - creating run area in " test-path) (handle-exceptions exn (begin @@ -653,10 +652,13 @@ (debug:print 0 "ERROR: Failed to re-create link " linktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) + (if (not (directory? test-path)) + (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes + (if (directory? test-path) (begin (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd"))) (if cmd ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH @@ -669,11 +671,16 @@ " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) (status (system cmd))) (if (not (eq? status 0)) (debug:print 2 "ERROR: problem with running \"" cmd "\""))) (list lnkpathf lnkpath )) - (list #f #f)))) + (if (> remtries 0) + (begin + (debug:print 0 "ERROR: Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) + ;; + (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) + (list #f #f))))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host @@ -741,10 +748,15 @@ (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory #f testinfo #t))) ;; remove data only, do not perturb the record + + ;; prevent overlapping actions - set to LAUNCHED as early as possible + ;; + (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) + (rmt:roll-up-pass-fail-counts run-id test-name item-path "LAUNCHED") (set! diskpath (get-best-disk *configdat*)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) @@ -774,15 +786,14 @@ (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) + ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) (change-directory work-area) ;; so that log files from the launch process don't clutter the test dir - (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) - (rmt:roll-up-pass-fail-counts run-id test-name item-path "LAUNCHED") (cond ((and launcher hosts) ;; must be using ssh hostname (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) (launcher Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1070,11 +1070,11 @@ ;; (debug:print 0 "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) - (rmt:find-and-mark-incomplete run-id))) + (rmt:find-and-mark-incomplete run-id #f))) (if (not (eq? num-running prev-num-running)) (debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds))))) (thread-sleep! 15) ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) @@ -1505,11 +1505,12 @@ (define (runs:remove-test-directory db test remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f))) - (if (not remove-data-only) + (if remove-data-only + (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. @@ -1539,11 +1540,12 @@ (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) )) ;; Only delete the records *after* removing the directory. If things fail we have a record - (if (not remove-data-only) + (if remove-data-only + (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f) (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test))))) ;;====================================================================== ;; Routines for manipulating runs ;;======================================================================