249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
|
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
|
-
+
-
-
-
-
+
-
-
|
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
(when do-sync
(if do-sync ;; save meta data about the running of this test
;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
;; section and the runit section? Or add a loop that tries three times with a 1/4 second
;; between tries?
|