Megatest

Diff
Login

Differences From Artifact [98c156694e]:

To Artifact [b95b62c5c1]:


42
43
44
45
46
47
48








49












50
51
52
53
54
55
56
(declare (uses subrunmod))
(declare (uses archivemod))
(declare (uses fsmod))

(use srfi-69)

(module runsmod








	*













(import scheme)
(cond-expand
 (chicken-4
  
  (import chicken
	  ports







>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>







42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
(declare (uses subrunmod))
(declare (uses archivemod))
(declare (uses fsmod))

(use srfi-69)

(module runsmod
	(
	 runs:get-mt-env-alist
	 setup-env-defaults
	 runs:clean-cache
	 rmt:find-and-mark-incomplete
	 launch:setup
	 launch:end-of-run-check
	 launch:test-copy

	 set-item-env-vars
	 runs:set-megatest-env-vars
	 full-runconfigs-read
	 runs:operate-on

	 runs:update-all-test_meta
	 runs:handle-locking
	 ;; runs:rollup-run ;; not ported
	 runs:run-tests
	 runs:remove-all-but-last-n-runs-per-target
	 general-run-call
	 )

(import scheme)
(cond-expand
 (chicken-4
  
  (import chicken
	  ports
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
	tasksmod
	testsmod
	subrunmod
	archivemod
	fsmod
	)

(include "common_records.scm")
;; (include "key_records.scm")
;; (include "db_records.scm")
(include "run_records.scm")
;; (include "test_records.scm")

;; use this struct to facilitate refactoring
;;







|







145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
	tasksmod
	testsmod
	subrunmod
	archivemod
	fsmod
	)

;; (include "common_records.scm")
;; (include "key_records.scm")
;; (include "db_records.scm")
(include "run_records.scm")
;; (include "test_records.scm")

;; use this struct to facilitate refactoring
;;
1186
1187
1188
1189
1190
1191
1192
1193
1194

1195

1196
1197
1198
1199
1200
1201
1202
	 (lambda ()
	   ;; jobtools maxload is useful for where the full Megatest run is done on one machine
	   (if (and (not (rmt:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   (if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload

					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))

    
 
    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))








|
|
>
|
>







1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
	 (lambda ()
	   ;; jobtools maxload is useful for where the full Megatest run is done on one machine
	   (if (and (not (rmt:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   ;; (if maxhomehostload
	   ;;     (common:wait-for-homehost-load
	   ;; 	maxhomehostload
	   ;; 	(conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
	   )))
    
 
    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))

4536
4537
4538
4539
4540
4541
4542
4543



4544





















































































		(if (member (symbol->string archive-command) '("save-remove"))
                    (begin 
                     (debug:print-info 0 *default-log-port* "remove testdat")
		    (runs:remove-test-directory test-dat 'archive-remove)))))
	    (hash-table-ref test-groups test-base)))))
       (hash-table-keys disk-groups))
    #t))




)





























































































>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
		(if (member (symbol->string archive-command) '("save-remove"))
                    (begin 
                     (debug:print-info 0 *default-log-port* "remove testdat")
		    (runs:remove-test-directory test-dat 'archive-remove)))))
	    (hash-table-ref test-groups test-base)))))
       (hash-table-keys disk-groups))
    #t))

;;======================================================================
;; Maintenance
;;======================================================================

(define (rmt:find-and-mark-incomplete run-id ovr-deadtime)
  (let* ((cfg-deadtime             (configf:lookup-number *configdat* "setup" "deadtime"))
	 (test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period")))
   (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period)
   ;;call end of eud of run detection for posthook
   (launch:end-of-run-check run-id)))

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
;;
;; NOT EASY TO MIGRATE TO db{file,mod}
;;
(define (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
         (deadtime-trim (or ovr-deadtime cfg-deadtime))
         (server-start-allowance 200)
         (server-overloaded-budget 200)
         (launch-monitor-off-time (or test-stats-update-period 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period)

    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (let* ((dat (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime)))
      (set! oldlaunched (list-ref dat 1))
      (set! toplevels   (list-ref dat 2))
      (set! incompleted (list-ref dat 0)))

    (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
		      (length toplevels) " old LAUNCHED toplevel tests and "
		      (length incompleted) " tests marked RUNNING but apparently dead.")
  
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;
    ;; (db:delay-if-busy dbdat)
    (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
	   (all-ids             (append min-incompleted-ids (map car oldlaunched))))
      (if (> (length all-ids) 0)
	  (begin
	    ;; (launch:is-test-alive "localhost" 435)
	    (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			 " as DEAD")
	    (for-each
             (lambda (test-id)
               (let* ((tinfo   (rmt:get-test-info-by-id run-id test-id))
		      (run-dir (db:test-get-rundir     tinfo))
		      (host    (db:test-get-host       tinfo))
		      (pid     (db:test-get-process_id tinfo))
		      (result (rmt:get-status-from-final-status-file run-dir)))
		 (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
		     (begin
		       (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
		       (rmt:set-state-status-and-roll-up-items
			run-id test-id 'foo "COMPLETED" "PASS"
			"Test stopped responding but it has PASSED; marking it PASS in the DB."))
		     (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					  (commonmod:is-test-alive host pid))))
		       (if is-alive
			   (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					" has a process on pid " pid ", NOT setting to DEAD.")
			   (begin
			     (debug:print 0 *default-log-port* "INFO: test " test-id
					  " final state/status is not COMPLETED/PASS. It is " result)
			     (rmt:set-state-status-and-roll-up-items
			      run-id test-id 'foo "COMPLETED" "DEAD"
			      "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
	     ;; call end of eud of run detection for posthook - from merge, is it needed?
	     ;; (launch:end-of-run-check run-id)
	     all-ids)
	    )))))



)