Megatest

Check-in [89fedf98b8]
Login
Overview
Comment: Got further. Noticed a race condition when not stepping one test at a time by setting the launcher. Noticed that xor does not handle PREQ_FAIL.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64-keep-running-fix
Files: files | file ages | folders
SHA1: 89fedf98b8bfc7bc81a065e180a50a47631218ca
User & Date: bjbarcla on 2017-09-29 17:56:31
Other Links: branch diff | manifest | tags
Context
2017-10-02
17:57
Upgraded mt: calls to rmt: calls in runs.scm for status setting; fleshed out the statuses that qualify as killed and in-progress in db:prereqs-not-met.

.. Dead end. Learnings were applied to the v1.64-itemflow2 branch. Closed-Leaf check-in: 6baf47004a user: bjbarcla tags: v1.64-keep-running-fix

2017-09-29
17:56
Got further. Noticed a race condition when not stepping one test at a time by setting the launcher. Noticed that xor does not handle PREQ_FAIL. check-in: 89fedf98b8 user: bjbarcla tags: v1.64-keep-running-fix
2017-09-28
17:58
Updated: itemwait now continues forward, but runs do not stop. It is progress.

Problems: (1) the toplevel goes to COMPLETED when not all items have started but the items started so far are completed; (2) NOT_STARTED/PREQ_FAIL propagates NOT_STARTED/n/a (which in turn propagates NOT_STARTED/PREQ_FAIL). check-in: f7fdbdc305 user: bjbarcla tags: v1.64-keep-running-fix

Changes

Modified db.scm from [f4290acd99] to [66475d1bf4].

1579
1580
1581
1582
1583
1584
1585
1586

1587
1588
1589
1590
1591
1592
1593
1579
1580
1581
1582
1583
1584
1585

1586
1587
1588
1589
1590
1591
1592
1593







-
+







;;
;; (define (db:launch-delay-left dbstruct run-id launch-delay)
  

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCED'));
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test
	 (deadtime     (if (and deadtime-str

Modified launch.scm from [0c1e4ef13e] to [d90ac4eb29].

1295
1296
1297
1298
1299
1300
1301

1302
1303
1304
1305
1306
1307
1308
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309







+







;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
;;      (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
  (BB> "entered launch-test")
  (let* ( ;; (lock-key        (conc "test-" test-id))
	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
	;; 		    (if (car lock)
	;; 			#t
	;; 			(if (> (current-seconds) expire-time)
	;; 			    (begin
1329
1330
1331
1332
1333
1334
1335

1336
1337
1338
1339
1340
1341
1342
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344







+







       (list "MT_RUN_AREA_HOME" *toppath*)
       (list "MT_TEST_NAME" test-name)
       (list "MT_RUNNAME"   runname)
       (list "MT_ITEMPATH"  item-path)
       (list "MT_CONTOUR"   contour)
       )
      itemdat))
    (BB> "set env vars")
    (let* ((tregistry       (tests:get-all)) ;; third param (below) is system-allowed
           ;; for tconfig, why do we allow fallback to test-conf?
	   (tconfig         (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t)
				(begin
                                  (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.")
                                  test-conf))) ;; force re-read now that all vars are set
	   (useshell        (let ((ush (config-lookup *configdat* "jobtools"     "useshell")))
1374
1375
1376
1377
1378
1379
1380

1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392


1393
1394
1395
1396
1397
1398
1399

1400
1401
1402
1403
1404
1405
1406
1407
1408

1409
1410
1411
1412
1413
1414
1415
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394

1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421







+











-
+
+







+









+







	   (cmdparms   #f)
	   (fullcmd    #f) ;; (define a (with-output-to-string (lambda ()(write x))))
	   (mt-bindir-path #f)
	   (testinfo   (rmt:get-test-info-by-id run-id test-id))
	   (mt_target  (string-intersperse (map cadr keyvals) "/"))
	   (debug-param (append (if (args:get-arg "-debug")  (list "-debug" (args:get-arg "-debug")) '())
				(if (args:get-arg "-logging")(list "-logging") '()))))
      (BB> "entered let 1")
      ;; (if hosts (set! hosts (string-split hosts)))
      ;; set the megatest to be called on the remote host
      (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
      (set! mt-bindir-path (pathname-directory remote-megatest))
      (if launcher (set! launcher (string-split launcher)))
      ;; set up the run work area for this test
      (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
	       (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
	  (begin
	    (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
	    (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record
      

      (BB> "after launcher set")
      ;; prevent overlapping actions - set to LAUNCHED as early as possible
      ;;
      ;; the following call handles waiver propagation. cannot yet condense into roll-up-pass-fail
      (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
      (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)
      ;; (pp (hash-table->alist tconfig))
      (set! diskpath (get-best-disk *configdat* tconfig))
      (BB> "after launch state set")
      (if diskpath
	  (let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
	    (set! work-area (car dat))
	    (set! toptest-work-area (cadr dat))
	    (debug:print-info 2 *default-log-port* "Using work area " work-area))
	  (begin
	    (set! work-area (conc test-path "/tmp_run"))
	    (create-directory work-area #t)
	    (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run")))
      (BB> "after disk path set")
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
1434
1435
1436
1437
1438
1439
1440
1441

1442
1443
1444
1445
1446
1447
1448
1440
1441
1442
1443
1444
1445
1446

1447
1448
1449
1450
1451
1452
1453
1454







-
+







					(list 'target    mt_target)
					(list 'contour   contour)
					(list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
					(list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
					(list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
					(list 'runname   runname)
					(list 'mt-bindir-path mt-bindir-path))))))))
      
      (BB> "after cmdparams set")
      ;; clean out step records from previous run if they exist
      ;; (rmt:delete-test-step-records run-id test-id)
      ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
      (if (common:file-exists? work-area)
	  (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
      (cond
       ;; ((and launcher hosts) ;; must be using ssh hostname
1456
1457
1458
1459
1460
1461
1462

1463

1464
1465
1466
1467
1468
1469
1470
1462
1463
1464
1465
1466
1467
1468
1469

1470
1471
1472
1473
1474
1475
1476
1477







+
-
+







	(set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" ""))))))
      ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))
      (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm"))))
      (debug:print 1 *default-log-port* "Launching " work-area)
      ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the environment back when done
      (debug:print 4 *default-log-port* "fullcmd: " fullcmd)
      (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible.
      (BB> "after set *last-launch*")
      (let* ((commonprevvals (alist->env-vars
      (let* ((commonprevvals (alist->env-vars ;; observed this let can be very slow. (>5 sec)
			      (hash-table-ref/default *configdat* "env-override" '())))
	     (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
			      (append (list (list "MT_TEST_RUN_DIR" work-area)
					    (list "MT_TEST_NAME" test-name)
					    (list "MT_ITEM_INFO" (conc itemdat)) 
					    (list "MT_RUNNAME"   runname)
					    (list "MT_TARGET"    mt_target)
1483
1484
1485
1486
1487
1488
1489

1490
1491
1492
1493
1494
1495
1496
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504







+







					  (if launchwait
					      cmdstr
					      (conc cmdstr " >> mt_launch.log 2>&1 &")))
					(car fullcmd))
				    (if useshell
					'()
					(cdr fullcmd)))))
        (BB> "let depth 2 entered")
        (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
	;; (rmt:no-sync-del! lock-key)         ;; release the lock for starting this test
	(if (not launchwait) ;; give the OS a little time to allow the process to start
	    (thread-sleep! 0.01))
	(with-output-to-file "mt_launch.log"
	  (lambda ()
	    (print "LAUNCHCMD: " (string-intersperse fullcmd " "))

Modified runs.scm from [6846f73920] to [3e262d9c09].

202
203
204
205
206
207
208
209

210
211
212
213
214
215
216
202
203
204
205
206
207
208

209
210
211
212
213
214
215
216







-
+







				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 *default-log-port* "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (debug:print 1 *default-log-port* "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	(list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
	(let ((can-not-run-more (cond
				 ;; if max-concurrent-jobs is set and the number running is greater 
				 ;; than it then cannot run more jobs
				 ((and max-concurrent-jobs (>= num-running max-concurrent-jobs))
711
712
713
714
715
716
717

718
719
720
721
722
723
724
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725







+







                      (or (procedure? items)(eq? items 'have-procedure))))
                  waitons))
         (completed-prereq-items 
          (let ((foo (begin (BB> "hello prereqs: "prereqs) #t))
                (res (filter (lambda (test)
                               (BB> "foo - "test)
                               (and (vector? test)
                                    (equal? "COMPLETED" (db:test-get-state test))
                                    (equal? "COMPLETED" (db:test-get-state test))
                                    (not (equal? "" (db:test-get-item-path test)))))
                             prereqs)))
            res)) 

         )
    (debug:print-info 4 *default-log-port* "START OF INNER COND #2 "
765
766
767
768
769
770
771
772

773
774
775

776

777
778
779


780
781
782
783
784
785
786
766
767
768
769
770
771
772

773
774
775

776
777
778
779


780
781
782
783
784
785
786
787
788







-
+


-
+

+

-
-
+
+







                         (debug:print 0 *default-log-port* "WARNING: this condition is triggered when there were no items to expand and nothing to run. Please check your run for completeness")
                         (exit 0))
                       (set! runs:nothing-left-in-queue-count (+ runs:nothing-left-in-queue-count 1)))
                   #f)))

            ;;; desired result of below cond branch:
            ;;   we want to expand items in our test of interest (hed) in the following cases:
            ;;    case 1 - mode is itemmatch or itemwait: (TODO)
            ;;    case 1 - mode is itemmatch or itemwait: (DONE)
            ;;       - all prereq tests have been expanded
            ;;       - at least one prereq's items have completed
            ;;    case 2 - mode is toplevel   (DONE)
            ;;    case 2 - mode is toplevel   (PARTIAL)
            ;;       - prereqs are completed.
            ;;       - or no prereqs can complete (TODO)
            ;;    case 3 - mode not specified (DONE)
            ;;       - prereqs are completed and passed (we could consider removing "and passed" -- it would change behavior from current)
            
            ;;       - prereqs are completed and passed (we could consider removing "and passed" -- it would change behavior from current)          
                   
            ;; runs:expand-items case: toplevel or else no dangling prerequisites -- expand items now.
            ((or
              (and have-itemized (null? unexpanded-prereqs) (not (null? completed-prereq-items)))
              (null? prereqs)            ;; nothing is in our way to proceed (need to expand this to an item level check.)
              (and (member 'toplevel testmode)   ;; for toplevel test - proceed (nothing in our way)
                   (null? non-completed)))
             (BB> "cb2")
1423
1424
1425
1426
1427
1428
1429
1430

1431
1432
1433
1434
1435
1436
1437
1425
1426
1427
1428
1429
1430
1431

1432
1433
1434
1435
1436
1437
1438
1439







-
+







		  (loop (runs:queue-next-hed tal reg reglen regfull)
			(runs:queue-next-tal tal reg reglen regfull)
			(runs:queue-next-reg tal reg reglen regfull)
			reruns))))
		  ;; (loop (car tal)(cdr tal) reg reruns))))

	(runs:incremental-print-results run-id)
	(debug:print 4 *default-log-port* "TOP OF LOOP => "
	(debug:print 1 *default-log-port* "TOP OF LOOP => "
		     "test-name: " test-name
		     "\n  test-record  " test-record
		     "\n  hed:         " hed
		     "\n  itemdat:     " itemdat
		     "\n  items:       " items
		     "\n  item-path:   " item-path
		     "\n  waitons:     " waitons
1464
1465
1466
1467
1468
1469
1470







1471
1472
1473
1474
1475
1476
1477
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486







+
+
+
+
+
+
+







					    (not (member waiton reruns)))
				       1
				       #f))
				 waitons))))) ;; could do this more elegantly with a marker....
	  (debug:print 0 *default-log-port* "WARNING: Marking test " tfullname " as not runnable. It is waiting on tests that cannot be run. Giving up now.")
	  (hash-table-set! test-registry tfullname 'removed))


         ; BB - a possibility for preqfail propagation
         ;; ((and (not items) (string? item-path) (not (equal? item-path ""))
         ;;       (lset-intersection testmode '(itemmatch itemwait)))

         ;;  )

	 ;; items is #f then the test is ok to be handed off to launch (but not before)
	 ;; 
	 ((not items)
	  (debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
		   (not (null? tal)))
	      (loop (car tal)(cdr tal) reg reruns))
1587
1588
1589
1590
1591
1592
1593
1594

1595

1596
1597
1598
1599






1600
1601
1602
1603
1604
1605
1606

1607
1608
1609
1610
1611
1612
1613
1614

1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632

1633
1634
1635
1636
1637
1638
1639
1596
1597
1598
1599
1600
1601
1602

1603
1604
1605




1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617

1618
1619
1620
1621
1622
1623
1624
1625

1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643

1644
1645
1646
1647
1648
1649
1650
1651







-
+

+
-
-
-
-
+
+
+
+
+
+






-
+







-
+

















-
+







	    ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    (runs:run-post-hook run-id)
    (debug:print-info 1 *default-log-port* "All tests launched")))

(define (runs:calc-fails prereqs-not-met)
(define (runs:calc-fails prereqs-not-met) ;; BB is this redundant with runs:runable-tests ?
  (filter (lambda (test)
            (or
	    (and (vector? test) ;; not (string? test))
		 (member (db:test-get-state test) '("INCOMPLETE" "COMPLETED"))
		 (not (member (db:test-get-status test)
			      '("PASS" "WARN" "CHECK" "WAIVED" "SKIP")))))
             (and (vector? test) ;; not (string? test))
                  (member (db:test-get-status test) '("TEN_STRIKES" "BLOCKED" "PREQ_FAIL" "ZERO_ITEMS" "PREQ_DISCARDED" "TIMED_OUT" "KILLED")))
             (and (vector? test) ;; not (string? test))
                  (member (db:test-get-state test) '("INCOMPLETE" "COMPLETED"))
                  (not (member (db:test-get-status test)
                               '("PASS" "WARN" "CHECK" "WAIVED" "SKIP"))))))
	  prereqs-not-met))

(define (runs:calc-prereq-fail prereqs-not-met)
  (filter (lambda (test)
	    (and (vector? test) ;; not (string? test))
		 (equal? (db:test-get-state test) "NOT_STARTED")
		 (not (member (db:test-get-status test)
                 (not (member (db:test-get-status test)
			      '("n/a" "KEEP_TRYING")))))
	  prereqs-not-met))

(define (runs:calc-not-completed prereqs-not-met) ;; filter out tests which have reached a ground state -- they are done one way or another.
  (filter 
   (lambda (t)
     (or (not (vector? t))
         (not (and (equal? (db:test-get-state t) "NOT_STARTED") (equal? (db:test-get-status t) "PREQ_FAIL")))
         (not (member (db:test-get-status t) '("TEN_STRIKES" "BLOCKED" "PREQ_FAIL" "ZERO_ITEMS" "PREQ_DISCARDED" "TIMED_OUT" "KILLED")))
	 (not (member (db:test-get-state t) '("INCOMPLETE" "COMPLETED")))))
   prereqs-not-met))

;; (define (runs:calc-not-completed prereqs-not-met)
;;   (filter
;;    (lambda (t)
;;      (or (not (vector? t))
;; 	 (not (equal? "COMPLETED" (db:test-get-state t)))))
;;    prereqs-not-met))

(define (runs:calc-runnable prereqs-not-met)
  (filter 
   (lambda (t)
     (or (not (vector? t))
	 (and (equal? "NOT_STARTED" (db:test-get-state t))
	      (member (db:test-get-status t)
		      '("n/a" "KEEP_TRYING")))
	 (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running
	 (and (member (db:test-get-state t) '("RUNNING" "LAUNCHED" "REMOTEHOSTSTART" ))))) ;; account for a test that is running
   prereqs-not-met))

(define (runs:pretty-string lst)
  (map (lambda (t)
	 (if (not (vector? t))
	     (conc t)
	     (conc (db:test-get-testname t)"/"(db:test-get-item-path t) ":" (db:test-get-state t) "/" (db:test-get-status t))))