Megatest

Check-in [d8f671770d]
Login
Overview
Comment:Added back the improved db:find-and-mark-incomplete routine.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.55 | v1.5519
Files: files | file ages | folders
SHA1: d8f671770d1aa303cac1f400c3a10c1a8786e0fd
User & Date: mrwellan on 2014-05-20 17:20:19
Other Links: branch diff | manifest | tags
Context
2014-05-20
23:54
Properly deal with roll up vs. setting to INCOMPLETE in old tests stuck in LAUNCHED check-in: b54fa13558 user: matt tags: v1.55
17:20
Added back the improved db:find-and-mark-incomplete routine. check-in: d8f671770d user: mrwellan tags: v1.55, v1.5519
01:16
Bumped version to v1.5519 check-in: d14bc79960 user: matt tags: v1.55, v1.5519
Changes

Modified db.scm from [ae5afe1970] to [e4c046d5ff].

480
481
482
483
484
485
486
487
488
489

490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508

509
510






511
512
513
514
515
516
517

518
519
520
521
522
523
524
525
526
527







528


529
530
531
532
533
534
535









536
537
538
539
540
541
542
;;======================================================================

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCED'));


(define (db:find-and-mark-incomplete db #!key (ovr-deadtime #f))
  (let* ((incompleted '())

	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
	 (deadtime     (if (and deadtime-str
				(string->number deadtime-str))
			   (string->number deadtime-str)
			   7200)) ;; two hours
	 (run-ids      (db:get-run-ids db))) ;; iterate over runs to divy up the calls
    (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
    (for-each
     (lambda (run-id)

       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; THIS CANNOT WORK. The run_duration is not updated in the central db due to performance concerns.
       ;;                   The testdat.db file must be consulted.
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 

       (sqlite3:for-each-row 
	(lambda (test-id)






	  (set! incompleted (cons test-id incompleted)))
	db
	"SELECT id FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time - run_duration) > ? AND state IN ('RUNNING','REMOTEHOSTSTART');"
	run-id deadtime)

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;

       (sqlite3:for-each-row
	(lambda (test-id)
	  (set! incompleted (cons test-id incompleted)))
	db
	"SELECT id FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time - run_duration) > ? AND state IN ('LAUNCHED');"
	run-id (* 60 60 24)))
     run-ids)
       
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;







    (if (> (length incompleted) 0)


	(begin
	  (debug:print 0 "WARNING: Marking test(s); " (string-intersperse (map conc incompleted) ", ") " as INCOMPLETE")
	  (sqlite3:execute 
	   db
	   (conc "UPDATE tests SET state='INCOMPLETE' WHERE id IN (" 
		 (string-intersperse (map conc incompleted) ",")
		 ");"))))))









		     
;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
;;
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'







<


>



















>

|
>
>
>
>
>
>
|

|
|



>

|
|

|
|

|


>
>
>
>
>
>
>
|
>
>
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>







480
481
482
483
484
485
486

487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
;;======================================================================

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCED'));


(define (db:find-and-mark-incomplete db #!key (ovr-deadtime #f))
  (let* ((incompleted '())
	 (toplevels   '())
	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
	 (deadtime     (if (and deadtime-str
				(string->number deadtime-str))
			   (string->number deadtime-str)
			   7200)) ;; two hours
	 (run-ids      (db:get-run-ids db))) ;; iterate over runs to divy up the calls
    (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
    (for-each
     (lambda (run-id)

       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; THIS CANNOT WORK. The run_duration is not updated in the central db due to performance concerns.
       ;;                   The testdat.db file must be consulted.
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 
       (db:delay-if-busy)
       (sqlite3:for-each-row 
	(lambda (test-id run-dir uname testname item-path)
	  (if (and (equal? uname "n/a")
		   (equal? item-path "")) ;; this is a toplevel test
	      ;; what to do with toplevel? call rollup?
	      (begin
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(debug:print-info 0 "Found old toplevel test in RUNNING state, test-id=" test-id))
	      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
	db
	"SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 600 AND state IN ('RUNNING','REMOTEHOSTSTART');"
	run-id)

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       (db:delay-if-busy)
       (sqlite3:for-each-row
	(lambda (test-id run-dir uname testname item-path)
	  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))
	db
	"SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
	run-id))
     run-ids)
    
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;
    (db:delay-if-busy)
    (let* ((min-incompleted (filter (lambda (x)
				     (let* ((testpath (cadr x))
					    (tdatpath (conc testpath "/testdat.db"))
					    (dbexists (file-exists? tdatpath)))
				       (or (not dbexists) ;; if no file then something wrong - mark as incomplete
					   (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
				   incompleted))
	  (min-incompleted-ids (map car min-incompleted)))
      (if (> (length min-incompleted-ids) 0)
	  (begin
	    (debug:print 0 "WARNING: Marking test(s); " (string-intersperse (map conc min-incompleted-ids) ", ") " as INCOMPLETE")
	    (sqlite3:execute 
	     db
	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE id IN (" 
		   (string-intersperse (map conc min-incompleted-ids) ",")
		   ");")))))

    ;; Now do rollups for the toplevel tests
    ;;
    (for-each
     (lambda (toptest)
       (let ((test-name (list-ref toptest 3))
	     (run-id    (list-ref toptest 5)))
	 (cdb:top-test-set-per-pf-counts *runremote* run-id test-name)))
     toplevels)))
		     
;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
;;
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'

Modified runs.scm from [9ea69b3434] to [596fc89645].

857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882

883
884
885
886
887
888
889
890
891
892
893
894
895
896
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (current-seconds))
	(last-time-some-running (current-seconds)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
		      (st (db:test-get-state     trec)))
		  (if (not (equal? st "DELETED"))
		      (hash-table-set! test-registry (runs:make-full-test-name tn ip) (string->symbol st)))))
	      tests-info)
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names))
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))

      (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      ;; (if (> (current-seconds)(+ last-time-incomplete 900))
      ;;     (begin
      ;;       (set! last-time-incomplete (current-seconds))
      ;;       (cdb:remote-run db:find-and-mark-incomplete #f)))

      ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
      (let* ((test-record (hash-table-ref test-records hed))
	     (test-name   (tests:testqueue-get-testname test-record))
	     (tconfig     (tests:testqueue-get-testconfig test-record))
	     (jobgroup    (config-lookup tconfig "test_meta" "jobgroup"))
	     (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))







|


















>



|
|
|
|







857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
	(last-time-some-running (current-seconds)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
		      (st (db:test-get-state     trec)))
		  (if (not (equal? st "DELETED"))
		      (hash-table-set! test-registry (runs:make-full-test-name tn ip) (string->symbol st)))))
	      tests-info)
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names))
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))

      (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      (if (> (current-seconds)(+ last-time-incomplete 900))
          (begin
            (set! last-time-incomplete (current-seconds))
            (cdb:remote-run db:find-and-mark-incomplete #f)))

      ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
      (let* ((test-record (hash-table-ref test-records hed))
	     (test-name   (tests:testqueue-get-testname test-record))
	     (tconfig     (tests:testqueue-get-testconfig test-record))
	     (jobgroup    (config-lookup tconfig "test_meta" "jobgroup"))
	     (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))