Megatest

Check-in [5cd9ae113c]
Login
Overview
Comment:Merged in updates from v1.65
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-use-pkts
Files: files | file ages | folders
SHA1: 5cd9ae113ca4a1971dc2fe123b251bd7f64877d4
User & Date: matt on 2017-05-25 23:16:18
Other Links: branch diff | manifest | tags
Context
2017-05-25
23:17
Applied stash with better meta data in pkt for configs check-in: f2e853123a user: matt tags: v1.65-use-pkts
23:16
Merged in updates from v1.65 check-in: 5cd9ae113c user: matt tags: v1.65-use-pkts
23:12
Merging in recent fixed from v1.64 check-in: b44a2afd2a user: matt tags: v1.65
17:11
Minor updates for pkt dumping check-in: 465e3734f8 user: mrwellan tags: v1.65-use-pkts
Changes

Modified common.scm from [7f4a9cee8c] to [e816a82f7a].

756
757
758
759
760
761
762

763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
		  (if (not *time-to-exit*) (loop))))
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))

;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")

  (if (common:on-homehost?)
      (let ((dbstruct (db:setup #t)))
	(debug:print-info 13 *default-log-port* "after db:setup with dbstruct="dbstruct)
	(cond
	 ((dbr:dbstruct-read-only dbstruct)
	  (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	  (common:readonly-watchdog dbstruct))
	 (else
	  (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	  (common:writable-watchdog dbstruct)))
	(debug:print-info 13 *default-log-port* "watchdog done."))
      (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))


(define (std-exit-procedure)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f







>
|
|
|
|
|
|
|
|
|
|
|
|







756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
		  (if (not *time-to-exit*) (loop))))
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))

;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  (if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))
	     (else
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	      (common:writable-watchdog dbstruct)))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
1754
1755
1756
1757
1758
1759
1760
1761

1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774

1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
       (lambda ()
          (for-each (lambda (keyval)
		      (let* ((key   (car keyval))
			     (val   (cdr keyval))
			     (delim (if (string-search whitesp val) 
					"\""
					"")))
			(print (if (member key ignorevars)

				   "# setenv "
				   "setenv ")
			       key " " delim (mungeval val) delim)))
		    envvars)))
     (with-output-to-file (conc fname ".sh")
       (lambda ()
          (for-each (lambda (keyval)
		      (let* ((key (car keyval))
			     (val (cdr keyval))
			     (delim (if (string-search whitesp val) 
					"\""
					"")))
			(print (if (or (member key ignorevars)

				       (string-search ":" key)) ;; internal only values to be skipped.
				   "# export "
				   "export ")
			       key "=" delim (mungeval val) delim)))
                    envvars)))))

;; set some env vars from an alist, return an alist with original values
;; (("VAR" "value") ...)
(define (alist->env-vars lst)
  (if (list? lst)
      (let ((res '()))
	(for-each (lambda (p)
		    (let* ((var (car  p))
			   (val (cadr p))
			   (prv (get-environment-variable var)))
		      (set! res (cons (list var prv) res))
		      (if val 
			  (setenv var (->string val))
			  (unsetenv var))))
		  lst)
	res)
      '()))

;; clear vars matching pattern, run proc, set vars back
;; if proc is a string run that string as a command with







|
>













>

















|







1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
       (lambda ()
          (for-each (lambda (keyval)
		      (let* ((key   (car keyval))
			     (val   (cdr keyval))
			     (delim (if (string-search whitesp val) 
					"\""
					"")))
			(print (if (or (member key ignorevars)
				       (string-search whitesp key))
				   "# setenv "
				   "setenv ")
			       key " " delim (mungeval val) delim)))
		    envvars)))
     (with-output-to-file (conc fname ".sh")
       (lambda ()
          (for-each (lambda (keyval)
		      (let* ((key (car keyval))
			     (val (cdr keyval))
			     (delim (if (string-search whitesp val) 
					"\""
					"")))
			(print (if (or (member key ignorevars)
				       (string-search whitesp key)
				       (string-search ":" key)) ;; internal only values to be skipped.
				   "# export "
				   "export ")
			       key "=" delim (mungeval val) delim)))
                    envvars)))))

;; set some env vars from an alist, return an alist with original values
;; (("VAR" "value") ...)
(define (alist->env-vars lst)
  (if (list? lst)
      (let ((res '()))
	(for-each (lambda (p)
		    (let* ((var (car  p))
			   (val (cadr p))
			   (prv (get-environment-variable var)))
		      (set! res (cons (list var prv) res))
		      (if val 
			  (safe-setenv var (->string val))
			  (unsetenv var))))
		  lst)
	res)
      '()))

;; clear vars matching pattern, run proc, set vars back
;; if proc is a string run that string as a command with

Modified db.scm from [1b0c158e19] to [25f1970af9].

935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
;;  'old2new      - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db
;;  'new2old      - sync /tmp/.../megatest.db to megatest.db and /tmp/.../megatest_ref.db (and update data_synced)
;;  'closeall     - close all opened dbs
;;  'schema       - attempt to apply schema changes
;;  run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
  (if (not (launch:setup))
      (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
      (let* ((mtdb     (dbr:dbstruct-mtdb dbstruct))
	     (tmpdb    (db:get-db dbstruct))
             (refndb   (dbr:dbstruct-refndb dbstruct))
	     (allow-cleanup #t) ;; (if run-ids #f #t))
	     (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))
	     (data-synced 0)) ;; count of changed records (I hope)

	(for-each
	 (lambda (option)

	   (case option
	     ;; kill servers
	     ((killservers)
	      (for-each
	       (lambda (server)
		 (match-let (((mod-time host port start-time pid) server))
			    (if (and host pid)
				(tasks:kill-server host pid))))
	       servers))

	     ;; clear out junk records
	     ;;
	     ((dejunk)
	      (db:delay-if-busy mtdb) ;; ok to delay on mtdb
	      (db:clean-up mtdb)
	      (db:clean-up tmpdb)
              (db:clean-up refndb))

	     ;; sync runs, test_meta etc.
	     ;;
	     ((old2new)
	      (set! data-synced
		    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb)
		       data-synced)))
	     
	     ;; now ensure all newdb data are synced to megatest.db
	     ;; do not use the run-ids list passed in to the function
	     ;;
	     ((new2old)
	      (set! data-synced
		    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)
		       data-synced)))

	     ((adj-target)
	      (db:adj-target (db:dbdat-get-db mtdb))
	      (db:adj-target (db:dbdat-get-db tmpdb))
	      (db:adj-target (db:dbdat-get-db refndb)))
	   
	     ((schema)
              (db:patch-schema-maindb (db:dbdat-get-db mtdb))
              (db:patch-schema-maindb (db:dbdat-get-db tmpdb))
              (db:patch-schema-maindb (db:dbdat-get-db refndb))
              (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
              (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
              (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
	
	   (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
	 options)
	data-synced)))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
  (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...")
  (exit)
  (if (or *db-write-access*







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
;;  'old2new      - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db
;;  'new2old      - sync /tmp/.../megatest.db to megatest.db and /tmp/.../megatest_ref.db (and update data_synced)
;;  'closeall     - close all opened dbs
;;  'schema       - attempt to apply schema changes
;;  run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
  ;; (if (not (launch:setup))
  ;;    (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
  (let* ((mtdb     (dbr:dbstruct-mtdb dbstruct))
	 (tmpdb    (db:get-db dbstruct))
	 (refndb   (dbr:dbstruct-refndb dbstruct))
	 (allow-cleanup #t) ;; (if run-ids #f #t))
	 (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))
	 (data-synced 0)) ;; count of changed records (I hope)
    
    (for-each
     (lambda (option)
       
       (case option
	 ;; kill servers
	 ((killservers)
	  (for-each
	   (lambda (server)
	     (match-let (((mod-time host port start-time pid) server))
	       (if (and host pid)
		   (tasks:kill-server host pid))))
	   servers))
	 
	 ;; clear out junk records
	 ;;
	 ((dejunk)
	  (db:delay-if-busy mtdb) ;; ok to delay on mtdb
	  (db:clean-up mtdb)
	  (db:clean-up tmpdb)
	  (db:clean-up refndb))

	 ;; sync runs, test_meta etc.
	 ;;
	 ((old2new)
	  (set! data-synced
	    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb)
	       data-synced)))
	 
	 ;; now ensure all newdb data are synced to megatest.db
	 ;; do not use the run-ids list passed in to the function
	 ;;
	 ((new2old)
	  (set! data-synced
	    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)
	       data-synced)))

	 ((adj-target)
	  (db:adj-target (db:dbdat-get-db mtdb))
	  (db:adj-target (db:dbdat-get-db tmpdb))
	  (db:adj-target (db:dbdat-get-db refndb)))
	 
	 ((schema)
	  (db:patch-schema-maindb (db:dbdat-get-db mtdb))
	  (db:patch-schema-maindb (db:dbdat-get-db tmpdb))
	  (db:patch-schema-maindb (db:dbdat-get-db refndb))
	  (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
       
       (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
     options)
    data-synced))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
  (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...")
  (exit)
  (if (or *db-write-access*
3257
3258
3259
3260
3261
3262
3263

3264

3265
3266
3267
3268
3269
3270
3271
			   (db:get-test-info       dbstruct run-id test-name item-path)))
	 (test-id      (db:test-get-id testdat))
	 (test-name    (if (number? test-name)
			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
         (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))

         (tl-test-id   (db:test-get-id tl-testdat)))

    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbstruct 'set-test-start-time (list test-id)))
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (let ((tr-res







>
|
>







3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
			   (db:get-test-info       dbstruct run-id test-name item-path)))
	 (test-id      (db:test-get-id testdat))
	 (test-name    (if (number? test-name)
			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
         (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
         (tl-test-id   (if tl-testdat
			   (db:test-get-id tl-testdat)
			   #f)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbstruct 'set-test-start-time (list test-id)))
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (let ((tr-res
3323
3324
3325
3326
3327
3328
3329

3330
3331
3332
3333
3334
3335
3336
3337
							  (and (equal? newstate "NOT_STARTED")
							       (> num-non-completes 0)))
						      "STARTED"
                                                      (car all-curr-statuses))))
                       ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states)
                       ;;      " newstate: " newstate " newstatus: " newstatus)
                       ;; NB// Pass the db so it is part of the transaction

                       (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
         tr-res)))))
;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status*
(define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path)
  (db:with-db







>
|







3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
							  (and (equal? newstate "NOT_STARTED")
							       (> num-non-completes 0)))
						      "STARTED"
                                                      (car all-curr-statuses))))
                       ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states)
                       ;;      " newstate: " newstate " newstatus: " newstatus)
                       ;; NB// Pass the db so it is part of the transaction
                       (if tl-test-id
			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f))))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
         tr-res)))))
;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status*
(define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path)
  (db:with-db

Modified launch.scm from [454af8a4ba] to [68a0486978].

312
313
314
315
316
317
318

319
320
321
322
323
324
325
326
327
328
329
330
331
332


333
334
335
336
337
338


339

340
341
342


343
344
345
346
347
348
349
350

351
352
353
354
355
356
357
358
		    (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
			(if (not (null? tal))
			    (loop (car tal) (cdr tal) stepname))
			(debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
		  (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)

  (let* ((start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory))))


      (let ((new-cpu-load (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
				 (delta (abs (- load cpu-load))))
			    (if (> delta 0.1) ;; don't bother updating with small changes
				load
				#f)))
	    (new-disk-free (let* ((df    (get-df (current-directory)))


				  (delta (abs (- df disk-free))))

			     (if (> delta 200) ;; ignore changes under 200 Meg
				 df
				 #f))))


	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
						(time-exceeded (> run-seconds runtlim)))
					   (if time-exceeded
					       (begin
						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
						 #t)
					       #f)))))

	(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
	(if kill-job? 
	    (begin
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?
	      (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0))







>
|












|
>
>
|
|
|
|
|
|
>
>
|
>
|
|
|
>
>








>
|







312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
		    (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
			(if (not (null? tal))
			    (loop (car tal) (cdr tal) stepname))
			(debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
		  (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
         (start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
                                              (get-df (current-directory))
                                              disk-free))
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time)))
        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
						(time-exceeded (> run-seconds runtlim)))
					   (if time-exceeded
					       (begin
						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
						 #t)
					       #f)))))
        (if do-sync
            (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
	(if kill-job? 
	    (begin
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?
	      (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0))
392
393
394
395
396
397
398
399



400
401
402
403
404
405
406
	      (mutex-unlock! m)
	      ;; no point in sticking around. Exit now.
	      (exit)))
	(if (hash-table-ref/default misc-flags 'keep-going #f)
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free)))))))



    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)







|
>
>
>







401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
	      (mutex-unlock! m)
	      ;; no point in sticking around. Exit now.
	      (exit)))
	(if (hash-table-ref/default misc-flags 'keep-going #f)
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
840
841
842
843
844
845
846



847


848
849
850
851
852
853
854
855
856
      *toppath*   ;; return toppath
      (let* ((use-cache (common:use-cache?)) ;; BB- use-cache checks *configdat* for use-cache setting.  We do not have *configdat*.  Bootstrapping problem here.
	     (toppath  (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath
	     (target   (common:args-get-target))
	     (sections (if target (list "default" target) #f)) ;; for runconfigs
	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
             (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig))



	     (mtcachef   (car cachefiles)) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))


	     (rccachef   (cdr cachefiles)) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     ) ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?)))))
	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource
        ;;(BB> "launch:setup-body -- cachefiles="cachefiles)
	(cond
	 ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME
	 ((and (not force-reread) mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache)
          ;;(BB> "launch:setup-body -- cond branch 1 - use-cache")
          (set! *configdat*    (configf:read-alist mtcachef))







>
>
>
|
>
>
|
|







852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
      *toppath*   ;; return toppath
      (let* ((use-cache (common:use-cache?)) ;; BB- use-cache checks *configdat* for use-cache setting.  We do not have *configdat*.  Bootstrapping problem here.
	     (toppath  (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath
	     (target   (common:args-get-target))
	     (sections (if target (list "default" target) #f)) ;; for runconfigs
	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
             (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig))
	     ;; checking for null cachefiles should not be necessary, I was seeing error car of '(), might be a chicken bug or a red herring ...
	     (mtcachef   (if (null? cachefiles)
			     #f
			     (car cachefiles))) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     (rccachef   (if (null? cachefiles)
			     #f
			     (cdr cachefiles)))) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
	      ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?)))))
	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource
        ;;(BB> "launch:setup-body -- cachefiles="cachefiles)
	(cond
	 ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME
	 ((and (not force-reread) mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache)
          ;;(BB> "launch:setup-body -- cond branch 1 - use-cache")
          (set! *configdat*    (configf:read-alist mtcachef))

Modified megatest.scm from [425b1fc1e3] to [5793c013b1].

385
386
387
388
389
390
391
392








393
394
395
396
397
398
399
(let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
  (if targ (setenv "MT_TARGET" targ)))

;; The watchdog is to keep an eye on things like db sync etc.
;;

;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define *watchdog* (make-thread common:watchdog "Watchdog thread"))









;;(if (not (args:get-arg "-server"))
;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
(let* ((no-watchdog-args
       '("-list-runs"
         "-list-servers"
         "-server"







|
>
>
>
>
>
>
>
>







385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
(let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
  (if targ (setenv "MT_TARGET" targ)))

;; The watchdog is to keep an eye on things like db sync etc.
;;

;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define *watchdog* (make-thread
		    (lambda ()
		      (handle-exceptions
			  exn
			  (begin
			    (print-call-chain)
			    (print " message: " ((condition-property-accessor 'exn 'message) exn)))
			(common:watchdog)))
		    "Watchdog thread"))

;;(if (not (args:get-arg "-server"))
;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
(let* ((no-watchdog-args
       '("-list-runs"
         "-list-servers"
         "-server"

Modified runs.scm from [4535e4953f] to [cca5799598].

511
512
513
514
515
516
517





518

519
520
521
522
523
524
525
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 (th1        (make-thread (lambda ()





					    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))

					    ;; (handle-exceptions
					    ;;  exn
					    ;;  (begin
					    ;;    (print-call-chain (current-error-port))
					    ;;    (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn))
					    ;;    (if (> run-queue-retries 0)
					    ;; 	   (begin







>
>
>
>
>
|
>







511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 (th1        (make-thread (lambda ()
					    (handle-exceptions
						exn
						(begin
						  (print-call-chain)
						  (print " message: " ((condition-property-accessor 'exn 'message) exn)))
					      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)))
					    ;; (handle-exceptions
					    ;;  exn
					    ;;  (begin
					    ;;    (print-call-chain (current-error-port))
					    ;;    (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn))
					    ;;    (if (> run-queue-retries 0)
					    ;; 	   (begin

Modified tests/tests.scm from [63be015a8f] to [9ef0e0195b].

15
16
17
18
19
20
21
















22
23
24
25
26
27
28
(require-extension posix)
(import posix)
(import srfi-18)
;; (require-extension zmq)
;; (import zmq)

(define test-work-dir (current-directory))

















;; read in all the _record files
(let ((files (glob "*_records.scm")))
  (for-each
   (lambda (file)
     (print "Loading " file)
     (load file))







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
(require-extension posix)
(import posix)
(import srfi-18)
;; (require-extension zmq)
;; (import zmq)

(define test-work-dir (current-directory))

;; given list of lists
;;  ( ( msg expected param1 param2 ...)
;;    ( ... ) )
;; apply test to all
;;
(define (test-batch proc pname inlst #!key (post-proc #f))
  (for-each
   (lambda (spec)
     (let ((msg    (conc pname " " (car spec)))
           (result (cadr spec))
           (params (cddr spec)))
       (if post-proc
           (test msg result (post-proc (apply proc params)))
           (test msg result (apply proc params)))))
   inlst))

;; read in all the _record files
(let ((files (glob "*_records.scm")))
  (for-each
   (lambda (file)
     (print "Loading " file)
     (load file))

Modified tests/unittests/all-rmt.scm from [091111a6e5] to [47417391a0].

26
27
28
29
30
31
32









33

34

35

36
37
38
39
40
41
42
(test #f #t (string? (server:check-if-running ".")))
;; DEF (test #f #f (rmt:send-receive-no-auto-client-setup *runremote* 'get-keys #f '()))
;; DEF (rmt:kill-server run-id)
;; DEF (rmt:start-server run-id)
(test #f '(#t "successful login")(rmt:login #f))
;; DEF (rmt:login-no-auto-client-setup connection-info)
(test #f #t (pair? (rmt:get-latest-host-load (get-host-name))))









(test #f #t (list? (rmt:get-changed-record-ids 0)))

(test #f #f (begin (runs:update-all-test_meta #f) #f))

(test #f '("test1" "test2")(sort (alist-ref "tagtwo" (hash-table->alist (rmt:get-tests-tags)) equal?) string<=))

(test #f '() (rmt:get-key-val-pairs 0))
(test #f '("SYSTEM" "RELEASE") (rmt:get-keys))
(test #f '("SYSTEM" "RELEASE") (rmt:get-keys-write)) ;; dummy query to force server start
(test #f '() (rmt:get-key-vals 1))
(test #f (vector '("SYSTEM" "RELEASE") '()) (rmt:get-targets))
(test #f "" (rmt:get-target 1))
(test #f #t (rmt:register-test 1 "foo" ""))







>
>
>
>
>
>
>
>
>

>

>

>







26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
(test #f #t (string? (server:check-if-running ".")))
;; DEF (test #f #f (rmt:send-receive-no-auto-client-setup *runremote* 'get-keys #f '()))
;; DEF (rmt:kill-server run-id)
;; DEF (rmt:start-server run-id)
(test #f '(#t "successful login")(rmt:login #f))
;; DEF (rmt:login-no-auto-client-setup connection-info)
(test #f #t (pair? (rmt:get-latest-host-load (get-host-name))))

;; get-latest-host-load does a lookup in the db, it won't return a useful value unless
;; a test ran recently on host
(test-batch rmt:get-latest-host-load
            "rmt:get-latest-host-load"
            (list (list "localhost"  #t (get-host-name))
                  (list "not-a-host" #t "not-a-host"  ))
            post-proc: pair?)
                                           
(test #f #t (list? (rmt:get-changed-record-ids 0)))

(test #f #f (begin (runs:update-all-test_meta #f) #f))

(test #f '("test1" "test2")(sort (alist-ref "tagtwo" (hash-table->alist (rmt:get-tests-tags)) equal?) string<=))

(test #f '() (rmt:get-key-val-pairs 0))
(test #f '("SYSTEM" "RELEASE") (rmt:get-keys))
(test #f '("SYSTEM" "RELEASE") (rmt:get-keys-write)) ;; dummy query to force server start
(test #f '() (rmt:get-key-vals 1))
(test #f (vector '("SYSTEM" "RELEASE") '()) (rmt:get-targets))
(test #f "" (rmt:get-target 1))
(test #f #t (rmt:register-test 1 "foo" ""))
80
81
82
83
84
85
86

87















88
89
90
91
92
93
94
(test #f #t (vector? (rmt:get-runs "%" 10 0 keypatts)))
(test #f '(1)(rmt:get-all-run-ids))
(test #f '()(rmt:get-prev-run-ids 1))
(test #f #t (begin (rmt:lock/unlock-run 1 #t #f "mikey") #t))
(test #f "JUSTFINE" (rmt:get-run-status 1))
(test #f #t (begin (rmt:set-run-status 1 "NOTFINE" msg: "A message") #t))
(test #f #t (begin (rmt:update-run-event_time 1) #t))

;; (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default















;; (rmt:find-and-mark-incomplete run-id ovr-deadtime)
;; (rmt:get-main-run-stats run-id)
;; (rmt:get-var varname)
;; (rmt:set-var varname value)
;; (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))
;; (rmt:get-previous-test-run-record run-id test-name item-path)
;; (rmt:get-run-stats)







>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
(test #f #t (vector? (rmt:get-runs "%" 10 0 keypatts)))
(test #f '(1)(rmt:get-all-run-ids))
(test #f '()(rmt:get-prev-run-ids 1))
(test #f #t (begin (rmt:lock/unlock-run 1 #t #f "mikey") #t))
(test #f "JUSTFINE" (rmt:get-run-status 1))
(test #f #t (begin (rmt:set-run-status 1 "NOTFINE" msg: "A message") #t))
(test #f #t (begin (rmt:update-run-event_time 1) #t))

;; (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default
;;
(let ((keys (rmt:get-keys))
      (rnp  "%")    ;; run name patt
      (tpt  "%/%")) ;; target patt
  (test-batch rmt:get-runs-by-patt
              "rmt:get-runs-by-patt"
              (list (list "t=0" #t keys rnp tpt #f #f #f 0)
                    (list "t=current" #f keys rnp tpt #f #f #f (+ 100 (current-seconds))) ;; should be no records from the future
                    )
              post-proc: (lambda (res)
                           ;; (print "rmt:get-runs-by-patt returned: " res)
                           (and (vector? res)
                                (let ((rows (vector-ref res 1)))
                                  (> (length rows) 0))))))

;; (rmt:find-and-mark-incomplete run-id ovr-deadtime)
;; (rmt:get-main-run-stats run-id)
;; (rmt:get-var varname)
;; (rmt:set-var varname value)
;; (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))
;; (rmt:get-previous-test-run-record run-id test-name item-path)
;; (rmt:get-run-stats)