Megatest

Check-in [5ef2e6042b]
Login
Overview
Comment:Adjusted server startup timing values. Changed CHECK to be a failing status for prereq. Moved some debug messages to higher debug levels. Added unsetting of HTTP_PROXY env vars.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65
Files: files | file ages | folders
SHA1: 5ef2e6042b683d03f2fcbd53df5a061c4b665d87
User & Date: mmgraham on 2021-04-23 14:55:50
Other Links: branch diff | manifest | tags
Context
2021-04-29
15:11
reverted the change to condition the call to runs:incremental-print-results on debug 3. This was breaking the subrun test. check-in: 43dabdcf28 user: mmgraham tags: v1.65
2021-04-23
14:55
Adjusted server startup timing values. Changed CHECK to be a failing status for prereq. Moved some debug messages to higher debug levels. Added unsetting of HTTP_PROXY env vars. check-in: 5ef2e6042b user: mmgraham tags: v1.65
2021-04-06
13:30
made the sourcing of cfg.sh not dependent on LD_LIBRARY_PATH being set check-in: 38a8d8c4ea user: mmgraham tags: v1.65
Changes

Modified common.scm from [82673dacdb] to [526a2263d9].

815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
    (11 "PREQ_DISCARDED")
    (12 "ABORT")))

(define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
  '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" ))

(define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked
  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD"))

(define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed
  '("PASS" "WARN" "CHECK" "WAIVED" "SKIP"))

;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
(define *common:running-states*     ;; test is either running or can be run
  '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED"))

(define *common:cant-run-states*    ;; These are stopping conditions that prevent a test from being run
  '("COMPLETED" "KILLED" "UNKNOWN" "INCOMPLETE" "ARCHIVED"))







|


|







815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
    (11 "PREQ_DISCARDED")
    (12 "ABORT")))

(define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
  '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" ))

(define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked
  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK"))

(define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed
  '("PASS" "WARN" "WAIVED" "SKIP"))

;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
(define *common:running-states*     ;; test is either running or can be run
  '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED"))

(define *common:cant-run-states*    ;; These are stopping conditions that prevent a test from being run
  '("COMPLETED" "KILLED" "UNKNOWN" "INCOMPLETE" "ARCHIVED"))
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
;;   count     - count down to zero, at some point we'd give up if the load never drops
;;   num-tries - count down to zero number tries to get numcpus
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
				 #!key (count 1000)
				 (msg #f)(remote-host #f)(num-tries 5))
  (let* ((loadavg     (common:get-cpu-load remote-host))
	  ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
	 (numcpus     (if (<= 1 numcpus-in)
			  (common:get-num-cpus remote-host)
			  numcpus-in))
	 (first       (car loadavg))
	 (next        (cadr loadavg))
	 (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug
						   ;; where numcpus
						   ;; (or could be
						   ;; maxload) is
						   ;; zero, crude
						   ;; fallback is to
						   ;; at least use 1
	 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
	 ;; etc.
	 (effective-load    (common:get-intercept first next))
	 (recommended-delay (common:get-delay effective-load numcpus))
	 (effective-host    (or remote-host "localhost"))
	 (normalized-effective-load (/ effective-load numcpus))
	 (will-wait                 (> normalized-effective-load maxnormload)))
    (if (> recommended-delay 1) 
	(let* ((actual-delay (min recommended-delay 30)))
	  (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
	      (debug:print-info 0 *default-log-port* "Load control, delaying "
				actual-delay " seconds to maintain safe load. current normalized effective load is "
				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
	  (thread-sleep! actual-delay)))
    
    (cond
     ;; bad data, try again to get the data
     ((not will-wait)
      (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
      	  (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))








|
|
|
<
|
|
|
<
<
<
<
|
<
|
|
|
|
|
|
|
|
|
|
|


|







2206
2207
2208
2209
2210
2211
2212
2213
2214
2215

2216
2217
2218




2219

2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
;;   count     - count down to zero, at some point we'd give up if the load never drops
;;   num-tries - count down to zero number tries to get numcpus
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
				 #!key (count 1000)
				 (msg #f)(remote-host #f)(num-tries 5))
  (let* ((loadavg     (common:get-cpu-load remote-host))
    ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
    (numcpus     (if (<= 1 numcpus-in)
    (common:get-num-cpus remote-host) numcpus-in))

    (first       (car loadavg))
    (next        (cadr loadavg))
    (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude




						  ;; fallback is to at least use 1

    ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
    ;; etc.
    (effective-load    (common:get-intercept first next))
    (recommended-delay (common:get-delay effective-load numcpus))
    (effective-host    (or remote-host "localhost"))
    (normalized-effective-load (/ effective-load numcpus))
    (will-wait                 (> normalized-effective-load maxnormload)))
    (if (and will-wait (> recommended-delay 1)) 
      (let* ((actual-delay (min recommended-delay 30)))
        (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
          (debug:print-info 0 *default-log-port* "Load control, delaying "
				actual-delay " seconds to maintain safe load. current normalized effective load is "
				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
	(thread-sleep! actual-delay)))
    
    (cond
     ;; bad data, try again to get the data
     ((not will-wait)
      (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
      	  (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))

Modified db.scm from [ed256dd44f] to [65246b91b8].

4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678

4679
4680
4681
4682
4683
4684
4685
		(loop (car tal)(cdr tal) newr)))))))




;; the new prereqs calculation, looks also at itempath if specified
;; all prereqs must be met
;;    if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met
;;    if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met
;;
;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED)
;;       mode 'toplevel means that tests must be COMPLETED only
;;       mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]]
;;       mode 'exclusive means this test/item cannot run if the same test/item is LAUNCHED,REMOTEHOSTSTART or RUNNING
;;
;; IDEA for consideration:
;;    1. collect all tests "upstream"
;;    2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list
;; 
;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode)
(define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f))
  ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items

  (append
   (if (member 'exclusive mode)
       (let ((running-tests (db:get-tests-for-run dbstruct
						  #f  ;; run-id of #f means for all runs. 
						  (if (string=? ref-item-path "")   ;; testpatt
						      ref-test-name
						      (conc ref-test-name "/" ref-item-path))







|














>







4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
		(loop (car tal)(cdr tal) newr)))))))




;; the new prereqs calculation, looks also at itempath if specified
;; all prereqs must be met
;;    if prereq test with itempath='' is in common:well-ended-states, then prereq is met
;;    if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met
;;
;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED)
;;       mode 'toplevel means that tests must be COMPLETED only
;;       mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]]
;;       mode 'exclusive means this test/item cannot run if the same test/item is LAUNCHED,REMOTEHOSTSTART or RUNNING
;;
;; IDEA for consideration:
;;    1. collect all tests "upstream"
;;    2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list
;; 
;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode)
(define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f))
  ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items
  (debug:print 4 *default-log-port* "db:get-prereqs-not-met: " waitons)
  (append
   (if (member 'exclusive mode)
       (let ((running-tests (db:get-tests-for-run dbstruct
						  #f  ;; run-id of #f means for all runs. 
						  (if (string=? ref-item-path "")   ;; testpatt
						      ref-test-name
						      (conc ref-test-name "/" ref-item-path))
4698
4699
4700
4701
4702
4703
4704


4705
4706
4707
4708
4709
4710
4711
	;;	(if (equal? (db:test-get-item-path testdat) "")
	;;	    (db:test-get-testname testdat)
	;;	    (conc (db:test-get-testname testdat)
	;;		  "/"
	;;		  (db:test-get-item-path testdat))))
	 running-tests) ;; calling functions want the entire data
       '())



   ;; collection of: for each waiton -
   ;;   if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch:
   ;;     if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite
   ;;     if waiton is itemized:
   ;;           and waiton's items are not expanded, add as unmet prerequisite
   ;;           else if matching waiton item is not both completed and in an ok status, add as unmet prerequisite







>
>







4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
	;;	(if (equal? (db:test-get-item-path testdat) "")
	;;	    (db:test-get-testname testdat)
	;;	    (conc (db:test-get-testname testdat)
	;;		  "/"
	;;		  (db:test-get-item-path testdat))))
	 running-tests) ;; calling functions want the entire data
       '())



   ;; collection of: for each waiton -
   ;;   if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch:
   ;;     if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite
   ;;     if waiton is itemized:
   ;;           and waiton's items are not expanded, add as unmet prerequisite
   ;;           else if matching waiton item is not both completed and in an ok status, add as unmet prerequisite

Modified runs.scm from [07f8912d41] to [2f42b47c06].

737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
                          (waiton-record   (hash-table-ref/default test-records waiton #f))
			  (waiton-tconfig  (if waiton-record (vector-ref waiton-record 1) #f))
			  (waiton-itemized (and waiton-tconfig
						(or (hash-table-ref/default waiton-tconfig "items" #f)
						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) 
		     (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
		     ;;
		     ;; This approach causes all of the items in an upstream test to be run 
		     ;; if we have this waiton already processed once we can analzye it for extending
		     ;; tests to be run, since we can't properly process waitons unless they have been







|







737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
                          (waiton-record   (hash-table-ref/default test-records waiton #f))
			  (waiton-tconfig  (if waiton-record (vector-ref waiton-record 1) #f))
			  (waiton-itemized (and waiton-tconfig
						(or (hash-table-ref/default waiton-tconfig "items" #f)
						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) 
		     (debug:print-info 2 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
		     ;;
		     ;; This approach causes all of the items in an upstream test to be run 
		     ;; if we have this waiton already processed once we can analzye it for extending
		     ;; tests to be run, since we can't properly process waitons unless they have been
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
                                   (set! test-names (cons waiton test-names)) ;; need to process this one, only add once the waiton tconfig read
                                   (set! required-tests (cons (conc waiton "/") required-tests))
                                   (set! test-patts new-test-patts))
                                 (begin
                                   (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it")
                                   (set! tal (append (cons waiton tal)(list hed)))))
                             (begin
                               (debug:print-info 0 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
                               (set! required-tests (cons waiton required-tests))
                               (set! test-patts new-test-patts)))
			 (begin
			   (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it")
			   (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests))
		     ;; NOPE: didn't work. required needs to be plain test names. Try tacking on to test-patts
		     ;;  - doesn't work







|







759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
                                   (set! test-names (cons waiton test-names)) ;; need to process this one, only add once the waiton tconfig read
                                   (set! required-tests (cons (conc waiton "/") required-tests))
                                   (set! test-patts new-test-patts))
                                 (begin
                                   (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it")
                                   (set! tal (append (cons waiton tal)(list hed)))))
                             (begin
                               (debug:print-info 2 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
                               (set! required-tests (cons waiton required-tests))
                               (set! test-patts new-test-patts)))
			 (begin
			   (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it")
			   (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests))
		     ;; NOPE: didn't work. required needs to be plain test names. Try tacking on to test-patts
		     ;;  - doesn't work
896
897
898
899
900
901
902

903
904
905
906
907
908
909
;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
;;    (let loop ((hed         (car sorted-test-names))
;;	         (tal         (cdr sorted-test-names))
;;	         (reg         '()) ;; registered, put these at the head of tal 
;;	         (reruns      '()))
(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
  (let* ((loop-list       (list hed tal reg reruns))

	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
			    (if (list? res)
				res
				(begin
				  (debug:print 0 *default-log-port*
					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)







>







896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
;;    (let loop ((hed         (car sorted-test-names))
;;	         (tal         (cdr sorted-test-names))
;;	         (reg         '()) ;; registered, put these at the head of tal 
;;	         (reruns      '()))
(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
  (let* ((loop-list       (list hed tal reg reruns))
         (junk (debug:print-info 4 *default-log-port* "expand-items calling rmt:get-prereqs-not-met"))
	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
			    (if (list? res)
				res
				(begin
				  (debug:print 0 *default-log-port*
					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
	   (list (car newtal)(append (cdr newtal) reg) '() reruns)
	  #f)) 
     ((null? runnables)
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-7")
      #f) ;; if we get here and non-completed is null then it is all over.
     (else
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-8")
      (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
      (list (car newtal)(cdr newtal) reg reruns)))))

(define (runs:mixed-list-testname-and-testrec->list-of-strings inlst)
  (if (null? inlst)
      '()
      (map (lambda (t)
	     (cond







|







1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
	   (list (car newtal)(append (cdr newtal) reg) '() reruns)
	  #f)) 
     ((null? runnables)
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-7")
      #f) ;; if we get here and non-completed is null then it is all over.
     (else
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-8")
      (debug:print 2 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
      (list (car newtal)(cdr newtal) reg reruns)))))

(define (runs:mixed-list-testname-and-testrec->list-of-strings inlst)
  (if (null? inlst)
      '()
      (map (lambda (t)
	     (cond
1284
1285
1286
1287
1288
1289
1290

1291

1292
1293
1294
1295
1296
1297
1298
		    (not (member 'exclusive testmode)))))
      ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
      ;; we are going to reset all the counters for test retries by setting a new hash table
      ;; this means they will increment only when nothing can be run
      (set! *max-tries-hash* (make-hash-table))
      
      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)

      (runs:incremental-print-results run-id)

      (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
      (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
      ;; (thread-sleep! *global-delta*)
      (if (or (not (null? tal))(not (null? reg)))
	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
	  #f))
     







>
|
>







1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
		    (not (member 'exclusive testmode)))))
      ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
      ;; we are going to reset all the counters for test retries by setting a new hash table
      ;; this means they will increment only when nothing can be run
      (set! *max-tries-hash* (make-hash-table))
      
      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
      (if (debug:debug-mode 3)
        (runs:incremental-print-results run-id)
      )
      (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
      (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
      ;; (thread-sleep! *global-delta*)
      (if (or (not (null? tal))(not (null? reg)))
	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
	  #f))
     
1666
1667
1668
1669
1670
1671
1672
1673
1674

1675
1676
1677
1678
1679
1680
1681
		  (debug:print-info 0 *default-log-port* "Skipping test " tfullname " as it has been marked do not run due to being completed or not runnable"))
	      (if (or (not (null? tal))(not (null? reg)))
		  (loop (runs:queue-next-hed tal reg reglen regfull)
			(runs:queue-next-tal tal reg reglen regfull)
			(runs:queue-next-reg tal reg reglen regfull)
			reruns))))
        ;; (loop (car tal)(cdr tal) reg reruns))))

	(runs:incremental-print-results run-id)

	(debug:print 4 *default-log-port* "TOP OF LOOP => "
		     "test-name: " test-name
		     "\n  hed:         " hed
		     "\n  tal:         " (runs:pretty-long-list tal)
		     "\n  reg:         " reg
                     "\n  test-record  " test-record
                     "\n  itemdat:     " itemdat







|
|
>







1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
		  (debug:print-info 0 *default-log-port* "Skipping test " tfullname " as it has been marked do not run due to being completed or not runnable"))
	      (if (or (not (null? tal))(not (null? reg)))
		  (loop (runs:queue-next-hed tal reg reglen regfull)
			(runs:queue-next-tal tal reg reglen regfull)
			(runs:queue-next-reg tal reg reglen regfull)
			reruns))))
        ;; (loop (car tal)(cdr tal) reg reruns))))
        (if (debug:debug-mode 3)
	  (runs:incremental-print-results run-id)
        )
	(debug:print 4 *default-log-port* "TOP OF LOOP => "
		     "test-name: " test-name
		     "\n  hed:         " hed
		     "\n  tal:         " (runs:pretty-long-list tal)
		     "\n  reg:         " reg
                     "\n  test-record  " test-record
                     "\n  itemdat:     " itemdat
1850
1851
1852
1853
1854
1855
1856
1857

1858
1859
1860
1861
1862

1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874

1875
1876
1877
1878
1879
1880
1881
		  (if loop-list
		      (apply loop loop-list)
                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                      )
                  )
		;; if can't run more just loop with next possible test
		(loop (car newtal)(cdr newtal) reg reruns))))
         

	 ;; this case should not happen, added to help catch any bugs
	 ((and (list? items) itemdat)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
	  (exit 1))

	 ((not (null? reruns))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-6")
	  (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
		 (junked (lset-difference equal? tal newlst)))
	    (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
	    (if (< num-retries max-retries)
		(set! newlst (append reruns newlst)))
	    (set! num-retries (+ num-retries 1))
	    ;; (thread-sleep! (+ 1 *global-delta*))
	    (if (not (null? newlst))
		;; since reruns have been tacked on to newlst create new reruns from junked
		(loop (car newlst)(cdr newlst) reg (delete-duplicates junked)))))

	 ((not (null? tal))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-7")
	  (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here."))
	 ((not (null? reg)) ;; could we get here with leftovers?
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-8")
	  (debug:print-info 0 *default-log-port* "Have leftovers!")
	  (loop (car reg)(cdr reg) '() reruns))







|
>





>












>







1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
		  (if loop-list
		      (apply loop loop-list)
                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                      )
                  )
		;; if can't run more just loop with next possible test
		(loop (car newtal)(cdr newtal) reg reruns))))
        

	 ;; this case should not happen, added to help catch any bugs
	 ((and (list? items) itemdat)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
	  (exit 1))

	 ((not (null? reruns))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-6")
	  (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
		 (junked (lset-difference equal? tal newlst)))
	    (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
	    (if (< num-retries max-retries)
		(set! newlst (append reruns newlst)))
	    (set! num-retries (+ num-retries 1))
	    ;; (thread-sleep! (+ 1 *global-delta*))
	    (if (not (null? newlst))
		;; since reruns have been tacked on to newlst create new reruns from junked
		(loop (car newlst)(cdr newlst) reg (delete-duplicates junked)))))

	 ((not (null? tal))
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-7")
	  (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here."))
	 ((not (null? reg)) ;; could we get here with leftovers?
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-8")
	  (debug:print-info 0 *default-log-port* "Have leftovers!")
	  (loop (car reg)(cdr reg) '() reruns))
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
	     (set! runflag #t))

	    (else (set! runflag #f)))
	   (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat))
	   (if (not runflag)
	       (if (not parent-test)
		   (if (runs:lownoise (conc "not starting test" full-test-name) 60)
		       (debug:print 1 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
				    "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat)
				    "\" or -force to override")))
	       ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are
	       ;;       already met.
	       ;; This would be a great place to do the process-fork
	       ;; 
	       (let ((skip-test   #f)







|







2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
	     (set! runflag #t))

	    (else (set! runflag #f)))
	   (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat))
	   (if (not runflag)
	       (if (not parent-test)
		   (if (runs:lownoise (conc "not starting test" full-test-name) 60)
		       (debug:print 3 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
				    "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat)
				    "\" or -force to override")))
	       ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are
	       ;;       already met.
	       ;; This would be a great place to do the process-fork
	       ;; 
	       (let ((skip-test   #f)
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
		    ;; seek and kill in flight -runtests with % as testpatt here
		    ;; (if (equal? testpatt "%")
		    (tasks:kill-runner target run-name testpatt)
		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 1 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   ((archive)
		    (debug:print 1 *default-log-port* "Archiving/restoring (" (args:get-arg "-archive") ") data for run: " runkey " " (db:get-value-by-header run header "runname"))







|







2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
		    ;; seek and kill in flight -runtests with % as testpatt here
		    ;; (if (equal? testpatt "%")
		    (tasks:kill-runner target run-name testpatt)
		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 2 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   ((archive)
		    (debug:print 1 *default-log-port* "Archiving/restoring (" (args:get-arg "-archive") ") data for run: " runkey " " (db:get-value-by-header run header "runname"))
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
				       )
				      ) ; end case rem-status
                                    ) ; end let
                                  ); end cond has-subrun

                                 (else
                                  ;; BB - TODO - consider backgrounding to threads to delete tests (work below) 
                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?
                                              (hash-table-set! test-retry-time test-fulln (current-seconds))))
                                        (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)







|







2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
				       )
				      ) ; end case rem-status
                                    ) ; end let
                                  ); end cond has-subrun

                                 (else
                                  ;; BB - TODO - consider backgrounding to threads to delete tests (work below) 
                                  (debug:print-info 2 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?
                                              (hash-table-set! test-retry-time test-fulln (current-seconds))))
                                        (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
	;; 			      (thread-sleep! 1)
	 ;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)))))))
	 )
    (case clean-mode
      ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f))
      ((remove-all)      (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
      ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f)))
    (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
    (if (and real-dir 
	     (> (string-length real-dir) 5)
	     (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
	(let* ((realpath (resolve-pathname run-dir)))
	  (debug:print-info 1 *default-log-port* "Recursively removing " realpath)
	  (if (common:file-exists? realpath)
	      (runs:safe-delete-test-dir realpath)







|







2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
	;; 			      (thread-sleep! 1)
	 ;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)))))))
	 )
    (case clean-mode
      ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f))
      ((remove-all)      (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
      ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f)))
    (debug:print-info 2 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
    (if (and real-dir 
	     (> (string-length real-dir) 5)
	     (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
	(let* ((realpath (resolve-pathname run-dir)))
	  (debug:print-info 1 *default-log-port* "Recursively removing " realpath)
	  (if (common:file-exists? realpath)
	      (runs:safe-delete-test-dir realpath)
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
	  (let* ((linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree")))
		 (runtop   (conc linktree "/" target "/" runname))
		 (files    (if (common:file-exists? runtop)
			       (append (glob (conc runtop "/.megatest*"))
				       (glob (conc runtop "/.runconfig*")))
			       '())))
	    (if (null? files)
		(debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
		(begin
		  (debug:print-info 0 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
		  (for-each 
		   (lambda (f)
		     (handle-exceptions
			 exn
			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
		       (delete-file f)))
		   files))))
	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
      (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))







|

|









3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
	  (let* ((linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree")))
		 (runtop   (conc linktree "/" target "/" runname))
		 (files    (if (common:file-exists? runtop)
			       (append (glob (conc runtop "/.megatest*"))
				       (glob (conc runtop "/.runconfig*")))
			       '())))
	    (if (null? files)
		(debug:print-info 2 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
		(begin
		  (debug:print-info 2 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
		  (for-each 
		   (lambda (f)
		     (handle-exceptions
			 exn
			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
		       (delete-file f)))
		   files))))
	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
      (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))

Modified server.scm from [bb2f7cfc15] to [15bbdd7b10].

152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))








|







152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384

385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))


;; if server-start-last exists, and wasn't old enough, wait <idle time>, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> and call this function recursively.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (old-enough   (> delta idletime))

              )

          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
          ;; 
	  (if (and old-enough
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag (lambda () (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag (lambda () (read-line)))))
		       (equal? server-key res)))
                )
	      #t

           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time>, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "server key does not match" "too soon to start another server"))

		(thread-sleep! idletime)
		(server:wait-for-server-start-last-flag areapath)))))))


        
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* (
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
   )
)

;; this one seems to be the general entry point







|

|










>



|
|




|
|



|


|

|



















|







365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))


;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (old-enough   (> delta idletime))
               (new-server-key "")
              )

          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
           (if (and old-enough
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag (lambda () (print server-key)))
		     (thread-sleep! 0.25)
		     (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
		     (equal? server-key new-server-key))
                )
	      #t

           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))

		(thread-sleep! ( + 1 idletime))
		(server:wait-for-server-start-last-flag areapath)))))))


        
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* (
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
   )
)

;; this one seems to be the general entry point

Modified utils/mk_wrapper from [aa7905f86b] to [4fc79e58e7].

42
43
44
45
46
47
48









49
50
51
52
53
54
55
if [ "\$LD_LIBRARY_PATH" != "" ];then
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\$LD_LIBRARY_PATH:$libdir
else
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir
fi

export MT_SQLITE3_EXE=$sqlite3_exe









__EOF
) > $cfgfile
  echo 
else
  echo "INFO: LD_LIBRARY_PATH not set" >&2
fi








>
>
>
>
>
>
>
>
>







42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
if [ "\$LD_LIBRARY_PATH" != "" ];then
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:\$LD_LIBRARY_PATH:$libdir
else
  export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir
fi

export MT_SQLITE3_EXE=$sqlite3_exe

http_vars="http_proxy https_proxy HTTP_PROXY HTTPS_PROXY"
for i in \$http_vars
do
j=\${!i}
if [ "\$j" != "" ]; then
   unset \$i
fi
done
__EOF
) > $cfgfile
  echo 
else
  echo "INFO: LD_LIBRARY_PATH not set" >&2
fi