Megatest

Check-in [9657acc77f]
Login
Overview
Comment:lock with host/pid
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 9657acc77f04bf33ce092d41928555a41f4e06b9
User & Date: matt on 2020-08-28 14:24:24
Other Links: branch diff | manifest | tags
Context
2020-08-29
06:30
force server exit only if [setup] runtime is set ==3.7/1.0/PASS/mars== check-in: 7951d54072 user: matt tags: v1.65-cleanup
2020-08-28
14:24
lock with host/pid check-in: 9657acc77f user: matt tags: v1.65-cleanup
10:24
Converted a few load wait back to simple sleeps to save cycles. check-in: 44f224d1e8 user: mrwellan tags: v1.65-cleanup, v1.6565
Changes

Modified runs.scm from [dce077e9b3] to [f86e80cfc1].

325
326
327
328
329
330
331
332
333
334



335
336
337
338
339
340
341
325
326
327
328
329
330
331



332
333
334
335
336
337
338
339
340
341







-
-
-
+
+
+







;;		  ;; seems like always)?
;;		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
;;		   0)
;;        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
;;		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
;;		   10)  ;; obviously haven't had any work to do for a while
;;		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))
;; 		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;; 		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;; 		   )))
  
  (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
992
993
994
995
996
997
998
999

1000
1001

1002
1003
1004
1005
1006
1007
1008
992
993
994
995
996
997
998

999
1000

1001
1002
1003
1004
1005
1006
1007
1008







-
+

-
+







    ((and (null? fails) ;; have not-started tests, but unable to run them.  everything looks completed with no prospect of unsticking something that is stuck.  we should mark hed as moribund and exit or continue if there are more tests to consider
	   (null? prereq-fails)
	   (null? non-completed))
     (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; 
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    ;;
	    ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
	    ;; No runsdat, can't do this yet
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    ;;
	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?

Modified server.scm from [0bfbc24f73] to [0335617d1a].

321
322
323
324
325
326
327
328


329
330
331
332
333










334
335
336
337
338
339
340
321
322
323
324
325
326
327

328
329
330
331
332
333

334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350







-
+
+




-
+
+
+
+
+
+
+
+
+
+







        *my-client-signature*)))

;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4)))
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	  (if all-go
	  (if (and all-go
		   (begin
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))