Megatest

Check-in [d1548b7a57]
Login
Overview
Comment:added support for custom load-jump-limit and added setup, keep-deleted-records (in seconds, keep deleted records this long) ==/7.9/0.9/WARN/1201/mars/==
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: d1548b7a5735483af9c07ed9129625e4b8518016
User & Date: mrwellan on 2020-08-25 10:12:29
Original Comment: added support for custom load-jump-limit and added setup, keep-deleted-records (in seconds, keep deleted records this long)
Other Links: branch diff | manifest | tags
Context
2020-08-25
15:54
Merged v1.65 with get-intercept and get-delay ==/8.7/1.0/WARN/1201/mars/== check-in: b0705815be user: mrwellan tags: v1.65-cleanup
10:12
added support for custom load-jump-limit and added setup, keep-deleted-records (in seconds, keep deleted records this long) ==/7.9/0.9/WARN/1201/mars/== check-in: d1548b7a57 user: mrwellan tags: v1.65-cleanup
2020-08-24
18:06
Saftey fixes and minor cleanup ==/3.73/1.3/PASS/1203/orion/== check-in: 79674abc64 user: mrwellan tags: v1.65-cleanup
Changes

Modified common.scm from [bc77148d79] to [e3a13b4dea].

2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127

2128
2129
2130
2131
2132
2133
2134
					       ;; at least use 1
	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
			       0
			       next))) ;; we will force a conservative calculation any time next is large.
	 (first-next-avg    (/ (+ first next) 2))
	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
						      (/ (- 1000 count) 10)
						      waitdelay)
						   (- first adjmaxload) ))  )))

    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable







|
|
|
|
>







2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
					       ;; at least use 1
	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
			       0
			       next))) ;; we will force a conservative calculation any time next is large.
	 (first-next-avg    (/ (+ first next) 2))
	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait           (min (+ 300 (random 10)) (abs (* (+ (random 10)
								(/ (- 1000 count) 10)
								waitdelay)
							     (- first adjmaxload) ))))
	 (load-jump-limit   (configf:lookup-number *configdat* "setup" "load-jump-limit")))
    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
2144
2145
2146
2147
2148
2149
2150

2151
2152
2153
2154
2155
2156
2157
2158
2159
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp (cond

		       ((> numcpus 8)(/ numcpus 4))
		       ((> numcpus 4)(/ numcpus 2))
		       (else 0.5)))
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
			(if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else







>
|
|







2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp (cond
		       (load-jump-limit load-jump-limit)
		       ((> numcpus 8)(/ numcpus 2))
		       ((> numcpus 4)(/ numcpus 1.2))
		       (else 0.5)))
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
			(if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else

Modified db.scm from [e3d7e1adb0] to [f437d94aa0].

3130
3131
3132
3133
3134
3135
3136
3137

3138
3139
3140
3141
3142
3143
3144
3145
  (db:with-db
   dbstruct #f #f
   (lambda (db)
     (sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id))))

;; 
(define (db:delete-old-deleted-test-records dbstruct)
  (let (;; (run-ids  (db:get-all-run-ids dbstruct))

	(targtime (- (current-seconds)(* 30 24 60 60)))) ;; one month in the past
    (db:with-db
     dbstruct
     0
     #t
     (lambda (db)
       (sqlite3:with-transaction
	db







|
>
|







3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
  (db:with-db
   dbstruct #f #f
   (lambda (db)
     (sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id))))

;; 
(define (db:delete-old-deleted-test-records dbstruct)
  (let ((targtime (- (current-seconds)
		     (or (configf:lookup-number *configdat* "setup" "keep-deleted-records")
			 (* 30 24 60 60))))) ;; one month in the past
    (db:with-db
     dbstruct
     0
     #t
     (lambda (db)
       (sqlite3:with-transaction
	db