Megatest

Check-in [50281239f6]
Login
Overview
Comment:wip - moving globals into *bdat*
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.6584-ck5
Files: files | file ages | folders
SHA1: 50281239f6c19aafc92c777e9ac9f98c78930a01
User & Date: matt on 2021-04-15 21:24:22
Other Links: branch diff | manifest | tags
Context
2021-04-15
21:32
wip - moving globals into *bdat* check-in: f3116a6a76 user: matt tags: v1.6584-ck5
21:24
wip - moving globals into *bdat* check-in: 50281239f6 user: matt tags: v1.6584-ck5
20:44
wip - moving globals into *bdat* check-in: 921126fb6b user: matt tags: v1.6584-ck5
Changes

Modified common.scm from [c1fd4093ed] to [e6fb95db8e].

197
198
199
200
201
202
203
204

205
206
207

208
209
210
211
212
213
214
197
198
199
200
201
202
203

204
205
206

207
208
209
210
211
212
213
214







-
+


-
+







        #t)
      #f))

(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
  (let ((no-hurry  (if (bdat-time-to-exit *bdat*) ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 (bdat-time-to-exit-set! *bdat* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                              (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
			      (if *task-db*    
338
339
340
341
342
343
344
345

346
347
348

349
350
351
352
353
354
355
356
357

358
359
360
361
362
363
364
338
339
340
341
342
343
344

345
346
347

348
349
350
351
352
353
354
355
356

357
358
359
360
361
362
363
364







-
+


-
+








-
+







        (tmp-mtdb        (dbr:dbstruct-tmpdb dbstruct))
        (tmp-mtpath      (db:dbdat-get-path tmp-mtdb)))
    (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.")
    (let loop ((last-sync-time 0))
      (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath)
      (let* ((duration-since-last-sync (- (current-seconds) last-sync-time)))
        (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync)
        (if (and (not *time-to-exit*)
        (if (and (not (bdat-time-to-exit *bdat*))
                 (< duration-since-last-sync sync-cool-off-duration))
            (thread-sleep! (- sync-cool-off-duration duration-since-last-sync)))
        (if (not *time-to-exit*)
        (if (not (bdat-time-to-exit *bdat*))
            (let ((golden-mtdb-mtime (file-modification-time golden-mtpath))
                  (tmp-mtdb-mtime    (file-modification-time tmp-mtpath)))
	      (if (> golden-mtdb-mtime tmp-mtdb-mtime)
		  (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back
		      (let ((res (db:multi-db-sync dbstruct 'old2new)))
			(debug:print-info 13 *default-log-port* "rosync called, " res " records transferred."))))
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, (bdat-time-to-exit *bdat*) = " (bdat-time-to-exit *bdat*)" pid="(current-process-id)" mtpath="golden-mtpath)))

;;======================================================================
;; from metadat lookup MEGATEST_VERSION
;;
(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
  (rmt:get-var "MEGATEST_VERSION"))

Modified commonmod.scm from [7c95467769] to [217c2d522c].

104
105
106
107
108
109
110
111

112
113
114
115
116
117
118
104
105
106
107
108
109
110

111
112
113
114
115
116
117
118







-
+








(define *bdat* #f) ;; the one and only (someday) global?

(defstruct bdat
  (home                   (getenv "HOME"))
  (user                   (getenv "USER"))
  (watchdog               #f)
  
  (time-to-exit           #f)
  (server-loop-heart-beat (current-seconds))
  )

(define (make-and-init-bigdata)
  (set! *bdat*
	(make-bdat)))
  
175
176
177
178
179
180
181
182

183
184
185
186
187
188
189
175
176
177
178
179
180
181

182
183
184
185
186
187
188
189







-
+







(define *my-client-signature* #f)
(define *transport-type*    'http)             ;; override with [server] transport http|rpc|nmsg
(define *runremote*         #f)                ;; if set up for server communication this will hold <host port>
;; (define *max-cache-size*    0)
(define *logged-in-clients* (make-hash-table))
(define *server-id*         #f)
(define *server-info*       #f)  ;; good candidate for easily convert to non-global
(define *time-to-exit*      #f)
;; (define *time-to-exit*      #f)
(define *server-run*        #t)
(define *run-id*            #f)
(define *server-kind-run*   (make-hash-table))
(define *home-host*         #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex*   (make-mutex))
(define *api-process-request-count* 0)
1025
1026
1027
1028
1029
1030
1031
1032

1033
1034
1035
1036
1037
1038
1039
1040

1041
1042
1043
1044
1045
1046
1047
1025
1026
1027
1028
1029
1030
1031

1032
1033
1034
1035
1036
1037
1038
1039

1040
1041
1042
1043
1044
1045
1046
1047







-
+







-
+








;; common:readonly-watchdo
;; common:watchdog
;; std-exit-procedure was here

(define (std-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t) 
  (bdat-time-to-exit-set! *bdat* #t) 
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

(define (special-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t)
  (bdat-time-to-exit-set! *bdat*  #t)
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!")
  ;;TODO send email to notify admin contact listed in the config that the lisner got killed
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

(set-signal-handler! signal/int  std-signal-handler)  ;; ^C

Modified http-transportmod.scm from [9429590d28] to [03a411c806].

606
607
608
609
610
611
612
613

614
615
616
617
618
619
620
606
607
608
609
610
611
612

613
614
615
616
617
618
619
620







-
+







(define (http-transport:server-shutdown port)
  (begin
    ;;(BB> "http-transport:server-shutdown called")
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
    ;;
    ;; start_shutdown
    ;;
    (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
    (set! (bdat-time-to-exit *bdat*) #t) ;; tell on-exit to be fast as we've already cleaned up
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 1)

    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)
    ;; (debug:print-info 0 *default-log-port* "Average cached write time "
    ;; 		      (if (eq? *number-of-writes* 0)

Modified launchmod.scm from [90e4e1ba80] to [d57ee3c088].

458
459
460
461
462
463
464
465

466
467
468
469
470
471
472
458
459
460
461
462
463
464

465
466
467
468
469
470
471
472







-
+







	  (launch:setup) ;; should be properly in the run area home now
          
	  (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path
	  (let ((sighand (lambda (signum)
			   ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
			   (if (eq? signum signal/stop)
			       (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting."))
			   (set! *time-to-exit* #t)
			   (set! (bdat-time-to-exit *bdat*) #t)
			   (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...")
			   (let ((th1 (make-thread (lambda ()
                                                     (print "set test to COMPLETED/ABORT begin.")
						     (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal")
                                                     (print "set test to COMPLETED/ABORT complete.")
						     (print "Killed by signal " signum ". Exiting")
						     (exit 1))))

Modified megatest.scm from [a165cc0b60] to [44edba80d4].

1127
1128
1129
1130
1131
1132
1133
1134

1135
1136
1137
1138
1139
1140
1141
1127
1128
1129
1130
1131
1132
1133

1134
1135
1136
1137
1138
1139
1140
1141







-
+







            ((or (not (args:get-arg "-dumpmode"))
     	    (string=? (args:get-arg "-dumpmode") "ini"))
     	(configf:config->ini data))
            (else
     	(debug:print-error 0 *default-log-port* "-dumpmode of " (args:get-arg "-dumpmode") " not recognised")))
           (set! *didsomething* #t)
           (pop-directory)
           (set! *time-to-exit* #t)))
           (bdat-time-to-exit-set! *bdat* #t)))
     
     (if (args:get-arg "-show-cmdinfo")
         (if (or (args:get-arg ":value")(getenv "MT_CMDINFO"))
     	(let ((data (common:read-encoded-string (or (args:get-arg ":value")(getenv "MT_CMDINFO")))))
     	  (if (equal? (args:get-arg "-dumpmode") "json")
     	      (json-write data)
     	      (pp data))
1404
1405
1406
1407
1408
1409
1410
1411

1412
1413
1414
1415
1416
1417
1418
1404
1405
1406
1407
1408
1409
1410

1411
1412
1413
1414
1415
1416
1417
1418







-
+







               (print (string-join table-header ","))
               (for-each (lambda(table-row)
                           (print (string-join (map ->string table-row) ",")))
     
                         
                                 table-rows))))
       (set! *didsomething* #t)
       (set! *time-to-exit* #t))
       (bdat-time-to-exit-set! *bdat* #t))
     
     
     
     ;; NOTE: list-runs and list-db-targets operate on local db!!!
     ;;
     ;; IDEA: megatest list -runname blah% ...
     ;;
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756



1757
1758

1759
1760
1761
1762
1763
1764
1765
1747
1748
1749
1750
1751
1752
1753



1754
1755
1756
1757

1758
1759
1760
1761
1762
1763
1764
1765







-
-
-
+
+
+

-
+







     				       (begin
     					 (debug:print 0 *default-log-port* "WARNING: path given, " outputfile " is relative, prefixing with current directory")
     					 (conc (current-directory) "/" outputfile)))))
     		  (create-directory tempdir #t)
     		  (ods:list->ods tempdir ouf sheets))))
     	  ;; (system (conc "rm -rf " tempdir))
     	  (set! *didsomething* #t)
               (set! *time-to-exit* #t)
               ) ;; end if true branch (end of a let)
             ) ;; end if
	  (bdat-time-to-exit-set! *bdat* #t)
	  ) ;; end if true branch (end of a let)
	) ;; end if
         ) ;; end if -list-runs
     

     ;; list-waivers
     (if (and (args:get-arg "-list-waivers")
     	 (launch:setup))
         (let* ((runpatt     (or (args:get-arg "-runname") "%"))
     	   (testpatt    (common:args-get-testpatt #f))
     	   (keys        (rmt:get-keys)) 
     	   (runsdat     (rmt:get-runs-by-patt
2580
2581
2582
2583
2584
2585
2586
2587

2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600

2601
2602
2603
2604
2605
2606
2607
2580
2581
2582
2583
2584
2585
2586

2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599

2600
2601
2602
2603
2604
2605
2606
2607







-
+












-
+







     
     ;;======================================================================
     ;; Exit and clean up
     ;;======================================================================
     
     (if (not *didsomething*)
         (debug:print 0 *default-log-port* help)
         (set! *time-to-exit* #t)
         (bdat-time-to-exit-set! *bdat* #t)
         )
     ;;(debug:print-info 13 *default-log-port* "thread-join! watchdog")
     
     ;; join the watchdog thread if it has been thread-start!ed  (it may not have been started in the case of a server that never enters running state)
     ;;   (symbols returned by thread-state: created ready running blocked suspended sleeping terminated dead)
     ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
     (let* ((watchdog (bdat-watchdog *bdat*)))
       (if (thread? watchdog)
	   (case (thread-state watchdog)
	     ((ready running blocked sleeping terminated dead)
	      (thread-join! watchdog)))))
     
     (set! *time-to-exit* #t)
     (bdat-time-to-exit-set! *bdat* #t)
     
     (if (not (eq? *globalexitstatus* 0))
         (if (or (args:get-arg "-run")(args:get-arg "-runtests")(args:get-arg "-runall"))
             (begin
	       (debug:print 0 *default-log-port* "NOTE: Subprocesses with non-zero exit code detected: " *globalexitstatus*)
	       (exit 0))
             (case *globalexitstatus*

Modified runsmod.scm from [af3c8fc3d0] to [4aa37d6c17].

381
382
383
384
385
386
387
388

389
390
391
392
393
394
395
381
382
383
384
385
386
387

388
389
390
391
392
393
394
395







-
+







	  ;   (> run-count config-reruns))
 	  ;(set! run-count config-reruns))

    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (let ((sighand (lambda (signum)
		     ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
		     (set! *time-to-exit* #t)
		     (bdat-time-to-exit-set! *bdat* #t)
		     (print "Received signal " signum ", cleaning up before exit. Please wait...")
		     (let ((th1 (make-thread (lambda ()
					       ;; (let ((tdbdat (tasks:open-db)))
						 (rmt:tasks-set-state-given-param-key task-key "killed") ;; )
					       (print "Killed by signal " signum ". Exiting")
					       (thread-sleep! 3)
					       (exit))))

Modified server.scm from [643d5fcda0] to [cc36ed0b6d].

262
263
264
265
266
267
268
269

270
271
272
273
274
275

276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291

292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322

323
324
325
326
327
328
329
262
263
264
265
266
267
268

269
270
271
272
273
274

275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290

291
292
293
294
295
296
297
298
299
300
301
302
303
304
305

306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

322
323
324
325
326
327
328
329







-
+





-
+















-
+














-
+















-
+







  (let* ((do-a-sync  (server:get-bruteforce-syncer dbstruct))
         (final-sync (server:get-bruteforce-syncer dbstruct fork-to-background: #t persist-until-sync: #t)))
    (when (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync
	       (args:get-arg "-server"))
      
      (let loop ()
	(do-a-sync)
        (if (not *time-to-exit*) (loop))) ;; keep going unless time to exit
        (if (not (bdat-time-to-exit *bdat*)) (loop))) ;; keep going unless time to exit

      ;; time to exit, close the no-sync db here
      (final-sync)

      (if (common:low-noise-print 30)
	  (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)
	  (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = "(bdat-time-to-exit *bdat*)" pid="(current-process-id)
			    )))))

(define (server:writable-watchdog-deltasync dbstruct)
  (thread-sleep! 0.054) ;; delay for startup
  (let ((legacy-sync  (common:run-sync?))
        (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
	(debug-mode   (debug:debug-mode 1))
	(last-time    (current-seconds))
	(no-sync-db   (db:open-no-sync-db))
	(stmt-cache   (dbr:dbstruct-stmt-cache dbstruct))
        (sync-duration 0) ;; run time of the sync in milliseconds
        )
    (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
    (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
    (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)  );;  " this-wd-num="this-wd-num)
    (if (and legacy-sync (not *time-to-exit*))
    (if (and legacy-sync (not (bdat-time-to-exit *bdat*)))
	(let* (;;(dbstruct (db:setup))
	       (mtdb       (dbr:dbstruct-mtdb dbstruct))
	       (mtpath     (db:dbdat-get-path mtdb))
	       (tmp-area   (common:get-db-tmp-area))
	       (start-file (conc tmp-area "/.start-sync"))
	       (end-file   (conc tmp-area "/.end-sync")))
	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
	  (let loop ()
	    ;; sync for filesystem local db writes
	    ;;
	    (mutex-lock! *db-multi-sync-mutex*)
	    (let* ((need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
		   (sync-in-progress *db-sync-in-progress*)
                   (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5))
		   (should-sync      (and (not *time-to-exit*)
		   (should-sync      (and (not (bdat-time-to-exit *bdat*))
                                          (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed
		   (start-time       (current-seconds))
                   (cpu-load-adj     (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)))
		   (mt-mod-time      (file-modification-time mtpath))
		   (last-sync-start  (if (common:file-exists? start-file)
					 (file-modification-time start-file)
					 0))
		   (last-sync-end    (if (common:file-exists? end-file)
					 (file-modification-time end-file)
					 10))
                   (sync-period      (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period
		   (recently-synced  (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db!
					  (< mt-mod-time last-sync-start)))
		   (sync-done        (<= last-sync-start last-sync-end))
                   (sync-stale       (> start-time (+ last-sync-start sync-stale-seconds)))
		   (will-sync        (and (not *time-to-exit*)       ;; do not start a sync if we are in the process of exiting
		   (will-sync        (and (not (bdat-time-to-exit *bdat*))       ;; do not start a sync if we are in the process of exiting
                                          (or need-sync should-sync)
					  (or sync-done sync-stale)
					  (not sync-in-progress)
					  (not recently-synced))))
              (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop.  need-sync="need-sync" sync-in-progress=" sync-in-progress
				" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync
				" sync-done=" sync-done " sync-period=" sync-period)
388
389
390
391
392
393
394
395

396
397

398
399

400
401
402
403
404

405
406
407
408

409
388
389
390
391
392
393
394

395
396

397
398

399
400
401
402
403

404
405
406
407

408
409







-
+

-
+

-
+




-
+



-
+

		       (> (- start-time last-time) 60))
		  (begin
		    (set! last-time start-time)
		    (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
	    
	    ;; keep going unless time to exit
	    ;;
	    (if (not *time-to-exit*)
	    (if (not (bdat-time-to-exit *bdat*))
		(let delay-loop ((count 0))
                  ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
                  ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="(bdat-time-to-exit *bdat*))
                                                            
		  (if (and (not *time-to-exit*)
		  (if (and (not (bdat-time-to-exit *bdat*))
			   (< count 6)) ;; was 11, changing to 4. 
		      (begin
			(thread-sleep! 1)
			(delay-loop (+ count 1))))
		  (if (not *time-to-exit*) (loop))))
		  (if (not (bdat-time-to-exit *bdat*)) (loop))))
	    ;; time to exit, close the no-sync db here
	    (db:no-sync-close-db no-sync-db stmt-cache)
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " (bdat-time-to-exit *bdat*)" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))