Megatest

Check-in [507ea188e3]
Login
Overview
Comment:multiple tweaks getting it to all work again
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.6584-ck5
Files: files | file ages | folders
SHA1: 507ea188e31e686deba12ec31ab8e6364b19f78f
User & Date: matt on 2021-04-18 22:58:32
Other Links: branch diff | manifest | tags
Context
2021-04-18
23:26
wip check-in: 641ecb4b57 user: matt tags: v1.6584-ck5
22:58
multiple tweaks getting it to all work again check-in: 507ea188e3 user: matt tags: v1.6584-ck5
21:07
wiphtop check-in: cecf838aa0 user: matt tags: v1.6584-ck5
Changes

Modified commonmod.scm from [0b83b5ae2b] to [f2f99ff8c5].

2065
2066
2067
2068
2069
2070
2071
2072
2073


2074
2075
2076
2077
2078
2079
2080
      (if (file-exists? gzfile) (delete-file gzfile))
      (system (conc "gzip " logfile))
      
      (unsetenv "TARGETHOST_LOGF")
      (unsetenv "TARGETHOST"))))

(define (server:get-logs-list area-path)
  (let* ((server-logs-cmd  (conc "grep -iL exiting " area-path "/logs/server-*-*.log"))
	 (server-logs   (string-split (string-chomp (with-input-from-pipe server-logs-cmd read-string)))))


    server-logs))
  
;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))







|
|
>
>







2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
      (if (file-exists? gzfile) (delete-file gzfile))
      (system (conc "gzip " logfile))
      
      (unsetenv "TARGETHOST_LOGF")
      (unsetenv "TARGETHOST"))))

(define (server:get-logs-list area-path)
  (let* (;; (server-logs-cmd  (conc "grep -iL exiting " area-path "/logs/server-*-*.log"))
	 ;; (server-logs   (string-split (string-chomp (with-input-from-pipe server-logs-cmd read-string))))
	 (server-logs      (glob (conc area-path"/logs/server-*-*.log")))
	 )
    server-logs))
  
;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
2217
2218
2219
2220
2221
2222
2223
2224

2225

2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254

2255
2256
2257
2258



2259

2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
    (or ns numservers)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))

        (dbprep-found 0)) 

    (handle-exceptions
	exn
      (begin
	(debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn)
	(list #f #f #f #f)) ;; no idea what went wrong, call it a bad server
      (with-input-from-file
	  logf
	(lambda ()
	  (let loop ((inl  (read-line))
		     (lnum 0))
	    (if (not (eof-object? inl))
		(let ((mlst (string-match server-rx inl))
                      (dbprep (string-match dbprep-rx inl))
                      )
                  (if dbprep
                    (set! dbprep-found 1)
                  )
		  (if (not mlst)
		      (if (< lnum 500) ;; give up if more than 500 lines of server log read
			  (loop (read-line)(+ lnum 1))
			  (begin 
                           (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
                           (list #f #f #f #f)))
		      (let ((dat  (cdr mlst)))
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 

                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (thread-sleep! 25)



                      )

                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                   )
		   (list #f #f #f #f)))))))))


;;======================================================================
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;;   (let* ((loadavg (common:get-cpu-load remote-host))







|
>
|
>











|
|
|
|
|
<




|
|





|
>
|
<
|
|
>
>
>
|
>
|
<
|







2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245

2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259

2260
2261
2262
2263
2264
2265
2266
2267

2268
2269
2270
2271
2272
2273
2274
2275
    (or ns numservers)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx     (regexp "^SERVER: dbprep"))
	(exiting-rx    (regexp ".*exiting promptly.*"))
        (dbprep-found  #f)
	(exiting-found #f))
    (handle-exceptions
	exn
      (begin
	(debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn)
	(list #f #f #f #f)) ;; no idea what went wrong, call it a bad server
      (with-input-from-file
	  logf
	(lambda ()
	  (let loop ((inl  (read-line))
		     (lnum 0))
	    (if (not (eof-object? inl))
		(let ((mlst    (string-match server-rx inl))
                      (dbprep  (string-match dbprep-rx inl))
		      (exiting (string-match exiting-rx inl)))
                  (if dbprep  (set! dbprep-found #t))
		  (if exiting (set! exiting-found #t))

		  (if (not mlst)
		      (if (< lnum 500) ;; give up if more than 500 lines of server log read
			  (loop (read-line)(+ lnum 1))
			  (begin 
			    (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
			    (list #f #f #f #f)))
		      (let ((dat  (cdr mlst)))
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
		(begin
		  (cond
		   (dbprep-found

		    (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
		    (thread-sleep! 25))
		   (exiting-found
		    (debug:print-info 0 *default-log-port* "Removing server log "logf" as the server exited due to signal")
		    (delete-file* logf)
		    (thread-sleep! 1))
		   (else
		    (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))))

		  (list #f #f #f #f)))))))))


;;======================================================================
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;;   (let* ((loadavg (common:get-cpu-load remote-host))

Modified dbmod.scm from [6d0e6f1813] to [8e5937bb80].

404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
          (if (and  (or (not dbfexists)
			(and modtimedelta
			     (> modtimedelta max-stale-tmp))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back
		    do-sync)
	      (begin
		(debug:print 1 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data \n    from " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta)
		(db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb)
    ;touch tmp db to avoid wal mode wierdness  
     (set! (file-modification-time tmpdbfname) (current-seconds))  
                (debug:print-info 13 *default-log-port* "db:sync-all-tables-list done.")
                )
	      (debug:print 4 *default-log-port* " db, " (db:dbdat-get-path tmpdb) " already exists or fresh enough, not propogating data from\n     " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta) )
	  ;; (db:multi-db-sync dbstruct 'old2new))  ;; migrate data from megatest.db automatically
          tmpdb))))









|
|







404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
          (if (and  (or (not dbfexists)
			(and modtimedelta
			     (> modtimedelta max-stale-tmp))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back
		    do-sync)
	      (begin
		(debug:print 1 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data \n    from " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta)
		(db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb)
		;; touch tmp db to avoid wal mode wierdness  
		(set-file-times! tmpdbfname (current-seconds))  
                (debug:print-info 13 *default-log-port* "db:sync-all-tables-list done.")
                )
	      (debug:print 4 *default-log-port* " db, " (db:dbdat-get-path tmpdb) " already exists or fresh enough, not propogating data from\n     " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta) )
	  ;; (db:multi-db-sync dbstruct 'old2new))  ;; migrate data from megatest.db automatically
          tmpdb))))


Modified debugprint.scm from [668a77fa42] to [fdf96a030a].

58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

77
78
79
80
81
82
83
	(print "ERROR: Invalid debug value \"" vstr "\"")
	#f)
      #t))

(define (debug:debug-mode n)
  (let* ((vb (verbosity)))
    (cond
   ((and (number? vb)   ;; number number
	 (number? n))
    (<= n vb))
   ((and (list? vb)     ;; list   number
	 (number? n))
    (member n vb))
   ((and (list? vb)     ;; list   list
	 (list? n))
    (not (null? (lset-intersection! eq? vb n))))
   ((and (number? vb)
	 (list? n))
    (member vb n)))))


(define (debug:print n e . params)
  (if (debug:debug-mode n)
      (with-output-to-port (or e (current-error-port))
	(lambda ()
	  ;; (if *logging*
	      ;; (db:log-event (apply conc params))







|
|
|
|
|
|
|
|
|
|
|
|
>







58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
	(print "ERROR: Invalid debug value \"" vstr "\"")
	#f)
      #t))

(define (debug:debug-mode n)
  (let* ((vb (verbosity)))
    (cond
     ((and (number? vb)   ;; number number
	   (number? n))
      (<= n vb))
     ((and (list? vb)     ;; list   number
	   (number? n))
      (member n vb))
     ((and (list? vb)     ;; list   list
	   (list? n))
      (not (null? (lset-intersection! eq? vb n))))
     ((and (number? vb)
	   (list? n))
      (member vb n))
     (else #f))))

(define (debug:print n e . params)
  (if (debug:debug-mode n)
      (with-output-to-port (or e (current-error-port))
	(lambda ()
	  ;; (if *logging*
	      ;; (db:log-event (apply conc params))

Modified http-transportmod.scm from [698ee19400] to [7319316419].

596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
(define (http-transport:server-shutdown port)
  (begin
    ;;(BB> "http-transport:server-shutdown called")
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
    ;;
    ;; start_shutdown
    ;;
    (set! (bdat-time-to-exit *bdat*) #t) ;; tell on-exit to be fast as we've already cleaned up
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 1)

    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)
    ;; (debug:print-info 0 *default-log-port* "Average cached write time "
    ;; 		      (if (eq? *number-of-writes* 0)







|







596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
(define (http-transport:server-shutdown port)
  (begin
    ;;(BB> "http-transport:server-shutdown called")
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
    ;;
    ;; start_shutdown
    ;;
    (bdat-time-to-exit-set! *bdat* #t) ;; tell on-exit to be fast as we've already cleaned up
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 1)

    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)
    ;; (debug:print-info 0 *default-log-port* "Average cached write time "
    ;; 		      (if (eq? *number-of-writes* 0)

Modified launchmod.scm from [046e0b01a9] to [2eee1b3817].

2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, (bdat-time-to-exit *bdat*) = " (bdat-time-to-exit *bdat*)" pid="(current-process-id)" mtpath="golden-mtpath)))


;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (init-watchdog)
  (set! (bdat-watchdog-set! *bdat*)
	(make-thread
	 (lambda ()
	   (handle-exceptions
	    exn
	    (begin
	      (print-call-chain)
	      (print " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
	    (common:watchdog)))
	 "Watchdog thread"))
  (start-watchdog))

(define (start-watchdog)
  ;;(if (not (args:get-arg "-server"))
  ;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
  (let* ((no-watchdog-args
	  '("-list-runs"
	    "-testdata-csv"
	    "-list-servers"
	    "-server"
	    "-adjutant"
	    "-list-disks"
	    "-list-targets"
	    "-show-runconfig"
	    ;;"-list-db-targets"
	    "-show-runconfig"
	    "-show-config"
	    "-show-cmdinfo"
	    "-cleanup-db"
	    ))
	 (no-watchdog-argvals (list '("-archive" . "replicate-db")))
	 (start-watchdog-specail-arg-val (let loop ((hed  (car no-watchdog-argvals))
						    (tail (cdr   no-watchdog-argvals)))
					   ;; (print "hed" hed " arg " (args:get-arg (car hed)) " val:" (cdr hed)  " eql" (equal? (args:get-arg (car hed)) (cdr hed)))  
					   (if (equal? (args:get-arg (car hed)) (cdr hed))
					       #f
					       (if (null? tail)
						   #t
						   (loop (car tail) (cdr tail))))))      
	 (no-watchdog-args-vals (filter (lambda (x) x)
					(map args:get-arg no-watchdog-args)))
	 (start-watchdog (and (null? no-watchdog-args-vals) start-watchdog-specail-arg-val)))
					;(print  "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals " start-watchdog-specail-arg-val:" start-watchdog-specail-arg-val " start-watchdog:" start-watchdog) 
    (if start-watchdog
	(thread-start! (bdat-watchdog *bdat*)))))
     
(define (server:writable-watchdog-deltasync dbstruct)
  (thread-sleep! 0.054) ;; delay for startup
  (let ((legacy-sync  (common:run-sync?))
        (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
	(debug-mode   (debug:debug-mode 1))
	(last-time    (current-seconds))
	(no-sync-db   (db:open-no-sync-db))







|
|
|
|
|
|
|
|
|
|




















|
|
|








|
|


|







2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, (bdat-time-to-exit *bdat*) = " (bdat-time-to-exit *bdat*)" pid="(current-process-id)" mtpath="golden-mtpath)))


;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (init-watchdog)
  (bdat-watchdog-set! *bdat*
		      (make-thread
		       (lambda ()
			 (handle-exceptions
			  exn
			  (begin
			    (print-call-chain)
			    (print " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
			  (common:watchdog)))
		       "Watchdog thread"))
  (start-watchdog))

(define (start-watchdog)
  ;;(if (not (args:get-arg "-server"))
  ;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
  (let* ((no-watchdog-args
	  '("-list-runs"
	    "-testdata-csv"
	    "-list-servers"
	    "-server"
	    "-adjutant"
	    "-list-disks"
	    "-list-targets"
	    "-show-runconfig"
	    ;;"-list-db-targets"
	    "-show-runconfig"
	    "-show-config"
	    "-show-cmdinfo"
	    "-cleanup-db"
	    ))
	 (no-watchdog-argvals            (list '("-archive" . "replicate-db")))
	 (start-watchdog-special-arg-val (let loop ((hed  (car no-watchdog-argvals))
						    (tail (cdr  no-watchdog-argvals)))
					   ;; (print "hed" hed " arg " (args:get-arg (car hed)) " val:" (cdr hed)  " eql" (equal? (args:get-arg (car hed)) (cdr hed)))  
					   (if (equal? (args:get-arg (car hed)) (cdr hed))
					       #f
					       (if (null? tail)
						   #t
						   (loop (car tail) (cdr tail))))))      
	 (no-watchdog-args-vals (filter (lambda (x) x)
					(map args:get-arg no-watchdog-args)))
	 (start-watchdog (and (null? no-watchdog-args-vals) start-watchdog-special-arg-val)))
    ;; (print  "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals " start-watchdog-special-arg-val:" start-watchdog-special-arg-val " start-watchdog:" start-watchdog) 
    (if start-watchdog
	(thread-start! (bdat-watchdog *bdat*)))))

(define (server:writable-watchdog-deltasync dbstruct)
  (thread-sleep! 0.054) ;; delay for startup
  (let ((legacy-sync  (common:run-sync?))
        (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
	(debug-mode   (debug:debug-mode 1))
	(last-time    (current-seconds))
	(no-sync-db   (db:open-no-sync-db))

Modified megatest.scm from [f5acd0dd12] to [62df76911c].

775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
     ;; immediately set MT_TARGET if -reqtarg or -target are available
     ;;
     (let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
       (if targ (setenv "MT_TARGET" targ)))
     
     ;; The watchdog is to keep an eye on things like db sync etc.
     ;;
     
  
;;      (define (debug:debug-mode n)
;;        (cond
;;         ((and (number? *verbosity*)   ;; number number
;;      	 (number? n))
;;          (<= n *verbosity*))
;;         ((and (list? *verbosity*)     ;; list   number







|







775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
     ;; immediately set MT_TARGET if -reqtarg or -target are available
     ;;
     (let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
       (if targ (setenv "MT_TARGET" targ)))
     
     ;; The watchdog is to keep an eye on things like db sync etc.
     ;;
(init-watchdog)
  
;;      (define (debug:debug-mode n)
;;        (cond
;;         ((and (number? *verbosity*)   ;; number number
;;      	 (number? n))
;;          (<= n *verbosity*))
;;         ((and (list? *verbosity*)     ;; list   number