Megatest

Check-in [3cdcb8c138]
Login
Overview
Comment:Some awful hacks to keep the system running. There is something causing servers to crash, I suspect sync is the problem. This work-around just constantly replaces the servers with new ones.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70
Files: files | file ages | folders
SHA1: 3cdcb8c1388aca6d665dcd1437084c563bbdd4ec
User & Date: matt on 2022-05-22 20:20:00
Other Links: branch diff | manifest | tags
Context
2022-05-27
19:21
Commented out some not-used fuctions, removed the server start every 120 seconds and added dbfile handle count check-in: b1db729de1 user: matt tags: v1.70
2022-05-22
20:20
Some awful hacks to keep the system running. There is something causing servers to crash, I suspect sync is the problem. This work-around just constantly replaces the servers with new ones. check-in: 3cdcb8c138 user: matt tags: v1.70
18:02
Cleaned up some gratuitous database opens, quietened some debug messages check-in: a6be57bfc9 user: matt tags: v1.70
Changes

Modified api.scm from [b65cdceb6b] to [64bd840562].

155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in







|







155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 200) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in

Modified db.scm from [18a5213140] to [66cbe6fa4e].

1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
(define (db:lock-and-delta-sync no-sync-db dbstruct from-db-file runid)
  (assert (not *db-sync-in-progress*) "FATAL: db:lock-and-sync called while a sync is in progress.")
  (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync")
  (let* ((lockdat  (db:no-sync-get-lock no-sync-db from-db-file))
	 (gotlock  (car lockdat))
	 (locktime (cdr lockdat)))

    (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync: go lock?")
    (if gotlock
	(begin
          (debug:print 0 *default-log-port* "db:lock-and-delta-sync copying db")
          (db:sync-touched dbstruct runid)
	  (db:no-sync-del! no-sync-db from-db-file)
	  #t)
        (begin
          (debug:print 0 *default-log-port* "could not get lock for " from-db-file " from no-sync-db")
	  #f
        ))))







|


|







1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
(define (db:lock-and-delta-sync no-sync-db dbstruct from-db-file runid)
  (assert (not *db-sync-in-progress*) "FATAL: db:lock-and-sync called while a sync is in progress.")
  (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync")
  (let* ((lockdat  (db:no-sync-get-lock no-sync-db from-db-file))
	 (gotlock  (car lockdat))
	 (locktime (cdr lockdat)))

    (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync: got lock?")
    (if gotlock
	(begin
          (debug:print 0 *default-log-port* "db:lock-and-delta-sync copying db "runid" at "(current-seconds))
          (db:sync-touched dbstruct runid)
	  (db:no-sync-del! no-sync-db from-db-file)
	  #t)
        (begin
          (debug:print 0 *default-log-port* "could not get lock for " from-db-file " from no-sync-db")
	  #f
        ))))

Modified http-transport.scm from [c12a4eb4f0] to [3269081060].

473
474
475
476
477
478
479
480

481
482
483
484
485
486
487
	    (debug:print 0 *default-log-port* "SERVER: dbprep")
	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
	    (set! server-going #t)
	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.

	    ;; (thread-start! *watchdog*)
          ) 
	  (if no-sync-db

              (begin
                (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))
                (db:all-db-sync *dbstruct-dbs*)
                ;; (db:do-sync no-sync-db)
	        ;; (db:run-lock-and-sync *no-sync-db*)
              )
          )







|
>







473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
	    (debug:print 0 *default-log-port* "SERVER: dbprep")
	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
	    (set! server-going #t)
	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.

	    ;; (thread-start! *watchdog*)
          ) 
	  (if (and no-sync-db
		   (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
              (begin
                (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))
                (db:all-db-sync *dbstruct-dbs*)
                ;; (db:do-sync no-sync-db)
	        ;; (db:run-lock-and-sync *no-sync-db*)
              )
          )
528
529
530
531
532
533
534












535
536
537
538
539
540
541
	     (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
	(cond












         ((and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds)))
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions







>
>
>
>
>
>
>
>
>
>
>
>







529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
	     (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
	(cond
	 ((and *server-run*
	       (> (- (current-seconds) server-start-time) 120)) ;; let's try server replacement
	  ;; ((adj-proc-load . 0.056875) (adj-core-load . 0.11375) (1m-load . 0.91) (5m-load . 0.77) (15m-load . 1.0) (proc . 16) (core . 8) (phys . 1))
	  (let* ((loaddat       (common:get-normalized-cpu-load #f))
		 (adj-proc-load (alist-ref 'adj-proc-load loaddat))
		 (adj-core-load (alist-ref 'adj-core-load loaddat))
		 (adj-load      (max adj-proc-load adj-core-load)))
	    (if (< adj-load 2) ;; reduce chance of runaway
		(server:run *toppath*))
	    (db:all-db-sync *dbstruct-dbs*)
	    (thread-sleep! 30)
	    (http-transport:server-shutdown port)))
         ((and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds)))
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions