Megatest

Check-in [3cdcb8c138]
Login
Overview
Comment:Some awful hacks to keep the system running. There is something causing servers to crash, I suspect sync is the problem. This work-around just constantly replaces the servers with new ones.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70
Files: files | file ages | folders
SHA1: 3cdcb8c1388aca6d665dcd1437084c563bbdd4ec
User & Date: matt on 2022-05-22 20:20:00
Other Links: branch diff | manifest | tags
Context
2022-05-27
19:21
Commented out some not-used fuctions, removed the server start every 120 seconds and added dbfile handle count check-in: b1db729de1 user: matt tags: v1.70
2022-05-22
20:20
Some awful hacks to keep the system running. There is something causing servers to crash, I suspect sync is the problem. This work-around just constantly replaces the servers with new ones. check-in: 3cdcb8c138 user: matt tags: v1.70
18:02
Cleaned up some gratuitous database opens, quietened some debug messages check-in: a6be57bfc9 user: matt tags: v1.70
Changes

Modified api.scm from [b65cdceb6b] to [64bd840562].

155
156
157
158
159
160
161
162

163
164
165
166
167
168
169
155
156
157
158
159
160
161

162
163
164
165
166
167
168
169







-
+







     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
    ((> *api-process-request-count* 200) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in

Modified db.scm from [18a5213140] to [66cbe6fa4e].

1059
1060
1061
1062
1063
1064
1065
1066

1067
1068
1069

1070
1071
1072
1073
1074
1075
1076
1059
1060
1061
1062
1063
1064
1065

1066
1067
1068

1069
1070
1071
1072
1073
1074
1075
1076







-
+


-
+







(define (db:lock-and-delta-sync no-sync-db dbstruct from-db-file runid)
  (assert (not *db-sync-in-progress*) "FATAL: db:lock-and-sync called while a sync is in progress.")
  (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync")
  (let* ((lockdat  (db:no-sync-get-lock no-sync-db from-db-file))
	 (gotlock  (car lockdat))
	 (locktime (cdr lockdat)))

    (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync: go lock?")
    (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync: got lock?")
    (if gotlock
	(begin
          (debug:print 0 *default-log-port* "db:lock-and-delta-sync copying db")
          (debug:print 0 *default-log-port* "db:lock-and-delta-sync copying db "runid" at "(current-seconds))
          (db:sync-touched dbstruct runid)
	  (db:no-sync-del! no-sync-db from-db-file)
	  #t)
        (begin
          (debug:print 0 *default-log-port* "could not get lock for " from-db-file " from no-sync-db")
	  #f
        ))))

Modified http-transport.scm from [c12a4eb4f0] to [3269081060].

473
474
475
476
477
478
479
480


481
482
483
484
485
486
487
473
474
475
476
477
478
479

480
481
482
483
484
485
486
487
488







-
+
+







	    (debug:print 0 *default-log-port* "SERVER: dbprep")
	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
	    (set! server-going #t)
	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.

	    ;; (thread-start! *watchdog*)
          ) 
	  (if no-sync-db
	  (if (and no-sync-db
		   (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
              (begin
                (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))
                (db:all-db-sync *dbstruct-dbs*)
                ;; (db:do-sync no-sync-db)
	        ;; (db:run-lock-and-sync *no-sync-db*)
              )
          )
528
529
530
531
532
533
534












535
536
537
538
539
540
541
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554







+
+
+
+
+
+
+
+
+
+
+
+







	     (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
	(cond
	 ((and *server-run*
	       (> (- (current-seconds) server-start-time) 120)) ;; let's try server replacement
	  ;; ((adj-proc-load . 0.056875) (adj-core-load . 0.11375) (1m-load . 0.91) (5m-load . 0.77) (15m-load . 1.0) (proc . 16) (core . 8) (phys . 1))
	  (let* ((loaddat       (common:get-normalized-cpu-load #f))
		 (adj-proc-load (alist-ref 'adj-proc-load loaddat))
		 (adj-core-load (alist-ref 'adj-core-load loaddat))
		 (adj-load      (max adj-proc-load adj-core-load)))
	    (if (< adj-load 2) ;; reduce chance of runaway
		(server:run *toppath*))
	    (db:all-db-sync *dbstruct-dbs*)
	    (thread-sleep! 30)
	    (http-transport:server-shutdown port)))
         ((and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds)))
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions