Megatest

Check-in [bff9d56983]
Login
Overview
Comment:Protected calls to expensive ping with calls to cheap server:read-dotserver. This appears to 100% the run-away pings problem
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.62-no-rpc
Files: files | file ages | folders
SHA1: bff9d56983e3732e5fd0c2915b594d55988195b6
User & Date: mrwellan on 2016-12-05 12:58:47
Other Links: branch diff | manifest | tags
Context
2018-04-18
00:21
No idea what this was. Commiting just in case it is interesting ... Leaf check-in: 33160354bc user: matt tags: dunno
2016-12-05
13:11
Bumped version to v1.6301 check-in: fbf0a07a1d user: mrwellan tags: v1.63, v1.6301
12:58
Protected calls to expensive ping with calls to cheap server:read-dotserver. This appears to 100% the run-away pings problem Closed-Leaf check-in: bff9d56983 user: mrwellan tags: v1.62-no-rpc
11:05
Correct expiration of server connections check-in: b168adb943 user: mrwellan tags: v1.62-no-rpc
Changes

Modified http-transport.scm from [344a092fb3] to [4d8eecbf3a].

531
532
533
534
535
536
537

538
539
540
541
542
543
544
545
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))

    (if (server:check-if-running run-id)
	(begin
	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
	  (exit 0))
	(begin ;; ok, no server detected, clean out any lingering records
	   (tasks:server-force-clean-running-records-for-run-id  (db:delay-if-busy tdbdat) run-id "notresponding")))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))







>
|







531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (and (server:read-dotserver *toppath*)
             (server:check-if-running run-id))
	(begin
	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
	  (exit 0))
	(begin ;; ok, no server detected, clean out any lingering records
	   (tasks:server-force-clean-running-records-for-run-id  (db:delay-if-busy tdbdat) run-id "notresponding")))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))

Modified rmt.scm from [3ee7bc9a52] to [5e992d9837].

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

;; generate entries for ~/.megatestrc with the following
;;
;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds

;;======================================================================
;;  S U P P O R T   F U N C T I O N S







|







24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

;; generate entries for ~/.megatestrc with the following
;;
;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds

;;======================================================================
;;  S U P P O R T   F U N C T I O N S
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:open-qry-close-locally cmd 0 params))
     ;; no server contact made and this is a write, passively start a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((serverconn (server:check-if-running *toppath*)))
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
	    (if (not (server:start-attempted? *toppath*))
		(server:kind-run *toppath*))))
      (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call
          (begin
            (mutex-unlock! *rmt-mutex*)
	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
            (rmt:open-qry-close-locally cmd 0 params))
          (begin
            (mutex-unlock! *rmt-mutex*)
	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
	    (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
            (rmt:send-receive cmd rid params attemptnum: attemptnum))))
     ;; if not on homehost ensure we have a connection to a live server
     ;; NOTE: we *have* a homehost record by now
     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; are we on a homehost?







|









|







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:open-qry-close-locally cmd 0 params))
     ;; no server contact made and this is a write, passively start a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
	    (if (not (server:start-attempted? *toppath*))
		(server:kind-run *toppath*))))
      (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call
          (begin
            (mutex-unlock! *rmt-mutex*)
	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
            (rmt:open-qry-close-locally cmd 0 params))
          (begin                            ;; not on homehost, start server and wait
            (mutex-unlock! *rmt-mutex*)
	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
	    (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
            (rmt:send-receive cmd rid params attemptnum: attemptnum))))
     ;; if not on homehost ensure we have a connection to a live server
     ;; NOTE: we *have* a homehost record by now
     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; are we on a homehost?

Modified tasks.scm from [285ca22d6e] to [b8a3c2af2e].

402
403
404
405
406
407
408


409


410
411
412
413
414
415
416
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))


	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))



(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))







>
>
|
>
>







402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))
            (if (not (or (server:start-attempted? *toppath*)
                         (server:read-dotserver *toppath*))) ;; no point in trying
                (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))
                #f))
          #f)))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))