Megatest

Check-in [7654d9b17b]
Login
Overview
Comment:re-implemented old v1.60 style server timeout handling. Something hahad gotten lost in the translation
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: 7654d9b17b55fc464cd6bc8ee3b2f17bf19f33a6
User & Date: matt on 2017-07-12 23:12:34
Other Links: branch diff | manifest | tags
Context
2017-07-13
04:19
Oops. Change units for server expiration time back to hours check-in: fafe58dfc3 user: matt tags: v1.64
2017-07-12
23:12
re-implemented old v1.60 style server timeout handling. Something hahad gotten lost in the translation check-in: 7654d9b17b user: matt tags: v1.64
17:37
fixed bug check-in: b27bd09cf2 user: bjbarcla tags: v1.64
Changes

Modified common.scm from [b3a2e136d0] to [d2bb16b225].

158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (server:get-timeout)) ;; default from server:get-timeout
  (force-server      #f)
  (ro-mode           #f)  
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
(defstruct host
  (reachable    #f)







|







158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (server:expiration-timeout))
  (force-server      #f)
  (ro-mode           #f)  
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
(defstruct host
  (reachable    #f)

Modified http-transport.scm from [9dc12bb28f] to [e390d2210b].

381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:get-timeout))
	 (server-going  #f)
	 (server-log-file (args:get-arg "-log"))) ;; always set when we are a server

    (with-output-to-file started-file (lambda ()(print (current-process-id))))

    (let loop ((count         0)
	       (server-state 'available)







|







381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:expiration-timeout))
	 (server-going  #f)
	 (server-log-file (args:get-arg "-log"))) ;; always set when we are a server

    (with-output-to-file started-file (lambda ()(print (current-process-id))))

    (let loop ((count         0)
	       (server-state 'available)
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
	  (begin
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
	    (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond
         ((and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds))
	       (< (- (current-seconds) server-start-time) (configf:lookup-number *configdat* "server" "time-to-die-seconds" default: (* 3600 700 2) ) )) ;; do not update log or touch log if we've been running for more than one hour.
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions
		    exn
		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?")
		  (if (not *server-overloaded*)







|
<
<
<
<
<



|
<







438
439
440
441
442
443
444
445





446
447
448
449

450
451
452
453
454
455
456
	  (begin
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
	    (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))





	(cond
         ((and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds)))

          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions
		    exn
		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?")
		  (if (not *server-overloaded*)

Modified launch.scm from [bc68bfb44c] to [efe1a9c8e9].

465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
	  ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ...
	  ;;
	  (setenv "MT_TESTSUITENAME" areaname)
	  (setenv "MT_RUN_AREA_HOME" top-path)
	  (set! *toppath* top-path)
	  (setenv "MT_TEST_RUN_DIR"  work-area)

	  ;; On NFS it can be slow and unreliable to get needed startup information.
	  ;;  i. Check if we are on the homehost, if so, proceed
	  ;; ii. Check if host and port passed in via CMDINFO are valid and if
	  ;;     possible use them.
	  (let ((bestadrs (server:get-best-guess-address (get-host-name)))
		(needcare #f))
	    (if (equal? homehost bestadrs) ;; we are likely on the homehost
		(debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost)
		(let ((host-port (if serverurl (string-split serverurl ":") #f)))
		  (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote*
		  (if (string? homehost)
		      (if (and host-port
			       (> (length host-port) 1))
			  (let* ((host      (car host-port))
                                 (port      (cadr host-port))
                                 (start-res (http-transport:client-connect host port))
                                 (ping-res  (rmt:login-no-auto-client-setup start-res)))
			    (if (and start-res
				     ping-res)
				;; (begin ;; let ((url  (http-transport:server-dat-make-url start-res)))
				(begin
				  (remote-conndat-set! *runremote* start-res)
				  ;; (remote-server-url-set! *runremote* url)
				  ;; (if (server:ping url)
				  (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data."))
				(begin
				  (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port)
				  (set! *runremote* #f))
				  ;; (remote-conndat-set! *runremote* #f))
				))
			  (begin
			    (set! *runremote* #f)
			    (debug:print-info 0 *default-log-port* (if host-port
								       (conc "received invalid host-port information " host-port)
								       "no host-port information received"))
			    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.
			    (set! needcare #t)))
		      (begin
			(set! *runremote* #f)
			(debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.")
			(set! needcare #t)))))
	    (if needcare  ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host
		(let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory
		  (handle-exceptions
		      exn
		      (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn))
		    (create-directory logdir #t)))))
		  
	  ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
	  (let loop ((count 0))
	    (if (or (common:file-exists? top-path)
		    (> count 10))
		(change-directory top-path)
		(begin
		  (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found")







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
	  ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ...
	  ;;
	  (setenv "MT_TESTSUITENAME" areaname)
	  (setenv "MT_RUN_AREA_HOME" top-path)
	  (set! *toppath* top-path)
	  (setenv "MT_TEST_RUN_DIR"  work-area)

	;;    ;; On NFS it can be slow and unreliable to get needed startup information.
	;;    ;;  i. Check if we are on the homehost, if so, proceed
	;;    ;; ii. Check if host and port passed in via CMDINFO are valid and if
	;;    ;;     possible use them.
	;;    (let ((bestadrs (server:get-best-guess-address (get-host-name)))
	;;    	(needcare #f))
	;;      (if (equal? homehost bestadrs) ;; we are likely on the homehost
	;;    	(debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost)
	;;    	(let ((host-port (if serverurl (string-split serverurl ":") #f)))
	;;    	  (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote*
	;;    	  (if (string? homehost)
	;;    	      (if (and host-port
	;;    		       (> (length host-port) 1))
	;;    		  (let* ((host      (car host-port))
        ;;                           (port      (cadr host-port))
        ;;                           (start-res (http-transport:client-connect host port))
        ;;                           (ping-res  (rmt:login-no-auto-client-setup start-res)))
	;;    		    (if (and start-res
	;;    			     ping-res)
	;;    			;; (begin ;; let ((url  (http-transport:server-dat-make-url start-res)))
	;;    			(begin
	;;    			  (remote-conndat-set! *runremote* start-res)
	;;    			  ;; (remote-server-url-set! *runremote* url)
	;;    			  ;; (if (server:ping url)
	;;    			  (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data."))
	;;    			(begin
	;;    			  (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port)
	;;    			  (set! *runremote* #f))
	;;    			  ;; (remote-conndat-set! *runremote* #f))
	;;    			))
	;;    		  (begin
	;;    		    (set! *runremote* #f)
	;;    		    (debug:print-info 0 *default-log-port* (if host-port
	;;    							       (conc "received invalid host-port information " host-port)
	;;    							       "no host-port information received"))
	;;    		    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.
	;;    		    (set! needcare #t)))
	;;    	      (begin
	;;    		(set! *runremote* #f)
	;;    		(debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.")
	;;    		(set! needcare #t)))))
	;;      (if needcare  ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host
	;;    	(let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory
	;;    	  (handle-exceptions
	;;    	      exn
	;;    	      (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn))
	;;    	    (create-directory logdir #t)))))
	;;    	  
	  ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
	  (let loop ((count 0))
	    (if (or (common:file-exists? top-path)
		    (> count 10))
		(change-directory top-path)
		(begin
		  (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found")

Modified rmt.scm from [bc431c6c0b] to [efb226ceca].

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

141
142
143
144
145
146
147
148
149
150
151
152
153
      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f)

     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
     ;;
     ;; ;;DOT CASE4 [label="reset\nconnection"];
     ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
     ;; ;;DOT CASE4 -> "rmt:send-receive";
     ;; ;; reset the connection if it has been unused too long
     ;; ((and runremote
     ;;       (remote-conndat runremote)
     ;; 	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response"
     ;; 	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))

     ;;  (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
     ;;  (http-transport:close-connections area-dat: runremote)
     ;;  (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
     ;;  (mutex-unlock! *rmt-mutex*)
     ;;  (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;;DOT CASE5 [label="local\nread"];
     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
     ;; on homehost and this is a read
     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))       ;; on homehost
           (member cmd api:read-only-queries))   ;; this is a read







|
|
|
|
|
|
|
|
>
|
|
|
|
|
|







126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f)

     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
     ;;
     ;;DOT CASE4 [label="reset\nconnection"];
     ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
     ;;DOT CASE4 -> "rmt:send-receive";
     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
	      (+ (http-transport:server-dat-get-last-access (remote-conndat runremote))
		 (remote-server-timeout runremote))))
      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
      (http-transport:close-connections area-dat: runremote)
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     
     ;;DOT CASE5 [label="local\nread"];
     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
     ;; on homehost and this is a read
     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))       ;; on homehost
           (member cmd api:read-only-queries))   ;; this is a read
242
243
244
245
246
247
248


249


250
251
252
253
254
255
256
257
                                  ((exn)(vector #f "other fail" (print-call-chain)))))
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (and (vector? conninfo) (> 5 (vector-length conninfo)))


            (http-transport:server-dat-update-last-access conninfo) ;; refresh access time


            (begin
              (set! conninfo #f)
              (remote-conndat-set! runremote #f))) 
	;; (mutex-unlock! *rmt-mutex*)
        (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
	(mutex-unlock! *rmt-mutex*)
	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
	    (if (and (vector? res)







>
>

>
>
|







243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
                                  ((exn)(vector #f "other fail" (print-call-chain)))))
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (and (vector? conninfo) (> 5 (vector-length conninfo)))

	    
            (http-transport:server-dat-update-last-access conninfo) ;; refresh access time


	    (begin
              (set! conninfo #f)
              (remote-conndat-set! runremote #f))) 
	;; (mutex-unlock! *rmt-mutex*)
        (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
	(mutex-unlock! *rmt-mutex*)
	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
	    (if (and (vector? res)

Modified server.scm from [52a482f03f] to [e95564f4fe].

437
438
439
440
441
442
443
444

445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
(define (server:login toppath)
  (lambda (toppath)
    (set! *db-last-access* (current-seconds)) ;; might not be needed.
    (if (equal? *toppath* toppath)
	#t
	#f)))

;; timeout is in hours

(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	;;(* 60 60 1)     ;; default to one hour
	(* 60 5)          ;; default to five minutes
	)))

;; moving this here as it needs access to db and cannot be in common.
;;
(define (server:writable-watchdog dbstruct)
  (thread-sleep! 0.05) ;; delay for startup
  (let ((legacy-sync  (common:run-sync?))
	(debug-mode   (debug:debug-mode 1))







|
>
|
|

|
|
<
<
<
|







437
438
439
440
441
442
443
444
445
446
447
448
449
450



451
452
453
454
455
456
457
458
(define (server:login toppath)
  (lambda (toppath)
    (set! *db-last-access* (current-seconds)) ;; might not be needed.
    (if (equal? *toppath* toppath)
	#t
	#f)))

;; timeout is hms string: 1h 5m 3s, default is 1 minute
;;
(define (server:expiration-timeout)
  (let ((tmo (configf:lookup *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (common:hms-string->seconds tmo))
        (string->number tmo)



	60)))

;; moving this here as it needs access to db and cannot be in common.
;;
(define (server:writable-watchdog dbstruct)
  (thread-sleep! 0.05) ;; delay for startup
  (let ((legacy-sync  (common:run-sync?))
	(debug-mode   (debug:debug-mode 1))