Megatest

Check-in [7b318f91bd]
Login
Overview
Comment:protected config file call to delete-file with exception handler. Fixed logic on connecting using CMDINFO. Fixed -list-servers and -kill-servers. Turned exception handler back on in portlogger. Removed the addition of a little noise from the server timeout handling in rmt.scm
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: 7b318f91bdd04d485610aaf5b8f201379de28b90
User & Date: matt on 2017-03-27 23:59:53
Other Links: branch diff | manifest | tags
Context
2017-03-28
00:35
Improved reliability but now have issue with connection. check-in: 45da129709 user: matt tags: v1.64
2017-03-27
23:59
protected config file call to delete-file with exception handler. Fixed logic on connecting using CMDINFO. Fixed -list-servers and -kill-servers. Turned exception handler back on in portlogger. Removed the addition of a little noise from the server timeout handling in rmt.scm check-in: 7b318f91bd user: matt tags: v1.64
12:41
prereq proc is supposed to return test records, not test names. check-in: 2b9e485b7f user: matt tags: v1.64
Changes

Modified configf.scm from [346c0caf52] to [0cf569e087].

669
670
671
672
673
674
675
676


677

678
679
680
681
682
683
684
    (with-output-to-file fname ;; first write out the file
      (lambda ()
	(pp dat)))
    (if (file-exists? fname)   ;; now verify it is readable
	(if (configf:read-alist fname)
	    #t ;; data is good.
	    (begin
	      (delete-file fname)


	      (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.")

	      #f))
	#f)))

;; convert hierarchial list to ini format
;;
(define (configf:config->ini data)
  (map 







|
>
>
|
>







669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
    (with-output-to-file fname ;; first write out the file
      (lambda ()
	(pp dat)))
    (if (file-exists? fname)   ;; now verify it is readable
	(if (configf:read-alist fname)
	    #t ;; data is good.
	    (begin
	      (handle-exceptions
		  exn
		  #f
		(debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.")
		(delete-file fname))
	      #f))
	#f)))

;; convert hierarchial list to ini format
;;
(define (configf:config->ini data)
  (map 

Modified launch.scm from [cc10125ef0] to [f8bf4a3053].

470
471
472
473
474
475
476
477

478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
			       (> (length host-port) 1))
			  (let* ((host      (car host-port))
                                 (port      (cadr host-port))
                                 (start-res (http-transport:client-connect host port))
                                 (ping-res  (rmt:login-no-auto-client-setup start-res)))
			    (if (and start-res
				     ping-res)
				(let ((url  (http-transport:server-dat-make-url start-res)))

				  (remote-conndat-set! *runremote* start-res)
				  (remote-server-url-set! *runremote* url)
				  (if (server:ping url)
				      (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")
				      (begin
					(debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url)
					(remote-conndat-set! *runremote* #f)
					(remote-server-url-set! *runremote* #f))))
				(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
				))
			  (begin
			    (debug:print-info 0 *default-log-port* (if host-port
								       (conc "received invalid host-port information " host-port)
								       "no host-port information received"))
			    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.







|
>

|
|
|
|
|
|
|







470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
			       (> (length host-port) 1))
			  (let* ((host      (car host-port))
                                 (port      (cadr host-port))
                                 (start-res (http-transport:client-connect host port))
                                 (ping-res  (rmt:login-no-auto-client-setup start-res)))
			    (if (and start-res
				     ping-res)
				;; (begin ;; let ((url  (http-transport:server-dat-make-url start-res)))
				(begin
				  (remote-conndat-set! *runremote* start-res)
				  ;; (remote-server-url-set! *runremote* url)
				  ;; (if (server:ping url)
				  (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data."))
				;; (begin
				;; 	(debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url)
				;; 	(remote-conndat-set! *runremote* #f)
				;; 	(remote-server-url-set! *runremote* #f))))
				(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
				))
			  (begin
			    (debug:print-info 0 *default-log-port* (if host-port
								       (conc "received invalid host-port information " host-port)
								       "no host-port information received"))
			    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.

Modified megatest.scm from [164cc6d2b1] to [84dec1a162].

155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
  -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                 overwritten by values set in config files.
  -server -|hostname      : start the server (reduces contention on megatest.db), use
                            - to automatically figure out hostname
  -transport http|rpc     : use http or rpc for transport (default is http) 
  -log logfile            : send stdout and stderr to logfile
  -list-servers           : list the servers 
  -stop-server id         : stop server specified by id (see output of -list-servers), use
                            0 to kill all
  -repl                   : start a repl (useful for extending megatest)
  -load file.scm          : load and run file.scm
  -mark-incompletes       : find and mark incomplete tests
  -ping run-id|host:port  : ping server, exit with 0 if found
  -debug N|N,M,O...       : enable debug 0-N or N and M and O ...
  -config fname           : override the megatest.config file with fname
  -append-config fname    : append fname to the megatest.config file







<
|







155
156
157
158
159
160
161

162
163
164
165
166
167
168
169
  -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                 overwritten by values set in config files.
  -server -|hostname      : start the server (reduces contention on megatest.db), use
                            - to automatically figure out hostname
  -transport http|rpc     : use http or rpc for transport (default is http) 
  -log logfile            : send stdout and stderr to logfile
  -list-servers           : list the servers 

  -kill-servers           : kill all servers
  -repl                   : start a repl (useful for extending megatest)
  -load file.scm          : load and run file.scm
  -mark-incompletes       : find and mark incomplete tests
  -ping run-id|host:port  : ping server, exit with 0 if found
  -debug N|N,M,O...       : enable debug 0-N or N and M and O ...
  -config fname           : override the megatest.config file with fname
  -append-config fname    : append fname to the megatest.config file
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
			":expected"
			":tol"
			":units"
			;; misc
			"-start-dir"
			"-contour"
			"-server"
			"-stop-server"
			"-transport"
			"-kill-server"
			"-port"
			"-extract-ods"
			"-pathmod"
			"-env2file"
			"-envcap"
			"-envdelta"
			"-setvars"







<

<







250
251
252
253
254
255
256

257

258
259
260
261
262
263
264
			":expected"
			":tol"
			":units"
			;; misc
			"-start-dir"
			"-contour"
			"-server"

			"-transport"

			"-port"
			"-extract-ods"
			"-pathmod"
			"-env2file"
			"-envcap"
			"-envdelta"
			"-setvars"
317
318
319
320
321
322
323

324
325
326
327
328
329
330
			"-cache-db"
                        "-use-db-cache"
			;; misc
			"-repl"
			"-lock"
			"-unlock"
			"-list-servers"

                        "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
			"-local"         ;; run some commands using local db access
                        "-generate-html"

			;; misc queries
			"-list-disks"
			"-list-targets"







>







314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
			"-cache-db"
                        "-use-db-cache"
			;; misc
			"-repl"
			"-lock"
			"-unlock"
			"-list-servers"
			"-kill-servers"
                        "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
			"-local"         ;; run some commands using local db access
                        "-generate-html"

			;; misc queries
			"-list-disks"
			"-list-targets"
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
					      (eq? pid-val 0))
					  (begin
					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))
		      (original-exit exit-code)))))

;; for some switches alway print the command to stderr
;;
(if (args:any? "-run" "-runall" "-list-runs" "-remove-runs" "-set-state-status")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;;======================================================================
;; Misc setup stuff
;;======================================================================

(debug:setup)







|

|







487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
					      (eq? pid-val 0))
					  (begin
					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))
		      (original-exit exit-code)))))

;; for some switches always print the command to stderr
;;
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;;======================================================================
;; Misc setup stuff
;;======================================================================

(debug:setup)
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787

788
789
790
791
792
793
794
795
796
797
798

799
800
801
802

803
804
805
806
807
808
809

810
811
812
813
814
815
816



817
818
819
820

821
822
823
824
825
826
827
(if (args:get-arg "-server")
    (let ((tl        (launch:setup))
          (transport-type (string->symbol (or (args:get-arg "-transport") "http"))))
      (server:launch 0 transport-type)
      (set! *didsomething* #t)))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server")
        (args:get-arg "-kill-server"))
    (let ((tl (launch:setup)))
      (if tl 
	  (let* ((servers (server:get-list *toppath*))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
                 (kill-switch  (if (args:get-arg "-kill-server") "-9" ""))
                 (killinfo   (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") ))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
	    (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========")
	    (for-each 
	     (lambda (server)
	       (let* ((id         (vector-ref server 0))

		      (pid        (vector-ref server 1))
		      (hostname   (vector-ref server 2))
		      (interface  (vector-ref server 3)) 
		      (pullport   (vector-ref server 4))
		      (pubport    (vector-ref server 5))
		      (start-time (vector-ref server 6))
		      (priority   (vector-ref server 7))
		      (state      (vector-ref server 8))
		      (mt-ver     (vector-ref server 9))
		      (last-update (vector-ref server 10)) 
		      (transport  (vector-ref server 11))

		      (killed     #f)
		      (status     (< last-update 20)))
		 ;;   (zmq-sockets (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 

		 ;; server
		;; (if (equal? state "dead")
		;;     (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
		;; 	 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete))
		;;     (if (> last-update 20)        ;; Mark as dead if not updated in last 20 seconds
		;; 	 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
		 (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update

			 (if status "alive" "dead") transport)
		 (if (or (equal? id sid)
			 (equal? sid 0)) ;; kill all/any
		     (begin
		       (debug:print-info 0 *default-log-port* "Attempting to kill "kill-switch" server with pid " pid)
		       (tasks:kill-server hostname pid kill-switch: kill-switch)))))
	     servers)



	    (debug:print-info 1 *default-log-port* "Done with listservers")
	    (set! *didsomething* #t)
	    (exit)) ;; must do, would have to add checks to many/all calls below
	  (exit))))


;;======================================================================
;; Weird special calls that need to run *after* the server has started?
;;======================================================================

(if (args:get-arg "-list-targets")
    (if (launch:setup)







<
|

|

|
<
<
<
<
<
|
|
|

|
>
|
|
<
|
<
<
<
<
<
<
<
>
|
|
<
<
>
|
<
<
<
|
<
<
>
|
|
|

|
|
|
>
>
>


|

>







763
764
765
766
767
768
769

770
771
772
773
774





775
776
777
778
779
780
781
782

783







784
785
786


787
788



789


790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
(if (args:get-arg "-server")
    (let ((tl        (launch:setup))
          (transport-type (string->symbol (or (args:get-arg "-transport") "http"))))
      (server:launch 0 transport-type)
      (set! *didsomething* #t)))

(if (or (args:get-arg "-list-servers")

        (args:get-arg "-kill-servers"))
    (let ((tl (launch:setup)))
      (if tl ;; all roads from here exit
	  (let* ((servers (server:get-list *toppath*))
		 (fmtstr  "~8a~22a~20a~20a~8a\n"))





	    (format #t fmtstr "pid" "Interface:port" "age (hms)" "Last mod" "State")
	    (format #t fmtstr "===" "==============" "=========" "========" "=====")
	    (for-each ;;  ( mod-time host port start-time pid )
	     (lambda (server)
	       (let* ((mtm (any->number (car server)))
		      (mod (if mtm (- (current-seconds) mtm) "unk"))
		      (age (- (current-seconds)(or (any->number (list-ref server 3)) (current-seconds))))
		      (url (conc (cadr server) ":" (caddr server)))

		      (pid (list-ref server 4))







		      (alv (if (number? mod)(< mod 10) #f)))
		 (format #t
			 fmtstr


			 pid
			 url



			 (seconds->hr-min-sec age)


			 (seconds->hr-min-sec mod)
			 (if alv "alive" "dead"))
		 (if (and alv
			  (args:get-arg "-kill-servers"))
		     (begin
		       (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid)
		       (server:kill server)))))
	     (sort servers (lambda (a b)
			     (let ((ma (or (any->number (car a)) 9e9))
				   (mb (or (any->number (car b)) 9e9)))
			       (> ma mb)))))
	    (debug:print-info 1 *default-log-port* "Done with listservers")
	    (set! *didsomething* #t)
	    (exit))
	  (exit))))
      ;; must do, would have to add checks to many/all calls below

;;======================================================================
;; Weird special calls that need to run *after* the server has started?
;;======================================================================

(if (args:get-arg "-list-targets")
    (if (launch:setup)

Modified portlogger.scm from [e604a481b0] to [b8f7cf5181].

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    ;;(handle-exceptions
    ;; exn
    ;; (begin
    ;;   ;; (release-dot-lock fname)
    ;;   (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
    ;;   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
    ;;   (debug:print 0 *default-log-port* "exn=" (condition->list exn))
    ;;   (if (file-exists? fname)
    ;;	   (begin
    ;;	     (debug:print 0 *default-log-port* "Removing portlogger database file " fname)
    ;;	     (delete-file fname))) ;; just get rid of the portlogger file
    ;;   (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res)))
;; )

;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) 
(define (portlogger:take-port db portnum)
  (let* ((qry1 (sqlite3:prepare db "INSERT INTO ports (port,state) VALUES (?,?);"))
	 (qry2 (sqlite3:prepare db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;"))
	 (qry3 (sqlite3:prepare db "SELECT state FROM ports WHERE port=?;"))
	 (res  (sqlite3:with-transaction







|
|
|
|
|
|
|
|
<
<
<
|





|
<







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62



63
64
65
66
67
68
69

70
71
72
73
74
75
76
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    (handle-exceptions
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 *default-log-port* "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it



       (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res))))


;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) 
(define (portlogger:take-port db portnum)
  (let* ((qry1 (sqlite3:prepare db "INSERT INTO ports (port,state) VALUES (?,?);"))
	 (qry2 (sqlite3:prepare db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;"))
	 (qry3 (sqlite3:prepare db "SELECT state FROM ports WHERE port=?;"))
	 (res  (sqlite3:with-transaction

Modified rmt.scm from [f051a84a44] to [01e080d921].

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f
      )

     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")







|


|







95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f
      )

     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))))) ;; NOTE: REMOVED the 30 second noise. If adding it back be sure to offset!! add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")

Modified server.scm from [34ba33b083] to [a878389459].

417
418
419
420
421
422
423
424
425
426
(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	;;(* 60 60 1)     ;; default to one hour
	(* 60 60 0.25)    ;; default to 0.25 hours
	)))








|


417
418
419
420
421
422
423
424
425
426
(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	;;(* 60 60 1)     ;; default to one hour
	(* 60 5)          ;; default to five minutes
	)))