Megatest

Check-in [7b318f91bd]
Login
Overview
Comment:protected config file call to delete-file with exception handler. Fixed logic on connecting using CMDINFO. Fixed -list-servers and -kill-servers. Turned exception handler back on in portlogger. Removed the addition of a little noise from the server timeout handling in rmt.scm
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: 7b318f91bdd04d485610aaf5b8f201379de28b90
User & Date: matt on 2017-03-27 23:59:53
Other Links: branch diff | manifest | tags
Context
2017-03-28
00:35
Improved reliability but now have issue with connection. check-in: 45da129709 user: matt tags: v1.64
2017-03-27
23:59
protected config file call to delete-file with exception handler. Fixed logic on connecting using CMDINFO. Fixed -list-servers and -kill-servers. Turned exception handler back on in portlogger. Removed the addition of a little noise from the server timeout handling in rmt.scm check-in: 7b318f91bd user: matt tags: v1.64
12:41
prereq proc is supposed to return test records, not test names. check-in: 2b9e485b7f user: matt tags: v1.64
Changes

Modified configf.scm from [346c0caf52] to [0cf569e087].

669
670
671
672
673
674
675
676
677





678
679
680
681
682
683
684
669
670
671
672
673
674
675


676
677
678
679
680
681
682
683
684
685
686
687







-
-
+
+
+
+
+







    (with-output-to-file fname ;; first write out the file
      (lambda ()
	(pp dat)))
    (if (file-exists? fname)   ;; now verify it is readable
	(if (configf:read-alist fname)
	    #t ;; data is good.
	    (begin
	      (delete-file fname)
	      (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.")
	      (handle-exceptions
		  exn
		  #f
		(debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.")
		(delete-file fname))
	      #f))
	#f)))

;; convert hierarchial list to ini format
;;
(define (configf:config->ini data)
  (map 

Modified launch.scm from [cc10125ef0] to [f8bf4a3053].

470
471
472
473
474
475
476
477


478
479
480
481
482
483
484
485







486
487
488
489
490
491
492
470
471
472
473
474
475
476

477
478
479







480
481
482
483
484
485
486
487
488
489
490
491
492
493







-
+
+

-
-
-
-
-
-
-
+
+
+
+
+
+
+







			       (> (length host-port) 1))
			  (let* ((host      (car host-port))
                                 (port      (cadr host-port))
                                 (start-res (http-transport:client-connect host port))
                                 (ping-res  (rmt:login-no-auto-client-setup start-res)))
			    (if (and start-res
				     ping-res)
				(let ((url  (http-transport:server-dat-make-url start-res)))
				;; (begin ;; let ((url  (http-transport:server-dat-make-url start-res)))
				(begin
				  (remote-conndat-set! *runremote* start-res)
				  (remote-server-url-set! *runremote* url)
				  (if (server:ping url)
				      (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")
				      (begin
					(debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url)
					(remote-conndat-set! *runremote* #f)
					(remote-server-url-set! *runremote* #f))))
				  ;; (remote-server-url-set! *runremote* url)
				  ;; (if (server:ping url)
				  (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data."))
				;; (begin
				;; 	(debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url)
				;; 	(remote-conndat-set! *runremote* #f)
				;; 	(remote-server-url-set! *runremote* #f))))
				(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
				))
			  (begin
			    (debug:print-info 0 *default-log-port* (if host-port
								       (conc "received invalid host-port information " host-port)
								       "no host-port information received"))
			    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.

Modified megatest.scm from [164cc6d2b1] to [84dec1a162].

155
156
157
158
159
160
161
162
163

164
165
166
167
168
169
170
155
156
157
158
159
160
161


162
163
164
165
166
167
168
169







-
-
+







  -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                 overwritten by values set in config files.
  -server -|hostname      : start the server (reduces contention on megatest.db), use
                            - to automatically figure out hostname
  -transport http|rpc     : use http or rpc for transport (default is http) 
  -log logfile            : send stdout and stderr to logfile
  -list-servers           : list the servers 
  -stop-server id         : stop server specified by id (see output of -list-servers), use
                            0 to kill all
  -kill-servers           : kill all servers
  -repl                   : start a repl (useful for extending megatest)
  -load file.scm          : load and run file.scm
  -mark-incompletes       : find and mark incomplete tests
  -ping run-id|host:port  : ping server, exit with 0 if found
  -debug N|N,M,O...       : enable debug 0-N or N and M and O ...
  -config fname           : override the megatest.config file with fname
  -append-config fname    : append fname to the megatest.config file
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
250
251
252
253
254
255
256

257

258
259
260
261
262
263
264







-

-







			":expected"
			":tol"
			":units"
			;; misc
			"-start-dir"
			"-contour"
			"-server"
			"-stop-server"
			"-transport"
			"-kill-server"
			"-port"
			"-extract-ods"
			"-pathmod"
			"-env2file"
			"-envcap"
			"-envdelta"
			"-setvars"
317
318
319
320
321
322
323

324
325
326
327
328
329
330
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328







+







			"-cache-db"
                        "-use-db-cache"
			;; misc
			"-repl"
			"-lock"
			"-unlock"
			"-list-servers"
			"-kill-servers"
                        "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
			"-local"         ;; run some commands using local db access
                        "-generate-html"

			;; misc queries
			"-list-disks"
			"-list-targets"
489
490
491
492
493
494
495
496

497
498

499
500
501
502
503
504
505
487
488
489
490
491
492
493

494
495

496
497
498
499
500
501
502
503







-
+

-
+







					      (eq? pid-val 0))
					  (begin
					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))
		      (original-exit exit-code)))))

;; for some switches alway print the command to stderr
;; for some switches always print the command to stderr
;;
(if (args:any? "-run" "-runall" "-list-runs" "-remove-runs" "-set-state-status")
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;;======================================================================
;; Misc setup stuff
;;======================================================================

(debug:setup)
765
766
767
768
769
770
771
772
773

774
775

776
777

778
779
780
781
782
783
784
785



786
787
788
789




790
791

792
793
794
795
796
797
798
799
800



801
802
803


804
805
806
807

808
809
810
811
812




813
814
815
816






817
818
819

820

821
822
823
824
825
826
827
763
764
765
766
767
768
769


770
771

772
773

774








775
776
777
778



779
780
781
782


783









784
785
786



787
788




789





790
791
792
793
794



795
796
797
798
799
800
801
802

803
804
805
806
807
808
809
810
811
812







-
-
+

-
+

-
+
-
-
-
-
-
-
-
-
+
+
+

-
-
-
+
+
+
+
-
-
+
-
-
-
-
-
-
-
-
-
+
+
+
-
-
-
+
+
-
-
-
-
+
-
-
-
-
-
+
+
+
+

-
-
-
+
+
+
+
+
+


-
+

+







(if (args:get-arg "-server")
    (let ((tl        (launch:setup))
          (transport-type (string->symbol (or (args:get-arg "-transport") "http"))))
      (server:launch 0 transport-type)
      (set! *didsomething* #t)))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server")
        (args:get-arg "-kill-server"))
        (args:get-arg "-kill-servers"))
    (let ((tl (launch:setup)))
      (if tl 
      (if tl ;; all roads from here exit
	  (let* ((servers (server:get-list *toppath*))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (fmtstr  "~8a~22a~20a~20a~8a\n"))
		 (servers-to-kill '())
                 (kill-switch  (if (args:get-arg "-kill-server") "-9" ""))
                 (killinfo   (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") ))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
	    (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========")
	    (for-each 
	    (format #t fmtstr "pid" "Interface:port" "age (hms)" "Last mod" "State")
	    (format #t fmtstr "===" "==============" "=========" "========" "=====")
	    (for-each ;;  ( mod-time host port start-time pid )
	     (lambda (server)
	       (let* ((id         (vector-ref server 0))
		      (pid        (vector-ref server 1))
		      (hostname   (vector-ref server 2))
	       (let* ((mtm (any->number (car server)))
		      (mod (if mtm (- (current-seconds) mtm) "unk"))
		      (age (- (current-seconds)(or (any->number (list-ref server 3)) (current-seconds))))
		      (url (conc (cadr server) ":" (caddr server)))
		      (interface  (vector-ref server 3)) 
		      (pullport   (vector-ref server 4))
		      (pid (list-ref server 4))
		      (pubport    (vector-ref server 5))
		      (start-time (vector-ref server 6))
		      (priority   (vector-ref server 7))
		      (state      (vector-ref server 8))
		      (mt-ver     (vector-ref server 9))
		      (last-update (vector-ref server 10)) 
		      (transport  (vector-ref server 11))
		      (killed     #f)
		      (status     (< last-update 20)))
		      (alv (if (number? mod)(< mod 10) #f)))
		 (format #t
			 fmtstr
		 ;;   (zmq-sockets (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
			 pid
			 url
		;; (if (equal? state "dead")
		;;     (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
		;; 	 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete))
		;;     (if (> last-update 20)        ;; Mark as dead if not updated in last 20 seconds
			 (seconds->hr-min-sec age)
		;; 	 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
		 (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
			 (if status "alive" "dead") transport)
		 (if (or (equal? id sid)
			 (equal? sid 0)) ;; kill all/any
			 (seconds->hr-min-sec mod)
			 (if alv "alive" "dead"))
		 (if (and alv
			  (args:get-arg "-kill-servers"))
		     (begin
		       (debug:print-info 0 *default-log-port* "Attempting to kill "kill-switch" server with pid " pid)
		       (tasks:kill-server hostname pid kill-switch: kill-switch)))))
	     servers)
		       (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid)
		       (server:kill server)))))
	     (sort servers (lambda (a b)
			     (let ((ma (or (any->number (car a)) 9e9))
				   (mb (or (any->number (car b)) 9e9)))
			       (> ma mb)))))
	    (debug:print-info 1 *default-log-port* "Done with listservers")
	    (set! *didsomething* #t)
	    (exit)) ;; must do, would have to add checks to many/all calls below
	    (exit))
	  (exit))))
      ;; must do, would have to add checks to many/all calls below

;;======================================================================
;; Weird special calls that need to run *after* the server has started?
;;======================================================================

(if (args:get-arg "-list-targets")
    (if (launch:setup)

Modified portlogger.scm from [e604a481b0] to [b8f7cf5181].

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62








63
64
65
66

67
68
69
70
71
72

73
74
75
76
77
78
79
80
48
49
50
51
52
53
54








55
56
57
58
59
60
61
62




63
64
65
66
67
68

69

70
71
72
73
74
75
76







-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
+





-
+
-







            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    ;;(handle-exceptions
    ;; exn
    ;; (begin
    ;;   ;; (release-dot-lock fname)
    ;;   (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
    ;;   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
    ;;   (debug:print 0 *default-log-port* "exn=" (condition->list exn))
    ;;   (if (file-exists? fname)
    (handle-exceptions
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 *default-log-port* "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
    ;;	   (begin
    ;;	     (debug:print 0 *default-log-port* "Removing portlogger database file " fname)
    ;;	     (delete-file fname))) ;; just get rid of the portlogger file
    ;;   (print-call-chain (current-error-port)))
       (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res)))
       res))))
;; )

;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) 
(define (portlogger:take-port db portnum)
  (let* ((qry1 (sqlite3:prepare db "INSERT INTO ports (port,state) VALUES (?,?);"))
	 (qry2 (sqlite3:prepare db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;"))
	 (qry3 (sqlite3:prepare db "SELECT state FROM ports WHERE port=?;"))
	 (res  (sqlite3:with-transaction

Modified rmt.scm from [f051a84a44] to [01e080d921].

95
96
97
98
99
100
101
102

103
104
105

106
107
108
109
110
111
112
95
96
97
98
99
100
101

102
103
104

105
106
107
108
109
110
111
112







-
+


-
+







      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f
      )

     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))))) ;; NOTE: REMOVED the 30 second noise. If adding it back be sure to offset!! add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")

Modified server.scm from [34ba33b083] to [a878389459].

417
418
419
420
421
422
423
424

425
426
417
418
419
420
421
422
423

424
425
426







-
+


(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	;;(* 60 60 1)     ;; default to one hour
	(* 60 60 0.25)    ;; default to 0.25 hours
	(* 60 5)          ;; default to five minutes
	)))