Megatest

Check-in [73502a1626]
Login
Overview
Comment:fixed -list-servers and -stop-server switches on megatest
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.62
Files: files | file ages | folders
SHA1: 73502a16261973eea343136d5f8fbe48b234f401
User & Date: bjbarcla on 2016-10-25 21:02:22
Other Links: branch diff | manifest | tags
Context
2016-10-26
14:21
added support for -kill-servers and -transport switches on megatest check-in: cfb9ac119d user: bjbarcla tags: v1.62
2016-10-25
21:02
fixed -list-servers and -stop-server switches on megatest check-in: 73502a1626 user: bjbarcla tags: v1.62
10:37
Updated plans check-in: c449bba35a user: mrwellan tags: v1.62
Changes

Modified Makefile from [81378c20f8] to [83b5fe2a28].

1


2
3
4
5
6
7
8
# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'


PREFIX=$(PWD)
CSCOPTS= 
INSTALL=install
SRCFILES = common.scm items.scm launch.scm \
   ods.scm runconfig.scm server.scm configf.scm \
   db.scm keys.scm margs.scm megatest-version.scm \
   process.scm runs.scm tasks.scm tests.scm genexample.scm \

>
>







1
2
3
4
5
6
7
8
9
10
# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
# rm <files>.o ; make install CSCOPTS='-profile' ; ... ;  chicken-profile | less

PREFIX=$(PWD)
CSCOPTS= 
INSTALL=install
SRCFILES = common.scm items.scm launch.scm \
   ods.scm runconfig.scm server.scm configf.scm \
   db.scm keys.scm margs.scm megatest-version.scm \
   process.scm runs.scm tasks.scm tests.scm genexample.scm \

Modified api.scm from [d744d47aad] to [bcdab13d33].

105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (handle-exceptions
   exn
   (let ((call-chain (get-call-chain)))
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (if (not (vector? dat))                    ;; it is an error to not receive a vector
       (vector #f #f "remote must be called with a vector")       
       (vector                                   ;; return a vector + the returned data structure
	#t 
	(let ((cmd    (vector-ref dat 0))
	      (params (vector-ref dat 1)))







|







105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (handle-exceptions
   exn
   (let ((call-chain (get-call-chain)))
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (if (not (vector? dat))                    ;; it is an error to not receive a vector
       (vector #f #f "remote must be called with a vector")       
       (vector                                   ;; return a vector + the returned data structure
	#t 
	(let ((cmd    (vector-ref dat 0))
	      (params (vector-ref dat 1)))

Modified db.scm from [67b7a55241] to [29d75e1de6].

3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
    ;; ((fs) obj)
    ((http fs)
     (string-substitute
      (regexp "=") "_"
      (base64:base64-encode 
       (z3:encode-buffer
	(with-output-to-string
	  (lambda ()(serialize obj)))))
      #t))
    ((zmq nmsg)(with-output-to-string (lambda ()(serialize obj))))
    (else obj)))

;; Convert a received message back into a Scheme object.
;;
;;   msg        - the payload as received from the transport
;;   transport: - keyword, one of http, fs, zmq, nmsg (default http)
;;
;; http/fs : msg must be a string; "_" is substituted back to "=", the
;;           result is base64 decoded, z3 decoded and then deserialized.
;;           A non-string msg is reported as an error and returned as is.
;; zmq/nmsg: msg is deserialized directly.
;; other   : msg is returned untouched.
;;
(define (db:string->obj msg #!key (transport 'http))
  (case transport
    ((http fs)
     (cond
      ((string? msg)
       (let* ((b64-text (string-substitute (regexp "_") "=" msg #t)) ;; undo the "=" -> "_" substitution
              (packed   (base64:base64-decode b64-text))
              (payload  (z3:decode-buffer packed)))
         (with-input-from-string payload
           (lambda () (deserialize)))))
      (else
       ;; crude reply for when things go awry: complain and hand back what we got
       (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.")
       msg)))
    ((zmq nmsg)
     (with-input-from-string msg
       (lambda () (deserialize))))
    (else msg)))

(define (db:test-set-status-state dbstruct run-id test-id status state msg)
  (let ((dbdat  (db:get-db dbstruct run-id)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbdat 'set-test-start-time (list test-id)))
    (if msg
	(db:general-call dbdat 'state-status-msg (list state status msg test-id))







|


|
















|







3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
    ;; ((fs) obj)
    ((http fs)
     (string-substitute
      (regexp "=") "_"
      (base64:base64-encode 
       (z3:encode-buffer
	(with-output-to-string
	  (lambda ()(serialize obj))))) ;; BB: serialize - this is what causes problems between different builds of megatest communicating.  serialize is sensitive to binary image of mtest.
      #t))
    ((zmq nmsg)(with-output-to-string (lambda ()(serialize obj))))
    (else obj))) ;; rpc

;; Convert a received message back into a Scheme object.
;;
;;   msg        - the payload as received from the transport
;;   transport: - keyword, one of http, fs, zmq, nmsg (default http)
;;
;; http/fs : msg must be a string; "_" is substituted back to "=", the
;;           result is base64 decoded, z3 decoded and then deserialized.
;;           A non-string msg is reported as an error and returned as is.
;; zmq/nmsg: msg is deserialized directly.
;; other   : (e.g. rpc) msg is returned untouched.
;;
(define (db:string->obj msg #!key (transport 'http))
  (case transport
    ((http fs)
     (cond
      ((string? msg)
       (let* ((b64-text (string-substitute (regexp "_") "=" msg #t)) ;; undo the "=" -> "_" substitution
              (packed   (base64:base64-decode b64-text))
              (payload  (z3:decode-buffer packed)))
         (with-input-from-string payload
           (lambda () (deserialize)))))
      (else
       ;; crude reply for when things go awry: complain and hand back what we got
       (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.")
       msg)))
    ((zmq nmsg)
     (with-input-from-string msg
       (lambda () (deserialize))))
    (else msg))) ;; rpc

(define (db:test-set-status-state dbstruct run-id test-id status state msg)
  (let ((dbdat  (db:get-db dbstruct run-id)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbdat 'set-test-start-time (list test-id)))
    (if msg
	(db:general-call dbdat 'state-status-msg (list state status msg test-id))

Modified megatest.scm from [f073a21a21] to [8187d3ec4f].

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
  -import-megatest.db     : migrate a database from v1.55 series to v1.60 series
  -sync-to-megatest.db    : migrate data back to megatest.db
  -update-meta            : update the tests metadata for all tests
  -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                 overwritten by values set in config files.
  -server -|hostname      : start the server (reduces contention on megatest.db), use
                            - to automatically figure out hostname
  -transport http|zmq     : use http or zmq for transport (default is http) 
  -daemonize              : fork into background and disconnect from stdin/out
  -log logfile            : send stdout and stderr to logfile
  -list-servers           : list the servers 
  -stop-server id         : stop server specified by id (see output of -list-servers), use
                            0 to kill all
  -repl                   : start a repl (useful for extending megatest)
  -load file.scm          : load and run file.scm







|







144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
  -import-megatest.db     : migrate a database from v1.55 series to v1.60 series
  -sync-to-megatest.db    : migrate data back to megatest.db
  -update-meta            : update the tests metadata for all tests
  -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                 overwritten by values set in config files.
  -server -|hostname      : start the server (reduces contention on megatest.db), use
                            - to automatically figure out hostname
  -transport http|rpc     : use http or rpc for transport (default is http) 
  -daemonize              : fork into background and disconnect from stdin/out
  -log logfile            : send stdout and stderr to logfile
  -list-servers           : list the servers 
  -stop-server id         : stop server specified by id (see output of -list-servers), use
                            0 to kill all
  -repl                   : start a repl (useful for extending megatest)
  -load file.scm          : load and run file.scm
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
			 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
		 (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
			 (if status "alive" "dead") transport)
		 (if (or (equal? id sid)
			 (equal? sid 0)) ;; kill all/any
		     (begin
		       (debug:print-info 0 *default-log-port* "Attempting to stop server with pid " pid)
		       (tasks:kill-server status hostname pullport pid transport)))))
	     servers)
	    (debug:print-info 1 *default-log-port* "Done with listservers")
	    (set! *didsomething* #t)
	    (exit)) ;; must do, would have to add checks to many/all calls below
	  (exit))))

;;======================================================================







|







815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
			 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
		 (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
			 (if status "alive" "dead") transport)
		 (if (or (equal? id sid)
			 (equal? sid 0)) ;; kill all/any
		     (begin
		       (debug:print-info 0 *default-log-port* "Attempting to stop server with pid " pid)
		       (tasks:kill-server hostname pid)))))
	     servers)
	    (debug:print-info 1 *default-log-port* "Done with listservers")
	    (set! *didsomething* #t)
	    (exit)) ;; must do, would have to add checks to many/all calls below
	  (exit))))

;;======================================================================

Modified runs.scm from [9f66bcb951] to [c631ccf0a3].

1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
	 )))
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
		    (prev-num-running 0))
      ;; (debug:print 0 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))
	  (begin
	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
	    (if (> (current-seconds)(+ last-time-incomplete 900))







|







1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
	 )))
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
		    (prev-num-running 0))
      (BB> "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))
	  (begin
	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
	    (if (> (current-seconds)(+ last-time-incomplete 900))

Modified server.scm from [1ba0421ee5] to [8a9483a036].

99
100
101
102
103
104
105

106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

121
122


123
124
125
126
127
128
129
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host

;;
(define  (server:run run-id)
  (let* ((curr-host   (get-host-name))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (target-host (configf:lookup *configdat* "server" "homehost" ))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc *toppath* "/logs/" run-id ".log"))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") " -run-id " run-id (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
									      (conc " -daemonize -log " logfile)
									      "")
		      " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &")))))
    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (if (not (directory-exists? "logs"))(create-directory "logs"))

    ;; Rotate logs, logic: 
    ;;                 if > 500k and older than 1 week, remove previous compressed log and compress this log


    (directory-fold 
     (lambda (file rem)
       (if (and (string-match "^.*.log" file)
		(> (file-size (conc "logs/" file)) 200000))
	   (let ((gzfile (conc "logs/" file ".gz")))
	     (if (file-exists? gzfile)
		 (begin







>















>

|
>
>







99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run run-id)
  (let* ((curr-host   (get-host-name))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (target-host (configf:lookup *configdat* "server" "homehost" ))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc *toppath* "/logs/" run-id ".log"))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") " -run-id " run-id (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
									      (conc " -daemonize -log " logfile)
									      "")
		      " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &")))))
    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    
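    ;; Rotate logs, logic:
    ;;                 if a *.log file under logs/ is larger than 200000 bytes
    ;;                 (NOTE: only the size is tested in the condition below; the
    ;;                 "> 500k and older than 1 week" rule formerly described here
    ;;                 is not what is coded):
    ;;                     remove previous compressed log and compress this log
    ;;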
    (directory-fold 
     (lambda (file rem)
       (if (and (string-match "^.*.log" file)
		(> (file-size (conc "logs/" file)) 200000))
	   (let ((gzfile (conc "logs/" file ".gz")))
	     (if (file-exists? gzfile)
		 (begin
139
140
141
142
143
144
145

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))

    (setenv "TARGETHOST_LOGF" logfile)
    (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    ;; (system cmdln)
    (pop-directory)))

;; Return the client signature for this process.
;; The signature is created with server:mk-signature on first use and
;; cached in *my-client-signature*; later calls return the cached value.
(define (server:get-client-signature)
  (or *my-client-signature*
      (begin
        (set! *my-client-signature* (server:mk-signature))
        *my-client-signature*)))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched







>








|







143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
    
    (setenv "TARGETHOST_LOGF" logfile)
    (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    ;; (system cmdln)
    (pop-directory)))

;; Return the client signature for this process.
;; The signature is created with server:mk-signature on first use and
;; cached in *my-client-signature*; later calls return the cached value.
;; NOTE: the "get-" name is accurate - the set! is followed by the variable
;; itself, so the signature is returned on the first call as well.
(define (server:get-client-signature)
  (or *my-client-signature*
      (begin
        (set! *my-client-signature* (server:mk-signature))
        *my-client-signature*)))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched

Modified tasks.scm from [7aab5e9e48] to [7c3f9c7793].

227
228
229
230
231
232
233








234
235
236
237
238
239
240
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;"
		   (conc "defunct" tag) run-id))

;; Mark the 'running' servers row matching run-id, iface and port as
;; "defunct" followed by tag, and refresh its heartbeat to the current time.
;;
;;   mdb    - database handle handed to sqlite3:execute
;;   run-id - run_id column value to match
;;   iface  - interface column value to match
;;   port   - port column value to match
;;   tag    - suffix appended to "defunct" to form the new state
;;
(define (tasks:server-force-clean-run-record mdb run-id iface port tag)
  (let ((new-state (conc "defunct" tag))
        (sql       "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"))
    (sqlite3:execute mdb sql new-state run-id iface port)))









;; Mark every servers row belonging to this host and this process id as
;; "defunct" followed by tag, and refresh its heartbeat.
;; NOTE: despite the name, rows are updated, not deleted.
;;
;;   mdb - database handle handed to sqlite3:execute
;;   tag - suffix appended to "defunct" to form the new state
;;
(define (tasks:server-delete-records-for-this-pid mdb tag)
  (let ((new-state (conc "defunct" tag))
        (this-host (get-host-name))
        (this-pid  (current-process-id)))
    (sqlite3:execute mdb
                     "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;"
                     new-state this-host this-pid)))

(define (tasks:server-delete-record mdb server-id tag) 
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;"
		   (conc "defunct" tag) server-id)







>
>
>
>
>
>
>
>







227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;"
		   (conc "defunct" tag) run-id))

;; Mark the 'running' servers row matching run-id, iface and port as
;; "defunct" followed by tag, and refresh its heartbeat to the current time.
;;
;;   mdb    - database handle handed to sqlite3:execute
;;   run-id - run_id column value to match
;;   iface  - interface column value to match
;;   port   - port column value to match
;;   tag    - suffix appended to "defunct" to form the new state
;;
(define (tasks:server-force-clean-run-record mdb run-id iface port tag)
  (let ((new-state (conc "defunct" tag))
        (sql       "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"))
    (sqlite3:execute mdb sql new-state run-id iface port)))


;; BB> adding missing func for -list-servers
;;
;; Deregister a server record.
;;
;;   mdb       - database handle handed to sqlite3:execute
;;   hostname  - hostname column value to match
;;   pullport: - port column value to match (used only when action is 'delete)
;;   pid:      - pid column value to match
;;   action:   - 'delete removes the matching row; any other value (default #f)
;;               sets state to 'defunct' and refreshes the heartbeat for the
;;               rows matching hostname and pid
;;
(define (tasks:server-deregister mdb hostname #!key (pullport #f) (pid #f) (action #f))
  (case action
    ((delete)
     (sqlite3:execute mdb "DELETE FROM servers WHERE pid=? AND port=? AND hostname=?;" pid pullport hostname))
    (else
     (sqlite3:execute mdb "UPDATE servers SET state='defunct', heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;"
                      hostname pid))))

;; Mark every servers row belonging to this host and this process id as
;; "defunct" followed by tag, and refresh its heartbeat.
;; NOTE: despite the name, rows are updated, not deleted.
;;
;;   mdb - database handle handed to sqlite3:execute
;;   tag - suffix appended to "defunct" to form the new state
;;
(define (tasks:server-delete-records-for-this-pid mdb tag)
  (let ((new-state (conc "defunct" tag))
        (this-host (get-host-name))
        (this-pid  (current-process-id)))
    (sqlite3:execute mdb
                     "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;"
                     new-state this-host this-pid)))

(define (tasks:server-delete-record mdb server-id tag) 
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;"
		   (conc "defunct" tag) server-id)