Megatest

Check-in [cd8a4f1a41]
Login
Overview
Comment:Completed server re-write
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | re-re-factor-server
Files: files | file ages | folders
SHA1: cd8a4f1a4169081fd4f0fd827a933828e7b6dac1
User & Date: matt on 2014-02-17 21:11:03
Other Links: branch diff | manifest | tags
Context
2014-02-17
23:04
Partial fix for borked server-dat Closed-Leaf check-in: 8ec315acf0 user: matt tags: re-re-factor-server
21:11
Completed server re-write check-in: cd8a4f1a41 user: matt tags: re-re-factor-server
19:36
Trimmed out some junk code and fixed some logic in the server start up sequencing check-in: 4a2103f62b user: matt tags: re-re-factor-server
Changes

Modified client.scm from [065a0a550e] to [5cb1c0c7dc].

51
52
53
54
55
56
57
58

59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

96
97
98
99

100
101
102
103
104
105
106
107
108
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 3))

  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id
							    (tasks:hostinfo-get-interface server-dat)
							    (tasks:hostinfo-get-port      server-dat))))
	      (if start-res ;; sucessful login?
		  (begin
		    (hash-table-set! *runremote* run-id server-dat)
		    server-dat)
		  (begin    ;; login failed
		    (hash-table-delete! *runremote* run-id)
		    (open-run-close tasks:server-force-clean-run-record
				    tasks:open-db
				    run-id 
				    (tasks:hostinfo-get-interface server-dat)
				    (tasks:hostinfo-get-port      server-dat))

		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			(begin
			  (hash-table-set! *runremote* run-id server-dat)
			  server-dat)
			(begin    ;; login failed
			  (hash-table-delete! *runremote* run-id)
			  (open-run-close tasks:server-force-clean-run-record
					  tasks:open-db
					  run-id 
					  (tasks:hostinfo-get-interface server-dat)
					  (tasks:hostinfo-get-port      server-dat))

			  (server:try-running run-id)
			  (thread-sleep! 3) ;; give server a little time to start up
			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		  (begin    ;; no server registered

		    (server:try-running run-id)
		    (thread-sleep! 3) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect run-id 
				 (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))







|
>






|




|
|







>








|
|







>

|


>

|







51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 10))
  (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries)
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (tasks:hostinfo-get-interface server-dat)
							    (tasks:hostinfo-get-port      server-dat))))
	      (if start-res ;; sucessful login?
		  (begin
		    (hash-table-set! *runremote* run-id start-res)
		    start-res)
		  (begin    ;; login failed
		    (hash-table-delete! *runremote* run-id)
		    (open-run-close tasks:server-force-clean-run-record
				    tasks:open-db
				    run-id 
				    (tasks:hostinfo-get-interface server-dat)
				    (tasks:hostinfo-get-port      server-dat))
		    (thread-sleep! 5)
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			(begin
			  (hash-table-set! *runremote* run-id start-res)
			  start-res)
			(begin    ;; login failed
			  (hash-table-delete! *runremote* run-id)
			  (open-run-close tasks:server-force-clean-run-record
					  tasks:open-db
					  run-id 
					  (tasks:hostinfo-get-interface server-dat)
					  (tasks:hostinfo-get-port      server-dat))
			  (thread-sleep! 2)
			  (server:try-running run-id)
			  (thread-sleep! 5) ;; give server a little time to start up
			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		  (begin    ;; no server registered
		    (thread-sleep! 2)
		    (server:try-running run-id)
		    (thread-sleep! 5) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect run-id 
				 (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

Modified http-transport.scm from [7e052a2b1c] to [d9e94ba5d7].

398
399
400
401
402
403
404




405
406
407
408
409
410
411
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))




  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db))
	(let* ((th2 (make-thread (lambda ()







>
>
>
>







398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db))
	(let* ((th2 (make-thread (lambda ()

Modified rmt.scm from [deb40682ce] to [206f8532c1].

33
34
35
36
37
38
39
40

41












42
43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
58
;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; cmd is a symbol
;; vars is a json string encoding the parameters for the call
;;
(define (rmt:send-receive cmd run-id params)

  (let* ((connection-info (hash-table-ref/default *runremote* run-id #f))












	 (jparams         (db:obj->string params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
    (if res
	(db:string->obj res) ;; (rmt:json-str->dat res)
	(let ((new-connection-info (client:setup run-id)))
	  (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
	  (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)))))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)

  (let* ((jparams         (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams numretries: 3)))
    (if res
	(db:string->obj res) ;; (rmt:json-str->dat res)
	;; this one does NOT keep trying
	res)))

;; Wrap json library for strings (why the ports crap in the first place?)







|
>
|
>
>
>
>
>
>
>
>
>
>
>
>






|


>
|







33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; cmd is a symbol
;; vars is a json string encoding the parameters for the call
;;
(define (rmt:send-receive cmd rid params)
  (let* ((run-id  (if rid rid 0))
	 (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
			    (if cinfo
				cinfo
				(let loop ((numtries 100))
				  (thread-sleep! 1)
				  (let ((res (client:setup run-id)))
				    (if res 
					res
					(if (> numtries 0)
					    (loop (- numtries 1))
					    (begin
					      (debug:print 0 "ERROR: 100 tries and no server, giving up")
					      (exit 1)))))))))
	 (jparams         (db:obj->string params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
    (if res
	(db:string->obj res) ;; (rmt:json-str->dat res)
	(let ((new-connection-info (client:setup run-id)))
	  (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
	  (rmt:send-receive cmd run-id params)))))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
	 (jparams         (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams numretries: 3)))
    (if res
	(db:string->obj res) ;; (rmt:json-str->dat res)
	;; this one does NOT keep trying
	res)))

;; Wrap json library for strings (why the ports crap in the first place?)

Modified server.scm from [cffc1a2257] to [8eb4730569].

101
102
103
104
105
106
107

108

109
110
111
112
113
114
115
116
117

;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;; 
(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))


(define (server:try-running run-id)

  (let ((cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " &> " *toppath* "/db/" run-id ".log &")))
    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))







>

>
|
|







101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;; 
(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))

;; > file 2>&1 
(define (server:try-running run-id)
  (let* ((rand-name (random 100))
	 (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))
    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))

Modified tasks.scm from [c13323f697] to [19f1225d86].

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (server:check-if-running run-id)
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))      
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"







<





|







90
91
92
93
94
95
96

97
98
99
100
101
102
103
104
105
106
107
108
109
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id)

  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"

Modified tests/Makefile from [f03136dea1] to [488ee3eee2].

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
NEWTARGET  = "$(OS)/$(FS)/$(VER)"
TARGET     = "ubuntu/nfs/none"

all : test1 test2 test3 test4 test5 test6 test7 test8 test9

server :
	cd ..;make;make install
	cd fullrun;../../bin/megatest -server - -debug 22 -run-id $(RUNID)

stopserver :
	cd ..;make && make install
	cd fullrun;$(MEGATEST) -stop-server 0

repl :
	cd ..;make && make install







|







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
NEWTARGET  = "$(OS)/$(FS)/$(VER)"
TARGET     = "ubuntu/nfs/none"

all : test1 test2 test3 test4 test5 test6 test7 test8 test9

server :
	cd ..;make;make install
	cd fullrun;../../bin/megatest -server - -debug $(DEBUG) -run-id $(RUNID)

stopserver :
	cd ..;make && make install
	cd fullrun;$(MEGATEST) -stop-server 0

repl :
	cd ..;make && make install