Megatest

Check-in [cd8a4f1a41]
Login
Overview
Comment:Completed server re-write
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | re-re-factor-server
Files: files | file ages | folders
SHA1: cd8a4f1a4169081fd4f0fd827a933828e7b6dac1
User & Date: matt on 2014-02-17 21:11:03
Other Links: branch diff | manifest | tags
Context
2014-02-17
23:04
Partial fix for borked server-dat Closed-Leaf check-in: 8ec315acf0 user: matt tags: re-re-factor-server
21:11
Completed server re-write check-in: cd8a4f1a41 user: matt tags: re-re-factor-server
19:36
Trimmed out some junk code and fixed some logic in the server start up sequencing check-in: 4a2103f62b user: matt tags: re-re-factor-server
Changes

Modified client.scm from [065a0a550e] to [5cb1c0c7dc].

51
52
53
54
55
56
57
58


59
60
61
62
63
64
65

66
67
68
69
70
71


72
73
74
75
76
77
78

79
80
81
82
83
84
85
86
87
88


89
90
91
92
93
94
95

96
97

98
99

100
101

102
103
104
105
106
107
108
51
52
53
54
55
56
57

58
59
60
61
62
63
64
65

66
67
68
69
70


71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88


89
90
91
92
93
94
95
96
97
98
99

100
101
102
103
104

105
106
107
108
109
110
111
112







-
+
+






-
+




-
-
+
+







+








-
-
+
+







+

-
+


+

-
+







;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 3))
(define (client:setup run-id #!key (remaining-tries 10))
  (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries)
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id
	    (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (tasks:hostinfo-get-interface server-dat)
							    (tasks:hostinfo-get-port      server-dat))))
	      (if start-res ;; sucessful login?
		  (begin
		    (hash-table-set! *runremote* run-id server-dat)
		    server-dat)
		    (hash-table-set! *runremote* run-id start-res)
		    start-res)
		  (begin    ;; login failed
		    (hash-table-delete! *runremote* run-id)
		    (open-run-close tasks:server-force-clean-run-record
				    tasks:open-db
				    run-id 
				    (tasks:hostinfo-get-interface server-dat)
				    (tasks:hostinfo-get-port      server-dat))
		    (thread-sleep! 5)
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			(begin
			  (hash-table-set! *runremote* run-id server-dat)
			  server-dat)
			  (hash-table-set! *runremote* run-id start-res)
			  start-res)
			(begin    ;; login failed
			  (hash-table-delete! *runremote* run-id)
			  (open-run-close tasks:server-force-clean-run-record
					  tasks:open-db
					  run-id 
					  (tasks:hostinfo-get-interface server-dat)
					  (tasks:hostinfo-get-port      server-dat))
			  (thread-sleep! 2)
			  (server:try-running run-id)
			  (thread-sleep! 3) ;; give server a little time to start up
			  (thread-sleep! 5) ;; give server a little time to start up
			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		  (begin    ;; no server registered
		    (thread-sleep! 2)
		    (server:try-running run-id)
		    (thread-sleep! 3) ;; give server a little time to start up
		    (thread-sleep! 5) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect run-id 
				 (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

Modified http-transport.scm from [7e052a2b1c] to [d9e94ba5d7].

398
399
400
401
402
403
404




405
406
407
408
409
410
411
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415







+
+
+
+







;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db))
	(let* ((th2 (make-thread (lambda ()

Modified rmt.scm from [deb40682ce] to [206f8532c1].

33
34
35
36
37
38
39
40
41















42
43
44
45
46
47
48

49
50

51

52
53
54
55
56
57
58
33
34
35
36
37
38
39


40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

61
62
63
64

65
66
67
68
69
70
71
72







-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+






-
+


+
-
+







;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; cmd is a symbol
;; vars is a json string encoding the parameters for the call
;;
(define (rmt:send-receive cmd run-id params)
  (let* ((connection-info (hash-table-ref/default *runremote* run-id #f))
(define (rmt:send-receive cmd rid params)
  (let* ((run-id  (if rid rid 0))
	 (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
			    (if cinfo
				cinfo
				(let loop ((numtries 100))
				  (thread-sleep! 1)
				  (let ((res (client:setup run-id)))
				    (if res 
					res
					(if (> numtries 0)
					    (loop (- numtries 1))
					    (begin
					      (debug:print 0 "ERROR: 100 tries and no server, giving up")
					      (exit 1)))))))))
	 (jparams         (db:obj->string params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
    (if res
	(db:string->obj res) ;; (rmt:json-str->dat res)
	(let ((new-connection-info (client:setup run-id)))
	  (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
	  (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)))))
	  (rmt:send-receive cmd run-id params)))))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
  (let* ((jparams         (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (jparams         (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams numretries: 3)))
    (if res
	(db:string->obj res) ;; (rmt:json-str->dat res)
	;; this one does NOT keep trying
	res)))

;; Wrap json library for strings (why the ports crap in the first place?)

Modified server.scm from [cffc1a2257] to [8eb4730569].

101
102
103
104
105
106
107

108

109
110


111
112
113
114
115
116
117
101
102
103
104
105
106
107
108
109
110


111
112
113
114
115
116
117
118
119







+

+
-
-
+
+








;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;; 
(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))

;; > file 2>&1 
(define (server:try-running run-id)
  (let* ((rand-name (random 100))
  (let ((cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " &> " *toppath* "/db/" run-id ".log &")))
	 (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))
    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))

Modified tasks.scm from [c13323f697] to [19f1225d86].

90
91
92
93
94
95
96
97
98
99
100
101
102
103

104
105
106
107
108
109
110
90
91
92
93
94
95
96

97
98
99
100
101

102
103
104
105
106
107
108
109







-





-
+







(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (server:check-if-running run-id)
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))      
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"

Modified tests/Makefile from [f03136dea1] to [488ee3eee2].

20
21
22
23
24
25
26
27

28
29
30
31
32
33
34
20
21
22
23
24
25
26

27
28
29
30
31
32
33
34







-
+







NEWTARGET  = "$(OS)/$(FS)/$(VER)"
TARGET     = "ubuntu/nfs/none"

all : test1 test2 test3 test4 test5 test6 test7 test8 test9

server :
	cd ..;make;make install
	cd fullrun;../../bin/megatest -server - -debug 22 -run-id $(RUNID)
	cd fullrun;../../bin/megatest -server - -debug $(DEBUG) -run-id $(RUNID)

stopserver :
	cd ..;make && make install
	cd fullrun;$(MEGATEST) -stop-server 0

repl :
	cd ..;make && make install