Megatest

Check-in [308a5d65b6]
Login
Overview
Comment:several little fixes
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: 308a5d65b691985f4b8f1a3b34e00d727afbf6aa
User & Date: matt on 2014-11-24 23:36:10
Other Links: branch diff | manifest | tags
Context
2014-11-26
08:59
Merged v1.60 changes into try-nanomsg check-in: 0b3a6d8aa9 user: matt tags: try-nanomsg
2014-11-24
23:36
several little fixes check-in: 308a5d65b6 user: matt tags: try-nanomsg
20:29
working again - with local access check-in: 5377945384 user: matt tags: try-nanomsg
Changes

Modified client.scm from [aa964f7d14] to [ec1ead835f].

68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
		     (port      (http-transport:server-dat-get-port  host-info))
		     (start-res (case *transport-type* 
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg) host-info) ;; (http-transport:server-dat-get-socket host-info))
				  (else #f)))
		     (ping-res  (case *transport-type*
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(nmsg-transport:ping iface port timeout: 2 socket: ))
				  (else #f))))
		(if ping-res   ;; sucessful login?
		    (begin
		      (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
		      start-res)  ;; return the server info
		    ;; have host info but no ping. shutdown the current connection and try again
		    (begin    ;; login failed







|







68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
		     (port      (http-transport:server-dat-get-port  host-info))
		     (start-res (case *transport-type* 
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg) host-info) ;; (http-transport:server-dat-get-socket host-info))
				  (else #f)))
		     (ping-res  (case *transport-type*
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(nmsg-transport:ping iface port timeout: 2 socket: #t))
				  (else #f))))
		(if ping-res   ;; sucessful login?
		    (begin
		      (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
		      start-res)  ;; return the server info
		    ;; have host info but no ping. shutdown the current connection and try again
		    (begin    ;; login failed

Modified http-transport.scm from [37e9804757] to [8c622aa861].

444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
      ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout)
      ;;
      ;; no_traffic, no running tests, if server 0, no running servers
      ;;
      ;; (let ((wait-on-running (configf:lookup *configdat* "server" "wait-on-running"))) ;; wait on running tasks (if not true then exit on time out)
      ;;
      (if (and *server-run*
	       ;; (or
	       (> (+ last-access server-timeout)
		  (current-seconds)))
;;		   (and (eq? run-id 0)
;;			(> (tasks:num-servers-non-zero-running tdb) 0))
;;		   (and (not (eq? run-id 0)) ;; only makes sense in non-zero run-id servers
;;			(> (db:get-count-tests-actually-running *inmemdb* run-id) 0))
;;		   ))
	  (begin
	    (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	    ;;
	    ;; Consider implementing some smarts here to re-insert the record or kill self is
	    ;; the db indicates so
	    ;;
	    ;; (if (tasks:server-am-i-the-server? tdb run-id)







<


<
<
<
<
<







444
445
446
447
448
449
450

451
452





453
454
455
456
457
458
459
      ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout)
      ;;
      ;; no_traffic, no running tests, if server 0, no running servers
      ;;
      ;; (let ((wait-on-running (configf:lookup *configdat* "server" "wait-on-running"))) ;; wait on running tasks (if not true then exit on time out)
      ;;
      (if (and *server-run*

	       (> (+ last-access server-timeout)
		  (current-seconds)))





	  (begin
	    (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	    ;;
	    ;; Consider implementing some smarts here to re-insert the record or kill self is
	    ;; the db indicates so
	    ;;
	    ;; (if (tasks:server-am-i-the-server? tdb run-id)

Modified nmsg-transport.scm from [987e090f8d] to [b2990ea4bd].

243
244
245
246
247
248
249
250








251
252
253
254
255
256
257
258
259
260
261
262
263

264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290

291
292
293
294


295
296
297
298
299
300
301
				sdat)
                              (begin
                                (thread-sleep! 0.5)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdbdat      (tasks:open-db)))








    (print "Keep-running got server pid " server-id ", using iface " iface " and port " port)
    (let loop ((count 0))
      (thread-sleep! 4) ;; no need to do this very often
      ;; NB// sync currently does NOT return queue-length
      (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1)))
      ;; (print "Server running, count is " count)
        (if (< count 1) ;; 3x3 = 9 secs aprox
            (loop (+ count 1)))
        
        ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access
        (mutex-lock! *heartbeat-mutex*)
        (set! last-access *last-db-access*)
        (mutex-unlock! *heartbeat-mutex*)

        (if (> (+ last-access
                  ;; (* 50 60 60)    ;; 48 hrs
                  ;; 60              ;; one minute
                  ;; (* 60 60)       ;; one hour
                  (* 45 60)          ;; 45 minutes, until the db deletion bug is fixed.
                  )
               (current-seconds))
            (begin
              (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
              (loop 0))
            (begin
              (debug:print-info 0 "Starting to shutdown the server.")
              (set! *time-to-exit* #t)
              (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
              (debug:print-info 0 "Server shutdown complete. Exiting")
              ;; (exit)
	      ))))))

;;======================================================================
;; C L I E N T S
;;======================================================================

(define (nmsg-transport:client-connect iface portnum)
  (let* ((reqsoc      (nmsg-transport:ping iface portnum return-socket: #t)))
    (vector iface portnum #f #f #f (current-seconds) reqsoc)))

(define (nmsg-transport:client-api-send-receive run-id connection-info cmd param)

  (let ((packet  (vector cmd param))
	(reqsoc  (http-transport:server-dat-get-socket connection-info)))
    (nn-send reqsoc (db:obj->string packet transport: 'nmsg))
    (db:string->obj (nn-recv reqsoc) transport: 'nmsg)))



;;======================================================================
;; J U N K 
;;======================================================================

;; DO NOT USE
;;







|
>
>
>
>
>
>
>
>













>
|
<
<
<
<
<
|








|











>



|
>
>







243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273





274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
				sdat)
                              (begin
                                (thread-sleep! 0.5)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdbdat      (tasks:open-db))
	 (server-timeout (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
			   (if (and (string? tmo)
				    (string->number tmo))
			       (* 60 60 (string->number tmo))
			       ;; (* 3 24 60 60) ;; default to three days
			       (* 60 1)         ;; default to one minute
			       ;; (* 60 60 25)      ;; default to 25 hours
			       ))))
    (print "Keep-running got server pid " server-id ", using iface " iface " and port " port)
    (let loop ((count 0))
      (thread-sleep! 4) ;; no need to do this very often
      ;; NB// sync currently does NOT return queue-length
      (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1)))
      ;; (print "Server running, count is " count)
        (if (< count 1) ;; 3x3 = 9 secs aprox
            (loop (+ count 1)))
        
        ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access
        (mutex-lock! *heartbeat-mutex*)
        (set! last-access *last-db-access*)
        (mutex-unlock! *heartbeat-mutex*)
        (if (and *server-run*
	       (> (+ last-access server-timeout)





		  (current-seconds)))
            (begin
              (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
              (loop 0))
            (begin
              (debug:print-info 0 "Starting to shutdown the server.")
              (set! *time-to-exit* #t)
              (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
              (debug:print-info 0 "Server shutdown complete. Exiting")
              (exit)
	      ))))))

;;======================================================================
;; C L I E N T S
;;======================================================================

(define (nmsg-transport:client-connect iface portnum)
  (let* ((reqsoc      (nmsg-transport:ping iface portnum return-socket: #t)))
    (vector iface portnum #f #f #f (current-seconds) reqsoc)))

(define (nmsg-transport:client-api-send-receive run-id connection-info cmd param)
  (mutex-lock! *http-mutex*)
  (let ((packet  (vector cmd param))
	(reqsoc  (http-transport:server-dat-get-socket connection-info)))
    (nn-send reqsoc (db:obj->string packet transport: 'nmsg))
    (let ((res (db:string->obj (nn-recv reqsoc) transport: 'nmsg)))
      (mutex-unlock! *http-mutex*)
      res)))

;;======================================================================
;; J U N K 
;;======================================================================

;; DO NOT USE
;;

Modified server.scm from [57814a5cf8] to [3a939720aa].

138
139
140
141
142
143
144

145
146
147



148
149
150
151
152
153
154
	       (trycount 0))
    (if server
	;; note: client:start will set *runremote*. this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;

	(let ((res (server:ping-server run-id 
				       (tasks:hostinfo-get-interface server)
				       (tasks:hostinfo-get-port      server))))



	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")







>
|
|
|
>
>
>







138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
	       (trycount 0))
    (if server
	;; note: client:start will set *runremote*. this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (case *transport-type*
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
		     ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
						 (tasks:hostinfo-get-port      server)
						 timeout: 2)))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")

Modified tests/Makefile from [a8b041cd34] to [608cf0bdb8].

114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Some simple checks for bootstrapping and run loop logic 

test9 : minsetup test9a test9b test9c test9d test9e

test9a :
	@echo Run super-simple mintest e, no waitons.
	cd mintest;$(DASHBOARD)&
	cd mintest;$(MEGATEST) -preclean -runtests e -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG)

test9b :
	@echo Run simple mintest d with one waiton c
	cd mintest;$(MEGATEST) -preclean -runtests d -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG)

test9c :
	@echo Run mintest a with full waiton chain a -> b -> c -> d -> e
	cd mintest;$(MEGATEST) -preclean -runtests a -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG)

test9d :
	@echo Run an itemized test with no items
	cd mintest;$(MEGATEST) -preclean -runtests g -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG)

test9e :
	@echo Run mintest a1 with full waiton chain with d1fail: a1 -> b1 -> c1 -> d1fail -> e1
	cd mintest;$(MEGATEST) -preclean -runtests a1 -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG)

test10 :
	@echo Run a bunch of different targets simultaneously
	(cd fullrun;$(MEGATEST) -server - ;sleep 2)&
	for targ in mint/btrfs/mintdir sunos/sshfs/loc; do \
	   (cd fullrun;$(MEGATEST) -preclean -runtests priority_10_waiton_1 -target $$targ :runname $(RUNNAME) &); done
	for sys in ubuntu suse redhat debian;do \







|



|



|



|



|







114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Some simple checks for bootstrapping and run loop logic 

test9 : minsetup test9a test9b test9c test9d test9e

test9a :
	@echo Run super-simple mintest e, no waitons.
	cd mintest;$(DASHBOARD)&
	cd mintest;$(MEGATEST) -preclean -runtests e -target $(VER) -runname $(shell date +%H.%M.%S) -debug $(DEBUG)

test9b :
	@echo Run simple mintest d with one waiton c
	cd mintest;$(MEGATEST) -preclean -runtests d -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG)

test9c :
	@echo Run mintest a with full waiton chain a -> b -> c -> d -> e
	cd mintest;$(MEGATEST) -preclean -runtests a -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG)

test9d :
	@echo Run an itemized test with no items
	cd mintest;$(MEGATEST) -preclean -runtests g -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG)

test9e :
	@echo Run mintest a1 with full waiton chain with d1fail: a1 -> b1 -> c1 -> d1fail -> e1
	cd mintest;$(MEGATEST) -preclean -runtests a1 -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG)

test10 :
	@echo Run a bunch of different targets simultaneously
	(cd fullrun;$(MEGATEST) -server - ;sleep 2)&
	for targ in mint/btrfs/mintdir sunos/sshfs/loc; do \
	   (cd fullrun;$(MEGATEST) -preclean -runtests priority_10_waiton_1 -target $$targ :runname $(RUNNAME) &); done
	for sys in ubuntu suse redhat debian;do \

Modified tests/fullrun/megatest.config from [79e8e68f6b] to [558689506a].

134
135
136
137
138
139
140
141
142
143
144

145
146
147
148
149
150
151

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.01

# Server is required - slower but more resistant to Sqlite issues.
# required yes

# Start server when average query takes longer than this
server-query-threshold 55500


# daemonize yes
# hostname #{scheme (get-host-name)}

## disks are:
## name host:/path/to/area
## -or-







|


|
>







134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.01

# Server is required - slower but more resistant to Sqlite issues.
required yes

# Start server when average query takes longer than this
# server-query-threshold 55500
server-query-threshold -1

# daemonize yes
# hostname #{scheme (get-host-name)}

## disks are:
## name host:/path/to/area
## -or-

Modified tests/mintest/megatest.config from [158955d103] to [74b434d2c6].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
[fields]
X TEXT

[setup]
max_concurrent_jobs 50
linktree #{getenv PWD}/linktree
transport http

[server]
port 8090

[jobtools]
useshell yes
launcher nbfind

[disks]
disk0 #{getenv PWD}/runs





|











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
[fields]
X TEXT

[setup]
max_concurrent_jobs 50
linktree #{getenv MT_RUN_AREA_HOME}/linktree
transport http

[server]
port 8090

[jobtools]
useshell yes
launcher nbfind

[disks]
disk0 #{getenv PWD}/runs