Megatest

Check-in [904e5f7d6c]
Login
Overview
Comment:Added debugging tags to server state changes. Cleaned up dashboard to display new server data.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 904e5f7d6cd7ae02500bfad9bf4651ebf43d2a65
User & Date: matt on 2014-02-24 22:15:53
Other Links: branch diff | manifest | tags
Context
2014-02-24
22:41
Experimental change to more aggressively try to connect to servers check-in: 23a0587e45 user: matt tags: v1.60
22:15
Added debugging tags to server state changes. Cleaned up dashboard to display new server data. check-in: 904e5f7d6c user: matt tags: v1.60
19:44
Random port assignment, minimize re-use of ports check-in: bbd7cb0f5a user: matt tags: v1.60
Changes

Modified client.scm from [3302993138] to [c34489a609].

71
72
73
74
75
76
77
78


79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98


99
100
101
102
103
104
105
71
72
73
74
75
76
77

78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

99
100
101
102
103
104
105
106
107







-
+
+



















-
+
+







		  (if (eq? remaining-tries 4)
		      (begin    ;; login failed
			(hash-table-delete! *runremote* run-id)
			(open-run-close tasks:server-force-clean-run-record
			 		tasks:open-db
			 		run-id 
			 		(car  server-dat)
			 		(cadr server-dat))
			 		(cadr server-dat)
					" client:setup (server-dat=#t)")
			(thread-sleep! 5)
			(client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
		      (begin
			(thread-sleep! 5)
			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			start-res
			(if (eq? remaining-tries 2)
			    (begin    ;; login failed
			      (hash-table-delete! *runremote* run-id)
			      (open-run-close tasks:server-force-clean-run-record
					      tasks:open-db
					      run-id 
					      (tasks:hostinfo-get-interface server-dat)
					      (tasks:hostinfo-get-port      server-dat))
					      (tasks:hostinfo-get-port      server-dat)
					      " client:setup (server-dat = #f)")
			      (thread-sleep! 2)
			      (server:try-running run-id)
			      (thread-sleep! 10) ;; give server a little time to start up
			      (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
			    (begin
			      (thread-sleep! 5)
			      (client:setup run-id remaining-tries: (- remaining-tries 1))))))

Modified dashboard.scm from [daaa85c93d] to [16f8f4859b].

978
979
980
981
982
983
984
985

986
987

988
989
990
991







992
993
994
995
996
997
998
978
979
980
981
982
983
984

985
986
987
988




989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002







-
+


+
-
-
-
-
+
+
+
+
+
+
+







;;======================================================================
;;
;; General info about the run(s) and megatest area
(define (dashboard:summary db)
  (let ((rawconfig        (read-config (conc *toppath* "/megatest.config") #f 'return-string)))
    (iup:vbox
     (iup:split
      ;; #:value 500
      #:value 500
      (iup:frame 
       #:title "General Info"
       (iup:vbox
       (iup:hbox 
	(dcommon:keys-matrix rawconfig)
	(dcommon:general-info)
	))
	(iup:hbox
	 (iup:label "Area Path")
	 (iup:textbox #:value *toppath* #:expand "HORIZONTAL"))
	(iup:hbox 
	 (dcommon:keys-matrix rawconfig)
	 (dcommon:general-info)
	 )))
      (iup:frame
       #:title "Server"
       (dcommon:servers-table)))
     (iup:frame 
      #:title "Megatest config settings"
      (iup:hbox
       (dcommon:section-matrix rawconfig "setup" "Varname" "Value")

Modified dcommon.scm from [aa49c4cb44] to [8f2d34a514].

356
357
358
359
360
361
362
363

364
365
366


367
368
369
370
371
372
373


374
375
376


377
378
379
380
381
382
383
356
357
358
359
360
361
362

363
364


365
366
367
368
369
370
371


372
373
374


375
376
377
378
379
380
381
382
383







-
+

-
-
+
+





-
-
+
+

-
-
+
+







;; General data
;;
(define (dcommon:general-info)
  (let ((general-matrix (iup:matrix
			 #:alignment1 "ALEFT"
			 #:expand "YES" ;; "HORIZONTAL"
			 #:numcol 1
			 #:numlin 3
			 #:numlin 2
			 #:numcol-visible 1
			 #:numlin-visible 3)))
    (iup:attribute-set! general-matrix "WIDTH1" "200")
			 #:numlin-visible 2)))
    (iup:attribute-set! general-matrix "WIDTH1" "150")
    (iup:attribute-set! general-matrix "0:1" "About this Megatest area") 
    ;; User (this is not always obvious - it is common to run as a different user
    (iup:attribute-set! general-matrix "1:0" "User")
    (iup:attribute-set! general-matrix "1:1" (current-user-name))
    ;; Megatest area
    (iup:attribute-set! general-matrix "2:0" "Area")
    (iup:attribute-set! general-matrix "2:1" *toppath*)
    ;; (iup:attribute-set! general-matrix "2:0" "Area")
    ;; (iup:attribute-set! general-matrix "2:1" *toppath*)
    ;; Megatest version
    (iup:attribute-set! general-matrix "3:0" "Version")
    (iup:attribute-set! general-matrix "3:1" megatest-version)
    (iup:attribute-set! general-matrix "2:0" "Version")
    (iup:attribute-set! general-matrix "2:1" (conc megatest-version "-" (substring megatest-fossil-hash 0 4)))

    general-matrix))

(define (dcommon:run-stats dbstruct)
  (let* ((stats-matrix (iup:matrix expand: "YES"))
	 (changed      #f)
	 (updater      (lambda ()
443
444
445
446
447
448
449
450

451
452

453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470

471

472
473
474
475
476

477
478
479

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526































527
528
529
530
531
532
533
443
444
445
446
447
448
449

450
451

452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471

472
473
474
475
476

477



478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496





























497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534







-
+

-
+


















+
-
+




-
+
-
-
-
+


















-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+








(define (dcommon:servers-table)
  (let* ((colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 3
				     #:numlin-visible 5
				     ))
	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "State" "Transport"))
	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
	 (updater        (lambda ()
			   (let ((servers (open-run-close tasks:get-all-servers tasks:open-db)))
			     (iup:attribute-set! servers-matrix "NUMLIN" (length servers))
			     ;; (set! colnum 0)
			     ;; (for-each (lambda (colname)
			     ;;    	 ;; (print "colnum: " colnum " colname: " colname)
			     ;;    	 (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
			     ;;    	 (set! colnum (+ 1 colnum)))
			     ;;           colnames)
			     (set! rownum 1)
			     (for-each 
			      (lambda (server)
				(set! colnum 0)
				(let* ((vals (list (vector-ref server 0) ;; Id
						   (vector-ref server 9) ;; MT-Ver
						   (vector-ref server 1) ;; Pid
						   (vector-ref server 2) ;; Hostname
						   (conc (vector-ref server 3) ":" (vector-ref server 4)) ;; IP:Port
						   (seconds->hr-min-sec (- (current-seconds)(vector-ref server 6)))
						   (vector-ref server 5) ;; Pubport
						   ;; (vector-ref server 5) ;; Pubport
						   ;; (vector-ref server 10) ;; Last beat
						   ;; (vector-ref server 6) ;; Start time
						   ;; (vector-ref server 7) ;; Priority
						   ;; (vector-ref server 8) ;; State
						   (if (< (vector-ref server 10) 20) ;; Status (Please redo this properly!)
						   (vector-ref server 8) ;; State
						       "alive"
						       "dead")
						   (vector-ref server 11)  ;; Transport
						   (vector-ref server 12)  ;; RunId
						   )))
				  (for-each (lambda (val)
					      ;; (print "rownum: " rownum " colnum: " colnum " val: " val)
					      (iup:attribute-set! servers-matrix (conc rownum ":" colnum) val)
					      (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum))
					      (set! colnum (+ 1 colnum)))
					    vals)
				  (set! rownum (+ rownum 1)))
				 (iup:attribute-set! servers-matrix "REDRAW" "ALL"))
			      servers)))))
    (set! colnum 0)
    (for-each (lambda (colname)
		(iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
		(iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum))
		(set! colnum (+ colnum 1)))
	      colnames)
    (set! dashboard:update-servers-table updater) 
    ;; (iup:attribute-set! servers-matrix "WIDTHDEF" "40")
    (iup:hbox
     (iup:vbox
      (iup:button "Start"
		  ;; #:size "50x"
		  #:expand "YES"
		  #:action (lambda (obj)
			     (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
					      "megatest -server - &")))
					      ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
			       (system cmd))))
      (iup:button "Stop"
		  #:expand "YES"
		  ;; #:size "50x"
		  #:action (lambda (obj)
			     (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
					      "megatest -stop-server 0 &")))
					      ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
			       (system cmd))))
      (iup:button "Restart"
		  #:expand "YES"
		  ;; #:size "50x"
		  #:action (lambda (obj)
			     (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
					      "megatest -stop-server 0;megatest -server - &")))
					      ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
			       (system cmd)))))
      servers-matrix
     )))
  
   ;;  (iup:hbox
   ;;   (iup:vbox
   ;;    (iup:button "Start"
   ;;      	  ;; #:size "50x"
   ;;      	  #:expand "YES"
   ;;      	  #:action (lambda (obj)
   ;;      		     (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
   ;;      				      "megatest -server - &")))
   ;;      				      ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
   ;;      		       (system cmd))))
   ;;    (iup:button "Stop"
   ;;      	  #:expand "YES"
   ;;      	  ;; #:size "50x"
   ;;      	  #:action (lambda (obj)
   ;;      		     (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
   ;;      				      "megatest -stop-server 0 &")))
   ;;      				      ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
   ;;      		       (system cmd))))
   ;;    (iup:button "Restart"
   ;;      	  #:expand "YES"
   ;;      	  ;; #:size "50x"
   ;;      	  #:action (lambda (obj)
   ;;      		     (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
   ;;      				      "megatest -stop-server 0;megatest -server - &")))
   ;;      				      ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
   ;;      		       (system cmd)))))
   ;;    servers-matrix
   ;;   )))
    servers-matrix
    ))

;; The main menu
(define (dcommon:main-menu)
  (iup:menu ;; a menu is a special attribute to a dialog (think Gnome putting the menu at screen top)
   (iup:menu-item "Files" (iup:menu   ;; Note that you can use either #:action or action: for options
		       (iup:menu-item "Open"  action: (lambda (obj)
							(iup:show (iup:file-dialog))
							(print "File->open " obj)))

Modified http-transport.scm from [f8416b573d] to [462dc5100a].

153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
153
154
155
156
157
158
159

160
161
162
163
164
165
166
167







-
+







		   tasks:open-db 
		   server-id 
		   ipaddrstr portnum)
   (debug:print 1 "INFO: Trying to start server on " ipaddrstr ":" portnum)
   ;; This starts the spiffy server
   ;; NEED WAY TO SET IP TO #f TO BIND ALL
   (start-server bind-address: ipaddrstr port: portnum)
   (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum)
   (open-run-close tasks:server-delete tasks:open-db ipaddrstr portnum " http-transport:try-start-server")
   (debug:print 1 "INFO: server has been stopped")))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

;;======================================================================
389
390
391
392
393
394
395
396

397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416

417
418
419
420
421
422
423
389
390
391
392
393
394
395

396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415

416
417
418
419
420
421
422
423







-
+



















-
+







	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:server-delete-record tdb server-id)
	    (tasks:server-delete-record tdb server-id " http-transport:keep-running")
	    (exit))))))

;; all routes through here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db)
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db " http-transport:launch")
	  )
	(let* ((th2 (make-thread (lambda ()
				   (http-transport:run 
				    (if (args:get-arg "-server")
					(args:get-arg "-server")
					"-")
				    run-id

Modified server.scm from [faceca54c0] to [9a921aa4ae].

127
128
129
130
131
132
133
134


135
136
127
128
129
130
131
132
133

134
135
136
137







-
+
+


	;; client:start returns #t if login was successful.
	;;
	(let ((res (client:start run-id server)))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      res
	      (begin
		(open-run-close tasks:server-force-clean-running-records-for-run-id tasks:open-db run-id)
		(open-run-close tasks:server-force-clean-running-records-for-run-id tasks:open-db run-id 
				" server:check-if-running")
		res)))
	#f)))

Modified tasks.scm from [8d2eda9003] to [d835e3e194].

89
90
91
92
93
94
95
96

97
98
99
100
101
102
103
89
90
91
92
93
94
95

96
97
98
99
100
101
102
103







-
+







;; Positional accessors for a hostinfo vector. Each one simply returns
;; the element stored at a fixed index of VEC:
;;   port      -> index 2
;;   pubport   -> index 3
;;   transport -> index 4
;;   pid       -> index 5
;;   hostname  -> index 6
(define tasks:hostinfo-get-port
  (lambda (vec) (vector-ref vec 2)))

(define tasks:hostinfo-get-pubport
  (lambda (vec) (vector-ref vec 3)))

(define tasks:hostinfo-get-transport
  (lambda (vec) (vector-ref vec 4)))

(define tasks:hostinfo-get-pid
  (lambda (vec) (vector-ref vec 5)))

(define tasks:hostinfo-get-hostname
  (lambda (vec) (vector-ref vec 6)))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
125
126
127
128
129
130
131
132
133



134
135
136



137
138
139
140



141
142
143



144
145
146



147
148
149
150
151

152
153
154

155
156
157
158
159
160
161
125
126
127
128
129
130
131


132
133
134
135


136
137
138
139



140
141
142
143


144
145
146
147


148
149
150
151
152
153
154

155
156
157

158
159
160
161
162
163
164
165







-
-
+
+
+

-
-
+
+
+

-
-
-
+
+
+

-
-
+
+
+

-
-
+
+
+




-
+


-
+







     (lambda (num-in-queue)
       (set! res num-in-queue))
     mdb
     "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available';"
     run-id)
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 300 AND run_id=?;" run-id))
;; Retire stale server records belonging to RUN-ID.
;;
;; Rows of the servers table for RUN-ID whose state is 'available' or
;; 'shutting-down' and whose start_time is more than 300 seconds in the
;; past get their state set to "defunct" with TAG appended, and their
;; heartbeat set to the current time.
;;
;;   mdb    - database handle handed to sqlite3:execute
;;   run-id - run whose server records are examined
;;   tag    - text appended to "defunct" (a debugging marker)
;;
;; Returns whatever sqlite3:execute returns.
(define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag)
  (let ((defunct-state (conc "defunct" tag))
        (update-sql    "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 300 AND run_id=?;"))
    (sqlite3:execute mdb update-sql defunct-state run-id)))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state = 'running' AND run_id=?;" run-id))
;; Force every 'running' server record of RUN-ID into the defunct state.
;;
;; The new state is "defunct" with TAG appended; the heartbeat column is
;; set to the current time in the same statement.
;;
;;   mdb    - database handle handed to sqlite3:execute
;;   run-id - run whose 'running' records are retired
;;   tag    - text appended to "defunct" (a debugging marker)
;;
;; Returns whatever sqlite3:execute returns.
(define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag)
  (let ((defunct-state (conc "defunct" tag))
        (update-sql    "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;"))
    (sqlite3:execute mdb update-sql defunct-state run-id)))

(define (tasks:server-force-clean-run-record mdb run-id iface port)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   run-id iface port))
;; Force one specific 'running' server record into the defunct state.
;;
;; Only rows matching RUN-ID, IFACE and PORT that are currently in state
;; 'running' are touched. Their state becomes "defunct" with TAG appended
;; and their heartbeat is set to the current time.
;;
;;   mdb    - database handle handed to sqlite3:execute
;;   run-id - run the server record belongs to
;;   iface  - value compared against the interface column
;;   port   - value compared against the port column
;;   tag    - text appended to "defunct" (a debugging marker)
;;
;; Returns whatever sqlite3:execute returns.
(define (tasks:server-force-clean-run-record mdb run-id iface port tag)
  (let ((defunct-state (conc "defunct" tag))
        (update-sql    "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"))
    (sqlite3:execute mdb update-sql defunct-state run-id iface port)))

(define (tasks:server-delete-records-for-this-pid mdb)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE hostname=? AND pid=?;" (get-host-name) (current-process-id)))
;; Mark the server records of the current process as defunct.
;;
;; Rows are selected by the local host name and the current process id.
;; Despite the name nothing is deleted: the state column is set to
;; "defunct" with TAG appended and heartbeat is set to the current time.
;;
;;   mdb - database handle handed to sqlite3:execute
;;   tag - text appended to "defunct" (a debugging marker)
;;
;; Returns whatever sqlite3:execute returns.
(define (tasks:server-delete-records-for-this-pid mdb tag)
  (let ((defunct-state (conc "defunct" tag))
        (this-host     (get-host-name))
        (this-pid      (current-process-id))
        (update-sql    "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;"))
    (sqlite3:execute mdb update-sql defunct-state this-host this-pid)))

(define (tasks:server-delete-record mdb server-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE id=?;" server-id)
;; Retire the server record SERVER-ID and prune very old records.
;;
;; Two statements run in order:
;;   1. the row with id SERVER-ID gets state "defunct" with TAG appended
;;      and its heartbeat set to the current time;
;;   2. every row that is neither 'running' nor 'shutting-down' and whose
;;      start_time is more than 2628000 seconds (about one month) in the
;;      past is deleted from the servers table.
;;
;;   mdb       - database handle handed to sqlite3:execute
;;   server-id - id of the record to mark defunct
;;   tag       - text appended to "defunct" (a debugging marker)
;;
;; Returns the result of the second (DELETE) statement.
(define (tasks:server-delete-record mdb server-id tag)
  (let ((defunct-state (conc "defunct" tag)))
    ;; step 1: flag this server's own record
    (sqlite3:execute mdb
		     "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;"
		     defunct-state
		     server-id)
    ;; step 2: use this opportunity to clean out records over one month old
    (sqlite3:execute mdb
		     "DELETE FROM servers WHERE state not in ('running','shutting-down') AND (strftime('%s','now') - start_time) > 2628000;")))

(define (tasks:server-set-state! mdb server-id state)
  (sqlite3:execute mdb "UPDATE servers SET state=? WHERE id=?;" state server-id))
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" state server-id))

(define (tasks:server-set-interface-port mdb server-id interface port)
  (sqlite3:execute mdb "UPDATE servers SET interface=?,port=? WHERE id=?;" interface port server-id))
  (sqlite3:execute mdb "UPDATE servers SET interface=?,port=?,heartbeat=strftime('%s','now') WHERE id=?;" interface port server-id))

;; Get random port not used in long time
;;
(define (tasks:server-get-next-port mdb)
  (let* ((lownum        30000)
	(highnum        64000)
	(used-ports     '())
236
237
238
239
240
241
242
243
244



245
246

247
248
249
250
251
252
253
240
241
242
243
244
245
246


247
248
249
250

251
252
253
254
255
256
257
258







-
-
+
+
+

-
+







          WHERE run_id=? AND state='running'
          ORDER BY start_time DESC LIMIT 1;" run-id) ;; (common:version-signature) run-id)
    res))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport)
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport) res)))
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0   1        2         3    4       5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))
     mdb
     "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport FROM servers ORDER BY start_time DESC;")
     "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;")
    res))

(define (tasks:kill-server status hostname port pid)
  (debug:print-info 1 "Removing defunct server record for " hostname ":" port)
  (if port
      (open-run-close tasks:server-deregister tasks:open-db hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid:  pid))