Megatest

Check-in [6757cdb9b3]
Login
Overview
Comment:99.5% done with protecting db access with journal check
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 6757cdb9b3b579cafc1cd8f2904aacf09af1baaa
User & Date: matt on 2014-11-12 21:51:39
Other Links: branch diff | manifest | tags
Context
2014-11-12
22:40
99.8% check-in: edc56c4136 user: matt tags: v1.60
21:51
99.5% done with protecting db access with journal check check-in: 6757cdb9b3 user: matt tags: v1.60
17:14
98% done check-in: 24e4d63419 user: mrwellan tags: v1.60
Changes

Modified client.scm from [dc8b2be6ad] to [406d30b1f6].

53
54
55
56
57
58
59


60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122































































123
124
125
126
127
128
129
53
54
55
56
57
58
59
60
61































































62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131







+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db))
	 (tdb    (db:dbdat-get-db tdbdat)))
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((host-info (hash-table-ref/default *runremote* run-id #f)))
	(if host-info
	    (let* ((iface     (http-transport:server-dat-get-iface host-info))
		   (port      (http-transport:server-dat-get-port  host-info))
		   (start-res (http-transport:client-connect iface port))
		   (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
	      (if ping-res   ;; sucessful login?
		  (begin
		    (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
		    ;; Why add the close-connections here?
		    ;; (http-transport:close-connections run-id)
		    (hash-table-set! *runremote* run-id start-res)
		    start-res)  ;; return the server info
		  ;; have host info but no ping. shutdown the current connection and try again
		  (begin    ;; login failed
		    (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info)
		    (http-transport:close-connections run-id)
		    (hash-table-delete! *runremote* run-id)
		    (if (< remaining-tries 8)
			(thread-sleep! 5)
			(thread-sleep! 1))
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	    ;; YUK: rename server-dat here
	    (let* ((server-dat (tasks:get-server (tasks:get-db) run-id)))
	      (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if server-dat
		  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
			 (port      (tasks:hostinfo-get-port      server-dat))
			 (start-res (http-transport:client-connect iface port))
			 (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
		    (if (and start-res
			     ping-res)
			(begin
			  (hash-table-set! *runremote* run-id start-res)
			  (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
			  start-res)
			(begin    ;; login failed but have a server record, clean out the record and try again
			  (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
			  (http-transport:close-connections run-id)
			  (hash-table-delete! *runremote* run-id)
			  (tasks:server-force-clean-run-record (tasks:get-db)
							       run-id 
							       (tasks:hostinfo-get-interface server-dat)
							       (tasks:hostinfo-get-port      server-dat)
							       " client:setup (server-dat = #t)")
			  (thread-sleep! 2)
			  (server:try-running run-id)
			  (thread-sleep! 10) ;; give server a little time to start up
			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		  (begin    ;; no server registered
		    (let ((num-available (tasks:num-in-available-state (tasks:get-db) run-id)))
		      (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		      (thread-sleep! 2) 
		      (if (< num-available 2)
			  (begin
			    ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
			    (server:try-running run-id)))
		      (thread-sleep! 10) ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
	  (if host-info
	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
		     (port      (http-transport:server-dat-get-port  host-info))
		     (start-res (http-transport:client-connect iface port))
		     (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
		(if ping-res   ;; sucessful login?
		    (begin
		      (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
		      ;; Why add the close-connections here?
		      ;; (http-transport:close-connections run-id)
		      (hash-table-set! *runremote* run-id start-res)
		      start-res)  ;; return the server info
		    ;; have host info but no ping. shutdown the current connection and try again
		    (begin    ;; login failed
		      (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info)
		      (http-transport:close-connections run-id)
		      (hash-table-delete! *runremote* run-id)
		      (if (< remaining-tries 8)
			  (thread-sleep! 5)
			  (thread-sleep! 1))
		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	      ;; YUK: rename server-dat here
	      (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
		(debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
		(if server-dat
		    (let* ((iface     (tasks:hostinfo-get-interface server-dat))
			   (port      (tasks:hostinfo-get-port      server-dat))
			   (start-res (http-transport:client-connect iface port))
			   (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
		      (if (and start-res
			       ping-res)
			  (begin
			    (hash-table-set! *runremote* run-id start-res)
			    (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
			    start-res)
			  (begin    ;; login failed but have a server record, clean out the record and try again
			    (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
			    (http-transport:close-connections run-id)
			    (hash-table-delete! *runremote* run-id)
			    (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
								 run-id 
								 (tasks:hostinfo-get-interface server-dat)
								 (tasks:hostinfo-get-port      server-dat)
								 " client:setup (server-dat = #t)")
			    (thread-sleep! 2)
			    (server:try-running run-id)
			    (thread-sleep! 10) ;; give server a little time to start up
			    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		    (begin    ;; no server registered
		      (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
			(debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
			(thread-sleep! 2) 
			(if (< num-available 2)
			    (begin
			      ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
			      (server:try-running run-id)))
			(thread-sleep! 10) ;; give server a little time to start up
			(client:setup run-id remaining-tries: (- remaining-tries 1)))))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler

Modified dashboard.scm from [c1ff2abbf5] to [ea8b1971cf].

1451
1452
1453
1454
1455
1456
1457
1458

1459
1460
1461
1462
1463
1464
1465
1466
1451
1452
1453
1454
1455
1456
1457

1458

1459
1460
1461
1462
1463
1464
1465







-
+
-







	   (> modtime last-db-update-time)
	   (> (current-seconds)(+ last-db-update-time 1)))))

(define *monitor-db-path* (conc *dbdir* "/monitor.db"))
(define *last-monitor-update-time* 0)

;; Force creation of the db in case it isn't already there.
(let ((db (tasks:open-db)))
(tasks:open-db)
  (sqlite3:finalize! db))

(define (dashboard:get-youngest-run-db-mod-time)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn))
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds

Modified db.scm from [4ac80e9f1a] to [71a8762428].

473
474
475
476
477
478
479
480
481


482
483

484
485
486
487
488
489
490
473
474
475
476
477
478
479


480
481
482

483
484
485
486
487
488
489
490







-
-
+
+

-
+







	     (db:dbdat-get-db todb)
	     full-sel)

	    ;; first pass implementation, just insert all changed rows
	    (for-each 
	     (lambda (targdb)
	       (let* ((db     (db:dbdat-get-db targdb))
		      (stmth  (sqlite3:prepare targdb full-ins)))
		 (db:delay-if-busy targdb)
		      (stmth  (sqlite3:prepare db full-ins)))
		 ;; (db:delay-if-busy targdb) ;; NO WAITING
		 (sqlite3:with-transaction
		  targdb
		  db
		  (lambda ()
		    (for-each ;; 
		     (lambda (fromrow)
		       (let* ((a    (vector-ref fromrow 0))
			      (curr (hash-table-ref/default todat a #f))
			      (same #t))
			 (let loop ((i 0))
532
533
534
535
536
537
538
539
540


541
542
543
544
545
546

547
548
549
550
551
552
553
532
533
534
535
536
537
538


539
540
541
542
543
544
545

546
547
548
549
550
551
552
553







-
-
+
+





-
+







	 (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
	 (mtdb     (if toppath (db:open-megatest-db)))
	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy mtdb)
				     (db:get-all-run-ids mtdb)))))
	 (mdb     (tasks:open-db))
	 (servers (tasks:get-all-servers mdb)))
	 (tdbdat  (tasks:open-db))
	 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
    
    ;; kill servers
    (if (member 'killservers options)
	(for-each
	 (lambda (server)
	   (tasks:server-delete-record mdb (vector-ref server 0) "dbmigration")
	   (tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration")
	   (tasks:kill-server (vector-ref server 2)(vector-ref server 1)))
	 servers))

    ;; clear out junk records
    ;;
    (if (member 'dejunk options)
	(begin
584
585
586
587
588
589
590
591
592



593
594
595
596
597
598
599
584
585
586
587
588
589
590


591
592
593
594
595
596
597
598
599
600







-
-
+
+
+







		  (frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
	     (db:delay-if-busy frundb)
	     (db:delay-if-busy mtdb)
	     (if (eq? run-id 0)
		 (db:sync-tables (db:sync-main-list dbstruct) fromdb mtdb)
		 (db:sync-tables db:sync-tests-only fromdb mtdb))))
	 run-ids))
    (db:close-all dbstruct)
    (sqlite3:finalize! mdb)))
    ;; (db:close-all dbstruct)
    ;; (sqlite3:finalize! mdb)
    ))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
  (if (or *db-write-access*
	  (not (member proc *db:all-write-procs*)))
      (let* ((db (cond
1828
1829
1830
1831
1832
1833
1834

1835
1836


1837
1838
1839
1840
1841
1842
1843
1829
1830
1831
1832
1833
1834
1835
1836


1837
1838
1839
1840
1841
1842
1843
1844
1845







+
-
-
+
+







		      pid test-id))))

(define (db:test-get-top-process-pid dbstruct run-id test-id)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (db)
  (sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;"
			test-id)))
     (sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;"
			   test-id))))

(define db:test-record-fields '("id"           "run_id"        "testname"  "state"      "status"      "event_time"
				"host"         "cpuload"       "diskfree"  "uname"      "rundir"      "item_path"
                                "run_duration" "final_logf"    "comment"   "shortdir"   "attemptnum"))

;; fields *must* be a non-empty list
;;
2432
2433
2434
2435
2436
2437
2438

2439
2440
2441
2442
2443
2444
2445
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448







+







		  (if (null? tal)
		      (map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests
		      (loop (car tal)(cdr tal))))))))))

(define (db:delay-if-busy dbdat #!key (count 6))
  (if dbdat
      (let* ((dbpath (db:dbdat-get-path dbdat))
	     (db     (db:dbdat-get-db   dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline
	     (dbfj   (conc dbpath "-journal")))
	(if (file-exists? dbfj)
	    (case count
	      ((6)
	       (thread-sleep! 0.2)
	       (db:delay-if-busy count: 5))
	      ((5)
2455
2456
2457
2458
2459
2460
2461
2462



2463
2464
2465
2466
2467
2468
2469
2458
2459
2460
2461
2462
2463
2464

2465
2466
2467
2468
2469
2470
2471
2472
2473
2474







-
+
+
+







	       (thread-sleep! 3.2)
	       (db:delay-if-busy count: 1))
	      ((1)
	       (thread-sleep! 6.4)
	       (db:delay-if-busy count: 0))
	      (else
	       (debug:print-info 0 "delaying db access due to high database load.")
	       (thread-sleep! 12.8)))))))
	       (thread-sleep! 12.8))))
	db)
      "bogus result from db:delay-if-busy"))

(define (db:test-get-records-for-index-file dbstruct run-id test-name)
  (let ((res '()))
    (db:with-db
     dbstruct
     run-id
     #f

Modified dcommon.scm from [ed88f64fcd] to [e887ed7ced].

444
445
446
447
448
449
450


451

452
453
454
455
456
457
458
459
460

461
462
463
464
465
466
467
444
445
446
447
448
449
450
451
452

453
454
455
456
457
458
459
460
461

462
463
464
465
466
467
468
469







+
+
-
+








-
+







    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table)
  (let* ((tdbdat         (tasks:open-db))
	 (tdb            (db:dbdat-get-db tdbdat))
  (let* ((colnum         0)
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))
	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
	 (updater        (lambda ()
			   (let ((servers (tasks:get-all-servers (tasks:get-db))))
			   (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
			     (iup:attribute-set! servers-matrix "NUMLIN" (length servers))
			     ;; (set! colnum 0)
			     ;; (for-each (lambda (colname)
			     ;;    	 ;; (print "colnum: " colnum " colname: " colname)
			     ;;    	 (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
			     ;;    	 (set! colnum (+ 1 colnum)))
			     ;;           colnames)

Modified http-transport.scm from [c83e2578f6] to [8d5a62d976].

132
133
134
135
136
137
138
139


140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

162
163
164
165
166
167

168
169
170
171
172
173
174
175
176
177
178
179
180
181

182
183
184
185
186
187
188
132
133
134
135
136
137
138

139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160


161
162
163
164

165

166
167
168
169
170
171
172
173
174
175
176
177
178


179
180
181
182
183
184
185
186







-
+
+




















-
-
+



-

-
+












-
-
+







						  headers: '((content-type text/plain))))
				  (else (continue))))))))
    (http-transport:try-start-server run-id ipaddrstr start-port server-id)))

;; This is recursively run by http-transport:run until sucessful
;;
(define (http-transport:try-start-server run-id ipaddrstr portnum server-id)
  (let ((config-hostname (configf:lookup *configdat* "server" "hostname")))
  (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
	(tdbdat          (tasks:open-db)))
    (debug:print-info 0 "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname)
    (handle-exceptions
     exn
     (begin
       (print-error-message exn)
       (if (< portnum 64000)
	   (begin 
	     (debug:print 0 "WARNING: attempt to start server failed. Trying again ...")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 "exn=" (condition->list exn))
	     (portlogger:open-run-close portlogger:set-failed portnum)
	     (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port")
	     (thread-sleep! 0.1)

	     ;; get_next_port goes here
	     (http-transport:try-start-server run-id
					      ipaddrstr
					      (portlogger:open-run-close portlogger:find-port)
					      server-id))
	   (begin
	     (tasks:wait-on-busy-monitor.db)
	     (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (print "ERROR: Tried and tried but could not start the server"))))
     ;; any error in following steps will result in a retry
     (set! *server-info* (list ipaddrstr portnum))
     (tasks:wait-on-busy-monitor.db)
     (tasks:server-set-interface-port 
		     (tasks:get-db)
		     (db:delay-if-busy tdbdat)
		     server-id 
		     ipaddrstr portnum)
     (debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum)
     ;; This starts the spiffy server
     ;; NEED WAY TO SET IP TO #f TO BIND ALL
     ;; (start-server bind-address: ipaddrstr port: portnum)
     (if config-hostname ;; this is a hint to bind directly
	 (start-server port: portnum bind-address: (if (equal? config-hostname "-")
						       ipaddrstr
						       config-hostname))
	 (start-server port: portnum))
     ;;  (portlogger:open-run-close portlogger:set-port portnum "released")
     (tasks:wait-on-busy-monitor.db)
     (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server")
     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
     (debug:print 1 "INFO: server has been stopped"))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

;;======================================================================
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284

285
286
287
288
289
290
291
255
256
257
258
259
260
261

262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280

281
282
283
284
285
286
287
288







-



















-
+







	    exn
	    (debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead")
	    (close-all-connections!))
	   (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries)
	   (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1)))
	 (begin
	   (mutex-unlock! *http-mutex*)
	   (tasks:wait-on-busy-monitor.db)
	   (tasks:kill-server-run-id run-id)
	   #f))
     (begin
       (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n")
       ;; set up the http-client here
       (max-retry-attempts 1)
       ;; consider all requests indempotent
       (retry-request? (lambda (request)
			 #f))
       ;; send the data and get the response
       ;; extract the needed info from the http data and 
       ;; process and return it.
       (let* ((send-recieve (lambda ()
			      (mutex-lock! *http-mutex*)
			      ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
			      ;;					       ((exn http client-error) e (print e)))
			      (set! res (handle-exceptions
					 exn
					 (begin
					   (debug:print 0 "WARNING: failure in with-input-from-request. Killing associated server to allow clean retry.")
					   (debug:print 0 "WARNING: failure in with-input-from-request to " fullrul ". Killing associated server to allow clean retry.")
					   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
					   (hash-table-delete! *runremote* run-id)
					   (tasks:kill-server-run-id run-id)
					   #f)
					 (with-input-from-request ;; was dat
					  fullurl 
					  (list (cons 'key "thekey")
352
353
354
355
356
357
358

359

360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376

377
378
379

380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

404
405
406
407
408
409
410
411
412
413
414
415

416
417
418

419
420
421
422
423
424
425
349
350
351
352
353
354
355
356

357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373

374
375


376

377
378
379
380
381
382
383

384
385
386
387
388
389
390
391
392
393
394
395
396
397

398
399
400
401
402
403
404
405
406
407
408


409
410
411

412
413
414
415
416
417
418
419







+
-
+
















-
+

-
-
+
-







-














-
+










-
-
+


-
+







;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((tdbdat      (tasks:open-db))
  (let* ((server-info (let loop ((start-time (current-seconds))
	 (server-info (let loop ((start-time (current-seconds))
				 (changed    #t)
				 (last-sdat  "not this"))
                        (let ((sdat #f))
			  (thread-sleep! 0.01)
			  (debug:print-info 0 "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *server-info*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if (and sdat
				   (not changed)
				   (> (- (current-seconds) start-time) 2))
			      sdat
                              (begin
				(debug:print-info 0 "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (let ((tdb  (tasks:open-db)))
				    (begin
				      (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:wait-on-busy-monitor.db)
				      (tasks:server-delete-record tdb server-id "failed to start, never received server alive signature")
				      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
				      (sqlite3:finalize! tdb)
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdb         (tasks:open-db))
	 (server-timeout (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
			   (if (and (string? tmo)
				    (string->number tmo))
			       (* 60 60 (string->number tmo))
			       ;; (* 3 24 60 60) ;; default to three days
			       (* 60 1)         ;; default to one minute
			       ;; (* 60 60 25)      ;; default to 25 hours
			       ))))
    (let loop ((count         0)
	       (server-state 'available))
      ;; Use this opportunity to sync the inmemdb to db
      (let ((start-time (current-milliseconds))
	    (sync-time  #f)
	    (rem-time   #f))

	;; inmemdb is a dbstruct
	(if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
	(set! sync-time  (- (current-milliseconds) start-time))
	(set! rem-time (quotient (- 4000 sync-time) 1000))
	(debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time)

      ;;
      ;; set_running after our first pass through and start the db
      ;;
      (if (eq? server-state 'available)
	  (begin
	    (tasks:wait-on-busy-monitor.db)
	    (tasks:server-set-state! tdb server-id "dbprep")
	    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
	    (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access
	    (set! *inmemdb*  (db:setup run-id))
	    (tasks:server-set-state! tdb server-id "running")))
	    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")))

      (if (and (<= rem-time 4)
	       (> rem-time 0))
	  (thread-sleep! rem-time)
	  (thread-sleep! 4))) ;; fallback for if the math is changed ...
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
467
468
469
470
471
472
473
474
475
476
477
478
479

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499

500
501
502
503
504
505
506

507
508
509
510
511
512
513
514
515
516
517
518












519
520
521
522
523
524
525






526
527
528
529
530
531





532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547















548
549
550

551
552

553
554
555
556
557
558
559





560
561
562
563
564
565
566
461
462
463
464
465
466
467

468
469
470
471

472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490


491
492
493
494
495
496
497
498
499












500
501
502
503
504
505
506
507
508
509
510
511







512
513
514
515
516
517






518
519
520
521
522
















523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539

540
541

542
543
544





545
546
547
548
549
550
551
552
553
554
555
556







-




-
+


















-
-
+







+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


-
+

-
+


-
-
-
-
-
+
+
+
+
+







	    ;;     (tasks:server-set-state! tdb server-id "running"))
	    ;;
	    (loop 0 server-state))
	  (begin
	    (debug:print-info 0 "Starting to shutdown the server.")
	    ;; need to delete only *my* server entry (future use)
	    (set! *time-to-exit* #t)
	    (tasks:wait-on-busy-monitor.db) ;; wait here in addition to just before the shutting-down
	    (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
	    ;;
	    ;; start_shutdown
	    ;;
	    (tasks:server-set-state! tdb server-id "shutting-down")
	    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
	    (portlogger:open-run-close portlogger:set-port port "released")
	    (thread-sleep! 5)
	    (debug:print-info 0 "Max cached queries was    " *max-cache-size*)
	    (debug:print-info 0 "Number of cached writes   " *number-of-writes*)
	    (debug:print-info 0 "Average cached write time "
			      (if (eq? *number-of-writes* 0)
				  "n/a (no writes)"
				  (/ *writes-total-delay*
				     *number-of-writes*))
			      " ms")
	    (debug:print-info 0 "Number non-cached queries "  *number-non-write-queries*)
	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:wait-on-busy-monitor.db)
	    (tasks:server-delete-record tdb server-id " http-transport:keep-running")
	    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
	    (exit))))))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (let* ((tdbdat (tasks:open-db)))
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (begin
	(daemon:ize)
	(if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	    (begin
	      (current-error-port *alt-log-file*)
	      (current-output-port *alt-log-file*)))))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
    (set! *run-id*   run-id)
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
	(begin
	  (debug:print 0 "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
  (tasks:wait-on-busy-monitor.db)
  (let loop ((server-id (tasks:server-lock-slot (tasks:get-db) run-id))
	     (remtries  4))
    (if (not server-id)
	(if (> remtries 0)
	    (begin
	      (thread-sleep! 2)
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
	      (tasks:wait-on-busy-monitor.db)
	      (loop (tasks:server-lock-slot (tasks:get-db) run-id)
		    (- remtries 1)))
	    (begin
	      ;; since we didn't get the server lock we are going to clean up and bail out
	      (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	      (tasks:wait-on-busy-monitor.db)
	      (tasks:server-delete-records-for-this-pid (tasks:get-db) " http-transport:launch")
	      ))
	(let* ((th2 (make-thread (lambda ()
				   (debug:print-info 0 "Server run thread started")
				   (http-transport:run 
				    (if (args:get-arg "-server")
					(args:get-arg "-server")
					"-")
				    run-id
				    server-id)) "Server run"))
	       (th3 (make-thread (lambda ()
				   (debug:print-info 0 "Server monitor thread started")
				   (http-transport:keep-running server-id run-id))
				 "Keep running")))
	  ;; Database connection
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()
				     (debug:print-info 0 "Server run thread started")
				     (http-transport:run 
				      (if (args:get-arg "-server")
					  (args:get-arg "-server")
					  "-")
				      run-id
				      server-id)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))
	    ;; Database connection


	  ;; don't start the db here
	    ;; don't start the db here

	  ;; (set! *inmemdb*  (db:setup run-id))
	    ;; (set! *inmemdb*  (db:setup run-id))


	  (thread-start! th2)
	  (thread-start! th3)
	  (set! *didsomething* #t)
	  (thread-join! th2)
	  (exit)))))
	    (thread-start! th2)
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)
  (handle-exceptions
   exn
   (debug:print " ... exiting ...")
   (let ((th1 (make-thread (lambda ()

Modified megatest.scm from [9b64b448cf] to [42ff41d3a5].

545
546
547
548
549
550
551

552

553
554
555
556
557
558
559
545
546
547
548
549
550
551
552

553
554
555
556
557
558
559
560







+
-
+







;; MAY STILL NEED THIS
;;		       (set! *megatest-db* (make-dbr:dbstruct path: *toppath* local: #t))))))))))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server"))
    (let ((tl (launch:setup-for-run)))
      (if tl 
	  (let* ((tdbdat  (tasks:open-db))
	  (let* ((servers (open-run-close tasks:get-all-servers tasks:open-db))
		 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
		 (killinfo   (args:get-arg "-stop-server"))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
	    (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========")
574
575
576
577
578
579
580
581

582
583

584
585
586
587
588
589
590
575
576
577
578
579
580
581

582
583

584
585
586
587
588
589
590
591







-
+

-
+







		      (killed     #f)
		      (status     (< last-update 20)))
		 ;;   (zmq-sockets (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
		 (if (equal? state "dead")
		     (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
			 (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid action: 'delete))
			 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete))
		     (if (> last-update 20)        ;; Mark as dead if not updated in last 20 seconds
			 (open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid)))
			 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
		 (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
			 (if status "alive" "dead") transport)
		 (if (or (equal? id sid)
			 (equal? sid 0)) ;; kill all/any
		     (begin
		       (debug:print-info 0 "Attempting to stop server with pid " pid)
		       (tasks:kill-server status hostname pullport pid transport)))))

Modified rmt.scm from [ec918e30be] to [3dfb2ffd80].

74
75
76
77
78
79
80
81



82
83
84
85
86
87
88
74
75
76
77
78
79
80

81
82
83
84
85
86
87
88
89
90







-
+
+
+







  (mutex-unlock! *db-multi-sync-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
			    (if cinfo
				cinfo
				;; NB// can cache the answer for server running for 10 seconds ...
				;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
				(if (tasks:server-running-or-starting? (tasks:get-db) run-id)
				(if (tasks:server-running-or-starting? (db:delay-if-busy
									(tasks:open-db))
								       run-id)
				    (let ((res (client:setup run-id)))
				      (if res 
					  (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully)
					  #f))
				    #f))))
	 (jparams         (db:obj->string params)))
    (if connection-info

Modified runs.scm from [5861640706] to [692dff51df].

212
213
214
215
216
217
218
219

220
221
222
223

224
225


226
227
228
229
230
231
232
233


234
235
236
237
238
239
240
212
213
214
215
216
217
218

219
220
221
222
223
224


225
226


227
228
229
230


231
232
233
234
235
236
237
238
239







-
+




+
-
-
+
+
-
-




-
-
+
+







	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tasks-db           (tasks:open-db)))
	 (tdbdat             (tasks:open-db)))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdb (tasks:open-db)))
			     (tasks:set-state-given-param-key tdb task-key "killed")
			   (let ((tdbdat (tasks:open-db)))
			     (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed"))
			     ;; (sqlite3:interrupt! tdb) ;; seems silly?
			     (sqlite3:finalize! tdb))
			   (print "Killed by signal " signum ". Exiting")
			   (exit)))

    ;; register this run in monitor.db
    (tasks:add tasks-db "run-tests" user target runname test-patts task-key) ;; params)
    (tasks:set-state-given-param-key tasks-db task-key "running")
    (tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params)
    (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running")
    (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process
    (if (file-exists? runconfigf)
	(setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
	(debug:print 0 "WARNING: You do not have a run config file: " runconfigf))

    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all))
391
392
393
394
395
396
397
398
399



400
401
402
403
404
405
406
390
391
392
393
394
395
396


397
398
399
400
401
402
403
404
405
406







-
-
+
+
+







		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
		  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))))
	  (debug:print-info 0 "No tests to run")))
    (debug:print-info 4 "All done by here")
    (tasks:set-state-given-param-key tasks-db task-key "done")
    (sqlite3:finalize! tasks-db)))
    (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done")
    ;; (sqlite3:finalize! tasks-db)
    ))


;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable.
;;
;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns
;; If reg is full (i.e. length >= n
;;   loop with (car reg) tal (cdr reg) reruns
1395
1396
1397
1398
1399
1400
1401
1402

1403
1404
1405
1406
1407
1408
1409
1395
1396
1397
1398
1399
1400
1401

1402
1403
1404
1405
1406
1407
1408
1409







-
+







;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (tasks-db     (tasks:open-db))
	 (tdbdat       (tasks:open-db))
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
	 (statuses     (if status (string-split status ",") '()))
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))
1434
1435
1436
1437
1438
1439
1440
1441

1442
1443
1444
1445
1446
1447
1448
1434
1435
1436
1437
1438
1439
1440

1441
1442
1443
1444
1445
1446
1447
1448







-
+







	   (debug:print-info 4 "runs:operate-on run=" run ", header=" header)
	   (if (not (null? tests))
	       (begin
		 (case action
		   ((remove-runs)
		    ;; seek and kill in flight -runtests with % as testpatt here
		    (if (equal? testpatt "%")
			(tasks:kill-runner tasks-db target run-name)
			(tasks:kill-runner (db:delay-if-busy tdbdat) target run-name)
			(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
1547
1548
1549
1550
1551
1552
1553
1554


1555
1556
1557
1558
1559
1560
1561
1547
1548
1549
1550
1551
1552
1553

1554
1555
1556
1557
1558
1559
1560
1561
1562







-
+
+







		       ;;    (if (null? (glob (conc runpath "/*")))
		       ;;        (begin
		       ;; 	 (debug:print 1 "Removing run dir " runpath)
		       ;; 	 (system (conc "rmdir -p " runpath))))
		       )))))
	 ))
     runs)
    (sqlite3:finalize! tasks-db))
    ;; (sqlite3:finalize! (db:delay-if-busy tdbdat))
    )
  #t)

(define (runs:remove-test-directory db test remove-data-only)
  (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
	 (real-dir      (if (file-exists? run-dir)
			    (resolve-pathname run-dir)
			    #f)))

Modified server.scm from [faceda817c] to [13f9300039].

125
126
127
128
129
130
131

132
133


134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

149
150
151

152
153
154
155

156
157
158
159
160
161
162







163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

184
185
186
187
188
189
190
125
126
127
128
129
130
131
132


133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

149
150
151

152
153
154
155
156
157







158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184

185
186
187
188
189
190
191
192







+
-
-
+
+














-
+


-
+




+
-
-
-
-
-
-
-
+
+
+
+
+
+
+




















-
+







;; 
(define (server:try-running run-id)
  (if (eq? run-id 0)
      (server:run run-id)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id)
  (let ((tdbdat (tasks:open-db)))
  (let loop ((server (tasks:get-server (tasks:get-db) run-id))
	     (trycount 0))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set *runremote*. this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (server:ping-server run-id 
				       (tasks:hostinfo-get-interface server)
				       (tasks:hostinfo-get-port      server))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (tasks:get-db) run-id 
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port)
  (let ((tdbdat (tasks:open-db)))
  (let* ((host-port (let ((slst (string-split   host:port ":")))
		      (if (eq? (length slst) 2)
			  (list (car slst)(string->number (cadr slst)))
			  #f)))
	 (toppath       (launch:setup-for-run))
	 (server-db-dat (if (not host-port)(tasks:get-server (tasks:get-db) run-id) #f)))
    (if (not run-id)
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup-for-run))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)
	  (begin
	    (debug:print 0 "ERROR: must specify run-id when doing ping, -run-id n")
	    (print "ERROR: No run-id")
	    (exit 1))
	  (if (and (not host-port)
		   (not server-db-dat))
	      (begin
		(print "ERROR: bad host:port")
		(exit 1))
	      (let* ((iface      (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat)))
		     (port       (if host-port (cadr host-port)(tasks:hostinfo-get-port      server-db-dat)))
		     (server-dat (http-transport:client-connect iface port))
		     (login-res  (rmt:login-no-auto-client-setup server-dat run-id)))
		(if (and (list? login-res)
			 (car login-res))
		    (begin
		      (print "LOGIN_OK")
		      (exit 0))
		    (begin
		      (print "LOGIN_FAILED")
		      (exit 1))))))))
		      (exit 1)))))))))

;; run ping in separate process, safest way in some cases
;;
(define (server:ping-server run-id iface port)
  (with-input-from-pipe 
   (conc (common:get-megatest-exe) " -run-id " run-id " -ping " (conc iface ":" port))
   (lambda ()

Modified tasks.scm from [059408bffa] to [c8e0f86792].

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68


69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87



















88
89
90
91
92
93
94
95
96
97
98

99
100
101
102
103
104
105

106
107
108
109
110
111
112
113
114
115
116
117
118
119


120
121
122
123
124
125
126
127
128


129
130

131
132

133
134
135
136
137
138

139
140
141
142
143
144
145
49
50
51
52
53
54
55



56
57
58
59
60
61
62
63
64
65
66
67



















68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

97
98
99
100
101
102
103

104
105
106
107
108
109
110
111
112
113
114
115
116


117
118
119
120
121
122
123
124
125


126
127


128


129






130
131
132
133
134
135
136
137







-
-
-










+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+










-
+






-
+












-
-
+
+







-
-
+
+
-
-
+
-
-
+
-
-
-
-
-
-
+







(define (tasks:get-task-db-path)
  (if *task-db*
      (vector-ref *task-db* 1)
      (let* ((linktree     (configf:lookup *configdat* "setup" "linktree"))
	     (dbpath       (conc linktree "/.db/monitor.db")))
	dbpath)))

(define (tasks:wait-on-busy-monitor.db)
  (tasks:wait-on-journal (tasks:get-task-db-path) 30))

;; If file exists AND
;;    file readable
;;         ==> open it
;; If file exists AND
;;    file NOT readable
;;         ==> open in-mem version
;; If file NOT exists
;;    ==> open in-mem version
;;
(define (tasks:open-db)
  (if *task-db*
      *task-db*
  (let* ((dbpath       (tasks:get-task-db-path))
	 (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
	 (exists       (file-exists? dbpath))
	 (write-access (file-write-access? dbpath))
	 (mdb          (cond
			((file-write-access? *toppath*)(sqlite3:open-database dbpath))
			((file-read-access? dbpath)    (sqlite3:open-database dbpath))
			(else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	 (handler      (make-busy-timeout 36000)))
    (if (and exists
	     (not write-access))
	(set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
    (sqlite3:set-busy-handler! mdb handler)
    (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
    (if (or (and (not exists)
		 (file-write-access? *toppath*))
	    (not (file-read-access? dbpath)))
	(begin
	  (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY,
      (let* ((dbpath       (tasks:get-task-db-path))
	     (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
	     (exists       (file-exists? dbpath))
	     (write-access (file-write-access? dbpath))
	     (mdb          (cond
			    ((file-write-access? *toppath*)(sqlite3:open-database dbpath))
			    ((file-read-access? dbpath)    (sqlite3:open-database dbpath))
			    (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	     (handler      (make-busy-timeout 36000)))
	(if (and exists
		 (not write-access))
	    (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
	(sqlite3:set-busy-handler! mdb handler)
	(sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
	(if (or (and (not exists)
		     (file-write-access? *toppath*))
		(not (file-read-access? dbpath)))
	    (begin
	      (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY,
                                action TEXT DEFAULT '',
                                owner TEXT,
                                state TEXT DEFAULT 'new',
                                target TEXT DEFAULT '',
                                name TEXT DEFAULT '',
                                testpatt TEXT DEFAULT '',
                                keylock TEXT,
                                params TEXT,
                                creation_time TIMESTAMP,
                                execution_time TIMESTAMP);")
	  (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS monitors (id INTEGER PRIMARY KEY,
	      (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS monitors (id INTEGER PRIMARY KEY,
                                pid INTEGER,
                                start_time TIMESTAMP,
                                last_update TIMESTAMP,
                                hostname TEXT,
                                username TEXT,
                               CONSTRAINT monitors_constraint UNIQUE (pid,hostname));")
	  (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS servers (id INTEGER PRIMARY KEY,
	      (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS servers (id INTEGER PRIMARY KEY,
                                  pid INTEGER,
                                  interface TEXT,
                                  hostname TEXT,
                                  port INTEGER,
                                  pubport INTEGER,
                                  start_time TIMESTAMP,
                                  priority INTEGER,
                                  state TEXT,
                                  mt_version TEXT,
                                  heartbeat TIMESTAMP,
                                  transport TEXT,
                                  run_id INTEGER);")
;;                               CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));")
	  (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY,
	      ;;                               CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));")
	      (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY,
                                  server_id INTEGER,
                                  pid INTEGER,
                                  hostname TEXT,
                                  cmdline TEXT,
                                  login_time TIMESTAMP,
                                  logout_time TIMESTAMP DEFAULT -1,
                                CONSTRAINT clients_constraint UNIQUE (pid,hostname));")
                                  
	  ))
	      
	      ))
    mdb))

	(set! *task-db* (cons mdb dbpath))
(define (tasks:get-db)
  (if *task-db*
	*task-db*)))
      (vector-ref *task-db* 0)
      (let ((db  (tasks:open-db))
	    (pth (tasks:get-task-db-path)))
	(set! *task-db* (vector db pth))
	db)))
  

;;======================================================================
;; Server and client management
;;======================================================================

;; make-vector-record tasks hostinfo id interface port pubport transport pid hostname
(define (tasks:hostinfo-get-id          vec)    (vector-ref  vec 0))
(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
350
351
352
353
354
355
356
357
358


359
360
361
362
363
364
365

366
367


368
369
370
371
372
373
374
342
343
344
345
346
347
348


349
350
351
352
353
354
355
356

357
358

359
360
361
362
363
364
365
366
367







-
-
+
+






-
+

-
+
+







  (system (conc "nbfake kill " pid))
  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdb  (tasks:open-db))
	 (sdat (tasks:get-server tdb run-id)))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record tdb server-id tag) )
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    (sqlite3:finalize! tdb)))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;   (if status ;; #t means alive
;;       (begin
;; 	(if (equal? hostname (get-host-name))
;; 	    (handle-exceptions
;; 	     exn
;; 	     (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"