Megatest

Check-in [6444c1bdfc]
Login
Overview
Comment:Runs through test4 pretty well. No server enabled yet
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.62-no-rpc
Files: files | file ages | folders
SHA1: 6444c1bdfc166bc0eef081e1803eaad7028405c1
User & Date: matt on 2016-11-25 22:30:07
Other Links: branch diff | manifest | tags
Context
2016-11-26
18:16
Cleaned up globals. Added audit script to report on globals, where defined and where used. check-in: 22df9fe278 user: matt tags: v1.62-no-rpc
2016-11-25
22:30
Runs through test4 pretty well. No server enabled yet check-in: 6444c1bdfc user: matt tags: v1.62-no-rpc
11:50
Consolidated to single global for dbstruct. Removed *megatest-db* Removed *inmemdb* Removed *write-frequency* Removed *client-non-blocking-mode* Consolidated db:open-local-db-handle in with db:setup Fixed calls which used db to instead use dbstruct Change repl to use db:setup for getting a db handle check-in: e335fe582a user: matt tags: v1.62-no-rpc
Changes

Modified client.scm from [b597605018] to [5c6ba40366].

178
179
180
181
182
183
184
185

186
187
188
189
190
191
192

193
194
195
196
197
198
199
178
179
180
181
182
183
184

185
186
187
188
189
190
191

192
193
194
195
196
197
198
199







-
+






-
+







 				  ;;              (car (vector-ref logininfo 1))
 				  ;;              #f)))

                                  )))
		(if (and start-res
			 ping-res)
		    (begin
		      (hash-table-set! *runremote* run-id start-res)
		      (set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res)
		      (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
			((http)(http-transport:close-connections run-id)))
		      (hash-table-delete! *runremote* run-id)
		      (set! *runremote* #f)  ;; (hash-table-delete! *runremote* run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)

Modified common.scm from [d6de275ae9] to [03889704ec].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19











-
+







;;======================================================================
;; Copyright 2006-2012, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
;;======================================================================

(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records)
(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils)
(require-extension regex posix)

(require-extension (srfi 18) extras tcp rpc)

(import (prefix sqlite3 sqlite3:))
(import (prefix base64 base64:))

88
89
90
91
92
93
94

95

96
97
98
99
100
101
102
103
104
105
106

107
108
109
110
111
112
113
88
89
90
91
92
93
94
95

96
97
98
99
100
101
102
103
104
105
106

107
108
109
110
111
112
113
114







+
-
+










-
+







(define *dbstruct-db*         #f) ;; used when local access is triggered in rmt.scm

(define *db-stats*            (make-hash-table)) ;; hash of vectors < count duration-total >
(define *db-stats-mutex*      (make-mutex))
(define *db-sync-mutex*       (make-mutex))
(define *db-multi-sync-mutex* (make-mutex))
(define *db-local-sync*       (make-hash-table)) ;; used to record last touch of db
(define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened 
(define *last-db-access*      (current-seconds))  ;; update when db is accessed via server
(define *last-db-access*      (current-seconds)) ;; update when db is accessed via server
(define *db-write-access*     #t)
(define *task-db*             #f) ;; (vector db path-to-db)
(define *db-access-allowed*   #t) ;; flag to allow access
(define *db-access-mutex*     (make-mutex))
(define *db-cache-path*       #f)

;; SERVER
(define *my-client-signature* #f)
(define *transport-type*    'http)
(define *transport-type*    'http)             ;; override with [server] transport http|rpc|nmsg
(define *runremote*         (make-hash-table)) ;; if set up for server communication this will hold <host port>
(define *runremote*         #f)                ;; if set up for server communication this will hold <host port>
(define *max-cache-size*    0)
(define *logged-in-clients* (make-hash-table))
(define *server-id*         #f)
(define *server-info*       #f)
(define *time-to-exit*      #f)
(define *received-response* #f)
(define *default-numtries*  10)
198
199
200
201
202
203
204
























205
206
207
208
209
210
211
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







   'dejunk
   ;; 'adj-testids
   ;; 'old2new
   'new2old
   'schema)
  (if (common:version-changed?)
      (common:set-last-run-version)))

;; Rotate logs, logic: 
;;                 if > 500k and older than 1 week:
;;                     remove previous compressed log and compress this log
;; WARNING: This proc operates assuming that it is in the directory above the
;;          logs directory you wish to log-rotate.
;;
(define (common:rotate-logs)
  (if (not (directory-exists? "logs"))(create-directory "logs"))
  (directory-fold 
   (lambda (file rem)
     (if (and (string-match "^.*.log" file)
	      (> (file-size (conc "logs/" file)) 200000))
	 (let ((gzfile (conc "logs/" file ".gz")))
	   (if (file-exists? gzfile)
	       (begin
		 (debug:print-info 0 *default-log-port* "removing " gzfile)
		 (delete-file gzfile)))
	   (debug:print-info 0 *default-log-port* "compressing " file)
	   (system (conc "gzip logs/" file)))))
   '()
   "logs"))
    


;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;;
(define (common:exit-on-version-changed)
  (if (common:version-changed?)
      (let ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")))
        (debug:print 0 *default-log-port*
480
481
482
483
484
485
486
487
488


489
490
491

492
493

494
495
496
497
498
499
500
505
506
507
508
509
510
511


512
513
514
515

516
517

518
519
520
521
522
523
524
525







-
-
+
+


-
+

-
+







;;======================================================================
;; E X I T   H A N D L I N G
;;======================================================================

(define (common:legacy-sync-recommended)
  (or (and (common:get-homehost)
	   (cdr (common:get-homehost)))
      (args:get-arg "-runtests")
      (args:get-arg "-run")
      ;;(args:get-arg "-runtests")
      ;;(args:get-arg "-run")
      (args:get-arg "-server")
      ;; (args:get-arg "-set-run-status")
      (args:get-arg "-remove-runs")
      ;;(args:get-arg "-remove-runs")
      ;; (args:get-arg "-get-run-status")
      (args:get-arg "-use-db-cache") ;; feels like a bad idea ...
      ;;(args:get-arg "-use-db-cache") ;; feels like a bad idea ...
      ))

(define (common:legacy-sync-required)
  (configf:lookup *configdat* "setup" "megatest-db"))

;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
534
535
536
537
538
539
540
541

542
543
544
545
546
547
548
559
560
561
562
563
564
565

566
567
568
569
570
571
572
573







-
+







	(last-time   (current-seconds)))
    (if (or (common:legacy-sync-recommended)
	    legacy-sync)
	(let loop ()
	  ;; sync for filesystem local db writes
	  ;;
	  (let ((start-time   (current-seconds)))
	    (common:sync-to-megatest.db 'local-sync-flags)
	    ;; (common:sync-to-megatest.db 'local-sync-flags)
	    (if (and debug-mode
		     (> (- start-time last-time) 60))
		(begin
		  (set! last-time start-time)
		  (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))

	  ;; keep going unless time to exit

Modified db.scm from [8de1aa48cd] to [05634fcfcf].

194
195
196
197
198
199
200

201

202
203
204
205
206
207
208
194
195
196
197
198
199
200
201

202
203
204
205
206
207
208
209







+
-
+







	 (file-write   (if file-exists
			   (file-write-access? fname)
			   dir-writable )))
    (if file-write ;; dir-writable
	(let (;; (lock    (obtain-dot-lock fname 1 5 10))
	      (db      (sqlite3:open-database fname)))
	  (sqlite3:set-busy-handler! db (make-busy-timeout 136000))
	  ;; (db:set-sync db)
	  (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	  (sqlite3:execute db "PRAGMA synchronous = NORMAL;")
	  (if (not file-exists)
	      (begin
		(if (string-match "^/tmp/.*" fname) ;; this is a file in /tmp
		    (sqlite3:execute db "PRAGMA journal_mode=WAL;")
		    (print "Creating " fname " in NON-WAL mode."))
		(initproc db)))
	  ;; (release-dot-lock fname)
308
309
310
311
312
313
314

315
316
317
318
319




320
321
322
323
324
325
326
309
310
311
312
313
314
315
316
317
318
319


320
321
322
323
324
325
326
327
328
329
330







+



-
-
+
+
+
+







	;; (rundb  (dbr:dbstruct-rundb dbstruct))
	;; (inmem  (dbr:dbstruct-inmem dbstruct))
	;; (maindb (dbr:dbstruct-main  dbstruct))
	;; (refdb  (dbr:dbstruct-refdb dbstruct))
        (tmpdb   (dbr:dbstruct-tmpdb dbstruct))
	(mtdb    (dbr:dbstruct-mtdb dbstruct))
        (refndb  (dbr:dbstruct-refndb dbstruct))
	(start-t (current-seconds))
	;; (runid  (dbr:dbstruct-run-id dbstruct))
	)
    (debug:print-info 4 *default-log-port* "Syncing for run-id: " run-id)
    ;; (mutex-lock! *http-mutex*)
    (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)))
    (mutex-lock! *db-sync-mutex*)
    (db:sync-tables (db:sync-all-tables-list dbstruct) (cons *db-last-sync* "last_update") tmpdb refndb mtdb)
    (set! *db-last-sync* start-t)
    (mutex-unlock! *db-sync-mutex*)))
;;    (if (eq? run-id 0)
;;	;; runid equal to 0 is main.db
;;	(if maindb
;;	    (if (or (not (number? mtime))
;;		    (not (number? stime))
;;		    (> mtime stime)
;;		    force-sync)
369
370
371
372
373
374
375
376

377
378
379
380
381
382
383
373
374
375
376
377
378
379

380
381
382
383
384
385
386
387







-
+







;; 	  (dbr:dbstruct-localdb-set! dbstruct run-id #f)
;; 	  (dbr:dbstruct-inmem-set! dbstruct #f)))))

;; close all opened run-id dbs
(define (db:close-all dbstruct)
  (if (dbr:dbstruct? dbstruct)
      (begin
        (db:sync-touched dbstruct 0 force-sync: #t)
        ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server.
        (let ((tdb (db:dbdat-get-db (dbr:dbstruct-tmpdb  dbstruct)))
              (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb   dbstruct)))
              (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct))))
          (if tdb (sqlite3:finalize! tdb))
          (if mdb (sqlite3:finalize! mdb))
          (if rdb (sqlite3:finalize! rdb))))))
  
3153
3154
3155
3156
3157
3158
3159


3160
3161
3162
3163
3164
3165
3166
3167
3168








3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179

3180
3181
3182
3183
3184
3185
3186
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170




3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188

3189
3190
3191
3192
3193
3194
3195
3196







+
+





-
-
-
-
+
+
+
+
+
+
+
+










-
+







		(regexp "_") "=" msg #t)))
	   (lambda ()(deserialize)))
	 (begin
	   (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.")
	   msg))) ;; crude reply for when things go awry
    ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize))))
    (else msg))) ;; rpc

;; This is to be the big daddy call

(define (db:test-set-status-state dbstruct run-id test-id status state msg)
  (let ((dbdat  (db:get-db dbstruct run-id)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbdat 'set-test-start-time (list test-id)))
    (if msg
	(db:general-call dbdat 'state-status-msg (list state status msg test-id))
	(db:general-call dbdat 'state-status     (list state status test-id)))
     (mt:process-triggers run-id test-id state status)))
    ;; (if msg
    ;; 	(db:general-call dbdat 'state-status-msg (list state status msg test-id))
    ;; 	(db:general-call dbdat 'state-status     (list state status test-id)))
    (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg)
    ;; process the test_data table
    (if (and test-id state status (equal? status "AUTO")) 
	(db:test-data-rollup dbstruct run-id test-id status))
    (mt:process-triggers run-id test-id state status)))

;; state is the priority rollup of all states
;; status is the priority rollup of all completed states
;;
;; if test-name is an integer work off that instead of test-name test-path
;;
(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
  ;; establish info on incoming test followed by info on top level test
  (let* ((db           (db:dbdat-get-db (dbr:dbstruct-tmpdb dbstruct)))
	 (testdat      (if (number? test-name)
			   (db:get-test-info-by-id dbstruct run-id test-name)
			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
			   (db:get-test-info       dbstruct run-id test-name item-path)))
	 (test-id      (db:test-get-id testdat))
	 (test-name    (if (number? test-name)
			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
         (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
3205
3206
3207
3208
3209
3210
3211
3212

3213
3214
3215
3216
3217
3218
3219
3215
3216
3217
3218
3219
3220
3221

3222
3223
3224
3225
3226
3227
3228
3229







-
+







						   statuses)
					     statuses)))
				      *common:std-statuses* >))
		  (newstate          (if (null? all-curr-states) "NOT_STARTED" (car all-curr-states)))
		  (newstatus         (if (null? all-curr-statuses) "n/a" (car all-curr-statuses))))
	     ;; (print "Setting toplevel to: " newstate "/" newstatus)
	     (db:test-set-state-status-by-id dbstruct run-id tl-test-id newstate newstatus #f)))))))
        

(define db:roll-up-pass-fail-counts db:set-state-status-and-roll-up-items)

;; call with state = #f to roll up with out accounting for state/status of this item
;;
;;    (define (db:roll-up-pass-fail-counts dbstruct run-id test-name item-path state status)
;;      (if (not (equal? item-path "")) ;; if asked to do this for a specific item then do an incremental update
;;          (let* ((dbdat         (db:get-db dbstruct run-id))
3313
3314
3315
3316
3317
3318
3319
3320

3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334

3335
3336
3337
3338
3339
3340
3341
3323
3324
3325
3326
3327
3328
3329

3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342


3343
3344
3345
3346
3347
3348
3349
3350







-
+












-
-
+







                                      (SELECT status FROM tests WHERE id=?) NOT IN ('WARN','FAIL')
                                    THEN 'PASS'
                                    ELSE status
                                    END WHERE id=?;") ;; DONE
	'(test-set-log            "UPDATE tests SET final_logf=? WHERE id=?;")      ;; DONE
	;; '(test-set-rundir-by-test-id "UPDATE tests SET rundir=? WHERE id=?")        ;; DONE
	;; '(test-set-rundir         "UPDATE tests SET rundir=? AND testname=? AND item_path=?;") ;; DONE
	'(test-set-rundir-shortdir "UPDATE tests SET rundir=?,shortdir=? WHERE testname=? AND item_path=?;")    ;; BROKEN!!! NEEDS run-id
	'(test-set-rundir-shortdir "UPDATE tests SET rundir=?,shortdir=? WHERE testname=? AND item_path=? AND run_id=?;")    ;; BROKEN!!! NEEDS run-id
	'(delete-tests-in-state   ;; "DELETE FROM tests WHERE state=?;")                  ;; DONE
	  "UPDATE tests SET state='DELETED' WHERE state=?")
	'(tests:test-set-toplog   "UPDATE tests SET final_logf=? WHERE run_id=? AND testname=? AND item_path='';")
	'(update-cpuload-diskfree "UPDATE tests SET cpuload=?,diskfree=? WHERE id=?;") ;; DONE
	'(update-uname-host       "UPDATE tests SET uname=?,host=? WHERE id=?;")       ;; DONE
	'(update-test-state       "UPDATE tests SET state=? WHERE state=? AND run_id=? AND testname=? AND NOT (item_path='' AND testname IN (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));")
	'(update-test-status      "UPDATE tests SET status=? WHERE status like ? AND run_id=? AND testname=? AND NOT (item_path='' AND testname IN (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));")
	;; stuff for roll-up-pass-fail-counts
	'(update-pass-fail-counts "UPDATE tests 
             SET fail_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('FAIL','CHECK','INCOMPLETE','ABORT')),
                 pass_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('PASS','WARN','WAIVED'))
             WHERE testname=? AND item_path='' AND run_id=?;") ;; DONE  ;; BROKEN!!! NEEDS run-id
	'(top-test-set          "UPDATE tests SET state=? WHERE testname=? AND item_path='';") ;; DONE   ;; BROKEN!!! NEEDS run-id
	'(top-test-set-running  "UPDATE tests SET state='RUNNING' WHERE testname=? AND item_path='';") ;; DONE   ;; BROKEN!!! NEEDS run-id
	'(top-test-set-running  "UPDATE tests SET state='RUNNING' WHERE testname=? AND item_path='' AND run_id=?;") ;; DONE   ;; BROKEN!!! NEEDS run-id


	;; Might be the following top-test-set-per-pf-counts query could be better based off of something like this:
	;;
	;; select state,status,count(state) from tests where run_id=59 AND testname='runfirst' group by state,status;
	;;
	'(top-test-set-per-pf-counts "UPDATE tests
3439
3440
3441
3442
3443
3444
3445
3446

3447
3448
3449
3450
3451


3452
3453
3454
3455
3456
3457
3458
3448
3449
3450
3451
3452
3453
3454

3455
3456
3457
3458


3459
3460
3461
3462
3463
3464
3465
3466
3467







-
+



-
-
+
+







			       immediate
			       flush
			       sync
			       set-verbosity
			       killserver
			       ))

(define (db:login dbstruct calling-path calling-version run-id client-signature)
(define (db:login dbstruct calling-path calling-version client-signature)
  (cond 
   ((not (equal? calling-path *toppath*))
    (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*))
   ((not (equal? *run-id* run-id))
    (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*))
   ;; ((not (equal? *run-id* run-id))
   ;;  (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*))
   ((not (equal? megatest-version calling-version))
    (list #f "Login failed due to mismatch megatest version: " calling-version ", " megatest-version))
   (else
    (hash-table-set! *logged-in-clients* client-signature (current-seconds))
    '(#t "successful login"))))

(define (db:general-call dbdat stmtname params)

Modified http-transport.scm from [d70882fd0b] to [9e8b14e328].

109
110
111
112
113
114
115
116

117
118
119
120
121
122
123
109
110
111
112
113
114
115

116
117
118
119
120
121
122
123







-
+







    (http-transport:try-start-server run-id ipaddrstr start-port server-id)))

;; This is recursively run by http-transport:run until sucessful
;;
(define (http-transport:try-start-server run-id ipaddrstr portnum server-id)
  (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
	(tdbdat          (tasks:open-db)))
    (debug:print-info 0 *default-log-port* "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname)
    (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname)
    (handle-exceptions
     exn
     (begin
       (print-error-message exn)
       (if (< portnum 64000)
	   (begin 
	     (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...")
258
259
260
261
262
263
264
265

266
267
268
269
270
271
272
258
259
260
261
262
263
264

265
266
267
268
269
270
271
272







-
+







					 (db:string->obj 
					  (handle-exceptions
					   exn
					   (begin
					     (set! success #f)
					     (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
					     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
					     (hash-table-delete! *runremote* run-id)
					     (set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id)
					     ;; Killing associated server to allow clean retry.")
					     ;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
					     (mutex-unlock! *http-mutex*)
					     ;;; (signal (make-composite-condition
					     ;;;          (make-property-condition 'commfail 'message "failed to connect to server")))
					     ;;; "communications failed"
					     (db:obj->string #f))
305
306
307
308
309
310
311
312

313
314
315
316
317
318
319
305
306
307
308
309
310
311

312
313
314
315
316
317
318
319







-
+







		      (make-property-condition 
		       'timeout
		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))

;; careful closing of connections stored in *runremote*
;;
(define (http-transport:close-connections run-id)
  (let* ((server-dat (hash-table-ref/default *runremote* run-id #f)))
  (let* ((server-dat *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f)))
    (if (vector? server-dat)
	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
	  (close-connection! api-dat)
	  #t)
	#f)))


396
397
398
399
400
401
402


403

404
405
406
407
408
409
410
396
397
398
399
400
401
402
403
404

405
406
407
408
409
410
411
412







+
+
-
+








      ;; Use this opportunity to sync the tmp db to megatest.db
      (if *dbstruct-db* 
	  (let ((start-time (current-milliseconds))
		(sync-time  #f)
		(rem-time   #f))
	    (condition-case
	     ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned))
	     ;;	      (> (- (current-seconds) *db-last-sync*) 5)) ;; if not currently being synced nor recently synced
	     (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)
	     (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) ;; usually done in the watchdog, not here.
	     ((sync-failed)(cond
			    ((> bad-sync-count 10) ;; time to give up
			     (http-transport:server-shutdown server-id port))
			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
			     (thread-sleep! 5)
			     (loop count server-state (+ bad-sync-count 1)))))
	     ((exn)
426
427
428
429
430
431
432
433


434
435
436
437
438
439
440
428
429
430
431
432
433
434

435
436
437
438
439
440
441
442
443







-
+
+







	  (if (eq? server-state 'available)
	      (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers
		(if (equal? new-server-id server-id)
		    (begin
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
		      (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
		      (set! *dbstruct-db*  (db:setup)) ;;  run-id))
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
		      (server:write-dotserver *toppath* (conc iface ":" port)))
		    (begin ;; gotta exit nicely
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
		      (http-transport:server-shutdown server-id port))))))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
483
484
485
486
487
488
489
490

491
492
493
494
495
496
497
486
487
488
489
490
491
492

493
494
495
496
497
498
499
500







-
+







	      (loop 0 server-state bad-sync-count))
	    (http-transport:server-shutdown server-id port))))))
  
(define (http-transport:server-shutdown server-id port)
  (let ((tdbdat (tasks:open-db)))
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server.")
    ;; need to delete only *my* server entry (future use)
    (if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t))
    ;; (if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)) ;; handled in the watchdog only
    (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 5)
508
509
510
511
512
513
514


515
516
517
518
519
520
521
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526







+
+







		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
    ;; if the .server file contained :myport then we can remove it
    (server:remove-dotserver-file *toppath* port)
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)

Modified launch.scm from [2ea5c9f7cf] to [8b28cbf454].

894
895
896
897
898
899
900
901

902
903
904
905
906
907
908
894
895
896
897
898
899
900

901
902
903
904
905
906
907
908







-
+







	 (lnkbase   (conc linktree "/" target "/" runname))
	 (lnkpath   (conc lnkbase "/" testname))
	 (lnkpathf  (conc lnkpath (if not-iterated "" "/") item-path))
	 (lnktarget (conc lnkpath "/" item-path)))

    ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical
    ;;                                                 rundir   shortdir
    (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path)
    (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id)

    (debug:print 2 *default-log-port* "INFO:\n       lnkbase=" lnkbase "\n       lnkpath=" lnkpath "\n  toptest-path=" toptest-path "\n     test-path=" test-path)
    (if (not (file-exists? linktree))
	(begin
	  (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree)
	  (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree))))
    ;; create the directory for the tests dir links, this is needed no matter what...
967
968
969
970
971
972
973
974

975
976
977
978
979
980
981
967
968
969
970
971
972
973

974
975
976
977
978
979
980
981







-
+







	  (hash-table-set! *toptest-paths* testname curr-test-path)
	  ;; NB// Was this for the test or for the parent in an iterated test?
	  (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath 
			    (if (file-exists? lnkpath)
				;; (resolve-pathname lnkpath)
				(common:nice-path lnkpath)
				lnkpath)
			    testname "")
			    testname "" run-id)
	  ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path)
	  (if (or (not curr-test-path)
		  (not (directory-exists? toptest-path)))
	      (begin
		(debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath)
		(handle-exceptions
		 exn

Modified megatest.scm from [bb00f9be7d] to [db7e42e4e9].

341
342
343
344
345
346
347
348

349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
341
342
343
344
345
346
347

348































349
350
351
352
353
354
355







-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







;; immediately set MT_TARGET if -reqtarg or -target are available
;;
(let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
  (if targ (setenv "MT_TARGET" targ)))

;; The watchdog is to keep an eye on things like db sync etc.
;;
(define *watchdog*
(define *watchdog* (make-thread common:watchdog "Watchdog thread"))
  (make-thread 
   (lambda ()
     (thread-sleep! 0.05) ;; delay for startup
     (let ((legacy-sync (common:legacy-sync-required))
	   (debug-mode  (debug:debug-mode 1))
	   (last-time   (current-seconds)))
       (if (common:legacy-sync-recommended)
	   (let loop ()
	     ;; sync for filesystem local db writes
	     ;;
             (let ((start-time   (current-seconds)))
               ;; disabling for now (if legacy-sync (common:sync-to-megatest.db #f))
	       (if (and debug-mode
			(> (- start-time last-time) 60))
		   (begin
		     (set! last-time start-time)
		     (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
	     
	     ;; keep going unless time to exit
	     ;;
	     (if (not *time-to-exit*)
		 (let delay-loop ((count 0))
		   (if (and (not *time-to-exit*)
			    (< count 11)) ;; aprox 5-6 seconds
		       (begin
			 (thread-sleep! 1)
			 (delay-loop (+ count 1))))
		   (loop)))
	     (if (common:low-noise-print 30)
		 (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*)))))
     "Watchdog thread")))

(thread-start! *watchdog*)

(if (args:get-arg "-log")
    (let ((oup (open-output-file (args:get-arg "-log"))))
      (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log"))
      (set! *default-log-port* oup)))
671
672
673
674
675
676
677
678

679
680

681
682
683
684
685
686
687
640
641
642
643
644
645
646

647
648

649
650
651
652
653
654
655
656







-
+

-
+







		(else
		 (pp data))))))
      (if out-file (close-output-port out-port))
      (exit) ;; yes, bending the rules here - need to exit since this is a utility
      ))

(if (args:get-arg "-ping")
    (let* ((run-id        (string->number (args:get-arg "-run-id")))
    (let* (;; (run-id        (string->number (args:get-arg "-run-id")))
	   (host:port     (args:get-arg "-ping")))
      (server:ping run-id host:port)))
      (server:ping host:port)))

;;======================================================================
;; Capture, save and manipulate environments
;;======================================================================

;; NOTE: Keep these above the section where the server or client code is setup

2008
2009
2010
2011
2012
2013
2014
2015

2016
2017
2018
2019
2020
2021
2022
1977
1978
1979
1980
1981
1982
1983

1984
1985
1986
1987
1988
1989
1990
1991







-
+







          (debug:print 0 *default-log-port* "Failed to create HTML output in " toppath "/lt/runs-index.html"))
      (set! *didsomething* #t)))

;;======================================================================
;; Exit and clean up
;;======================================================================

(if *runremote* (close-all-connections!))
(if *runremote* (close-all-connections!)) ;; for http-client

(if (not *didsomething*)
    (debug:print 0 *default-log-port* help))

(set! *time-to-exit* #t)
(thread-join! *watchdog*)

Modified rmt.scm from [ea574182b1] to [2858b3ea4e].

38
39
40
41
42
43
44
45

46
47
48
49
50
51
52
53
54
55
56
57
58













59
60
61
62
63
64
65
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78







-
+













+
+
+
+
+
+
+
+
+
+
+
+
+







;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
(define (rmt:get-connection-info run-id)
  (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
  (let ((cinfo *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f)))
    (if cinfo
	cinfo
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id)
	    #f))))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; clean out old connections
  ;; (mutex-lock! *db-multi-sync-mutex*)

  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote*
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access

  (if (and #f  ;; FORCE NO GO FOR RIGHT NOW
	   (not *runremote*)                         ;; we trust *runremote* to reflect that a server was found previously
	   (not (member cmd api:read-only-queries))) ;; we don't trust so much the list of write queries
      (let ((serverconn (server:check-if-running *toppath*)))
	(if serverconn
	    (set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
	    (server:kind-run *toppath*))))
  
  (rmt:open-qry-close-locally cmd (if rid rid 0) params))

;;   (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
;;     (for-each 
;;      (lambda (run-id)
;;        (let ((connection (hash-table-ref/default *runremote* run-id #f)))
;;          (if (and (vector? connection)
215
216
217
218
219
220
221
222
223
224

225
226
227
228
229
230
231
232
233
228
229
230
231
232
233
234



235


236
237
238
239
240
241
242







-
-
-
+
-
-







			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))

(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
  (let* ((qry-is-write   (not (member cmd api:read-only-queries)))
	 (db-file-path   (db:dbfile-path)) ;;  0))
	 (dbstruct-local (if *dbstruct-db*
			     *dbstruct-db*
			     (let* ((dbstruct (db:setup))) ;; make-dbr:dbstruct path:  dbdir local: #t)))
	 (dbstruct-local (db:setup))  ;; make-dbr:dbstruct path:  dbdir local: #t)))
			       (set! *dbstruct-db* dbstruct)
			       dbstruct)))
	 (read-only      (not (file-write-access? db-file-path)))
	 (start          (current-milliseconds))
	 (resdat         (if (not (and read-only qry-is-write))
			     (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))
			     (vector #t '())))
	 (success        (vector-ref resdat 0))
	 (res            (vector-ref resdat 1))
241
242
243
244
245
246
247
248

249
250
251
252
253
254
255
250
251
252
253
254
255
256

257
258
259
260
261
262
263
264







-
+







	      (thread-sleep! (/ (random 5000) 1000)) ;; some random delay 
	      (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1)))
	    (begin
	      (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up")
	      #f))
	(begin
	  ;; (rmt:update-db-stats run-id cmd params duration)
	  ;; mark this run as dirty if this was a write
	  ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
	  (if qry-is-write
	      (let ((start-time (current-seconds)))
		(mutex-lock! *db-multi-sync-mutex*)
		(hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write"
                (mutex-unlock! *db-multi-sync-mutex*)))))
    res))

302
303
304
305
306
307
308
309
310
311



312
313
314
315
316
317
318
311
312
313
314
315
316
317



318
319
320
321
322
323
324
325
326
327







-
-
-
+
+
+








(define (rmt:login run-id)
  (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
(define (rmt:login-no-auto-client-setup connection-info run-id)
  (case *transport-type*
    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))
(define (rmt:login-no-auto-client-setup connection-info)
  (case *transport-type* ;; run-id of 0 is just a placeholder
    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*)))
    ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
    ))

;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible 
;;
(define (rmt:general-call stmtname run-id . params)

Modified server.scm from [19061b35b0] to [7b40e2c3b9].

102
103
104
105
106
107
108
109

110
111

112
113
114

115
116

117
118
119

120


121
122
123
124


125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

158
159
160
161
162
163
164
165
166
167
168
169


170
171
172
173
174


175
176
177
178
179
180
181
182
183
184
185
186
187
188










189
190
191
192
193
194






































195
196
197

198
199
200

201
202
203
204
205
206
207
208
209



210
211
212



213

214
215
216
217
218
219

220
221

222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242














243
244
245
246

247
248

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

266
267
268

269
270
271
272
273
274
275
276
277
278
279
102
103
104
105
106
107
108

109
110
111
112
113
114

115
116

117
118
119

120

121
122
123



124
125

















126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

142
143
144
145
146
147
148
149
150
151
152


153
154
155
156
157


158
159
160
161
162
163
164
165
166
167






168
169
170
171
172
173
174
175
176
177






178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215



216



217

218







219
220
221
222
223
224
225
226
227

228
229
230
231
232
233

234


235
236




















237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253

254
255

256
257
258
259
260
261
262
263
264
265
266
267
268
269
270



271



272
273
274
275
276
277
278
279
280
281
282
283







-
+


+


-
+

-
+


-
+
-
+
+

-
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+















-
+










-
-
+
+



-
-
+
+








-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
-
-
-
+
-

-
-
-
-
-
-
-
+
+
+



+
+
+
-
+





-
+
-
-
+

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+



-
+

-
+














-
-
-
+
-
-
-
+











     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run run-id)
(define  (server:run areapath) ;; areapath is ignored for now.
  (let* ((curr-host   (get-host-name))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (target-host (configf:lookup *configdat* "server" "homehost" ))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc *toppath* "/logs/" run-id ".log"))
	 (logfile     (conc *toppath* "/logs/server-" curr-pid ".log"))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") " -run-id " run-id (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
		      " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
									      (conc " -daemonize -log " logfile)
									      "")
		      " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &")))))
		      " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &")))))
    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory *toppath*)
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    
    ;; Rotate logs, logic: 
    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
    (thread-start! log-rotate)
    ;;                 if > 500k and older than 1 week:
    ;;                     remove previous compressed log and compress this log
    ;;
    (directory-fold 
     (lambda (file rem)
       (if (and (string-match "^.*.log" file)
		(> (file-size (conc "logs/" file)) 200000))
	   (let ((gzfile (conc "logs/" file ".gz")))
	     (if (file-exists? gzfile)
		 (begin
		   (debug:print-info 0 *default-log-port* "removing " gzfile)
		   (delete-file gzfile)))
	     (debug:print-info 0 *default-log-port* "compressing " file)
	     (system (conc "gzip logs/" file)))))
     '()
     "logs")
    

    ;; host.domain.tld match host?
    (if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
    
    (setenv "TARGETHOST_LOGF" logfile)
    (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    ;; (system cmdln)
    (thread-join! log-rotate)
    (pop-directory)))

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
	(set! *my-client-signature* sig)
	*my-client-signature*)))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
(define (server:kind-run run-id)
  (let ((last-run-time (hash-table-ref/default *server-kind-run* run-id #f)))
(define (server:kind-run areapath)
  (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f)))
    (if (or (not last-run-time)
	    (> (- (current-seconds) last-run-time) 30))
	(begin
	  (server:run run-id)
	  (hash-table-set! *server-kind-run* run-id (current-seconds))))))
	  (server:run areapath)
	  (hash-table-set! *server-kind-run* areapath (current-seconds))))))

;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
;; 
(define (server:try-running run-id)
  (if (eq? run-id 0)
      (server:run run-id)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id)
  (let ((tdbdat (tasks:open-db)))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set *runremote*. this needs to be changed
(define (server:read-dotserver areapath)
  (let ((dotfile (conc areapath "/.server")))
    (if (and (file-exists? dotfile)
	     (file-read-access? dotfile))
	(with-input-from-file
	    dotfile
	  (lambda ()
	    (read-line)))
	#f)))

	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (case *transport-type*
		     ((http)(server:ping-server run-id 
;; write a .server file in *toppath* with hostport
;; return #t on success, #f otherwise
;;
(define (server:write-dotserver areapath hostport)
  (let ((lock-file   (conc areapath "/.server.lock"))
	(server-file (conc areapath "/.server")))
    (if (common:simple-file-lock lock-file)
	(let ((res (handle-exceptions
		    exn
		    #f ;; failed for some reason, for the moment simply return #f
		    (with-output-to-file server-file
		      (lambda ()
			(print hostport)))
		    #t)))
	  (common:simple-file-release-lock lock-file)
	  res)
	#f)))

(define (server:remove-dotserver-file areapath hostport)
  (let ((dotserver   (server:read-dotserver areapath))
	(server-file (conc areapath "/.server"))
	(lock-file   (conc areapath "/.server.lock")))
    (if (string-match (conc ".*:" hostport "$") dotserver) ;; port matches, good enough info to decide to remove the file
	(if (common:simple-file-lock lock-file)
	    (begin
	      (handle-exceptions
	       exn
	       #f
	       (delete-file* server-file))
	      (common:simple-file-release-lock lock-file))))))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath)
  (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db)))
    (if dotserver
	(let* ((res (case *transport-type*
		      ((http)(server:ping-server dotserver))
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
		     ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
		      ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
		     ;;    			 (tasks:hostinfo-get-port      server)
		     ;;    			 timeout: 2))
                     )))
		      )))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 *default-log-port* "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f))))
	      dotserver
	      #f))
	#f)))

;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;;       in the same process as the server.
;;
(define (server:ping run-id host:port)
(define (server:ping host:port)
  (let ((tdbdat (tasks:open-db)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup))
	   (toppath       (launch:setup)))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)
      (if (not host-port)
	  (begin
	    (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n")
	    (print "ERROR: No run-id")
	    (exit 1))
	  (if (and (not host-port)
		   (not server-db-dat))
	      (begin
		(print "ERROR: bad host:port")
		(exit 1))
	      (let* ((iface      (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat)))
		     (port       (if host-port (cadr host-port)(tasks:hostinfo-get-port      server-db-dat)))
		     (server-dat (http-transport:client-connect iface port))
		     (login-res  (rmt:login-no-auto-client-setup server-dat run-id)))
		(if (and (list? login-res)
			 (car login-res))
		    (begin
		      (print "LOGIN_OK")
		      (exit 0))
		    (begin
		      (print "LOGIN_FAILED")
		      (exit 1)))))))))
	    (print "ERROR: bad host:port")
	    (exit 1))
	  (let* ((iface      (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat)))
		 (port       (if host-port (cadr host-port)(tasks:hostinfo-get-port      server-db-dat)))
		 (server-dat (http-transport:client-connect iface port))
		 (login-res  (rmt:login-no-auto-client-setup server-dat)))
	    (if (and (list? login-res)
		     (car login-res))
		(begin
		  (print "LOGIN_OK")
		  (exit 0))
		(begin
		  (print "LOGIN_FAILED")
		  (exit 1))))))))

;; run ping in separate process, safest way in some cases
;;
(define (server:ping-server run-id iface port)
(define (server:ping-server ifaceport)
  (with-input-from-pipe 
   (conc (common:get-megatest-exe) " -run-id " run-id " -ping " (conc iface ":" port))
   (conc (common:get-megatest-exe) " -ping " ifaceport)
   (lambda ()
     (let loop ((inl (read-line))
		(res "NOREPLY"))
       (if (eof-object? inl)
	   (case (string->symbol res)
	     ((NOREPLY)  #f)
	     ((LOGIN_OK) #t)
	     (else       #f))
	   (loop (read-line) inl))))))

(define (server:login toppath)
  (lambda (toppath)
    (set! *last-db-access* (current-seconds))
    (if (equal? *toppath* toppath)
	(begin
	  ;; (debug:print-info 2 *default-log-port* "login successful")
	  #t)
	#t
	(begin
	  ;; (debug:print-info 2 *default-log-port* "login failed")
	  #f))))
	#f)))

(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	(* 60 1)         ;; default to one minute
	;; (* 60 60 25)      ;; default to 25 hours
	)))

Modified tests.scm from [84fbf4d5d8] to [8ec0971889].

395
396
397
398
399
400
401
402

403
404
405
406
407
408


409
410
411
412
413
414
415
395
396
397
398
399
400
401

402
403
404
405
406


407
408
409
410
411
412
413
414
415







-
+




-
-
+
+








    (debug:print 4 *default-log-port* "real-status " real-status ", waived " waived ", status " status)

    ;; update the primary record IF state AND status are defined
    (if (and state status)
	(begin
	  (rmt:test-set-status-state run-id test-id real-status state (if waived waived comment))
	  (mt:process-triggers run-id test-id state real-status)
	  ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-status-state
	  ))
    
    ;; if status is "AUTO" then call rollup (note, this one modifies data in test
    ;; run area, it does remote calls under the hood.
    (if (and test-id state status (equal? status "AUTO")) 
	(rmt:test-data-rollup run-id test-id status))
    ;; (if (and test-id state status (equal? status "AUTO")) 
    ;; 	(rmt:test-data-rollup run-id test-id status))

    ;; add metadata (need to do this way to avoid SQL injection issues)

    ;; :first_err
    ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f)))
    ;;   (if val
    ;;       (sqlite3:execute db "UPDATE tests SET first_err=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path)))
479
480
481
482
483
484
485
486
487
488

489
490
491
492
493
494
495
479
480
481
482
483
484
485

486

487
488
489
490
491
492
493
494







-

-
+







	    force)
	(let ((my-start-time (current-seconds))
	      (lockf         (conc outputfilename ".lock")))
	  (let loop ((have-lock  (common:simple-file-lock lockf)))
	    (if have-lock
		(let ((script (configf:lookup *configdat* "testrollup" test-name)))
		  (print "Obtained lock for " outputfilename)
		  ;; (rmt:top-test-set-per-pf-counts run-id test-name)
		  (rmt:roll-up-pass-fail-counts run-id test-name "" #f #f #f)
		  (rmt:top-test-set-per-pf-counts run-id test-name)
		  ;; (rmt:test-set-status-state run-id test-name #f #f #f) ;; (rmt:top-test-set-per-pf-counts run-id test-name)
		  (if script
		      (system (conc script " > " outputfilename " & "))
		      (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename))
		  (common:simple-file-release-lock lockf)
		  (change-directory orig-dir)
		  ;; NB// tests:test-set-toplog! is remote internal...
		  (tests:test-set-toplog! run-id test-name outputfilename))