Megatest

Check-in [0b3a6d8aa9]
Login
Overview
Comment:Merged v1.60 changes into try-nanomsg
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: 0b3a6d8aa92b0c1bee6199cd6a43aff8c5eb60a5
User & Date: matt on 2014-11-26 08:59:53
Other Links: branch diff | manifest | tags
Context
2014-11-26
16:09
Added debug support in newdashboard check-in: b79afa463f user: matt tags: try-nanomsg
08:59
Merged v1.60 changes into try-nanomsg check-in: 0b3a6d8aa9 user: matt tags: try-nanomsg
2014-11-25
21:10
Add big delay and take a break when system is clearly overloaded. check-in: ec50f4ac00 user: matt tags: v1.60
2014-11-24
23:36
several little fixes check-in: 308a5d65b6 user: matt tags: try-nanomsg
Changes

Modified db.scm from [f9ca78a55d] to [75e6e604f1].

193
194
195
196
197
198
199
200

201
202
203
204
205
206
207
193
194
195
196
197
198
199

200
201
202
203
204
205
206
207







-
+







    (if (or rdb
	    do-not-open)
	rdb
	(let* ((dbpath       (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db"))
	       (dbexists     (file-exists? dbpath))
	       (inmem        (if local #f (db:open-inmem-db)))
	       (refdb        (if local #f (db:open-inmem-db)))
	       (db           (db:lock-create-open dbpath 
	       (db           (db:lock-create-open dbpath ;; this is the database physically on disk
						  (lambda (db)
						    (handle-exceptions
						     exn
						     (begin
						       (release-dot-lock dbpath)
						       (if (> attemptnum 2)
							   (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath)
232
233
234
235
236
237
238

239
240

241
242



243
244
245
246
247
248
249
232
233
234
235
236
237
238
239
240

241
242
243
244
245
246
247
248
249
250
251
252
253







+

-
+


+
+
+







	  ;; (dbr:dbstruct-set-run-id! dbstruct run-id)
	  (if local
	      (begin
		(dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ...
		db)
	      (begin
		(dbr:dbstruct-set-inmem!  dbstruct inmem)
		(sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context
		(db:sync-tables db:sync-tests-only db inmem)
		(db:delay-if-busy dbpath: (db:dbdat-get-path refdb))
		(db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb))
		(dbr:dbstruct-set-refdb!  dbstruct refdb)
		(db:sync-tables db:sync-tests-only db refdb)
		;; sync once more to deal with delays
		(db:sync-tables db:sync-tests-only db inmem)
		(db:sync-tables db:sync-tests-only db refdb)
		inmem))))))

;; This routine creates the db. It is only called if the db is not already ls opened
;;
(define (db:open-main dbstruct) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let ((mdb (dbr:dbstruct-get-main dbstruct)))
    (if mdb
290
291
292
293
294
295
296

297
298
299
300
301
302
303
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308







+







	(inmem  (dbr:dbstruct-get-inmem dbstruct))
	(maindb (dbr:dbstruct-get-main  dbstruct))
	(refdb  (dbr:dbstruct-get-refdb dbstruct))
	(olddb  (dbr:dbstruct-get-olddb dbstruct))
	;; (runid  (dbr:dbstruct-get-run-id dbstruct))
	)
    (debug:print-info 4 "Syncing for run-id: " run-id)
    (mutex-lock! *http-mutex*)
    (if (eq? run-id 0)
	;; runid equal to 0 is main.db
	(if maindb
	    (if (or (not (number? mtime))
		    (not (number? stime))
		    (> mtime stime)
		    force-sync)
320
321
322
323
324
325
326

327


328

329
330
331
332
333
334
335
325
326
327
328
329
330
331
332
333
334
335

336
337
338
339
340
341
342
343







+

+
+
-
+







		(> mtime stime)
		force-sync)
	    (begin
	      (db:delay-if-busy rundb)
	      (db:delay-if-busy olddb)
	      (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
		(dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
		(mutex-unlock! *http-mutex*)
		num-synced)
	      (begin
		(mutex-unlock! *http-mutex*)
	      0)))))
		0))))))

(define (db:close-main dbstruct)
  (let ((maindb (dbr:dbstruct-get-main dbstruct)))
    (if maindb
	(begin
	  (sqlite3:finalize! (db:dbdat-get-db maindb))
	  (dbr:dbstruct-set-main! dbstruct #f)))))
649
650
651
652
653
654
655
656

657
658
659
660
661
662
663
664
665
666
667
668
669
670

671
672
673
674
675
676
677
657
658
659
660
661
662
663

664
665
666
667
668
669
670
671
672
673
674
675
676
677

678
679
680
681
682
683
684
685







-
+













-
+







	  (for-each 
	   (lambda (run-id)
	     (db:delay-if-busy mtdb)
	     (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))
		   (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	       (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db")
	       (db:replace-test-records dbstruct run-id testrecs)
	       (sqlite3:finalize! (dbr:dbstruct-get-rundb dbstruct))))
	       (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct)))))
	   run-ids)))

    ;; now ensure all newdb data are synced to megatest.db
    (if (member 'new2old options)
	(for-each
	 (lambda (run-id)
	   (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))
		  (frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
	     ;; (db:delay-if-busy frundb)
	     ;; (db:delay-if-busy mtdb)
	     (if (eq? run-id 0)
		 (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb)
		 (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb))))
	 run-ids))
	 (cons 0 run-ids)))
    ;; (db:close-all dbstruct)
    ;; (sqlite3:finalize! mdb)
    ))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
1942
1943
1944
1945
1946
1947
1948

1949
1950
1951




1952
1953
1954
1955
1956
1957
1958
1950
1951
1952
1953
1954
1955
1956
1957



1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968







+
-
-
-
+
+
+
+








(define db:test-record-qry-selector (string-intersperse db:test-record-fields ","))


;; NOTE: Use db:test-get* to access records
;; NOTE: This needs rundir decoding? Decide, decode here or where used? For the moment decode where used.
(define (db:get-all-tests-info-by-run-id dbstruct run-id)
  (let* ((dbdat (if (vector? dbstruct)
  (let ((dbdat (db:get-db dbstruct run-id))
	(db    (db:dbdat-get-db dbdat))
	(res '()))
		    (db:get-db dbstruct run-id)
		    dbstruct)) ;; still settling on when to use dbstruct or dbdat
	 (db    (db:dbdat-get-db dbdat))
	 (res '()))
    (db:delay-if-busy dbdat)
    (sqlite3:for-each-row
     (lambda (id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum)
       ;;                 0    1       2      3      4        5       6      7        8     9     10      11          12          13       14     15
       (set! res (cons (vector id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum)
		       res)))
     db

Modified http-transport.scm from [8c622aa861] to [445c83d4cb].

486
487
488
489
490
491
492
493

494
495
496
497
498
499
500
486
487
488
489
490
491
492

493
494
495
496
497
498
499
500







-
+







    (debug:print-info 0 "Average non-cached time   "
		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547

548
549
550
551
552
553
554
532
533
534
535
536
537
538








539
540
541
542
543
544
545
546
547







-
-
-
-
-
-
-
-

+







					  "-")
				      run-id
				      server-id)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))
	    ;; Database connection


	    ;; don't start the db here

	    ;; (set! *inmemdb*  (db:setup run-id))


	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)

Modified launch.scm from [e538ecba1a] to [34882953c1].

97
98
99
100
101
102
103

104


105
106
107
108
109
110
111
97
98
99
100
101
102
103
104

105
106
107
108
109
110
111
112
113







+
-
+
+







					    
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
		(begin
		(debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")))
		  (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")
		  (exit))))
	  
	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup-for-run force: #t))
	      (begin

Modified rmt.scm from [e6ea17e36c] to [2b271743b3].

87
88
89
90
91
92
93
94
95


96
97
98
99
100
101
102
103
104
105
106
107
108
109

110
111
112
113
114
115
116
87
88
89
90
91
92
93


94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

109
110
111
112
113
114
115
116







-
-
+
+













-
+







    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd jparams))
			  ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info cmd params))
			  (else  (exit))))
	       (res     (if (vector? dat) (vector-ref dat 1) #f))
	       (success (if (vector? dat) (vector-ref dat 0) #f)))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (case *transport-type* 
		((http)(db:string->obj res))
		((nmsg) res))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))

		(thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	(if (and (< attemptnum 10)
		 (tasks:need-server run-id))
	    (begin
	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (rmt:send-receive cmd rid params (+ attemptnum 1)))
	    (rmt:open-qry-close-locally cmd run-id params)))))

Modified runs.scm from [3205d6c7db] to [f2976f1213].

552
553
554
555
556
557
558
559

560
561
562


563
564
565
566
567
568
569
552
553
554
555
556
557
558

559
560
561
562
563
564
565
566
567
568
569
570
571







-
+



+
+







			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
			reruns)))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

     ((and (null? fails)
	   (null? prereq-fails)
	   (null? non-completed))
      (if  (runs:can-keep-running? hed 5)
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    (thread-sleep! 60)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
671
672
673
674
675
676
677

678
679
680







681
682
683
684
685
686
687
673
674
675
676
677
678
679
680



681
682
683
684
685
686
687
688
689
690
691
692
693
694







+
-
-
-
+
+
+
+
+
+
+







	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs
      (let register-loop ((numtries 15))
      (rmt:general-call 'register-test run-id run-id test-name item-path)
      (if (rmt:get-test-id run-id test-name item-path)
	  (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done))
	(rmt:general-call 'register-test run-id run-id test-name item-path)
	(thread-sleep! 0.5)
	(if (rmt:get-test-id run-id test-name item-path)
	    (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
	    (if (> numtries 0)
		(register-loop (- numtries 1))
		(debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path)))))
      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name "")
	    (if (rmt:get-test-id run-id test-name "")
		(hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))

Modified tasks.scm from [84082aa618] to [42971aa928].

165
166
167
168
169
170
171
172

173
174
175
176
177
178
179
165
166
167
168
169
170
171

172
173
174
175
176
177
178
179







-
+







(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
342
343
344
345
346
347
348









349
350
351
352
353
354
355
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364







+
+
+
+
+
+
+
+
+







  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

(define (tasks:server-running? mdb run-id)
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id)
    res))

(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
403
404
405
406
407
408
409

410
411
412
413
414
415
416
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426







+







(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    

Modified tests/fullrun/megatest.config from [558689506a] to [7518de8980].

131
132
133
134
135
136
137
138

139
140
141
142
143
144
145
131
132
133
134
135
136
137

138
139
140
141
142
143
144
145







-
+







# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.01
timeout 0.1

# Server is required - slower but more resistant to Sqlite issues.
required yes

# Start server when average query takes longer than this
# server-query-threshold 55500
server-query-threshold -1