Megatest

Check-in [3757f74464]
Login
Overview
Comment:Better error reporting (use stderr), faster registration of tests, bug fix in tasks path to db handling
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 3757f74464111e176863209a126d027b4d48c58e
User & Date: matt on 2014-11-16 20:02:13
Other Links: branch diff | manifest | tags
Context
2014-11-16
20:40
Use required mode by default in QA - for now check-in: 91923e2ee5 user: matt tags: v1.60
20:02
Better error reporting (use stderr), faster registration of tests, bug fix in tasks path to db handling check-in: 3757f74464 user: matt tags: v1.60
17:37
Added channel for result codes to http communication. Fixed some unit tests check-in: 98ffd649ba user: matt tags: v1.60
Changes

Modified common.scm from [fba3cc219f] to [afd51ef265].

158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
(define (common:read-encoded-string instr)
  (handle-exceptions
   exn
   (handle-exceptions
    exn
    (begin
      (debug:print 0 "ERROR: received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn))
      (print-call-chain)
      #f)
    (read (open-input-string (base64:base64-decode instr))))
   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))

;;======================================================================
;; S T A T E S   A N D   S T A T U S E S
;;======================================================================







|







158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
(define (common:read-encoded-string instr)
  (handle-exceptions
   exn
   (handle-exceptions
    exn
    (begin
      (debug:print 0 "ERROR: received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn))
      (print-call-chain (current-error-port))
      #f)
    (read (open-input-string (base64:base64-decode instr))))
   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))

;;======================================================================
;; S T A T E S   A N D   S T A T U S E S
;;======================================================================

Modified datashare.scm from [b4dde544c7] to [2abd8aec1c].

253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
       ((busy)
        (thread-sleep! sleep-time))
       (else
        (print "EXCEPTION: database overloaded or unreadable.")
        (print " message: " ((condition-property-accessor 'exn 'message) exn))
        (print "exn=" (condition->list exn))
        (print " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
        (print-call-chain)
        (thread-sleep! sleep-time)
        (print "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")))
     (apply open-run-close-exception-handling proc idb params))
   (apply open-run-close-no-exception-handling proc idb params)))

(define (open-run-close-no-exception-handling  proc idb . params)
  ;; (print "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)







|







253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
       ((busy)
        (thread-sleep! sleep-time))
       (else
        (print "EXCEPTION: database overloaded or unreadable.")
        (print " message: " ((condition-property-accessor 'exn 'message) exn))
        (print "exn=" (condition->list exn))
        (print " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
        (print-call-chain (current-error-port))
        (thread-sleep! sleep-time)
        (print "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")))
     (apply open-run-close-exception-handling proc idb params))
   (apply open-run-close-no-exception-handling proc idb params)))

(define (open-run-close-no-exception-handling  proc idb . params)
  ;; (print "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)

Modified db.scm from [15f2be2e7b] to [658953f833].

327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
	     (sqlite3:database? rundb))
	(handle-exceptions
	 exn
	 (begin 
	   (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " db: " rundb)
	   (print-call-chain)
	   #f)
	 (sqlite3:interrupt! rundb)
	 (sqlite3:finalize! rundb #t))))
  ;; (mutex-unlock! *db-sync-mutex*)
  )

(define (db:open-inmem-db)







|







327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
	     (sqlite3:database? rundb))
	(handle-exceptions
	 exn
	 (begin 
	   (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " db: " rundb)
	   (print-call-chain (current-error-port))
	   #f)
	 (sqlite3:interrupt! rundb)
	 (sqlite3:finalize! rundb #t))))
  ;; (mutex-unlock! *db-sync-mutex*)
  )

(define (db:open-inmem-db)
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (print "exn=" (condition->list exn))
     (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
     (debug:print 0 " src db:  " (db:dbdat-get-path fromdb))
     (for-each (lambda (dbdat)
		 (debug:print 0 " dbpath:  " (db:dbdat-get-path dbdat)))
	       (cons todb slave-dbs))
     (print-call-chain))
   (cond
    ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1)
    ((not todb)   (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2)
    ((not (sqlite3:database? (db:dbdat-get-db fromdb)))
     (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3)
    ((not (sqlite3:database? (db:dbdat-get-db todb)))
     (debug:print 0 "ERROR: db:sync-tables called with todb not a database " todb) -4)







|







434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (print "exn=" (condition->list exn))
     (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
     (debug:print 0 " src db:  " (db:dbdat-get-path fromdb))
     (for-each (lambda (dbdat)
		 (debug:print 0 " dbpath:  " (db:dbdat-get-path dbdat)))
	       (cons todb slave-dbs))
     (print-call-chain (current-error-port)))
   (cond
    ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1)
    ((not todb)   (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2)
    ((not (sqlite3:database? (db:dbdat-get-db fromdb)))
     (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3)
    ((not (sqlite3:database? (db:dbdat-get-db todb)))
     (debug:print 0 "ERROR: db:sync-tables called with todb not a database " todb) -4)
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
       ((busy)
	(thread-sleep! sleep-time))
       (else
	(debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
	(debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	(print "exn=" (condition->list exn))
	(debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	(print-call-chain)
	(thread-sleep! sleep-time)
	(debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")))
     (apply open-run-close-exception-handling proc idb params))
   (apply open-run-close-no-exception-handling proc idb params)))

;; (define open-run-close 
(define open-run-close open-run-close-exception-handling)







|







647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
       ((busy)
	(thread-sleep! sleep-time))
       (else
	(debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
	(debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	(print "exn=" (condition->list exn))
	(debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	(print-call-chain (current-error-port))
	(thread-sleep! sleep-time)
	(debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")))
     (apply open-run-close-exception-handling proc idb params))
   (apply open-run-close-no-exception-handling proc idb params)))

;; (define open-run-close 
(define open-run-close open-run-close-exception-handling)
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
;; i.e. these lists define what to NOT show.
;; states and statuses are required to be lists, empty is ok
;; not-in #t = above behaviour, #f = must match
(define (db:get-tests-for-run dbstruct run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals)
  (if (not (number? run-id))
      (begin ;; no need to treat this as an error by default
	(debug:print 4 "WARNING: call to db:get-tests-for-run with bad run-id=" run-id)
	;; (print-call-chain)
	'())
      (let* ((qryvalstr       (case qryvals
				((shortlist) "id,run_id,testname,item_path,state,status")
				((#f)        db:test-record-qry-selector) ;; "id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment")
				(else        qryvals)))
	     (res            '())
	     ;; if states or statuses are null then assume match all when not-in is false







|







1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
;; i.e. these lists define what to NOT show.
;; states and statuses are required to be lists, empty is ok
;; not-in #t = above behaviour, #f = must match
(define (db:get-tests-for-run dbstruct run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals)
  (if (not (number? run-id))
      (begin ;; no need to treat this as an error by default
	(debug:print 4 "WARNING: call to db:get-tests-for-run with bad run-id=" run-id)
	;; (print-call-chain (current-error-port))
	'())
      (let* ((qryvalstr       (case qryvals
				((shortlist) "id,run_id,testname,item_path,state,status")
				((#f)        db:test-record-qry-selector) ;; "id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment")
				(else        qryvals)))
	     (res            '())
	     ;; if states or statuses are null then assume match all when not-in is false
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
   exn
   (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
     ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn)
     (if (eq? err-status 'done)
	 default
	 (begin
	   (debug:print 0 "ERROR:  query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn))
	   (print-call-chain)
	   default)))
   (apply sqlite3:first-result db stmt params)))

;;======================================================================
;; Extract ods file from the db
;;======================================================================








|







2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
   exn
   (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
     ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn)
     (if (eq? err-status 'done)
	 default
	 (begin
	   (debug:print 0 "ERROR:  query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn))
	   (print-call-chain (current-error-port))
	   default)))
   (apply sqlite3:first-result db stmt params)))

;;======================================================================
;; Extract ods file from the db
;;======================================================================

Modified lock-queue.scm from [141b4e4668] to [fb7e24faf1].

187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
	 (mystart (current-seconds))
	 (db      (lock-queue:db-dat-get-db dbdat)))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (print-call-chain)
       (thread-sleep! 10)
       (if (> count 0)
	   (begin
	     (sqlite3:finalize! db)
	     (lock-queue:wait-turn fname test-id count: (- count 1)))
	   (begin
	     (debug:print 0 "Giving up calls to lock-queue:wait-turn for test-id " test-id " at path " fname ", printing call chain")
	     (print-call-chain)
	     #f)))
     (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file")
     (sqlite3:execute
      db
      "INSERT OR REPLACE INTO queue (test_id,start_time,state) VALUES (?,?,'waiting');"
      test-id mystart)
     (thread-sleep! 1) ;; give other tests a chance to register







|







|







187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
	 (mystart (current-seconds))
	 (db      (lock-queue:db-dat-get-db dbdat)))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (print-call-chain (current-error-port))
       (thread-sleep! 10)
       (if (> count 0)
	   (begin
	     (sqlite3:finalize! db)
	     (lock-queue:wait-turn fname test-id count: (- count 1)))
	   (begin
	     (debug:print 0 "Giving up calls to lock-queue:wait-turn for test-id " test-id " at path " fname ", printing call chain")
	     (print-call-chain (current-error-port))
	     #f)))
     (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file")
     (sqlite3:execute
      db
      "INSERT OR REPLACE INTO queue (test_id,start_time,state) VALUES (?,?,'waiting');"
      test-id mystart)
     (thread-sleep! 1) ;; give other tests a chance to register

Modified mt.scm from [fdb95af183] to [15956fcc00].

166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
;;======================================================================

;; speed up for common cases with a little logic
(define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
  (if (not (and run-id test-id))
      (begin
	(debug:print 0 "ERROR: bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate)
	(print-call-chain)
	#f)
      (begin
	(cond
	 ((and newstate newstatus newcomment)
	  (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id))
	 ((and newstate newstatus)
	  (rmt:general-call 'state-status run-id newstate newstatus test-id))







|







166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
;;======================================================================

;; speed up for common cases with a little logic
(define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
  (if (not (and run-id test-id))
      (begin
	(debug:print 0 "ERROR: bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate)
	(print-call-chain (current-error-port))
	#f)
      (begin
	(cond
	 ((and newstate newstatus newcomment)
	  (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id))
	 ((and newstate newstatus)
	  (rmt:general-call 'state-status run-id newstate newstatus test-id))

Modified portlogger.scm from [05c38f2bc0] to [c19f5a6299].

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print 0 "ERROR: portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
       (print-call-chain))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res))))








|







56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print 0 "ERROR: portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
       (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res))))

102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
(define (portlogger:get-prev-used-port db)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db")
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (debug:print 0 "exn=" (condition->list exn))
     (print-call-chain)
     (debug:print 0 "Continuing anyway.")
     #f)
   (sqlite3:fold-row
    (lambda (var curr)
      (or curr var curr))
    #f
    db







|







102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
(define (portlogger:get-prev-used-port db)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db")
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (debug:print 0 "exn=" (condition->list exn))
     (print-call-chain (current-error-port))
     (debug:print 0 "Continuing anyway.")
     #f)
   (sqlite3:fold-row
    (lambda (var curr)
      (or curr var curr))
    #f
    db
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
			 (random (- 64000 lowport))))))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (print-call-chain)
       (debug:print 0 "Continuing anyway."))
     (portlogger:take-port db portnum))
    portnum))

;; set port to "released", "failed" etc.
;; 
(define (portlogger:set-port db portnum value)







|







127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
			 (random (- 64000 lowport))))))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (print-call-chain (current-error-port))
       (debug:print 0 "Continuing anyway."))
     (portlogger:take-port db portnum))
    portnum))

;; set port to "released", "failed" etc.
;; 
(define (portlogger:set-port db portnum value)
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
	  (handle-exceptions
	   exn
	   (begin
	     (debug:print 0 "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (print "exn=" (condition->list exn))
	     (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	     (print-call-chain))
	   (cond
	    ((> numargs 1) ;; most commands
	     (case (string->symbol (car args)) ;; commands with two or more params
	       ((take)(portlogger:take-port db (string->number (cadr args))))
	       ((set) (portlogger:set-port db 
					   (string->number (cadr args))
					   (caddr args))
		(caddr args))
	       ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed)))))))
    (sqlite3:finalize! db)
    result))
     
;; (print (apply portlogger:main (cdr (argv))))







|













158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
	  (handle-exceptions
	   exn
	   (begin
	     (debug:print 0 "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (print "exn=" (condition->list exn))
	     (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	     (print-call-chain (current-error-port)))
	   (cond
	    ((> numargs 1) ;; most commands
	     (case (string->symbol (car args)) ;; commands with two or more params
	       ((take)(portlogger:take-port db (string->number (cadr args))))
	       ((set) (portlogger:set-port db 
					   (string->number (cadr args))
					   (caddr args))
		(caddr args))
	       ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed)))))))
    (sqlite3:finalize! db)
    result))
     
;; (print (apply portlogger:main (cdr (argv))))

Modified rmt.scm from [7a184ad9b2] to [045a714be1].

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
	      (db:string->obj res)
	      ;; (if (< attemptnum 100)
	      ;;     (begin
	      ;;       (hash-table-delete! *runremote* run-id)
	      ;;       (thread-sleep! 0.5)
	      ;;       (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1)))
	      ;;     (begin
	      ;;       (print-call-chain)
	      ;;       (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over")
	      ;;       (exit 1)))))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive







|







95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
	      (db:string->obj res)
	      ;; (if (< attemptnum 100)
	      ;;     (begin
	      ;;       (hash-table-delete! *runremote* run-id)
	      ;;       (thread-sleep! 0.5)
	      ;;       (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1)))
	      ;;     (begin
	      ;;       (print-call-chain (current-error-port))
	      ;;       (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over")
	      ;;       (exit 1)))))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

(define (rmt:get-test-info-by-id run-id test-id)
  (if (and (number? run-id)(number? test-id))
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
      (begin
	(debug:print 0 "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain)
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
  (let* ((test-path (if (string? work-area)







|







298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

(define (rmt:get-test-info-by-id run-id test-id)
  (if (and (number? run-id)(number? test-id))
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
      (begin
	(debug:print 0 "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
  (let* ((test-path (if (string? work-area)
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus)))

(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals)
  (if (number? run-id)
      (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals))
      (begin
	(debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id)
	(print-call-chain)
	'())))

(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
  (let ((run-id-list (if run-ids
			 run-ids
			 (rmt:get-all-run-ids))))
    (apply append (map (lambda (run-id)







|







323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus)))

(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals)
  (if (number? run-id)
      (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals))
      (begin
	(debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id)
	(print-call-chain (current-error-port))
	'())))

(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
  (let ((run-id-list (if run-ids
			 run-ids
			 (rmt:get-all-run-ids))))
    (apply append (map (lambda (run-id)

Modified runs.scm from [692dff51df] to [94d1fbb17e].

668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
		reruns)
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      (if #t ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name item-path)
	    (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done))
	  (let ((th (make-thread (lambda ()
				   (mutex-lock! registry-mutex)
				   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start)
				   (mutex-unlock! registry-mutex)
				   ;; If haven't done it before register a top level test if this is an itemized test
				   (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
				       (rmt:general-call 'register-test run-id run-id test-name ""))
				   (rmt:general-call 'register-test run-id run-id test-name item-path)
				   (mutex-lock! registry-mutex)
				   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
				   (mutex-unlock! registry-mutex))
				 (conc test-name "/" item-path))))
	    (thread-start! th)))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))
	  (list hed tal (append reg (list hed)) reruns)
	  (list (runs:queue-next-hed tal reg reglen regfull)
		(runs:queue-next-tal tal reg reglen regfull)
		;; NB// Here we are building reg as we register tests
		;; if regfull we must pop the front item off reg







|
<
|
<
<
<
<
<
<
|
|
<
<
|
<
<
<







668
669
670
671
672
673
674
675

676






677
678


679



680
681
682
683
684
685
686
		reruns)
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs

      (rmt:general-call 'register-test run-id run-id test-name item-path)






      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (rmt:general-call 'register-test run-id run-id test-name ""))


      (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)



      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))
	  (list hed tal (append reg (list hed)) reruns)
	  (list (runs:queue-next-hed tal reg reglen regfull)
		(runs:queue-next-tal tal reg reglen regfull)
		;; NB// Here we are building reg as we register tests
		;; if regfull we must pop the front item off reg

Modified tasks.scm from [6b278cc270] to [f329b3e32b].

21
22
23
24
25
26
27


28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54


55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

82
83
84
85

86
87
88
89
90
91
92
93
94
;;======================================================================
;; Tasks db
;;======================================================================

;; wait up to aprox n seconds for a journal to go away
;;
(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))


  (let ((fullpath (conc path "-journal")))
    (handle-exceptions
     exn
     #t ;; if stuff goes wrong just allow it to move on
     (let loop ((journal-exists (file-exists? fullpath))
		(count          n)) ;; wait ten times ...
       (if journal-exists
	   (begin
	     (if (and waiting-msg
		      (eq? (modulo n 30) 0))
		 (debug:print 0 waiting-msg))
	     (if (> count 0)
		 (begin
		   (thread-sleep! 1)
		   (loop (file-exists? fullpath)
			 (- count 1)))
		 (begin
		   (if remove (system (conc "rm -rf " fullpath)))
		   #f)))
	   #t)))))

(define (tasks:get-task-db-path)
  (if *task-db*
      (vector-ref *task-db* 1)
      (let* ((linktree     (configf:lookup *configdat* "setup" "linktree"))
	     (dbpath       (conc linktree "/.db/monitor.db")))
	dbpath)))



;; If file exists AND
;;    file readable
;;         ==> open it
;; If file exists AND
;;    file NOT readable
;;         ==> open in-mem version
;; If file NOT exists
;;    ==> open in-mem version
;;
(define (tasks:open-db #!key (numretries 4))
  (if *task-db*
      *task-db*
      (handle-exceptions
       exn
       (if (> numretries 0)
	   (begin
	     (print-call-chain)
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (print "exn=" (condition->list exn))
	     (thread-sleep! 1)
	     (tasks:open-db numretries (- numretries 1)))
	   (begin
	     (print-call-chain)
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (print "exn=" (condition->list exn))))
       (let* ((dbpath       (tasks:get-task-db-path))

	      (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
	      (exists       (file-exists? dbpath))
	      (write-access (file-write-access? dbpath))
	      (mdb          (cond

			     ((file-write-access? *toppath*)(sqlite3:open-database dbpath))
			     ((file-read-access? dbpath)    (sqlite3:open-database dbpath))
			     (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	      (handler      (make-busy-timeout 36000)))
	 (if (and exists
		  (not write-access))
	     (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
	 (sqlite3:set-busy-handler! mdb handler)
	 (db:set-sync mdb) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))







>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|


<
<
|
|
|
>
>

















|

|



|

|

>



|
>
|
|







21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51


52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
;;======================================================================
;; Tasks db
;;======================================================================

;; wait up to aprox n seconds for a journal to go away
;;
(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))
  (if (not (string? path))
      (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)")
      (let ((fullpath (conc path "-journal")))
	(handle-exceptions
	 exn
	 #t ;; if stuff goes wrong just allow it to move on
	 (let loop ((journal-exists (file-exists? fullpath))
		    (count          n)) ;; wait ten times ...
	   (if journal-exists
	       (begin
		 (if (and waiting-msg
			  (eq? (modulo n 30) 0))
		     (debug:print 0 waiting-msg))
		 (if (> count 0)
		     (begin
		       (thread-sleep! 1)
		       (loop (file-exists? fullpath)
			     (- count 1)))
		     (begin
		       (if remove (system (conc "rm -rf " fullpath)))
		       #f)))
	       #t))))))

(define (tasks:get-task-db-path)


  (let* ((linktree     (configf:lookup *configdat* "setup" "linktree"))
	 (dbpath       (conc linktree "/.db")))
    dbpath))



;; If file exists AND
;;    file readable
;;         ==> open it
;; If file exists AND
;;    file NOT readable
;;         ==> open in-mem version
;; If file NOT exists
;;    ==> open in-mem version
;;
(define (tasks:open-db #!key (numretries 4))
  (if *task-db*
      *task-db*
      (handle-exceptions
       exn
       (if (> numretries 0)
	   (begin
	     (print-call-chain (current-error-port))
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 " exn=" (condition->list exn))
	     (thread-sleep! 1)
	     (tasks:open-db numretries (- numretries 1)))
	   (begin
	     (print-call-chain (current-error-port))
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 " exn=" (condition->list exn))))
       (let* ((dbpath       (tasks:get-task-db-path))
	      (dbfile       (conc dbpath "/monitor.db"))
	      (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
	      (exists       (file-exists? dbpath))
	      (write-access (file-write-access? dbpath))
	      (mdb          (cond ;; what the hek is *toppath* doing here?
			     ((and (string? *toppath*)(file-write-access? *toppath*))
			      (sqlite3:open-database dbfile))
			     ((file-read-access? dbpath)    (sqlite3:open-database dbfile))
			     (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	      (handler      (make-busy-timeout 36000)))
	 (if (and exists
		  (not write-access))
	     (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
	 (sqlite3:set-busy-handler! mdb handler)
	 (db:set-sync mdb) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
	(best #f))
    (handle-exceptions
     exn
     (begin 
       (debug:print 0 "WARNING: tasks:get-server db access error.")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " for run " run-id)
	   (print-call-chain)
	   (if (> retries 0)
	       (begin
		 (debug:print 0 " trying call to tasks:get-server again in 10 seconds")
		 (thread-sleep! 10)
		 (tasks:get-server mdb run-id retries: (- retries 0)))
	       (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row







|







310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
	(best #f))
    (handle-exceptions
     exn
     (begin 
       (debug:print 0 "WARNING: tasks:get-server db access error.")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " for run " run-id)
	   (print-call-chain (current-error-port))
	   (if (> retries 0)
	       (begin
		 (debug:print 0 " trying call to tasks:get-server again in 10 seconds")
		 (thread-sleep! 10)
		 (tasks:get-server mdb run-id retries: (- retries 0)))
	       (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row

Modified tests.scm from [9299be9897] to [3b87b51253].

658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
    ;;        (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
    ;;      (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
    ;;        (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")
    ;;        (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
    ;;        (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
    ;;        (print "exn=" (condition->list exn))
    ;;        (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
    ;;        (print-call-chain)))
    ;;  (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb))
    ;;         (cpuload  (get-cpu-load))
    ;;         (diskfree (get-df (current-directory)))
    ;;         (uname    (get-uname "-srvpio"))
    ;;         (hostname (get-host-name)))
    ;;    ;(tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
    ;;    (tests:update-central-meta-info  run-id test-id cpuload diskfree minutes uname hostname)







|







658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
    ;;        (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
    ;;      (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
    ;;        (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")
    ;;        (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
    ;;        (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
    ;;        (print "exn=" (condition->list exn))
    ;;        (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
    ;;        (print-call-chain (current-error-port))))
    ;;  (let* ((num-records 0) ;; (test:tdb-get-rundat-count tdb))
    ;;         (cpuload  (get-cpu-load))
    ;;         (diskfree (get-df (current-directory)))
    ;;         (uname    (get-uname "-srvpio"))
    ;;         (hostname (get-host-name)))
    ;;    ;(tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
    ;;    (tests:update-central-meta-info  run-id test-id cpuload diskfree minutes uname hostname)
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
	   (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
	 (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
	   (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")
	   (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (print "exn=" (condition->list exn))
	   (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	   (print-call-chain)))
     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
  )))
	 
;;======================================================================
;; A R C H I V I N G
;;======================================================================

(define (test:archive db test-id)
  #f)

(define (test:archive-tests db keynames target)
  #f)








|













686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
	   (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
	 (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
	   (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up")
	   (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (print "exn=" (condition->list exn))
	   (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	   (print-call-chain (current-error-port))))
     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
  )))
	 
;;======================================================================
;; A R C H I V I N G
;;======================================================================

(define (test:archive db test-id)
  #f)

(define (test:archive-tests db keynames target)
  #f)