Megatest

Diff
Login

Differences From Artifact [f329b3e32b]:

To Artifact [d12c4149c9]:


26
27
28
29
30
31
32





33
34
35
36
37
38
39
40
;;
(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))
  (if (not (string? path))
      (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)")
      (let ((fullpath (conc path "-journal")))
	(handle-exceptions
	 exn





	 #t ;; if stuff goes wrong just allow it to move on
	 (let loop ((journal-exists (file-exists? fullpath))
		    (count          n)) ;; wait ten times ...
	   (if journal-exists
	       (begin
		 (if (and waiting-msg
			  (eq? (modulo n 30) 0))
		     (debug:print 0 waiting-msg))







>
>
>
>
>
|







26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
;;
(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))
  (if (not (string? path))
      (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)")
      (let ((fullpath (conc path "-journal")))
	(handle-exceptions
	 exn
	 (begin
	   (print-call-chain (current-error-port))
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " exn=" (condition->list exn))
	   (debug:print 0 "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain")
	   #t) ;; if stuff goes wrong just allow it to move on
	 (let loop ((journal-exists (file-exists? fullpath))
		    (count          n)) ;; wait ten times ...
	   (if journal-exists
	       (begin
		 (if (and waiting-msg
			  (eq? (modulo n 30) 0))
		     (debug:print 0 waiting-msg))
306
307
308
309
310
311
312
313

314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
    (vector header res)))

(define (tasks:get-server mdb run-id #!key (retries 10))
  (let ((res  #f)
	(best #f))
    (handle-exceptions
     exn
     (begin 

       (debug:print 0 "WARNING: tasks:get-server db access error.")
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " for run " run-id)
	   (print-call-chain (current-error-port))
	   (if (> retries 0)
	       (begin
		 (debug:print 0 " trying call to tasks:get-server again in 10 seconds")
		 (thread-sleep! 10)
		 (tasks:get-server mdb run-id retries: (- retries 0)))
	       (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row
      (lambda (id interface port pubport transport pid hostname)
	(set! res (vector id interface port pubport transport pid hostname)))
      mdb
      ;; removed:
      ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ?
      "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers







|
>

|
|
|
|
|
|
|
|
|







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
    (vector header res)))

(define (tasks:get-server mdb run-id #!key (retries 10))
  (let ((res  #f)
	(best #f))
    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print 0 "WARNING: tasks:get-server db access error.")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 " for run " run-id)
       (print-call-chain (current-error-port))
       (if (> retries 0)
	   (begin
	     (debug:print 0 " trying call to tasks:get-server again in 10 seconds")
	     (thread-sleep! 10)
	     (tasks:get-server mdb run-id retries: (- retries 0)))
	   (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row
      (lambda (id interface port pubport transport pid hostname)
	(set! res (vector id interface port pubport transport pid hostname)))
      mdb
      ;; removed:
      ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ?
      "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;   (if status ;; #t means alive
;;       (begin
;; 	(if (equal? hostname (get-host-name))
;; 	    (handle-exceptions
;; 	     exn
;; 	     (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"
;; 			       "  EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
;; 	     (debug:print 1 "Sending signal/term to " pid " on " hostname)
;; 	     (process-signal pid signal/term)
;; 	     (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill
;; 	     ;;(process-signal pid signal/kill)
;; 	     ) ;; local machine, send sig term
;; 	    (begin
;; 	      ;;(debug:print-info 1 "Stopping remote servers not yet supported."))))
;; 	      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
;; 	      (let ((serverdat (list hostname port)))
;; 		(hash-table-set! *runremote* run-id (http-transport:client-connect hostname port))
;; 	      	(cdb:kill-server serverdat pid)))))    ;; remote machine, try telling server to commit suicide
;;       (begin
;; 	(if status 
;; 	    (if (equal? hostname (get-host-name))
;; 		(begin
;; 		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
;; 		  (process-signal pid signal/term)  ;; local machine, send sig term
;; 		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill
;; 		  (process-signal pid signal/kill)) 
;; 		(debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname))))))


;;======================================================================
;; Tasks and Task monitors
;;======================================================================


;;======================================================================







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







379
380
381
382
383
384
385




























386
387
388
389
390
391
392
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    





























;;======================================================================
;; Tasks and Task monitors
;;======================================================================


;;======================================================================