Megatest

Diff
Login

Differences From Artifact [2559bee69c]:

To Artifact [7b05deb102]:


23
24
25
26
27
28
29
30

31
32
33
34
35
36
37
38



39
40
41
42
43
44
45
46

47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

65
66
67
68
69
70
71
23
24
25
26
27
28
29

30
31
32
33
34
35



36
37
38
39
40
41
42
43
44
45

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

64
65
66
67
68
69
70
71







-
+





-
-
-
+
+
+







-
+

















-
+







;; Tasks db
;;======================================================================

;; wait up to aprox n seconds for a journal to go away
;;
(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))
  (if (not (string? path))
      (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)")
      (debug:print 0 #f "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)")
      (let ((fullpath (conc path "-journal")))
	(handle-exceptions
	 exn
	 (begin
	   (print-call-chain (current-error-port))
	   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 " exn=" (condition->list exn))
	   (debug:print 0 "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain")
	   (debug:print 0 #f " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 0 #f " exn=" (condition->list exn))
	   (debug:print 0 #f "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain")
	   #t) ;; if stuff goes wrong just allow it to move on
	 (let loop ((journal-exists (file-exists? fullpath))
		    (count          n)) ;; wait ten times ...
	   (if journal-exists
	       (begin
		 (if (and waiting-msg
			  (eq? (modulo n 30) 0))
		     (debug:print 0 waiting-msg))
		     (debug:print 0 #f waiting-msg))
		 (if (> count 0)
		     (begin
		       (thread-sleep! 1)
		       (loop (file-exists? fullpath)
			     (- count 1)))
		     (begin
		       (if remove (system (conc "rm -rf " fullpath)))
		       #f)))
	       #t))))))

(define (tasks:get-task-db-path)
  (let ((dbdir  (or (configf:lookup *configdat* "setup" "monitordir")
		    (configf:lookup *configdat* "setup" "dbdir")
		    (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "ERROR: Couldn't create path to " dbdir)
       (debug:print 0 #f "ERROR: Couldn't create path to " dbdir)
       (exit 1))
     (if (not (directory? dbdir))(create-directory dbdir #t)))
    dbdir))

;; If file exists AND
;;    file readable
;;         ==> open it
79
80
81
82
83
84
85
86
87


88
89
90
91
92
93


94
95
96
97
98
99
100
79
80
81
82
83
84
85


86
87
88
89
90
91


92
93
94
95
96
97
98
99
100







-
-
+
+




-
-
+
+







  (if *task-db*
      *task-db*
      (handle-exceptions
       exn
       (if (> numretries 0)
	   (begin
	     (print-call-chain (current-error-port))
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 " exn=" (condition->list exn))
	     (debug:print 0 #f " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 #f " exn=" (condition->list exn))
	     (thread-sleep! 1)
	     (tasks:open-db numretries (- numretries 1)))
	   (begin
	     (print-call-chain (current-error-port))
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 " exn=" (condition->list exn))))
	     (debug:print 0 #f " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 #f " exn=" (condition->list exn))))
       (let* ((dbpath       (tasks:get-task-db-path))
	      (dbfile       (conc dbpath "/monitor.db"))
	      (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
	      (exists       (file-exists? dbpath))
	      (write-access (file-write-access? dbpath))
	      (mdb          (cond ;; what the hek is *toppath* doing here?
			     ((and (string? *toppath*)(file-write-access? *toppath*))
284
285
286
287
288
289
290
291

292
293
294
295
296
297
298
299
300
301

302
303
304
305
306
307
308
284
285
286
287
288
289
290

291
292
293
294
295
296
297
298
299
300

301
302
303
304
305
306
307
308







-
+









-
+







		(loop (get-rand-port)(- remtries 1))
		(get-rand-port))
	    port))))))

(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     #f;; (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") 
		     #f;; (begin (debug:print 0 #f "ERROR: no servers listed, should be at least one by now.") 
		       ;;      (sqlite3:finalize! mdb)
		       ;;      (exit 1))
		     (car (db:get-rows all)))))
    (if first
	(let* ((header   (db:get-header all))
	       (id       (db:get-value-by-header first header "id"))
	       (hostname (db:get-value-by-header first header "hostname"))
	       (pid      (db:get-value-by-header first header "pid"))
	       (priority (db:get-value-by-header first header "priority")))
	  ;; (debug:print 0 "INFO: am-i-the-server got record " first)
	  ;; (debug:print 0 #f "INFO: am-i-the-server got record " first)
	  ;; for now a basic check. add tiebreaking by priority later
	  (if (and (equal? hostname (get-host-name))
		   (equal? pid      (current-process-id)))
	      id
	      #f))
	#f)))
	     
324
325
326
327
328
329
330
331
332
333



334
335
336
337

338
339
340

341
342
343
344
345
346
347
324
325
326
327
328
329
330



331
332
333
334
335
336

337
338
339

340
341
342
343
344
345
346
347







-
-
-
+
+
+



-
+


-
+







(define (tasks:get-server mdb run-id #!key (retries 10))
  (let ((res  #f)
	(best #f))
    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print 0 "WARNING: tasks:get-server db access error.")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 " for run " run-id)
       (debug:print 0 #f "WARNING: tasks:get-server db access error.")
       (debug:print 0 #f " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 #f " for run " run-id)
       (print-call-chain (current-error-port))
       (if (> retries 0)
	   (begin
	     (debug:print 0 " trying call to tasks:get-server again in 10 seconds")
	     (debug:print 0 #f " trying call to tasks:get-server again in 10 seconds")
	     (thread-sleep! 10)
	     (tasks:get-server mdb run-id retries: (- retries 0)))
	   (debug:print 0 "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
	   (debug:print 0 #f "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row
      (lambda (id interface port pubport transport pid hostname)
	(set! res (vector id interface port pubport transport pid hostname)))
      mdb
      ;; removed:
      ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ?
      "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers
371
372
373
374
375
376
377
378

379
380
381
382

383
384
385
386
387
388
389
390
391
392
393
394
395
396
397

398
399
400
401
402
403
404
371
372
373
374
375
376
377

378
379
380
381

382
383
384
385
386
387
388
389
390
391
392
393
394
395
396

397
398
399
400
401
402
403
404







-
+



-
+














-
+







  (configf:lookup *configdat* "server" "required"))

;; 	(maxqry (cdr (rmt:get-max-query-average run-id)))
;; 	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
;;     (cond
;;      (forced 
;;       (if (common:low-noise-print 60 run-id "server required is set")
;; 	  (debug:print-info 0 "Server required is set, starting server for run-id " run-id "."))
;; 	  (debug:print-info 0 #f "Server required is set, starting server for run-id " run-id "."))
;;       #t)
;;      ((> maxqry threshold)
;;       (if (common:low-noise-print 60 run-id "Max query time execeeded")
;; 	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id "."))
;; 	  (debug:print-info 0 #f "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id "."))
;;       #t)
;;      (else
;;       #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))
      (if (and (not server-dat)
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 "Try starting server for run-id " run-id))
		(debug:print 0 #f "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))
	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
422
423
424
425
426
427
428
429

430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446

447
448
449

450
451
452
453
454
455
456
422
423
424
425
426
427
428

429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445

446
447
448

449
450
451
452
453
454
455
456







-
+
















-
+


-
+







        FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;"
     run-id)
    (reverse res)))

;; no elegance here ...
;;
(define (tasks:kill-server hostname pid)
  (debug:print-info 0 "Attempting to kill server process " pid " on host " hostname)
  (debug:print-info 0 #f "Attempting to kill server process " pid " on host " hostname)
  (setenv "TARGETHOST" hostname)
  (setenv "TARGETHOST_LOGF" "server-kills.log")
  (system (conc "nbfake kill " pid))
  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (debug:print-info 0 #f "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
	(debug:print-info 0 #f "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;======================================================================
;; M O N I T O R S
;;======================================================================

517
518
519
520
521
522
523
524

525
526
527
528
529
530
531
517
518
519
520
521
522
523

524
525
526
527
528
529
530
531







-
+







     "SELECT count(id) FROM monitors WHERE last_update < (strftime('%s','now') - 300) AND username=?;"
     (car (user-information (current-user-id))))
    res))

;; 
(define (tasks:start-monitor db mdb)
  (if (> (tasks:get-num-alive-monitors mdb) 2) ;; have two running, no need for more
      (debug:print-info 1 "Not starting monitor, already have more than two running")
      (debug:print-info 1 #f "Not starting monitor, already have more than two running")
      (let* ((megatestdb     (conc *toppath* "/megatest.db"))
	     (monitordbf     (conc (db:dbfile-path #f) "/monitor.db"))
	     (last-db-update 0)) ;; (file-modification-time megatestdb)))
	(task:register-monitor mdb)
	(let loop ((count      0)
		   (next-touch 0)) ;; next-touch is the time where we need to update last_update
	  ;; if the db has been modified we'd best look at the task queue
745
746
747
748
749
750
751
752
753


754
755
756
757
758
759
760
761

762
763
764
765
766
767
768
769


770
771
772
773
774
775
776
777
778
779
780
781
782
783

784
785
786
787
788
789
790
745
746
747
748
749
750
751


752
753
754
755
756
757
758
759
760

761
762
763
764
765
766
767


768
769
770
771
772
773
774
775
776
777
778
779
780
781
782

783
784
785
786
787
788
789
790







-
-
+
+







-
+






-
-
+
+













-
+







;; 
;; do a remote call to get the task queue info but do the killing as self here.
;;
(define (tasks:kill-runner target run-name testpatt)
  (let ((records    (rmt:tasks-find-task-queue-records target run-name testpatt "running" "run-tests"))
	(hostpid-rx (regexp "\\s+(\\w+)\\s+(\\d+)$"))) ;; host pid is at end of param string
    (if (null? records)
	(debug:print 0 "No run launching processes found for " target " / " run-name " with testpatt " (or testpatt "* no testpatt specified! *"))
	(debug:print 0 "Found " (length records) " run(s) to kill."))
	(debug:print 0 #f "No run launching processes found for " target " / " run-name " with testpatt " (or testpatt "* no testpatt specified! *"))
	(debug:print 0 #f "Found " (length records) " run(s) to kill."))
    (for-each 
     (lambda (record)
       (let* ((param-key (list-ref record 8))
	      (match-dat (string-search hostpid-rx param-key)))
	 (if match-dat
	     (let ((hostname  (cadr match-dat))
		   (pid       (string->number (caddr match-dat))))
	       (debug:print 0 "Sending SIGINT to process " pid " on host " hostname)
	       (debug:print 0 #f "Sending SIGINT to process " pid " on host " hostname)
	       (if (equal? (get-host-name) hostname)
		   (if (process:alive? pid)
		       (begin
			 (handle-exceptions
			  exn
			  (begin
			    (debug:print 0 "Kill of process " pid " on host " hostname " failed.")
			    (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
			    (debug:print 0 #f "Kill of process " pid " on host " hostname " failed.")
			    (debug:print 0 #f " message: " ((condition-property-accessor 'exn 'message) exn))
			    #t)
			  (process-signal pid signal/int)
			  (thread-sleep! 5)
			  (if (process:alive? pid)
			      (process-signal pid signal/kill)))))
		   ;;  (call-with-environment-variables
		   (let ((old-targethost (getenv "TARGETHOST")))
		     (setenv "TARGETHOST" hostname)
		     (setenv "TARGETHOST_LOGF" "server-kills.log")
		     (system (conc "nbfake kill " pid))
		     (if old-targethost (setenv "TARGETHOST" old-targethost))
		     (unsetenv "TARGETHOST")
		     (unsetenv "TARGETHOST_LOGF"))))
	     (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in main.db"))))
	     (debug:print 0 #f "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in main.db"))))
     records)))

;; (define (tasks:start-run dbstruct mdb task)
;;   (let ((flags (make-hash-table)))
;;     (hash-table-set! flags "-rerun" "NOT_STARTED")
;;     (if (not (string=? (tasks:task-get-params task) ""))
;; 	(hash-table-set! flags "-setvars" (tasks:task-get-params task)))