Megatest

Check-in [bd4a33953c]
Login
Overview
Comment:addressed stach dump in db:get-run-stats
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.63-stackdumpfix
Files: files | file ages | folders
SHA1: bd4a33953c050658f50c69dccdeafcd623c45230
User & Date: bjbarcla on 2017-01-05 15:43:17
Other Links: branch diff | manifest | tags
Context
2017-01-06
14:31
wip check-in: b3d324d50f user: bjbarcla tags: v1.63-stackdumpfix
2017-01-05
15:43
addressed stach dump in db:get-run-stats check-in: bd4a33953c user: bjbarcla tags: v1.63-stackdumpfix
11:03
added .server verification against running server ; also touching .server periodically and checking its age to detect and recover from potential server crashes check-in: 65358a4d53 user: bjbarcla tags: v1.63
Changes

Modified db.scm from [31eac1d5ff] to [270a2d0497].

1959
1960
1961
1962
1963
1964
1965

1966
1967
1968
1969

1970
1971

1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984

1985
1986
1987
1988
1989
1990
1991
	"SELECT state,status,count(id) AS count FROM tests WHERE run_id=? AND NOT(uname='n/a' AND item_path='') GROUP BY state,status;;"
	run-id))))

;; Update run_stats for given run_id
;; input data is a list (state status count)
;;
(define (db:update-run-stats dbstruct run-id stats)

  (db:with-db
   dbstruct
   #f
   #f

   (lambda (db)
     ;; remove previous data

     (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
	    (res
	     (sqlite3:with-transaction
	      db
	      (lambda ()
		(for-each
		 (lambda (dat)
		   (sqlite3:execute stmt1 run-id (car dat)(cadr dat))
		   (apply sqlite3:execute stmt2 run-id dat))
		 stats)))))
       (sqlite3:finalize! stmt1)
       (sqlite3:finalize! stmt2)

       res))))

(define (db:get-main-run-stats dbstruct run-id)
  (db:with-db
   dbstruct
   #f ;; this data comes from main
   #f







>




>


>













>







1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
	"SELECT state,status,count(id) AS count FROM tests WHERE run_id=? AND NOT(uname='n/a' AND item_path='') GROUP BY state,status;;"
	run-id))))

;; Update run_stats for given run_id
;; input data is a list (state status count)
;;
(define (db:update-run-stats dbstruct run-id stats)
  (mutex-lock! *db-stats-mutex*)
  (db:with-db
   dbstruct
   #f
   #f

   (lambda (db)
     ;; remove previous data
     
     (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
	    (res
	     (sqlite3:with-transaction
	      db
	      (lambda ()
		(for-each
		 (lambda (dat)
		   (sqlite3:execute stmt1 run-id (car dat)(cadr dat))
		   (apply sqlite3:execute stmt2 run-id dat))
		 stats)))))
       (sqlite3:finalize! stmt1)
       (sqlite3:finalize! stmt2)
       (mutex-unlock! *db-stats-mutex*)
       res))))

(define (db:get-main-run-stats dbstruct run-id)
  (db:with-db
   dbstruct
   #f ;; this data comes from main
   #f

Modified rmt.scm from [b725604f3b] to [33def41478].

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
     ;; ensure we have a record for our connection for given area
     ((not *runremote*)                     
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat *runremote*)))  ;; have a homehost record?
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! *runremote* (common:get-homehost))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  2")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost







|







73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
     ;; ensure we have a record for our connection for given area
     ((not *runremote*)                     
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat *runremote*)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! *runremote* (common:get-homehost))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  2")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
139
140
141
142
143
144
145

146
147
148
149
150
151
152
153

154
155
156
157
158
159
160
	    ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
	    ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
            ;; (rmt:send-receive cmd rid params attemptnum: attemptnum))   ;)  ;)
;;;;
     
     ;; if not on homehost ensure we have a connection to a live server
     ;; NOTE: we *have* a homehost record by now

     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; not on a homehost 
           (not (remote-conndat *runremote*))             ;; and no connection
           (server:read-dotserver *toppath*))             ;; .server file exists
      ;; something caused the server entry in tdb to disappear, but the server is still running
      (server:remove-dotserver-file *toppath* ".*")
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  20")
      (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum)))

     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; not on a homehost 
           (not (remote-conndat *runremote*)))            ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*))
      (mutex-unlock! *rmt-mutex*)
      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
      (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as







>
|
|
|
|
|
|
|
|
>







139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
	    ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
	    ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
            ;; (rmt:send-receive cmd rid params attemptnum: attemptnum))   ;)  ;)
;;;;
     
     ;; if not on homehost ensure we have a connection to a live server
     ;; NOTE: we *have* a homehost record by now

     ;; ((and (not (cdr (remote-hh-dat *runremote*)))        ;; not on a homehost 
     ;;       (not (remote-conndat *runremote*))             ;; and no connection
     ;;       (server:read-dotserver *toppath*))             ;; .server file exists
     ;;  ;; something caused the server entry in tdb to disappear, but the server is still running
     ;;  (server:remove-dotserver-file *toppath* ".*")
     ;;  (mutex-unlock! *rmt-mutex*)
     ;;  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  20")
     ;;  (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum)))

     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; not on a homehost 
           (not (remote-conndat *runremote*)))            ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*))
      (mutex-unlock! *rmt-mutex*)
      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
      (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as

Modified tasks.scm from [a0c6ff1ee2] to [0bc99f47ad].

446
447
448
449
450
451
452

453
454
455
456

457
458
459
460
461
462
463
464
465
466
467
468
469
470

471
472
473
474
475
476
477
        FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;"
     run-id)
    (reverse res)))

;; no elegance here ...
;;
(define (tasks:kill-server hostname pid #!key (kill-switch ""))

  (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname)
  (setenv "TARGETHOST" hostname)
  (setenv "TARGETHOST_LOGF" "server-kills.log")
  (system (conc "nbfake kill "kill-switch" "pid))

  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)

	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;======================================================================







>




>














>







446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
        FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;"
     run-id)
    (reverse res)))

;; no elegance here ...
;;
(define (tasks:kill-server hostname pid #!key (kill-switch ""))
  (server:remove-dotserver-file *toppath* ".*")
  (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname)
  (setenv "TARGETHOST" hostname)
  (setenv "TARGETHOST_LOGF" "server-kills.log")
  (system (conc "nbfake kill "kill-switch" "pid))

  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
          (server:remove-dotserver-file *toppath* ".*")
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;======================================================================