Megatest

Check-in [dba7069ea8]
Login
Overview
Comment:More cleanup in exception handling
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | streamline-exception-handling
Files: files | file ages | folders
SHA1: dba7069ea857d64878b47d1ecde08bc5a445b106
User & Date: matt on 2014-11-01 08:41:22
Other Links: branch diff | manifest | tags
Context
2014-11-01
22:37
Moved the db sync for local access to watchdog thread. check-in: 39fc3591a8 user: matt tags: streamline-exception-handling
08:41
More cleanup in exception handling check-in: dba7069ea8 user: matt tags: streamline-exception-handling
07:01
Streamline exception handling check-in: 2d5e0edb49 user: matt tags: streamline-exception-handling
Changes

Modified dashboard-tests.scm from [701f564f5d] to [22f7e05798].

559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
							    (> (current-milliseconds)(+ last-update 250))) ;; every half seconds if db touched
						       (> (current-milliseconds)(+ last-update 10000))     ;; force update even 10 seconds
						       request-update))
				    (newtestdat (if need-update 
						    ;; NOTE: BUG HIDER, try to eliminate this exception handler
						    (handle-exceptions
						     exn 
						     (debug:print-info 0 "test db access issue: " ((condition-property-accessor 'exn 'message) exn))
						     (db:get-test-info-by-id dbstruct run-id test-id )))))
			       ;; (debug:print-info 0 "need-update= " need-update " curr-mod-time = " curr-mod-time)
			       (cond
				((and need-update newtestdat)
				 (set! testdat newtestdat)
				 (set! teststeps    (dashboard-tests:get-compressed-steps dbstruct run-id test-id))
				 (set! logfile      (conc (db:test-get-rundir testdat) "/" (db:test-get-final_logf testdat)))







|







559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
							    (> (current-milliseconds)(+ last-update 250))) ;; every half seconds if db touched
						       (> (current-milliseconds)(+ last-update 10000))     ;; force update even 10 seconds
						       request-update))
				    (newtestdat (if need-update 
						    ;; NOTE: BUG HIDER, try to eliminate this exception handler
						    (handle-exceptions
						     exn 
						     (debug:print-info 0 "test db access issue in examine test for run-id " run-id ", test-id " test-id ": " ((condition-property-accessor 'exn 'message) exn))
						     (db:get-test-info-by-id dbstruct run-id test-id )))))
			       ;; (debug:print-info 0 "need-update= " need-update " curr-mod-time = " curr-mod-time)
			       (cond
				((and need-update newtestdat)
				 (set! testdat newtestdat)
				 (set! teststeps    (dashboard-tests:get-compressed-steps dbstruct run-id test-id))
				 (set! logfile      (conc (db:test-get-rundir testdat) "/" (db:test-get-final_logf testdat)))

Modified dashboard.scm from [7792f161eb] to [588f9fa302].

1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
(let ((db (tasks:open-db)))
  (sqlite3:finalize! db))

(define (dashboard:get-youngest-run-db-mod-time)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: error in accessing databases: " ((condition-property-accessor 'exn 'message) exn))
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds
   (apply max (map (lambda (filen)
		     (file-modification-time filen))
		   (glob (conc *dbdir* "/*.db"))))))

(define (dashboard:run-update x)
  (let* ((modtime         (dashboard:get-youngest-run-db-mod-time)) ;; (file-modification-time *db-file-path*))







|







1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
(let ((db (tasks:open-db)))
  (sqlite3:finalize! db))

(define (dashboard:get-youngest-run-db-mod-time)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn))
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds
   (apply max (map (lambda (filen)
		     (file-modification-time filen))
		   (glob (conc *dbdir* "/*.db"))))))

(define (dashboard:run-update x)
  (let* ((modtime         (dashboard:get-youngest-run-db-mod-time)) ;; (file-modification-time *db-file-path*))

Modified portlogger.scm from [3222aa8dd9] to [614321ce45].

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
	 (handler  (make-busy-timeout 136000))
	 (canwrite (file-write-access? fname)))
    (sqlite3:set-busy-handler! db handler)
    (sqlite3:execute db "PRAGMA synchronous = 0;")
    (if (not exists)
	(sqlite3:execute 
	 db
	 "CREATE TABLE ports (
            port INTEGER PRIMARY KEY,
            state TEXT DEFAULT 'not-used',
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );"))
    db))

(define (portlogger:open-run-close proc . params)







|







29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
	 (handler  (make-busy-timeout 136000))
	 (canwrite (file-write-access? fname)))
    (sqlite3:set-busy-handler! db handler)
    (sqlite3:execute db "PRAGMA synchronous = 0;")
    (if (not exists)
	(sqlite3:execute 
	 db
	 "CREATE TABLE IF NOT EXISTS ports (
            port INTEGER PRIMARY KEY,
            state TEXT DEFAULT 'not-used',
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );"))
    db))

(define (portlogger:open-run-close proc . params)
86
87
88
89
90
91
92









93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108








109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

128
129









130
131
132
133
134
135
136
137
138
139
140
141
		    res)))))
    (sqlite3:finalize! qry1)
    (sqlite3:finalize! qry2)
    (sqlite3:finalize! qry3)
    res))

(define (portlogger:get-prev-used-port db)









  (sqlite3:fold-row
   (lambda (var curr)
     (or curr var curr))
   #f
   db
   "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))

(define (portlogger:find-port db)
  (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport")))
		    (if (and val 
			     (string->number val))
			(string->number val)
			32768)))
	 (portnum (or (portlogger:get-prev-used-port db)
		      (+ lowport ;; top of registered ports is 49152 but lets use ports in the registered range
			 (random (- 64000 lowport))))))








    (portlogger:take-port db portnum)
    portnum))

;; set port to "released", "failed" etc.
;; 
(define (portlogger:set-port db portnum value)
  (sqlite3:execute db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;" value portnum))

;; set port to failed (attempted to take but got error)
;;
(define (portlogger:set-failed db portnum)
  (sqlite3:execute db "UPDATE ports SET state='failed',fail_count=fail_count+1,update_time=strftime('%s','now') WHERE port=?;" portnum))

;;======================================================================
;; MAIN
;;======================================================================

(define (portlogger:main . args)
  (let* ((db      (portlogger:open-db (conc "/tmp/." (current-user-name) "-portlogger.db")))

	 (numargs (length args))
	 (result  (cond









		   ((> numargs 1) ;; most commands
		    (case (string->symbol (car args)) ;; commands with two or more params
		      ((take)(portlogger:take-port db (string->number (cadr args))))
		      ((set) (portlogger:set-port db 
						  (string->number (cadr args))
						  (caddr args))
		       (caddr args))
		      ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed))))))
    (sqlite3:finalize! db)
    result))
     
;; (print (apply portlogger:main (cdr (argv))))







>
>
>
>
>
>
>
>
>
|
|
|
|
|
|










>
>
>
>
>
>
>
>
|

















|
>

|
>
>
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|




86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
		    res)))))
    (sqlite3:finalize! qry1)
    (sqlite3:finalize! qry2)
    (sqlite3:finalize! qry3)
    res))

(define (portlogger:get-prev-used-port db)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db")
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (debug:print 0 "exn=" (condition->list exn))
     (print-call-chain)
     (debug:print 0 "Continuing anyway.")
     #f)
   (sqlite3:fold-row
    (lambda (var curr)
      (or curr var curr))
    #f
    db
    "SELECT (port) FROM ports WHERE state='released' LIMIT 1;")))

(define (portlogger:find-port db)
  (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport")))
		    (if (and val 
			     (string->number val))
			(string->number val)
			32768)))
	 (portnum (or (portlogger:get-prev-used-port db)
		      (+ lowport ;; top of registered ports is 49152 but lets use ports in the registered range
			 (random (- 64000 lowport))))))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (print-call-chain)
       (debug:print 0 "Continuing anyway."))
     (portlogger:take-port db portnum))
    portnum))

;; set port to "released", "failed" etc.
;; 
(define (portlogger:set-port db portnum value)
  (sqlite3:execute db "UPDATE ports SET state=?,update_time=strftime('%s','now') WHERE port=?;" value portnum))

;; set port to failed (attempted to take but got error)
;;
(define (portlogger:set-failed db portnum)
  (sqlite3:execute db "UPDATE ports SET state='failed',fail_count=fail_count+1,update_time=strftime('%s','now') WHERE port=?;" portnum))

;;======================================================================
;; MAIN
;;======================================================================

(define (portlogger:main . args)
  (let* ((dbfname (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (db      (portlogger:open-db dbfname))
	 (numargs (length args))
	 (result  
	  (handle-exceptions
	   exn
	   (begin
	     (debug:print 0 "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (print "exn=" (condition->list exn))
	     (debug:print 0 " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	     (print-call-chain))
	   (cond
	    ((> numargs 1) ;; most commands
	     (case (string->symbol (car args)) ;; commands with two or more params
	       ((take)(portlogger:take-port db (string->number (cadr args))))
	       ((set) (portlogger:set-port db 
					   (string->number (cadr args))
					   (caddr args))
		(caddr args))
	       ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed)))))))
    (sqlite3:finalize! db)
    result))
     
;; (print (apply portlogger:main (cdr (argv))))

Modified process.scm from [f9c6ebc5de] to [781c177a90].

49
50
51
52
53
54
55


56
57
58
59
60
61
62

(define (cmd-run-proc-each-line cmd proc . params)
  ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params)
  (handle-exceptions
   exn
   (begin
     (print "ERROR:  Failed to run command: " cmd " " (string-intersperse params " "))


     #f)
   (let-values (((fh fho pid) (if (null? params)
				  (process cmd)
				  (process cmd params))))
       (let loop ((curr (read-line fh))
		(result  '()))
       (if (not (eof-object? curr))







>
>







49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

(define (cmd-run-proc-each-line cmd proc . params)
  ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params)
  (handle-exceptions
   exn
   (begin
     (print "ERROR:  Failed to run command: " cmd " " (string-intersperse params " "))
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (print "exn=" (condition->list exn))
     #f)
   (let-values (((fh fho pid) (if (null? params)
				  (process cmd)
				  (process cmd params))))
       (let loop ((curr (read-line fh))
		(result  '()))
       (if (not (eof-object? curr))
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
     (let loop ((inl (read-line))
		(res '()))
       (if (eof-object? inl)
	   (reverse res)
	   (let ((pid (string->number inl)))
	     (if proc (proc pid))
	     (loop (read-line) (cons pid res))))))))
       

(define (process:alive? pid)
  (handle-exceptions
   exn
   ;; possibly pid is a process not a child, look in /proc to see if it is running still
   (file-exists? (conc "/proc/" pid))
   (let-values (((rpid exit-type exit-signal)(process-wait pid #t)))
       (and (number? rpid)
	    (equal? rpid pid)))))
	 







<










124
125
126
127
128
129
130

131
132
133
134
135
136
137
138
139
140
     (let loop ((inl (read-line))
		(res '()))
       (if (eof-object? inl)
	   (reverse res)
	   (let ((pid (string->number inl)))
	     (if proc (proc pid))
	     (loop (read-line) (cons pid res))))))))


(define (process:alive? pid)
  (handle-exceptions
   exn
   ;; possibly pid is a process not a child, look in /proc to see if it is running still
   (file-exists? (conc "/proc/" pid))
   (let-values (((rpid exit-type exit-signal)(process-wait pid #t)))
       (and (number? rpid)
	    (equal? rpid pid)))))
	 

Modified rmt.scm from [a8f8013949] to [658bf1b8f2].

102
103
104
105
106
107
108


109
110
111
112
113
114
115

(define (rmt:update-db-stats rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: stats collection failed in update-db-stats")


     #f) ;; if this fails we don't care, it is just stats
   (let* ((cmd      (if (eq? rawcmd 'general-call) (car params) rawcmd))
	  (stat-vec (hash-table-ref/default *db-stats* cmd #f)))
     (if (not stat-vec)
	 (let ((newvec (vector 0 0)))
	   (hash-table-set! *db-stats* cmd newvec)
	   (set! stat-vec newvec)))







>
>







102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

(define (rmt:update-db-stats rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: stats collection failed in update-db-stats")
     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
     (print "exn=" (condition->list exn))
     #f) ;; if this fails we don't care, it is just stats
   (let* ((cmd      (if (eq? rawcmd 'general-call) (car params) rawcmd))
	  (stat-vec (hash-table-ref/default *db-stats* cmd #f)))
     (if (not stat-vec)
	 (let ((newvec (vector 0 0)))
	   (hash-table-set! *db-stats* cmd newvec)
	   (set! stat-vec newvec)))

Modified tasks.scm from [9445a5df8c] to [e808e7265f].

648
649
650
651
652
653
654
655
656
657

658
659
660
661
662
663
664
665
666

667
668
669
670
671
672

673
674
675
676
677
678
679
	(debug:print 0 "Found " (length records) " run(s) to kill."))
    (for-each 
     (lambda (record)
       (let* ((param-key (list-ref record 8))
	      (match-dat (string-search hostpid-rx param-key)))
	 (if match-dat
	     (let ((hostname  (cadr match-dat))
		   (pid       (caddr match-dat)))
	       (debug:print 0 "Sending SIGINT to process " pid " on host " hostname)
	       (if (equal? (get-host-name) hostname)

		   (begin
		     (handle-exceptions
		      exn
		      (begin
			(debug:print 0 "Kill of process " pid " on host " hostname " failed.")
			(debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
			#t)
		      (process-signal (string->number pid) signal/int)
		      (thread-sleep! 5)

		      (process-signal (string->number pid) signal/kill)))
		   ;;  (call-with-environment-variables
		   (let ((old-targethost (getenv "TARGETHOST")))
		     (setenv "TARGETHOST" hostname)
		     (system (conc "nbfake kill " pid))
		     (if old-targethost (setenv "TARGETHOST" old-targethost)))))

	     (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in monitor.db"))))
     records)))


;;======================================================================
;; The routines to process tasks
;;======================================================================







|


>
|
|
|
|
|
|
|
|
|
>
|




|
>







648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
	(debug:print 0 "Found " (length records) " run(s) to kill."))
    (for-each 
     (lambda (record)
       (let* ((param-key (list-ref record 8))
	      (match-dat (string-search hostpid-rx param-key)))
	 (if match-dat
	     (let ((hostname  (cadr match-dat))
		   (pid       (string->number (caddr match-dat))))
	       (debug:print 0 "Sending SIGINT to process " pid " on host " hostname)
	       (if (equal? (get-host-name) hostname)
		   (if (process:alive? pid)
		       (begin
			 (handle-exceptions
			  exn
			  (begin
			    (debug:print 0 "Kill of process " pid " on host " hostname " failed.")
			    (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
			    #t)
			  (process-signal pid signal/int)
			  (thread-sleep! 5)
			  (if (process:alive? pid)
			      (process-signal pid signal/kill)))))
		   ;;  (call-with-environment-variables
		   (let ((old-targethost (getenv "TARGETHOST")))
		     (setenv "TARGETHOST" hostname)
		     (system (conc "nbfake kill " pid))
		     (if old-targethost (setenv "TARGETHOST" old-targethost))
		     (unsetenv "TARGETHOST"))))
	     (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in monitor.db"))))
     records)))


;;======================================================================
;; The routines to process tasks
;;======================================================================

Modified tdb.scm from [de69c98c94] to [4b5015105f].

68
69
70
71
72
73
74
75

76
77
78
79
80
81
82
	      (tdb:testdb-initialize db)))
	;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	(debug:print-info 11 "open-test-db END (sucessful)" work-area)
	;; now let's test that everything is correct
	(handle-exceptions
	 exn
	 (begin
	   (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test"

			((condition-property-accessor 'exn 'message) exn))
	   #f)
	 ;; Is there a cheaper single line operation that will check for existance of a table
	 ;; and raise an exception ?
	 (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;"))
	db)
      (let ((baddb (sqlite3:open-database ":memory:")))







|
>







68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
	      (tdb:testdb-initialize db)))
	;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	(debug:print-info 11 "open-test-db END (sucessful)" work-area)
	;; now let's test that everything is correct
	(handle-exceptions
	 exn
	 (begin
	   (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " 
			dbpath ".\n  "
			((condition-property-accessor 'exn 'message) exn))
	   #f)
	 ;; Is there a cheaper single line operation that will check for existance of a table
	 ;; and raise an exception ?
	 (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;"))
	db)
      (let ((baddb (sqlite3:open-database ":memory:")))