Megatest

Changes On Branch f5bedac3fbb78351
Login

Changes In Branch v1.65-telemetry Through [f5bedac3fb] Excluding Merge-Ins

This is equivalent to a diff from 2aaccbd409 to f5bedac3fb

2019-02-14
15:02
merged in trunk to get docs/manual updates check-in: 73cb4bf58e user: bjbarcla tags: v1.65
2019-02-13
18:28
added telemetry-daemon check-in: 610250e3f0 user: bjbarcla tags: v1.65-telemetry
18:27
added telemetry on rmt: and api: check-in: f5bedac3fb user: bjbarcla tags: v1.65-telemetry
00:53
removed stray prints check-in: 673fea48bc user: bjbarcla tags: v1.65-telemetry
00:51
added telemetry logging func common:telemetry-log check-in: 76975179f6 user: bjbarcla tags: v1.65-telemetry
2019-02-11
11:35
merged brute force syncer check-in: 2aaccbd409 user: bjbarcla tags: v1.65, v1.6524
11:30
made server messages such that sync handily summarized by watch "grep -h SYNC server-*.log | sort | tail -30" Closed-Leaf check-in: ef2ec4a2aa user: bjbarcla tags: v1.65-dump-for-sync
2019-02-07
17:16
coordinate multiple servers such that only one server will be syncing exclusively at any given momemt\ check-in: 31c8ca7f78 user: bjbarcla tags: v1.65

Modified api.scm from [1541791de9] to [cf3fabb928].

155
156
157
158
159
160
161





162
163
164
165
166
167
168
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
            (start-t           (current-milliseconds))
            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
            (readonly-command  (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))





            (res    
             (if writecmd-in-readonly-mode
                 (conc "attempt to run write command "cmd" on a read-only database")
                 (case cmd
                   ;;===============================================
                   ;; READ/WRITE QUERIES
                   ;;===============================================







>
>
>
>
>







155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
            (start-t           (current-milliseconds))
            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
            (readonly-command  (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
            (foo               (begin
                                 (common:telemetry-log (conc "api-in:"(->string cmd))
                                                       payload: `((params . ,params)))
                                 
                                 #t))
            (res    
             (if writecmd-in-readonly-mode
                 (conc "attempt to run write command "cmd" on a read-only database")
                 (case cmd
                   ;;===============================================
                   ;; READ/WRITE QUERIES
                   ;;===============================================
325
326
327
328
329
330
331

332
333
334
335
336
337
338




339




340
341
342
343
344
345
346
347
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
		   (else
		    (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
		    (conc "ERROR: BAD api call " cmd))))))

       
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if writecmd-in-readonly-mode




	   (vector #f res)




           (vector #t res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop
;;







>







>
>
>
>
|
>
>
>
>
|







330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
		   (else
		    (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
		    (conc "ERROR: BAD api call " cmd))))))

       
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if writecmd-in-readonly-mode
           (begin
             (common:telemetry-log (conc "api-out:"(->string cmd))
                                   payload: `((params . ,params)
                                              (ok-res . #t)))
	     (vector #f res))
           (begin
             (common:telemetry-log (conc "api-out:"(->string cmd))
                                   payload: `((params . ,params)
                                              (ok-res . #f)))
             (vector #t res))))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop
;;

Modified common.scm from [b6c40dc319] to [2f4d86191c].

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
     format dot-locking csv-xml z3 ;; sql-de-lite
     hostinfo md5 message-digest typed-records directory-utils stack
     matchable regex posix (srfi 18) extras ;; tcp 
     (prefix nanomsg nmsg:)
     (prefix sqlite3 sqlite3:)
     pkts (prefix dbi dbi:)
     )








|







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
     format dot-locking csv-xml z3 udp ;; sql-de-lite
     hostinfo md5 message-digest typed-records directory-utils stack
     matchable regex posix (srfi 18) extras ;; tcp 
     (prefix nanomsg nmsg:)
     (prefix sqlite3 sqlite3:)
     pkts (prefix dbi dbi:)
     )

79
80
81
82
83
84
85

86
87
88
89
90
91
92
(define (get-file-descriptor-count #!key  (pid (current-process-id )))
  (list
    (length (glob (conc "/proc/" pid "/fd/*")))
    (length  (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*")))))
  )
)



;; GLOBALS

;; CONTEXTS
(defstruct cxt
  (taskdb #f)
  (cmutex (make-mutex)))







>







79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
(define (get-file-descriptor-count #!key  (pid (current-process-id )))
  (list
    (length (glob (conc "/proc/" pid "/fd/*")))
    (length  (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*")))))
  )
)

  

;; GLOBALS

;; CONTEXTS
(defstruct cxt
  (taskdb #f)
  (cmutex (make-mutex)))
885
886
887
888
889
890
891

892
893
894
895
896
897
898
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	      (server:writable-watchdog dbstruct)))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)

  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))







>







886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	      (server:writable-watchdog dbstruct)))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))
3047
3048
3049
3050
3051
3052
3053





















































       (if thread
           (handle-exceptions
           exn
           #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
           (thread-join! thread))
           )))
   (hash-table-keys *common:thread-punchlist*)))




























































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
       (if thread
           (handle-exceptions
           exn
           #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
           (thread-join! thread))
           )))
   (hash-table-keys *common:thread-punchlist*)))

(define *common:telemetry-log-state* 'startup)
(define *common:telemetry-log-socket* #f)

(define (common:telemetry-log-open)
  (if (eq? *common:telemetry-log-state* 'startup)
      (let* ((serverhost (configf:lookup *configdat* "telemetry" "host"))
             (serverport (configf:lookup-number *configdat* "telemetry" "port"))
             (user (or (get-environment-variable "USER") "unknown"))
             (host (or (get-environment-variable "HOST") "unknown")))
        (set! *common:telemetry-log-state*
              (handle-exceptions
               exn
               (begin
                 (debug:print-info 0 *default-log-port* "common-telemetry-log get udp port failure")
                 'broken)
               (if (and serverhost serverport user host)
                   (let* ((s (udp-open-socket)))
                     ;;(udp-bind! s #f 0)
                     (udp-connect! s serverhost serverport)
                     (set! *common:telemetry-log-socket* s)
                     'open)
                   'not-needed))))))
  
(define (common:telemetry-log event #!key (payload '()))
  (if (eq? *common:telemetry-log-state* 'startup)
      (common:telemetry-log-open))
  (handle-exceptions
   exn
   (begin
     (debug:print-info 0 *default-log-port* "common-telemetry-log failure"))
   (if (and *common:telemetry-log-socket* event)
       (let* ((user (or (get-environment-variable "USER") "unknown"))
              (host (or (get-environment-variable "HOST") "unknown"))
              (start (conc "[megatest "event"]"))
              (toppath (or *toppath* "/dev/null"))
              (payload-serialized
               (base64:base64-encode
                (z3:encode-buffer
                 (with-output-to-string (lambda () (pp payload))))))
              (msg     (conc user":"host":"start":"(current-process-id)":"
                             toppath":"payload-serialized)))
         (udp-send *common:telemetry-log-socket* msg)))))

(define (common:telemetry-log-close)
  (when (and (eq? *common:telemetry-log-state* 'open) *common:telemetry-log-socket*)
    (handle-exceptions
     exn
     (begin
       (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure"))
     (begin
       (udp-close-socket *common:telemetry-log-socket*)
       (set! *common:telemetry-log-socket* #f)))))

Modified rmt.scm from [0a05f35135] to [bc89e0120c].

53
54
55
56
57
58
59





60
61
62
63
64
65
66

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected






  ;;DOT digraph megatest_state_status {
  ;;DOT   ranksep=0;
  ;;DOT   // rankdir=LR;
  ;;DOT   node [shape="box"];
  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
  ;; do all the prep locked under the rmt-mutex







>
>
>
>
>







53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected

  (common:telemetry-log (conc "rmt:"(->string cmd))
                        payload: `((rid . ,rid)
                                   (params . ,params)))
                          
  
  ;;DOT digraph megatest_state_status {
  ;;DOT   ranksep=0;
  ;;DOT   // rankdir=LR;
  ;;DOT   node [shape="box"];
  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
  ;; do all the prep locked under the rmt-mutex

Modified runs.scm from [4560e73753] to [9ffac6688c].

449
450
451
452
453
454
455







456
457
458
459
460
461
462
	  	  (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests)
		  ));; tests will be ANDed with this list

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running")








    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all))   ;; hash of testname => path-to-test
    (set! all-test-names     (hash-table-keys all-tests-registry))
    ;; filter first for allowed-tests (from -tagexpr) then for test-patts.
    (set! test-names         (tests:filter-test-names
			      (if allowed-tests
				  (tests:filter-test-names all-test-names allowed-tests)







>
>
>
>
>
>
>







449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
	  	  (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests)
		  ));; tests will be ANDed with this list

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running")

    (common:telemetry-log "run-tests"
                          payload:
                          `( (target . ,target)
                             (run-name . ,runname)
                             (test-patts . ,test-patts) ) )

    
    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all))   ;; hash of testname => path-to-test
    (set! all-test-names     (hash-table-keys all-tests-registry))
    ;; filter first for allowed-tests (from -tagexpr) then for test-patts.
    (set! test-names         (tests:filter-test-names
			      (if allowed-tests
				  (tests:filter-test-names all-test-names allowed-tests)