Megatest

Artifact [c1fd4093ed]
Login

Artifact c1fd4093ed8b44b34526869419ea80c706337673:


;;======================================================================
;; Copyright 2006-2012, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
;;     (at your option) any later version.
;; 
;;     Megatest is distributed in the hope that it will be useful,
;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;     GNU General Public License for more details.
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

;; (use srfi-1 data-structures posix regex-case (prefix base64 base64:)
;;      format dot-locking csv-xml z3 udp ;; sql-de-lite
;;      hostinfo md5 message-digest typed-records directory-utils stack
;;      matchable regex posix (srfi 18) extras ;; tcp 
;;      (prefix nanomsg nmsg:)
;;      (prefix sqlite3 sqlite3:)
;;      pkts (prefix dbi dbi:)
;;      )
;; 
;; (declare (unit common))
;; ;; (declare (uses commonmod))
;; ;; (import commonmod)
;; 
;; (include "common_records.scm")


;; (require-library margs)
;; (include "margs.scm")

;; (define old-exit exit)
;; 
;; (define (exit . code)
;;   (if (null? code)
;;       (old-exit)
;;       (old-exit code)))


(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f))
  (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
	 (pktsdir  (if pktsdirs (car pktsdirs) #f))
	 (toppath  (or (configf:lookup mtconf "scratchdat" "toppath")
		       toppath-in))
	 (pdbpath  (or (configf:lookup mtconf "setup"  "pdbpath") pktsdir)))
    (cond
     ((not (and  pktsdir toppath pdbpath))
      (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.")
      (debug:print  0 *default-log-port* "  you need to have pktsdirs in the [setup] section."))
     ((not (common:file-exists? pktsdir))
      (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir))
     ((not (equal? (file-owner pktsdir)(current-effective-user-id)))
      (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name)))
     (else
	(let* ((pdb  (open-queue-db pdbpath "pkts.db"
				    schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));"))))
	  (proc pktsdirs pktsdir pdb)
	  (dbi:close pdb))))))

(define (common:load-pkts-to-db mtconf #!key (use-lt #f))
  (common:with-queue-db
   mtconf
   (lambda (pktsdirs pktsdir pdb)
     (for-each
      (lambda (pktsdir) ;; look at all
	(cond
	 ((not (common:file-exists? pktsdir))
	  (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist."))
	 ((not (directory? pktsdir))
	  (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory."))
	 ((not (file-readable? pktsdir))
	  (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable."))
	 (else
	  (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir)
	  (let ((pkts (glob (conc pktsdir "/*.pkt"))))
	    (for-each
	     (lambda (pkt)
	       (let* ((uuid    (cadr (string-match ".*/([0-9a-f]+).pkt" pkt)))
		      (exists  (lookup-by-uuid pdb uuid #f)))
		 (if (not exists)
		     (let* ((pktdat (string-intersperse
				     (with-input-from-file pkt read-lines)
				     "\n"))
			    (apkt   (pkt->alist pktdat))
			    (ptype  (alist-ref 'T apkt)))
		       (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0)
		       (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue"))
		     (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...")
		     )))
	     pkts)))))
      pktsdirs))
   use-lt: use-lt))


;;======================================================================
;; N A N O M S G   C L I E N T
;;======================================================================
;; 
;; 
;; 
;; (define (common:send-dboard-main-changed)
;;   (let* ((dashboard-ips (mddb:get-dashboards)))
;;     (for-each
;;      (lambda (ipadr)
;;        (let* ((soc (common:open-nm-req (conc "tcp://" ipadr)))
;; 	      (msg (conc "main " *toppath*))
;; 	      (res (common:nm-send-receive-timeout soc msg)))
;; 	 (if (not res) ;; couldn't reach that dashboard - remove it from db
;; 	     (print "ERROR: couldn't reach dashboard " ipadr))
;; 	 res))
;;      dashboard-ips)))
;;     
;;     
;; ;;======================================================================
;; ;; D A S H B O A R D   D B 
;; ;;======================================================================
;; 
;; (define (mddb:open-db)
;;   (let* ((db (open-database (conc (get-environment-variable "HOME") "/.dashboard.db"))))
;;     (set-busy-handler! db (busy-timeout 10000))
;;     (for-each
;;      (lambda (qry)
;;        (exec (sql db qry)))
;;      (list 
;;       "CREATE TABLE IF NOT EXISTS vars       (id INTEGER PRIMARY KEY,key TEXT, val TEXT, CONSTRAINT varsconstraint UNIQUE (key));"
;;       "CREATE TABLE IF NOT EXISTS dashboards (
;;           id         INTEGER PRIMARY KEY,
;;           pid        INTEGER,
;;           username   TEXT,
;;           hostname   TEXT,
;;           ipaddr     TEXT,
;;           portnum    INTEGER,
;;           start_time TIMESTAMP DEFAULT (strftime('%s','now')),
;;              CONSTRAINT hostport UNIQUE (hostname,portnum)
;;         );"
;;       ))
;;     db))
;; 
;; ;; register a dashboard 
;; ;;
;; (define (mddb:register-dashboard port)
;;   (let* ((pid      (current-process-id))
;; 	 (hostname (get-host-name))
;; 	 (ipaddr   (server:get-best-guess-address hostname))
;; 	 (username (current-user-name)) ;; (car userinfo)))
;; 	 (db      (mddb:open-db)))
;;     (print "Register monitor, pid: " pid ", hostname: " hostname ", port: " port ", username: " username)
;;     (exec (sql db "INSERT OR REPLACE INTO dashboards (pid,username,hostname,ipaddr,portnum) VALUES (?,?,?,?,?);")
;; 	   pid username hostname ipaddr port)
;;     (close-database db)))
;; 
;; ;; unregister a monitor
;; ;;
;; (define (mddb:unregister-dashboard host port)
;;   (let* ((db      (mddb:open-db)))
;;     (print "Register unregister monitor, host:port=" host ":" port)
;;     (exec (sql db "DELETE FROM dashboards WHERE hostname=? AND portnum=?;") host port)
;;     (close-database db)))
;; 
;; ;; get registered dashboards
;; ;;
;; (define (mddb:get-dashboards)
;;   (let ((db (mddb:open-db)))
;;     (query fetch-column
;; 	   (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;"))))
    
;;======================================================================
;; faux-lock is deprecated. Please use simple-lock below
;;
(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
  (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
      (if (> wait-time 0)
	  (begin
	    (thread-sleep! 1)
	    (if (eq? wait-time 1) ;; only one second left, steal the lock
		(begin
		  (debug:print-info 0 *default-log-port* "stealing lock for " keyname)
		  (common:faux-unlock keyname force: #t)))
	    (common:faux-lock keyname wait-time: (- wait-time 1)))
	  #f)
      (begin
        (rmt:no-sync-set keyname (conc (current-process-id)))
        (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))))

(define (common:faux-unlock keyname #!key (force #f))
  (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
      (begin
        (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
        #t)
      #f))

(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                              (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
			      (if *task-db*    
				  (let ((db (cdr *task-db*)))
				    (if (sqlite3:database? db)
					(begin
					  (sqlite3:interrupt! db)
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)
					  (set! *task-db* #f)))))
                              (http-client#close-idle-connections!)
                              ;; (if (and *runremote*
                              ;;          (remote-conndat *runremote*))
                              ;;     (begin
                              ;;       (http-client#close-all-connections!))) ;; for http-client
                              (if (not (eq? *default-log-port* (current-error-port)))
                                  (close-output-port *default-log-port*))
			      (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))
	  (th2 (make-thread (lambda ()
			      (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...")
			      (if no-hurry
                                  (begin
                                    (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff
                                  (begin
      				  (thread-sleep! 2)))
      			      (debug:print 4 *default-log-port* " ... done")
      			      )
			    "clean exit")))
      (thread-start! th1)
      (thread-start! th2)
      (thread-join! th1)
      )
    )

  0)


;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (init-watchdog)
  (set! (bdat-watchdog-set! *bdat*)
	(make-thread
	 (lambda ()
	   (handle-exceptions
	    exn
	    (begin
	      (print-call-chain)
	      (print " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
	    (common:watchdog)))
	 "Watchdog thread"))
  (start-watchdog))

(define (start-watchdog)
  ;;(if (not (args:get-arg "-server"))
  ;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
  (let* ((no-watchdog-args
	  '("-list-runs"
	    "-testdata-csv"
	    "-list-servers"
	    "-server"
	    "-adjutant"
	    "-list-disks"
	    "-list-targets"
	    "-show-runconfig"
	    ;;"-list-db-targets"
	    "-show-runconfig"
	    "-show-config"
	    "-show-cmdinfo"
	    "-cleanup-db"
	    ))
	 (no-watchdog-argvals (list '("-archive" . "replicate-db")))
	 (start-watchdog-specail-arg-val (let loop ((hed  (car no-watchdog-argvals))
						    (tail (cdr   no-watchdog-argvals)))
					   ;; (print "hed" hed " arg " (args:get-arg (car hed)) " val:" (cdr hed)  " eql" (equal? (args:get-arg (car hed)) (cdr hed)))  
					   (if (equal? (args:get-arg (car hed)) (cdr hed))
					       #f
					       (if (null? tail)
						   #t
						   (loop (car tail) (cdr tail))))))      
	 (no-watchdog-args-vals (filter (lambda (x) x)
					(map args:get-arg no-watchdog-args)))
	 (start-watchdog (and (null? no-watchdog-args-vals) start-watchdog-specail-arg-val)))
					;(print  "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals " start-watchdog-specail-arg-val:" start-watchdog-specail-arg-val " start-watchdog:" start-watchdog) 
    (if start-watchdog
	(thread-start! (bdat-watchdog *bdat*)))))
     
;;======================================================================
;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  (if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))
	     (else
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
              (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "brute-force-sync")))
                (cond
                 ((equal? syncer "brute-force-sync")
                  (server:writable-watchdog-bruteforce dbstruct))
                 ((equal? syncer "delta-sync")
                  (server:writable-watchdog-deltasync dbstruct))
                 (else
                  (debug:print-error 0 *default-log-port* "Unknown server/sync-method specified ("syncer") - valid values are brute-force-sync and delta-sync.")
                  (exit 1)))
                ;;(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] Syncer started (method="syncer")")
                )))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


;;======================================================================
;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
;;
(define (common:readonly-watchdog dbstruct)
  (let ((just-testing 0.0501))
    (thread-sleep! just-testing)) ;; (/ 1 20)) ;; 0.051) ;; delay for startup
  (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.")
  ;; sync megatest.db to /tmp/.../megatst.db
  (let* ((sync-cool-off-duration   3)
        (golden-mtdb     (dbr:dbstruct-mtdb dbstruct))
        (golden-mtpath   (db:dbdat-get-path golden-mtdb))
        (tmp-mtdb        (dbr:dbstruct-tmpdb dbstruct))
        (tmp-mtpath      (db:dbdat-get-path tmp-mtdb)))
    (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.")
    (let loop ((last-sync-time 0))
      (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath)
      (let* ((duration-since-last-sync (- (current-seconds) last-sync-time)))
        (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync)
        (if (and (not *time-to-exit*)
                 (< duration-since-last-sync sync-cool-off-duration))
            (thread-sleep! (- sync-cool-off-duration duration-since-last-sync)))
        (if (not *time-to-exit*)
            (let ((golden-mtdb-mtime (file-modification-time golden-mtpath))
                  (tmp-mtdb-mtime    (file-modification-time tmp-mtpath)))
	      (if (> golden-mtdb-mtime tmp-mtdb-mtime)
		  (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back
		      (let ((res (db:multi-db-sync dbstruct 'old2new)))
			(debug:print-info 13 *default-log-port* "rosync called, " res " records transferred."))))
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath)))

;;======================================================================
;; from metadat lookup MEGATEST_VERSION
;;
(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
  (rmt:get-var "MEGATEST_VERSION"))

(define (common:get-last-run-version-number)
  (string->number 
   (substring (common:get-last-run-version) 0 6)))

(define (common:set-last-run-version)
  (rmt:set-var "MEGATEST_VERSION" (common:version-signature)))

;;======================================================================
;; postive number if megatest version > db version
;; negative number if megatest version < db version
(define (common:version-db-delta)
  (- megatest-version (common:get-last-run-version-number)))

(define (common:version-changed?)
  (not (equal? (common:get-last-run-version)
               (common:version-signature))))

(define (common:api-changed?)
  (not (equal? (substring (->string megatest-version) 0 4)
               (substring (conc (common:get-last-run-version)) 0 4))))

;;======================================================================
;; Move me elsewhere ...
;; RADT => Why do we meed the version check here, this is called only if version misma
;;
(define (common:cleanup-db dbstruct #!key (full #f))
  (apply db:multi-db-sync 
   dbstruct
   'schema
   ;; 'new2old
   'killservers
   'adj-target
   ;; 'old2new
   'new2old
   ;; (if full
       '(dejunk)
       ;; '())
       )
  (if (common:api-changed?)
      (common:set-last-run-version)))

;;======================================================================
;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;; Do NOT check if not on homehost!
;;
(define (common:exit-on-version-changed)
  (if (common:on-homehost?)
      (if (common:api-changed?)
	  (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
                (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db"))
                (read-only (not (file-writable? dbfile)))
                (dbstruct (db:setup #t)))
	    (debug:print 0 *default-log-port*
			 "WARNING: Version mismatch!\n"
			 "   expected: " (common:version-signature) "\n"
			 "   got:      " (common:get-last-run-version))
            (cond
             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
             ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only)
                   (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db
              (debug:print 0 *default-log-port* "   I see you are the owner of megatest.config, attempting to cleanup and reset to new version")
              (handle-exceptions
               exn
               (begin
                 (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn)
                 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
                 (print-call-chain (current-error-port))
                 (exit 1))
               (common:cleanup-db dbstruct)))
             ((not (common:file-exists? mtconf))
              (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             ((not (common:file-exists? dbfile))
              (debug:print 0 *default-log-port* "   megatest.db does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             ((not (eq? (current-user-id)(file-owner mtconf)))
              (debug:print 0 *default-log-port* "   You do not own megatest.db in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             (read-only
              (debug:print 0 *default-log-port* "   You have read-only access to this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             (else
              (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"")
              (exit 1)))))))
;;======================================================================
;;      (begin
;;	(debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.")
;;	(exit 1))))

(define (common:run-sync?)
    (and (common:on-homehost?)
	 (args:get-arg "-server")))