Megatest

Check-in [b665288f56]
Login
Overview
Comment:Fixed sync to megatest.db, it was not respecting last_update and thus syncing everything. Added hard storage of last update in a no-sync db on /tmp
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: b665288f56f41486d502fa16f9a987e879c11602
User & Date: matt on 2017-06-01 22:08:31
Other Links: branch diff | manifest | tags
Context
2017-06-02
00:58
Bumped delay for sync to 6 seconds check-in: 139076d39f user: matt tags: v1.64
2017-06-01
22:08
Fixed sync to megatest.db, it was not respecting last_update and thus syncing everything. Added hard storage of last update in a no-sync db on /tmp check-in: b665288f56 user: matt tags: v1.64
00:21
Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running check-in: fd6bcc8b75 user: matt tags: v1.64
Changes

Modified common.scm from [ebe13dceae] to [d262d87a84].

638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
    (and (common:on-homehost?)
	 (args:get-arg "-server")))

;;   (let ((ohh (common:on-homehost?))
;; 	(srv (args:get-arg "-server")))
;;     (and ohh srv)))
    ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)

;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
;;    if #t use timestamps      : or 'timestamps
(define (common:sync-to-megatest.db dbstruct) 
  (let ((start-time         (current-seconds))
	(res                (db:multi-db-sync dbstruct 'new2old)))
    (let ((sync-time (- (current-seconds) start-time)))
      (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))
      (if (common:low-noise-print 30 "sync new to old")
	  (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))))
    res))




(define *wdnum* 0)
(define *wdnum*mutex (make-mutex))
;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)







<
<
<
<
<
<
<
<
<
<
<
<
<







638
639
640
641
642
643
644













645
646
647
648
649
650
651
    (and (common:on-homehost?)
	 (args:get-arg "-server")))

;;   (let ((ohh (common:on-homehost?))
;; 	(srv (args:get-arg "-server")))
;;     (and ohh srv)))
    ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)
















(define *wdnum* 0)
(define *wdnum*mutex (make-mutex))
;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
681
682
683
684
685
686
687
688

689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
        (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync)
        (if (and (not *time-to-exit*)
                 (< duration-since-last-sync sync-cool-off-duration))
            (thread-sleep! (- sync-cool-off-duration duration-since-last-sync)))
        (if (not *time-to-exit*)
            (let ((golden-mtdb-mtime (file-modification-time golden-mtpath))
                  (tmp-mtdb-mtime    (file-modification-time tmp-mtpath)))
              (if (> golden-mtdb-mtime tmp-mtdb-mtime)

                  (let ((res (db:multi-db-sync dbstruct 'old2new)))
                    (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred.")))
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath)))


        
(define (common:writable-watchdog dbstruct)
  (thread-sleep! 0.05) ;; delay for startup
  (let ((legacy-sync (common:run-sync?))
	(debug-mode  (debug:debug-mode 1))
	(last-time   (current-seconds))
        (this-wd-num     (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))))
    (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
    (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num)
    (if (and legacy-sync (not *time-to-exit*))
	(let* (;;(dbstruct (db:setup))
	       (mtdb     (dbr:dbstruct-mtdb dbstruct))
	       (mtpath   (db:dbdat-get-path mtdb)))
	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
	  (let loop ()
	    ;; sync for filesystem local db writes
	    ;;
	    (mutex-lock! *db-multi-sync-mutex*)
	    (let* ((need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
		   (sync-in-progress *db-sync-in-progress*)
		   (should-sync      (and (not *time-to-exit*)
                                          (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum
		   (start-time       (current-seconds))
		   (mt-mod-time      (file-modification-time mtpath))
		   (recently-synced  (< (- start-time mt-mod-time) 4))
		   (will-sync        (and (or need-sync should-sync)
					  (not sync-in-progress)
					  (not recently-synced))))
              (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop.  need-sync="need-sync" sync-in-progress="sync-in-progress" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync)
	      ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced))
	      ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
	      (if will-sync (set! *db-sync-in-progress* #t))
	      (mutex-unlock! *db-multi-sync-mutex*)
	      (if will-sync
		  (let ((res (common:sync-to-megatest.db dbstruct))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
		    (if (> res 0) ;; some records were transferred, keep the db alive
			(begin
			  (mutex-lock! *heartbeat-mutex*)
			  (set! *db-last-access* (current-seconds))
			  (mutex-unlock! *heartbeat-mutex*)
			  (debug:print-info 0 *default-log-port* "sync called, " res " records transferred."))
			(debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))
	      (if will-sync
		  (begin
		    (mutex-lock! *db-multi-sync-mutex*)
		    (set! *db-sync-in-progress* #f)
		    (set! *db-last-sync* start-time)
		    (mutex-unlock! *db-multi-sync-mutex*)))
	      (if (and debug-mode
		       (> (- start-time last-time) 60))
		  (begin
		    (set! last-time start-time)
		    (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
	    
	    ;; keep going unless time to exit
	    ;;
	    (if (not *time-to-exit*)
		(let delay-loop ((count 0))
                  ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
                                                            
		  (if (and (not *time-to-exit*)
			   (< count 4)) ;; was 11, changing to 4. 
		      (begin
			(thread-sleep! 1)
			(delay-loop (+ count 1))))
		  (if (not *time-to-exit*) (loop))))
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))

;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  (if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))
	     (else
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	      (common:writable-watchdog dbstruct)))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)







|
>
|
|




<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<













|







668
669
670
671
672
673
674
675
676
677
678
679
680
681
682






































































683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
        (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync)
        (if (and (not *time-to-exit*)
                 (< duration-since-last-sync sync-cool-off-duration))
            (thread-sleep! (- sync-cool-off-duration duration-since-last-sync)))
        (if (not *time-to-exit*)
            (let ((golden-mtdb-mtime (file-modification-time golden-mtpath))
                  (tmp-mtdb-mtime    (file-modification-time tmp-mtpath)))
	      (if (> golden-mtdb-mtime tmp-mtdb-mtime)
		  (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back
		      (let ((res (db:multi-db-sync dbstruct 'old2new)))
			(debug:print-info 13 *default-log-port* "rosync called, " res " records transferred."))))
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath)))







































































;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  (if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))
	     (else
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
	      (server:writable-watchdog dbstruct)))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)

Modified db.scm from [de6a9e2632] to [89c961e6e8].

667
668
669
670
671
672
673
674
675

676




677
678
679
680
681







682




683
684
685
686
687
688
689
690
691
692
693
694
695
696
     (let ((stmts       (make-hash-table)) ;; table-field => stmt
	   (all-stmts   '())              ;; ( ( stmt1 value1 ) ( stml2 value2 ))
	   (numrecs     (make-hash-table))
	   (start-time  (current-milliseconds))
	   (tot-count   0))
       (for-each ;; table
	(lambda (tabledat)
	  (let* ((tablename  (car tabledat))
		 (fields     (cdr tabledat))

		 (use-last-update  (if last-update




				       (if (pair? last-update)
					   (member (car last-update)    ;; last-update field name
						   (map car fields))
					   (begin
					     (debug:print 0 *default-log-port* "ERROR: parameter last-update for db:sync-tables must be a pair, received: " last-update) ;; found in fields







					     #f))




				       #f))
		 (num-fields (length fields))
		 (field->num (make-hash-table))
		 (num->field (apply vector (map car fields)))
		 (full-sel   (conc "SELECT " (string-intersperse (map car fields) ",") 
				   " FROM " tablename (if use-last-update ;; apply last-update criteria
							  (conc " " (car last-update) ">=" (cdr last-update))
							  "")
				   ";"))
		 (full-ins   (conc "INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) "
				   " VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );"))
		 (fromdat    '())
		 (fromdats   '())
		 (totrecords 0)







|
|
>
|
>
>
>
>
|
|
|
|
|
>
>
>
>
>
>
>
|
>
>
>
>
|





|







667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
     (let ((stmts       (make-hash-table)) ;; table-field => stmt
	   (all-stmts   '())              ;; ( ( stmt1 value1 ) ( stml2 value2 ))
	   (numrecs     (make-hash-table))
	   (start-time  (current-milliseconds))
	   (tot-count   0))
       (for-each ;; table
	(lambda (tabledat)
	  (let* ((tablename        (car tabledat))
		 (fields           (cdr tabledat))
		 (has-last-update  (member "last_update" fields))
		 (use-last-update  (cond
				    ((and has-last-update
					  (member "last_update" fields))
				     #t) ;; if given a number, just use it for all fields
				    ((number? last-update) #f) ;; if not matched first entry then ignore last-update for this table
				    ((and (pair? last-update)
					  (member (car last-update)    ;; last-update field name
						  (map car fields))) #t)
				    (last-update
				     (debug:print 0 *default-log-port* "ERROR: parameter last-update for db:sync-tables must be a pair or a number, received: " last-update) ;; found in fields
				     #f)
				    (else
				     #f)))
		 (last-update-value (if use-last-update ;; no need to check for has-last-update - it is already accounted for
					(if (number? last-update)
					    last-update
					    (cdr last-update))
					#f))
		 (last-update-field (if use-last-update
					(if (number? last-update)
					    "last_update"
					    (car last-update))
					#f))
		 (num-fields (length fields))
		 (field->num (make-hash-table))
		 (num->field (apply vector (map car fields)))
		 (full-sel   (conc "SELECT " (string-intersperse (map car fields) ",") 
				   " FROM " tablename (if use-last-update ;; apply last-update criteria
							  (conc " WHERE " last-update-field " >= " last-update-value)
							  "")
				   ";"))
		 (full-ins   (conc "INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) "
				   " VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );"))
		 (fromdat    '())
		 (fromdats   '())
		 (totrecords 0)
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
	     (res      '())
	     (last-update (if use-last-update (cons "last_update" targ-db-last-mod) #f)))
	(db:sync-tables (db:sync-main-list source-db) last-update source-db cache-db)
	(db:sync-tables db:sync-tests-only last-update source-db cache-db)
	(hash-table-set! *global-db-store* target cache-db)
	cache-db)))

;; call a proc with a cached db
;;
(define (db:call-with-cached-db proc . params)
  ;; first cache the db in /tmp
  (let* ((cname-part (conc "megatest_cache/" (common:get-testsuite-name)))
	 (fname      (conc  (common:get-area-path-signature) ".db"))
	 (cache-dir  (common:get-create-writeable-dir
		      (list (conc "/tmp/" (current-user-name) "/" cname-part)
			    (conc "/tmp/" (current-user-name) "-" cname-part)
			     (conc "/tmp/" (current-user-name) "_" cname-part))))
	 (megatest-db (conc *toppath* "/megatest.db")))
    ;; (debug:print-info 0 *default-log-port* "Using cache dir " cache-dir)
    (if (not cache-dir)
	(begin
	  (debug:print 0 *default-log-port* "ERROR: Failed to find an area to write the cache db")
	  (exit 1))
	(let* ((th1      (make-thread
			  (lambda ()
			    (if (and (file-exists? megatest-db)
				     (file-write-access? megatest-db))
				(begin
				  (common:sync-to-megatest.db 'timestamps) ;; internally mutexes on *db-local-sync*
				  (debug:print-info 2 *default-log-port* "Done syncing to megatest.db"))))
			  "call-with-cached-db sync-to-megatest.db"))
	       (cache-db (db:cache-for-read-only
			  megatest-db
			  (conc cache-dir "/" fname)
			  use-last-update: #t)))
	  (thread-start! th1)
	  (apply proc cache-db params)
	  ))))

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
;;  'adj-testids  - move test-ids into correct ranges
;;  'old2new      - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
	     (res      '())
	     (last-update (if use-last-update (cons "last_update" targ-db-last-mod) #f)))
	(db:sync-tables (db:sync-main-list source-db) last-update source-db cache-db)
	(db:sync-tables db:sync-tests-only last-update source-db cache-db)
	(hash-table-set! *global-db-store* target cache-db)
	cache-db)))

;; ;; call a proc with a cached db
;; ;;
;; (define (db:call-with-cached-db proc . params)
;;   ;; first cache the db in /tmp
;;   (let* ((cname-part (conc "megatest_cache/" (common:get-testsuite-name)))
;; 	 (fname      (conc  (common:get-area-path-signature) ".db"))
;; 	 (cache-dir  (common:get-create-writeable-dir
;; 		      (list (conc "/tmp/" (current-user-name) "/" cname-part)
;; 			    (conc "/tmp/" (current-user-name) "-" cname-part)
;; 			     (conc "/tmp/" (current-user-name) "_" cname-part))))
;; 	 (megatest-db (conc *toppath* "/megatest.db")))
;;     ;; (debug:print-info 0 *default-log-port* "Using cache dir " cache-dir)
;;     (if (not cache-dir)
;; 	(begin
;; 	  (debug:print 0 *default-log-port* "ERROR: Failed to find an area to write the cache db")
;; 	  (exit 1))
;; 	(let* ((th1      (make-thread
;; 			  (lambda ()
;; 			    (if (and (file-exists? megatest-db)
;; 				     (file-write-access? megatest-db))
;; 				(begin
;; 				  (db:sync-to-megatest.db dbstruct 'timestamps) ;; internally mutexes on *db-local-sync*
;; 				  (debug:print-info 2 *default-log-port* "Done syncing to megatest.db"))))
;; 			  "call-with-cached-db sync-to-megatest.db"))
;; 	       (cache-db (db:cache-for-read-only
;; 			  megatest-db
;; 			  (conc cache-dir "/" fname)
;; 			  use-last-update: #t)))
;; 	  (thread-start! th1)
;; 	  (apply proc cache-db params)
;; 	  ))))

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
;;  'adj-testids  - move test-ids into correct ranges
;;  'old2new      - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db
1012
1013
1014
1015
1016
1017
1018























1019
1020
1021
1022
1023
1024
1025
	  (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
       
       (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
     options)
    data-synced))
























;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
  (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...")
  (exit)
  (if (or *db-write-access*







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
	  (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
       
       (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
     options)
    data-synced))

(define (db:tmp->megatest.db-sync dbstruct last-update)
  (let* ((mtdb        (dbr:dbstruct-mtdb dbstruct))
	 (tmpdb       (db:get-db dbstruct))
	 (refndb      (dbr:dbstruct-refndb dbstruct)))
    (db:sync-tables (db:sync-all-tables-list dbstruct) last-update tmpdb refndb mtdb)))

;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
;;    if #t use timestamps      : or 'timestamps
(define (db:sync-to-megatest.db dbstruct #!key (no-sync-db #f)) 
  (let* ((start-time         (current-seconds))
	 (last-update        (if no-sync-db
				 (db:no-sync-get/default no-sync-db "LAST_UPDATE" 0)
				 0)) ;; (or (db:get-var dbstruct "LAST_UPDATE") 0))
	 (res                (db:tmp->megatest.db-sync dbstruct last-update)))
    (let ((sync-time (- (current-seconds) start-time)))
      (if no-sync-db
	  (db:no-sync-set no-sync-db "LAST_UPDATE" start-time))
      (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))
      (if (common:low-noise-print 30 "sync new to old")
	  (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))))
    res))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
  (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...")
  (exit)
  (if (or *db-write-access*
1782
1783
1784
1785
1786
1787
1788





























1789
1790
1791
1792
1793
1794
1795
	      (lambda (db)
		(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val))))

(define (db:del-var dbstruct var)
  (db:with-db dbstruct #f #t 
	      (lambda (db)
		(sqlite3:execute db "DELETE FROM metadat WHERE var=?;" var))))






























;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never
;; change

;; why get the keys from the db? why not get from the *configdat*
;; using keys:config-get-fields?







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
	      (lambda (db)
		(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val))))

(define (db:del-var dbstruct var)
  (db:with-db dbstruct #f #t 
	      (lambda (db)
		(sqlite3:execute db "DELETE FROM metadat WHERE var=?;" var))))

;;======================================================================
;; no-sync.db - small bits of data to be shared between servers
;;======================================================================

(define (db:open-no-sync-db)
  (let* ((dbpath (db:dbfile-path))
	 (dbname (conc dbpath "/no-sync.db"))
	 (db     (sqlite3:open-database dbname)))
    (sqlite3:set-busy-handler! db (make-busy-timeout 136000))
    (sqlite3:execute db "PRAGMA synchronous = 0;")
    (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));")
    db))

(define (db:no-sync-set db var val)
  (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val))

(define (db:no-sync-get/default db var default)
  (let ((res default))
    (sqlite3:for-each-row
     (lambda (val)
       (set! res val))
     db
     "SELECT val FROM no_sync_metadat WHERE var=?;"
     var)
    res))

(define (db:no-sync-close-db db)
  (db:safely-close-sqlite3-db db))

;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never
;; change

;; why get the keys from the db? why not get from the *configdat*
;; using keys:config-get-fields?

Modified http-transport.scm from [e6367359e1] to [1066bc8fad].

490
491
492
493
494
495
496


497
498
499
500
501
502
503
504
505
506
507
508
509
510
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch)


  ;; (if (args:get-arg "-daemonize")
  ;;     (begin
  ;; 	(daemon:ize)
  ;; 	(if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
  ;; 	    (begin
  ;; 	      (current-error-port *alt-log-file*)
  ;; 	      (current-output-port *alt-log-file*)))))
  (let* ((th2 (make-thread (lambda ()
			     (debug:print-info 0 *default-log-port* "Server run thread started")
			     (http-transport:run 
			      (if (args:get-arg "-server")
				  (args:get-arg "-server")
				  "-")
			      )) "Server run"))







>
>
|
|
<
<
|
<
|







490
491
492
493
494
495
496
497
498
499
500


501

502
503
504
505
506
507
508
509
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch)
  ;; lets not even bother to start if there are already three or more server files ready to go
  (let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
    (if (> num-alive 3)
	(begin


	  (debug:print 0 *default-log-port* "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")

	  (exit))))
  (let* ((th2 (make-thread (lambda ()
			     (debug:print-info 0 *default-log-port* "Server run thread started")
			     (http-transport:run 
			      (if (args:get-arg "-server")
				  (args:get-arg "-server")
				  "-")
			      )) "Server run"))

Modified rmt.scm from [7ffa1b548f] to [bcfa64a652].

122
123
124
125
126
127
128




129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
     ;; readonly mode, write request.  Do nothing, return #f
     (readonly-mode
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3")
      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f)





     ;;DOT CASE4 [label="reset\nconnection"];
     ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
     ;;DOT CASE4 -> "rmt:send-receive";
     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response"
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
      (http-transport:close-connections area-dat: runremote)
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;;DOT CASE5 [label="local\nread"];
     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
     ;; on homehost and this is a read
     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))       ;; on homehost







>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|







122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
     ;; readonly mode, write request.  Do nothing, return #f
     (readonly-mode
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3")
      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f)

     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
     ;;
     ;; ;;DOT CASE4 [label="reset\nconnection"];
     ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
     ;; ;;DOT CASE4 -> "rmt:send-receive";
     ;; ;; reset the connection if it has been unused too long
     ;; ((and runremote
     ;;       (remote-conndat runremote)
     ;; 	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response"
     ;; 	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
     ;;  (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
     ;;  (http-transport:close-connections area-dat: runremote)
     ;;  (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
     ;;  (mutex-unlock! *rmt-mutex*)
     ;;  (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;;DOT CASE5 [label="local\nread"];
     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
     ;; on homehost and this is a read
     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))       ;; on homehost

Modified server.scm from [9517dd6c38] to [f8280a94d4].

204
205
206
207
208
209
210
211














212
213
214
215
216
217
218
				     res
				     (cons (append serv-rec (list pid)) res))))
		(if (null? tal)
		    (if (and limit
			     (> (length new-res) limit))
			new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
			new-res)
		      (loop (car tal)(cdr tal) new-res)))))))))















;; given a list of servers get a list of valid servers, i.e. at least
;; 10 seconds old, has started and is less than 1 hour old and is
;; active (i.e. mod-time < 10 seconds
;;
;; mod-time host port start-time pid
;;







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>







204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
				     res
				     (cons (append serv-rec (list pid)) res))))
		(if (null? tal)
		    (if (and limit
			     (> (length new-res) limit))
			new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
			new-res)
		    (loop (car tal)(cdr tal) new-res)))))))))

(define (server:get-num-alive srvlst)
  (let ((num-alive 0))
    (for-each
     (lambda (server)
       (match-let (((mod-time host port start-time pid)
		    server))
	 (let* ((uptime  (- (current-seconds) mod-time))
		(runtime (if start-time
			     (- mod-time start-time)
			     0)))
	   (if (< uptime 5)(set! num-alive (+ num-alive 1))))))
     srvlst)
    num-alive))

;; given a list of servers get a list of valid servers, i.e. at least
;; 10 seconds old, has started and is less than 1 hour old and is
;; active (i.e. mod-time < 10 seconds
;;
;; mod-time host port start-time pid
;;
433
434
435
436
437
438
439









































































	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	;;(* 60 60 1)     ;; default to one hour
	(* 60 5)          ;; default to five minutes
	)))

















































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	;;(* 60 60 1)     ;; default to one hour
	(* 60 5)          ;; default to five minutes
	)))

;; moving this here as it needs access to db and cannot be in common.
;;
(define (server:writable-watchdog dbstruct)
  (thread-sleep! 0.05) ;; delay for startup
  (let ((legacy-sync  (common:run-sync?))
	(debug-mode   (debug:debug-mode 1))
	(last-time    (current-seconds))
	(no-sync-db   (db:open-no-sync-db))
        (this-wd-num  (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))))
    (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
    (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num)
    (if (and legacy-sync (not *time-to-exit*))
	(let* (;;(dbstruct (db:setup))
	       (mtdb     (dbr:dbstruct-mtdb dbstruct))
	       (mtpath   (db:dbdat-get-path mtdb)))
	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
	  (let loop ()
	    ;; sync for filesystem local db writes
	    ;;
	    (mutex-lock! *db-multi-sync-mutex*)
	    (let* ((need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
		   (sync-in-progress *db-sync-in-progress*)
		   (should-sync      (and (not *time-to-exit*)
                                          (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum
		   (start-time       (current-seconds))
		   (mt-mod-time      (file-modification-time mtpath))
		   (recently-synced  (< (- start-time mt-mod-time) 4))
		   (will-sync        (and (or need-sync should-sync)
					  (not sync-in-progress)
					  (not recently-synced))))
              (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop.  need-sync="need-sync" sync-in-progress="sync-in-progress" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync)
	      ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced))
	      ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
	      (if will-sync (set! *db-sync-in-progress* #t))
	      (mutex-unlock! *db-multi-sync-mutex*)
	      (if will-sync
		  (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
		    (if (> res 0) ;; some records were transferred, keep the db alive
			(begin
			  (mutex-lock! *heartbeat-mutex*)
			  (set! *db-last-access* (current-seconds))
			  (mutex-unlock! *heartbeat-mutex*)
			  (debug:print-info 0 *default-log-port* "sync called, " res " records transferred."))
			(debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))
	      (if will-sync
		  (begin
		    (mutex-lock! *db-multi-sync-mutex*)
		    (set! *db-sync-in-progress* #f)
		    (set! *db-last-sync* start-time)
		    (mutex-unlock! *db-multi-sync-mutex*)))
	      (if (and debug-mode
		       (> (- start-time last-time) 60))
		  (begin
		    (set! last-time start-time)
		    (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
	    
	    ;; keep going unless time to exit
	    ;;
	    (if (not *time-to-exit*)
		(let delay-loop ((count 0))
                  ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
                                                            
		  (if (and (not *time-to-exit*)
			   (< count 4)) ;; was 11, changing to 4. 
		      (begin
			(thread-sleep! 1)
			(delay-loop (+ count 1))))
		  (if (not *time-to-exit*) (loop))))
	    ;; time to exit, close the no-sync db here
	    (db:no-sync-close-db no-sync-db)
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))

Modified tests/fdktestqa/testqa/megatest.config from [a5c70b7363] to [96a4d22c9d].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 1

[server]
# runtime 180
# timeout is in hours
timeout 100

# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]



|



|
|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 0.1

[server]
# runtime 180
# timeout is in hours, this is how long the server will stay alive when not being used.
timeout 0.1

# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]