Megatest

Check-in [b260c4c5d9]
Login
Overview
Comment:http transport with try-nanomsg changes completed first pass Validation 100%
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: b260c4c5d969da46f3a01ba6188986d4f9eb977a
User & Date: matt on 2014-11-29 20:33:57
Other Links: branch diff | manifest | tags
Context
2014-11-29
21:50
Added a little time spread on client starting servers - try to avoid startup storms check-in: b3a83a0cec user: matt tags: try-nanomsg
20:33
http transport with try-nanomsg changes completed first pass Validation 100% check-in: b260c4c5d9 user: matt tags: try-nanomsg
13:43
Fixed breakage caused by enabling newdashboard. Sigh. check-in: 2f12af3ef0 user: matt tags: try-nanomsg
Changes

Modified client.scm from [eb2cd66f13] to [8f8fe7296f].

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 2) (failed-connects 0))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))







|







51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
84
85
86
87
88
89
90

91
92
93
94
95
96
97
		      (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
			((http)(http-transport:close-connections run-id)))
		      (hash-table-delete! *runremote* run-id)

		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (server:try-running run-id)
		      (thread-sleep! 5) ;; give server a little time to start up







>







84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
		      (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
			((http)(http-transport:close-connections run-id)))
		      (hash-table-delete! *runremote* run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (server:try-running run-id)
		      (thread-sleep! 5) ;; give server a little time to start up

Modified common.scm from [7a1802c2d4] to [5faac0528e].

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
(define *task-db*             #f) ;; (vector db path-to-db)
(define *db-access-allowed*   #t) ;; flag to allow access
(define *db-access-mutex*     (make-mutex))

;; SERVER
(define *my-client-signature* #f)
;; (define *transport-type*    'nmsg)
(define *transport-type*    'http)
(define *runremote*         (make-hash-table)) ;; if set up for server communication this will hold <host port>
(define *max-cache-size*    0)
(define *logged-in-clients* (make-hash-table))
(define *client-non-blocking-mode* #f)
(define *server-id*         #f)
(define *server-info*       #f)
(define *time-to-exit*      #f)







|







64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
(define *task-db*             #f) ;; (vector db path-to-db)
(define *db-access-allowed*   #t) ;; flag to allow access
(define *db-access-mutex*     (make-mutex))

;; SERVER
(define *my-client-signature* #f)
;; (define *transport-type*    'nmsg)
(define *transport-type*    'http)             ;; override with [server] transport http|rpc|nmsg
(define *runremote*         (make-hash-table)) ;; if set up for server communication this will hold <host port>
(define *max-cache-size*    0)
(define *logged-in-clients* (make-hash-table))
(define *client-non-blocking-mode* #f)
(define *server-id*         #f)
(define *server-info*       #f)
(define *time-to-exit*      #f)

Modified db.scm from [e95ebbcbeb] to [ad92b83fd4].

615
616
617
618
619
620
621

622
623
624
625
626
627
628
;;
;;  run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync run-ids . options)
  (let* ((toppath  (launch:setup-for-run))
	 (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
	 (mtdb     (if toppath (db:open-megatest-db)))

	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy mtdb)
				     (db:get-all-run-ids mtdb)))))
	 (tdbdat  (tasks:open-db))
	 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))







>







615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
;;
;;  run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync run-ids . options)
  (let* ((toppath  (launch:setup-for-run))
	 (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
	 (mtdb     (if toppath (db:open-megatest-db)))
	 (allow-cleanup (if run-ids #f #t))
	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy mtdb)
				     (db:get-all-run-ids mtdb)))))
	 (tdbdat  (tasks:open-db))
	 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
661
662
663
664
665
666
667


668






669
670


671
672
673
674

675

676



677


678









679
680
681
682
683
684
685
		   (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	       (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db")
	       (db:replace-test-records dbstruct run-id testrecs)
	       (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct)))))
	   run-ids)))

    ;; now ensure all newdb data are synced to megatest.db


    (if (member 'new2old options)






	(for-each
	 (lambda (run-id)


	   (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))
		  (frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
	     ;; (db:delay-if-busy frundb)
	     ;; (db:delay-if-busy mtdb)

	     (if (eq? run-id 0)

		 (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb)



		 (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb))))


	 (cons 0 run-ids)))









    ;; (db:close-all dbstruct)
    ;; (sqlite3:finalize! mdb)
    ))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)







>
>

>
>
>
>
>
>
|
|
>
>
|
|
|
|
>
|
>
|
>
>
>
|
>
>
|
>
>
>
>
>
>
>
>
>







662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
		   (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	       (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db")
	       (db:replace-test-records dbstruct run-id testrecs)
	       (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct)))))
	   run-ids)))

    ;; now ensure all newdb data are synced to megatest.db
    ;; do not use the run-ids list passed in to the function
    ;;
    (if (member 'new2old options)
	(let* ((maindb      (make-dbr:dbstruct path: toppath local: #t))
	       (src-run-ids (db:get-all-run-ids (db:dbdat-get-db (db:get-db maindb 0))))
	       (all-run-ids (sort (delete-duplicates (cons 0 src-run-ids)) <))
	       (count       1)
	       (total       (length all-run-ids))
	       (dead-runs  '()))
	  (for-each
	   (lambda (run-id)
	     (debug:print 0 "Processing run " (if (eq? run-id 0) " main.db " run-id) ", " count " of " total)
	     (set! count (+ count 1))
	     (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))
		    (frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
	       ;; (db:delay-if-busy frundb)
	       ;; (db:delay-if-busy mtdb)
	       ;; (db:clean-up frundb)
	       (if (eq? run-id 0)
		   (begin
		     (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb)
		     (set! dead-runs (db:clean-up-maindb (db:get-db fromdb #f))))
		   (begin
		     ;; NB// must sync first to ensure deleted tests get marked as such in megatest.db
		     (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb)
		     (db:clean-up-rundb (db:get-db fromdb run-id))
		     ))))
	   all-run-ids)
	  ;; removed deleted runs
	  (let ((dbdir (tasks:get-task-db-path)))
	    (for-each (lambda (run-id)
			(let ((fullname (conc dbdir "/" run-id ".db")))
			  (if (file-exists? fullname)
			      (begin
				(debug:print 0 "Removing database file for deleted run " fullname)
				(delete-file fullname)))))
		      dead-runs))))
    ;; (db:close-all dbstruct)
    ;; (sqlite3:finalize! mdb)
    ))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'
;;    b. If test dir gone, delete the test record
;; 2. Look at run records
;;    a. If have tests that are not deleted, set state='unknown'
;;    b. ....
;;
(define (db:clean-up dbdat)
  (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db")
  (let* ((db         (db:dbdat-get-db dbdat))
	 (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests)+(SELECT count(id) FROM runs);"))
	(statements
	 (map (lambda (stmt)
		(sqlite3:prepare db stmt))
	      (list
	       ;; delete all tests that belong to runs that are 'deleted'







|







1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'
;;    b. If test dir gone, delete the test record
;; 2. Look at run records
;;    a. If have tests that are not deleted, set state='unknown'
;;    b. ....
;;
(define (db:clean-up dbdat)
  ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db")
  (let* ((db         (db:dbdat-get-db dbdat))
	 (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests)+(SELECT count(id) FROM runs);"))
	(statements
	 (map (lambda (stmt)
		(sqlite3:prepare db stmt))
	      (list
	       ;; delete all tests that belong to runs that are 'deleted'
1025
1026
1027
1028
1029
1030
1031

























































































1032
1033
1034
1035
1036
1037
1038
			       (debug:print-info 0 "Records count after  clean: " tot))
			     count-stmt)))
    (map sqlite3:finalize! statements)
    (sqlite3:finalize! count-stmt)
    ;; (db:find-and-mark-incomplete db)
    (db:delay-if-busy dbdat)
    (sqlite3:execute db "VACUUM;")))


























































































;;======================================================================
;; M E T A   G E T   A N D   S E T   V A R S
;;======================================================================

;; returns number if string->number is successful, string otherwise
;; also updates *global-delta*







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
			       (debug:print-info 0 "Records count after  clean: " tot))
			     count-stmt)))
    (map sqlite3:finalize! statements)
    (sqlite3:finalize! count-stmt)
    ;; (db:find-and-mark-incomplete db)
    (db:delay-if-busy dbdat)
    (sqlite3:execute db "VACUUM;")))

;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
;;
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'
;;    b. If test dir gone, delete the test record
;; 2. Look at run records
;;    a. If have tests that are not deleted, set state='unknown'
;;    b. ....
;;
(define (db:clean-up-rundb dbdat)
  ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db")
  (let* ((db         (db:dbdat-get-db dbdat))
	 (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);"))
	(statements
	 (map (lambda (stmt)
		(sqlite3:prepare db stmt))
	      (list
	       ;; delete all tests that belong to runs that are 'deleted'
	       ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");")
	       ;; delete all tests that are 'DELETED'
	       "DELETE FROM tests WHERE state='DELETED';"
	       ))))
    (db:delay-if-busy dbdat)
    (sqlite3:with-transaction 
     db
     (lambda ()
       (sqlite3:for-each-row (lambda (tot)
			       (debug:print-info 0 "Records count before clean: " tot))
			     count-stmt)
       (map sqlite3:execute statements)
       (sqlite3:for-each-row (lambda (tot)
			       (debug:print-info 0 "Records count after  clean: " tot))
			     count-stmt)))
    (map sqlite3:finalize! statements)
    (sqlite3:finalize! count-stmt)
    ;; (db:find-and-mark-incomplete db)
    (db:delay-if-busy dbdat)
    (sqlite3:execute db "VACUUM;")))

;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
;;
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'
;;    b. If test dir gone, delete the test record
;; 2. Look at run records
;;    a. If have tests that are not deleted, set state='unknown'
;;    b. ....
;;
(define (db:clean-up-maindb dbdat)
  ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db")
  (let* ((db         (db:dbdat-get-db dbdat))
	 (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM runs);"))
	 (statements
	  (map (lambda (stmt)
		 (sqlite3:prepare db stmt))
	       (list
		;; delete all tests that belong to runs that are 'deleted'
		;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");")
		;; delete all tests that are 'DELETED'
		"DELETE FROM runs WHERE state='deleted';"
		)))
	 (dead-runs '()))
    (sqlite3:for-each-row
     (lambda (run-id)
       (set! dead-runs (cons run-id dead-runs)))
       db
       "SELECT id FROM runs WHERE state='deleted';")
    (db:delay-if-busy dbdat)
    (sqlite3:with-transaction 
     db
     (lambda ()
       (sqlite3:for-each-row (lambda (tot)
			       (debug:print-info 0 "Records count before clean: " tot))
			     count-stmt)
       (map sqlite3:execute statements)
       (sqlite3:for-each-row (lambda (tot)
			       (debug:print-info 0 "Records count after  clean: " tot))
			     count-stmt)))
    (map sqlite3:finalize! statements)
    (sqlite3:finalize! count-stmt)
    ;; (db:find-and-mark-incomplete db)
    (db:delay-if-busy dbdat)
    (sqlite3:execute db "VACUUM;")
    dead-runs))

;;======================================================================
;; M E T A   G E T   A N D   S E T   V A R S
;;======================================================================

;; returns number if string->number is successful, string otherwise
;; also updates *global-delta*
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
	     (z3:decode-buffer
	      (base64:base64-decode
	       (string-substitute 
		(regexp "_") "=" msg #t)))
	   (lambda ()(deserialize)))
	 (begin
	   (debug:print 0 "ERROR: reception failed. Received " msg " but cannot translate it.")
	   #f))) ;; crude reply for when things go awry
    ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize))))
    (else msg)))

(define (db:test-set-status-state dbstruct run-id test-id status state msg)
  (let ((dbdat  (db:get-db dbstruct run-id)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbdat 'set-test-start-time (list test-id)))







|







2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
	     (z3:decode-buffer
	      (base64:base64-decode
	       (string-substitute 
		(regexp "_") "=" msg #t)))
	   (lambda ()(deserialize)))
	 (begin
	   (debug:print 0 "ERROR: reception failed. Received " msg " but cannot translate it.")
	   msg))) ;; crude reply for when things go awry
    ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize))))
    (else msg)))

(define (db:test-set-status-state dbstruct run-id test-id status state msg)
  (let ((dbdat  (db:get-db dbstruct run-id)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbdat 'set-test-start-time (list test-id)))

Modified http-transport.scm from [fc5c6a3a75] to [4218ba432d].

281
282
283
284
285
286
287
288
289
290

291
292
293
294
295
296
297
			      (set! res (vector
					 success
					 (db:string->obj 
					  (handle-exceptions
					   exn
					   (begin
					     (set! success #f)
					     (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ". Killing associated server to allow clean retry.")
					     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
					     (hash-table-delete! *runremote* run-id)

					     ;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine.
					     #f)
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))







|


>







281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
			      (set! res (vector
					 success
					 (db:string->obj 
					  (handle-exceptions
					   exn
					   (begin
					     (set! success #f)
					     (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".")
					     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
					     (hash-table-delete! *runremote* run-id)
					     ;; Killing associated server to allow clean retry.")
					     ;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine.
					     #f)
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0)
		 res
		 (begin ;; note: this code also called in nmsg-transport - consider consolidating it
		   (debug:print 0 "ERROR: error occured at server, info=" (vector-ref result 2))
		   (debug:print 0 " client call chain:")
		   (print-call-chain (current-error-port))
		   (debug:print 0 " server call chain:")
		   (pp (vector-ref result 1) (current-error-port))
		   (signal (vector-ref result 0))))
	     (signal (make-composite-condition
		      (make-property-condition 
		       'timeout
		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))

;; careful closing of connections stored in *runremote*







|



|







312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0)
		 res
		 (begin ;; note: this code also called in nmsg-transport - consider consolidating it
		   (debug:print 0 "ERROR: error occured at server, info=" (vector-ref res 2))
		   (debug:print 0 " client call chain:")
		   (print-call-chain (current-error-port))
		   (debug:print 0 " server call chain:")
		   (pp (vector-ref res 1) (current-error-port))
		   (signal (vector-ref result 0))))
	     (signal (make-composite-condition
		      (make-property-condition 
		       'timeout
		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))

;; careful closing of connections stored in *runremote*

Modified launch.scm from [34882953c1] to [111f911e42].

456
457
458
459
460
461
462







463
464
465
466
467
468
469
	(set! *configinfo* (find-and-read-config 
			    (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config")
			    environ-patt: "env-override"
			    given-toppath: (get-environment-variable "MT_RUN_AREA_HOME")
			    pathenvvar: "MT_RUN_AREA_HOME"))
	(set! *configdat*  (if (car *configinfo*)(car *configinfo*) #f))
	(set! *toppath*    (if (car *configinfo*)(cadr *configinfo*) #f))







	(let ((linktree (configf:lookup *configdat* "setup" "linktree"))) ;; link tree is critical
	  (if linktree
	      (if (not (file-exists? linktree))
		  (begin
		    (handle-exceptions
		     exn
		     (begin







>
>
>
>
>
>
>







456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
	(set! *configinfo* (find-and-read-config 
			    (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config")
			    environ-patt: "env-override"
			    given-toppath: (get-environment-variable "MT_RUN_AREA_HOME")
			    pathenvvar: "MT_RUN_AREA_HOME"))
	(set! *configdat*  (if (car *configinfo*)(car *configinfo*) #f))
	(set! *toppath*    (if (car *configinfo*)(cadr *configinfo*) #f))
	(let* ((tmptransport (configf:lookup *configdat* "server" "transport"))
	       (transport    (if tmptransport (string->symbol tmptransport) 'http)))
	  (if (member transport '(http rpc nmsg))
	      (set! *transport-type* transport)
	      (begin
		(debug:print 0 "ERROR: Unrecognised transport " transport)
		(exit))))
	(let ((linktree (configf:lookup *configdat* "setup" "linktree"))) ;; link tree is critical
	  (if linktree
	      (if (not (file-exists? linktree))
		  (begin
		    (handle-exceptions
		     exn
		     (begin

Modified megatest.scm from [16af3583d8] to [f155a05495].

281
282
283
284
285
286
287

288
289
290
291
292


293
294
295
296
297
298
299
			"-q" ;; quiet 0, errors/warnings only
		       )
		 args:arg-hash
		 0))

;; The watchdog is to keep an eye on things like db sync etc.
;;

(define *watchdog*
  (make-thread 
   (lambda ()
     (thread-sleep! 0.05) ;; delay for startup
     (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db")))


       (let loop ()
	 ;; sync for filesystem local db writes
	 ;;
	 (let ((start-time      (current-seconds))
	       (servers-started (make-hash-table)))
	   (for-each 
	    (lambda (run-id)







>




|
>
>







281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
			"-q" ;; quiet 0, errors/warnings only
		       )
		 args:arg-hash
		 0))

;; The watchdog is to keep an eye on things like db sync etc.
;;
(define *time-zero* (current-seconds))
(define *watchdog*
  (make-thread 
   (lambda ()
     (thread-sleep! 0.05) ;; delay for startup
     (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db"))
	   (debug-mode  (debug:debug-mode 1))
	   (last-time   (current-seconds)))
       (let loop ()
	 ;; sync for filesystem local db writes
	 ;;
	 (let ((start-time      (current-seconds))
	       (servers-started (make-hash-table)))
	   (for-each 
	    (lambda (run-id)
308
309
310
311
312
313
314
315





316
317
318
319
320
321
322
323
324
325
326
327
328
			  (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds")))
		    ;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run
		    ;;     (begin
		    ;;       (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id)
		    ;;       (server:kind-run run-id)))))
		    (hash-table-delete! *db-local-sync* run-id)))
	      (mutex-unlock! *db-multi-sync-mutex*))
	    (hash-table-keys *db-local-sync*)))






	 ;; keep going unless time to exit
	 ;;
	 (if (not *time-to-exit*)
	     (begin
	       (thread-sleep! 1) ;; wait one second before syncing again
	       (loop)))))
     "Watchdog thread")))

(thread-start! *watchdog*)

(if (args:get-arg "-log")
    (let ((oup (open-output-file (args:get-arg "-log"))))







|
>
>
>
>
>
|




|







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
			  (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds")))
		    ;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run
		    ;;     (begin
		    ;;       (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id)
		    ;;       (server:kind-run run-id)))))
		    (hash-table-delete! *db-local-sync* run-id)))
	      (mutex-unlock! *db-multi-sync-mutex*))
	    (hash-table-keys *db-local-sync*))
	   (if (and debug-mode
		    (> (- start-time last-time) 14))
	       (begin
		 (set! last-time start-time)
		 (debug:print-info 0 "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
	 
	 ;; keep going unless time to exit
	 ;;
	 (if (not *time-to-exit*)
	     (begin
	       (thread-sleep! 5) ;; wait five seconds before syncing again, we'll also sync on exit
	       (loop)))))
     "Watchdog thread")))

(thread-start! *watchdog*)

(if (args:get-arg "-log")
    (let ((oup (open-output-file (args:get-arg "-log"))))

Modified rmt.scm from [7ce19b3436] to [fbd1a17708].

62
63
64
65
66
67
68

69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98


99
100
101
102
103
104
105
106
107
108

109
110
111
112
113
114
115
116
117
118
119
120
121
122

123
124
125
126
127
128

129
130
131
132
133
134
135
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id)
	    #f))))


(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; clean out old connections
  (mutex-lock! *db-multi-sync-mutex*)
  ;; (let ((expire-time (- (current-seconds) 60)))
  ;;   (for-each 
  ;;    (lambda (run-id)
  ;;      (let ((connection (hash-table-ref/default *runremote* run-id #f)))
  ;;        (if (and connection 
  ;;       	  (< (http-transport:server-dat-get-last-access connection) expire-time))
  ;;            (begin
  ;;              (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses")
  ;;              ;; SHOULD CLOSE THE CONNECTION HERE
  ;;              (hash-table-delete! *runremote* run-id)))))
  ;;    (hash-table-keys *runremote*)))
  (mutex-unlock! *db-multi-sync-mutex*)

  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd params))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success


	      (case *transport-type* 
		((http) res) ;; (db:string->obj res))
		((nmsg) res)) ;; (vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		(if (eq? (modulo attemptnum 5) 0)
		    (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))

		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	;; no connection info? try to start a server
	(if (and (< attemptnum 15)
		 (tasks:need-server run-id))
	    (begin
	      (hash-table-delete! *runremote* run-id)

	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (client:setup run-id)
	      (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
	    (begin
	      (debug:print 0 "ERROR: Communication failed!")

	      (exit)
	      ;; (rmt:open-qry-close-locally cmd run-id params))))
	      )))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions







>


|











|
>















>
>
|
|
|







>














>






>







62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id)
	    #f))))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; clean out old connections
  ;; (mutex-lock! *db-multi-sync-mutex*)
  ;; (let ((expire-time (- (current-seconds) 60)))
  ;;   (for-each 
  ;;    (lambda (run-id)
  ;;      (let ((connection (hash-table-ref/default *runremote* run-id #f)))
  ;;        (if (and connection 
  ;;       	  (< (http-transport:server-dat-get-last-access connection) expire-time))
  ;;            (begin
  ;;              (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses")
  ;;              ;; SHOULD CLOSE THE CONNECTION HERE
  ;;              (hash-table-delete! *runremote* run-id)))))
  ;;    (hash-table-keys *runremote*)))
  ;; (mutex-unlock! *db-multi-sync-mutex*)
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd params))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case *transport-type* 
		  ((http) res) ;; (db:string->obj res))
		  ((nmsg) res))) ;; (vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		(if (eq? (modulo attemptnum 5) 0)
		    (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
		;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	;; no connection info? try to start a server
	(if (and (< attemptnum 15)
		 (tasks:need-server run-id))
	    (begin
	      (hash-table-delete! *runremote* run-id)
	      ;; (mutex-unlock! *send-receive-mutex*)
	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (client:setup run-id)
	      (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
	    (begin
	      (debug:print 0 "ERROR: Communication failed!")
	      ;; (mutex-unlock! *send-receive-mutex*)
	      (exit)
	      ;; (rmt:open-qry-close-locally cmd run-id params))))
	      )))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
333
334
335
336
337
338
339

340
341
342










343
344
345













346
347
348
349
350
351
352
	(debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id)
	(print-call-chain (current-error-port))
	'())))

;; IDEA: Threadify these - they spend a lot of time waiting ...
;;
(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)

  (let ((run-id-list (if run-ids
			 run-ids
			 (rmt:get-all-run-ids))))










    (apply append (map (lambda (run-id)
			 (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in)))
		       run-id-list))))














(define (rmt:delete-test-records run-id test-id)
  (rmt:send-receive 'delete-test-records run-id (list run-id test-id)))

;; This is not needed as test steps are deleted on test delete call
;;
;; (define (rmt:delete-test-step-records run-id test-id)







>
|

|
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>







340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
	(debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id)
	(print-call-chain (current-error-port))
	'())))

;; IDEA: Threadify these - they spend a lot of time waiting ...
;;
(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
  (let ((multi-run-mutex (make-mutex))
	(run-id-list (if run-ids
			 run-ids
			 (rmt:get-all-run-ids)))
	(result      '()))
    (if (null? run-id-list)
	'()
	(for-each 
	 (lambda (th)
	   (thread-join! th)) ;; I assume that joining completed threads just moves on
	 (let loop ((hed     (car run-id-list))
		    (tal     (cdr run-id-list))
		    (threads '()))
	   (let* ((newthread (make-thread
			      (lambda ()
				(let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in))))
				  (if (list? res)
				      (begin
					(mutex-lock! multi-run-mutex)
					(set! result (append result res))
					(mutex-unlock! multi-run-mutex))
				      (debug:print 0 "ERROR: get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in))))
			      (conc "multi-run-thread for run-id " hed)))
		  (newthreads (cons newthread threads)))
	     (thread-start! newthread)
	     (thread-sleep! 0.5) ;; give that thread some time to start
	     (if (null? tal)
		 newthreads
		 (loop (car tal)(cdr tal) newthreads))))))
    result))

(define (rmt:delete-test-records run-id test-id)
  (rmt:send-receive 'delete-test-records run-id (list run-id test-id)))

;; This is not needed as test steps are deleted on test delete call
;;
;; (define (rmt:delete-test-step-records run-id test-id)

Modified tasks.scm from [42971aa928] to [1cac97501f].

359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
      (if (common:low-noise-print 60 run-id "server required is set")
	  (debug:print-info 0 "Server required is set, starting server."))
      #t)
     ((> maxqry threshold)
      (if (common:low-noise-print 60 run-id "Max query time execeeded")
	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server."))
      #t)
     (else
      #f))))







|







359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
      (if (common:low-noise-print 60 run-id "server required is set")
	  (debug:print-info 0 "Server required is set, starting server for run-id " run-id "."))
      #t)
     ((> maxqry threshold)
      (if (common:low-noise-print 60 run-id "Max query time execeeded")
	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server."))
      #t)
     (else
      #f))))

Modified tests/fullrun/megatest.config from [26d5518426] to [ebc04ead74].

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
waivercommentpatt ^WW\d+ [a-z].*
incomplete-timeout 1

# yes, anything else is no
run-wait yes


# Use http instead of direct filesystem access
# transport http
# transport fs

# If set to "default" the old code is used. Otherwise defaults to 200 or uses
# numeric value given.
#
runqueue 20

# Default runtimelim 1d 1h 1m 10s







<
<
<







29
30
31
32
33
34
35



36
37
38
39
40
41
42
waivercommentpatt ^WW\d+ [a-z].*
incomplete-timeout 1

# yes, anything else is no
run-wait yes






# If set to "default" the old code is used. Otherwise defaults to 200 or uses
# numeric value given.
#
runqueue 20

# Default runtimelim 1d 1h 1m 10s
122
123
124
125
126
127
128






129
130
131
132
133
134
135
blah

MAX_ALLOWED_LOAD 200
# XTERM   [system xterm]
# RUNDEAD [system exit 56]

[server]






synchronous 0

# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 







>
>
>
>
>
>







119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
blah

MAX_ALLOWED_LOAD 200
# XTERM   [system xterm]
# RUNDEAD [system exit 56]

[server]

# Use http instead of direct filesystem access
transport http
# transport fs
# transport nmsg

synchronous 0

# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access.