Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -1,5 +1,7 @@ + + ;;====================================================================== ;; Copyright 2006-2013, Matthew Welland. ;; ;; This file is part of Megatest. ;; @@ -376,27 +378,36 @@ ;; db:* ;; ;; NB// Runs on the server as part of the server loop ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc - (set! *api-process-request-count* (+ *api-process-request-count* 1)) + (debug:print 4 *default-log-port* "server-id:" *server-id*) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) - (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) - (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) - (if (> *api-process-request-count* *max-api-process-requests*) - (set! *max-api-process-requests* *api-process-request-count*)) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds - ;; (rmt:dat->json-str - ;; (if (or (string? res) - ;; (list? res) - ;; (number? res) - ;; (boolean? res)) - ;; res - ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) + (key ($ 'key)) + (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) + (debug:print 0 *default-log-port* "cmd:" cmd " with params " params "key " key) + (if (equal? key *server-id*) + (begin + (set! *api-process-request-count* (+ *api-process-request-count* 1)) + (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) + (debug:print 0 *default-log-port* "res:" res) + (if (not success) + (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) + (if (> *api-process-request-count* *max-api-process-requests*) + (set! *max-api-process-requests* *api-process-request-count*)) + (set! *api-process-request-count* (- *api-process-request-count* 1)) + ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds + ;; (rmt:dat->json-str + ;; (if (or (string? res) + ;; (list? res) + ;; (number? res) + ;; (boolean? res)) + ;; res + ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) + (db:obj->string res transport: 'http))) + (begin + (debug:print 0 *default-log-port* "Server refused to process request. Sever id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) + (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -88,18 +88,25 @@ (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) - (port (caddr server-dat))) + (port (caddr server-dat)) + (server-id (caddr (cddr server-dat)))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (and (not area-dat) (not *runremote*)) - (set! *runremote* (make-remote))) - (if (and host port) + (begin + (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (if (and host port server-id) (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port)))) + ((http)(http-transport:client-connect host port server-id)))) (ping-res (case *transport-type* ((http)(rmt:login-no-auto-client-setup start-res))))) (if (and start-res ping-res) (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -296,11 +296,13 @@ (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url #f) ;; (server:check-if-running *toppath*) #f)) + (server-id #f) + (server-info (if *toppath* (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (server:expiration-timeout)) (force-server #f) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -3215,14 +3215,14 @@ test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; -(define (db:get-count-tests-running dbstruct run-id fastmode) - (let* ((qry (if fastmode - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');"))) +(define (db:get-count-tests-running dbstruct run-id) ;; fastmode) + (let* ((qry ;; (if fastmode + ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" + "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; ) (db:with-db dbstruct run-id #f (lambda (db) @@ -3245,14 +3245,14 @@ run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');") ;; NEW BEHAVIOR: Look only at single run with run-id ;; ;; (define (db:get-running-stats dbstruct run-id) -(define (db:get-count-tests-running-for-run-id dbstruct run-id fastmode) - (let* ((qry (if fastmode - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;"))) +(define (db:get-count-tests-running-for-run-id dbstruct run-id) ;; fastmode) + (let* ((qry ;; (if fastmode + ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" + "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; ) (db:with-db dbstruct run-id #f (lambda (db) @@ -4342,18 +4342,19 @@ set-verbosity killserver )) (define (db:login dbstruct calling-path calling-version client-signature) - (cond + (cond ((not (equal? calling-path *toppath*)) (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*)) ;; ((not (equal? *run-id* run-id)) ;; (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*)) ((not (equal? megatest-version calling-version)) (list #t (conc "Login warning due to mismatch megatest version: " calling-version ", " megatest-version))) - (else + + (else (hash-table-set! *logged-in-clients* client-signature (current-seconds)) '(#t "successful login")))) (define (db:general-call dbstruct stmtname params) (let ((query (let ((q (alist-ref (if (string? stmtname) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -713,11 +713,11 @@ ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -246,12 +246,18 @@ (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") (exit 1)))) (res (vector #f "uninitialized")) (success #t) (sparams (db:obj->string params transport: 'http)) - (runremote (or area-dat *runremote*))) - (debug:print-info 11 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") + (runremote (or area-dat *runremote*)) + (server-id (if (vector? serverdat) + (http-transport:server-dat-get-server-id serverdat) + (begin + (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") + (exit 1))))) + (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) + ;; set up the http-client here (max-retry-attempts 1) ;; consider all requests indempotent (retry-request? (lambda (request) #f)) @@ -273,11 +279,11 @@ (if (debug:debug-mode 1) (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") (begin (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) - (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) + (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) (debug:print 0 *default-log-port* " call-chain: " call-chain))) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? @@ -286,11 +292,11 @@ ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" (db:obj->string #f)) (with-input-from-request ;; was dat fullurl - (list (cons 'key (or *server-id* "thekey")) + (list (cons 'key (or server-id "thekey")) (cons 'cmd cmd) (cons 'params sparams)) read-string)) transport: 'http) 0)) ;; added this speculatively @@ -305,15 +311,21 @@ (th1 (make-thread send-recieve "with-input-from-request")) (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) + (vector-set! res 0 success) (thread-terminate! th2) - (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) - (if (vector-ref res 0) ;; this is the first flag or the second flag? - res ;; this is the *inner* vector? seriously? why? + (if (vector-ref res 0) ;; this is the first flag or the second flag? + (let* ((res-dat (vector-ref res 1))) + (if (and (string? res-dat) (string-contains res-dat "server-id mismatch")) + (signal (make-composite-condition + (make-property-condition + 'servermismatch + 'message (vector-ref res 1)))) + res)) ;; this is the *inner* vector? seriously? why? (if (debug:debug-mode 11) (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it (print-call-chain (current-error-port)) (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 11 *default-log-port* " server call chain:") @@ -350,11 +362,12 @@ (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) -(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +;(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +(define (http-transport:server-dat-get-server-id vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" @@ -371,15 +384,15 @@ (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) ;; ;; connect ;; -(define (http-transport:client-connect iface port) +(define (http-transport:client-connect iface port server-id) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) - (server-dat (vector iface port api-uri api-url api-req (current-seconds)))) + (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; @@ -475,21 +488,27 @@ (let ((new-iface (car sdat)) (new-port (cadr sdat))) (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") (set! iface new-iface) (set! port new-port) - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -731,11 +731,11 @@ ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) - (running-cnt (rmt:get-count-tests-running-for-run-id run-id #f)) ;; fastmode=no + (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state (rmt:get-run-state run-id)) (current-status (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.6569) +(define megatest-version 1.6581) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -852,11 +852,11 @@ )) (if (args:get-arg "-ping") (let* ((server-id (string->number (args:get-arg "-ping"))) ;; extract run-id (i.e. no ":" (host:port (args:get-arg "-ping"))) - (server:ping (or server-id host:port) do-exit: #t))) + (server:ping (or server-id host:port) #f do-exit: #t))) ;;====================================================================== ;; Capture, save and manipulate environments ;;====================================================================== Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -100,10 +100,15 @@ ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; ;; ensure we have a record for our connection for given area (if (not runremote) ;; can remove this one. should never get here. (begin (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))) (set! runremote *runremote*))) ;; new runremote will come from this on next iteration ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; ;; DOT SET_HOMEHOST -> MUTEXLOCK; @@ -174,12 +179,17 @@ ;;DOT CASE6 -> "rmt:send-receive"; ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote) ;; have a server - (not (server:ping (remote-server-url runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. + (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -202,20 +212,22 @@ ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required (cdr (remote-hh-dat runremote)) ;; have homehost (not (remote-server-url runremote)) ;; no connection yet (not (member cmd api:read-only-queries))) ;; not a read-only query (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-url - (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed + (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (if server-info + (begin + (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed + (remote-server-id-set! runremote (server:record->id server-info))) (if (common:force-server?) (server:start-and-wait *toppath*) - (server:kind-run *toppath*)))) + (server:kind-run *toppath*))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") - (rmt:open-qry-close-locally cmd 0 params)) + (rmt:open-qry-close-locally cmd 0 params))) ;;DOT CASE9 [label="force server\nnot on homehost"]; ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one @@ -263,10 +275,11 @@ ;; attemtped ;; communication to ;; servers that have gone ;; away (http-transport:client-api-send-receive 0 conninfo cmd params) + ((servermismatch) (vector #f "Server id mismatch" )) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) @@ -310,11 +323,13 @@ (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was ;; successful, have to examine the data to see if ;; there was a detected issue at the other end (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) - (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) + (begin + (debug:print-error 0 *default-log-port* " dat=" dat) + (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)) ))) (define (rmt:print-db-stats) (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" (debug:print 18 *default-log-port* "DB Stats\n========") @@ -661,21 +676,21 @@ run-ids)))) (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) -(define (rmt:get-count-tests-running-for-run-id run-id fastmode) - (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id fastmode))) +(define (rmt:get-count-tests-running-for-run-id run-id) + (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) (define (rmt:get-not-completed-cnt run-id) (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) ;; Statistical queries -(define (rmt:get-count-tests-running run-id fastmode) - (rmt:send-receive 'get-count-tests-running run-id (list run-id fastmode))) +(define (rmt:get-count-tests-running run-id) + (rmt:send-receive 'get-count-tests-running run-id (list run-id))) (define (rmt:get-count-tests-running-for-testname run-id testname) (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) (define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -321,11 +321,11 @@ (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no + (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) @@ -1565,11 +1565,11 @@ extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) - (num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes + (num-running (rmt:get-count-tests-running-for-run-id run-id)) (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns @@ -1831,31 +1831,31 @@ ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle - (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes + (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) - (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no + (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) - (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes + (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 *default-log-port* "Calling Post Hook") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -93,10 +93,11 @@ (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) + (current-process-id) (argv))))))) ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; @@ -163,18 +164,19 @@ (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds -;; +;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let + (define (server:logf-get-start-info logf) - (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs + (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)"))) ;; SERVER STARTED: host:port AT timesecs server id (handle-exceptions exn (begin - (print "failed to get server info from " logf ", exn=" exn) - (list #f #f #f)) ;; no idea what went wrong, call it a bad server + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) + (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) @@ -181,16 +183,21 @@ (if (not (eof-object? inl)) (let ((mlst (string-match rx inl))) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) - (list #f #f #f)) + (begin + (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) + (list #f #f #f #f))) (let ((dat (cdr mlst))) (list (car dat) ;; host (string->number (cadr dat)) ;; port - (string->number (caddr dat)))))) - (list #f #f #f)))))))) + (string->number (caddr dat)) + (cadr (cddr dat)))))) + (begin + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds)) + (list #f #f #f #f))))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) @@ -217,11 +224,11 @@ (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (begin - (print "failed to get modification time on " hed ", exn=" exn) + (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds)) @@ -230,11 +237,11 @@ (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res - (cons (append serv-rec (list pid)) res)))) + (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) @@ -242,17 +249,21 @@ (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) - (match-let (((mod-time host port start-time pid) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) + (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0))) - (if (< uptime 5)(set! num-alive (+ num-alive 1)))))) + (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) srvlst) num-alive)) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is @@ -305,17 +316,33 @@ (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) +(define (server:record->id servr) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) + #f) + (match-let (((mod-time host port start-time server-id pid) + servr)) + (if server-id + server-id + #f)))) (define (server:record->url servr) - (match-let (((mod-time host port start-time pid) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) + #f) + (match-let (((mod-time host port start-time server-id pid) servr)) (if (and host port) (conc host ":" port) - #f))) + #f)))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) @@ -378,15 +405,15 @@ ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-url (server:check-if-running areapath)) + (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) - (if (or server-url + (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - server-url + (server:record->url server-info) (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) @@ -403,11 +430,10 @@ ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) (servers (server:get-best (server:get-list areapath)))) - ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers @@ -414,38 +440,44 @@ #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res - res + hed (if (null? tal) #f (loop (car tal)(cdr tal))))))))) ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) + (server-id (server:record->id server-record)) (res (case *transport-type* - ((http)(server:ping server-url)) + ((http)(server:ping server-url server-id)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res server-url #f))) (define (server:kill servr) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) + #f) (match-let (((mod-time hostname port start-time pid) servr)) - (tasks:kill-server hostname pid))) + (tasks:kill-server hostname pid)))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; -(define (server:ping host-port-in #!key (do-exit #f)) +(define (server:ping host-port-in server-id #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find #f ;; (server:check-if-running *toppath*) ;; (if (number? host-port-in) ;; we were handed a server-id ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; ;; (print "srec: " srec " host-port-in: " host-port-in) @@ -467,11 +499,11 @@ (debug:print 0 *default-log-port* "ERROR: bad host:port")) (if do-exit (exit 1)) #f) (let* ((iface (car host-port)) (port (cadr host-port)) - (server-dat (http-transport:client-connect iface port)) + (server-dat (http-transport:client-connect iface port server-id)) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin ;; (print "LOGIN_OK") @@ -589,15 +621,18 @@ (res (system sync-cmd)) (dbbackupfile (conc mtdbfile ".backup")) (res2 (cond ((eq? 0 res ) + (handle-exceptions + exn + #f (if (file-exists? dbbackupfile) (delete-file* dbbackupfile) ) (if (eq? 0 (file-size sync-log)) - (delete-file sync-log)) + (delete-file* sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds @@ -608,11 +643,11 @@ (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) default-duty-cycle)))) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) - 'sync-completed) + 'sync-completed)) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) Index: sretrieve.scm ================================================================== --- sretrieve.scm +++ sretrieve.scm @@ -637,21 +637,21 @@ (conc(pathname-file target-path) "." (pathname-extension target-path)) (pathname-file target-path))) (curr-dir (current-directory)) (start-dir (conc (current-directory) "/" last-dir-name)) (execlude (make-exclude-pattern (string-split restrictions ","))) - (tmpfile (conc "/tmp/" (current-user-name) "/my-pipe-" (current-process-id)))) + (tmpfile (conc "/tmp/my-pipe-" (current-process-id)))) (if (file-exists? start-dir) (begin (sauth:print-error (conclast-dir-name " already exist in your work dir.")) (sauth:print-error "Nothing has been retrieved!! ")) (begin ; (sretrieve:do-as-calling-user ; (lambda () - - (if (not (file-exists? (conc "/tmp/" (current-user-name)))) - (create-directory (conc "/tmp/" (current-user-name)) #t)) + ; (print tmpfile) + ;(if (not (file-exists? (conc "/tmp/" (current-user-name)))) + ; (create-directory (conc "/tmp/" (current-user-name)) #t)) (change-directory parent-dir) (create-fifo tmpfile) (process-fork (lambda() (sleep 1) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -755,30 +755,31 @@ (owner (db:get-value-by-header row header "owner")) (event-time (db:get-value-by-header row header "event_time")) (comment (db:get-value-by-header row header "comment")) (fail-count (db:get-value-by-header row header "fail_count")) (pass-count (db:get-value-by-header row header "pass_count")) - (db-contour (db:get-value-by-header row header "contour")) + (db-contour (db:get-value-by-header row header "contour")) (contour (if (args:get-arg "-prepend-contour") (if (and db-contour (not (equal? db-contour "")) (string? db-contour )) (begin - (debug:print-info 1 *default-log-port* "db-contour") + (debug:print-info 10 *default-log-port* "db-contour" db-contour) db-contour) (args:get-arg "-contour")))) - (run-tag (if (args:get-arg "-run-tag") + (run-tag (if (args:get-arg "-run-tag") (args:get-arg "-run-tag") "")) - (last-update (db:get-value-by-header row header "last_update")) + (last-update (db:get-value-by-header row header "last_update")) (keytarg (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform + (base-target (rmt:get-target run-id)) (target (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) - (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id))) ;; e.g. v1.63/a3e1/ubuntu + (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) base-target) base-target)) ;; e.g. v1.63/a3e1/ubuntu (spec-id (pgdb:get-ttype dbh keytarg)) (publish-time (if (args:get-arg "-cp-eventtime-to-publishtime") event-time (current-seconds))) - (new-run-id (pgdb:get-run-id dbh spec-id target run-name area-id))) + (new-run-id (if (and run-name base-target) (pgdb:get-run-id dbh spec-id target run-name area-id) #f))) (if new-run-id (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id)) (hash-table-set! runs-ht run-id new-run-id) ;; ensure key fields are up to date ;; if last_update == pgdb_last_update do not update smallest-last-update-time @@ -788,16 +789,16 @@ (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:refresh-run-info dbh new-run-id state status owner event-time comment fail-count pass-count area-id last-update publish-time) - (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) + (debug:print-info 4 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) (if (not (equal? run-tag "")) (task:add-run-tag dbh new-run-id run-tag)) new-run-id) - (if (equal? state "deleted") + (if (or (not state) (equal? state "deleted")) (begin (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f) (if (handle-exceptions exn (begin (print-call-chain) @@ -860,17 +861,17 @@ (begin (if pgdb-test-id (begin (if pgdb-step-id (begin - (debug:print-info 1 *default-log-port* "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id ) + (debug:print-info 4 *default-log-port* "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id ) (let* ((pgdb-last-update (pgdb:get-test-step-last-update dbh pgdb-step-id))) (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:update-test-step dbh pgdb-step-id pgdb-test-id stepname state status event_time comment logfile last-update)) (begin - (debug:print-info 1 *default-log-port* "Inserting test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id) + (debug:print-info 4 *default-log-port* "Inserting test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id) (if (or (not smallest-time) (< last-update smallest-time)) (hash-table-set! smallest-last-update-time "smallest-time" last-update)) (pgdb:insert-test-step dbh pgdb-test-id stepname state status event_time comment logfile last-update ) (set! pgdb-step-id (pgdb:get-test-step-id dbh pgdb-test-id stepname state)))) (hash-table-set! step-ht step-id pgdb-step-id )) @@ -906,17 +907,17 @@ (begin (if pgdb-test-id (begin (if pgdb-data-id (begin - (debug:print-info 1 *default-log-port* "Updating existing test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id) + (debug:print-info 4 *default-log-port* "Updating existing test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id) (let* ((pgdb-last-update (pgdb:get-test-data-last-update dbh pgdb-data-id))) (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:update-test-data dbh pgdb-data-id pgdb-test-id category variable value expected tol units comment status type last-update)) (begin - (debug:print-info 1 *default-log-port* "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id) + (debug:print-info 4 *default-log-port* "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id) (if (handle-exceptions exn (begin (print-call-chain) (print ((condition-property-accessor 'exn 'message) exn)) #f) @@ -970,21 +971,23 @@ (pgdb:get-test-id dbh pgdb-run-id test-name item-path)) #f))) ;; "id" "run_id" "testname" "state" "status" "event_time" ;; "host" "cpuload" "diskfree" "uname" "rundir" "item_path" ;; "run_duration" "final_logf" "comment" "shortdir" "attemptnum" "archived" + (if (or (not item-path) (string-null? item-path)) + (debug:print-info 0 *default-log-port* "Working on Run id : " run-id "and test name : " test-name)) (if pgdb-run-id (begin (if pgdb-test-id ;; have a record (begin ;; let ((key-name (conc run-id "/" test-name "/" item-path))) - (debug:print-info 0 *default-log-port* "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id " pgdb-test-id " pgdb-test-id) + (debug:print-info 4 *default-log-port* "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id " pgdb-test-id " pgdb-test-id) (let* ((pgdb-last-update (pgdb:get-test-last-update dbh pgdb-test-id))) (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) ;;if last-update is same as pgdb-last-update then it is safe to assume the records are identical and we can use a larger last update time. (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:update-test dbh pgdb-test-id pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid)) (begin - (debug:print-info 0 *default-log-port* "Inserting test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id) + (debug:print-info 4 *default-log-port* "Inserting test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id) (pgdb:insert-test dbh pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid) (if (or (not smallest-time) (< last-update smallest-time)) (hash-table-set! smallest-last-update-time "smallest-time" last-update)) (set! pgdb-test-id (pgdb:get-test-id dbh pgdb-run-id test-name item-path)))) (hash-table-set! test-ht test-id pgdb-test-id)) @@ -1013,11 +1016,11 @@ (pgdb:insert-area-tag dbh (vector-ref tag-info 0) (vector-ref area-info 0)))))) (define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) (for-each (lambda (run-id) - (debug:print-info 1 *default-log-port* "Check if run with " run-id " needs to be synced" ) + (debug:print-info 4 *default-log-port* "Check if run with " run-id " needs to be synced" ) (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) run-ids)) ;; get runs changed since last sync @@ -1082,11 +1085,11 @@ (debug:print-info 0 *default-log-port* "syncing test steps") (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time) (debug:print-info 0 *default-log-port* "syncing test data") (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time) (print "----------done---------------"))) - (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) + (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" (current-seconds)))) (debug:print-info 0 "smallest-time :" smallest-time " last-sync-time " last-sync-time) (if (not (and target run-name)) (if (or (and smallest-time (> smallest-time last-sync-time)) (and smallest-time (eq? last-sync-time 0))) (pgdb:write-sync-time dbh area-info smallest-time))))) ;;this needs to be changed (if (tasks:set-area dbh configdat)