Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -214,10 +214,11 @@ ;; NO SYNC DB ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) + ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -7,11 +7,11 @@ ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== -(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack +(use srfi-1 data-structures posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack matchable regex posix srfi-18 extras pkts (prefix dbi dbi:)) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) @@ -42,10 +42,18 @@ (setenv key val)) (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) + +(define (get-file-descriptor-count #!key (pid (current-process-id ))) + (list + (length (glob (conc "/proc/" pid "/fd/*"))) + (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) + ) +) + ;; GLOBALS ;; CONTEXTS (defstruct cxt @@ -117,11 +125,11 @@ (define *my-client-signature* #f) (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold ;; (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) -;; (define *server-id* #f) +(define *server-id* #f) (define *server-info* #f) ;; good candidate for easily convert to non-global (define *time-to-exit* #f) (define *server-run* #t) (define *run-id* #f) (define *server-kind-run* (make-hash-table)) @@ -158,11 +166,11 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (server:get-timeout)) ;; default from server:get-timeout + (server-timeout (server:expiration-timeout)) (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts @@ -500,12 +508,13 @@ (3 "SKIP") (4 "WARN") (5 "WAIVED") (6 "CHECK") (7 "STUCK/DEAD") - (8 "FAIL") - (9 "ABORT"))) + (8 "DEAD") + (9 "FAIL") + (10 "ABORT"))) (define *common:ended-states* ;; states which indicate the test is stopped and will not proceed '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE")) (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked @@ -2079,11 +2088,17 @@ (number->string x 16)) (map string->number (string-split instr))) "/")) -(define (common:faux-lock keyname #!key (wait-time 8)) +;;====================================================================== +;; L O C K I N G M E C H A N I S M S +;;====================================================================== + +;; faux-lock is deprecated. Please use simple-lock below +;; +(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count (if (> wait-time 0) (begin (thread-sleep! 1) (if (eq? wait-time 1) ;; only one second left, steal the lock @@ -2101,11 +2116,19 @@ (begin (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) #t) #f)) - +;; simple lock. improve and converge on this one. +;; +(define (common:simple-lock keyname) + (rmt:no-sync-get-lock keyname)) + +;;====================================================================== +;; +;;====================================================================== + (define (common:in-running-test?) (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO"))) (define (common:get-color-from-status status) (cond Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1048,20 +1048,32 @@ (db:sync-tables (db:sync-all-tables-list dbstruct) last-update tmpdb refndb mtdb))) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags ;; if #t use timestamps : or 'timestamps +;; +;; NB// no-sync-db is the db handle, not a flag! +;; (define (db:sync-to-megatest.db dbstruct #!key (no-sync-db #f)) (let* ((start-time (current-seconds)) - (last-update (if no-sync-db - (db:no-sync-get/default no-sync-db "LAST_UPDATE" 0) - 0)) ;; (or (db:get-var dbstruct "LAST_UPDATE") 0)) + (last-full-update (if no-sync-db + (db:no-sync-get/default no-sync-db "LAST_FULL_UPDATE" 0) + 0)) + (full-sync-needed (> (- start-time last-full-update) 3600)) ;; every hour do a full sync + (last-update (if full-sync-needed + 0 + (if no-sync-db + (db:no-sync-get/default no-sync-db "LAST_UPDATE" 0) + 0))) ;; (or (db:get-var dbstruct "LAST_UPDATE") 0)) (sync-needed (> (- start-time last-update) 6)) - (res (if sync-needed ;; don't sync if a sync already occurred in the past 6 seconds + (res (if (or sync-needed ;; don't sync if a sync already occurred in the past 6 seconds + full-sync-needed) (begin (if no-sync-db - (db:no-sync-set no-sync-db "LAST_UPDATE" start-time)) + (begin + (if full-sync-needed (db:no-sync-set no-sync-db "LAST_FULL_UPDATE" start-time)) + (db:no-sync-set no-sync-db "LAST_UPDATE" start-time))) (db:tmp->megatest.db-sync dbstruct last-update)) 0)) (sync-time (- (current-seconds) start-time))) (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) (if (common:low-noise-print 30 "sync new to old") @@ -1849,26 +1861,33 @@ ;;====================================================================== (define (db:open-no-sync-db) (let* ((dbpath (db:dbfile-path)) (dbname (conc dbpath "/no-sync.db")) + (db-exists (common:file-exists? dbname)) (db (sqlite3:open-database dbname))) (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) - (sqlite3:execute db "PRAGMA synchronous = 0;") - (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") + (if (not db-exists) + (begin + (sqlite3:execute db "PRAGMA synchronous = 0;") + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") + (sqlite3:execute db "PRAGMA journal_mode=WAL;"))) db)) ;; if we are not a server create a db handle. this is not finalized ;; so watch for problems. I'm still not clear if it is needed to manually ;; finalize sqlite3 dbs with the sqlite3 egg. ;; (define (db:no-sync-db db-in) - (if db-in - db-in - (let ((db (db:open-no-sync-db))) - (set! *no-sync-db* db) - db))) + (mutex-lock! *db-access-mutex*) + (let ((res (if db-in + db-in + (let ((db (db:open-no-sync-db))) + (set! *no-sync-db* db) + db)))) + (mutex-unlock! *db-access-mutex*) + res)) (define (db:no-sync-set db var val) (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) (define (db:no-sync-del! db var) @@ -1891,10 +1910,30 @@ res)) res))) (define (db:no-sync-close-db db) (db:safely-close-sqlite3-db db)) + +;; transaction protected lock aquisition +;; either: +;; fails returns (#f . lock-creation-time) +;; succeeds (returns (#t . lock-creation-time) +;; use (db:no-sync-del! db keyname) to release the lock +;; +(define (db:no-sync-get-lock db-in keyname) + (let ((db (db:no-sync-db db-in))) + (sqlite3:with-transaction + db + (lambda () + (handle-exceptions + exn + (let ((lock-time (current-seconds))) + (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time) + `(#t . ,lock-time)) + `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname))))))) + + ;; use a global for some primitive caching, it is just silly to ;; re-read the db over and over again for the keys since they never ;; change Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -56,12 +56,15 @@ ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (server:get-best-guess-address hostname) #f))) (if ipstr ipstr hostn))) ;; hostname))) (start-port (portlogger:open-run-close portlogger:find-port)) - (link-tree-path (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) + (link-tree-path (common:get-linktree)) + (tmp-area (common:get-db-tmp-area)) + (start-file (conc tmp-area "/.server-start"))) (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port) + ;; set some parameters for the server (root-path (if link-tree-path link-tree-path (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP! (handle-directory spiffy-directory-listing) (handle-exception (lambda (exn chain) @@ -103,10 +106,11 @@ ((equal? (uri-path (request-uri (current-request))) '(/ "hey")) (send-response body: "hey there!\n" headers: '((content-type text/plain)))) (else (continue)))))))) + (with-output-to-file start-file (lambda ()(print (current-process-id)))) (http-transport:try-start-server ipaddrstr start-port))) ;; This is recursively run by http-transport:run until sucessful ;; (define (http-transport:try-start-server ipaddrstr portnum) @@ -238,10 +242,11 @@ (msg ((condition-property-accessor 'exn 'message) exn))) (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg) (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) + (debug:print 0 *default-log-port* " call-chain: " call-chain) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? (mutex-unlock! *http-mutex*) @@ -249,11 +254,11 @@ ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" (db:obj->string #f)) (with-input-from-request ;; was dat fullurl - (list (cons 'key "thekey") + (list (cons 'key (or *server-id* "thekey")) (cons 'cmd cmd) (cons 'params sparams)) read-string)) transport: 'http) 0)) ;; added this speculatively @@ -300,10 +305,11 @@ exn (begin (print-call-chain *default-log-port*) (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) (close-connection! api-dat) + (close-idle-connections!) #t)) #f))) (define (make-http-transport:server-dat)(make-vector 6)) @@ -347,11 +353,13 @@ (define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") - (let* ((server-start-time (current-seconds)) + (let* ((tmp-area (common:get-db-tmp-area)) + (started-file (conc tmp-area "/.server-started")) + (server-start-time (current-seconds)) (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (let ((sdat #f)) (thread-sleep! 0.01) @@ -376,13 +384,16 @@ (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) - (server-timeout (server:get-timeout)) + (server-timeout (server:expiration-timeout)) (server-going #f) (server-log-file (args:get-arg "-log"))) ;; always set when we are a server + + (with-output-to-file started-file (lambda ()(print (current-process-id)))) + (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db @@ -430,21 +441,15 @@ (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) - (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) - (adjusted-timeout (if (> hrs-since-start 1) ;; never used! - (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour - server-timeout))) - (if (common:low-noise-print 120 "server timeout") - (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) + (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (+ last-access server-timeout) - (current-seconds)) - (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. + (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn @@ -491,10 +496,23 @@ ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch) + ;; check that a server start is in progress, pause or exit if so + (let* ((tmp-area (common:get-db-tmp-area)) + (server-start (conc tmp-area "/.server-start")) + (server-started (conc tmp-area "/.server-started")) + (start-time (common:lazy-modification-time server-start)) + (started-time (common:lazy-modification-time server-started)) + (server-starting (< start-time started-time)) ;; if start-time is less than started-time then a server is still starting + (start-time-old (> (- (current-seconds) start-time) 5))) + (if (and (not start-time-old) ;; last server start try was less than five seconds ago + (not server-starting)) + (begin + (debug:print-info 0 *default-log-port* "NOT starting server, there is either a recently started server or a server in process of starting") + (exit)))) ;; lets not even bother to start if there are already three or more server files ready to go (let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) (if (> num-alive 3) (begin (debug:print 0 *default-log-port* "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up") @@ -515,27 +533,27 @@ (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit))) -(define (http-transport:server-signal-handler signum) - (signal-mask! signum) - (handle-exceptions - exn - (debug:print 0 *default-log-port* " ... exiting ...") - (let ((th1 (make-thread (lambda () - (thread-sleep! 1)) - "eat response")) - (th2 (make-thread (lambda () - (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") - (thread-sleep! 3) ;; give the flush three seconds to do it's stuff - (debug:print 0 *default-log-port* " Done.") - (exit 4)) - "exit on ^C timer"))) - (thread-start! th2) - (thread-start! th1) - (thread-join! th2)))) +;; (define (http-transport:server-signal-handler signum) +;; (signal-mask! signum) +;; (handle-exceptions +;; exn +;; (debug:print 0 *default-log-port* " ... exiting ...") +;; (let ((th1 (make-thread (lambda () +;; (thread-sleep! 1)) +;; "eat response")) +;; (th2 (make-thread (lambda () +;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") +;; (thread-sleep! 3) ;; give the flush three seconds to do it's stuff +;; (debug:print 0 *default-log-port* " Done.") +;; (exit 4)) +;; "exit on ^C timer"))) +;; (thread-start! th2) +;; (thread-start! th1) +;; (thread-join! th2)))) ;;====================================================================== ;; web pages ;;====================================================================== Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -464,58 +464,58 @@ (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) (setenv "MT_TEST_RUN_DIR" work-area) - ;; On NFS it can be slow and unreliable to get needed startup information. - ;; i. Check if we are on the homehost, if so, proceed - ;; ii. Check if host and port passed in via CMDINFO are valid and if - ;; possible use them. - (let ((bestadrs (server:get-best-guess-address (get-host-name))) - (needcare #f)) - (if (equal? homehost bestadrs) ;; we are likely on the homehost - (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) - (let ((host-port (if serverurl (string-split serverurl ":") #f))) - (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* - (if (string? homehost) - (if (and host-port - (> (length host-port) 1)) - (let* ((host (car host-port)) - (port (cadr host-port)) - (start-res (http-transport:client-connect host port)) - (ping-res (rmt:login-no-auto-client-setup start-res))) - (if (and start-res - ping-res) - ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) - (begin - (remote-conndat-set! *runremote* start-res) - ;; (remote-server-url-set! *runremote* url) - ;; (if (server:ping url) - (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) - (begin - (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) - (set! *runremote* #f)) - ;; (remote-conndat-set! *runremote* #f)) - )) - (begin - (set! *runremote* #f) - (debug:print-info 0 *default-log-port* (if host-port - (conc "received invalid host-port information " host-port) - "no host-port information received")) - ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. - (set! needcare #t))) - (begin - (set! *runremote* #f) - (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") - (set! needcare #t))))) - (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host - (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory - (handle-exceptions - exn - (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) - (create-directory logdir #t))))) - + ;; ;; On NFS it can be slow and unreliable to get needed startup information. + ;; ;; i. Check if we are on the homehost, if so, proceed + ;; ;; ii. Check if host and port passed in via CMDINFO are valid and if + ;; ;; possible use them. + ;; (let ((bestadrs (server:get-best-guess-address (get-host-name))) + ;; (needcare #f)) + ;; (if (equal? homehost bestadrs) ;; we are likely on the homehost + ;; (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) + ;; (let ((host-port (if serverurl (string-split serverurl ":") #f))) + ;; (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* + ;; (if (string? homehost) + ;; (if (and host-port + ;; (> (length host-port) 1)) + ;; (let* ((host (car host-port)) + ;; (port (cadr host-port)) + ;; (start-res (http-transport:client-connect host port)) + ;; (ping-res (rmt:login-no-auto-client-setup start-res))) + ;; (if (and start-res + ;; ping-res) + ;; ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) + ;; (begin + ;; (remote-conndat-set! *runremote* start-res) + ;; ;; (remote-server-url-set! *runremote* url) + ;; ;; (if (server:ping url) + ;; (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) + ;; (set! *runremote* #f)) + ;; ;; (remote-conndat-set! *runremote* #f)) + ;; )) + ;; (begin + ;; (set! *runremote* #f) + ;; (debug:print-info 0 *default-log-port* (if host-port + ;; (conc "received invalid host-port information " host-port) + ;; "no host-port information received")) + ;; ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. + ;; (set! needcare #t))) + ;; (begin + ;; (set! *runremote* #f) + ;; (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") + ;; (set! needcare #t))))) + ;; (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host + ;; (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory + ;; (handle-exceptions + ;; exn + ;; (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (create-directory logdir #t))))) + ;; ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (common:file-exists? top-path) (> count 10)) (change-directory top-path) @@ -1235,11 +1235,24 @@ ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex - (let* ((item-path (item-list->path itemdat)) + (let* ( ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + (item-path (item-list->path itemdat)) (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) (if (> launch-delay delta) (begin @@ -1411,10 +1424,11 @@ (car fullcmd)) (if useshell '() (cdr fullcmd))))) (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. + ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) Index: megatest.config ================================================================== --- megatest.config +++ megatest.config @@ -50,5 +50,7 @@ [listener] script nbfake echo +[server] +timeout 1 Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -51,13 +51,37 @@ (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") +(define *usage-log-file* #f) ;; put path to file for logging usage in this var in the ~/.megatestrc file +(define *usage-use-seconds* #f) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file + +;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file +;; (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (common:file-exists? debugcontrolf) (load debugcontrolf))) + +;; usage logging, careful with this, it is not designed to deal with all real world challenges! +;; +(if (and (common:file-exists? *usage-log-file*) + (file-write-access? *usage-log-file*)) + (with-output-to-file + *usage-log-file* + (lambda () + (print + (if *usage-use-seconds* + (current-seconds) + (time->string + (seconds->local-time (current-seconds)) + "%Yww%V.%w %H:%M:%S")) + " " + (current-user-name) " " + (current-directory) " " + "\"" (string-intersperse (argv) " ") "\"")) + #:append)) ;; Disabled help items ;; -rollup : (currently disabled) fill run (set by :runname) with latest test(s) ;; from prior runs with same keys ;; -daemonize : fork into background and disconnect from stdin/out Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -126,24 +126,25 @@ ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) ;; - ;; ;;DOT CASE4 [label="reset\nconnection"]; - ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;; ;;DOT CASE4 -> "rmt:send-receive"; - ;; ;; reset the connection if it has been unused too long - ;; ((and runremote - ;; (remote-conndat runremote) - ;; (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #.\n message: Server closed connection before sending response" - ;; (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) - ;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - ;; (http-transport:close-connections area-dat: runremote) - ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - ;; (mutex-unlock! *rmt-mutex*) - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) - + ;;DOT CASE4 [label="reset\nconnection"]; + ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} + ;;DOT CASE4 -> "rmt:send-receive"; + ;; reset the connection if it has been unused too long + ((and runremote + (remote-conndat runremote) + (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on + (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) + (remote-server-timeout runremote)))) + (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") + (http-transport:close-connections area-dat: runremote) + (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. + (mutex-unlock! *rmt-mutex*) + (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a read ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required @@ -227,13 +228,13 @@ ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; ;; not on homehost, do server query (else - (mutex-unlock! *rmt-mutex*) + ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") - (mutex-lock! *rmt-mutex*) + ;; (mutex-lock! *rmt-mutex*) (let* ((conninfo (remote-conndat runremote)) (dat (case (remote-transport runremote) ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away (http-transport:client-api-send-receive 0 conninfo cmd params) ((commfail)(vector #f "communications fail")) @@ -241,22 +242,26 @@ (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) - (if (and (vector? conninfo) (> 5 (vector-length conninfo))) + (if (and (vector? conninfo) (< 5 (vector-length conninfo))) (http-transport:server-dat-update-last-access conninfo) ;; refresh access time - (begin + (begin + (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) (set! conninfo #f) - (remote-conndat-set! runremote #f))) + (remote-conndat-set! *runremote* #f) + (http-transport:close-connections area-dat: runremote))) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) (eq? (vector-length res) 2) (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. + ;; this is the case where the returned data is bad or the server is overloaded and we want + ;; to ease off the queries (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) (http-transport:close-connections area-dat: runremote) (set! *runremote* #f) ;; force starting over @@ -264,14 +269,16 @@ (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (remote-conndat-set! runremote #f) + (mutex-lock! *rmt-mutex*) + (remote-conndat-set! runremote #f) (http-transport:close-connections area-dat: runremote) (remote-server-url-set! runremote #f) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") ;; (if (not (server:check-if-running *toppath*)) ;; (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;;DOT } @@ -861,10 +868,13 @@ (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) (define (rmt:no-sync-del! var) (rmt:send-receive 'no-sync-del! #f `(,var))) +(define (rmt:no-sync-get-lock keyname) + (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) + ;;====================================================================== ;; A R C H I V E S ;;====================================================================== (define (rmt:archive-get-allocations testname itempath dneeded) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -325,13 +325,13 @@ (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand)) - ;; force the starting of a server - (debug:print 0 *default-log-port* "waiting on server...") - (server:start-and-wait *toppath*) + ;; force the starting of a server -- removed BB 17ww28 - no longer needed. + ;;(debug:print 0 *default-log-port* "waiting on server...") + ;;(server:start-and-wait *toppath*) (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (set! runconf (if (common:file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin @@ -595,10 +595,11 @@ '() reg))) (define runs:nothing-left-in-queue-count 0) +;; BB: for future reference - suspect target vars are not expanded to env vars at this point (item expansion using [items]\nwhatever [system echo $TARGETVAR] doesnt work right whereas [system echo #{targetvar}] does.. Tal and Randy have tix on this. on first pass, var not set, on second pass, ok. (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) (if (list? res) res @@ -1270,17 +1271,18 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) + ;; -- removed BB 17ww28 - no longer needed. ;; every 15 minutes verify the server is there for this run - (if (and (common:low-noise-print 240 "try start server" run-id) - (not (or (and *runremote* - (remote-server-url *runremote*) - (server:ping (remote-server-url *runremote*))) - (server:check-if-running *toppath*)))) - (server:kind-run *toppath*)) + ;; (if (and (common:low-noise-print 240 "try start server" run-id) + ;; (not (or (and *runremote* + ;; (remote-server-url *runremote*) + ;; (server:ping (remote-server-url *runremote*))) + ;; (server:check-if-running *toppath*)))) + ;; (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) @@ -1408,11 +1410,11 @@ (loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS - ((or (procedure? items)(eq? items 'have-procedure)) + ((or (procedure? items)(eq? items 'have-procedure)) ;; BB - target vars are env vars here? to allow expansion of [items]\nsomething [system echo $SOMETARGVAR], which is wonky (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list @@ -1960,11 +1962,26 @@ (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (common:file-exists? run-dir) ;; (resolve-pathname run-dir) (common:nice-path run-dir) #f)) - (clean-mode (or mode 'remove-all))) + (clean-mode (or mode 'remove-all)) + (test-id (db:test-get-id test)) + ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 30))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to clean test with id " test-id) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))) + ) (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) @@ -2000,11 +2017,13 @@ )) ;; Only delete the records *after* removing the directory. If things fail we have a record (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) - (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))))) + (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))) + ;; (rmt:no-sync-del! lock-key) + )) ;;====================================================================== ;; Routines for manipulating runs ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -443,20 +443,18 @@ (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) -;; timeout is in hours -(define (server:get-timeout) - (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; +(define (server:expiration-timeout) + (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) - (string->number tmo)) - (* 60 60 (string->number tmo)) - ;; (* 3 24 60 60) ;; default to three days - ;;(* 60 60 1) ;; default to one hour - (* 60 5) ;; default to five minutes - ))) + (common:hms-string->seconds tmo)) + (* 3600 (string->number tmo)) + 60))) (define (server:get-best-guess-address hostname) (let ((res #f)) (for-each (lambda (adr) @@ -482,35 +480,59 @@ (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls (debug:print-info 2 *default-log-port* "Periodic sync thread started.") (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) (let* (;;(dbstruct (db:setup)) - (mtdb (dbr:dbstruct-mtdb dbstruct)) - (mtpath (db:dbdat-get-path mtdb))) + (mtdb (dbr:dbstruct-mtdb dbstruct)) + (mtpath (db:dbdat-get-path mtdb)) + (tmp-area (common:get-db-tmp-area)) + (start-file (conc tmp-area "/.start-sync")) + (end-file (conc tmp-area "/.end-sync"))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) (should-sync (and (not *time-to-exit*) - (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum + (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum, deprecated logic, can probably be removed (start-time (current-seconds)) + (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) (mt-mod-time (file-modification-time mtpath)) - (recently-synced (< (- start-time mt-mod-time) 4)) - (will-sync (and (or need-sync should-sync) + (last-sync-start (if (common:file-exists? start-file) + (file-modification-time start-file) + 0)) + (last-sync-end (if (common:file-exists? end-file) + (file-modification-time end-file) + 10)) + (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period + (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! + (< mt-mod-time last-sync-start))) + (sync-done (<= last-sync-start last-sync-end)) + (will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting + (or need-sync should-sync) + sync-done (not sync-in-progress) (not recently-synced)))) - (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress="sync-in-progress" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync) + (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress + " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync + " sync-done=" sync-done " sync-period=" sync-period) + (if (and (> sync-period 5) + (common:low-noise-print 30 "sync-period")) + (debug:print-info 0 *default-log-port* "Increased sync period due to load: " sync-period)) ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync (let ((sync-start (current-milliseconds))) - (if (< sync-duration 300) + (with-output-to-file start-file (lambda ()(print (current-process-id)))) + + ;; put lock here + + (if (< sync-duration 3000) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive (set! sync-duration (- (current-milliseconds) sync-start)) (if (> res 0) ;; some records were transferred, keep the db alive (begin (mutex-lock! *heartbeat-mutex*) @@ -518,11 +540,11 @@ (mutex-unlock! *heartbeat-mutex*) (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))) ;; TODO: factor this next routine out into a function (with-input-from-pipe ;; this should not block other threads but need to verify this - "megatest -sync-to-megatest.db" + (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) (lambda () (let loop ((inl (read-line)) (res #f)) (if (eof-object? inl) (begin @@ -543,10 +565,14 @@ (if will-sync (begin (mutex-lock! *db-multi-sync-mutex*) (set! *db-sync-in-progress* #f) (set! *db-last-sync* start-time) + (with-output-to-file end-file (lambda ()(print (current-process-id)))) + + ;; release lock here + (mutex-unlock! *db-multi-sync-mutex*))) (if (and debug-mode (> (- start-time last-time) 60)) (begin (set! last-time start-time) Index: tcmt.scm ================================================================== --- tcmt.scm +++ tcmt.scm @@ -82,11 +82,11 @@ (not (equal? prevstat newstat))) (begin (case (string->symbol newstat) ((UNK) ) ;; do nothing ((RUNNING) (print "##teamcity[testStarted name='" tctname "']")) - ((PASS SKIP) (print "##teamcity[testFinished name='" tctname "' duration='" (* 1e3 duration) "'" cmtstr details " ]")) + ((PASS SKIP WARN WAIVED) (print "##teamcity[testFinished name='" tctname "' duration='" (* 1e3 duration) "'" cmtstr details " ]")) (else (print "##teamcity[testFailed name='" tctname "' " cmtstr details " ]"))) (flush-output) (hash-table-set! data testn newstat))))) tests))) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -254,10 +254,12 @@ # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log # launcher #{ shell if which bsub > /dev/null;then echo bsub -q priority -o openlava.log;else echo sleeprunner;fi} # launcher nbfake +maxload 1.1 +maxhomehostload 1.1 [configf:settings trim-trailing-spaces yes] # Override the rollup for specific tests [testrollup] Index: utils/installall.sh ================================================================== --- utils/installall.sh +++ utils/installall.sh @@ -17,15 +17,16 @@ if [[ $OPTION=="" ]]; then export OPTION=std fi echo You may need to do the following first: -echo sudo apt-get install libreadline-dev -echo sudo apt-get install libwebkitgtk-dev -echo sudo apt-get install libpangox-1.0-0 zlib1g-dev libfreetype6-dev cmake -echo sudo apt-get install libssl-dev uuid-dev -echo sudo apt-get install libmotif3 -OR- set KTYPE=26g4 +echo sudo apt install libreadline-dev +echo sudo apt install libwebkitgtk-dev +echo sudo apt install libpangox-1.0-0 zlib1g-dev libfreetype6-dev cmake +echo sudo apt install libssl-dev uuid-dev +echo sudo apt install libmotif3 -OR- set KTYPE=26g4 +echo sudo apt install cmake echo echo Set OPTION to std, currently OPTION=$OPTION echo echo Additionally, if you want mysql-client, you will need to make sure echo mysql_config is in your path @@ -35,10 +36,11 @@ echo You are using proxy="$proxy" echo echo "Set additional_libpath to help find gtk or other libraries, don't forget a leading :" SYSTEM_TYPE=$(lsb_release -irs |tr ' ' '_' |tr '\n' '-')$(uname -i)-$OPTION + CHICKEN_VERSION=4.11.0 CHICKEN_BASEVER=4.11.0 # Set up variables # @@ -54,10 +56,12 @@ Ubuntu-16.04-i686-std) KTYPE=32 CDVER=5.10 IUPVER=3.17 IMVER=3.11 + CHICKEN_VERSION=4.12.0 + CHICKEN_BASEVER=4.12.0 ;; SUSE_LINUX_11-x86_64-std) KTYPE=26g4 CDVER=5.10 IUPVER=3.17 @@ -69,14 +73,18 @@ IUPVER=3.5 IMVER=3.6.3 ;; esac +echo SYSTEM_TYPE=$SYSTEM_TYPE echo KTYPE=$KTYPE echo CDVER=$CDVER echo IUPVER=$IUPVER echo IMVER=$IMVER +echo CHICKEN_VERSION=$CHICKEN_VERSION +echo CHICKEN_BASEVER=$CHICKEN_BASEVER + # NOTES: # # Centos with security setup may need to do commands such as following as root: # # NB// fix the paths first @@ -101,10 +109,11 @@ if [[ $proxy == "" ]]; then echo 'Please set the environment variable "proxy" to host.com:port (e.g. foo.com:1234) to use a proxy' echo PROX="" else export http_proxy=http://$proxy + export https_proxy=http://$proxy export PROX="-proxy $proxy" fi if [[ $KTYPE == "" ]]; then echo 'Using KTYPE=26' @@ -153,22 +162,24 @@ make PLATFORM=linux PREFIX=$PREFIX make PLATFORM=linux PREFIX=$PREFIX install cd $BUILDHOME fi cd $BUILDHOME -#wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz -#mv 1.0.0 1.0.0.tar.gz -# if ! [[ -e $PREFIX/lib64/libnanomsg.so.1.0.0 ]]; then -# wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz -# mv 1.0.0 1.0.0.tar.gz -# tar xf 1.0.0.tar.gz -# cd nanomsg-1.0.0 -# ./configure --prefix=$PREFIX -# make -# make install -# fi -# cd $BUILDHOME +if [[ ! -e 1.0.0.tar.gz ]];then + wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz + mv 1.0.0 1.0.0.tar.gz +fi +if ! [[ -e $PREFIX/lib64/libnanomsg.so.1.0.0 ]]; then + wget --no-check-certificate https://github.com/nanomsg/nanomsg/archive/1.0.0.tar.gz + mv 1.0.0 1.0.0.tar.gz + tar xf 1.0.0.tar.gz + cd nanomsg-1.0.0 + ./configure --prefix=$PREFIX + make + make install +fi +cd $BUILDHOME export SQLITE3_VERSION=3090200 if ! [[ -e $PREFIX/bin/sqlite3 ]]; then echo Install sqlite3 sqlite3_tgz=sqlite-autoconf-$SQLITE3_VERSION.tar.gz @@ -356,12 +367,16 @@ # CSC_OPTIONS="-I$PREFIX/include -L$CSCLIBS" $CHICKEN_INSTALL $PROX -D no-library-checks -deploy -prefix $DEPLOYTARG canvas-draw cd $BUILDHOME # install ducttape -cd ../ducttape -$CHICKEN_INSTALL +if [[ -e ../ducttape ]];then + cd ../ducttape + $CHICKEN_INSTALL +else + echo "ducttape egg not found at ../ducttape. You will need to cd into the ducttape directory in the megatest distribution and run \"chicken-install\"" +fi cd $BUILDHOME echo You may need to add $LD_LIBRARY_PATH to your LD_LIBRARY_PATH variable, a setup-chicken4x.sh echo file can be found in the current directory which should work for setting up to run chicken4x ADDED utils/watch-close-wait.sh Index: utils/watch-close-wait.sh ================================================================== --- /dev/null +++ utils/watch-close-wait.sh @@ -0,0 +1,8 @@ +psline=$(ps -F -u $USER | grep "mtest" |grep " -run " | egrep " -(target|reqtarg) "| head -1) +id=$(echo $psline|awk '{print $2}') +echo "Watching process for command line: $psline" +echo " with PID=$id" +while true;do + echo "CLOSE_WAIT: $(lsof -n | grep CLOSE_WAIT | grep $id | wc -l) ALL OPEN: $(lsof -n |grep $id|wc -l) ALL CLOSE_WAIT: $(netstat -ap 2> /dev/null| grep -i close_wait| wc -l)" + sleep 1 +done