Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -27,11 +27,11 @@ process.scm runs.scm tasks.scm tests.scm genexample.scm \ http-transport.scm filedb.scm tdb.scm \ client.scm mt.scm \ ezsteps.scm lock-queue.scm sdb.scm \ rmt.scm api.scm subrun.scm \ - portlogger.scm archive.scm env.scm diff-report.scm cgisetup/models/pgdb.scm + portlogger.scm archive.scm env.scm diff-report.scm cgisetup/models/pgdb.scm # module source files MSRCFILES = ftail.scm # Eggs to install (straightforward ones) @@ -69,14 +69,17 @@ # ARCHSTR=$(shell bash -c "echo \$$MACHTYPE") PNGFILES = $(shell cd docs/manual;ls *png) #all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut ndboard -all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut +all : $(PREFIX)/bin/.$(ARCHSTR) tquery mtest dboard mtut mtest: $(OFILES) readline-fix.scm megatest.o $(MOFILES) mofiles/ftail.o csc $(CSCOPTS) $(OFILES) $(MOFILES) megatest.o -o mtest + +tquery: $(OFILES) readline-fix.scm tquery.o $(MOFILES) mofiles/ftail.o + csc $(CSCOPTS) $(OFILES) $(MOFILES) tquery.o -o tquery showmtesthash: @echo $(MTESTHASH) dboard : $(OFILES) $(GOFILES) dashboard.scm $(MOFILES) @@ -146,17 +149,17 @@ # $(PREFIX)/bin/revtagfsl : utils/revtagfsl.scm # csc utils/revtagfsl.scm -o $(PREFIX)/bin/revtagfsl # Special dependencies for the includes tests.o db.o launch.o runs.o dashboard-tests.o dashboard-context-menu.o dashboard-guimonitor.o dashboard-main.o monitor.o dashboard.o \ -archive.o megatest.o : db_records.scm +archive.o megatest.o tquery.o: db_records.scm tests.o runs.o dashboard.o dashboard-tests.o dashboard-context-menu.o dashboard-main.o : run_records.scm -db.o ezsteps.o keys.o launch.o megatest.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm +db.o ezsteps.o keys.o launch.o megatest.o tquery.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm tests.o tasks.o dashboard-tasks.o : task_records.scm runs.o : test_records.scm -megatest.o : megatest-fossil-hash.scm -rmt.scm client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm fs-transport.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm : common_records.scm +megatest.o tquery.o: megatest-fossil-hash.scm +rmt.scm client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm fs-transport.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm tquery.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm : common_records.scm common_records.scm : altdb.scm vg.o dashboard.o : vg_records.scm dcommon.o : run_records.scm # Temporary while transitioning to new routine # runs.o : run-tests-queue-classic.scm run-tests-queue-new.scm @@ -171,12 +174,15 @@ csc $(CSCOPTS) -c $< $(MOFILES) $(PREFIX)/bin/.$(ARCHSTR)/mtest : mtest utils/mk_wrapper @echo Installing to PREFIX=$(PREFIX) $(INSTALL) mtest $(PREFIX)/bin/.$(ARCHSTR)/mtest + $(INSTALL) tquery $(PREFIX)/bin/.$(ARCHSTR)/tquery utils/mk_wrapper $(PREFIX) mtest $(PREFIX)/bin/megatest + utils/mk_wrapper $(PREFIX) tquery $(PREFIX)/bin/tquery chmod a+x $(PREFIX)/bin/megatest + chmod a+x $(PREFIX)/bin/tquery $(PREFIX)/bin/.$(ARCHSTR)/ndboard : ndboard $(INSTALL) ndboard $(PREFIX)/bin/.$(ARCHSTR)/ndboard $(PREFIX)/bin/newdashboard : $(PREFIX)/bin/.$(ARCHSTR)/ndboard utils/mk_wrapper @@ -276,17 +282,17 @@ $(PREFIX)/bin/.$(ARCHSTR)/dboard : dboard $(FILES) utils/mk_wrapper utils/mk_wrapper $(PREFIX) dboard $(PREFIX)/bin/dashboard chmod a+x $(PREFIX)/bin/dashboard $(INSTALL) dboard $(PREFIX)/bin/.$(ARCHSTR)/dboard -install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \ +install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest $(PREFIX)/bin/.$(ARCHSTR)/tquery $(PREFIX)/bin/tquery \ $(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \ $(PREFIX)/bin/serialize-env \ $(PREFIX)/bin/nbfind $(PREFIX)/bin/loadrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \ - $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql \ + $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql $(PREFIX)/bin/.$(ARCHSTR)/tquery \ $(PREFIX)/share/js/jquery-3.1.0.slim.min.js # $(PREFIX)/bin/.$(ARCHSTR)/ndboard # $(PREFIX)/bin/newdashboard @@ -303,11 +309,11 @@ $(MTQA_FOSSIL) : fossil clone https://www.kiatoa.com/fossils/megatest_qa $(MTQA_FOSSIL) clean : - rm -f $(OFILES) $(GOFILES) $(MOFILES) $(TCMTOBJS) $(PREFIX)/megatest $(PREFIX)/dashboard mtest mtutil dboard dboard.o megatest.o dashboard.o megatest-fossil-hash.* altdb.scm mofiles/*.o vg.o + rm -f $(OFILES) $(GOFILES) $(MOFILES) $(TCMTOBJS) $(PREFIX)/megatest $(PREFIX)/dashboard $(PREFIX)/tquery mtest tquery mtutil dboard dboard.o megatest.o dashboard.o megatest-fossil-hash.* altdb.scm mofiles/*.o vg.o #====================================================================== # Make the records files #====================================================================== Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -18,11 +18,11 @@ ;;====================================================================== (use srfi-1 data-structures posix regex-case (prefix base64 base64:) format dot-locking csv-xml z3 ;; sql-de-lite - hostinfo md5 message-digest typed-records directory-utils stack + s11n hostinfo md5 message-digest typed-records directory-utils stack matchable regex posix (srfi 18) extras ;; tcp (prefix nanomsg nmsg:) (prefix sqlite3 sqlite3:) pkts (prefix dbi dbi:) ) @@ -1462,10 +1462,13 @@ (lambda () (read-line))))) (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) + +(define (get-cpu-load-original #!key (remote-host #f)) + (car (common:get-cpu-load-original remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) ;; (for-each (lambda (l) ;; (let ((match (string-search load-rx l))) @@ -1502,13 +1505,24 @@ (handle-exceptions exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) +(define (common:get-cpu-load remote-host) + (handle-exceptions + exn + (lambda() + (list 50 50 50) + ) + (let ((al (common:get-normalized-cpu-load remote-host))) + (list (alist-ref '1m-load al) (alist-ref '5m-load al) (alist-ref '15m-load al))) + ;;(common:get-cpu-load-original remote-host) + ) +) ;; get cpu load by reading from /proc/loadavg, return all three values ;; -(define (common:get-cpu-load remote-host) +(define (common:get-cpu-load-original remote-host) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) (or (common:get-cached-info actual-hostname "cpu-load") (let ((result (if remote-host (map (lambda (res) (if (eof-object? res) 9e99 res)) @@ -1523,11 +1537,51 @@ ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; (define (common:get-normalized-cpu-load remote-host) - (let ((res (common:get-normalized-cpu-load-raw remote-host)) + (if (file-exists? (pathname-expand "~/.megatest/tquery")) + (begin + (with-input-from-file (pathname-expand "~/.megatest/tquery") + (lambda() + (set! tqfilecontents (read-string)) + )) + (handle-exceptions exn + (lambda() + (sleep 1) + (common:get-normalized-cpu-load remote-host) + ) + (set! tqfileparts (string-split (string-trim-both tqfilecontents) ":")) + ) + ;;(print "TQuery host: " (car tqfileparts)) + ;;(print "TQuery port " (cadr tqfileparts)) + ;;(print "Getting normalized cpu load for : " remote-host " via " (car tqfileparts) ":" (cadr tqfileparts)) + ) + (begin + (process-run "nbfake tquery -server -") + (sleep 2) + ) + ) + (handle-exceptions exn + (lambda() + ;;(print "Need to start tquery server here:") + (process-run "nbfake tquery -server -") + (sleep 2) + (common:get-normalized-cpu-load remote-host) + ) + ;;(print "Preparing to connect to get load") + (define-values (i o) (tcp-connect (car tqfileparts) (string->number (cadr tqfileparts)))) + ;;(define-values (i o) (tcp-connect "plxcas102" 9000)) + (write-line (conc "adj-cpuload-full:" (if remote-host remote-host (get-host-name))) o) + ;;(write-line "adj-cpuload-full:plxcm5005" o) + (with-input-from-string (read-line i) read) + ) +) + + +(define (common:get-normalized-cpu-load-original remote-host) + (let ((res (common:get-normalized-cpu-load-raw-original remote-host)) (default `((adj-proc-load . 2) ;; there is no right answer (adj-core-load . 2) (1m-load . 2) (5m-load . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong (15m-load . 0) @@ -1541,11 +1595,11 @@ res) ((eq? res #f) default) ;; add messages? ((eq? res #f) default) ;; this would be the #eof (else default)))) -(define (common:get-normalized-cpu-load-raw remote-host) +(define (common:get-normalized-cpu-load-raw-original remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost (or (common:get-cached-info actual-host "normalized-load") (let ((data (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end") @@ -1721,11 +1775,11 @@ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 100) (msg #f)(remote-host #f)(force-maxload #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (< 1 numcpus-in) ;; not possible (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload @@ -1733,23 +1787,25 @@ (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously + (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 100 count) 10) waitdelay) (- first adjload) ) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp " ,adjwait: " adjwait " ,numcpus: " numcpus ", loadjmp: " loadjmp) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) - (thread-sleep! adjwait) + (debug:print-info 1 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (thread-sleep! 1) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) - (thread-sleep! adjwait) + (debug:print-info 1 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (thread-sleep! 1) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f @@ -1757,10 +1813,19 @@ (hh (if hh-dat (car hh-dat) #f)) (numcpus (common:get-num-cpus hh))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) + (handle-exceptions exn + (lambda() + 2 + ) + (alist-ref 'core (common:get-normalized-cpu-load remote-host)) + ) +) + +(define (common:get-num-cpus-orig remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -22,11 +22,11 @@ (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest posix-extras) (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) ;; Configurations for server -(tcp-buffer-size 2048) +;;(tcp-buffer-size 2048) (max-connections 2048) (declare (unit http-transport)) (declare (uses common))