Overview
Context
Changes
Modified client.scm
from [dc8b2be6ad]
to [406d30b1f6].
︙ | | |
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
+
+
|
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0))
(debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
(let* ((tdbdat (tasks:open-db))
(tdb (db:dbdat-get-db tdbdat)))
(if (<= remaining-tries 0)
(begin
(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
(exit 1))
(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
(if host-info
(let* ((iface (http-transport:server-dat-get-iface host-info))
|
︙ | | |
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
-
+
-
+
-
+
-
+
|
(http-transport:close-connections run-id)
(hash-table-delete! *runremote* run-id)
(if (< remaining-tries 8)
(thread-sleep! 5)
(thread-sleep! 1))
(client:setup run-id remaining-tries: (- remaining-tries 1)))))
;; YUK: rename server-dat here
(let* ((server-dat (tasks:get-server (tasks:get-db) run-id)))
(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
(debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
(if server-dat
(let* ((iface (tasks:hostinfo-get-interface server-dat))
(port (tasks:hostinfo-get-port server-dat))
(start-res (http-transport:client-connect iface port))
(ping-res (rmt:login-no-auto-client-setup start-res run-id)))
(if (and start-res
ping-res)
(begin
(hash-table-set! *runremote* run-id start-res)
(debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
start-res)
(begin ;; login failed but have a server record, clean out the record and try again
(debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
(http-transport:close-connections run-id)
(hash-table-delete! *runremote* run-id)
(tasks:server-force-clean-run-record (tasks:get-db)
(tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
run-id
(tasks:hostinfo-get-interface server-dat)
(tasks:hostinfo-get-port server-dat)
" client:setup (server-dat = #t)")
(thread-sleep! 2)
(server:try-running run-id)
(thread-sleep! 10) ;; give server a little time to start up
(client:setup run-id remaining-tries: (- remaining-tries 1)))))
(begin ;; no server registered
(let ((num-available (tasks:num-in-available-state (tasks:get-db) run-id)))
(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
(debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
(thread-sleep! 2)
(if (< num-available 2)
(begin
;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
(server:try-running run-id)))
(thread-sleep! 10) ;; give server a little time to start up
(client:setup run-id remaining-tries: (- remaining-tries 1))))))))))
(client:setup run-id remaining-tries: (- remaining-tries 1)))))))))))
;; keep this as a function to ease future
(define (client:start run-id server-info)
(http-transport:client-connect (tasks:hostinfo-get-interface server-info)
(tasks:hostinfo-get-port server-info)))
;; client:signal-handler
|
︙ | | |
Modified dashboard.scm
from [c1ff2abbf5]
to [ea8b1971cf].
︙ | | |
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
|
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
|
-
+
-
|
(> modtime last-db-update-time)
(> (current-seconds)(+ last-db-update-time 1)))))
(define *monitor-db-path* (conc *dbdir* "/monitor.db"))
(define *last-monitor-update-time* 0)
;; Force creation of the db in case it isn't already there.
(let ((db (tasks:open-db)))
(tasks:open-db)
(sqlite3:finalize! db))
(define (dashboard:get-youngest-run-db-mod-time)
(handle-exceptions
exn
(begin
(debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn))
(current-seconds)) ;; something went wrong - just print an error and return current-seconds
|
︙ | | |
Modified db.scm
from [4ac80e9f1a]
to [71a8762428].
︙ | | |
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
|
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
|
-
-
+
+
-
+
|
(db:dbdat-get-db todb)
full-sel)
;; first pass implementation, just insert all changed rows
(for-each
(lambda (targdb)
(let* ((db (db:dbdat-get-db targdb))
(stmth (sqlite3:prepare targdb full-ins)))
(db:delay-if-busy targdb)
(stmth (sqlite3:prepare db full-ins)))
;; (db:delay-if-busy targdb) ;; NO WAITING
(sqlite3:with-transaction
targdb
db
(lambda ()
(for-each ;;
(lambda (fromrow)
(let* ((a (vector-ref fromrow 0))
(curr (hash-table-ref/default todat a #f))
(same #t))
(let loop ((i 0))
|
︙ | | |
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
|
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
|
-
-
+
+
-
+
|
(dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f))
(mtdb (if toppath (db:open-megatest-db)))
(run-ids (if run-ids
run-ids
(if toppath (begin
(db:delay-if-busy mtdb)
(db:get-all-run-ids mtdb)))))
(mdb (tasks:open-db))
(servers (tasks:get-all-servers mdb)))
(tdbdat (tasks:open-db))
(servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
;; kill servers
(if (member 'killservers options)
(for-each
(lambda (server)
(tasks:server-delete-record mdb (vector-ref server 0) "dbmigration")
(tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration")
(tasks:kill-server (vector-ref server 2)(vector-ref server 1)))
servers))
;; clear out junk records
;;
(if (member 'dejunk options)
(begin
|
︙ | | |
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
|
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
|
-
-
+
+
+
|
(frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
(db:delay-if-busy frundb)
(db:delay-if-busy mtdb)
(if (eq? run-id 0)
(db:sync-tables (db:sync-main-list dbstruct) fromdb mtdb)
(db:sync-tables db:sync-tests-only fromdb mtdb))))
run-ids))
(db:close-all dbstruct)
(sqlite3:finalize! mdb)))
;; (db:close-all dbstruct)
;; (sqlite3:finalize! mdb)
))
;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling proc idb . params)
(debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
(if (or *db-write-access*
(not (member proc *db:all-write-procs*)))
(let* ((db (cond
|
︙ | | |
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
|
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
|
+
-
+
|
pid test-id))))
(define (db:test-get-top-process-pid dbstruct run-id test-id)
(db:with-db
dbstruct
run-id
#f
(lambda (db)
(sqlite3:first-result db "SELECT attemptnum FROM tests WHERE id=?;"
test-id)))
test-id))))
(define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time"
"host" "cpuload" "diskfree" "uname" "rundir" "item_path"
"run_duration" "final_logf" "comment" "shortdir" "attemptnum"))
;; fields *must* be a non-empty list
;;
|
︙ | | |
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
|
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
|
+
|
(if (null? tal)
(map cdr (hash-table->alist tests-hash)) ;; return a list of the most recent tests
(loop (car tal)(cdr tal))))))))))
(define (db:delay-if-busy dbdat #!key (count 6))
(if dbdat
(let* ((dbpath (db:dbdat-get-path dbdat))
(db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline
(dbfj (conc dbpath "-journal")))
(if (file-exists? dbfj)
(case count
((6)
(thread-sleep! 0.2)
(db:delay-if-busy count: 5))
((5)
|
︙ | | |
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
|
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
|
-
+
+
+
|
(thread-sleep! 3.2)
(db:delay-if-busy count: 1))
((1)
(thread-sleep! 6.4)
(db:delay-if-busy count: 0))
(else
(debug:print-info 0 "delaying db access due to high database load.")
(thread-sleep! 12.8)))))))
(thread-sleep! 12.8))))
db)
"bogus result from db:delay-if-busy"))
(define (db:test-get-records-for-index-file dbstruct run-id test-name)
(let ((res '()))
(db:with-db
dbstruct
run-id
#f
|
︙ | | |
Modified dcommon.scm
from [ed88f64fcd]
to [e887ed7ced].
︙ | | |
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
|
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
|
+
+
-
+
-
+
|
(set! dashboard:update-summary-tab updater)
(iup:attribute-set! stats-matrix "WIDTHDEF" "40")
(iup:vbox
;; (iup:label "Run statistics" #:expand "HORIZONTAL")
stats-matrix)))
(define (dcommon:servers-table)
(let* ((tdbdat (tasks:open-db))
(tdb (db:dbdat-get-db tdbdat))
(let* ((colnum 0)
(colnum 0)
(rownum 0)
(servers-matrix (iup:matrix #:expand "YES"
#:numcol 7
#:numcol-visible 7
#:numlin-visible 5
))
(colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
(updater (lambda ()
(let ((servers (tasks:get-all-servers (tasks:get-db))))
(let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
(iup:attribute-set! servers-matrix "NUMLIN" (length servers))
;; (set! colnum 0)
;; (for-each (lambda (colname)
;; ;; (print "colnum: " colnum " colname: " colname)
;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
;; (set! colnum (+ 1 colnum)))
;; colnames)
|
︙ | | |
Modified http-transport.scm
from [c83e2578f6]
to [8d5a62d976].
︙ | | |
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
|
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
|
-
+
+
-
-
+
-
-
+
-
-
+
|
headers: '((content-type text/plain))))
(else (continue))))))))
(http-transport:try-start-server run-id ipaddrstr start-port server-id)))
;; This is recursively run by http-transport:run until sucessful
;;
(define (http-transport:try-start-server run-id ipaddrstr portnum server-id)
(let ((config-hostname (configf:lookup *configdat* "server" "hostname")))
(let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
(tdbdat (tasks:open-db)))
(debug:print-info 0 "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname)
(handle-exceptions
exn
(begin
(print-error-message exn)
(if (< portnum 64000)
(begin
(debug:print 0 "WARNING: attempt to start server failed. Trying again ...")
(debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 0 "exn=" (condition->list exn))
(portlogger:open-run-close portlogger:set-failed portnum)
(debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port")
(thread-sleep! 0.1)
;; get_next_port goes here
(http-transport:try-start-server run-id
ipaddrstr
(portlogger:open-run-close portlogger:find-port)
server-id))
(begin
(tasks:wait-on-busy-monitor.db)
(tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server")
(tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
(print "ERROR: Tried and tried but could not start the server"))))
;; any error in following steps will result in a retry
(set! *server-info* (list ipaddrstr portnum))
(tasks:wait-on-busy-monitor.db)
(tasks:server-set-interface-port
(tasks:get-db)
(db:delay-if-busy tdbdat)
server-id
ipaddrstr portnum)
(debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum)
;; This starts the spiffy server
;; NEED WAY TO SET IP TO #f TO BIND ALL
;; (start-server bind-address: ipaddrstr port: portnum)
(if config-hostname ;; this is a hint to bind directly
(start-server port: portnum bind-address: (if (equal? config-hostname "-")
ipaddrstr
config-hostname))
(start-server port: portnum))
;; (portlogger:open-run-close portlogger:set-port portnum "released")
(tasks:wait-on-busy-monitor.db)
(tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server")
(tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
(debug:print 1 "INFO: server has been stopped"))))
;;======================================================================
;; S E R V E R U T I L I T I E S
;;======================================================================
;;======================================================================
|
︙ | | |
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
|
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
|
-
-
+
|
exn
(debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead")
(close-all-connections!))
(debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries)
(http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1)))
(begin
(mutex-unlock! *http-mutex*)
(tasks:wait-on-busy-monitor.db)
(tasks:kill-server-run-id run-id)
#f))
(begin
(debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n")
;; set up the http-client here
(max-retry-attempts 1)
;; consider all requests indempotent
(retry-request? (lambda (request)
#f))
;; send the data and get the response
;; extract the needed info from the http data and
;; process and return it.
(let* ((send-recieve (lambda ()
(mutex-lock! *http-mutex*)
;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
;; ((exn http client-error) e (print e)))
(set! res (handle-exceptions
exn
(begin
(debug:print 0 "WARNING: failure in with-input-from-request. Killing associated server to allow clean retry.")
(debug:print 0 "WARNING: failure in with-input-from-request to " fullrul ". Killing associated server to allow clean retry.")
(debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
(hash-table-delete! *runremote* run-id)
(tasks:kill-server-run-id run-id)
#f)
(with-input-from-request ;; was dat
fullurl
(list (cons 'key "thekey")
|
︙ | | |
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
|
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
|
+
-
+
-
+
-
-
+
-
-
-
+
-
-
+
-
+
|
;; run http-transport:keep-running in a parallel thread to monitor that the db is being
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
;; if none running or if > 20 seconds since
;; server last used then start shutdown
;; This thread waits for the server to come alive
(let* ((tdbdat (tasks:open-db))
(let* ((server-info (let loop ((start-time (current-seconds))
(server-info (let loop ((start-time (current-seconds))
(changed #t)
(last-sdat "not this"))
(let ((sdat #f))
(thread-sleep! 0.01)
(debug:print-info 0 "Waiting for server alive signature")
(mutex-lock! *heartbeat-mutex*)
(set! sdat *server-info*)
(mutex-unlock! *heartbeat-mutex*)
(if (and sdat
(not changed)
(> (- (current-seconds) start-time) 2))
sdat
(begin
(debug:print-info 0 "Still waiting, last-sdat=" last-sdat)
(sleep 4)
(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
(let ((tdb (tasks:open-db)))
(begin
(debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id)
(tasks:wait-on-busy-monitor.db)
(tasks:server-delete-record tdb server-id "failed to start, never received server alive signature")
(tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
(sqlite3:finalize! tdb)
(exit))
(loop start-time
(equal? sdat last-sdat)
sdat)))))))
(iface (car server-info))
(port (cadr server-info))
(last-access 0)
(tdb (tasks:open-db))
(server-timeout (let ((tmo (configf:lookup *configdat* "server" "timeout")))
(if (and (string? tmo)
(string->number tmo))
(* 60 60 (string->number tmo))
;; (* 3 24 60 60) ;; default to three days
(* 60 1) ;; default to one minute
;; (* 60 60 25) ;; default to 25 hours
))))
(let loop ((count 0)
(server-state 'available))
;; Use this opportunity to sync the inmemdb to db
(let ((start-time (current-milliseconds))
(sync-time #f)
(rem-time #f))
;; inmemdb is a dbstruct
(if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
(set! sync-time (- (current-milliseconds) start-time))
(set! rem-time (quotient (- 4000 sync-time) 1000))
(debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time)
;;
;; set_running after our first pass through and start the db
;;
(if (eq? server-state 'available)
(begin
(tasks:wait-on-busy-monitor.db)
(tasks:server-set-state! tdb server-id "dbprep")
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
(thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access
(set! *inmemdb* (db:setup run-id))
(tasks:server-set-state! tdb server-id "running")))
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")))
(if (and (<= rem-time 4)
(> rem-time 0))
(thread-sleep! rem-time)
(thread-sleep! 4))) ;; fallback for if the math is changed ...
(if (< count 1) ;; 3x3 = 9 secs aprox
|
︙ | | |
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
|
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
|
-
-
+
-
-
+
+
-
-
+
-
-
+
-
-
+
|
;; (tasks:server-set-state! tdb server-id "running"))
;;
(loop 0 server-state))
(begin
(debug:print-info 0 "Starting to shutdown the server.")
;; need to delete only *my* server entry (future use)
(set! *time-to-exit* #t)
(tasks:wait-on-busy-monitor.db) ;; wait here in addition to just before the shutting-down
(if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
;;
;; start_shutdown
;;
(tasks:server-set-state! tdb server-id "shutting-down")
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
(portlogger:open-run-close portlogger:set-port port "released")
(thread-sleep! 5)
(debug:print-info 0 "Max cached queries was " *max-cache-size*)
(debug:print-info 0 "Number of cached writes " *number-of-writes*)
(debug:print-info 0 "Average cached write time "
(if (eq? *number-of-writes* 0)
"n/a (no writes)"
(/ *writes-total-delay*
*number-of-writes*))
" ms")
(debug:print-info 0 "Number non-cached queries " *number-non-write-queries*)
(debug:print-info 0 "Average non-cached time "
(if (eq? *number-non-write-queries* 0)
"n/a (no queries)"
(/ *total-non-write-delay*
*number-non-write-queries*))
" ms")
(debug:print-info 0 "Server shutdown complete. Exiting")
(tasks:wait-on-busy-monitor.db)
(tasks:server-delete-record tdb server-id " http-transport:keep-running")
(tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
(exit))))))
;; all routes though here end in exit ...
;;
;; start_server?
;;
(define (http-transport:launch run-id)
(let* ((tdbdat (tasks:open-db)))
(set! *run-id* run-id)
(if (args:get-arg "-daemonize")
(begin
(daemon:ize)
(if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
(begin
(current-error-port *alt-log-file*)
(current-output-port *alt-log-file*)))))
(if (server:check-if-running run-id)
(begin
(debug:print 0 "INFO: Server for run-id " run-id " already running")
(exit 0)))
(tasks:wait-on-busy-monitor.db)
(let loop ((server-id (tasks:server-lock-slot (tasks:get-db) run-id))
(let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
(remtries 4))
(if (not server-id)
(if (> remtries 0)
(begin
(thread-sleep! 2)
(tasks:wait-on-busy-monitor.db)
(loop (tasks:server-lock-slot (tasks:get-db) run-id)
(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
(- remtries 1)))
(begin
;; since we didn't get the server lock we are going to clean up and bail out
(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
(tasks:wait-on-busy-monitor.db)
(tasks:server-delete-records-for-this-pid (tasks:get-db) " http-transport:launch")
(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
))
(let* ((th2 (make-thread (lambda ()
(debug:print-info 0 "Server run thread started")
(http-transport:run
(if (args:get-arg "-server")
(args:get-arg "-server")
"-")
|
︙ | | |
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
|
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
|
-
+
|
;; (set! *inmemdb* (db:setup run-id))
(thread-start! th2)
(thread-start! th3)
(set! *didsomething* #t)
(thread-join! th2)
(exit)))))
(exit))))))
(define (http-transport:server-signal-handler signum)
(signal-mask! signum)
(handle-exceptions
exn
(debug:print " ... exiting ...")
(let ((th1 (make-thread (lambda ()
|
︙ | | |
Modified megatest.scm
from [9b64b448cf]
to [42ff41d3a5].
︙ | | |
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
|
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
|
+
-
+
|
;; MAY STILL NEED THIS
;; (set! *megatest-db* (make-dbr:dbstruct path: *toppath* local: #t))))))))))
(if (or (args:get-arg "-list-servers")
(args:get-arg "-stop-server"))
(let ((tl (launch:setup-for-run)))
(if tl
(let* ((tdbdat (tasks:open-db))
(let* ((servers (open-run-close tasks:get-all-servers tasks:open-db))
(servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
(fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
(servers-to-kill '())
(killinfo (args:get-arg "-stop-server"))
(khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
(sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
(format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
(format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========")
|
︙ | | |
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
|
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
|
-
+
-
+
|
(killed #f)
(status (< last-update 20)))
;; (zmq-sockets (if status (server:client-connect hostname port) #f)))
;; no need to login as status of #t indicates we are connecting to correct
;; server
(if (equal? state "dead")
(if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
(open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid action: 'delete))
(tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete))
(if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds
(open-run-close tasks:server-deregister tasks:open-db hostname pullport: pullport pid: pid)))
(tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
(format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
(if status "alive" "dead") transport)
(if (or (equal? id sid)
(equal? sid 0)) ;; kill all/any
(begin
(debug:print-info 0 "Attempting to stop server with pid " pid)
(tasks:kill-server status hostname pullport pid transport)))))
|
︙ | | |
Modified rmt.scm
from [ec918e30be]
to [3dfb2ffd80].
︙ | | |
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
|
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
-
+
+
+
|
(mutex-unlock! *db-multi-sync-mutex*)
(let* ((run-id (if rid rid 0))
(connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
(if cinfo
cinfo
;; NB// can cache the answer for server running for 10 seconds ...
;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
(if (tasks:server-running-or-starting? (tasks:get-db) run-id)
(if (tasks:server-running-or-starting? (db:delay-if-busy
(tasks:open-db))
run-id)
(let ((res (client:setup run-id)))
(if res
(hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully)
#f))
#f))))
(jparams (db:obj->string params)))
(if connection-info
|
︙ | | |
Modified runs.scm
from [5861640706]
to [692dff51df].
︙ | | |
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
|
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
|
-
+
+
-
-
+
+
-
-
-
-
+
+
|
(test-records (make-hash-table))
;; need to process runconfigs before generating these lists
(all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
(all-test-names #f) ;; (hash-table-keys all-tests-registry))
(test-names #f) ;; (tests:filter-test-names all-test-names test-patts))
(required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
(task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
(tasks-db (tasks:open-db)))
(tdbdat (tasks:open-db)))
(set-signal-handler! signal/int
(lambda (signum)
(signal-mask! signum)
(print "Received signal " signum ", cleaning up before exit. Please wait...")
(let ((tdb (tasks:open-db)))
(tasks:set-state-given-param-key tdb task-key "killed")
(let ((tdbdat (tasks:open-db)))
(tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed"))
;; (sqlite3:interrupt! tdb) ;; seems silly?
(sqlite3:finalize! tdb))
(print "Killed by signal " signum ". Exiting")
(exit)))
;; register this run in monitor.db
(tasks:add tasks-db "run-tests" user target runname test-patts task-key) ;; params)
(tasks:set-state-given-param-key tasks-db task-key "running")
(tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params)
(tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running")
(runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process
(if (file-exists? runconfigf)
(setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
(debug:print 0 "WARNING: You do not have a run config file: " runconfigf))
;; Now generate all the tests lists
(set! all-tests-registry (tests:get-all))
|
︙ | | |
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
|
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
|
-
-
+
+
+
|
(if (not (hash-table-ref/default flags "-preclean" #f))
(hash-table-set! flags "-preclean" #t))
(if (not (hash-table-ref/default flags "-rerun" #f))
(hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
(runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))))
(debug:print-info 0 "No tests to run")))
(debug:print-info 4 "All done by here")
(tasks:set-state-given-param-key tasks-db task-key "done")
(sqlite3:finalize! tasks-db)))
(tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done")
;; (sqlite3:finalize! tasks-db)
))
;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable.
;;
;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns
;; If reg is full (i.e. length >= n
;; loop with (car reg) tal (cdr reg) reruns
|
︙ | | |
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
|
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
|
-
+
|
;; 'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f))
(common:clear-caches) ;; clear all caches
(let* ((db #f)
(tasks-db (tasks:open-db))
(tdbdat (tasks:open-db))
(keys (rmt:get-keys))
(rundat (mt:get-runs-by-patt keys runnamepatt target))
(header (vector-ref rundat 0))
(runs (vector-ref rundat 1))
(states (if state (string-split state ",") '()))
(statuses (if status (string-split status ",") '()))
(state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))
|
︙ | | |
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
|
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
|
-
+
|
(debug:print-info 4 "runs:operate-on run=" run ", header=" header)
(if (not (null? tests))
(begin
(case action
((remove-runs)
;; seek and kill in flight -runtests with % as testpatt here
(if (equal? testpatt "%")
(tasks:kill-runner tasks-db target run-name)
(tasks:kill-runner (db:delay-if-busy tdbdat) target run-name)
(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
(debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
((set-state-status)
(debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
((print-run)
(debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
action)
|
︙ | | |
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
|
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
|
-
+
+
|
;; (if (null? (glob (conc runpath "/*")))
;; (begin
;; (debug:print 1 "Removing run dir " runpath)
;; (system (conc "rmdir -p " runpath))))
)))))
))
runs)
(sqlite3:finalize! tasks-db))
;; (sqlite3:finalize! (db:delay-if-busy tdbdat))
)
#t)
(define (runs:remove-test-directory db test remove-data-only)
(let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree
(real-dir (if (file-exists? run-dir)
(resolve-pathname run-dir)
#f)))
|
︙ | | |
Modified server.scm
from [faceda817c]
to [13f9300039].
︙ | | |
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
+
-
+
-
+
-
+
+
-
+
|
;;
(define (server:try-running run-id)
(if (eq? run-id 0)
(server:run run-id)
(rmt:start-server run-id)))
(define (server:check-if-running run-id)
(let ((tdbdat (tasks:open-db)))
(let loop ((server (tasks:get-server (tasks:get-db) run-id))
(let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
(trycount 0))
(if server
;; note: client:start will set *runremote*. this needs to be changed
;; also, client:start will login to the server, also need to change that.
;;
;; client:start returns #t if login was successful.
;;
(let ((res (server:ping-server run-id
(tasks:hostinfo-get-interface server)
(tasks:hostinfo-get-port server))))
;; if the server didn't respond we must remove the record
(if res
#t
(begin
(debug:print-info 0 "server at " server " not responding, removing record")
(tasks:server-force-clean-running-records-for-run-id (tasks:get-db) run-id
(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id
" server:check-if-running")
res)))
#f)))
#f))))
;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port)
(let ((tdbdat (tasks:open-db)))
(let* ((host-port (let ((slst (string-split host:port ":")))
(if (eq? (length slst) 2)
(list (car slst)(string->number (cadr slst)))
#f)))
(toppath (launch:setup-for-run))
(server-db-dat (if (not host-port)(tasks:get-server (tasks:get-db) run-id) #f)))
(server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
(if (not run-id)
(begin
(debug:print 0 "ERROR: must specify run-id when doing ping, -run-id n")
(print "ERROR: No run-id")
(exit 1))
(if (and (not host-port)
(not server-db-dat))
|
︙ | | |
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
|
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
|
-
+
|
(if (and (list? login-res)
(car login-res))
(begin
(print "LOGIN_OK")
(exit 0))
(begin
(print "LOGIN_FAILED")
(exit 1))))))))
(exit 1)))))))))
;; run ping in separate process, safest way in some cases
;;
(define (server:ping-server run-id iface port)
(with-input-from-pipe
(conc (common:get-megatest-exe) " -run-id " run-id " -ping " (conc iface ":" port))
(lambda ()
|
︙ | | |
Modified tasks.scm
from [059408bffa]
to [c8e0f86792].
︙ | | |
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
|
-
-
-
+
+
|
(define (tasks:get-task-db-path)
(if *task-db*
(vector-ref *task-db* 1)
(let* ((linktree (configf:lookup *configdat* "setup" "linktree"))
(dbpath (conc linktree "/.db/monitor.db")))
dbpath)))
(define (tasks:wait-on-busy-monitor.db)
(tasks:wait-on-journal (tasks:get-task-db-path) 30))
;; If file exists AND
;; file readable
;; ==> open it
;; If file exists AND
;; file NOT readable
;; ==> open in-mem version
;; If file NOT exists
;; ==> open in-mem version
;;
(define (tasks:open-db)
(if *task-db*
*task-db*
(let* ((dbpath (tasks:get-task-db-path))
(avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
(exists (file-exists? dbpath))
(write-access (file-write-access? dbpath))
(mdb (cond
((file-write-access? *toppath*)(sqlite3:open-database dbpath))
((file-read-access? dbpath) (sqlite3:open-database dbpath))
|
︙ | | |
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
|
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
-
-
+
-
-
+
-
-
-
-
-
|
hostname TEXT,
cmdline TEXT,
login_time TIMESTAMP,
logout_time TIMESTAMP DEFAULT -1,
CONSTRAINT clients_constraint UNIQUE (pid,hostname));")
))
mdb))
(set! *task-db* (cons mdb dbpath))
(define (tasks:get-db)
(if *task-db*
*task-db*)))
(vector-ref *task-db* 0)
(let ((db (tasks:open-db))
(pth (tasks:get-task-db-path)))
(set! *task-db* (vector db pth))
db)))
;;======================================================================
;; Server and client management
;;======================================================================
;; make-vector-record tasks hostinfo id interface port pubport transport pid hostname
(define (tasks:hostinfo-get-id vec) (vector-ref vec 0))
|
︙ | | |
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
|
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
|
-
-
+
+
-
+
-
+
+
|
(system (conc "nbfake kill " pid))
(unsetenv "TARGETHOST_LOGF")
(unsetenv "TARGETHOST"))
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
(let* ((tdb (tasks:open-db))
(sdat (tasks:get-server tdb run-id)))
(let* ((tdbdat (tasks:open-db))
(sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
(if sdat
(let ((hostname (vector-ref sdat 6))
(pid (vector-ref sdat 5))
(server-id (vector-ref sdat 0)))
(debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
(tasks:kill-server hostname pid)
(tasks:server-delete-record tdb server-id tag) )
(tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
(sqlite3:finalize! tdb)))
;; (sqlite3:finalize! tdb)
))
;; (if status ;; #t means alive
;; (begin
;; (if (equal? hostname (get-host-name))
;; (handle-exceptions
;; exn
;; (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"
|
︙ | | |