Megatest

Check-in [f61052be3c]
Login
Overview
Comment:merged changes to reduce load of brute force syncer
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65 | v1.6528
Files: files | file ages | folders
SHA1: f61052be3c1cb795806f4c8895add22d6d1c9bf8
User & Date: bjbarcla on 2019-03-29 13:17:59
Other Links: branch diff | manifest | tags
Context
2019-04-01
08:32
Merged trim sleeps check-in: db1d22eadb user: mrwellan tags: v1.65
2019-03-29
18:26
wip Closed-Leaf check-in: 8e9712a5d3 user: bjbarcla tags: v1.65-dont-preqfail-completed
17:59
Trim couple sleeps that seem unnecessary Closed-Leaf check-in: 1550ea7ddf user: mrwellan tags: v1.65-trim-sleeps
13:17
merged changes to reduce load of brute force syncer check-in: f61052be3c user: bjbarcla tags: v1.65, v1.6528
11:36
merged v1.65-nosleep Closed-Leaf check-in: 2ea9ce81b4 user: bjbarcla tags: v1.65-integ-19-03-29
2019-03-28
16:10
updated version check-in: 8293650fcd user: pjhatwal tags: v1.65
Changes

Modified common.scm from [c41ac723cd] to [b1d85b703a].

316
317
318
319
320
321
322
323






324
325
326
327
328
329
330
;; Returns #t when the value of (common:get-last-run-version) is not
;; equal? to the value of (common:version-signature), #f otherwise.
(define (common:version-changed?)
  (let ((last-run (common:get-last-run-version))
        (current  (common:version-signature)))
    (if (equal? last-run current) #f #t)))

;; Returns #t when the leading (up to) four characters of the running
;; megatest-version differ from the leading characters of the value
;; returned by (common:get-last-run-version); #f when they match.
;;
;; A bare (substring s 0 4) raises an out-of-range error when s has
;; fewer than four characters (for example a version that prints as
;; "1.7"), so the prefix length is clamped to the string length.
;; NOTE(review): confirm what common:get-last-run-version returns when
;; no previous run is recorded; a short rendering of that value would
;; also have hit the error.
(define (common:api-changed?)
  (let ((api-prefix (lambda (str)
                      (substring str 0 (min 4 (string-length str))))))
    (not (equal? (api-prefix (->string megatest-version))
                 (api-prefix (conc (common:get-last-run-version)))))))
  






;; Move me elsewhere ...
;; RADT => Why do we need the version check here, this is called only if version mismatch
;;
(define (common:cleanup-db dbstruct #!key (full #f))
  (apply db:multi-db-sync 
   dbstruct
   'schema







|
>
>
>
>
>
>







316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
;; Returns #t when the value of (common:get-last-run-version) is not
;; equal? to the value of (common:version-signature), #f otherwise.
(define (common:version-changed?)
  (let ((last-run (common:get-last-run-version))
        (current  (common:version-signature)))
    (if (equal? last-run current) #f #t)))

;; Returns #t when the leading (up to) four characters of the running
;; megatest-version differ from the leading characters of the value
;; returned by (common:get-last-run-version); #f when they match.
;;
;; A bare (substring s 0 4) raises an out-of-range error when s has
;; fewer than four characters (for example a version that prints as
;; "1.7"), so the prefix length is clamped to the string length.
;; NOTE(review): confirm what common:get-last-run-version returns when
;; no previous run is recorded; a short rendering of that value would
;; also have hit the error.
(define (common:api-changed?)
  (let ((api-prefix (lambda (str)
                      (substring str 0 (min 4 (string-length str))))))
    (not (equal? (api-prefix (->string megatest-version))
                 (api-prefix (conc (common:get-last-run-version)))))))


;; Returns the sync lock file path: the string "/megatest.db.sync-lock"
;; appended to the value of (common:get-db-tmp-area).
(define (common:get-sync-lock-filepath)
  (conc (common:get-db-tmp-area) "/megatest.db.sync-lock"))
    
;; Move me elsewhere ...
;; RADT => Why do we need the version check here, this is called only if version mismatch
;;
(define (common:cleanup-db dbstruct #!key (full #f))
  (apply db:multi-db-sync 
   dbstruct
   'schema
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622


623
624
625
626
627
628
629
   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))

;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again
;; to get the lock
;;
;; fname       - path of the lock file
;; expire-time - age in seconds (measured from the file's modification
;;               time) beyond which an existing lock file is deleted and
;;               the lock attempt is repeated; defaults to 300
;;
;; Returns the result of comparing this process's key with the first
;; line read back from the lock file (#t when they are equal?).
;; Returns #f when a lock file exists and has not expired, when the
;; file no longer exists after the write, or when any exception is
;; raised inside the handler.
;;
(define (common:simple-file-lock fname #!key (expire-time 300))
  (handle-exceptions
      exn
      #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail.
    (if (common:file-exists? fname)
	(if (> (- (current-seconds)(file-modification-time fname)) expire-time)
	    (begin
	      ;; existing lock is older than expire-time: remove it and retry
	      (delete-file* fname)
	      (common:simple-file-lock fname expire-time: expire-time))
	    #f)
	;; no lock file present: write "<hostname>-<pid>" into it as our claim
	(let ((key-string (conc (get-host-name) "-" (current-process-id))))
	  (with-output-to-file fname
	    (lambda ()
	      (print key-string)))
	  ;; pause, then read the file back; the lock counts as ours only
	  ;; if the first line still equals our key
	  (thread-sleep! 0.25)
	  (if (common:file-exists? fname)
	      (with-input-from-file fname
		(lambda ()
		  (equal? key-string (read-line))))
	      #f)))))



(define (common:simple-file-lock-and-wait fname #!key (expire-time 300))
  (let ((end-time (+ expire-time (current-seconds))))
    (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time)))
      (if got-lock
	  #t
	  (if (> end-time (current-seconds))







|
|
|















|
>
>







603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))

;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again
;; to get the lock
;;
;; fname       - path of the lock file
;; expire-time - age in seconds (measured from the file's modification
;;               time) beyond which an existing lock file is deleted and
;;               the lock attempt is repeated; defaults to 300
;;
;; Returns the result of comparing this process's key with the first
;; line read back from the lock file (#t when they are equal?).
;; Returns #f when a lock file exists and has not expired, or when the
;; file no longer exists after the write.
;;
;; The handle-exceptions wrapper below is commented out, so an exception
;; raised by any of the file operations is not caught in this procedure.
;;
(define (common:simple-file-lock fname #!key (expire-time 300))
;  (handle-exceptions
;      exn
;      #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail.
    (if (common:file-exists? fname)
	(if (> (- (current-seconds)(file-modification-time fname)) expire-time)
	    (begin
	      ;; existing lock is older than expire-time: remove it and retry
	      (delete-file* fname)
	      (common:simple-file-lock fname expire-time: expire-time))
	    #f)
	;; no lock file present: write "<hostname>-<pid>" into it as our claim
	(let ((key-string (conc (get-host-name) "-" (current-process-id))))
	  (with-output-to-file fname
	    (lambda ()
	      (print key-string)))
	  ;; pause, then read the file back; the lock counts as ours only
	  ;; if the first line still equals our key
	  (thread-sleep! 0.25)
	  (if (common:file-exists? fname)
	      (with-input-from-file fname
		(lambda ()
		  (equal? key-string (read-line))))
	      #f)))
;    )
  )

(define (common:simple-file-lock-and-wait fname #!key (expire-time 300))
  (let ((end-time (+ expire-time (current-seconds))))
    (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time)))
      (if got-lock
	  #t
	  (if (> end-time (current-seconds))
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
      )
    )

  0)

;; Signal handler: sets *time-to-exit* to #t, reports the received
;; signal number through debug:print-error, then calls (exit).
;;
;; signum - the signal number; used only in the logged message
(define (std-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t)
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

(define (special-signal-handler signum)
  ;; (signal-mask! signum)







|







943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
      )
    )

  0)

;; Signal handler: sets *time-to-exit* to #t, reports the received
;; signal number through debug:print-error, then calls (exit).
;;
;; signum - the signal number; used only in the logged message
(define (std-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t) 
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

(define (special-signal-handler signum)
  ;; (signal-mask! signum)

Modified db.scm from [73e0450409] to [a146d876b8].

311
312
313
314
315
316
317

318
319
320
321
322
323
324
325

;; This routine creates the db if not already present. It is only called if the db is not already opened
;;
(define (db:open-db dbstruct #!key (areapath #f)(do-sync #t)) ;; TODO: actually use areapath
  (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct
    (if (stack? tmpdb-stack)
	(db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used

        (let* ((dbpath       (db:dbfile-path ))      ;; path to tmp db area
               (dbexists     (common:file-exists? dbpath))
	       (tmpdbfname   (conc dbpath "/megatest.db"))
	       (dbfexists    (common:file-exists? tmpdbfname))  ;; (conc dbpath "/megatest.db")))
               (mtdbexists   (common:file-exists? (conc *toppath* "/megatest.db")))
							 
               (mtdbmodtime  (if mtdbexists (common:lazy-sqlite-db-modification-time (conc *toppath* "/megatest.db"))   #f))
	        		 (tmpdbmodtime (if dbfexists  (common:lazy-sqlite-db-modification-time tmpdbfname) #f)) 







>
|







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326

;; This routine creates the db if not already present. It is only called if the db is not already opened
;;
(define (db:open-db dbstruct #!key (areapath #f)(do-sync #t)) ;; TODO: actually use areapath
  (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct
    (if (stack? tmpdb-stack)
	(db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used
        (let* ((max-stale-tmp (configf:lookup-number *configdat* "server" "filling-db-max-stale-seconds" default: 10))
               (dbpath       (db:dbfile-path ))      ;; path to tmp db area
               (dbexists     (common:file-exists? dbpath))
	       (tmpdbfname   (conc dbpath "/megatest.db"))
	       (dbfexists    (common:file-exists? tmpdbfname))  ;; (conc dbpath "/megatest.db")))
               (mtdbexists   (common:file-exists? (conc *toppath* "/megatest.db")))
							 
               (mtdbmodtime  (if mtdbexists (common:lazy-sqlite-db-modification-time (conc *toppath* "/megatest.db"))   #f))
	        		 (tmpdbmodtime (if dbfexists  (common:lazy-sqlite-db-modification-time tmpdbfname) #f)) 
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
          (dbr:dbstruct-tmpdb-set!  dbstruct tmpdb)
          (dbr:dbstruct-dbstack-set! dbstruct (make-stack)) ;; BB: why a stack?  Why would the number of db's be indeterminate?  Is this a legacy of 1.db 2.db .. ?
          (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb) ;; olddb is already a (cons db path)
          (dbr:dbstruct-refndb-set! dbstruct refndb)
          ;;	    (mutex-unlock! *rundb-mutex*)
          (if (and  (or (not dbfexists)
			(and modtimedelta
			     (> modtimedelta 10))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back
		    do-sync)
	      (begin
		(debug:print 1 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data \n    from " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta)
		(db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb)
    ;touch tmp db to avoid wal mode wierdness  
     (set! (file-modification-time tmpdbfname) (current-seconds))  
                (debug:print-info 13 *default-log-port* "db:sync-all-tables-list done.")







|







350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
          (dbr:dbstruct-tmpdb-set!  dbstruct tmpdb)
          (dbr:dbstruct-dbstack-set! dbstruct (make-stack)) ;; BB: why a stack?  Why would the number of db's be indeterminate?  Is this a legacy of 1.db 2.db .. ?
          (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb) ;; olddb is already a (cons db path)
          (dbr:dbstruct-refndb-set! dbstruct refndb)
          ;;	    (mutex-unlock! *rundb-mutex*)
          (if (and  (or (not dbfexists)
			(and modtimedelta
			     (> modtimedelta max-stale-tmp))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back
		    do-sync)
	      (begin
		(debug:print 1 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data \n    from " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta)
		(db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb)
    ;touch tmp db to avoid wal mode wierdness  
     (set! (file-modification-time tmpdbfname) (current-seconds))  
                (debug:print-info 13 *default-log-port* "db:sync-all-tables-list done.")
1046
1047
1048
1049
1050
1051
1052
1053




1054
1055
1056
1057
1058
1059
1060
	 ;; kill servers
	 ((killservers)
	  (for-each
	   (lambda (server)
	     (match-let (((mod-time host port start-time pid) server))
	       (if (and host pid)
		   (tasks:kill-server host pid))))
	   servers))




	 
	 ;; clear out junk records
	 ;;
	 ((dejunk)
	  (db:delay-if-busy mtdb) ;; ok to delay on mtdb
	  (db:clean-up mtdb)
	  (db:clean-up tmpdb)







|
>
>
>
>







1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
	 ;; kill servers
	 ((killservers)
	  (for-each
	   (lambda (server)
	     (match-let (((mod-time host port start-time pid) server))
	       (if (and host pid)
		   (tasks:kill-server host pid))))
	   servers)

          ;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock
          (delete-file* (common:get-sync-lock-filepath))
          )
	 
	 ;; clear out junk records
	 ;;
	 ((dejunk)
	  (db:delay-if-busy mtdb) ;; ok to delay on mtdb
	  (db:clean-up mtdb)
	  (db:clean-up tmpdb)

Modified megatest.scm from [a7dc9766fe] to [8964e71ae0].

424
425
426
427
428
429
430

431
432
433
434
435
436
437
			"-create-megatest-area"
			"-mark-incompletes"

			"-convert-to-norm"
			"-convert-to-old"
			"-import-megatest.db"
			"-sync-to-megatest.db"

			"-logging"
			"-v" ;; verbose 2, more than normal (normal is 1)
			"-q" ;; quiet 0, errors/warnings only

                        "-diff-rep"
                        )
		 args:arg-hash







>







424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
			"-create-megatest-area"
			"-mark-incompletes"

			"-convert-to-norm"
			"-convert-to-old"
			"-import-megatest.db"
			"-sync-to-megatest.db"
                        "-sync-brute-force"
			"-logging"
			"-v" ;; verbose 2, more than normal (normal is 1)
			"-q" ;; quiet 0, errors/warnings only

                        "-diff-rep"
                        )
		 args:arg-hash
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
							 (db:get-value-by-header run header x))
						       keys) "/"))
                                        (statuses (string-split (or (args:get-arg "-status") "") ","))
                                        (run-id  (db:get-value-by-header run header "id"))
                                        (runname (db:get-value-by-header run header "runname")) 
                                        (states  (string-split (or (args:get-arg "-state") "") ","))
                                        (tests   (if tests-spec
                                                     (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc 
                                                                        ;; use qryvals if test-spec provided
                                                                        (if tests-spec
                                                                            (string-intersperse adj-tests-spec ",")
                                                                            ;; db:test-record-fields
                                                                            #f)
                                                                        #f
                                                                        'normal)







|







1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
							 (db:get-value-by-header run header x))
						       keys) "/"))
                                        (statuses (string-split (or (args:get-arg "-status") "") ","))
                                        (run-id  (db:get-value-by-header run header "id"))
                                        (runname (db:get-value-by-header run header "runname")) 
                                        (states  (string-split (or (args:get-arg "-state") "") ","))
                                        (tests   (if tests-spec
                                                     (rmt:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc 
                                                                        ;; use qryvals if test-spec provided
                                                                        (if tests-spec
                                                                            (string-intersperse adj-tests-spec ",")
                                                                            ;; db:test-record-fields
                                                                            #f)
                                                                        #f
                                                                        'normal)
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
			     (hash-table-set! data "targets" (cons targetstr (hash-table-ref/default data "targets" '())))
			     )))
		   (let* ((run-id  (db:get-value-by-header run header "id"))
			  (runname (db:get-value-by-header run header "runname")) 
			  (states  (string-split (or (args:get-arg "-state") "") ","))
			  (statuses (string-split (or (args:get-arg "-status") "") ","))
			  (tests   (if tests-spec
				       (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc 
							     ;; use qryvals if test-spec provided
							     (if tests-spec
								 (string-intersperse adj-tests-spec ",")
								 ;; db:test-record-fields
								 #f)
							     #f
							     'normal)







|







1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
			     (hash-table-set! data "targets" (cons targetstr (hash-table-ref/default data "targets" '())))
			     )))
		   (let* ((run-id  (db:get-value-by-header run header "id"))
			  (runname (db:get-value-by-header run header "runname")) 
			  (states  (string-split (or (args:get-arg "-state") "") ","))
			  (statuses (string-split (or (args:get-arg "-status") "") ","))
			  (tests   (if tests-spec
				       (rmt:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc 
							     ;; use qryvals if test-spec provided
							     (if tests-spec
								 (string-intersperse adj-tests-spec ",")
								 ;; db:test-record-fields
								 #f)
							     #f
							     'normal)
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
						 (conc "\n         rundir:   " (get-value-by-fieldname test test-field-index "rundir")) ;; (db:test-get-rundir test)
						 "")
;;					     "\n         rundir:   " (get-value-by-fieldname test test-field-index "") ;; (sdb:qry 'getstr ;; (filedb:get-path *fdb* 
;; 					     (db:test-get-rundir test) ;; )
					     )
				    ;; Each test
				    ;; DO NOT remote run
				    (let ((steps (db:dispatch-query access-mode rmt:get-steps-for-test db:get-steps-for-test run-id (db:test-get-id test)))) ;; (db:get-steps-for-test dbstruct run-id (db:test-get-id test))))
				      (for-each 
				       (lambda (step)
					 (format #t 
						 "    Step: ~20a State: ~10a Status: ~10a Time ~22a\n"
						 (tdb:step-get-stepname step)
						 (tdb:step-get-state step)
						 (tdb:step-get-status step)







|







1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
						 (conc "\n         rundir:   " (get-value-by-fieldname test test-field-index "rundir")) ;; (db:test-get-rundir test)
						 "")
;;					     "\n         rundir:   " (get-value-by-fieldname test test-field-index "") ;; (sdb:qry 'getstr ;; (filedb:get-path *fdb* 
;; 					     (db:test-get-rundir test) ;; )
					     )
				    ;; Each test
				    ;; DO NOT remote run
				    (let ((steps (rmt:get-steps-for-test run-id (db:test-get-id test)))) ;; (db:get-steps-for-test dbstruct run-id (db:test-get-id test))))
				      (for-each 
				       (lambda (step)
					 (format #t 
						 "    Step: ~20a State: ~10a Status: ~10a Time ~22a\n"
						 (tdb:step-get-stepname step)
						 (tdb:step-get-state step)
						 (tdb:step-get-status step)
2294
2295
2296
2297
2298
2299
2300




2301
2302
2303
2304
2305
2306
2307
       'killservers
       'dejunk
       'adj-testids
       'old2new
       ;; 'new2old
       )
      (set! *didsomething* #t)))





(if (args:get-arg "-sync-to-megatest.db")
    (let* ((dbstruct (db:setup #f))
	   (tmpdbpth (cdr (dbr:dbstruct-tmpdb dbstruct)))
	   (lockfile (conc tmpdbpth ".lock"))
	   (locked   (common:simple-file-lock lockfile)) 
	   (res      (if locked







>
>
>
>







2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
       'killservers
       'dejunk
       'adj-testids
       'old2new
       ;; 'new2old
       )
      (set! *didsomething* #t)))

;; -sync-brute-force: obtain the procedure returned by
;; server:get-bruteforce-syncer (called with a fresh (db:setup #t) and
;; persist-until-sync: #t), invoke it with no arguments, then record
;; that this invocation did something.
(if (args:get-arg "-sync-brute-force")
    (let ((syncer (server:get-bruteforce-syncer (db:setup #t) persist-until-sync: #t)))
      (syncer)
      (set! *didsomething* #t)))

(if (args:get-arg "-sync-to-megatest.db")
    (let* ((dbstruct (db:setup #f))
	   (tmpdbpth (cdr (dbr:dbstruct-tmpdb dbstruct)))
	   (lockfile (conc tmpdbpth ".lock"))
	   (locked   (common:simple-file-lock lockfile)) 
	   (res      (if locked

Modified server.scm from [8ce184eea5] to [a266765465].

498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515




516
517
518
519
520




521





522
523







524
525
526
527
528
529

530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547

548
549
550
551
552
553
554
555
556

557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583

584
585



586
587


588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609

610
611
612
613
614

615
616
617
618




619
620

621
622
623
624
625
626
627
628
629
630
631


632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662

663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702

703
704
705


706
;;        (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
;;       (server:release-sync-lock)
;;       (server:have-sync-lock?))
;;      (else #f))))

;; moving this here as it needs access to db and cannot be in common.
;;
;; Periodic sync loop that dumps the tmp-area megatest.db into the
;; megatest.db under *toppath* using the sqlite command-line tool.
;;
;; dbstruct - accepted for interface compatibility; not referenced in
;;            this body
;;
;; After a 10 second startup delay the loop runs only when "-server" is
;; given and "-sync-to-megatest.db" is not.  Each pass:
;;   1. sleeps server.minimum-intersync-delay seconds (default 30)
;;   2. tries to take the lock file <tmp-db>.lock; if that fails, logs
;;      that another sync is in progress
;;   3. with the lock held: snapshots the current megatest.db unless
;;      server.disable-db-snapshot is set, removes the staging file,
;;      and pipes "<sqlite> <tmp-db> .dump" into "<sqlite> <staging>"
;;   4. on exit status 0, moves the staging file over megatest.db; on
;;      any other status, copies the sync log to <sync-log>.fail and
;;      restores megatest.db from megatest.db.backup when that exists
;;   5. releases the lock
;; then waits up to 6 one-second sleeps and repeats unless
;; *time-to-exit* has been set.
;;
;; Fixes relative to the previous revision:
;;   - sqlite-exe falls back to "sqlite3" when MT_SQLITE3_EXE is unset;
;;     previously the unset value (#f) was rendered by conc straight
;;     into the shell command.
;;   - the bash-only "&>" redirection is replaced with the POSIX form
;;     "> file 2>&1", since system runs the command through sh.
;;
;; NOTE(review): (loop) is invoked from inside delay-loop, and the
;; recursive (delay-loop ...) call is followed by another form, so
;; neither call is in tail position; each sync cycle therefore nests
;; inside the previous one.  Left unchanged here - confirm whether that
;; is intended.
(define (server:writable-watchdog dbstruct)
  (thread-sleep! 10) ;; delay for startup
  (let* ((legacy-sync  (common:run-sync?)) ;; value is not used below; call retained unchanged
         ;; MT_SQLITE3_EXE is expected to be defined in cfg.sh; fall back to sqlite3 on PATH
         (sqlite-exe   (or (get-environment-variable "MT_SQLITE3_EXE") "sqlite3"))
         (sync-log     (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))
         (tmp-area     (common:get-db-tmp-area))
         (tmp-db       (conc tmp-area "/megatest.db"))
         (staging-file (conc *toppath* "/.megatest.db"))
         (mtdbfile     (conc *toppath* "/megatest.db"))
         (lockfile     (conc tmp-db ".lock"))
         ;; stdout and stderr of the pipeline's final command go to sync-log (POSIX redirection form)
         (sync-cmd     (conc sqlite-exe " " tmp-db " .dump | " sqlite-exe " " staging-file " > " sync-log " 2>&1"))
         (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 30)))
    (if (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync
             (args:get-arg "-server"))
        (let loop ()
          (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for server.minimum-intersync-delay seconds ["min-intersync-delay"]")
          (thread-sleep! min-intersync-delay)
          (if (common:simple-file-lock lockfile)
              (begin
                (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
                    (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
                (delete-file* staging-file)
                (let* ((start-time (current-milliseconds))
                       (res (system sync-cmd)))
                  (cond
                   ((eq? 0 res)
                    (delete-file* (conc mtdbfile ".backup"))
                    ;; an empty sync log carries no information; remove it
                    (if (eq? 0 (file-size sync-log))
                        (delete-file sync-log))
                    (system (conc "/bin/mv " staging-file " " mtdbfile))
                    (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec")
                    #t)
                   (else
                    (system (conc "/bin/cp "sync-log" "sync-log".fail"))
                    (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
                    (if (file-exists? (conc mtdbfile ".backup"))
                        (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))))
                  (common:simple-file-release-lock lockfile)))
              ;; else
              (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.")
              ) ;; end if got lockfile

          ;; keep going unless time to exit
          ;;
          (if (not *time-to-exit*)
              (let delay-loop ((count 0))
                ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
                (if (and (not *time-to-exit*)
                         (< count 6)) ;; at most 6 one-second sleeps (an older note here said "was 11, changing to 4")
                    (begin
                      (thread-sleep! 1)
                      (delay-loop (+ count 1))))
                (if (not *time-to-exit*) (loop))))
          ;; time to exit, close the no-sync db here
          ;; (db:no-sync-close-db no-sync-db)
          (if (common:low-noise-print 30)
              (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)
                                ))))))

;;   (let ((legacy-sync  (common:run-sync?)))
;;         (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
;; 	(debug-mode   (debug:debug-mode 1))
;; 	(last-time    (current-seconds))
;; 	(no-sync-db   (db:open-no-sync-db))
;;         (sync-duration 0) ;; run time of the sync in milliseconds
;;         ;;(this-wd-num  (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))
;;         )
;;     (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
;;     (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
;;     (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)  );;  " this-wd-num="this-wd-num)
;;     (if (and legacy-sync (not *time-to-exit*))
;; 	(let* (;;(dbstruct (db:setup))
;; 	       (mtdb       (dbr:dbstruct-mtdb dbstruct))
;; 	       (mtpath     (db:dbdat-get-path mtdb))
;; 	       (tmp-area   (common:get-db-tmp-area))
;; 	       (lockfile   (conc tmp-area "/megatest.db.lock"))
;; 	       (start-file (conc tmp-area "/.start-sync"))

;; 	       (end-file   (conc tmp-area "/.end-sync")))
;; 	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")



;; 	  (let loop ()
;; 	    ;; sync for filesystem local db writes


;; 	    ;;
;; 	    
;; 	    (mutex-lock! *db-multi-sync-mutex*)
;; 		  (let* ((need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
;; 			 (sync-in-progress *db-sync-in-progress*)
;; 			 
;; 			 (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5))
;; 			 (should-sync      (and (not *time-to-exit*)
;; 						(> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed
;; 			 (start-time       (current-seconds))
;; 			 (cpu-load-adj     (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)))
;; 			 (mt-mod-time      (file-modification-time mtpath))
;; 			 (last-sync-start  (if (common:file-exists? start-file)
;; 					       (file-modification-time start-file)
;; 					       0))
;; 			 (last-sync-end    (if (common:file-exists? end-file)
;; 					       (file-modification-time end-file)
;; 					       10))
;; 			 (sync-period      (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period
;; 			 (recently-synced  (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db!
;; 						(< mt-mod-time last-sync-start)))
;; 			 (sync-done        (<= last-sync-start last-sync-end))

;; 			 (sync-stale       (> start-time (+ last-sync-start sync-stale-seconds)))
;; 			 (will-sync-pre        (and (not *time-to-exit*)       ;; do not start a sync if we are in the process of exiting
;; 						    have-lock?
;; 						    (or need-sync should-sync)
;; 						    (or sync-done sync-stale)

;; 						    (not sync-in-progress)
;; 						    (not recently-synced)))
;; 			 (will-sync        (if will-sync-pre
;; 					       ;; delay get lock until we decide to sync




;; 					       #t ;; (server:have-sync-lock?) 
;; 					       #f)))

;; 		    ;; if another server is syncing, postpone sync
;; 		    (if (and will-sync-pre (not will-sync))
;; 			(set! *db-last-sync* start-time))
;; 		    (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop.  need-sync="need-sync" sync-in-progress=" sync-in-progress
;; 				      " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync
;; 				      " sync-done=" sync-done " sync-period=" sync-period)
;; 		    (if (and (> sync-period 5)
;; 			     (common:low-noise-print 30 "sync-period"))
;; 			(debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds."))
;; 		    ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced))
;; 		    ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)


;; 		    (if will-sync (set! *db-sync-in-progress* #t))
;; 		    (mutex-unlock! *db-multi-sync-mutex*)
;; 		    (if will-sync
;; 			(let (;; (max-sync-duration  (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK!
;; 			      (sync-start         (current-milliseconds)))
;; 			  (with-output-to-file start-file (lambda ()(print (current-process-id))))
;; 			  
;; 			  ;; put lock here
;; 			  
;; 			  ;; (if (or (not max-sync-duration)
;; 			  ;;        (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally
;; 			  (let ((res        (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
;; 			    (set! sync-duration (- (current-milliseconds) sync-start))
;; 			    (if (> res 0) ;; some records were transferred, keep the db alive
;; 				(begin
;; 				  (mutex-lock! *heartbeat-mutex*)
;; 				  (set! *db-last-access* (current-seconds))
;; 				  (mutex-unlock! *heartbeat-mutex*)
;; 				  (debug:print-info 0 *default-log-port* "sync called, " res " records transferred."))
;; 				(debug:print-info 2 *default-log-port* "sync called but zero records transferred")))))
;; 		    ;;                         ;; TODO: factor this next routine out into a function
;; 		    ;;                         (with-input-from-pipe ;; this should not block other threads but need to verify this
;; 		    ;;                          (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*)
;; 		    ;;                          (lambda ()
;; 		    ;;                            (let loop ((inl (read-line))
;; 		    ;;                                       (res #f))
;; 		    ;;                              (if (eof-object? inl)
;; 		    ;;                                  (begin
;; 		    ;;                                    (set! sync-duration (- (current-milliseconds) sync-start))
;; 		    ;;                                    (cond
;; 		    ;;                                     ((not res)

;; 		    ;;                                      (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\""))
;; 		    ;;                                     ((> res 0)
;; 		    ;;                                      (mutex-lock! *heartbeat-mutex*)
;; 		    ;;                                      (set! *db-last-access* (current-seconds))
;; 		    ;;                                      (mutex-unlock! *heartbeat-mutex*))))
;; 		    ;;                                  (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl)))
;; 		    ;;                                                      (if matches
;; 		    ;;                                                          (string->number (cadr matches))
;; 		    ;;                                                          #f))))
;; 		    ;;                                    (loop (read-line)
;; 		    ;;                                          (or num-synced res))))))))))
;; 		    (if will-sync
;; 			(begin
;; 			  (mutex-lock! *db-multi-sync-mutex*)
;; 			  (set! *db-sync-in-progress* #f)
;; 			  (set! *db-last-sync* start-time)
;; 			  (with-output-to-file end-file (lambda ()(print (current-process-id))))
;; 
;; 			  ;; release lock here
;; 			  ;; (server:release-sync-lock)
;; 			  (mutex-unlock! *db-multi-sync-mutex*)))
;; 		    (if (and debug-mode
;; 			     (> (- start-time last-time) 60))
;; 			(begin
;; 			  (set! last-time start-time)
;; 			  (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
;; 		  
;; 		  ;; keep going unless time to exit
;; 		  ;;
;; 		  (if (not *time-to-exit*)
;; 		      (let delay-loop ((count 0))
;; 			;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
;; 			
;; 			(if (and (not *time-to-exit*)
;; 				 (< count 6)) ;; was 11, changing to 4. 
;; 			    (begin
;; 			      (thread-sleep! 1)
;; 			      (delay-loop (+ count 1))))
;; 			(if (not *time-to-exit*) (loop))))
;; 		  ;; time to exit, close the no-sync db here

;; 		  (db:no-sync-close-db no-sync-db)
;; 		  (if (common:low-noise-print 30)
;; 		      (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))










|
|
<
|





|
|
>
>
>
>
|
<
<
|
<
>
>
>
>
|
>
>
>
>
>
|
|
>
>
>
>
>
>
>
|
|
|
|
|
|
>
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
|
>
|
<
<
<
<
|
<
<
|
>
|
|
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
|
<
<
|
|
|
<
<
|
>
|
|
>
>
>
|
|
>
>
|
|
<
<
<
<
<
<
<
<
<
|
|
|
|
|
|
<
<
<
<
<
>
|
<
<
<
<
>
|
<
<
|
>
>
>
>
|
|
>
|
<
<
|
<
<
<
<
<
<
<
>
>
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
>
|
<
<
<
<
<
<
<
<
|
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
<
<
<
<
<
<
<
<
<
<
|
>
|
|
|
>
>

498
499
500
501
502
503
504
505
506

507
508
509
510
511
512
513
514
515
516
517
518
519


520

521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552











553
554
555




556


557
558
559
560






561









562


563
564
565


566
567
568
569
570
571
572
573
574
575
576
577
578









579
580
581
582
583
584





585
586




587
588


589
590
591
592
593
594
595
596
597


598







599
600
601





















602








603
604








605

606















607
608










609
610
611
612
613
614
615
616
;;        (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
;;       (server:release-sync-lock)
;;       (server:have-sync-lock?))
;;      (else #f))))

;; moving this here as it needs access to db and cannot be in common.
;;

;; Build and return a thunk ("do-a-sync") that performs one brute force sync:
;; the megatest.db in the tmp area is dumped with sqlite3 into a staging file
;; (*toppath*/.megatest.db) which is then moved over *toppath*/megatest.db.
;;
;; Keys:
;;   fork-to-background - when true the dump command is wrapped in nbfake and
;;                        the move of the staging file is appended to that
;;                        command line.
;;   persist-until-sync - when true the thunk retries once per second until it
;;                        obtains the sync lockfile instead of giving up.
;; When neither key is set the thunk sleeps (while holding the lockfile) for
;; the larger of off-time and server.minimum-intersync-delay before syncing.
;;
;; Config items read from the [server] section:
;;   minimum-intersync-delay (default 2), sync-duty-cycle (default 0.1),
;;   disable-db-snapshot.
;;
;; The returned thunk yields:
;;   'sync-completed            - the sync command exited with status 0
;;   #f                         - the sync command failed (log copied to <sync-log>.fail)
;;   'parallel-sync-in-progress - lockfile was held elsewhere and persist-until-sync is off
;;
;; dbstruct is accepted for interface compatibility; it is not referenced here.
;;
(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f))
  (let* ((sqlite-exe   (or (get-environment-variable "MT_SQLITE3_EXE") ;; defined in cfg.sh
                           "sqlite3"))                                ;; fall back to whatever is on PATH
         (sync-log     (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))
         (tmp-area     (common:get-db-tmp-area))
         (tmp-db       (conc tmp-area "/megatest.db"))
         (staging-file (conc *toppath* "/.megatest.db"))
         (mtdbfile     (conc *toppath* "/megatest.db"))
         (lockfile     (common:get-sync-lock-filepath))
         ;; NB: use "> file 2>&1" rather than the bash-only "&>". Under a POSIX /bin/sh
         ;; "cmd &>file" means "run cmd in the background, then truncate file", which
         ;; would make the sync appear to succeed instantly.
         (sync-cmd-core (conc sqlite-exe " " tmp-db " .dump | " sqlite-exe " " staging-file " > " sync-log " 2>&1"))
         (sync-cmd     (if fork-to-background
                           (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"")
                           sync-cmd-core))
         (default-min-intersync-delay 2)
         (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay))
         (default-duty-cycle 0.1)
         (duty-cycle   (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle))
         ;; initial guess; replaced by the measured duration after each successful sync
         (last-sync-seconds 10)
         ;; off time needed so that work-duration / (work-duration + off time) = duty-cycle
         (calculate-off-time (lambda (work-duration duty-cycle)
                               (* (/ (- 1 duty-cycle) duty-cycle) work-duration)))
         (off-time min-intersync-delay) ;; adjusted in closure below.
         (do-a-sync
          (lambda ()
            (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync)
            (let* ((finalres
                    (let retry-loop ((num-tries 0))
                      (if (common:simple-file-lock lockfile)
                          (begin
                            ;; throttle: the periodic syncer sleeps here, with the lock held
                            (cond
                             ((not (or fork-to-background persist-until-sync))
                              (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay
                                           " , off-time="off-time" seconds ]")
                              (thread-sleep! (max off-time min-intersync-delay)))
                             (else
                              (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit...")))

                            (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
                                (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
                            (delete-file* staging-file)
                            (let* ((start-time (current-milliseconds))
                                   (res (system sync-cmd))
                                   (res2
                                    (cond
                                     ((eq? 0 res)
                                      (delete-file* (conc mtdbfile ".backup"))
                                      ;; an empty log carries no information; the log may not
                                      ;; exist at all, so check before asking for its size
                                      (if (and (file-exists? sync-log)
                                               (eq? 0 (file-size sync-log)))
                                          (delete-file sync-log))
                                      ;; NOTE(review): with fork-to-background the command line given to
                                      ;; nbfake already contains the mv, and nbfake presumably returns
                                      ;; before the dump finishes - confirm that moving the staging file
                                      ;; here (and releasing the lock below) is intended in that mode.
                                      (system (conc "/bin/mv " staging-file " " mtdbfile))

                                      (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
                                      (set! off-time (calculate-off-time
                                                      last-sync-seconds
                                                      (cond
                                                       ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
                                                        duty-cycle)
                                                       (else
                                                        (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid.  Should be a number between 0 and 1, but "duty-cycle" was specified.  Using default value: "default-duty-cycle)
                                                        default-duty-cycle))))

                                      (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec")
                                      (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time)
                                      'sync-completed)
                                     (else
                                      ;; keep the log of the failed attempt and put back a backup db if one exists
                                      (system (conc "/bin/cp "sync-log" "sync-log".fail"))
                                      (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
                                      (if (file-exists? (conc mtdbfile ".backup"))
                                          (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))
                                      #f))))
                              (common:simple-file-release-lock lockfile)
                              (BB> "released lockfile: " lockfile)
                              (when (common:file-exists? lockfile)
                                (BB> "DID NOT ACTUALLY RELEASE LOCKFILE"))
                              res2)) ;; end begin
                          ;; else: did not get the lockfile
                          (cond
                           (persist-until-sync
                            (thread-sleep! 1)
                            (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed.  Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)")
                            (retry-loop (add1 num-tries)))
                           (else
                            (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay)))
                            (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.")
                            'parallel-sync-in-progress))))))
              (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres)
              finalres))))
    do-a-sync))



;; Watchdog for a server whose megatest.db is writable.
;;
;; After a 10 second startup delay two syncers are obtained from
;; server:get-bruteforce-syncer: a plain one used repeatedly while the server
;; is alive and one created with fork-to-background: #t persist-until-sync: #t
;; that is run once when *time-to-exit* becomes true.
;;
;; Nothing is synced unless the process was started with "-server" and
;; without "-sync-to-megatest.db".
;;
(define (server:writable-watchdog dbstruct)
  (thread-sleep! 10) ;; let the server finish starting up
  (let* ((periodic-sync (server:get-bruteforce-syncer dbstruct))
         (exit-sync     (server:get-bruteforce-syncer dbstruct fork-to-background: #t persist-until-sync: #t)))
    (unless (or (args:get-arg "-sync-to-megatest.db") ;; conditions under which we do not run the sync
                (not (args:get-arg "-server")))
      ;; always sync at least once, then repeat until asked to exit
      (let sync-again ()
        (periodic-sync)
        (unless *time-to-exit*
          (sync-again)))
      ;; time to exit: run the final sync before leaving
      (exit-sync)
      (when (common:low-noise-print 30)
        (debug:print-info 0 *default-log-port*
                          "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id))))))


Modified tests.scm from [c7490d0df8] to [3fce4840b2].

483
484
485
486
487
488
489



490
491
492
493
494
495
496
497
			   tol      ","
			   units    ","
			   dcomment ",," ;; extra comma for status
			   type     )))
	    ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment.
	    (rmt:csv->test-data run-id test-id
				dat)



            (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server.
            )))
      
    ;; need to update the top test record if PASS or FAIL and this is a subtest
    ;;;;;; (if (not (equal? item-path ""))
    ;;;;;;     (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;)

    (if (or (and (string? comment)







>
>
>
|







483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
			   tol      ","
			   units    ","
			   dcomment ",," ;; extra comma for status
			   type     )))
	    ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment.
	    (rmt:csv->test-data run-id test-id
				dat)
	    ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start"
	    ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue.
	    (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :)
            ;; BB - commenting out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit in case rmt needs time to start a server.
            )))
      
    ;; need to update the top test record if PASS or FAIL and this is a subtest
    ;;;;;; (if (not (equal? item-path ""))
    ;;;;;;     (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;)

    (if (or (and (string? comment)