Overview
Comment: | Fixed param order issue that was causing some server crashes. Cleaned up some exception handing |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
3a0e887fa84a31cc2684fedafb292e38 |
User & Date: | matt on 2014-11-18 00:15:30 |
Other Links: | branch diff | manifest | tags |
Context
2014-11-18
| ||
09:32 | Force start on servers before doing write heavy stuff. The transition when doing writes from direct access -> server needs improving. This is a work-around (but a good feature anyway. check-in: c101fd0ec9 user: mrwellan tags: v1.60 | |
00:15 | Fixed param order issue that was causing some server crashes. Cleaned up some exception handing check-in: 3a0e887fa8 user: matt tags: v1.60 | |
2014-11-17
| ||
20:32 | Adding back some tests check-in: faa19d1d5f user: matt tags: v1.60 | |
Changes
Modified common_records.scm from [785b03b237] to [08c9f6257d].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | ;;====================================================================== ;; Copyright 2006-2012, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== ;; (use trace) (define (debug:calc-verbosity vstr) (cond ((number? vstr) vstr) ((not (string? vstr)) 1) ;; ((string-match "^\\s*$" vstr) 1) (vstr (let ((debugvals (filter number? (map string->number (string-split vstr ","))))) (cond | > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | ;;====================================================================== ;; Copyright 2006-2012, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== ;; (use trace) ;; Some of these routines use: ;; ;; http://www.cs.toronto.edu/~gfb/scheme/simple-macros.html ;; ;; Syntax for defining macros in a simple style similar to function definiton, ;; when there is a single pattern for the argument list and there are no keywords. ;; ;; (define-simple-syntax (name arg ...) body ...) ;; (define-syntax define-simple-syntax (syntax-rules () ((_ (name arg ...) body ...) (define-syntax name (syntax-rules () ((name arg ...) (begin body ...))))))) (define-syntax common:handle-exceptions (syntax-rules () ((_ exn-in errstmt ...)(handle-exceptions exn-in errstmt ...)))) (define (debug:calc-verbosity vstr) (cond ((number? vstr) vstr) ((not (string? vstr)) 1) ;; ((string-match "^\\s*$" vstr) 1) (vstr (let ((debugvals (filter number? (map string->number (string-split vstr ","))))) (cond |
︙ | ︙ |
Modified db.scm from [f79d03493a] to [d047d1e2d5].
︙ | ︙ | |||
30 31 32 33 34 35 36 37 38 39 40 41 42 43 | (include "key_records.scm") (include "run_records.scm") (define *rundb-mutex* (make-mutex)) ;; prevent problems opening/closing rundb's (define *number-of-writes* 0) (define *number-non-write-queries* 0) ;; Get/open a database ;; if run-id => get run specific db ;; if #f => get main db ;; if db already open - return inmem ;; if db not open, open inmem, rundb and sync then return inmem ;; inuse gets set automatically for rundb's ;; | > > > > > > > > > > > > > > > > > > | 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | (include "key_records.scm") (include "run_records.scm") (define *rundb-mutex* (make-mutex)) ;; prevent problems opening/closing rundb's (define *number-of-writes* 0) (define *number-non-write-queries* 0) ;;====================================================================== ;; SQLITE3 HELPERS ;;====================================================================== ;; convert to -inline (define (db:first-result-default db stmt default . params) (handle-exceptions exn (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default (begin (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) default))) (apply sqlite3:first-result db stmt params))) ;; Get/open a database ;; if run-id => get run specific db ;; if #f => get main db ;; if db already open - return inmem ;; if db not open, open inmem, rundb and sync then return inmem ;; inuse gets set automatically for rundb's ;; |
︙ | ︙ | |||
427 428 429 430 431 432 433 434 435 436 437 438 439 440 | ;; (define (db:sync-tables tbls fromdb todb . slave-dbs) (mutex-lock! *db-sync-mutex*) (handle-exceptions exn (begin (debug:print 0 "EXCEPTION: database probably overloaded or unreadable in db:sync-tables.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) | > | > | | 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | ;; (define (db:sync-tables tbls fromdb todb . slave-dbs) (mutex-lock! *db-sync-mutex*) (handle-exceptions exn (begin (debug:print 0 "EXCEPTION: database probably overloaded or unreadable in db:sync-tables.") (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) (if *server-run* ;; we are inside a server (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. (exit 1))) (cond ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1) ((not todb) (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2) ((not (sqlite3:database? (db:dbdat-get-db fromdb))) (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3) ((not (sqlite3:database? (db:dbdat-get-db todb))) (debug:print 0 "ERROR: db:sync-tables called with todb not a database " todb) -4) |
︙ | ︙ | |||
1861 1862 1863 1864 1865 1866 1867 | (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (db) | > > | > | 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 | (define (db:test-get-top-process-pid dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (db) (db:first-result-default db "SELECT attemptnum FROM tests WHERE id=?;" #f test-id)))) (define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time" "host" "cpuload" "diskfree" "uname" "rundir" "item_path" "run_duration" "final_logf" "comment" "shortdir" "attemptnum")) ;; fields *must* be a non-empty list |
︙ | ︙ | |||
2658 2659 2660 2661 2662 2663 2664 | ;; if the test is not found then clearly the waiton is not met... ;; (if (not ever-seen)(set! result (cons waitontest-name result))))) (if (not ever-seen) (set! result (append (if (null? tests)(list waitontest-name) tests) result))))) waitons) (delete-duplicates result)))) | < < < < < < < < < < < < < < < < < < | 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 | ;; if the test is not found then clearly the waiton is not met... ;; (if (not ever-seen)(set! result (cons waitontest-name result))))) (if (not ever-seen) (set! result (append (if (null? tests)(list waitontest-name) tests) result))))) waitons) (delete-duplicates result)))) ;;====================================================================== ;; Extract ods file from the db ;;====================================================================== ;; NOT REWRITTEN YET!!!!! ;; runspatt is a comma delimited list of run patterns |
︙ | ︙ |
Modified http-transport.scm from [0ec84bc71c] to [068fb6dcc3].
︙ | ︙ | |||
461 462 463 464 465 466 467 | ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) | > | > | 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 | ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state)) (http-transport:server-shutdown server-id port))))) (define (http-transport:server-shutdown server-id port) (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) ;; ;; start_shutdown ;; |
︙ | ︙ | |||
489 490 491 492 493 494 495 | (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") | | | 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 | (if (eq? *number-non-write-queries* 0) "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") (exit)) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) (let* ((tdbdat (tasks:open-db))) |
︙ | ︙ |
Modified launch.scm from [3011b582fa] to [e538ecba1a].
︙ | ︙ | |||
62 63 64 65 66 67 68 | (top-path (assoc/default 'toppath cmdinfo)) (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) (transport (assoc/default 'transport cmdinfo)) | | | 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | (top-path (assoc/default 'toppath cmdinfo)) (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) (transport (assoc/default 'transport cmdinfo)) ;; (serverinf (assoc/default 'serverinf cmdinfo)) (port (assoc/default 'port cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar |
︙ | ︙ | |||
785 786 787 788 789 790 791 | (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) (list 'transport (conc *transport-type*)) | | | 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 | (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) (list 'test-id test-id ) ;; (list 'item-path item-path ) |
︙ | ︙ |
Modified tasks.scm from [f329b3e32b] to [d12c4149c9].
︙ | ︙ | |||
26 27 28 29 30 31 32 | ;; (define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f)) (if (not (string? path)) (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)") (let ((fullpath (conc path "-journal"))) (handle-exceptions exn | > > > > > | | 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | ;; (define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f)) (if (not (string? path)) (debug:print 0 "ERROR: Called tasks:wait-on-journal with path=" path " (not a string)") (let ((fullpath (conc path "-journal"))) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " exn=" (condition->list exn)) (debug:print 0 "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain") #t) ;; if stuff goes wrong just allow it to move on (let loop ((journal-exists (file-exists? fullpath)) (count n)) ;; wait ten times ... (if journal-exists (begin (if (and waiting-msg (eq? (modulo n 30) 0)) (debug:print 0 waiting-msg)) |
︙ | ︙ | |||
307 308 309 310 311 312 313 314 315 316 317 318 319 320 | (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (debug:print 0 "WARNING: tasks:get-server db access error.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " for run " run-id) (print-call-chain (current-error-port)) (if (> retries 0) (begin (debug:print 0 " trying call to tasks:get-server again in 10 seconds") | > | 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 | (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 "WARNING: tasks:get-server db access error.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 " for run " run-id) (print-call-chain (current-error-port)) (if (> retries 0) (begin (debug:print 0 " trying call to tasks:get-server again in 10 seconds") |
︙ | ︙ | |||
373 374 375 376 377 378 379 | (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) | < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 379 380 381 382 383 384 385 386 387 388 389 390 391 392 | (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;;====================================================================== ;; Tasks and Task monitors ;;====================================================================== ;;====================================================================== |
︙ | ︙ |
Modified tdb.scm from [13705ac22c] to [575d5c7ba8].
︙ | ︙ | |||
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | (dbexists (file-exists? dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) 136000)))) (handle-exceptions exn (begin (debug:print 2 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" ((condition-property-accessor 'exn 'message) exn)) (set! db (sqlite3:open-database ":memory:")) ;; open an in-memory db to allow readonly access (set! dbexists #f)) ;; must force re-creation of tables, more tom-foolery (set! db (sqlite3:open-database dbpath))) (if *db-write-access* (sqlite3:set-busy-handler! db handler)) (if (not dbexists) (begin (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;") (debug:print-info 11 "Initialized test database " dbpath) (tdb:testdb-initialize db))) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table ;; and raise an exception ? (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;")) | > > | 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | (dbexists (file-exists? dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) 136000)))) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 2 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" ((condition-property-accessor 'exn 'message) exn)) (set! db (sqlite3:open-database ":memory:")) ;; open an in-memory db to allow readonly access (set! dbexists #f)) ;; must force re-creation of tables, more tom-foolery (set! db (sqlite3:open-database dbpath))) (if *db-write-access* (sqlite3:set-busy-handler! db handler)) (if (not dbexists) (begin (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;") (debug:print-info 11 "Initialized test database " dbpath) (tdb:testdb-initialize db))) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table ;; and raise an exception ? (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;")) |
︙ | ︙ |
Modified tests.scm from [3b87b51253] to [2a580a2e0e].
︙ | ︙ | |||
645 646 647 648 649 650 651 | (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) | < < < < < < < < < < < < < < < < < < < < < > > | 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 | (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) (define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (remtries 10)) (handle-exceptions exn (if (> remtries 0) (begin (print-call-chain (current-error-port)) (debug:print-info 0 "WARNING: failed to set meta info. Will try " remtries " more times") (set! remtries (- remtries 1)) (thread-sleep! 10) (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (debug:print 0 "ERROR: tried for over a minute to update meta info and failed. Giving up") (debug:print 0 "EXCEPTION: database probably overloaded or unreadable.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) |
︙ | ︙ |