Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -126,10 +126,11 @@ (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) ;; client:signal-handler (define (client:signal-handler signum) + (signal-mask! signum) (handle-exceptions exn (debug:print " ... exiting ...") (let ((th1 (make-thread (lambda () "") ;; do nothing for now (was flush out last call if applicable) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -156,11 +156,17 @@ (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")) (define (common:read-encoded-string instr) (handle-exceptions exn - (read (open-input-string (base64:base64-decode instr))) + (handle-exceptions + exn + (begin + (debug:print 0 "ERROR: received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain) + #f) + (read (open-input-string (base64:base64-decode instr)))) (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== @@ -550,10 +556,21 @@ (seconds->local-time sec) "%yww%V.%w")) (define (seconds->year-work-week/day-time sec) (time->string (seconds->local-time sec) "%yww%V.%w %H:%M")) + +(define (seconds->quarter sec) + (case (string->number + (time->string + (seconds->local-time sec) + "%m")) + ((1 2 3) 1) + ((4 5 6) 2) + ((7 8 9) 3) + ((10 11 12) 4) + (else #f))) ;;====================================================================== ;; Colors ;;====================================================================== Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -561,11 
+561,11 @@ request-update)) (newtestdat (if need-update ;; NOTE: BUG HIDER, try to eliminate this exception handler (handle-exceptions exn - (debug:print-info 0 "test db access issue: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print-info 0 "test db access issue in examine test for run-id " run-id ", test-id " test-id ": " ((condition-property-accessor 'exn 'message) exn)) (db:get-test-info-by-id dbstruct run-id test-id ))))) ;; (debug:print-info 0 "need-update= " need-update " curr-mod-time = " curr-mod-time) (cond ((and need-update newtestdat) (set! testdat newtestdat) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1458,11 +1458,11 @@ (define (dashboard:get-youngest-run-db-mod-time) (handle-exceptions exn (begin - (debug:print 0 "WARNING: error in accessing databases: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn)) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (apply max (map (lambda (filen) (file-modification-time filen)) (glob (conc *dbdir* "/*.db")))))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -296,11 +296,16 @@ (thread-sleep! 3) (if (and rundb (sqlite3:database? rundb)) (handle-exceptions exn - #t ;; (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db") + (begin + (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 " db: " rundb) + (print-call-chain) + #f) (sqlite3:interrupt! rundb) (sqlite3:finalize! rundb #t)))) ;; (mutex-unlock! 
*db-sync-mutex*) ) @@ -2437,11 +2442,11 @@ (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default (begin - (debug:print 0 "ERROR: query " stmt " failed " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain) default))) (apply sqlite3:first-result db stmt params))) ;;====================================================================== Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -1,1272 +1,1284 @@ - - - - - -The Megatest Users Manual - - - - - -
-
-

Preface

-
-

This book is organised as three sub-books; getting started, writing tests and reference.

-
-

Why Megatest?

-

The Megatest project was started for two reasons, the first was an -immediate and pressing need for a generalized tool to manage a suite -of regression tests and the second was the fact that the author had -written or maintained several such tools at different companies over -the years and it seemed a good thing to have a single open source -tool, flexible enough to meet the needs of any team doing continuous -integrating and or running a complex suite of tests for release -qualification.

-
-
-

Megatest Design Philosophy

-

Megatest is intended to provide the minimum needed resources to make -writing a suite of tests and tasks for implementing continuous build -for software, design engineering or process control (via owlfs for -example) without being specialized for any specific problem -space. Megatest in of itself does not know what constitutes a PASS or -FAIL of a test. In most cases megatest is best used in conjunction -with logpro or a similar tool to parse, analyze and decide on the test -outcome.

-
-
-

Megatest Architecture

-

All data to specify the tests and configure the system is stored in -plain text files. All system state is stored in an sqlite3 -database. Tests are launched using the launching system available for -the distributed compute platform in use. A template script is provided -which can launch jobs on local and remote Linux hosts. Currently -megatest uses the network filesystem to call home to your master -sqlite3 database.

-
-
-
-

Road Map

-

Note 1: This road-map is tentative and subject to change without notice.

-

Note 2: Starting over. Old plan is commented out.

-
-

Current Items

-
-
-

ww05 - migrate to inmem-db

-

Keep as much the same as possible. Add internal reference to almost -eliminate contention on db(s).

-
    -
  1. -

    -Add internal reference db -

    -
  2. -
  3. -

    -Verify that actions are accessing correct db -

    -
      -
    1. -

      --runtests - inmem -

      -
    2. -
    3. -

      --list-runs - local (but not megatest.db) -

      -
    4. -
    5. -

      -dashboard - local (but not megatest.db) -

      -
    6. -
    -
  4. -
  5. -

    -Mirror db to /var/tmp… -

    -
  6. -
  7. -

    -Dashboard read db from per-run db. -

    -
  8. -
  9. -

    -Dashboard read db from /var/tmp -

    -
  10. -
  11. -

    -Runs register in tasks table in monitor.db -

    -
  12. -
  13. -

    -Server polls tasks table for next action (in addition?) -

    -
  14. -
  15. -

    -Change run loop to execute in server, triggered by call to polling of tasks table -

    -
  16. -
-
-
-
-

Getting Started

-
-
Getting started with Megatest
-
-

How to install Megatest and set it up for running your regressions and continuous integration process.

-
-
-

Installation

-
-
-

Dependencies

-

Chicken scheme and a number of "eggs" are required for building -Megatest. See the script installall.sch in the utils directory of the -distribution for a mostly automated way to install everything needed -for building Megatest on Linux.

-


[An example footnote.]

-

And now for something completely different: monkeys, lions and -tigers (Bengal and Siberian) using the alternative syntax index -entries. - - - -Note that multi-entry terms generate separate index entries.

-

Here are a couple of image examples: an -images/smallnew.png - -example inline image followed by an example block image:

-
-
-Tiger image -
-
Figure 1. Tiger block image
-
-

Followed by an example table:

-
- - --- - - - - - - - - - - - - - - - -
Table 1. An example table
Option Description

-a USER GROUP

Add USER to GROUP.

-R GROUP

Disables access to GROUP.

-
-
-
Example 1. An example example
-
-

Lorum ipum…

-
-
-
-

Sub-section with Anchor

-

Sub-section at level 2.

-
-

Chapter Sub-section

-

Sub-section at level 3.

-
-
Chapter Sub-section
-

Sub-section at level 4.

-

This is the maximum sub-section depth supported by the distributed -AsciiDoc configuration. -
[A second example footnote.]

-
-
-
-
-
-
-

The Second Chapter

-
-

An example link to anchor at start of the first sub-section.

-

An example link to a bibliography entry [taoup].

-
-
-

Writing Tests

-
-

The First Chapter of the Second Part

-
-

Chapters grouped into book parts are at level 1 and can contain -sub-sections.

-
-
-

How To Do Things

-
-

Tricks

-
-

This section is a compendium of a various useful tricks for debugging, -configuring and generally getting the most out of Megatest.

-
-
-
-

Limiting your running jobs

-
-

The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously.

-

In your testconfig:

-
-
-
[test_meta]
-jobgroup group1
-
-

In your megatest.config:

-
-
-
[jobgroups]
-group1 10
-custdes 4
-
-
-
-
-

Debugging Tricks

-
-
-

Examining The Environment

-
-

During Config File Processing

-
-
-

Organising Your Tests and Tasks

-
-
-
[tests-paths]
-1 #{get misc parent}/simplerun/tests
-
-
-
-
[setup]
-
-

The runscript method is a brute force way to run scripts where the -user is responsible for setting STATE and STATUS

-
-
-
runscript main.csh
-
-
-
-
-

Debugging Server Problems

-
-
-
sudo lsof -i
-sudo netstat -lptu
-sudo netstat -tulpn
-
-
-
-
-

Reference

-
-

The First Chapter of the Second Part

-
-

Chapters grouped into book parts are at level 1 and can contain -sub-sections.

-
-
-
-

The testconfig File

-
-
-

Setup section

-
-

Header

-
-
-
[setup]
-
-

The runscript method is a brute force way to run scripts where the -user is responsible for setting STATE and STATUS

-
-
-
runscript main.csh
-
-
-
-
-

Requirements section

-
-

Header

-
-
-
[requirements]
-
-
-
-

Wait on Other Tests

-
-
-
# A normal waiton waits for the prior tests to be COMPLETED
-# and PASS, CHECK or WAIVED
-waiton test1 test2
-
-
-
-

Mode

-

The default (i.e. if mode is not specified) is normal. All pre-dependent tests -must be COMPLETED and PASS, CHECK or WAIVED before the test will start

-
-
-
mode   normal
-
-

The toplevel mode requires only that the prior tests are COMPLETED.

-
-
-
mode toplevel
-
-

A item based waiton will start items in a test when the -same-named item is COMPLETED and PASS, CHECK or WAIVED -in the prior test

-
-
-
mode itemmatch
-
-
-
-
# With a toplevel test you may wish to generate your list
-# of tests to run dynamically
-#
-# waiton #{shell get-valid-tests-to-run.sh}
-
-
-
-

Run time limit

-
-
-
runtimelim 1h 2m 3s  # this will automatically kill the test if it runs for more than 1h 2m and 3s
-
-
-
-

Skip

-
-
-

Header

-
-
-
[skip]
-
-
-
-

Skip on Still-running Tests

-
-
-
# NB// If the prevrunning line exists with *any* value the test will
-# automatically SKIP if the same-named test is currently RUNNING
-
-prevrunning x
-
-
-
-

Skip if a File Exists

-
-
-
fileexists /path/to/a/file # skip if /path/to/a/file exists
-
-
-
-

Controlled waiver propagation

-

If test is FAIL and previous test in run with same MT_TARGET is WAIVED then apply the following rules from the testconfig: -If a waiver check is specified in the testconfig apply the check and if it passes then set this FAIL to WAIVED

-

Waiver check has two parts, 1) a list of waiver, rulename, filepatterns and 2) the rulename script spec (note that "diff" and "logpro" are predefined)

-
-
-
###### EXAMPLE FROM testconfig #########
-# matching file(s) will be diff'd with previous run and logpro applied
-# if PASS or WARN result from logpro then WAIVER state is set
-#
-[waivers]
-# logpro_file    rulename      input_glob
-waiver_1         logpro        lookittmp.log
-
-[waiver_rules]
-
-# This builtin rule is the default if there is no <waivername>.logpro file
-# diff   diff %file1% %file2%
-
-# This builtin rule is applied if a <waivername>.logpro file exists
-# logpro diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html
-
-
-
-
-

Ezsteps

-

To transfer the environment to the next step you can do the following:

-
-
-
$MT_MEGATEST -env2file .ezsteps/${stepname}
-
-
-
-

Triggers

-

In your testconfig triggers can be specified

-
-
-
[triggers]
-
-# Call script running.sh when test goes to state=RUNNING, status=PASS
-RUNNING/PASS running.sh
-
-# Call script running.sh any time state goes to RUNNING
-RUNNING/ running.sh
-
-# Call script onpass.sh any time status goes to PASS
-PASS/ onpass.sh
-
-
-
-

Megatest Internals

-
-
-server.png -
-
-
-
-
-
-

Appendix A: Example Appendix

-
-

One or more optional appendixes go here at section level zero.

-
-

Appendix Sub-section

-
- - - -
-
Note
-
Preface and appendix subsections start out of sequence at level -2 (level 1 is skipped). This only applies to multi-part book -documents.
-
-
-
-
-
-

Example Bibliography

-
-

The bibliography list is a style of AsciiDoc bulleted list.

-
    -
  • -

    -[taoup] Eric Steven Raymond. The Art of Unix - Programming. Addison-Wesley. ISBN 0-13-142901-9. -

    -
  • -
  • -

    -[walsh-muellner] Norman Walsh & Leonard Muellner. - DocBook - The Definitive Guide. O’Reilly & Associates. 1999. - ISBN 1-56592-580-7. -

    -
  • -
-
-
-
-

Example Glossary

-
-

Glossaries are optional. Glossaries entries are an example of a style -of AsciiDoc labeled lists.

-
-
-A glossary term -
-
-

- The corresponding (indented) definition. -

-
-
-A second glossary term -
-
-

- The corresponding (indented) definition. -

-
-
-
-
-
-

Example Colophon

-
-

Text at the end of a book describing facts about its production.

-
-
-
-

Example Index

-
-
-
-
-

- - - + + + + + +The Megatest Users Manual + + + + + +
+
+

Preface

+
+

This book is organised as three sub-books: getting started, writing tests and reference.

+
+

Why Megatest?

+

The Megatest project was started for two reasons: the first was an +immediate and pressing need for a generalized tool to manage a suite +of regression tests, and the second was the fact that the author had +written or maintained several such tools at different companies over +the years and it seemed a good thing to have a single open source +tool, flexible enough to meet the needs of any team doing continuous +integration and/or running a complex suite of tests for release +qualification.

+
+
+

Megatest Design Philosophy

+

Megatest is intended to provide the minimum resources needed to +write a suite of tests and tasks for implementing continuous build +for software, design engineering or process control (via owlfs for +example) without being specialized for any specific problem +space. Megatest in and of itself does not know what constitutes a PASS or +FAIL of a test. In most cases megatest is best used in conjunction +with logpro or a similar tool to parse, analyze and decide on the test +outcome.

+
+
+

Megatest Architecture

+

All data to specify the tests and configure the system is stored in +plain text files. All system state is stored in an sqlite3 +database. Tests are launched using the launching system available for +the distributed compute platform in use. A template script is provided +which can launch jobs on local and remote Linux hosts. Currently +megatest uses the network filesystem to call home to your master +sqlite3 database.

+
+
+
+

Road Map

+

Note 1: This road-map is tentative and subject to change without notice.

+

Note 2: Starting over. Old plan is commented out.

+
+

Current Items

+
+
+

ww05 - migrate to inmem-db

+

Keep as much the same as possible. Add internal reference to almost +eliminate contention on db(s).

+
    +
  1. +

    +Add internal reference db +

    +
  2. +
  3. +

    +Verify that actions are accessing correct db +

    +
      +
    1. +

      +-runtests - inmem +

      +
    2. +
    3. +

      +-list-runs - local (but not megatest.db) +

      +
    4. +
    5. +

      +dashboard - local (but not megatest.db) +

      +
    6. +
    +
  4. +
  5. +

    +Mirror db to /var/tmp… +

    +
  6. +
  7. +

    +Dashboard read db from per-run db. +

    +
  8. +
  9. +

    +Dashboard read db from /var/tmp +

    +
  10. +
  11. +

    +Runs register in tasks table in monitor.db +

    +
  12. +
  13. +

    +Server polls tasks table for next action (in addition?) +

    +
  14. +
  15. +

    +Change run loop to execute in server, triggered by call to polling of tasks table +

    +
  16. +
+
+
+
+

Getting Started

+
+
Getting started with Megatest
+
+

How to install Megatest and set it up for running your regressions and continuous integration process.

+
+
+

Installation

+
+
+

Dependencies

+

Chicken scheme and a number of "eggs" are required for building +Megatest. See the script installall.sch in the utils directory of the +distribution for a mostly automated way to install everything needed +for building Megatest on Linux.

+


[An example footnote.]

+

And now for something completely different: monkeys, lions and +tigers (Bengal and Siberian) using the alternative syntax index +entries. + + + +Note that multi-entry terms generate separate index entries.

+

Here are a couple of image examples: an +images/smallnew.png + +example inline image followed by an example block image:

+
+
+Tiger image +
+
Figure 1. Tiger block image
+
+

Followed by an example table:

+
+ + +++ + + + + + + + + + + + + + + + +
Table 1. An example table
Option Description

-a USER GROUP

Add USER to GROUP.

-R GROUP

Disables access to GROUP.

+
+
+
Example 1. An example example
+
+

Lorem ipsum…

+
+
+
+

Sub-section with Anchor

+

Sub-section at level 2.

+
+

Chapter Sub-section

+

Sub-section at level 3.

+
+
Chapter Sub-section
+

Sub-section at level 4.

+

This is the maximum sub-section depth supported by the distributed +AsciiDoc configuration. +
[A second example footnote.]

+
+
+
+
+
+
+

The Second Chapter

+
+

An example link to anchor at start of the first sub-section.

+

An example link to a bibliography entry [taoup].

+
+
+

Writing Tests

+
+

The First Chapter of the Second Part

+
+

Chapters grouped into book parts are at level 1 and can contain +sub-sections.

+
+
+

How To Do Things

+
+

Tricks

+
+

This section is a compendium of various useful tricks for debugging, +configuring and generally getting the most out of Megatest.

+
+
+
+

Limiting your running jobs

+
+

The following example will limit the tests in the jobgroup "group1" to no more than 10 running simultaneously.

+

In your testconfig:

+
+
+
[test_meta]
+jobgroup group1
+
+

In your megatest.config:

+
+
+
[jobgroups]
+group1 10
+custdes 4
+
+
+
+
+

Debugging Tricks

+
+
+

Examining The Environment

+
+

During Config File Processing

+
+
+

Organising Your Tests and Tasks

+
+
+
[tests-paths]
+1 #{get misc parent}/simplerun/tests
+
+
+
+
[setup]
+
+

The runscript method is a brute force way to run scripts where the +user is responsible for setting STATE and STATUS

+
+
+
runscript main.csh
+
+
+
+
+

Debugging Server Problems

+
+
+
sudo lsof -i
+sudo netstat -lptu
+sudo netstat -tulpn
+
+
+
+
+

Reference

+
+

The First Chapter of the Second Part

+
+

Chapters grouped into book parts are at level 1 and can contain +sub-sections.

+
+
+
+

The testconfig File

+
+
+

Setup section

+
+

Header

+
+
+
[setup]
+
+

The runscript method is a brute force way to run scripts where the +user is responsible for setting STATE and STATUS

+
+
+
runscript main.csh
+
+
+
+
+

Requirements section

+
+

Header

+
+
+
[requirements]
+
+
+
+

Wait on Other Tests

+
+
+
# A normal waiton waits for the prior tests to be COMPLETED
+# and PASS, CHECK or WAIVED
+waiton test1 test2
+
+
+
+

Mode

+

The default (i.e. if mode is not specified) is normal. All pre-dependent tests +must be COMPLETED and PASS, CHECK or WAIVED before the test will start.

+
+
+
mode   normal
+
+

The toplevel mode requires only that the prior tests are COMPLETED.

+
+
+
mode toplevel
+
+

An item-based waiton will start items in a test when the +same-named item is COMPLETED and PASS, CHECK or WAIVED +in the prior test.

+
+
+
mode itemmatch
+
+
+
+
# With a toplevel test you may wish to generate your list
+# of tests to run dynamically
+#
+# waiton #{shell get-valid-tests-to-run.sh}
+
+
+
+

Run time limit

+
+
+
runtimelim 1h 2m 3s  # this will automatically kill the test if it runs for more than 1h 2m and 3s
+
+
+
+

Skip

+
+
+

Header

+
+
+
[skip]
+
+
+
+

Skip on Still-running Tests

+
+
+
# NB// If the prevrunning line exists with *any* value the test will
+# automatically SKIP if the same-named test is currently RUNNING
+
+prevrunning x
+
+
+
+

Skip if a File Exists

+
+
+
fileexists /path/to/a/file # skip if /path/to/a/file exists
+
+
+
+

Controlled waiver propagation

+

If a test is FAIL and the previous test in a run with the same MT_TARGET is WAIVED, then apply the following rule from the testconfig: +if a waiver check is specified in the testconfig, apply the check and, if it passes, set this FAIL to WAIVED.

+

A waiver check has two parts: 1) a list of waiver, rulename, filepatterns and 2) the rulename script spec (note that "diff" and "logpro" are predefined).

+
+
+
###### EXAMPLE FROM testconfig #########
+# matching file(s) will be diff'd with previous run and logpro applied
+# if PASS or WARN result from logpro then WAIVER state is set
+#
+[waivers]
+# logpro_file    rulename      input_glob
+waiver_1         logpro        lookittmp.log
+
+[waiver_rules]
+
+# This builtin rule is the default if there is no <waivername>.logpro file
+# diff   diff %file1% %file2%
+
+# This builtin rule is applied if a <waivername>.logpro file exists
+# logpro diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html
+
+
+
+
+

Ezsteps

+

To transfer the environment to the next step you can do the following:

+
+
+
$MT_MEGATEST -env2file .ezsteps/${stepname}
+
+
+
+

Triggers

+

In your testconfig triggers can be specified

+
+
+
[triggers]
+
+# Call script running.sh when test goes to state=RUNNING, status=PASS
+RUNNING/PASS running.sh
+
+# Call script running.sh any time state goes to RUNNING
+RUNNING/ running.sh
+
+# Call script onpass.sh any time status goes to PASS
+PASS/ onpass.sh
+
+

Scripts called will have test-id, test-rundir and trigger added to the command line.

+

HINT

+

To start an xterm (useful for debugging), use a command line like the following:

+
+
+
[triggers]
+COMPLETED/ xterm -e bash -s --
+
+
+ + + +
+
Note
+
There is a trailing space after the --
+
+
+
+

Megatest Internals

+
+
+server.png +
+
+
+
+
+
+

Appendix A: Example Appendix

+
+

One or more optional appendixes go here at section level zero.

+
+

Appendix Sub-section

+
+ + + +
+
Note
+
Preface and appendix subsections start out of sequence at level +2 (level 1 is skipped). This only applies to multi-part book +documents.
+
+
+
+
+
+

Example Bibliography

+
+

The bibliography list is a style of AsciiDoc bulleted list.

+
    +
  • +

    +[taoup] Eric Steven Raymond. The Art of Unix + Programming. Addison-Wesley. ISBN 0-13-142901-9. +

    +
  • +
  • +

    +[walsh-muellner] Norman Walsh & Leonard Muellner. + DocBook - The Definitive Guide. O’Reilly & Associates. 1999. + ISBN 1-56592-580-7. +

    +
  • +
+
+
+
+

Example Glossary

+
+

Glossaries are optional. Glossary entries are an example of a style +of AsciiDoc labeled lists.

+
+
+A glossary term +
+
+

+ The corresponding (indented) definition. +

+
+
+A second glossary term +
+
+

+ The corresponding (indented) definition. +

+
+
+
+
+
+

Example Colophon

+
+

Text at the end of a book describing facts about its production.

+
+
+
+

Example Index

+
+
+
+
+

+ + + Index: docs/manual/server.png ================================================================== --- docs/manual/server.png +++ docs/manual/server.png cannot compute difference between binary files Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -256,10 +256,11 @@ (close-all-connections!)) (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) (begin (mutex-unlock! *http-mutex*) + (tasks:kill-server-run-id run-id) #f)) (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here (max-retry-attempts 1) @@ -275,10 +276,11 @@ ;; ((exn http client-error) e (print e))) (set! res (handle-exceptions exn (begin (debug:print 0 "ERROR: failure in with-input-from-request. Giving up.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) #f) (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) @@ -536,10 +538,11 @@ (set! *didsomething* #t) (thread-join! th2) (exit))))) (define (http-transport:server-signal-handler signum) + (signal-mask! signum) (handle-exceptions exn (debug:print " ... exiting ...") (let ((th1 (make-thread (lambda () (thread-sleep! 
1)) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -361,15 +361,20 @@ (begin (for-each (lambda (pid) (handle-exceptions exn - (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") + (begin + (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) (debug:print 0 "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") - (process-signal pid signal/int) - (thread-sleep! 5) - (process-signal pid signal/kill))) + (if (process:alive? pid) + (begin + (process-signal pid signal/int) + (thread-sleep! 5) + (if (process:process-alive? pid) + (process-signal pid signal/kill)))))) pids) (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) (begin (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) @@ -452,20 +457,23 @@ (begin (handle-exceptions exn (begin (debug:print 0 "ERROR: Something went wrong when trying to create linktree dir at " linktree) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (exit 1)) (create-directory linktree #t)))) (begin (debug:print 0 "ERROR: linktree not defined in [setup] section of megatest.config") (exit 1))) (if linktree (let ((dbdir (conc linktree "/.db"))) (handle-exceptions exn - (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files") + (begin + (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) (if (not (directory-exists? 
dbdir))(create-directory dbdir))) (setenv "MT_LINKTREE" linktree)) (begin (debug:print 0 "ERROR: linktree is required in your megatest.config [setup] section") (exit 1))) Index: lock-queue.scm ================================================================== --- lock-queue.scm +++ lock-queue.scm @@ -36,34 +36,38 @@ (begin (thread-sleep! 10) (if (> count 0) (lock-queue:open-db fname count: (- count 1)) db)) - (sqlite3:execute - db - "CREATE TABLE IF NOT EXISTS queue ( - id INTEGER PRIMARY KEY, - test_id INTEGER, - start_time INTEGER, - state TEXT, - CONSTRAINT queue_constraint UNIQUE (test_id));") - (sqlite3:execute - db - "CREATE TABLE IF NOT EXISTS runlocks ( - id INTEGER PRIMARY KEY, - test_id INTEGER, - run_lock TEXT, - CONSTRAINT runlock_constraint UNIQUE (run_lock));")))) + (sqlite3:with-transaction + db + (lambda () + (sqlite3:execute + db + "CREATE TABLE IF NOT EXISTS queue ( + id INTEGER PRIMARY KEY, + test_id INTEGER, + start_time INTEGER, + state TEXT, + CONSTRAINT queue_constraint UNIQUE (test_id));") + (sqlite3:execute + db + "CREATE TABLE IF NOT EXISTS runlocks ( + id INTEGER PRIMARY KEY, + test_id INTEGER, + run_lock TEXT, + CONSTRAINT runlock_constraint UNIQUE (run_lock));")))))) (sqlite3:set-busy-handler! db handler) db)) (define (lock-queue:set-state db test-id newstate #!key (remtries 10)) (handle-exceptions exn (if (> remtries 0) (begin (debug:print 0 "WARNING: exception on lock-queue:set-state. Trying again in 30 seconds.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 30) (lock-queue:set-state db test-id newstate remtries: (- remtries 1))) (begin (debug:print 0 "ERROR: Failed to set lock state for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.") #f)) @@ -75,10 +79,11 @@ (handle-exceptions exn (if (> remtries 0) (begin (debug:print 0 "WARNING: exception on lock-queue:any-younger. 
Trying again in 30 seconds.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 30) (lock-queue:any-younger? db mystart test-id remtries: (- remtries 1))) (begin (debug:print 0 "ERROR: Failed to find younger locks for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.") #f)) @@ -98,10 +103,12 @@ (mklckqry (sqlite3:prepare db "INSERT INTO runlocks (test_id,run_lock) VALUES (?,'locked');"))) (let ((result (handle-exceptions exn (begin + (debug:print 0 "WARNING: failed to get queue lock. Will try again in a few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:get-lock db test-id count: (- count 1))) #f) (sqlite3:with-transaction @@ -125,10 +132,12 @@ (define (lock-queue:release-lock fname test-id #!key (count 10)) (let ((db (lock-queue:open-db fname))) (handle-exceptions exn (begin + (debug:print 0 "WARNING: Failed to release queue lock. Will try again in few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:release-lock fname test-id count: (- count 1)) #f)) (sqlite3:execute db "DELETE FROM runlocks WHERE test_id=?;" test-id) @@ -136,10 +145,12 @@ (define (lock-queue:steal-lock db test-id #!key (count 10)) (handle-exceptions exn (begin + (debug:print 0 "WARNING: Failed to steal queue lock. Will try again in few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:steal-lock db test-id count: (- count 1)) #f)) (sqlite3:execute db "DELETE FROM runlocks WHERE run_lock='locked';")) @@ -153,10 +164,12 @@ (let ((db (lock-queue:open-db fname)) (mystart (current-seconds))) (handle-exceptions exn (begin + (debug:print 0 "WARNING: Failed to find out if it is ok to skip the wait queue. 
Will try again in few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:wait-turn fname test-id count: (- count 1)) #f)) (sqlite3:execute Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -8,11 +8,13 @@ ;; PURPOSE. ;; (include "common.scm") ;; (include "megatest-version.scm") -(use sqlite3 srfi-1 posix regex regex-case srfi-69 base64 format readline apropos json http-client directory-utils z3) ;; (srfi 18) extras) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 base64 format readline apropos json + http-client directory-utils z3 srfi-18) ;; extras) + (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) ;; (use zmq) @@ -278,10 +280,44 @@ "-v" ;; verbose 2, more than normal (normal is 1) "-q" ;; quiet 0, errors/warnings only ) args:arg-hash 0)) + +;; The watchdog is to keep an eye on things like db sync etc. +;; +(define *watchdog* + (make-thread + (lambda () + (let loop () + (thread-sleep! 5) ;; five second resolution is only a minor burden and should be tolerable + + ;; sync for filesystem local db writes + ;; + (let ((start-time (current-seconds))) + (mutex-lock! *db-multi-sync-mutex*) + (for-each + (lambda (run-id) + (let ((last-write (hash-table-ref/default *db-local-sync* run-id 0))) + (if ;; (and + (> (- start-time last-write) 5) ;; every five seconds + ;; (common:db-access-allowed?)) + (begin + (db:multi-db-sync (list run-id) 'new2old) + (if (common:low-noise-print 30 "sync new to old") + (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " (- (current-seconds) start-time) " seconds")) + (hash-table-delete! *db-local-sync* run-id))))) + (hash-table-keys *db-local-sync*)) + (mutex-unlock! *db-multi-sync-mutex*)) + + ;; keep going unless time to exit + ;; + (if (not *time-to-exit*) + (loop)))) + "Watchdog thread")) + +(thread-start! 
*watchdog*) (define (std-exit-procedure) (rmt:print-db-stats) (let ((run-ids (hash-table-keys *db-local-sync*))) (if (not (null? run-ids)) @@ -1353,10 +1389,13 @@ (if *runremote* (close-all-connections!)) (if (not *didsomething*) (debug:print 0 help)) +(set! *time-to-exit* #t) +(thread-join! *watchdog*) + (if (not (eq? *globalexitstatus* 0)) (if (or (args:get-arg "-runtests")(args:get-arg "-runall")) (begin (debug:print 0 "NOTE: Subprocesses with non-zero exit code detected: " *globalexitstatus*) (exit 0)) Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -165,21 +165,27 @@ ;; S T A T E A N D S T A T U S F O R T E S T S ;;====================================================================== ;; speed up for common cases with a little logic (define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) - (cond - ((and newstate newstatus newcomment) - (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id)) - ((and newstate newstatus) - (rmt:general-call 'state-status run-id newstate newstatus test-id)) - (else - (if newstate (rmt:general-call 'set-test-state run-id newstate test-id)) - (if newstatus (rmt:general-call 'set-test-status run-id newstatus test-id)) - (if newcomment (rmt:general-call 'set-test-comment run-id newcomment test-id)))) - (mt:process-triggers run-id test-id newstate newstatus) - #t) + (if (not (and run-id test-id)) + (begin + (debug:print 0 "ERROR: bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate) + (print-call-chain) + #f) + (begin + (cond + ((and newstate newstatus newcomment) + (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id)) + ((and newstate newstatus) + (rmt:general-call 'state-status run-id newstate newstatus test-id)) + (else + (if newstate (rmt:general-call 'set-test-state run-id newstate test-id)) + (if newstatus (rmt:general-call 
'set-test-status run-id newstatus test-id)) + (if newcomment (rmt:general-call 'set-test-comment run-id newcomment test-id)))) + (mt:process-triggers run-id test-id newstate newstatus) + #t))) (define (mt:test-set-state-status-by-testname run-id test-name item-path new-state new-status new-comment) (let ((test-id (cdb:remote-run db:get-test-id-cached #f run-id test-name item-path))) (mt:test-set-state-status-by-id test-id new-state new-status new-comment))) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -31,11 +31,11 @@ (sqlite3:set-busy-handler! db handler) (sqlite3:execute db "PRAGMA synchronous = 0;") (if (not exists) (sqlite3:execute db - "CREATE TABLE ports ( + "CREATE TABLE IF NOT EXISTS ports ( port INTEGER PRIMARY KEY, state TEXT DEFAULT 'not-used', fail_count INTEGER DEFAULT 0, update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")) db)) @@ -88,16 +88,25 @@ (sqlite3:finalize! qry2) (sqlite3:finalize! qry3) res)) (define (portlogger:get-prev-used-port db) - (sqlite3:fold-row - (lambda (var curr) - (or curr var curr)) - #f - db - "SELECT (port) FROM ports WHERE state='released' LIMIT 1;")) + (handle-exceptions + exn + (begin + (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. 
If you see this message again remove /tmp/.$USER-portlogger.db") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "exn=" (condition->list exn)) + (print-call-chain) + (debug:print 0 "Continuing anyway.") + #f) + (sqlite3:fold-row + (lambda (var curr) + (or curr var curr)) + #f + db + "SELECT (port) FROM ports WHERE state='released' LIMIT 1;"))) (define (portlogger:find-port db) (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport"))) (if (and val (string->number val)) @@ -104,11 +113,19 @@ (string->number val) 32768))) (portnum (or (portlogger:get-prev-used-port db) (+ lowport ;; top of registered ports is 49152 but lets use ports in the registered range (random (- 64000 lowport)))))) - (portlogger:take-port db portnum) + (handle-exceptions + exn + (begin + (debug:print 0 "EXCEPTION: portlogger database probably overloaded or unreadable. If you see this message again remove /tmp/.$USER-portlogger.db") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "exn=" (condition->list exn)) + (print-call-chain) + (debug:print 0 "Continuing anyway.")) + (portlogger:take-port db portnum)) portnum)) ;; set port to "released", "failed" etc. ;; (define (portlogger:set-port db portnum value) @@ -122,20 +139,30 @@ ;;====================================================================== ;; MAIN ;;====================================================================== (define (portlogger:main . args) - (let* ((db (portlogger:open-db (conc "/tmp/." (current-user-name) "-portlogger.db"))) + (let* ((dbfname (conc "/tmp/." 
(current-user-name) "-portlogger.db")) + (db (portlogger:open-db dbfname)) (numargs (length args)) - (result (cond - ((> numargs 1) ;; most commands - (case (string->symbol (car args)) ;; commands with two or more params - ((take)(portlogger:take-port db (string->number (cadr args)))) - ((set) (portlogger:set-port db - (string->number (cadr args)) - (caddr args)) - (caddr args)) - ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed)))))) + (result + (handle-exceptions + exn + (begin + (debug:print 0 "EXCEPTION: portlogger database at " dbfname " probably overloaded or unreadable. Try removing it.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) + (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain)) + (cond + ((> numargs 1) ;; most commands + (case (string->symbol (car args)) ;; commands with two or more params + ((take)(portlogger:take-port db (string->number (cadr args)))) + ((set) (portlogger:set-port db + (string->number (cadr args)) + (caddr args)) + (caddr args)) + ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed))))))) (sqlite3:finalize! db) result)) ;; (print (apply portlogger:main (cdr (argv)))) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -51,10 +51,12 @@ ;; (print "Called with cmd=" cmd ", proc=" proc ", params=" params) (handle-exceptions exn (begin (print "ERROR: Failed to run command: " cmd " " (string-intersperse params " ")) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) #f) (let-values (((fh fho pid) (if (null? params) (process cmd) (process cmd params)))) (let loop ((curr (read-line fh)) @@ -124,6 +126,15 @@ (if (eof-object? 
inl) (reverse res) (let ((pid (string->number inl))) (if proc (proc pid)) (loop (read-line) (cons pid res)))))))) - + +(define (process:alive? pid) + (handle-exceptions + exn + ;; possibly pid is a process not a child, look in /proc to see if it is running still + (file-exists? (conc "/proc/" pid)) + (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) + (and (number? rpid) + (equal? rpid pid))))) + Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -58,19 +58,22 @@ ;; cmd is a symbol ;; vars is a json string encoding the parameters for the call ;; (define (rmt:send-receive cmd rid params) ;; clean out old connections + (mutex-lock! *db-multi-sync-mutex*) (let ((expire-time (- (current-seconds) 60))) (for-each (lambda (run-id) - (let ((connection (hash-table-ref *runremote* run-id))) - (if (< (http-transport:server-dat-get-last-access connection) expire-time) + (let ((connection (hash-table-ref/default *runremote* run-id #f))) + (if ;; (and connection + (< (http-transport:server-dat-get-last-access connection) expire-time) ; ) (begin (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses") (hash-table-delete! *runremote* run-id))))) (hash-table-keys *runremote*))) + (mutex-unlock! *db-multi-sync-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo ;; NB// can cache the answer for server running for 10 seconds ... @@ -104,10 +107,12 @@ (mutex-lock! *db-stats-mutex*) (handle-exceptions exn (begin (debug:print 0 "WARNING: stats collection failed in update-db-stats") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) #f) ;; if this fails we don't care, it is just stats (let* ((cmd (if (eq? 
rawcmd 'general-call) (car params) rawcmd)) (stat-vec (hash-table-ref/default *db-stats* cmd #f))) (if (not stat-vec) (let ((newvec (vector 0 0))) @@ -151,14 +156,14 @@ (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) (define (rmt:open-qry-close-locally cmd run-id params) - (let* ((dbdir (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) - (dbstruct-local (if *dbstruct-db* + (let* ((dbstruct-local (if *dbstruct-db* *dbstruct-db* - (let ((db (make-dbr:dbstruct path: dbdir local: #t))) + (let* ((dbdir (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) + (db (make-dbr:dbstruct path: dbdir local: #t))) (set! *dbstruct-db* db) db))) (db-file-path (db:dbfile-path 0))) ;; (read-only (not (file-read-access? db-file-path))) (let* ((start (current-milliseconds)) @@ -167,20 +172,12 @@ (rmt:update-db-stats cmd params duration) ;; mark this run as dirty if this was a write (if (not (member cmd api:read-only-queries)) (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) - (let ((last-sync (hash-table-ref/default *db-local-sync* run-id 0))) - (if ;; (and - (> (- start-time last-sync) 5) ;; every five seconds - ;; (common:db-access-allowed?)) - (begin - ;; MOVE THIS TO A THREAD? - (db:multi-db-sync (list run-id) 'new2old) - (if (common:low-noise-print 30 "sync new to old") - (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " (- (current-seconds) start-time) " seconds")) - (hash-table-set! *db-local-sync* run-id start-time)))) + (if (not (hash-table-ref/default *db-local-sync* run-id #f)) + (hash-table-set! *db-local-sync* run-id start-time)) ;; the oldest "write" (mutex-unlock! *db-multi-sync-mutex*))) res))) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) @@ -267,11 +264,11 @@ (define (rmt:get-test-info-by-id run-id test-id) (if (and (number? run-id)(number? 
test-id)) (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) (begin - (debug:print 0 "ERROR: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) + (debug:print 0 "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -499,11 +499,11 @@ (let ((items-list (items:get-items-from-config tconfig))) (if (list? items-list) (begin (if (null? items-list) (let ((test-id (rmt:get-test-id run-id test-name ""))) - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "Failed to run due to failed prerequisites"))) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "Failed to run due to failed prerequisites")))) (tests:testqueue-set-items! test-record items-list) (list hed tal reg reruns)) (begin (debug:print 0 "ERROR: The proc from reading the items table did not yield a list - please report this") (exit 1)))))) @@ -537,11 +537,11 @@ (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records)) (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records))) (debug:print 1 "WARNING: test " hed " has discarded prerequisites, removing it from the queue") (let ((test-id (rmt:get-test-id run-id hed ""))) - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites")) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites"))) (if (and (null? trimmed-tal) (null? 
trimmed-reg)) #f (list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull) @@ -561,11 +561,11 @@ ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)))) @@ -575,13 +575,14 @@ (member 'normal testmode)) (debug:print-info 1 "test " hed " (mode=" testmode ") has failed prerequisite(s); " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) - (if (not (null? prereq-fails)) - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) + (if test-id + (if (not (null? prereq-fails)) + (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") + (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! 
test-registry hed 'CANNOTRUN) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) @@ -772,11 +773,11 @@ (if (vector? hed) (begin (debug:print 1 "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) - (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -324,12 +324,28 @@ ;; no elegance here ... 
;; (define (tasks:kill-server hostname pid) (debug:print-info 0 "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) - (system (conc "nbfake kill " pid))) + (setenv "TARGETHOST_LOGF" "server-kills.log") + (system (conc "nbfake kill " pid)) + (unsetenv "TARGETHOST_LOGF") + (unsetenv "TARGETHOST")) +;; look up a server by run-id and send it a kill, also delete the record for that server +;; +(define (tasks:kill-server-run-id run-id) + (let* ((tdb (tasks:open-db)) + (sdat (tasks:get-server mdb run-id))) + (if sdat + (let ((hostname (vector-ref sdat 6)) + (pid (vector-ref sdat 5))) + (debug:print-info 0 "Killing server for run-id " run-id " on host " hostname " with pid " pid) + (tasks:kill-server hostname pid) + (tasks:server-delete-record mdb server-id tag) ) + (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")))) + ;; (if status ;; #t means alive ;; (begin ;; (if (equal? hostname (get-host-name)) ;; (handle-exceptions ;; exn @@ -634,28 +650,31 @@ (lambda (record) (let* ((param-key (list-ref record 8)) (match-dat (string-search hostpid-rx param-key))) (if match-dat (let ((hostname (cadr match-dat)) - (pid (caddr match-dat))) + (pid (string->number (caddr match-dat)))) (debug:print 0 "Sending SIGINT to process " pid " on host " hostname) (if (equal? (get-host-name) hostname) - (begin - (handle-exceptions - exn - (begin - (debug:print 0 "Kill of process " pid " on host " hostname " failed.") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - #t) - (process-signal (string->number pid) signal/int) - (thread-sleep! 5) - (process-signal (string->number pid) signal/kill))) + (if (process:alive? pid) + (begin + (handle-exceptions + exn + (begin + (debug:print 0 "Kill of process " pid " on host " hostname " failed.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + #t) + (process-signal pid signal/int) + (thread-sleep! 5) + (if (process:alive? 
pid) + (process-signal pid signal/kill))))) ;; (call-with-environment-variables (let ((old-targethost (getenv "TARGETHOST"))) (setenv "TARGETHOST" hostname) (system (conc "nbfake kill " pid)) - (if old-targethost (setenv "TARGETHOST" old-targethost))))) + (if old-targethost (setenv "TARGETHOST" old-targethost)) + (unsetenv "TARGETHOST")))) (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in monitor.db")))) records))) ;;====================================================================== Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -70,11 +70,12 @@ (debug:print-info 11 "open-test-db END (sucessful)" work-area) ;; now let's test that everything is correct (handle-exceptions exn (begin - (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test" + (debug:print 0 "ERROR: problem accessing test db " work-area ", you probably should clean and re-run this test or remove the file " + dbpath ".\n " ((condition-property-accessor 'exn 'message) exn)) #f) ;; Is there a cheaper single line operation that will check for existance of a table ;; and raise an exception ? (sqlite3:execute db "SELECT id FROM test_data LIMIT 1;")) Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -174,11 +174,11 @@ clean : rm cleanprep kill : killall -v mtest main.sh dboard || true - rm -rf *run/db/* */megatest.db */logging.db */monitor.db fullrun/tmp/mt_*/* fullrun/tmp/mt_*/.db* fullrun/logs/*.log || true + rm -rf *run/db/* */megatest.db */logging.db */monitor.db fullrun/tmp/mt_*/* fullrun/tmp/mt_*/.db* fullrun/logs/*.log fullrun/*.log || true killall -v mtest dboard || true hardkill : kill sleep 2;killall -v mtest main.sh dboard -9