Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -126,10 +126,11 @@ (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) ;; client:signal-handler (define (client:signal-handler signum) + (signal-mask! signum) (handle-exceptions exn (debug:print " ... exiting ...") (let ((th1 (make-thread (lambda () "") ;; do nothing for now (was flush out last call if applicable) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -156,11 +156,17 @@ (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")) (define (common:read-encoded-string instr) (handle-exceptions exn - (read (open-input-string (base64:base64-decode instr))) + (handle-exceptions + exn + (begin + (debug:print 0 "ERROR: received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain) + #f) + (read (open-input-string (base64:base64-decode instr)))) (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -296,11 +296,15 @@ (thread-sleep! 3) (if (and rundb (sqlite3:database? rundb)) (handle-exceptions exn - #t ;; (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db") + (begin + (debug:print 0 "WARNING: database files may not have been closed correctly. Consider running -cleanup-db") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain) + #f) (sqlite3:interrupt! rundb) (sqlite3:finalize! rundb #t)))) ;; (mutex-unlock! *db-sync-mutex*) ) @@ -2437,11 +2441,11 @@ (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default (begin - (debug:print 0 "ERROR: query " stmt " failed " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 "ERROR: query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain) default))) (apply sqlite3:first-result db stmt params))) ;;====================================================================== Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -1,1272 +1,1284 @@ - - - - - -The Megatest Users Manual - - - - - -
-
-

Preface

-
-

This book is organised as three sub-books; getting started, writing tests and reference.

-
-

Why Megatest?

-

The Megatest project was started for two reasons, the first was an -immediate and pressing need for a generalized tool to manage a suite -of regression tests and the second was the fact that the author had -written or maintained several such tools at different companies over -the years and it seemed a good thing to have a single open source -tool, flexible enough to meet the needs of any team doing continuous -integrating and or running a complex suite of tests for release -qualification.

-
-
-

Megatest Design Philosophy

-

Megatest is intended to provide the minimum needed resources to make -writing a suite of tests and tasks for implementing continuous build -for software, design engineering or process control (via owlfs for -example) without being specialized for any specific problem -space. Megatest in of itself does not know what constitutes a PASS or -FAIL of a test. In most cases megatest is best used in conjunction -with logpro or a similar tool to parse, analyze and decide on the test -outcome.

-
-
-

Megatest Architecture

-

All data to specify the tests and configure the system is stored in -plain text files. All system state is stored in an sqlite3 -database. Tests are launched using the launching system available for -the distributed compute platform in use. A template script is provided -which can launch jobs on local and remote Linux hosts. Currently -megatest uses the network filesystem to call home to your master -sqlite3 database.

-
-
-
-

Road Map

-

Note 1: This road-map is tentative and subject to change without notice.

-

Note 2: Starting over. Old plan is commented out.

-
-

Current Items

-
-
-

ww05 - migrate to inmem-db

-

Keep as much the same as possible. Add internal reference to almost -eliminate contention on db(s).

-
    -
  1. -

    -Add internal reference db -

    -
  2. -
  3. -

    -Verify that actions are accessing correct db -

    -
      -
    1. -

      --runtests - inmem -

      -
    2. -
    3. -

      --list-runs - local (but not megatest.db) -

      -
    4. -
    5. -

      -dashboard - local (but not megatest.db) -

      -
    6. -
    -
  4. -
  5. -

    -Mirror db to /var/tmp… -

    -
  6. -
  7. -

    -Dashboard read db from per-run db. -

    -
  8. -
  9. -

    -Dashboard read db from /var/tmp -

    -
  10. -
  11. -

    -Runs register in tasks table in monitor.db -

    -
  12. -
  13. -

    -Server polls tasks table for next action (in addition?) -

    -
  14. -
  15. -

    -Change run loop to execute in server, triggered by call to polling of tasks table -

    -
  16. -
-
-
-
-

Getting Started

-
-
Getting started with Megatest
-
-

How to install Megatest and set it up for running your regressions and continuous integration process.

-
-
-

Installation

-
-
-

Dependencies

-

Chicken scheme and a number of "eggs" are required for building -Megatest. See the script installall.sch in the utils directory of the -distribution for a mostly automated way to install everything needed -for building Megatest on Linux.

-


[An example footnote.]

-

And now for something completely different: monkeys, lions and -tigers (Bengal and Siberian) using the alternative syntax index -entries. - - - -Note that multi-entry terms generate separate index entries.

-

Here are a couple of image examples: an -images/smallnew.png - -example inline image followed by an example block image:

-
-
-Tiger image -
-
Figure 1. Tiger block image
-
-

Followed by an example table:

-
- - --- - - - - - - - - - - - - - - - -
Table 1. An example table
Option Description

-a USER GROUP

Add USER to GROUP.

-R GROUP

Disables access to GROUP.

-
-
-
Example 1. An example example
-
-

Lorum ipum…

-
-
-
-

Sub-section with Anchor

-

Sub-section at level 2.

-
-

Chapter Sub-section

-

Sub-section at level 3.

-
-
Chapter Sub-section
-

Sub-section at level 4.

-

This is the maximum sub-section depth supported by the distributed -AsciiDoc configuration. -
[A second example footnote.]

-
-
-
-
-
-
-

The Second Chapter

-
-

An example link to anchor at start of the first sub-section.

-

An example link to a bibliography entry [taoup].

-
-
-

Writing Tests

-
-

The First Chapter of the Second Part

-
-

Chapters grouped into book parts are at level 1 and can contain -sub-sections.

-
-
-

How To Do Things

-
-

Tricks

-
-

This section is a compendium of a various useful tricks for debugging, -configuring and generally getting the most out of Megatest.

-
-
-
-

Limiting your running jobs

-
-

The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously.

-

In your testconfig:

-
-
-
[test_meta]
-jobgroup group1
-
-

In your megatest.config:

-
-
-
[jobgroups]
-group1 10
-custdes 4
-
-
-
-
-

Debugging Tricks

-
-
-

Examining The Environment

-
-

During Config File Processing

-
-
-

Organising Your Tests and Tasks

-
-
-
[tests-paths]
-1 #{get misc parent}/simplerun/tests
-
-
-
-
[setup]
-
-

The runscript method is a brute force way to run scripts where the -user is responsible for setting STATE and STATUS

-
-
-
runscript main.csh
-
-
-
-
-

Debugging Server Problems

-
-
-
sudo lsof -i
-sudo netstat -lptu
-sudo netstat -tulpn
-
-
-
-
-

Reference

-
-

The First Chapter of the Second Part

-
-

Chapters grouped into book parts are at level 1 and can contain -sub-sections.

-
-
-
-

The testconfig File

-
-
-

Setup section

-
-

Header

-
-
-
[setup]
-
-

The runscript method is a brute force way to run scripts where the -user is responsible for setting STATE and STATUS

-
-
-
runscript main.csh
-
-
-
-
-

Requirements section

-
-

Header

-
-
-
[requirements]
-
-
-
-

Wait on Other Tests

-
-
-
# A normal waiton waits for the prior tests to be COMPLETED
-# and PASS, CHECK or WAIVED
-waiton test1 test2
-
-
-
-

Mode

-

The default (i.e. if mode is not specified) is normal. All pre-dependent tests -must be COMPLETED and PASS, CHECK or WAIVED before the test will start

-
-
-
mode   normal
-
-

The toplevel mode requires only that the prior tests are COMPLETED.

-
-
-
mode toplevel
-
-

A item based waiton will start items in a test when the -same-named item is COMPLETED and PASS, CHECK or WAIVED -in the prior test

-
-
-
mode itemmatch
-
-
-
-
# With a toplevel test you may wish to generate your list
-# of tests to run dynamically
-#
-# waiton #{shell get-valid-tests-to-run.sh}
-
-
-
-

Run time limit

-
-
-
runtimelim 1h 2m 3s  # this will automatically kill the test if it runs for more than 1h 2m and 3s
-
-
-
-

Skip

-
-
-

Header

-
-
-
[skip]
-
-
-
-

Skip on Still-running Tests

-
-
-
# NB// If the prevrunning line exists with *any* value the test will
-# automatically SKIP if the same-named test is currently RUNNING
-
-prevrunning x
-
-
-
-

Skip if a File Exists

-
-
-
fileexists /path/to/a/file # skip if /path/to/a/file exists
-
-
-
-

Controlled waiver propagation

-

If test is FAIL and previous test in run with same MT_TARGET is WAIVED then apply the following rules from the testconfig: -If a waiver check is specified in the testconfig apply the check and if it passes then set this FAIL to WAIVED

-

Waiver check has two parts, 1) a list of waiver, rulename, filepatterns and 2) the rulename script spec (note that "diff" and "logpro" are predefined)

-
-
-
###### EXAMPLE FROM testconfig #########
-# matching file(s) will be diff'd with previous run and logpro applied
-# if PASS or WARN result from logpro then WAIVER state is set
-#
-[waivers]
-# logpro_file    rulename      input_glob
-waiver_1         logpro        lookittmp.log
-
-[waiver_rules]
-
-# This builtin rule is the default if there is no <waivername>.logpro file
-# diff   diff %file1% %file2%
-
-# This builtin rule is applied if a <waivername>.logpro file exists
-# logpro diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html
-
-
-
-
-

Ezsteps

-

To transfer the environment to the next step you can do the following:

-
-
-
$MT_MEGATEST -env2file .ezsteps/${stepname}
-
-
-
-

Triggers

-

In your testconfig triggers can be specified

-
-
-
[triggers]
-
-# Call script running.sh when test goes to state=RUNNING, status=PASS
-RUNNING/PASS running.sh
-
-# Call script running.sh any time state goes to RUNNING
-RUNNING/ running.sh
-
-# Call script onpass.sh any time status goes to PASS
-PASS/ onpass.sh
-
-
-
-

Megatest Internals

-
-
-server.png -
-
-
-
-
-
-

Appendix A: Example Appendix

-
-

One or more optional appendixes go here at section level zero.

-
-

Appendix Sub-section

-
- - - -
-
Note
-
Preface and appendix subsections start out of sequence at level -2 (level 1 is skipped). This only applies to multi-part book -documents.
-
-
-
-
-
-

Example Bibliography

-
-

The bibliography list is a style of AsciiDoc bulleted list.

-
    -
  • -

    -[taoup] Eric Steven Raymond. The Art of Unix - Programming. Addison-Wesley. ISBN 0-13-142901-9. -

    -
  • -
  • -

    -[walsh-muellner] Norman Walsh & Leonard Muellner. - DocBook - The Definitive Guide. O’Reilly & Associates. 1999. - ISBN 1-56592-580-7. -

    -
  • -
-
-
-
-

Example Glossary

-
-

Glossaries are optional. Glossaries entries are an example of a style -of AsciiDoc labeled lists.

-
-
-A glossary term -
-
-

- The corresponding (indented) definition. -

-
-
-A second glossary term -
-
-

- The corresponding (indented) definition. -

-
-
-
-
-
-

Example Colophon

-
-

Text at the end of a book describing facts about its production.

-
-
-
-

Example Index

-
-
-
-
-

- - - + + + + + +The Megatest Users Manual + + + + + +
+
+

Preface

+
+

This book is organised as three sub-books; getting started, writing tests and reference.

+
+

Why Megatest?

+

The Megatest project was started for two reasons, the first was an +immediate and pressing need for a generalized tool to manage a suite +of regression tests and the second was the fact that the author had +written or maintained several such tools at different companies over +the years and it seemed a good thing to have a single open source +tool, flexible enough to meet the needs of any team doing continuous +integrating and or running a complex suite of tests for release +qualification.

+
+
+

Megatest Design Philosophy

+

Megatest is intended to provide the minimum needed resources to make +writing a suite of tests and tasks for implementing continuous build +for software, design engineering or process control (via owlfs for +example) without being specialized for any specific problem +space. Megatest in of itself does not know what constitutes a PASS or +FAIL of a test. In most cases megatest is best used in conjunction +with logpro or a similar tool to parse, analyze and decide on the test +outcome.

+
+
+

Megatest Architecture

+

All data to specify the tests and configure the system is stored in +plain text files. All system state is stored in an sqlite3 +database. Tests are launched using the launching system available for +the distributed compute platform in use. A template script is provided +which can launch jobs on local and remote Linux hosts. Currently +megatest uses the network filesystem to call home to your master +sqlite3 database.

+
+
+
+

Road Map

+

Note 1: This road-map is tentative and subject to change without notice.

+

Note 2: Starting over. Old plan is commented out.

+
+

Current Items

+
+
+

ww05 - migrate to inmem-db

+

Keep as much the same as possible. Add internal reference to almost +eliminate contention on db(s).

+
    +
  1. +

    +Add internal reference db +

    +
  2. +
  3. +

    +Verify that actions are accessing correct db +

    +
      +
    1. +

      +-runtests - inmem +

      +
    2. +
    3. +

      +-list-runs - local (but not megatest.db) +

      +
    4. +
    5. +

      +dashboard - local (but not megatest.db) +

      +
    6. +
    +
  4. +
  5. +

    +Mirror db to /var/tmp… +

    +
  6. +
  7. +

    +Dashboard read db from per-run db. +

    +
  8. +
  9. +

    +Dashboard read db from /var/tmp +

    +
  10. +
  11. +

    +Runs register in tasks table in monitor.db +

    +
  12. +
  13. +

    +Server polls tasks table for next action (in addition?) +

    +
  14. +
  15. +

    +Change run loop to execute in server, triggered by call to polling of tasks table +

    +
  16. +
+
+
+
+

Getting Started

+
+
Getting started with Megatest
+
+

How to install Megatest and set it up for running your regressions and continuous integration process.

+
+
+

Installation

+
+
+

Dependencies

+

Chicken scheme and a number of "eggs" are required for building +Megatest. See the script installall.sch in the utils directory of the +distribution for a mostly automated way to install everything needed +for building Megatest on Linux.

+


[An example footnote.]

+

And now for something completely different: monkeys, lions and +tigers (Bengal and Siberian) using the alternative syntax index +entries. + + + +Note that multi-entry terms generate separate index entries.

+

Here are a couple of image examples: an +images/smallnew.png + +example inline image followed by an example block image:

+
+
+Tiger image +
+
Figure 1. Tiger block image
+
+

Followed by an example table:

+
+ + +++ + + + + + + + + + + + + + + + +
Table 1. An example table
Option Description

-a USER GROUP

Add USER to GROUP.

-R GROUP

Disables access to GROUP.

+
+
+
Example 1. An example example
+
+

Lorum ipum…

+
+
+
+

Sub-section with Anchor

+

Sub-section at level 2.

+
+

Chapter Sub-section

+

Sub-section at level 3.

+
+
Chapter Sub-section
+

Sub-section at level 4.

+

This is the maximum sub-section depth supported by the distributed +AsciiDoc configuration. +
[A second example footnote.]

+
+
+
+
+
+
+

The Second Chapter

+
+

An example link to anchor at start of the first sub-section.

+

An example link to a bibliography entry [taoup].

+
+
+

Writing Tests

+
+

The First Chapter of the Second Part

+
+

Chapters grouped into book parts are at level 1 and can contain +sub-sections.

+
+
+

How To Do Things

+
+

Tricks

+
+

This section is a compendium of a various useful tricks for debugging, +configuring and generally getting the most out of Megatest.

+
+
+
+

Limiting your running jobs

+
+

The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously.

+

In your testconfig:

+
+
+
[test_meta]
+jobgroup group1
+
+

In your megatest.config:

+
+
+
[jobgroups]
+group1 10
+custdes 4
+
+
+
+
+

Debugging Tricks

+
+
+

Examining The Environment

+
+

During Config File Processing

+
+
+

Organising Your Tests and Tasks

+
+
+
[tests-paths]
+1 #{get misc parent}/simplerun/tests
+
+
+
+
[setup]
+
+

The runscript method is a brute force way to run scripts where the +user is responsible for setting STATE and STATUS

+
+
+
runscript main.csh
+
+
+
+
+

Debugging Server Problems

+
+
+
sudo lsof -i
+sudo netstat -lptu
+sudo netstat -tulpn
+
+
+
+
+

Reference

+
+

The First Chapter of the Second Part

+
+

Chapters grouped into book parts are at level 1 and can contain +sub-sections.

+
+
+
+

The testconfig File

+
+
+

Setup section

+
+

Header

+
+
+
[setup]
+
+

The runscript method is a brute force way to run scripts where the +user is responsible for setting STATE and STATUS

+
+
+
runscript main.csh
+
+
+
+
+

Requirements section

+
+

Header

+
+
+
[requirements]
+
+
+
+

Wait on Other Tests

+
+
+
# A normal waiton waits for the prior tests to be COMPLETED
+# and PASS, CHECK or WAIVED
+waiton test1 test2
+
+
+
+

Mode

+

The default (i.e. if mode is not specified) is normal. All pre-dependent tests +must be COMPLETED and PASS, CHECK or WAIVED before the test will start

+
+
+
mode   normal
+
+

The toplevel mode requires only that the prior tests are COMPLETED.

+
+
+
mode toplevel
+
+

A item based waiton will start items in a test when the +same-named item is COMPLETED and PASS, CHECK or WAIVED +in the prior test

+
+
+
mode itemmatch
+
+
+
+
# With a toplevel test you may wish to generate your list
+# of tests to run dynamically
+#
+# waiton #{shell get-valid-tests-to-run.sh}
+
+
+
+

Run time limit

+
+
+
runtimelim 1h 2m 3s  # this will automatically kill the test if it runs for more than 1h 2m and 3s
+
+
+
+

Skip

+
+
+

Header

+
+
+
[skip]
+
+
+
+

Skip on Still-running Tests

+
+
+
# NB// If the prevrunning line exists with *any* value the test will
+# automatically SKIP if the same-named test is currently RUNNING
+
+prevrunning x
+
+
+
+

Skip if a File Exists

+
+
+
fileexists /path/to/a/file # skip if /path/to/a/file exists
+
+
+
+

Controlled waiver propagation

+

If test is FAIL and previous test in run with same MT_TARGET is WAIVED then apply the following rules from the testconfig: +If a waiver check is specified in the testconfig apply the check and if it passes then set this FAIL to WAIVED

+

Waiver check has two parts, 1) a list of waiver, rulename, filepatterns and 2) the rulename script spec (note that "diff" and "logpro" are predefined)

+
+
+
###### EXAMPLE FROM testconfig #########
+# matching file(s) will be diff'd with previous run and logpro applied
+# if PASS or WARN result from logpro then WAIVER state is set
+#
+[waivers]
+# logpro_file    rulename      input_glob
+waiver_1         logpro        lookittmp.log
+
+[waiver_rules]
+
+# This builtin rule is the default if there is no <waivername>.logpro file
+# diff   diff %file1% %file2%
+
+# This builtin rule is applied if a <waivername>.logpro file exists
+# logpro diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html
+
+
+
+
+

Ezsteps

+

To transfer the environment to the next step you can do the following:

+
+
+
$MT_MEGATEST -env2file .ezsteps/${stepname}
+
+
+
+

Triggers

+

In your testconfig triggers can be specified

+
+
+
[triggers]
+
+# Call script running.sh when test goes to state=RUNNING, status=PASS
+RUNNING/PASS running.sh
+
+# Call script running.sh any time state goes to RUNNING
+RUNNING/ running.sh
+
+# Call script onpass.sh any time status goes to PASS
+PASS/ onpass.sh
+
+

Scripts called will have; test-id test-rundir trigger, added to the commandline.

+

HINT

+

To start an xterm (useful for debugging), use a command line like the following:

+
+
+
[triggers]
+COMPLETED/ xterm -e bash -s --
+
+
+ + + +
+
Note
+
There is a trailing space after the --
+
+
+
+

Megatest Internals

+
+
+server.png +
+
+
+
+
+
+

Appendix A: Example Appendix

+
+

One or more optional appendixes go here at section level zero.

+
+

Appendix Sub-section

+
+ + + +
+
Note
+
Preface and appendix subsections start out of sequence at level +2 (level 1 is skipped). This only applies to multi-part book +documents.
+
+
+
+
+
+

Example Bibliography

+
+

The bibliography list is a style of AsciiDoc bulleted list.

+
    +
  • +

    +[taoup] Eric Steven Raymond. The Art of Unix + Programming. Addison-Wesley. ISBN 0-13-142901-9. +

    +
  • +
  • +

    +[walsh-muellner] Norman Walsh & Leonard Muellner. + DocBook - The Definitive Guide. O’Reilly & Associates. 1999. + ISBN 1-56592-580-7. +

    +
  • +
+
+
+
+

Example Glossary

+
+

Glossaries are optional. Glossaries entries are an example of a style +of AsciiDoc labeled lists.

+
+
+A glossary term +
+
+

+ The corresponding (indented) definition. +

+
+
+A second glossary term +
+
+

+ The corresponding (indented) definition. +

+
+
+
+
+
+

Example Colophon

+
+

Text at the end of a book describing facts about its production.

+
+
+
+

Example Index

+
+
+
+
+

+ + + Index: docs/manual/server.png ================================================================== --- docs/manual/server.png +++ docs/manual/server.png cannot compute difference between binary files Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -256,10 +256,11 @@ (close-all-connections!)) (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) (begin (mutex-unlock! *http-mutex*) + (tasks:kill-server-run-id run-id) #f)) (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here (max-retry-attempts 1) @@ -275,10 +276,11 @@ ;; ((exn http client-error) e (print e))) (set! res (handle-exceptions exn (begin (debug:print 0 "ERROR: failure in with-input-from-request. Giving up.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) #f) (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) @@ -536,10 +538,11 @@ (set! *didsomething* #t) (thread-join! th2) (exit))))) (define (http-transport:server-signal-handler signum) + (signal-mask! signum) (handle-exceptions exn (debug:print " ... exiting ...") (let ((th1 (make-thread (lambda () (thread-sleep! 1)) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -361,15 +361,20 @@ (begin (for-each (lambda (pid) (handle-exceptions exn - (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") + (begin + (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) (debug:print 0 "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") - (process-signal pid signal/int) - (thread-sleep! 5) - (process-signal pid signal/kill))) + (if (process:alive? pid) + (begin + (process-signal pid signal/int) + (thread-sleep! 5) + (if (process:process-alive? pid) + (process-signal pid signal/kill)))))) pids) (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) (begin (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) @@ -452,20 +457,23 @@ (begin (handle-exceptions exn (begin (debug:print 0 "ERROR: Something went wrong when trying to create linktree dir at " linktree) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (exit 1)) (create-directory linktree #t)))) (begin (debug:print 0 "ERROR: linktree not defined in [setup] section of megatest.config") (exit 1))) (if linktree (let ((dbdir (conc linktree "/.db"))) (handle-exceptions exn - (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files") + (begin + (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) (if (not (directory-exists? dbdir))(create-directory dbdir))) (setenv "MT_LINKTREE" linktree)) (begin (debug:print 0 "ERROR: linktree is required in your megatest.config [setup] section") (exit 1))) Index: lock-queue.scm ================================================================== --- lock-queue.scm +++ lock-queue.scm @@ -36,34 +36,38 @@ (begin (thread-sleep! 10) (if (> count 0) (lock-queue:open-db fname count: (- count 1)) db)) - (sqlite3:execute - db - "CREATE TABLE IF NOT EXISTS queue ( - id INTEGER PRIMARY KEY, - test_id INTEGER, - start_time INTEGER, - state TEXT, - CONSTRAINT queue_constraint UNIQUE (test_id));") - (sqlite3:execute - db - "CREATE TABLE IF NOT EXISTS runlocks ( - id INTEGER PRIMARY KEY, - test_id INTEGER, - run_lock TEXT, - CONSTRAINT runlock_constraint UNIQUE (run_lock));")))) + (sqlite3:with-transaction + db + (lambda () + (sqlite3:execute + db + "CREATE TABLE IF NOT EXISTS queue ( + id INTEGER PRIMARY KEY, + test_id INTEGER, + start_time INTEGER, + state TEXT, + CONSTRAINT queue_constraint UNIQUE (test_id));") + (sqlite3:execute + db + "CREATE TABLE IF NOT EXISTS runlocks ( + id INTEGER PRIMARY KEY, + test_id INTEGER, + run_lock TEXT, + CONSTRAINT runlock_constraint UNIQUE (run_lock));")))))) (sqlite3:set-busy-handler! db handler) db)) (define (lock-queue:set-state db test-id newstate #!key (remtries 10)) (handle-exceptions exn (if (> remtries 0) (begin (debug:print 0 "WARNING: exception on lock-queue:set-state. Trying again in 30 seconds.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 30) (lock-queue:set-state db test-id newstate remtries: (- remtries 1))) (begin (debug:print 0 "ERROR: Failed to set lock state for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.") #f)) @@ -75,10 +79,11 @@ (handle-exceptions exn (if (> remtries 0) (begin (debug:print 0 "WARNING: exception on lock-queue:any-younger. Trying again in 30 seconds.") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 30) (lock-queue:any-younger? db mystart test-id remtries: (- remtries 1))) (begin (debug:print 0 "ERROR: Failed to find younger locks for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.") #f)) @@ -98,10 +103,12 @@ (mklckqry (sqlite3:prepare db "INSERT INTO runlocks (test_id,run_lock) VALUES (?,'locked');"))) (let ((result (handle-exceptions exn (begin + (debug:print 0 "WARNING: failed to get queue lock. Will try again in a few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:get-lock db test-id count: (- count 1))) #f) (sqlite3:with-transaction @@ -125,10 +132,12 @@ (define (lock-queue:release-lock fname test-id #!key (count 10)) (let ((db (lock-queue:open-db fname))) (handle-exceptions exn (begin + (debug:print 0 "WARNING: Failed to release queue lock. Will try again in few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:release-lock fname test-id count: (- count 1)) #f)) (sqlite3:execute db "DELETE FROM runlocks WHERE test_id=?;" test-id) @@ -136,10 +145,12 @@ (define (lock-queue:steal-lock db test-id #!key (count 10)) (handle-exceptions exn (begin + (debug:print 0 "WARNING: Failed to steal queue lock. Will try again in few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:steal-lock db test-id count: (- count 1)) #f)) (sqlite3:execute db "DELETE FROM runlocks WHERE run_lock='locked';")) @@ -153,10 +164,12 @@ (let ((db (lock-queue:open-db fname)) (mystart (current-seconds))) (handle-exceptions exn (begin + (debug:print 0 "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! 10) (if (> count 0) (lock-queue:wait-turn fname test-id count: (- count 1)) #f)) (sqlite3:execute Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -125,5 +125,15 @@ (reverse res) (let ((pid (string->number inl))) (if proc (proc pid)) (loop (read-line) (cons pid res)))))))) + +(define (process:alive? pid) + (handle-exceptions + exn + ;; possibly pid is a process not a child, look in /proc to see if it is running still + (file-exists? (conc "/proc/" pid)) + (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) + (and (number? rpid) + (equal? rpid pid))))) + Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -324,12 +324,28 @@ ;; no elegance here ... ;; (define (tasks:kill-server hostname pid) (debug:print-info 0 "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) - (system (conc "nbfake kill " pid))) + (setenv "TARGETHOST_LOGF" "server-kills.log") + (system (conc "nbfake kill " pid)) + (unsetenv "TARGETHOST_LOGF") + (unsetenv "TARGETHOST")) +;; look up a server by run-id and send it a kill, also delete the record for that server +;; +(define (tasks:kill-server-run-id run-id) + (let* ((tdb (tasks:open-db)) + (sdat (tasks:get-server mdb run-id))) + (if sdat + (let ((hostname (vector-ref sdat 6)) + (pid (vector-ref sdat 5))) + (debug:print-info 0 "Killing server for run-id " run-id " on host " hostname " with pid " pid) + (tasks:kill-server hostname pid) + (tasks:server-delete-record mdb server-id tag) ) + (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")))) + ;; (if status ;; #t means alive ;; (begin ;; (if (equal? hostname (get-host-name)) ;; (handle-exceptions ;; exn