When a testbox gets a job, record the job's information in ES. The deadline field holds the time at which the job running on the testbox is considered timed out. A deadline that has already passed is a sign that other measures are needed to intervene. Use curl -XGET http://localhost:9200/testbox/_doc/_search -d '{"query":{"range":{"deadline":{"lt":"now"}}}}' to find all timed-out testboxes and their jobs.
Signed-off-by: Wu Zhende <wuzhende666@163.com> --- src/lib/sched.cr | 19 +++++++++++++++++++ src/scheduler/find_job_boot.cr | 1 + 2 files changed, 20 insertions(+)
diff --git a/src/lib/sched.cr b/src/lib/sched.cr index 51e0d74..8d7c218 100644 --- a/src/lib/sched.cr +++ b/src/lib/sched.cr @@ -134,6 +134,25 @@ class Sched @log.warn(e) end
+ def set_tbox_boot_wtmp(job : Job) + time = Time.local + booting_time = time.to_s("%Y-%m-%dT%H:%M:%S") + + runtime = (job["timeout"]? || job["runtime"]?).to_s + runtime = 1800 if runtime.empty? + + # reserve 300 seconds for system startup, hw machine will need such long time + deadline = (time + (runtime.to_i32 * 2 + 300).second).to_s("%Y-%m-%dT%H:%M:%S") + hash = { + "job_id" => job["id"], + "state" => "booting", + "booting_time" => booting_time, + "deadline" => deadline + } + + @es.update_tbox(job["testbox"], hash) + end + def report_ssh_port testbox = @env.params.query["tbox_name"] ssh_port = @env.params.query["ssh_port"].to_s diff --git a/src/scheduler/find_job_boot.cr b/src/scheduler/find_job_boot.cr index b7f3d75..f8d7caa 100644 --- a/src/scheduler/find_job_boot.cr +++ b/src/scheduler/find_job_boot.cr @@ -101,6 +101,7 @@ class Sched
if job create_job_cpio(job.dump_to_json_any, Kemal.config.public_folder) + set_tbox_boot_wtmp(job) else # for physical machines spawn { auto_submit_idle_job(host) }