[how] The MQ publishes the crash event, and lifecycle handles the job event. When job_state is "crash" and the job has been consumed, lifecycle closes the job and reboots the crashed machine; if there is no job, it just reboots the crashed machine.
Signed-off-by: Li Ping <1477412247@qq.com> --- src/lib/lifecycle.cr | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+)
diff --git a/src/lib/lifecycle.cr b/src/lib/lifecycle.cr index 4bd2ac0..8f34903 100644 --- a/src/lib/lifecycle.cr +++ b/src/lib/lifecycle.cr @@ -122,6 +122,8 @@ class Lifecycle on_job_close(event) when "abnormal" on_abnormal_job(event) + when "crash" + on_job_crash(event) else on_other_job(event) end @@ -190,6 +192,17 @@ class Lifecycle update_cached_machine(job["testbox"].to_s, event) end
+ def on_job_crash(event) + event_job_id = event["job_id"].to_s + if @jobs[event_job_id]? + @jobs.delete(event_job_id) + spawn @scheduler_api.close_job(event_job_id, "crash", "lifecycle") + end + + testbox = event["testbox"].to_s + reboot_crash_machine(testbox, event) + end + def on_job_boot(event) event_job_id = event["job_id"]?.to_s @jobs[event_job_id] = event unless event_job_id.empty? @@ -298,6 +311,22 @@ class Lifecycle spawn @scheduler_api.close_job(job_id, reason, "lifecycle") end
+ def reboot_crash_machine(testbox, event) + @machines.delete(testbox) + machine = @es.get_tbox(testbox) + return unless machine + return unless event["time"].to_s.bigger_than?(machine["time"]?) + + mq_queue = get_machine_reboot_queue(testbox) + machine.as_h.delete("history") + machine.as_h["testbox"] = JSON::Any.new(testbox) + @mq.publish_confirm(mq_queue, machine.to_json, durable: true) + + machine["state"] = "rebooting_queue" + machine["time"] = Time.local.to_s("%Y-%m-%dT%H:%M:%S+0800") + @es.update_tbox(testbox, machine.as_h) + end + def reboot_timeout_machine(testbox) @machines.delete(testbox) machine = @es.get_tbox(testbox)