1. Log exceptions with "e.inspect_with_backtrace" (instead of "e.to_s" or the bare exception) to print more detailed error information
2. Add the related context data (event, job_id, testbox, or message data) to the warning log so that it is saved when an error is reported
Signed-off-by: Wu Zhende <wuzhende666@163.com> --- src/lib/lifecycle.cr | 58 ++++++++++++++------------ src/lib/mq.cr | 2 +- src/lib/sched.cr | 14 +++---- src/monitoring/amqp.cr | 12 +++++- src/monitoring/filter.cr | 6 ++- src/scheduler/close_job.cr | 2 +- src/scheduler/download_file.cr | 2 +- src/scheduler/find_next_job_boot.cr | 2 +- src/scheduler/request_cluster_state.cr | 2 +- 9 files changed, 59 insertions(+), 41 deletions(-)
diff --git a/src/lib/lifecycle.cr b/src/lib/lifecycle.cr index e56a681..fe870de 100644 --- a/src/lib/lifecycle.cr +++ b/src/lib/lifecycle.cr @@ -45,7 +45,7 @@ class Lifecycle rescue e @log.warn({ "resource" => "/alive", - "message" => e.to_s + "message" => e.inspect_with_backtrace }.to_json) end
@@ -74,7 +74,7 @@ class Lifecycle rescue e @log.warn({ "resource" => "init_from_es", - "message" => e.to_s + "message" => e.inspect_with_backtrace }.to_json) end
@@ -118,30 +118,34 @@ class Lifecycle
def mq_event_loop puts "deal job events" - q = @mq.ch.queue("job_mq", durable: false) - q.subscribe(no_ack: false) do |msg| - event = JSON.parse(msg.body_io.to_s) - job_state = event["job_state"]? - - case job_state - when "boot" - on_job_boot(event) - when "close" - on_job_close(event) - when "abnormal" - on_abnormal_job(event) - when "crash" - on_job_crash(event) - else - on_other_job(event) + event = JSON::Any.new(nil) + begin + q = @mq.ch.queue("job_mq", durable: false) + q.subscribe(no_ack: false) do |msg| + event = JSON.parse(msg.body_io.to_s) + job_state = event["job_state"]? + + case job_state + when "boot" + on_job_boot(event) + when "close" + on_job_close(event) + when "abnormal" + on_abnormal_job(event) + when "crash" + on_job_crash(event) + else + on_other_job(event) + end + @mq.ch.basic_ack(msg.delivery_tag) end - @mq.ch.basic_ack(msg.delivery_tag) + rescue e + @log.warn({ + "resource" => "mq_event_loop", + "message" => e.inspect_with_backtrace, + "event" => event + }.to_json) end - rescue e - @log.warn({ - "resource" => "mq_event_loop", - "message" => e.to_s - }.to_json) end
def on_other_job(event) @@ -269,7 +273,8 @@ class Lifecycle rescue e @log.warn({ "resource" => "timeout_job_loop", - "message" => e.to_s + "message" => e.inspect_with_backtrace, + "job_id" => dead_job_id }.to_json) end end @@ -285,7 +290,8 @@ class Lifecycle rescue e @log.warn({ "resource" => "timeout_machine_loop", - "message" => e.to_s + "message" => e.inspect_with_backtrace, + "testbox" => dead_machine_name }.to_json) end end diff --git a/src/lib/mq.cr b/src/lib/mq.cr index 1c7f4b7..67fcee2 100644 --- a/src/lib/mq.cr +++ b/src/lib/mq.cr @@ -28,7 +28,7 @@ class MQClient }.to_json) rescue e @log.warn({ - "msg" => e.to_s, + "msg" => e.inspect_with_backtrace, "source" => "mq_client" }.to_json) end diff --git a/src/lib/sched.cr b/src/lib/sched.cr index 7a9e4d8..43fe47b 100644 --- a/src/lib/sched.cr +++ b/src/lib/sched.cr @@ -56,7 +56,7 @@ class Sched def alive(version) "LKP Alive! The time is #{Time.local}, version = #{version}" rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
def normalize_mac(mac : String) @@ -72,7 +72,7 @@ class Sched "No yet!" end rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
def del_host_mac @@ -84,7 +84,7 @@ class Sched "No yet!" end rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
def set_host2queues @@ -96,7 +96,7 @@ class Sched "No yet!" end rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
def del_host2queues @@ -108,7 +108,7 @@ class Sched "No yet!" end rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
def get_time @@ -143,7 +143,7 @@ class Sched hash["testbox"] = testbox @log.info(hash.to_json) rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
def send_mq_msg(job_state) @@ -223,7 +223,7 @@ class Sched
@log.info(%({"job_id": "#{job_id}", "state": "set ssh port", "ssh_port": "#{ssh_port}", "tbox_name": "#{testbox}"})) rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
private def query_consumable_keys(shortest_queue_name) diff --git a/src/monitoring/amqp.cr b/src/monitoring/amqp.cr index f23113a..6ba7b1b 100644 --- a/src/monitoring/amqp.cr +++ b/src/monitoring/amqp.cr @@ -29,7 +29,11 @@ class MessageQueueClient begin filter.filter_msg(msg.body_io) rescue e - @log.warn("filter message failed: #{e}") + @log.warn({ + "resource" => "filter_message", + "message" => e.inspect_with_backtrace, + "data" => msg.body_io.to_s + }.to_json) end end end @@ -42,7 +46,11 @@ class MessageQueueClient filter_msg(conn, filter, exchange_name, queue_name) end rescue e - @log.warn("monitoring_message_queue failed: #{e}") + @log.warn({ + "resource" => "monitoring_message_queue", + "message" => e.inspect_with_backtrace, + "data" => "#{exchange_name}, #{queue_name}" + }.to_json) sleep 5 end end diff --git a/src/monitoring/filter.cr b/src/monitoring/filter.cr index b663f11..fe41571 100644 --- a/src/monitoring/filter.cr +++ b/src/monitoring/filter.cr @@ -47,7 +47,11 @@ class Filter @hash[query].each do |socket| socket.send msg.to_json rescue e - @log.warn("send msg failed: #{e}") + @log.warn({ + "resource" => "send_msg", + "message" => e.inspect_with_backtrace, + "data" => "query: #{query}, mas: #{msg}" + }.to_json) remove_filter_rule(query, socket) end end diff --git a/src/scheduler/close_job.cr b/src/scheduler/close_job.cr index a112a38..0f051e6 100644 --- a/src/scheduler/close_job.cr +++ b/src/scheduler/close_job.cr @@ -30,7 +30,7 @@ class Sched job_state ||= "complete" @log.info(%({"job_id": "#{job_id}", "job_state": "#{job_state}"})) rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) ensure source = @env.params.query["source"]? if source != "lifecycle" diff --git a/src/scheduler/download_file.cr b/src/scheduler/download_file.cr index 50e27ba..48ce55a 100644 --- a/src/scheduler/download_file.cr +++ b/src/scheduler/download_file.cr @@ -11,6 +11,6 @@ class Sched
send_file @env, file_path rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end end diff --git a/src/scheduler/find_next_job_boot.cr b/src/scheduler/find_next_job_boot.cr index 091c41a..15daebf 100644 --- a/src/scheduler/find_next_job_boot.cr +++ b/src/scheduler/find_next_job_boot.cr @@ -15,6 +15,6 @@ class Sched
response rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end end diff --git a/src/scheduler/request_cluster_state.cr b/src/scheduler/request_cluster_state.cr index e8ef6f3..e9be3cf 100644 --- a/src/scheduler/request_cluster_state.cr +++ b/src/scheduler/request_cluster_state.cr @@ -69,7 +69,7 @@ class Sched # show cluster state return @redis.hash_get("sched/cluster_state", cluster_id) rescue e - @log.warn(e) + @log.warn(e.inspect_with_backtrace) end
# node_state: "finish" | "ready"