1. use "e.inspect_with_backtrace" when logging rescued exceptions, so that
the log contains the backtrace in addition to the error message
2. add the data being processed (event, job_id, testbox, message data) to the
log entry when an error is reported
Signed-off-by: Wu Zhende <wuzhende666(a)163.com>
---
src/lib/lifecycle.cr | 58 ++++++++++++++------------
src/lib/mq.cr | 2 +-
src/lib/sched.cr | 14 +++----
src/monitoring/amqp.cr | 12 +++++-
src/monitoring/filter.cr | 6 ++-
src/scheduler/close_job.cr | 2 +-
src/scheduler/download_file.cr | 2 +-
src/scheduler/find_next_job_boot.cr | 2 +-
src/scheduler/request_cluster_state.cr | 2 +-
9 files changed, 59 insertions(+), 41 deletions(-)
diff --git a/src/lib/lifecycle.cr b/src/lib/lifecycle.cr
index e56a681..fe870de 100644
--- a/src/lib/lifecycle.cr
+++ b/src/lib/lifecycle.cr
@@ -45,7 +45,7 @@ class Lifecycle
rescue e
@log.warn({
"resource" => "/alive",
- "message" => e.to_s
+ "message" => e.inspect_with_backtrace
}.to_json)
end
@@ -74,7 +74,7 @@ class Lifecycle
rescue e
@log.warn({
"resource" => "init_from_es",
- "message" => e.to_s
+ "message" => e.inspect_with_backtrace
}.to_json)
end
@@ -118,30 +118,34 @@ class Lifecycle
def mq_event_loop
puts "deal job events"
- q = @mq.ch.queue("job_mq", durable: false)
- q.subscribe(no_ack: false) do |msg|
- event = JSON.parse(msg.body_io.to_s)
- job_state = event["job_state"]?
-
- case job_state
- when "boot"
- on_job_boot(event)
- when "close"
- on_job_close(event)
- when "abnormal"
- on_abnormal_job(event)
- when "crash"
- on_job_crash(event)
- else
- on_other_job(event)
+ event = JSON::Any.new(nil)
+ begin
+ q = @mq.ch.queue("job_mq", durable: false)
+ q.subscribe(no_ack: false) do |msg|
+ event = JSON.parse(msg.body_io.to_s)
+ job_state = event["job_state"]?
+
+ case job_state
+ when "boot"
+ on_job_boot(event)
+ when "close"
+ on_job_close(event)
+ when "abnormal"
+ on_abnormal_job(event)
+ when "crash"
+ on_job_crash(event)
+ else
+ on_other_job(event)
+ end
+ @mq.ch.basic_ack(msg.delivery_tag)
end
- @mq.ch.basic_ack(msg.delivery_tag)
+ rescue e
+ @log.warn({
+ "resource" => "mq_event_loop",
+ "message" => e.inspect_with_backtrace,
+ "event" => event
+ }.to_json)
end
- rescue e
- @log.warn({
- "resource" => "mq_event_loop",
- "message" => e.to_s
- }.to_json)
end
def on_other_job(event)
@@ -269,7 +273,8 @@ class Lifecycle
rescue e
@log.warn({
"resource" => "timeout_job_loop",
- "message" => e.to_s
+ "message" => e.inspect_with_backtrace,
+ "job_id" => dead_job_id
}.to_json)
end
end
@@ -285,7 +290,8 @@ class Lifecycle
rescue e
@log.warn({
"resource" => "timeout_machine_loop",
- "message" => e.to_s
+ "message" => e.inspect_with_backtrace,
+ "testbox" => dead_machine_name
}.to_json)
end
end
diff --git a/src/lib/mq.cr b/src/lib/mq.cr
index 1c7f4b7..67fcee2 100644
--- a/src/lib/mq.cr
+++ b/src/lib/mq.cr
@@ -28,7 +28,7 @@ class MQClient
}.to_json)
rescue e
@log.warn({
- "msg" => e.to_s,
+ "msg" => e.inspect_with_backtrace,
"source" => "mq_client"
}.to_json)
end
diff --git a/src/lib/sched.cr b/src/lib/sched.cr
index 7a9e4d8..43fe47b 100644
--- a/src/lib/sched.cr
+++ b/src/lib/sched.cr
@@ -56,7 +56,7 @@ class Sched
def alive(version)
"LKP Alive! The time is #{Time.local}, version = #{version}"
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
def normalize_mac(mac : String)
@@ -72,7 +72,7 @@ class Sched
"No yet!"
end
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
def del_host_mac
@@ -84,7 +84,7 @@ class Sched
"No yet!"
end
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
def set_host2queues
@@ -96,7 +96,7 @@ class Sched
"No yet!"
end
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
def del_host2queues
@@ -108,7 +108,7 @@ class Sched
"No yet!"
end
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
def get_time
@@ -143,7 +143,7 @@ class Sched
hash["testbox"] = testbox
@log.info(hash.to_json)
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
def send_mq_msg(job_state)
@@ -223,7 +223,7 @@ class Sched
@log.info(%({"job_id": "#{job_id}", "state": "set ssh port", "ssh_port": "#{ssh_port}", "tbox_name": "#{testbox}"}))
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
private def query_consumable_keys(shortest_queue_name)
diff --git a/src/monitoring/amqp.cr b/src/monitoring/amqp.cr
index f23113a..6ba7b1b 100644
--- a/src/monitoring/amqp.cr
+++ b/src/monitoring/amqp.cr
@@ -29,7 +29,11 @@ class MessageQueueClient
begin
filter.filter_msg(msg.body_io)
rescue e
- @log.warn("filter message failed: #{e}")
+ @log.warn({
+ "resource" => "filter_message",
+ "message" => e.inspect_with_backtrace,
+ "data" => msg.body_io.to_s
+ }.to_json)
end
end
end
@@ -42,7 +46,11 @@ class MessageQueueClient
filter_msg(conn, filter, exchange_name, queue_name)
end
rescue e
- @log.warn("monitoring_message_queue failed: #{e}")
+ @log.warn({
+ "resource" => "monitoring_message_queue",
+ "message" => e.inspect_with_backtrace,
+ "data" => "#{exchange_name}, #{queue_name}"
+ }.to_json)
sleep 5
end
end
diff --git a/src/monitoring/filter.cr b/src/monitoring/filter.cr
index b663f11..fe41571 100644
--- a/src/monitoring/filter.cr
+++ b/src/monitoring/filter.cr
@@ -47,7 +47,11 @@ class Filter
@hash[query].each do |socket|
socket.send msg.to_json
rescue e
- @log.warn("send msg failed: #{e}")
+ @log.warn({
+ "resource" => "send_msg",
+ "message" => e.inspect_with_backtrace,
+ "data" => "query: #{query}, mas: #{msg}"
+ }.to_json)
remove_filter_rule(query, socket)
end
end
diff --git a/src/scheduler/close_job.cr b/src/scheduler/close_job.cr
index a112a38..0f051e6 100644
--- a/src/scheduler/close_job.cr
+++ b/src/scheduler/close_job.cr
@@ -30,7 +30,7 @@ class Sched
job_state ||= "complete"
@log.info(%({"job_id": "#{job_id}", "job_state": "#{job_state}"}))
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
ensure
source = @env.params.query["source"]?
if source != "lifecycle"
diff --git a/src/scheduler/download_file.cr b/src/scheduler/download_file.cr
index 50e27ba..48ce55a 100644
--- a/src/scheduler/download_file.cr
+++ b/src/scheduler/download_file.cr
@@ -11,6 +11,6 @@ class Sched
send_file @env, file_path
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
end
diff --git a/src/scheduler/find_next_job_boot.cr b/src/scheduler/find_next_job_boot.cr
index 091c41a..15daebf 100644
--- a/src/scheduler/find_next_job_boot.cr
+++ b/src/scheduler/find_next_job_boot.cr
@@ -15,6 +15,6 @@ class Sched
response
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
end
diff --git a/src/scheduler/request_cluster_state.cr b/src/scheduler/request_cluster_state.cr
index e8ef6f3..e9be3cf 100644
--- a/src/scheduler/request_cluster_state.cr
+++ b/src/scheduler/request_cluster_state.cr
@@ -69,7 +69,7 @@ class Sched
# show cluster state
return @redis.hash_get("sched/cluster_state", cluster_id)
rescue e
- @log.warn(e)
+ @log.warn(e.inspect_with_backtrace)
end
# node_state: "finish" | "ready"
--
2.23.0