From: Andrey Grodzovsky andrey.grodzovsky@amd.com
mainline inclusion from mainline-v5.17-rc1 commit 542cff7893a37445f98ece26aeb3c9c1055e9ea4 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/IB3ULB
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Probelm: Singlaning one sched fence from within another's sched fence singal callback generates lockdep splat because the both have same lockdep class of their fence->lock
Fix: Fix bellow stack by rescheduling to irq work of signaling and killing of jobs that left when entity is killed.
[11176.741181] dump_stack+0x10/0x12 [11176.741186] __lock_acquire.cold+0x208/0x2df [11176.741197] lock_acquire+0xc6/0x2d0 [11176.741204] ? dma_fence_signal+0x28/0x80 [11176.741212] _raw_spin_lock_irqsave+0x4d/0x70 [11176.741219] ? dma_fence_signal+0x28/0x80 [11176.741225] dma_fence_signal+0x28/0x80 [11176.741230] drm_sched_fence_finished+0x12/0x20 [gpu_sched] [11176.741240] drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched] [11176.741248] dma_fence_signal_timestamp_locked+0xac/0x1a0 [11176.741254] dma_fence_signal+0x3b/0x80 [11176.741260] drm_sched_fence_finished+0x12/0x20 [gpu_sched] [11176.741268] drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched] [11176.741277] drm_sched_job_done_cb+0x12/0x20 [gpu_sched] [11176.741284] dma_fence_signal_timestamp_locked+0xac/0x1a0 [11176.741290] dma_fence_signal+0x3b/0x80 [11176.741296] amdgpu_fence_process+0xd1/0x140 [amdgpu] [11176.741504] sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu] [11176.741731] amdgpu_irq_dispatch+0xce/0x250 [amdgpu] [11176.741954] amdgpu_ih_process+0x81/0x100 [amdgpu] [11176.742174] amdgpu_irq_handler+0x26/0xa0 [amdgpu] [11176.742393] __handle_irq_event_percpu+0x4f/0x2c0 [11176.742402] handle_irq_event_percpu+0x33/0x80 [11176.742408] handle_irq_event+0x39/0x60 [11176.742414] handle_edge_irq+0x93/0x1d0 [11176.742419] __common_interrupt+0x50/0xe0 [11176.742426] common_interrupt+0x80/0x90
Signed-off-by: Andrey Grodzovsky andrey.grodzovsky@amd.com Suggested-by: Daniel Vetter daniel.vetter@ffwll.ch Suggested-by: Christian König christian.koenig@amd.com Tested-by: Christian König christian.koenig@amd.com Reviewed-by: Christian König christian.koenig@amd.com Link: https://www.spinics.net/lists/dri-devel/msg321250.html
Conflicts: context conflicts in below files: drivers/gpu/drm/scheduler/sched_entity.c include/drm/gpu_scheduler.h
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/gpu/drm/scheduler/sched_entity.c | 15 ++++++++++++--- include/drm/gpu_scheduler.h | 12 +++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index 1b1b770dc949..fa2771b9bac8 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -189,6 +189,16 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout) } EXPORT_SYMBOL(drm_sched_entity_flush);
+static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk) +{ + struct drm_sched_job *job = container_of(wrk, typeof(*job), work); + + drm_sched_fence_finished(job->s_fence); + WARN_ON(job->s_fence->parent); + job->sched->ops->free_job(job); +} + + /** * drm_sched_entity_kill_jobs - helper for drm_sched_entity_kill_jobs * @@ -203,9 +213,8 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, struct drm_sched_job *job = container_of(cb, struct drm_sched_job, finish_cb);
- drm_sched_fence_finished(job->s_fence); - WARN_ON(job->s_fence->parent); - job->sched->ops->free_job(job); + init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work); + irq_work_queue(&job->work); }
static struct dma_fence * diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 842df37f43be..56562016cefd 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -28,6 +28,7 @@ #include <linux/dma-fence.h> #include <linux/completion.h> #include <linux/xarray.h> +#include <linux/irq_work.h>
#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
@@ -194,7 +195,16 @@ struct drm_sched_job { struct spsc_node queue_node; struct drm_gpu_scheduler *sched; struct drm_sched_fence *s_fence; - struct dma_fence_cb finish_cb; + + /* + * work is used only after finish_cb has been used and will not be + * accessed anymore. + */ + union { + struct dma_fence_cb finish_cb; + struct irq_work work; + }; + struct list_head node; uint64_t id; atomic_t karma;