From: Mukesh Ojha quic_mojha@quicinc.com
stable inclusion from stable-v4.19.302 commit 7c452e5f9fbfcb710c6154d54d70688b4dc63431 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8T1U8
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
----------------------------------------------
[ Upstream commit 01daccf748323dfc61112f474cf2ba81015446b0 ]
In following scenario(diagram), when one thread X running dev_coredumpm() adds devcd device to the framework which sends uevent notification to userspace and another thread Y reads this uevent and call to devcd_data_write() which eventually try to delete the queued timer that is not initialized/queued yet.
So, debug object reports some warning and in the meantime, timer is initialized and queued from X path. and from Y path, it gets reinitialized again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
To fix this, introduce mutex and a boolean flag to serialize the behaviour.
cpu0(X) cpu1(Y)
dev_coredump() uevent sent to user space device_add() ======================> user space process Y reads the uevents writes to devcd fd which results into writes to
devcd_data_write() mod_delayed_work() try_to_grab_pending() del_timer() debug_assert_init() INIT_DELAYED_WORK() schedule_delayed_work() debug_object_fixup() timer_fixup_assert_init() timer_setup() do_init_timer() /* Above call reinitializes the timer to timer->entry.pprev=NULL and this will be checked later in timer_pending() call. */ timer_pending() !hlist_unhashed_lockless(&timer->entry) !h->pprev /* del_timer() checks h->pprev and finds it to be NULL due to which try_to_grab_pending() stucks. */
Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.co... Signed-off-by: Mukesh Ojha quic_mojha@quicinc.com Link: https://lore.kernel.org/r/1663073424-13663-1-git-send-email-quic_mojha@quici... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Stable-dep-of: af54d778a038 ("devcoredump: Send uevent once devcd is ready") Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- drivers/base/devcoredump.c | 83 +++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index 22deb0dd8a8c..7a5e4971943c 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -29,6 +29,47 @@ struct devcd_entry { struct device devcd_dev; void *data; size_t datalen; + /* + * Here, mutex is required to serialize the calls to del_wk work between + * user/kernel space which happens when devcd is added with device_add() + * and that sends uevent to user space. User space reads the uevents, + * and calls to devcd_data_write() which try to modify the work which is + * not even initialized/queued from devcoredump. + * + * + * + * cpu0(X) cpu1(Y) + * + * dev_coredump() uevent sent to user space + * device_add() ======================> user space process Y reads the + * uevents writes to devcd fd + * which results into writes to + * + * devcd_data_write() + * mod_delayed_work() + * try_to_grab_pending() + * del_timer() + * debug_assert_init() + * INIT_DELAYED_WORK() + * schedule_delayed_work() + * + * + * Also, mutex alone would not be enough to avoid scheduling of + * del_wk work after it get flush from a call to devcd_free() + * mentioned as below. + * + * disabled_store() + * devcd_free() + * mutex_lock() devcd_data_write() + * flush_delayed_work() + * mutex_unlock() + * mutex_lock() + * mod_delayed_work() + * mutex_unlock() + * So, delete_work flag is required. + */ + struct mutex mutex; + bool delete_work; struct module *owner; ssize_t (*read)(char *buffer, loff_t offset, size_t count, void *data, size_t datalen); @@ -88,7 +129,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj, struct device *dev = kobj_to_dev(kobj); struct devcd_entry *devcd = dev_to_devcd(dev);
- mod_delayed_work(system_wq, &devcd->del_wk, 0); + mutex_lock(&devcd->mutex); + if (!devcd->delete_work) { + devcd->delete_work = true; + mod_delayed_work(system_wq, &devcd->del_wk, 0); + } + mutex_unlock(&devcd->mutex);
return count; } @@ -116,7 +162,12 @@ static int devcd_free(struct device *dev, void *data) { struct devcd_entry *devcd = dev_to_devcd(dev);
+ mutex_lock(&devcd->mutex); + if (!devcd->delete_work) + devcd->delete_work = true; + flush_delayed_work(&devcd->del_wk); + mutex_unlock(&devcd->mutex); return 0; }
@@ -126,6 +177,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr, return sysfs_emit(buf, "%d\n", devcd_disabled); }
+/* + * + * disabled_store() worker() + * class_for_each_device(&devcd_class, + * NULL, NULL, devcd_free) + * ... + * ... + * while ((dev = class_dev_iter_next(&iter)) + * devcd_del() + * device_del() + * put_device() <- last reference + * error = fn(dev, data) devcd_dev_release() + * devcd_free(dev, data) kfree(devcd) + * mutex_lock(&devcd->mutex); + * + * + * In the above diagram, It looks like disabled_store() would be racing with parallely + * running devcd_del() and result in memory abort while acquiring devcd->mutex which + * is called after kfree of devcd memory after dropping its last reference with + * put_device(). However, this will not happens as fn(dev, data) runs + * with its own reference to device via klist_node so it is not its last reference. + * so, above situation would not occur. + */ + static ssize_t disabled_store(struct class *class, struct class_attribute *attr, const char *buf, size_t count) { @@ -291,13 +366,16 @@ void dev_coredumpm(struct device *dev, struct module *owner, devcd->read = read; devcd->free = free; devcd->failing_dev = get_device(dev); + devcd->delete_work = false;
+ mutex_init(&devcd->mutex); device_initialize(&devcd->devcd_dev);
dev_set_name(&devcd->devcd_dev, "devcd%d", atomic_inc_return(&devcd_count)); devcd->devcd_dev.class = &devcd_class;
+ mutex_lock(&devcd->mutex); if (device_add(&devcd->devcd_dev)) goto put_device;
@@ -311,10 +389,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT); - + mutex_unlock(&devcd->mutex); return; put_device: put_device(&devcd->devcd_dev); + mutex_unlock(&devcd->mutex); put_module: module_put(owner); free: