hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/IB5UKT
--------------------------------
This patch adds a synchronous waiting mechanism for volume unhash in erofs ondemand mode. There are two main reasons:
1) Currently, unhashing a volume relies on its reference count dropping to 0, which partly depends on userspace releasing the anonymous fd. That is, if userspace fails to close the fd properly, the volume cannot be unhashed. The next time a volume with the same name is mounted, the mount waits for the previous volume to be released, but the write semaphore of the corresponding superblock is not released. A task that then traverses the super_blocks list hangs because it cannot obtain the read semaphore:
[mount]
vfs_get_super
  sget_fc
    alloc_super
      down_write_nested                        ---- pin sb->s_umount
    list_add_tail(&s->s_list, &super_blocks)
  erofs_fc_fill_super
    ...
      fscache_hash_volume                      ---- wait for volume

[sync]
ksys_sync
  iterate_supers
    super_lock
      down_read                                ---- hungtask
2) During umount, the object commits its cache directory entry (cachefiles_commit_tmpfile), but this is not synchronized with umount completion. After umount finishes, userspace may observe that the cache directory has not been generated yet, so a subsequent inuse() call from userspace cannot find it.
The solution:
1) For the erofs on-demand loading scenario, "n_hash_cookies" is introduced in "struct fscache_volume". It is incremented whenever a child cookie is hashed and decremented when that cookie is unhashed. When it reaches zero, the waiter is woken so the volume can be unhashed (a simplified sketch of this wait/wake pairing is shown below). The FSCACHE_CACHE_SYNC_VOLUME_UNHASH flag is introduced in "struct fscache_cache" to indicate whether this feature is enabled.
2) cachefiles_free_volume() needs to be called to make the next mount succeed; otherwise -ENODATA would be returned because cache_priv would not be created in the new volume and the object might not be initialized. To prevent use-after-free issues caused by kfree(volume)/kfree(cache), "ref" and "lock" are introduced in "struct cachefiles_volume".
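The following is a minimal illustrative sketch of the wait/wake pairing behind point 1). The helper names are hypothetical and this is not the exact patch code; it only assumes the n_hash_cookies counter that this patch adds to "struct fscache_volume":

  #include <linux/fscache.h>
  #include <linux/wait_bit.h>

  /* Relinquish/umount side: wait until every hashed cookie of the volume is gone. */
  static int example_wait_for_hashed_cookies(struct fscache_volume *volume)
  {
          /* Killable wait, so a stuck userspace cannot wedge umount forever. */
          return wait_var_event_killable(&volume->n_hash_cookies,
                                         atomic_read(&volume->n_hash_cookies) == 0);
  }

  /* Cookie unhash side: dropping the last hashed cookie wakes the waiter above. */
  static void example_cookie_unhashed(struct fscache_volume *volume)
  {
          if (atomic_dec_and_test(&volume->n_hash_cookies))
                  wake_up_var(&volume->n_hash_cookies);
  }

In the patch itself, the wait is performed in __fscache_relinquish_volume() and the wake-up in fscache_unhash_cookie(), as shown in the diff below.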
There are three benefits to this:
1) Unhashing the volume no longer depends on whether the fd is handled correctly by userspace. If the fd is not closed after umount, it does not matter if it continues to be used, because object->file is already NULL: cachefiles_clean_up_object() is called before fscache_unhash_cookie().
2) The cache directory is guaranteed to be generated before umount completes unless the process is interrupted, because cachefiles_commit_tmpfile() is called before fscache_unhash_cookie().
3) Before this patch, calling inuse() on a cache entry after umount could still return -EBUSY, because the ordering of umount and cachefiles_put_directory() was indeterminate. Now, thanks to volume->lock, calling cull/inuse after umount has finished does not return an error.
Fixes: 62ab63352350 ("fscache: Implement volume registration")
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
---
 fs/cachefiles/cache.c         |  2 ++
 fs/cachefiles/interface.c     | 12 ++++++++-
 fs/cachefiles/internal.h      |  4 +++
 fs/cachefiles/volume.c        | 51 ++++++++++++++++++++++++++++++-----
 fs/erofs/fscache.c            |  4 +++
 fs/fscache/cache.c            |  1 -
 fs/fscache/cookie.c           | 28 +++++++++++++++++++
 fs/fscache/volume.c           | 38 +++++++++++++++++++++-----
 include/linux/fscache-cache.h | 28 ++++++++++++++++++-
 include/linux/fscache.h       |  2 +-
 10 files changed, 154 insertions(+), 16 deletions(-)
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 9fb06dc16520..1bbe4393d289 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -367,6 +367,7 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
                                 continue;
                         }
                         list_del_init(&volume->cache_link);
+                        cachefiles_get_volume(volume);
                 }
                 spin_unlock(&cache->object_list_lock);
                 if (!volume)
@@ -374,6 +375,7 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache)
 
                 cachefiles_withdraw_volume(volume);
                 fscache_put_volume(vcookie, fscache_volume_put_withdraw);
+                cachefiles_put_volume(volume);
         }
 
         _leave("");
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 35ba2117a6f6..c1bb52588d65 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -22,7 +22,7 @@ static struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie)
 {
         struct fscache_volume *vcookie = cookie->volume;
-        struct cachefiles_volume *volume = vcookie->cache_priv;
+        struct cachefiles_volume *volume = READ_ONCE(vcookie->cache_priv);
         struct cachefiles_object *object;
 
         _enter("{%s},%x,", vcookie->key, cookie->debug_id);
@@ -38,6 +38,15 @@ struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie)
 
         refcount_set(&object->ref, 1);
 
+        /*
+         * After enabling sync_volume_unhash, fscache_relinquish_volume() may
+         * release cachefiles_volume while fscache_volume->ref is non-zero.
+         * However, cachefiles_object may still access cachefiles_cache through
+         * object->volume->cache (e.g., in cachefiles_ondemand_fd_release).
+         * Therefore, we need to pin cachefiles_volume through the object.
+         */
+        cachefiles_get_volume(volume);
+
         spin_lock_init(&object->lock);
         INIT_LIST_HEAD(&object->cache_link);
         object->volume = volume;
@@ -96,6 +105,7 @@ void cachefiles_put_object(struct cachefiles_object *object,
         cachefiles_ondemand_deinit_obj_info(object);
         cache = object->volume->cache->cache;
         fscache_put_cookie(object->cookie, fscache_cookie_put_object);
+        cachefiles_put_volume(object->volume);
         object->cookie = NULL;
         kmem_cache_free(cachefiles_object_jar, object);
         fscache_uncount_object(cache);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cb9607d643c7..7215e96fc2cf 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -37,9 +37,11 @@ enum cachefiles_content {
  * Cached volume representation.
  */
 struct cachefiles_volume {
+        refcount_t ref;
         struct cachefiles_cache *cache;
         struct list_head cache_link;    /* Link in cache->volumes */
         struct fscache_volume *vcookie; /* The netfs's representation */
+        struct mutex lock;              /* Lock for cachefiles_volume */
         bool dir_has_put;               /* Indicates cache dir has been put */
         struct dentry *dentry;          /* The volume dentry */
         struct dentry *fanout[256];     /* Fanout subdirs */
@@ -406,6 +408,8 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache,
 /*
  * volume.c
  */
+void cachefiles_get_volume(struct cachefiles_volume *volume);
+void cachefiles_put_volume(struct cachefiles_volume *volume);
 void cachefiles_acquire_volume(struct fscache_volume *volume);
 void cachefiles_free_volume(struct fscache_volume *volume);
 void cachefiles_withdraw_volume(struct cachefiles_volume *volume);
diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c
index aa5467335d85..582dd42192b1 100644
--- a/fs/cachefiles/volume.c
+++ b/fs/cachefiles/volume.c
@@ -10,6 +10,19 @@
 #include "internal.h"
 #include <trace/events/fscache.h>
 
+void cachefiles_get_volume(struct cachefiles_volume *volume)
+{
+        refcount_inc(&volume->ref);
+}
+
+void cachefiles_put_volume(struct cachefiles_volume *volume)
+{
+        if (refcount_dec_and_test(&volume->ref)) {
+                mutex_destroy(&volume->lock);
+                kfree(volume);
+        }
+}
+
 /*
  * Allocate and set up a volume representation.  We make sure all the fanout
  * directories are created and pinned.
@@ -33,6 +46,7 @@ void cachefiles_acquire_volume(struct fscache_volume *vcookie)
         volume->vcookie = vcookie;
         volume->cache = cache;
         INIT_LIST_HEAD(&volume->cache_link);
+        mutex_init(&volume->lock);
 
         cachefiles_begin_secure(cache, &saved_cred);
 
@@ -77,7 +91,17 @@ void cachefiles_acquire_volume(struct fscache_volume *vcookie)
 
         cachefiles_end_secure(cache, saved_cred);
 
-        vcookie->cache_priv = volume;
+        /*
+         * The purpose of introducing volume->ref is twofold:
+         * 1) To allow cachefiles_object to pin cachefiles_volume.
+         * 2) To handle the concurrency between cachefiles_free_volume() and
+         * cachefiles_withdraw_volume() introduced by enabling sync_unhash
+         * volume, preventing the former from releasing cachefiles_volume and
+         * causing a use-after-free in the latter.
+         */
+        refcount_set(&volume->ref, 1);
+        /* Prevent writing vcookie->cache_priv before writing volume->ref. */
+        smp_store_release(&vcookie->cache_priv, volume);
         n_accesses = atomic_inc_return(&vcookie->n_accesses); /* Stop wakeups on dec-to-0 */
         trace_fscache_access_volume(vcookie->debug_id, 0,
                                     refcount_read(&vcookie->ref),
@@ -98,6 +122,7 @@
 error_name:
         kfree(name);
 error_vol:
+        mutex_destroy(&volume->lock);
         kfree(volume);
         cachefiles_end_secure(cache, saved_cred);
 }
@@ -111,26 +136,40 @@ static void __cachefiles_free_volume(struct cachefiles_volume *volume)
_enter("");
- if (volume->dir_has_put) + mutex_lock(&volume->lock); + if (volume->dir_has_put) { + mutex_unlock(&volume->lock); return; + }
volume->dir_has_put = true;
for (i = 0; i < 256; i++) cachefiles_put_directory(volume->fanout[i]); cachefiles_put_directory(volume->dentry); - kfree(volume); + mutex_unlock(&volume->lock); }
void cachefiles_free_volume(struct fscache_volume *vcookie) { struct cachefiles_volume *volume = vcookie->cache_priv;
-        spin_lock(&volume->cache->object_list_lock);
-        list_del_init(&volume->cache_link);
-        spin_unlock(&volume->cache->object_list_lock);
+        /*
+         * Prevents access to the cachefiles_cache that has been freed, caused
+         * by the concurrency between cachefiles_free_volume() and
+         * cachefiles_daemon_release(); the latter may kfree(cache).
+         */
+        mutex_lock(&volume->lock);
+        if (!volume->dir_has_put) {
+                spin_lock(&volume->cache->object_list_lock);
+                list_del_init(&volume->cache_link);
+                spin_unlock(&volume->cache->object_list_lock);
+        }
+        mutex_unlock(&volume->lock);
+
         vcookie->cache_priv = NULL;
         __cachefiles_free_volume(volume);
+        cachefiles_put_volume(volume);
 }
 
 void cachefiles_withdraw_volume(struct cachefiles_volume *volume)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index afc37c9029ce..7bcd1d261e7d 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -5,6 +5,7 @@
  */
 #include <linux/pseudo_fs.h>
 #include <linux/fscache.h>
+#include <linux/fscache-cache.h>
 #include "internal.h"
 
 static DEFINE_MUTEX(erofs_domain_list_lock);
@@ -366,6 +367,9 @@ static int erofs_fscache_register_volume(struct super_block *sb)
                 erofs_err(sb, "failed to register volume for %s", name);
                 ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
                 volume = NULL;
+        } else {
+                /* enable synchronous unhashing for the associated volumes */
+                fscache_set_sync_volume_unhash(volume->cache);
         }
 
         sbi->volume = volume;
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 9397ed39b0b4..2ff7ea90f13f 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -213,7 +213,6 @@ void fscache_relinquish_cache(struct fscache_cache *cache)
                 fscache_cache_put_prep_failed :
                 fscache_cache_put_relinquish;
 
-        cache->ops = NULL;
         cache->cache_priv = NULL;
         fscache_set_cache_state(cache, FSCACHE_CACHE_IS_NOT_PRESENT);
         fscache_put_cache(cache, where);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d4d4b3a8b106..a42985d5033c 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -319,6 +319,26 @@ static bool fscache_cookie_same(const struct fscache_cookie *a,
 
 static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1);
 
+/*
+ * Increase the volume->n_hash_cookies count. Must be called when a cookie
+ * associated with the volume is added on the hash link.
+ */
+static void fscache_increase_volume_hash_cookies(struct fscache_volume *volume)
+{
+        atomic_inc(&volume->n_hash_cookies);
+}
+
+/*
+ * Decrease the volume->n_hash_cookies count. Must be called when a cookie
+ * associated with the volume is unhashed. When the count reaches 0, the volume
+ * waiting to be unhashed needs to be awakened.
+ */
+static void fscache_decrease_volume_hash_cookies(struct fscache_volume *volume)
+{
+        if (atomic_dec_and_test(&volume->n_hash_cookies))
+                wake_up_var(&volume->n_hash_cookies);
+}
+
 /*
  * Allocate a cookie.
  */
@@ -425,6 +445,9 @@ static bool fscache_hash_cookie(struct fscache_cookie *candidate)
         set_bit(FSCACHE_COOKIE_IS_HASHED, &candidate->flags);
         hlist_bl_unlock(h);
 
+        if (fscache_test_sync_volume_unhash(candidate->volume->cache))
+                fscache_increase_volume_hash_cookies(candidate->volume);
+
         if (wait_for) {
                 fscache_wait_on_collision(candidate, wait_for);
                 fscache_put_cookie(wait_for, fscache_cookie_put_hash_collision);
@@ -933,6 +956,7 @@ static void fscache_cookie_drop_from_lru(struct fscache_cookie *cookie)
 static void fscache_unhash_cookie(struct fscache_cookie *cookie)
 {
         struct hlist_bl_head *h;
+        struct fscache_volume *volume = cookie->volume;
         unsigned int bucket;
 
         bucket = cookie->key_hash & (ARRAY_SIZE(fscache_cookie_hash) - 1);
@@ -942,6 +966,10 @@ static void fscache_unhash_cookie(struct fscache_cookie *cookie)
         hlist_bl_del(&cookie->hash_link);
         clear_bit(FSCACHE_COOKIE_IS_HASHED, &cookie->flags);
         hlist_bl_unlock(h);
+
+        if (fscache_test_sync_volume_unhash(volume->cache))
+                fscache_decrease_volume_hash_cookies(volume);
+
         fscache_stat(&fscache_n_relinquishes_dropped);
 }
 
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index 89efb8071a75..4b371d8fafed 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -389,27 +389,35 @@ static void fscache_unhash_volume(struct fscache_volume *volume)
         h = &fscache_volume_hash[bucket];
 
         hlist_bl_lock(h);
-        hlist_bl_del(&volume->hash_link);
+        hlist_bl_del_init(&volume->hash_link);
         if (test_bit(FSCACHE_VOLUME_COLLIDED_WITH, &volume->flags))
                 fscache_wake_pending_volume(volume, h);
         hlist_bl_unlock(h);
 }
 
 /*
- * Drop a cache's volume attachments.
+ * Clear the volume->cache_priv.
  */
-static void fscache_free_volume(struct fscache_volume *volume)
+static void fscache_clear_volume_priv(struct fscache_volume *volume)
 {
-        struct fscache_cache *cache = volume->cache;
-
         if (volume->cache_priv) {
                 __fscache_begin_volume_access(volume, NULL,
                                               fscache_access_relinquish_volume);
                 if (volume->cache_priv)
-                        cache->ops->free_volume(volume);
+                        volume->cache->ops->free_volume(volume);
                 fscache_end_volume_access(volume, NULL,
                                           fscache_access_relinquish_volume_end);
         }
+}
+
+/*
+ * Drop a cache's volume attachments.
+ */
+static void fscache_free_volume(struct fscache_volume *volume)
+{
+        struct fscache_cache *cache = volume->cache;
+
+        fscache_clear_volume_priv(volume);
 
         down_write(&fscache_addremove_sem);
         list_del_init(&volume->proc_link);
@@ -461,6 +469,24 @@ void __fscache_relinquish_volume(struct fscache_volume *volume,
                 memcpy(volume->coherency, coherency_data, volume->coherency_len);
         }
 
+        if (fscache_test_sync_volume_unhash(volume->cache)) {
+                int ret = wait_var_event_killable(&volume->n_hash_cookies,
+                                (atomic_read_acquire(&volume->n_hash_cookies) == 0));
+                if (ret == -ERESTARTSYS) {
+                        pr_info("Waiting for unhashing has been interrupted!" \
+                                "(n_hash_cookies %d)\n",
+                                atomic_read(&volume->n_hash_cookies));
+                } else if (!hlist_bl_unhashed(&volume->hash_link)) {
+                        /*
+                         * To ensure that the corresponding cache entries can
+                         * be created on the next mount, thereby completing
+                         * the mount successfully.
+                         */
+                        fscache_clear_volume_priv(volume);
+                        fscache_unhash_volume(volume);
+                }
+        }
+
         fscache_put_volume(volume, fscache_volume_put_relinquish);
 }
 EXPORT_SYMBOL(__fscache_relinquish_volume);
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 151043af41a1..642fc1276c63 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -44,8 +44,9 @@ struct fscache_cache {
         unsigned int debug_id;
         enum fscache_cache_state state;
         char *name;
+        KABI_USE(1, unsigned long flags)
+#define FSCACHE_CACHE_SYNC_VOLUME_UNHASH 0 /* Mark to enable volume sync unhash */
 
-        KABI_RESERVE(1)
         KABI_RESERVE(2)
         KABI_RESERVE(3)
         KABI_RESERVE(4)
@@ -199,6 +200,31 @@ static inline void fscache_wait_for_objects(struct fscache_cache *cache)
                    atomic_read(&cache->object_count) == 0);
 }
 
+/**
+ * fscache_set_sync_volume_unhash - Enable the synchronization unhash mechanism
+ * @cache: The cache to set
+ *
+ * Enable the volume synchronization wait-unhash mechanism. The volume will be
+ * unhashed in advance, without relying on the reference count reaching zero.
+ */
+static inline void fscache_set_sync_volume_unhash(struct fscache_cache *cache)
+{
+        set_bit(FSCACHE_CACHE_SYNC_VOLUME_UNHASH, &cache->flags);
+}
+
+/**
+ * fscache_test_sync_volume_unhash - Check whether the synchronization
+ * unhash mechanism is enabled
+ * @cache: The cache to query
+ *
+ * Indicate whether the volume synchronization wait-unhash mechanism is
+ * currently enabled.
+ */
+static inline bool fscache_test_sync_volume_unhash(struct fscache_cache *cache)
+{
+        return test_bit(FSCACHE_CACHE_SYNC_VOLUME_UNHASH, &cache->flags);
+}
+
 #ifdef CONFIG_FSCACHE_STATS
 extern atomic_t fscache_n_read;
 extern atomic_t fscache_n_write;
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 3afe007834d1..99cb06ee3f65 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -90,7 +90,7 @@ struct fscache_volume {
 #define FSCACHE_VOLUME_CREATING         4 /* Volume is being created on disk */
         u8 coherency_len;               /* Length of the coherency data */
 
-        KABI_RESERVE(1)
+        KABI_USE(1, atomic_t n_hash_cookies)    /* Number of hashed data cookies in volume */
         KABI_RESERVE(2)
         KABI_RESERVE(3)
         KABI_RESERVE(4)