From: Jan Kara jack@suse.cz
mainline inclusion from mainline-v5.14-rc1 commit a921c655f2033dd1ce1379128efe881dda23ea37 category: bugfix bugzilla: 185777 https://gitee.com/openeuler/kernel/issues/I4LM14 CVE: NA
---------------------------
Currently, bfq does very little in bfq_requests_merged() and handles all the request cleanup in bfq_finish_requeue_request() called from blk_mq_free_request(). That is currently safe only because blk_mq_free_request() is called shortly after bfq_requests_merged() while bfqd->lock is still held. However, to fix a lock inversion between bfqd->lock and ioc->lock, we need to call blk_mq_free_request() after dropping bfqd->lock. That would mean that an already merged request could be seen by other processes inside bfq queues and possibly dispatched to the device, which is wrong. So move cleanup of the request from bfq_finish_requeue_request() to bfq_requests_merged().
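To make the ordering concrete, here is a hedged sketch of the flow this patch prepares for (call sites are paraphrased, not literal kernel code; only the lock/free ordering matters):

    spin_lock_irq(&bfqd->lock);
    attempt_merge(q, rq, next);     /* invokes bfq_requests_merged(), which
                                     * now removes 'next' from the scheduler
                                     * while bfqd->lock is still held */
    spin_unlock_irq(&bfqd->lock);

    /*
     * The following patch moves the free out here. Without the removal
     * above, 'next' would still sit in bfq's queues during this window
     * and could be dispatched even though it was already merged.
     */
    blk_mq_free_request(next);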
Acked-by: Paolo Valente paolo.valente@linaro.org Signed-off-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210623093634.27879-2-jack@suse.cz Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: in bfq_finish_requeue_request(), the hulk tree carries the line atomic_dec(&rq->mq_hctx->elevator_queued);, which conflicts with this change. Signed-off-by: zhangwensheng zhangwensheng5@huawei.com Reviewed-by: qiulaibin qiulaibin@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/bfq-iosched.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fd3c23d516b8..27e01b4cd528 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2326,7 +2326,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next);
if (!bfqq) - return; + goto remove;
/* * If next and rq belong to the same bfq_queue and next is older @@ -2349,6 +2349,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq;
bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } }
/* Must be called with bfqq != NULL */ @@ -5901,6 +5909,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags;
/* * rq either is not associated with any icq, or is an already @@ -5918,40 +5927,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags);
+ spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq);
bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); atomic_dec(&rq->mq_hctx->elevator_queued); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags);
/* * Reset private fields. In case of a requeue, this allows
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-v5.14-rc1 commit fd2ef39cc9a6b9c4c41864ac506906c52f94b06a category: bugfix bugzilla: 185777 https://gitee.com/openeuler/kernel/issues/I4LM14 CVE: NA
------------------------------
Lockdep complains about lock inversion between ioc->lock and bfqd->lock:
bfqd -> ioc:
 put_io_context+0x33/0x90                 -> ioc->lock grabbed
 blk_mq_free_request+0x51/0x140
 blk_put_request+0xe/0x10
 blk_attempt_req_merge+0x1d/0x30
 elv_attempt_insert_merge+0x56/0xa0
 blk_mq_sched_try_insert_merge+0x4b/0x60
 bfq_insert_requests+0x9e/0x18c0          -> bfqd->lock grabbed
 blk_mq_sched_insert_requests+0xd6/0x2b0
 blk_mq_flush_plug_list+0x154/0x280
 blk_finish_plug+0x40/0x60
 ext4_writepages+0x696/0x1320
 do_writepages+0x1c/0x80
 __filemap_fdatawrite_range+0xd7/0x120
 sync_file_range+0xac/0xf0
ioc->bfqd:
 bfq_exit_icq+0xa3/0xe0                   -> bfqd->lock grabbed
 put_io_context_active+0x78/0xb0          -> ioc->lock grabbed
 exit_io_context+0x48/0x50
 do_exit+0x7e9/0xdd0
 do_group_exit+0x54/0xc0
To avoid this inversion, we change blk_mq_sched_try_insert_merge() to not free the merged request but rather leave that up to the caller, similarly to blk_mq_sched_try_merge(). And in bfq_insert_requests() we make sure to free all the merged requests after dropping bfqd->lock.
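Condensed, the caller-side pattern the hunks below institute (a sketch of the real code in the diff):

    LIST_HEAD(free);

    spin_lock_irq(&bfqd->lock);
    if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
            /* merged requests were only collected, not freed */
            spin_unlock_irq(&bfqd->lock);
            /* safe to take ioc->lock now: bfqd->lock is dropped */
            blk_mq_free_requests(&free);
            return;
    }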
Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Reviewed-by: Ming Lei ming.lei@redhat.com Acked-by: Paolo Valente paolo.valente@linaro.org Signed-off-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20210623093634.27879-3-jack@suse.cz Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: 1. the mainline change touches include/linux/elevator.h; 2. the mainline change touches block/mq-deadline-main.c, which corresponds to block/mq-deadline.c here; Signed-off-by: zhangwensheng zhangwensheng5@huawei.com Reviewed-by: qiulaibin qiulaibin@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/bfq-iosched.c | 6 ++++-- block/blk-merge.c | 19 ++++++++----------- block/blk-mq-sched.c | 5 +++-- block/blk-mq-sched.h | 3 ++- block/blk-mq.h | 11 +++++++++++ block/blk.h | 2 +- block/elevator.c | 11 ++++++++--- block/mq-deadline.c | 5 ++++- include/linux/elevator.h | 3 ++- 9 files changed, 43 insertions(+), 22 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 27e01b4cd528..aa1a808fa072 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2235,9 +2235,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
+ spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock);
return ret; } @@ -5508,14 +5508,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free);
#ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; }
diff --git a/block/blk-merge.c b/block/blk-merge.c index 26f4bcc10de9..6518e0ae2835 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -831,18 +831,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; }
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. + */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); }
bool blk_rq_merge_ok(struct request *rq, struct bio *bio) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a3266541bd06..606bef13f1c2 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -381,9 +381,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; }
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0476360f05f1..15f3d611db10 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -12,7 +12,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
diff --git a/block/blk-mq.h b/block/blk-mq.h index 6f87c0681443..7f3194657dff 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -283,6 +283,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; }
+/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. diff --git a/block/blk.h b/block/blk.h index 3ef4472b9a83..cd39fd0c93f1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -235,7 +235,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 27eb70ec277a..65dfc7559a36 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -353,9 +353,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. */ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -366,8 +368,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. */ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + }
if (blk_queue_noxmerges(q)) return false; @@ -381,6 +385,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break;
+ list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index e4e90761eab3..43994cce1eb2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -489,6 +489,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + LIST_HEAD(free);
/* * This may be a requeue of a write request that has locked its @@ -496,8 +497,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq);
- if (blk_mq_sched_try_insert_merge(q, rq)) + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + }
blk_mq_sched_request_inserted(rq);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h index bc26b4e11f62..0bb7489e0cfb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *);
From: Keith Busch kbusch@kernel.org
mainline inclusion from mainline-v5.14-rc1 commit fb9b16e15cd70e21d8af7f03d700deb9509c2ce8 category: bugfix bugzilla: 185778 https://gitee.com/openeuler/kernel/issues/I4LM14 CVE: NA
-----------------------------------------
The synchronous blk_execute_rq() had not provided a way for its callers to know if its request was successful or not. Return the blk_status_t result of the request.
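A hedged caller sketch of what the new return value enables (the surrounding driver function is hypothetical; blk_get_request(), blk_put_request() and blk_status_to_errno() are existing kernel helpers):

    static int my_driver_sync_cmd(struct request_queue *q)
    {
            struct request *rq;
            blk_status_t status;

            rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
            if (IS_ERR(rq))
                    return PTR_ERR(rq);

            /* blk_execute_rq() used to return void; callers had to
             * plumb their own completion data to learn the outcome */
            status = blk_execute_rq(q, NULL, rq, 0 /* at_head */);
            blk_put_request(rq);

            return blk_status_to_errno(status);
    }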
Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Ming Lei ming.lei@redhat.com Signed-off-by: Keith Busch kbusch@kernel.org Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Link: https://lore.kernel.org/r/20210610214437.641245-4-kbusch@kernel.org Signed-off-by: Jens Axboe axboe@kernel.dk
Conflict: 1. in blkdev.h and blk-exec.c, the return value of blk_execute_rq() changes; 2. the blk_execute_rq() parameters differ from mainline; Signed-off-by: zhangwensheng zhangwensheng5@huawei.com Reviewed-by: qiulaibin qiulaibin@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-exec.c | 7 +++++-- include/linux/blkdev.h | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/block/blk-exec.c b/block/blk-exec.c index 85324d53d072..b2676de4c6a5 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -21,7 +21,7 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error) { struct completion *waiting = rq->end_io_data;
- rq->end_io_data = NULL; + rq->end_io_data = (void *)(uintptr_t)error;
/* * complete last, if this is a stack request the process (and thus @@ -75,8 +75,9 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). */ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, +blk_status_t blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); @@ -91,5 +92,7 @@ void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f02a74feee63..6627e3c6cb43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -959,10 +959,10 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern void blk_execute_rq(struct request_queue *, struct gendisk *, - struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); +blk_status_t blk_execute_rq(struct request_queue *, struct gendisk *, + struct request *, int);
/* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op);
From: Fang Lijun fanglijun3@huawei.com
ascend inclusion category: Bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA
--------------
The system can't use the CDM nodes' memory directly, but it can mmap huge pages from all nodes, so a Bus error is triggered when the mmap succeeds but there are not enough huge pages to back it.
When cdmmask is set, users pass the NUMA node id through the mmap flags to map huge pages from a specific node; if there are not enough huge pages on that node, mmap returns -ENOMEM.
Dvpp uses the MAP_CHECKNODE flag to enable hugetlb node checking. A global numanode variable would make mmap non-reentrant, so the node id is carried in flag bits [26:31] directly. v2: fix a compile error on platforms such as mips
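A hedged userspace sketch of the resulting interface (the helper name is made up; the flag value mirrors the generic definition added below, and on CDM systems the MAP_HUGE_SHIFT bit-field carries the node id instead of a page-size log):

    #include <sys/mman.h>

    #ifndef MAP_CHECKNODE
    #define MAP_CHECKNODE  0x0400   /* generic value from this patch */
    #endif
    #ifndef MAP_HUGE_SHIFT
    #define MAP_HUGE_SHIFT 26       /* node id goes in bits [26:31] */
    #endif

    /* Map hugetlb pages backed by NUMA node 'nid'; per this patch the
     * mmap fails with ENOMEM if that node lacks free huge pages. */
    static void *mmap_huge_on_node(size_t len, unsigned int nid)
    {
            return mmap(NULL, len, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
                        MAP_CHECKNODE | (nid << MAP_HUGE_SHIFT),
                        -1, 0);
    }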
Signed-off-by: Fang Lijun fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/alpha/include/uapi/asm/mman.h | 1 + arch/mips/include/uapi/asm/mman.h | 1 + arch/parisc/include/uapi/asm/mman.h | 1 + arch/powerpc/include/uapi/asm/mman.h | 1 + arch/sparc/include/uapi/asm/mman.h | 1 + arch/xtensa/include/uapi/asm/mman.h | 1 + fs/hugetlbfs/inode.c | 45 ++++++++++++++++++++++++++++ include/linux/hugetlb.h | 1 + include/linux/mm.h | 11 +++++++ include/linux/mman.h | 15 ++++++++++ include/uapi/asm-generic/mman.h | 1 + mm/hugetlb.c | 3 ++ mm/mmap.c | 22 ++++++++++++-- 13 files changed, 101 insertions(+), 3 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 87abc7b03360..eeb0b9cc0bee 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -32,6 +32,7 @@ #define MAP_HUGETLB 0x100000 /* create a huge page mapping */ #define MAP_FIXED_NOREPLACE 0x200000/* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ +#define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */
#define MS_ASYNC 1 /* sync memory asynchronously */ #define MS_SYNC 2 /* synchronous memory sync */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 61cd225fcaa4..00437067f14d 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -50,6 +50,7 @@ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ +#define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */
/* * Flags for msync diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 851678907640..0bdf4ae5b69f 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -27,6 +27,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_UNINITIALIZED 0 /* uninitialized anonymous mmap */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ +#define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */
#define MS_SYNC 1 /* synchronous memory sync */ #define MS_ASYNC 2 /* sync memory asynchronously */ diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index f0eb04780148..908fa2ad02cc 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -26,6 +26,7 @@ #define MCL_FUTURE 0x4000 /* lock all additions to address space */ #define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ +#define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */
/* Override any generic PKEY permission defines */ #define PKEY_DISABLE_EXECUTE 0x4 diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h index 8caf19c604d0..06578c16a683 100644 --- a/arch/sparc/include/uapi/asm/mman.h +++ b/arch/sparc/include/uapi/asm/mman.h @@ -22,5 +22,6 @@ #define MCL_FUTURE 0x4000 /* lock all additions to address space */ #define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ +#define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */
#endif /* _UAPI__SPARC_MMAN_H__ */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index a52ac8462b7d..717561c7e85a 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -57,6 +57,7 @@ #define MAP_HUGETLB 0x80000 /* create a huge page mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_PA32BIT 0x400000 /* physical address is within 4G */ +#define MAP_CHECKNODE 0x800000 /* hugetlb numa node check */ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 96c5f4c5ee6e..2e2e4983f1ba 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -118,6 +118,45 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); }
+/* + * Check current numa node has enough free huge pages to mmap hugetlb. + * resv_huge_pages_node: mmap hugepages but haven't used in current + * numa node. + */ +static int hugetlb_checknode(struct vm_area_struct *vma, long nr) +{ + int nid; + int ret = 0; + struct hstate *h = &default_hstate; + + spin_lock(&hugetlb_lock); + + nid = vma->vm_flags >> CHECKNODE_BITS; + + if (nid >= MAX_NUMNODES) { + ret = -EINVAL; + goto err; + } + + if (h->free_huge_pages_node[nid] < nr) { + ret = -ENOMEM; + goto err; + } else { + if (h->resv_huge_pages_node[nid] + nr > + h->free_huge_pages_node[nid]) { + ret = -ENOMEM; + goto err; + } else { + h->resv_huge_pages_node[nid] += nr; + ret = 0; + } + } + +err: + spin_unlock(&hugetlb_lock); + return ret; +} + /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -175,6 +214,12 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) inode_lock(inode); file_accessed(file);
+ if (is_set_cdmmask() && (vma->vm_flags & VM_CHECKNODE)) { + ret = hugetlb_checknode(vma, len >> huge_page_shift(h)); + if (ret < 0) + goto out; + } + ret = -ENOMEM; if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bf734fb00a1d..fd9635a6a92f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -581,6 +581,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + unsigned int resv_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP unsigned int nr_free_vmemmap_pages; #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index ae9b6688677f..100c113e62a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,6 +97,15 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif
+#ifdef CONFIG_COHERENT_DEVICE +static inline bool is_set_cdmmask(void) +{ + return !nodes_empty(cdmmask); +} +#else +#define is_set_cdmmask() (0) +#endif + #include <asm/page.h> #include <asm/processor.h>
@@ -304,6 +313,8 @@ extern unsigned int kobjsize(const void *objp); #define VM_CDM 0x100000000 /* Contains coherent device memory */ #endif
+#define VM_CHECKNODE 0x200000000 + #ifdef CONFIG_USERSWAP /* bit[32:36] is the protection key of intel, so use a large value for VM_USWAP */ #define VM_USWAP 0x2000000000000000 diff --git a/include/linux/mman.h b/include/linux/mman.h index 629cefc4ecba..7908bf3e5696 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -8,6 +8,21 @@ #include <linux/atomic.h> #include <uapi/linux/mman.h>
+#ifdef CONFIG_COHERENT_DEVICE +#define CHECKNODE_BITS 48 +#define CHECKNODE_MASK (~((_AC(1, UL) << CHECKNODE_BITS) - 1)) +static inline void set_vm_checknode(vm_flags_t *vm_flags, unsigned long flags) +{ + if (is_set_cdmmask()) + *vm_flags |= VM_CHECKNODE | ((((flags >> MAP_HUGE_SHIFT) & + MAP_HUGE_MASK) << CHECKNODE_BITS) & CHECKNODE_MASK); +} +#else +#define CHECKNODE_BITS (0) +static inline void set_vm_checknode(vm_flags_t *vm_flags, unsigned long flags) +{} +#endif + /* * Arrange for legacy / undefined architecture specific flags to be * ignored by mmap handling code. diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 344bb9b090a7..d7f0f48117b0 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -5,6 +5,7 @@ #include <asm-generic/mman-common.h>
#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_CHECKNODE 0x0400 /* hugetlb numa node check */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ #define MAP_LOCKED 0x2000 /* pages are locked */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6ae2d2e90681..d0672e482879 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,6 +30,7 @@ #include <linux/numa.h> #include <linux/llist.h> #include <linux/cma.h> +#include <linux/mman.h>
#include <asm/page.h> #include <asm/pgalloc.h> @@ -1164,6 +1165,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; + if (is_set_cdmmask() && (vma->vm_flags & VM_CHECKNODE)) + h->resv_huge_pages_node[vma->vm_flags >> CHECKNODE_BITS]--; }
mpol_cond_put(mpol); diff --git a/mm/mmap.c b/mm/mmap.c index f705137fd248..a208057be6f1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1581,6 +1581,12 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len, vm_flags |= VM_NORESERVE; }
+ /* set numa node id into vm_flags, + * hugetlbfs file mmap will use it to check node + */ + if (flags & MAP_CHECKNODE) + set_vm_checknode(&vm_flags, flags); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1825,12 +1831,23 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; struct hstate *hs; + int page_size_log;
- hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); + /* + * If config cdm node, flags bits [26:31] used for + * mmap hugetlb check node + */ + if (is_set_cdmmask()) + page_size_log = 0; + else + page_size_log = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK; + + hs = hstate_sizelog(page_size_log); if (!hs) return -EINVAL;
len = ALIGN(len, huge_page_size(hs)); + /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called @@ -1839,8 +1856,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE, - (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); + &user, HUGETLB_ANONHUGE_INODE, page_size_log); if (IS_ERR(file)) return PTR_ERR(file); }
From: Zheng Zucheng zhengzucheng@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4LH1X CVE: NA
--------------------------------
When online tasks occupy the CPU for a long time, offline tasks get no CPU time to run and a priority inversion may be triggered. In that case we unthrottle the offline tasks and give them a chance to run. Concretely, when online tasks have occupied the CPU for more than 5s (the default value), we unthrottle the offline tasks and make them enter an msleep loop before returning to usermode, until the CPU goes idle.
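A hedged usage sketch for the two knobs this patch exposes under /proc/sys/kernel/ (the values written are illustrative; the accepted ranges come from the sysctl table in the diff):

    #include <stdio.h>

    /* Write one sysctl value; returns 0 on success. */
    static int write_sysctl(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (!f)
                    return -1;
            fprintf(f, "%s\n", val);
            return fclose(f);
    }

    int main(void)
    {
            /* detection period: accepted range 1000..100000 ms */
            write_sysctl("/proc/sys/kernel/qos_overload_detect_period_ms",
                         "10000");
            /* offline wait interval: accepted range 100..1000 ms */
            write_sysctl("/proc/sys/kernel/qos_offline_wait_interval_ms",
                         "200");
            return 0;
    }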
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Signed-off-by: Zheng Zucheng zhengzucheng@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/sched.h | 7 +++ include/linux/sched/sysctl.h | 5 +++ kernel/entry/common.c | 7 ++- kernel/sched/core.c | 3 ++ kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 3 ++ kernel/sysctl.c | 23 ++++++++++ 7 files changed, 128 insertions(+), 4 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h index 9e9c0bd4197d..b977f07ed41c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2120,6 +2120,13 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
#ifdef CONFIG_QOS_SCHED void sched_move_offline_task(struct task_struct *p); +void sched_qos_offline_wait(void); +int sched_qos_cpu_overload(void); +#else +static inline int sched_qos_cpu_overload(void) +{ + return 0; +} #endif
#endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 525d73dd8ef9..cd2b767bbff8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -74,6 +74,11 @@ extern unsigned int sysctl_sched_uclamp_util_min_rt_default; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif
+#ifdef CONFIG_QOS_SCHED +extern unsigned int sysctl_overload_detect_period; +extern unsigned int sysctl_offline_wait_interval; +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 8a4dd7027e90..df3c534dc138 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -160,6 +160,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_SIGPENDING) arch_do_signal(regs);
+#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + if (ti_work & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); @@ -187,7 +191,8 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
lockdep_assert_irqs_disabled();
- if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || + sched_qos_cpu_overload())) ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 856c4123e92a..b46717970ab9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7394,6 +7394,9 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ +#ifdef CONFIG_QOS_SCHED + init_qos_hrtimer(i); +#endif init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1a0cb9a4161e..e5cf15fb9e84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -24,6 +24,9 @@ #ifdef CONFIG_SCHED_STEAL #include "sparsemask.h" #endif +#ifdef CONFIG_QOS_SCHED +#include <linux/delay.h> +#endif
/* * Targeted preemption latency for CPU-bound tasks: @@ -153,6 +156,10 @@ int __weak arch_asym_cpu_priority(int cpu)
#ifdef CONFIG_QOS_SCHED static DEFINE_PER_CPU_SHARED_ALIGNED(struct list_head, qos_throttled_cfs_rq); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct hrtimer, qos_overload_timer); +static DEFINE_PER_CPU(int, qos_cpu_overload); +unsigned int sysctl_overload_detect_period = 5000; /* in ms */ +unsigned int sysctl_offline_wait_interval = 100; /* in ms */ static int unthrottle_qos_cfs_rqs(int cpu); #endif
@@ -7245,6 +7252,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ }
#ifdef CONFIG_QOS_SCHED +static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -7283,6 +7291,9 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
}
+ if (list_empty(&per_cpu(qos_throttled_cfs_rq, cpu_of(rq)))) + start_qos_hrtimer(cpu_of(rq)); + cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq);
@@ -7342,7 +7353,7 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); }
-static int unthrottle_qos_cfs_rqs(int cpu) +static int __unthrottle_qos_cfs_rqs(int cpu) { struct cfs_rq *cfs_rq, *tmp_rq; int res = 0; @@ -7358,11 +7369,26 @@ static int unthrottle_qos_cfs_rqs(int cpu) return res; }
+static int unthrottle_qos_cfs_rqs(int cpu) +{ + int res; + + res = __unthrottle_qos_cfs_rqs(cpu); + if (res) + hrtimer_cancel(&(per_cpu(qos_overload_timer, cpu))); + + return res; +} + static bool check_qos_cfs_rq(struct cfs_rq *cfs_rq) { + if (unlikely(__this_cpu_read(qos_cpu_overload))) { + return false; + } + if (unlikely(cfs_rq && cfs_rq->tg->qos_level < 0 && - !sched_idle_cpu(smp_processor_id()) && - cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { + !sched_idle_cpu(smp_processor_id()) && + cfs_rq->h_nr_running == cfs_rq->idle_h_nr_running)) { throttle_qos_cfs_rq(cfs_rq); return true; } @@ -7380,6 +7406,56 @@ static inline void unthrottle_qos_sched_group(struct cfs_rq *cfs_rq) unthrottle_qos_cfs_rq(cfs_rq); rq_unlock_irqrestore(rq, &rf); } + +void sched_qos_offline_wait(void) +{ + long qos_level; + + while (unlikely(this_cpu_read(qos_cpu_overload))) { + rcu_read_lock(); + qos_level = task_group(current)->qos_level; + rcu_read_unlock(); + if (qos_level != -1 || signal_pending(current)) + break; + msleep_interruptible(sysctl_offline_wait_interval); + } +} + +int sched_qos_cpu_overload(void) +{ + return __this_cpu_read(qos_cpu_overload); +} + +static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) +{ + struct rq_flags rf; + struct rq *rq = this_rq(); + + rq_lock_irqsave(rq, &rf); + if (__unthrottle_qos_cfs_rqs(smp_processor_id())) + __this_cpu_write(qos_cpu_overload, 1); + rq_unlock_irqrestore(rq, &rf); + + return HRTIMER_NORESTART; +} + +static void start_qos_hrtimer(int cpu) +{ + ktime_t time; + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + time = ktime_add_ms(hrtimer->base->get_time(), (u64)sysctl_overload_detect_period); + hrtimer_set_expires(hrtimer, time); + hrtimer_start_expires(hrtimer, HRTIMER_MODE_ABS_PINNED); +} + +void init_qos_hrtimer(int cpu) +{ + struct hrtimer *hrtimer = &(per_cpu(qos_overload_timer, cpu)); + + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer->function = qos_overload_timer_handler; +} #endif
struct task_struct * @@ -7548,6 +7624,8 @@ done: __maybe_unused; rq->idle_stamp = 0; goto again; } + + __this_cpu_write(qos_cpu_overload, 0); #endif /* * rq is about to be idle, check if we need to update the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9ec230220ee3..4c58086cf080 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1100,6 +1100,9 @@ static inline int cpu_of(struct rq *rq) #endif }
+#ifdef CONFIG_QOS_SCHED +void init_qos_hrtimer(int cpu); +#endif
#ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 261787cebd8e..749ef59224e2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -128,6 +128,9 @@ static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif +#ifdef CONFIG_QOS_SCHED +static int hundred_thousand = 100000; +#endif #ifdef CONFIG_PERF_EVENTS static int six_hundred_forty_kb = 640 * 1024; #endif @@ -2725,6 +2728,26 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = ias_table, }, +#ifdef CONFIG_QOS_SCHED + { + .procname = "qos_overload_detect_period_ms", + .data = &sysctl_overload_detect_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one_thousand, + .extra2 = &hundred_thousand, + }, + { + .procname = "qos_offline_wait_interval_ms", + .data = &sysctl_offline_wait_interval, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one_hundred, + .extra2 = &one_thousand, + }, +#endif { } };
From: Trond Myklebust trond.myklebust@hammerspace.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4IL3P CVE: NA
--------------------------------
After we've looked up the transport module, we need to ensure it can't go away until we've finished running the transport setup code.
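Condensed, the lookup-with-refcount pattern the diff below adds (a sketch, not the literal code):

    /* Take the module reference while still under xprt_list_lock, so
     * the transport module cannot be unloaded between the lookup and
     * running ->setup(). */
    spin_lock(&xprt_list_lock);
    list_for_each_entry(t, &xprt_list, list) {
            if (t->ident == ident && try_module_get(t->owner)) {
                    found = t;
                    break;
            }
    }
    spin_unlock(&xprt_list_lock);

    if (found) {
            xprt = found->setup(args);      /* module pinned across setup */
            xprt_class_release(found);      /* drops the module reference */
    }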
Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/sunrpc/xprt.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 04aaca4b8bf9..cdf5cc67a005 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -158,6 +158,32 @@ xprt_class_release(const struct xprt_class *t) module_put(t->owner); }
+static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; + + list_for_each_entry(t, &xprt_list, list) { + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + static const struct xprt_class * xprt_class_find_by_netid_locked(const char *netid) { @@ -1959,21 +1985,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t;
- spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - return ERR_PTR(-EIO);
-found: xprt = t->setup(args); + xprt_class_release(t); + if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)
From: Lijun Fang fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4JMLR CVE: NA -------------------
If COHERENT_DEVICE is enabled and HBM is configured for a device, tmpfs accounts all memory including the HBM, which cannot be used by the system. When the system actually runs out of memory, the accounting still reports plenty of memory left, which causes many problems.
Signed-off-by: Lijun Fang fanglijun3@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/shmem.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 07e3f0d0ba12..b488b6373454 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -124,9 +124,37 @@ struct shmem_options { };
#ifdef CONFIG_TMPFS +#ifdef CONFIG_COHERENT_DEVICE +static unsigned long ddr_totalram_pages(void) +{ + int nid; + int zone_type; + unsigned long managed_pages = 0; + pg_data_t *pgdat; + + if (nodes_empty(cdmmask)) + return totalram_pages(); + + for_each_online_node(nid) { + if (is_cdm_node(nid)) + continue; + pgdat = NODE_DATA(nid); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); + } + + return managed_pages; +} +#else +static unsigned long ddr_totalram_pages(void) +{ + return totalram_pages(); +} +#endif + static unsigned long shmem_default_max_blocks(void) { - return totalram_pages() / 2; + return ddr_totalram_pages() / 2; }
static unsigned long shmem_default_max_inodes(void)